/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>

#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>


#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;
static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp,
    sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);
static void	ipsq_clean_all(ill_t *ill);
static void	ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring);
static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_multicast_down(ipif_t *ipif);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error);
static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg);
static void	ill_down_tail(ill_t *ill);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static boolean_t ill_has_usable_ipif(ill_t *);
static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
static void	ill_phyint_free(ill_t *ill);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
static void	ill_stq_cache_delete(ire_t *, char *);

static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);
static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);

static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *);
static void	ip_cgtp_bcast_delete(ire_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_proto(ill_t *, int, mblk_t *);
static void	ill_capability_dispatch(ill_t *, mblk_t *,
    dl_capability_sub_t *, boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_reset(ill_t *, mblk_t **);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_ipsec_reset(ill_t *, mblk_t **);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_hcksum_reset(ill_t *, mblk_t **);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset(ill_t *, mblk_t **);

static void	ill_capability_dls_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
static void	ill_capability_dls_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_disable(ill_t *);

static void	illgrp_cache_delete(ire_t *, char *);
static void	illgrp_delete(ill_t *ill);
static void	illgrp_reset_schednext(ill_t *ill);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);

static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * Maximum # of IPsec algorithms supported.  Limited to 1 byte by PF_KEY
 * and the IPsec DOI.
 */
#define	MAX_IPSEC_ALGS	256

#define	BITSPERBYTE	8
#define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))

#define	IPSEC_ALG_ENABLE(algs, algid) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

#define	IPSEC_ALG_IS_ENABLED(algid, algs) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

typedef uint8_t ipsec_capab_elem_t;
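/*
 * Worked example of the bitmask arithmetic above (illustrative): with
 * the 8-bit ipsec_capab_elem_t, enabling algorithm id 10 touches array
 * element 10 / 8 == 1 and sets bit 10 % 8 == 2 within it:
 *
 *	ipsec_capab_elem_t algs[MAX_IPSEC_ALGS / BITSPERBYTE];
 *	IPSEC_ALG_ENABLE(algs, 10);		(algs[1] |= 0x04)
 *	ASSERT(IPSEC_ALG_IS_ENABLED(10, algs));
 */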
/*
 * Per-algorithm parameters.  Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t	minkeylen;
	uint16_t	maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;
	ipsec_capab_elem_t *auth_hw_algs;
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};

/*
 * List of AH and ESP IPsec acceleration capable ills
 */
typedef struct ipsec_capab_ill_s {
	uint_t ill_index;
	boolean_t ill_isv6;
	struct ipsec_capab_ill_s *next;
} ipsec_capab_ill_t;

static ipsec_capab_ill_t *ipsec_capab_ills_ah;
static ipsec_capab_ill_t *ipsec_capab_ills_esp;
krwlock_t ipsec_capab_ills_lock;

/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 */
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IP_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IP_ADDR_LEN
};

static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IPV6_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IPV6_ADDR_LEN
};
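/*
 * Illustrative sketch of the mblk that ill_arp_alloc() builds from
 * ip_area_template for an AR_ENTRY_ADD (offsets per the template above):
 *
 *	[ ip_sock_ar_t (begins with the area_t)  ]
 *	[ IPv4 proto addr, IP_ADDR_LEN bytes     ]  area_proto_addr_offset
 *	[ proto mask, filled with all-ones       ]  area_proto_mask_offset
 *	[ hw addr region, sockaddr_dl sized      ]  area_hw_addr_offset
 *	[ ill_name                               ]  area_name_offset
 */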
/*
 * A template for an IPv6 AR_ENTRY_QUERY has not been created, as
 * the areq doesn't include an IP address in ill_dl_up() (the only place
 * an areq is used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,		/* protocol, from arp's perspective */
	sizeof (areq_t),		/* target addr offset */
	IP_ADDR_LEN,			/* target addr_length */
	0,				/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,			/* sender addr length */
	6,				/* xmit_count */
	1000,				/* (re)xmit_interval in milliseconds */
	4				/* max # of requests to buffer */
	/* anything else filled in by the code */
};

static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};


static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* Flag descriptors for ip_ipif_report */
static nv_t	ipif_nv_tbl[] = {
	{ IPIF_UP,		"UP" },
	{ IPIF_BROADCAST,	"BROADCAST" },
	{ ILLF_DEBUG,		"DEBUG" },
	{ PHYI_LOOPBACK,	"LOOPBACK" },
	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
	{ PHYI_RUNNING,		"RUNNING" },
	{ ILLF_NOARP,		"NOARP" },
	{ PHYI_PROMISC,		"PROMISC" },
	{ PHYI_ALLMULTI,	"ALLMULTI" },
	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
	{ ILLF_MULTICAST,	"MULTICAST" },
	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
	{ IPIF_DHCPRUNNING,	"DHCP" },
	{ IPIF_PRIVATE,		"PRIVATE" },
	{ IPIF_NOXMIT,		"NOXMIT" },
	{ IPIF_NOLOCAL,		"NOLOCAL" },
	{ IPIF_DEPRECATED,	"DEPRECATED" },
	{ IPIF_PREFERRED,	"PREFERRED" },
	{ IPIF_TEMPORARY,	"TEMPORARY" },
	{ IPIF_ADDRCONF,	"ADDRCONF" },
	{ PHYI_VIRTUAL,		"VIRTUAL" },
	{ ILLF_ROUTER,		"ROUTER" },
	{ ILLF_NONUD,		"NONUD" },
	{ IPIF_ANYCAST,		"ANYCAST" },
	{ ILLF_NORTEXCH,	"NORTEXCH" },
	{ ILLF_IPV4,		"IPV4" },
	{ ILLF_IPV6,		"IPV6" },
	{ IPIF_MIPRUNNING,	"MIP" },
	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
	{ PHYI_FAILED,		"FAILED" },
	{ PHYI_STANDBY,		"STANDBY" },
	{ PHYI_INACTIVE,	"INACTIVE" },
	{ PHYI_OFFLINE,		"OFFLINE" },
};

static uchar_t	ip_six_byte_all_ones[] =
	{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
	    ip_ib_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
	{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static kstat_t *loopback_ksp = NULL;
static	sin6_t	sin6_null;	/* Zero address for quick clears */
static	sin_t	sin_null;	/* Zero address for quick clears */
static	uint_t	ill_index = 1;	/* Used to assign interface indices */
/* When set search for unused index */
static boolean_t ill_index_wrap = B_FALSE;
/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;
uint_t	ipif_src_random;

/*
 * For details on the protection offered by these locks please refer
 * to the notes under the Synchronization section at the start of ip.c
 */
krwlock_t	ill_g_lock;		/* The global ill_g_lock */
kmutex_t	ip_addr_avail_lock;	/* Address availability check lock */
ipsq_t		*ipsq_g_head;		/* List of all ipsq's on the system */

krwlock_t	ill_g_usesrc_lock;	/* Protects usesrc related fields */

/*
 * illgrp_head/ifgrp_head is protected by IP's perimeter.
 */
static  ill_group_t *illgrp_head_v4;	/* Head of IPv4 ill groups */
ill_group_t *illgrp_head_v6;		/* Head of IPv6 ill groups */

ill_g_head_t	ill_g_heads[MAX_G_HEADS];	/* ILL List Head */

/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;

#pragma align CACHE_ALIGN_SIZE(phyint_g_list)
static phyint_list_t phyint_g_list;	/* start of phyint list */

/*
 * Reflects the value of the FAILBACK variable in the IPMP config file
 * /etc/default/mpathd.  The default value is B_TRUE; it is set to
 * B_FALSE if the user disabled failback by configuring "FAILBACK=no".
 * in.mpathd uses the SIOCSIPMPFAILBACK ioctl to pass this information
 * to the kernel.
 */
static boolean_t ipmp_enable_failback = B_TRUE;

/*
 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
 * is set and ip_soft_rings_cnt > 0.  ip_squeue_soft_ring is
 * set through platform specific code (Niagara/Ontario).
 */
#define	SOFT_RINGS_ENABLED()	(ip_soft_rings_cnt ? \
	(ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)

#define	ILL_CAPAB_DLS	(ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
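/*
 * ipif_rand() below uses the constants of the classic ANSI C rand()
 * linear congruential generator; returning bits 16..30 of the seed
 * yields better-distributed values than the low-order bits would.
 */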
static uint_t
ipif_rand(void)
{
	ipif_src_random = ipif_src_random * 1103515245 + 12345;
	return ((ipif_src_random >> 16) & 0x7fff);
}

/*
 * Allocate per-interface mibs.  Only used for ipv6.
 * Returns B_TRUE if ok; B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	ASSERT(ill->ill_isv6);

	/* Already allocated? */
	if (ill->ill_ip6_mib != NULL) {
		ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip6_mib == NULL) {
		return (B_FALSE);
	}
	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
		ill->ill_ip6_mib = NULL;
		return (B_FALSE);
	}
	/*
	 * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values ->
	 * ill_glist_insert -> ill_phyint_reinit
	 */
	return (B_TRUE);
}
/*
 * Common code for preparation of ARP commands.  Two points to remember:
 *	1) The ill_name is tacked on at the end of the allocated space so
 *	   the template's name_offset field must contain the total space
 *	   to allocate less the name length.
 *
 *	2) The template's name_length field should contain the *template*
 *	   length.  We use it as a parameter to bcopy() and then write
 *	   the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 */
mblk_t *
ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
{
	arc_t	*arc = (arc_t *)template;
	char	*cp;
	int	len;
	mblk_t	*mp;
	uint_t	name_length = ill->ill_name_length;
	uint_t	template_len = arc->arc_name_length;

	len = arc->arc_name_offset + name_length;
	mp = allocb(len, BPRI_HI);
	if (mp == NULL)
		return (NULL);
	cp = (char *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&cp[len];
	if (template_len)
		bcopy(template, cp, template_len);
	if (len > template_len)
		bzero(&cp[template_len], len - template_len);
	mp->b_datap->db_type = M_PROTO;

	arc = (arc_t *)cp;
	arc->arc_name_length = name_length;
	cp = (char *)arc + arc->arc_name_offset;
	bcopy(ill->ill_name, cp, name_length);

	if (addr) {
		area_t	*area = (area_t *)mp->b_rptr;

		cp = (char *)area + area->area_proto_addr_offset;
		bcopy(addr, cp, area->area_proto_addr_length);
		if (area->area_cmd == AR_ENTRY_ADD) {
			cp = (char *)area;
			len = area->area_proto_addr_length;
			if (area->area_proto_mask_offset)
				cp += area->area_proto_mask_offset;
			else
				cp += area->area_proto_addr_offset + len;
			while (len-- > 0)
				*cp++ = (char)~0;
		}
	}
	return (mp);
}

mblk_t *
ipif_area_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ill_ared_alloc(ill_t *ill, ipaddr_t addr)
{
	return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
	    (char *)&addr));
}
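/*
 * A note on the mask handling in ill_arp_alloc() above: for AR_ENTRY_ADD
 * commands the proto mask is filled with all-ones, so the resulting ARP
 * entry matches the supplied protocol address exactly.  The AR_ENTRY_DELETE
 * wrappers skip this step since ared_t carries no mask.
 */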
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;

	/*
	 * ill_delete may be forcibly entering the ipsq.  The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it.  If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif.  We can't unplumb
	 * zeroth interface first in the case of IPv6 as reset_conn_ill
	 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up.  In this case just
	 * clean up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Used only by ill_arp_on and ill_arp_off, which are writers.
	 * So nobody can be using this mp now.  Free the mp allocated for
	 * honoring ILLF_NOARP
	 */
	freemsg(ill->ill_arp_on_mp);
	ill->ill_arp_on_mp = NULL;

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/*
	 * ipif_free -> reset_conn_ipif will remove all multicast
	 * references for IPv4.  For IPv6, we need to do it here as
	 * it points only at ills.
	 */
	reset_conn_ill(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone.  The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0 &&
	    ill->ill_arp_down_mp == NULL &&
	    ill->ill_arp_del_mapping_mp == NULL);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);

	/*
	 * Clean up polling and soft ring capabilities
	 */
	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
		ill_capability_dls_disable(ill);

	/*
	 * Send the detach if there's one to send (i.e., if we're above a
	 * style 2 DLPI driver).
	 */
	if (ill->ill_detach_mp != NULL) {
		ill_dlpi_send(ill, ill->ill_detach_mp);
		ill->ill_detach_mp = NULL;
	}

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now.  New messages could have
	 * landed up from below (M_ERROR or M_HANGUP).  Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);
	/*
	 * Free capabilities.
	 */
	if (ill->ill_ipsec_capab_ah != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
		ill->ill_ipsec_capab_ah = NULL;
	}

	if (ill->ill_ipsec_capab_esp != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
		ill->ill_ipsec_capab_esp = NULL;
	}

	if (ill->ill_mdt_capab != NULL) {
		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
		ill->ill_mdt_capab = NULL;
	}

	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_dls_capab != NULL) {
		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
		ill->ill_dls_capab->ill_unbind_conn = NULL;
		kmem_free(ill->ill_dls_capab,
		    sizeof (ill_dls_capab_t) +
		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
		ill->ill_dls_capab = NULL;
	}

	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	ill_down_tail(ill);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) reset_conn_ill and reset_mrt_ill clean up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */
	ASSERT(ilm_walk_ill(ill) == 0);
	/*
	 * Take us out of the list of ILLs.  ill_glist_delete ->
	 * ill_phyint_free could free the phyint.  No more reference to
	 * the phyint after this point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ip_g_nd_lock);


	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}
	if (ill->ill_nd_lla_mp != NULL)
		freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);
	ILL_TRACE_CLEANUP(ill);
}

static void
ill_free_mib(ill_t *ill)
{
	if (ill->ill_ip6_mib != NULL) {
		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
		ill->ill_ip6_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}
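/*
 * Worked example of the sap_length convention documented below, assuming
 * an Ethernet-style link with a 6-byte address and a 2-byte sap
 * (illustrative values):
 *
 *	sap_length == -2:	dst = [ 6-byte phys addr ][ 2-byte sap ]
 *	sap_length ==  2:	dst = [ 2-byte sap ][ 6-byte phys addr ]
 */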
/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==>	no sap
 *   sap_length > 0	==>	sap is at the head of the dlpi address
 *   sap_length < 0	==>	sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * The total length of the dlpi address is addr_length plus the absolute
 * value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}
/*
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp
 * Return an error if we already have 1 or more ioctls in progress.
 * This is used only for non-exclusive ioctls.  Currently this is used
 * for SIOC*ARP and SIOCGTUNPARAM ioctls.  Most set ioctls are exclusive
 * and thus need to use ipsq_pending_mp_add.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
	 */
	ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
	    (add_mp->b_datap->db_type == M_IOCTL));

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * Return error if the conn has started closing.  The conn
	 * could have finished cleaning up the pending mp list; if so
	 * we should not add another mp to the list, negating the cleanup.
	 */
	if (connp->conn_state_flags & CONN_CLOSING)
		return (B_FALSE);
	/*
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the conn's write queue in b_queue.  This will be
	 * used later to get back to the conn when we get a response on
	 * the ill queue from some other module (typically arp).
	 */
	add_mp->b_next = (void *)ill->ill_pending_mp;
	add_mp->b_queue = CONNP_TO_WQ(connp);
	ill->ill_pending_mp = add_mp;
	if (connp != NULL)
		connp->conn_oper_pending_ill = ill;
	return (B_TRUE);
}

/*
 * Retrieve the ill_pending_mp and return it.  We have to walk the list
 * of mblks starting at ill_pending_mp, and match based on the ioc_id.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
	mblk_t	*prev = NULL;
	mblk_t	*curr = NULL;
	uint_t	id;
	conn_t	*connp;

	/*
	 * When the conn closes, conn_ioctl_cleanup needs to clean
	 * up the pending mp, but it does not know the ioc_id and
	 * passes in a zero for it.
	 */
	mutex_enter(&ill->ill_lock);
	if (ioc_id != 0)
		*connpp = NULL;

	/* Search the list for the appropriate ioctl based on ioc_id */
	for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
	    prev = curr, curr = curr->b_next) {
		id = ((struct iocblk *)curr->b_rptr)->ioc_id;
		connp = Q_TO_CONN(curr->b_queue);
		/* Match based on the ioc_id or based on the conn */
		if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
			break;
	}

	if (curr != NULL) {
		/* Unlink the mblk from the pending mp list */
		if (prev != NULL) {
			prev->b_next = curr->b_next;
		} else {
			ASSERT(ill->ill_pending_mp == curr);
			ill->ill_pending_mp = curr->b_next;
		}

		/*
		 * conn refcnt must have been bumped up at the start of
		 * the ioctl.  So we can safely access the conn.
		 */
		ASSERT(CONN_Q(curr->b_queue));
		*connpp = Q_TO_CONN(curr->b_queue);
		curr->b_next = NULL;
		curr->b_queue = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (curr);
}
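/*
 * Note the division of labor between the two pending-mp mechanisms in
 * this file: ill_pending_mp above is a list of outstanding non-exclusive
 * ioctls, matched by ioc_id, whereas ipsq_pending_mp below holds at most
 * one mblk for the single exclusive operation serialized by the ipsq.
 */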
/*
 * Add the pending mp to the list.  There can be only 1 pending mp
 * in the list.  Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipsq_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver.  This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipsq_t	*ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
	 * M_ERROR/M_HANGUP from driver
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) ||
	    (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP));

	ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing.  The conn
		 * could have finished cleaning up the pending mp list; if
		 * so we should not add another mp to the list, negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipsq->ipsq_lock);
	ipsq->ipsq_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue.  This will be returned by
	 * ipsq_pending_mp_get.  Caller will then use these values to restart
	 * the processing
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipsq->ipsq_pending_mp = add_mp;
	ipsq->ipsq_waitfor = waitfor;
	/*
	 * ipsq_current_ipif is needed to restart the operation from
	 * ipif_ill_refrele_tail when the last reference to the ipif/ill
	 * is gone.  Since this is not an ioctl, ipsq_current_ipif has not
	 * been set until now.
	 */
	if (DB_TYPE(add_mp) == M_ERROR || DB_TYPE(add_mp) == M_HANGUP) {
		ASSERT(ipsq->ipsq_current_ipif == NULL);
		ipsq->ipsq_current_ipif = ipif;
		ipsq->ipsq_last_cmd = DB_TYPE(add_mp);
	}
	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;
	mutex_exit(&ipsq->ipsq_lock);
	return (B_TRUE);
}

/*
 * Retrieve the ipsq_pending_mp and return it.  There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;

	mutex_enter(&ipsq->ipsq_lock);
	*connpp = NULL;
	if (ipsq->ipsq_pending_mp == NULL) {
		mutex_exit(&ipsq->ipsq_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipsq->ipsq_pending_mp;
	ASSERT(curr != NULL && curr->b_next == NULL);
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_pending_mp = NULL;
	ipsq->ipsq_waitfor = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the
		 * ioctl.  So we can safely return a pointer to the conn
		 * to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipsq_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipsq_t	*ipsq;
	queue_t	*q;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	/*
	 * If connp is null, unconditionally clean up the ipsq_pending_mp.
	 * This happens in M_ERROR/M_HANGUP.  We need to abort the current
	 * ioctl even if it is meant for another ill, since we have to
	 * enqueue a new mp now in ipsq_pending_mp to complete the
	 * ipif_down.  If connp is non-null we are called from the conn
	 * close path.
	 */
	mp = ipsq->ipsq_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipsq->ipsq_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipsq_pending_mp */
	ipsq->ipsq_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;
	/* If MOVE was in progress, clear the move_in_progress fields also. */
	ill = ipsq->ipsq_pending_ipif->ipif_ill;
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	} else if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	ipif = ipsq->ipsq_pending_ipif;
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_waitfor = 0;
	ipsq->ipsq_current_ipif = NULL;
	mutex_exit(&ipsq->ipsq_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE :
		    NO_COPYOUT, connp != NULL ? ipif : NULL, NULL);
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg; we have to restart it,
		 * otherwise the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * The ill is closing.  Cleanup all the pending mps.  Called exclusively
 * towards the end of ill_delete.  The refcount has gone to 0, so nobody
 * knows this ill, and hence nobody can add an mp to this list.
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
	mblk_t	*mp;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);
	/*
	 * Every mp on the pending mp list originating from an ioctl
	 * added 1 to the conn refcnt, at the start of the ioctl.
	 * So bump it down now.  See comments in ip_wput_nondata()
	 */
	while (ill->ill_pending_mp != NULL) {
		mp = ill->ill_pending_mp;
		ill->ill_pending_mp = mp->b_next;
		mutex_exit(&ill->ill_lock);

		q = mp->b_queue;
		ASSERT(CONN_Q(q));
		mp->b_next = NULL;
		mp->b_prev = NULL;
		mp->b_queue = NULL;
		ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL, NULL);
		mutex_enter(&ill->ill_lock);
	}
	ill->ill_pending_ipif = NULL;

	mutex_exit(&ill->ill_lock);
}
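/*
 * Design note for ipsq_xopq_mp_cleanup() below: matching mblks are first
 * moved onto a private tmp_list while ipsq_lock is held, and are only
 * completed (via ip_ioctl_finish) or freed after the lock is dropped,
 * so that we never call back into the ioctl machinery with ipsq_lock held.
 */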
/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*q;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		q = CONNP_TO_WQ(connp);
	else
		q = ill->ill_wq;

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq.  If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
	 * ioctls meant for this ill from conns are not flushed; they will
	 * be processed during ipsq_exit, will not find the ill, and will
	 * return an error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (curr->b_queue == q || curr->b_queue == RD(q)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg; we have to
			 * restart it, otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing.  Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	mblk_t *curr;
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Is any exclusive ioctl pending ?  If so clean it up.  If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_head.  If the ioctl has started the mp could be present in
	 * ipsq_pending_mp.  If the ioctl timed out in the streamhead but
	 * is currently executing now, the mp is not queued anywhere but
	 * conn_oper_pending_ill is null.  The conn close will wait
	 * till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	curr = ill_pending_mp_get(ill, &connp, 0);
	if (curr != NULL) {
		mutex_exit(&connp->conn_lock);
		CONN_DEC_REF(connp);
		inet_freemsg(curr);
		return;
	}
	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing.  But we need to make sure that the ill will
	 * not vanish.  So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending now in ipsq_pending_mp.  If it is not
			 * found there then check whether this ioctl has
			 * not even started and is in the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return;
		}
	}
	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq.  Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
		connp->conn_orig_multicast_ifindex = 0;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_outgoing_pill == ill)
		connp->conn_outgoing_pill = NULL;
	if (connp->conn_nofailover_ill == ill)
		connp->conn_nofailover_ill = NULL;
	if (connp->conn_xmit_if_ill == ill)
		connp->conn_xmit_if_ill = NULL;
	if (connp->conn_ire_cache != NULL) {
		ire = connp->conn_ire_cache;
		/*
		 * ip_newroute creates IRE_CACHE with ire_stq coming from
		 * interface X and ipif coming from interface Y, if interface
		 * X and Y are part of the same IPMP group.  Thus whenever
		 * interface X goes down, remove all references to it by
		 * checking both on ire_ipif and ire_stq.
		 */
		if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
		    (ire->ire_type == IRE_CACHE &&
		    ire->ire_stq == ill->ill_wq)) {
			connp->conn_ire_cache = NULL;
			mutex_exit(&connp->conn_lock);
			ire_refrele_notr(ire);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}
	ill_down_tail(ill);
	freemsg(mp);
	ipsq->ipsq_current_ipif = NULL;
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again.  It is called when we receive an M_ERROR / M_HANGUP, in which
 * case we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill;
	ipif_t	*ipif;

	ill = q->q_ptr;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	(void) ipsq_pending_mp_cleanup(ill, NULL);
	mutex_enter(&ill->ill_lock);
	/*
	 * Atomically test and add the pending mp if references are
	 * still active.
	 */
	if (!ill_is_quiescent(ill)) {
		/*
		 * Get rid of any pending mps and cleanup.  Call will
		 * not fail since we are passing a null connp.
		 */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
static void
ill_down(ill_t *ill)
{
	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, (char *)ill);

	mutex_enter(&ire_mrtun_lock);
	if (ire_mrtun_count != 0) {
		mutex_exit(&ire_mrtun_lock);
		ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif,
		    (char *)ill, NULL);
	} else {
		mutex_exit(&ire_mrtun_lock);
	}

	/*
	 * If any interface-based forwarding table exists,
	 * blow off the IREs there that depend on this ill.
	 */
	mutex_enter(&ire_srcif_table_lock);
	if (ire_srcif_table_count > 0) {
		mutex_exit(&ire_srcif_table_lock);
		ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill);
	} else {
		mutex_exit(&ire_srcif_table_lock);
	}

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill);

	if (ill->ill_group != NULL) {
		illgrp_delete(ill);
	}

}

static void
ill_down_tail(ill_t *ill)
{
	int	i;

	/* Destroy ill_srcif_table if it exists */
	/* Lock not really required since nobody else can access it now */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_srcif_table != NULL) {
		ill->ill_srcif_refcnt = 0;
		for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
			rw_destroy(&ill->ill_srcif_table[i].irb_lock);
		}
		kmem_free(ill->ill_srcif_table,
		    IP_SRCIF_TABLE_SIZE * sizeof (irb_t));
		ill->ill_srcif_table = NULL;
		ill->ill_srcif_refcnt = 0;
		ill->ill_mrtun_refcnt = 0;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'.  (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/*
	 * ip_newroute creates IRE_CACHE with ire_stq coming from
	 * interface X and ipif coming from interface Y, if interface
	 * X and Y are part of the same IPMP group.  Thus whenever interface
	 * X goes down, remove all references to it by checking both
	 * on ire_ipif and ire_stq.
	 */
	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
		ire_delete(ire);
	}
}

/*
 * A separate routine for deleting revtun and srcif based routes is needed
 * because these IREs are only deleted when the interface is unplumbed,
 * and they have a non-null ire_in_ill.  We want to keep mobile IP
 * specific code separate.
 */
static void
ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_in_ill != NULL);

	if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) ||
	    (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) {
		ire_delete(ire);
	}
}
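/*
 * Overview of the DLPI fastpath handshake handled by the functions below:
 * ill_fastpath_probe() sends a DL_IOC_HDR_INFO M_IOCTL downstream with a
 * dl_unitdata_req_t template; a driver that supports fastpath acks it
 * with the template followed by the prebuilt link-layer header, which
 * ill_fastpath_ack() distributes to the waiting IREs (v4) or NCEs (v6).
 * The IDMS_* values in ill_dlpi_fastpath_state track this exchange.
 */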
/*
 * Remove ire/nce from the fastpath list.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	if (ill->ill_isv6) {
		nce_fastpath_list_dispatch(ill, NULL, NULL);
	} else {
		ire_fastpath_list_dispatch(ill, NULL, NULL);
	}
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDMS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL) {
		/*
		 * Update all IRE's or NCE's that are waiting for
		 * fastpath update.
		 */
		if (ill->ill_isv6) {
			/*
			 * update nce's in the fastpath list.
			 */
			nce_fastpath_list_dispatch(ill,
			    ndp_fastpath_update, mp);
		} else {

			/*
			 * update ire's in the fastpath list.
			 */
			ire_fastpath_list_dispatch(ill,
			    ire_fastpath_update, mp);
			/*
			 * Check if we need to traverse reverse tunnel table.
			 * Since there is only single ire_type (IRE_MIPRTUN)
			 * in the table, we don't need to match on ire_type.
			 * We have to check ire_mrtun_count and not the
			 * ill_mrtun_refcnt since ill_mrtun_refcnt is set
			 * on the incoming ill and here we are dealing with
			 * outgoing ill.
			 */
			mutex_enter(&ire_mrtun_lock);
			if (ire_mrtun_count != 0) {
				mutex_exit(&ire_mrtun_lock);
				ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN,
				    (void (*)(ire_t *, void *))
				    ire_fastpath_update, mp, ill);
			} else {
				mutex_exit(&ire_mrtun_lock);
			}
		}
		mp1 = mp->b_cont;
		freeb(mp);
		mp = mp1;
	} else {
		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
	}

	freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDMS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDMS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDMS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	putnext(ill->ill_wq, mp);
	return (0);
}

void
ill_capability_probe(ill_t *ill)
{
	/*
	 * Do so only if negotiation is enabled, capabilities are unknown,
	 * and a capability negotiation is not already in progress.
	 */
	if (ill->ill_capab_state != IDMS_UNKNOWN &&
	    ill->ill_capab_state != IDMS_RENEG)
		return;

	ill->ill_capab_state = IDMS_INPROGRESS;
	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
}
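/*
 * Capability negotiation state machine: ill_capab_state starts out as
 * IDMS_UNKNOWN, becomes IDMS_INPROGRESS when a probe is sent, and
 * IDMS_OK once a DL_CAPABILITY_ACK has been processed; IDMS_RENEG marks
 * a driver-requested renegotiation.  Note that ill_capability_reset()
 * below deliberately returns the state to IDMS_UNKNOWN, not
 * IDMS_INPROGRESS, for the reasons explained in its comment.
 */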
We do this in order to distinguish a 1832 * DL_CAPABILITY_ACK that comes back in response to this "reset" 1833 * from one that answers the "probe" DL_CAPABILITY_REQ. This also 1834 * handles the case where the driver doesn't send us back 1835 * a DL_CAPABILITY_ACK in response, since the "probe" routine 1836 * requires the state to be in UNKNOWN anyway. In any case, all 1837 * features are turned off until the state reaches IDMS_OK. 1838 */ 1839 ill->ill_capab_state = IDMS_UNKNOWN; 1840 1841 /* 1842 * Disable sub-capabilities and request a list of sub-capability 1843 * messages which will be sent down to the driver. Each handler 1844 * allocates the corresponding dl_capability_sub_t inside an 1845 * mblk, and links it to the existing sc_mp mblk, or returns it 1846 * as sc_mp if it's the first sub-capability (the passed-in 1847 * sc_mp is NULL). Upon returning from all capability handlers, 1848 * sc_mp is pulled up before being passed downstream. 1849 */ 1850 ill_capability_mdt_reset(ill, &sc_mp); 1851 ill_capability_hcksum_reset(ill, &sc_mp); 1852 ill_capability_zerocopy_reset(ill, &sc_mp); 1853 ill_capability_ipsec_reset(ill, &sc_mp); 1854 ill_capability_dls_reset(ill, &sc_mp); 1855 1856 /* Nothing to send down in order to disable the capabilities? */ 1857 if (sc_mp == NULL) 1858 return; 1859 1860 tmp = msgpullup(sc_mp, -1); 1861 freemsg(sc_mp); 1862 if ((sc_mp = tmp) == NULL) { 1863 cmn_err(CE_WARN, "ill_capability_reset: unable to send down " 1864 "DL_CAPABILITY_REQ (ENOMEM)\n"); 1865 return; 1866 } 1867 1868 ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); 1869 ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); 1870 } 1871 1872 /* 1873 * Request or set new-style hardware capabilities supported by DLS provider. 1874 */ 1875 static void 1876 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) 1877 { 1878 mblk_t *mp; 1879 dl_capability_req_t *capb; 1880 size_t size = 0; 1881 uint8_t *ptr; 1882 1883 if (reqp != NULL) 1884 size = MBLKL(reqp); 1885 1886 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); 1887 if (mp == NULL) { 1888 freemsg(reqp); 1889 return; 1890 } 1891 ptr = mp->b_rptr; 1892 1893 capb = (dl_capability_req_t *)ptr; 1894 ptr += sizeof (dl_capability_req_t); 1895 1896 if (reqp != NULL) { 1897 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1898 capb->dl_sub_length = size; 1899 bcopy(reqp->b_rptr, ptr, size); 1900 ptr += size; 1901 mp->b_cont = reqp->b_cont; 1902 freeb(reqp); 1903 } 1904 ASSERT(ptr == mp->b_wptr); 1905 1906 ill_dlpi_send(ill, mp); 1907 } 1908 1909 static void 1910 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1911 { 1912 dl_capab_id_t *id_ic; 1913 uint_t sub_dl_cap = outers->dl_cap; 1914 dl_capability_sub_t *inners; 1915 uint8_t *capend; 1916 1917 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1918 1919 /* 1920 * Note: range checks here are not absolutely sufficient to 1921 * make us robust against malformed messages sent by drivers; 1922 * this is in keeping with the rest of IP's dlpi handling.
1923 * (Remember, it's coming from something else in the kernel 1924 * address space) 1925 */ 1926 1927 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1928 if (capend > mp->b_wptr) { 1929 cmn_err(CE_WARN, "ill_capability_id_ack: " 1930 "malformed sub-capability too long for mblk"); 1931 return; 1932 } 1933 1934 id_ic = (dl_capab_id_t *)(outers + 1); 1935 1936 if (outers->dl_length < sizeof (*id_ic) || 1937 (inners = &id_ic->id_subcap, 1938 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1939 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1940 "encapsulated capab type %d too long for mblk", 1941 inners->dl_cap); 1942 return; 1943 } 1944 1945 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1946 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1947 "isn't as expected; pass-thru module(s) detected, " 1948 "discarding capability\n", inners->dl_cap)); 1949 return; 1950 } 1951 1952 /* Process the encapsulated sub-capability */ 1953 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1954 } 1955 1956 /* 1957 * Process Multidata Transmit capability negotiation ack received from a 1958 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1959 * DL_CAPABILITY_ACK message. 1960 */ 1961 static void 1962 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1963 { 1964 mblk_t *nmp = NULL; 1965 dl_capability_req_t *oc; 1966 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1967 ill_mdt_capab_t **ill_mdt_capab; 1968 uint_t sub_dl_cap = isub->dl_cap; 1969 uint8_t *capend; 1970 1971 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1972 1973 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1974 1975 /* 1976 * Note: range checks here are not absolutely sufficient to 1977 * make us robust against malformed messages sent by drivers; 1978 * this is in keeping with the rest of IP's dlpi handling. 
1979 * (Remember, it's coming from something else in the kernel 1980 * address space) 1981 */ 1982 1983 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1984 if (capend > mp->b_wptr) { 1985 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1986 "malformed sub-capability too long for mblk"); 1987 return; 1988 } 1989 1990 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1991 1992 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1993 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1994 "unsupported MDT sub-capability (version %d, expected %d)", 1995 mdt_ic->mdt_version, MDT_VERSION_2); 1996 return; 1997 } 1998 1999 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 2000 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 2001 "capability isn't as expected; pass-thru module(s) " 2002 "detected, discarding capability\n")); 2003 return; 2004 } 2005 2006 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 2007 2008 if (*ill_mdt_capab == NULL) { 2009 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 2010 KM_NOSLEEP); 2011 2012 if (*ill_mdt_capab == NULL) { 2013 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2014 "could not enable MDT version %d " 2015 "for %s (ENOMEM)\n", MDT_VERSION_2, 2016 ill->ill_name); 2017 return; 2018 } 2019 } 2020 2021 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 2022 "MDT version %d (%d bytes leading, %d bytes trailing " 2023 "header spaces, %d max pld bufs, %d span limit)\n", 2024 ill->ill_name, MDT_VERSION_2, 2025 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 2026 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 2027 2028 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 2029 (*ill_mdt_capab)->ill_mdt_on = 1; 2030 /* 2031 * Round the following values up to the nearest 32-bit 2032 * multiple; the ULP may further adjust them to accommodate 2033 * additional protocol headers. We pass these values to the 2034 * ULP at bind time.
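 *
 * For example (illustrative numbers, not from any particular
 * driver): a driver advertising mdt_hdr_head = 18 is recorded
 * below as roundup(18, 4) = 20 bytes of leading header space,
 * so that any further ULP adjustment starts from a 32-bit
 * aligned quantity.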
2035 */ 2036 (*ill_mdt_capab)->ill_mdt_hdr_head = 2037 roundup(mdt_ic->mdt_hdr_head, 4); 2038 (*ill_mdt_capab)->ill_mdt_hdr_tail = 2039 roundup(mdt_ic->mdt_hdr_tail, 4); 2040 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 2041 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 2042 2043 ill->ill_capabilities |= ILL_CAPAB_MDT; 2044 } else { 2045 uint_t size; 2046 uchar_t *rptr; 2047 2048 size = sizeof (dl_capability_req_t) + 2049 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2050 2051 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2052 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2053 "could not enable MDT for %s (ENOMEM)\n", 2054 ill->ill_name); 2055 return; 2056 } 2057 2058 rptr = nmp->b_rptr; 2059 /* initialize dl_capability_req_t */ 2060 oc = (dl_capability_req_t *)nmp->b_rptr; 2061 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2062 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2063 sizeof (dl_capab_mdt_t); 2064 nmp->b_rptr += sizeof (dl_capability_req_t); 2065 2066 /* initialize dl_capability_sub_t */ 2067 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2068 nmp->b_rptr += sizeof (*isub); 2069 2070 /* initialize dl_capab_mdt_t */ 2071 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2072 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2073 2074 nmp->b_rptr = rptr; 2075 2076 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2077 "to enable MDT version %d\n", ill->ill_name, 2078 MDT_VERSION_2)); 2079 2080 /* set ENABLE flag */ 2081 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2082 2083 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2084 ill_dlpi_send(ill, nmp); 2085 } 2086 } 2087 2088 static void 2089 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2090 { 2091 mblk_t *mp; 2092 dl_capab_mdt_t *mdt_subcap; 2093 dl_capability_sub_t *dl_subcap; 2094 int size; 2095 2096 if (!ILL_MDT_CAPABLE(ill)) 2097 return; 2098 2099 ASSERT(ill->ill_mdt_capab != NULL); 2100 /* 2101 * Clear the capability flag for MDT but retain the ill_mdt_capab 2102 * structure since it's possible that another thread is still 2103 * referring to it. The structure only gets deallocated when 2104 * we destroy the ill. 2105 */ 2106 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2107 2108 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2109 2110 mp = allocb(size, BPRI_HI); 2111 if (mp == NULL) { 2112 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2113 "request to disable MDT\n")); 2114 return; 2115 } 2116 2117 mp->b_wptr = mp->b_rptr + size; 2118 2119 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2120 dl_subcap->dl_cap = DL_CAPAB_MDT; 2121 dl_subcap->dl_length = sizeof (*mdt_subcap); 2122 2123 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2124 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2125 mdt_subcap->mdt_flags = 0; 2126 mdt_subcap->mdt_hdr_head = 0; 2127 mdt_subcap->mdt_hdr_tail = 0; 2128 2129 if (*sc_mp != NULL) 2130 linkb(*sc_mp, mp); 2131 else 2132 *sc_mp = mp; 2133 } 2134 2135 /* 2136 * Send a DL_NOTIFY_REQ to the specified ill to enable 2137 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2138 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2139 * acceleration. 2140 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 
2141 */ 2142 static boolean_t 2143 ill_enable_promisc_notify(ill_t *ill) 2144 { 2145 mblk_t *mp; 2146 dl_notify_req_t *req; 2147 2148 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2149 2150 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2151 if (mp == NULL) 2152 return (B_FALSE); 2153 2154 req = (dl_notify_req_t *)mp->b_rptr; 2155 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2156 DL_NOTE_PROMISC_OFF_PHYS; 2157 2158 ill_dlpi_send(ill, mp); 2159 2160 return (B_TRUE); 2161 } 2162 2163 2164 /* 2165 * Allocate an IPsec capability request which will be filled by our 2166 * caller to turn on support for one or more algorithms. 2167 */ 2168 static mblk_t * 2169 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2170 { 2171 mblk_t *nmp; 2172 dl_capability_req_t *ocap; 2173 dl_capab_ipsec_t *ocip; 2174 dl_capab_ipsec_t *icip; 2175 uint8_t *ptr; 2176 icip = (dl_capab_ipsec_t *)(isub + 1); 2177 2178 /* 2179 * The first time around, we send a DL_NOTIFY_REQ to enable 2180 * PROMISC_ON/OFF notification from the provider. We need to 2181 * do this before enabling the algorithms to avoid leakage of 2182 * cleartext packets. 2183 */ 2184 2185 if (!ill_enable_promisc_notify(ill)) 2186 return (NULL); 2187 2188 /* 2189 * Allocate new mblk which will contain a new capability 2190 * request to enable the capabilities. 2191 */ 2192 2193 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2194 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2195 if (nmp == NULL) 2196 return (NULL); 2197 2198 ptr = nmp->b_rptr; 2199 2200 /* initialize dl_capability_req_t */ 2201 ocap = (dl_capability_req_t *)ptr; 2202 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2203 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2204 ptr += sizeof (dl_capability_req_t); 2205 2206 /* initialize dl_capability_sub_t */ 2207 bcopy(isub, ptr, sizeof (*isub)); 2208 ptr += sizeof (*isub); 2209 2210 /* initialize dl_capab_ipsec_t */ 2211 ocip = (dl_capab_ipsec_t *)ptr; 2212 bcopy(icip, ocip, sizeof (*icip)); 2213 2214 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2215 return (nmp); 2216 } 2217 2218 /* 2219 * Process an IPsec capability negotiation ack received from a DLS Provider. 2220 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2221 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2222 */ 2223 static void 2224 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2225 { 2226 dl_capab_ipsec_t *icip; 2227 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2228 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ 2229 uint_t cipher, nciphers; 2230 mblk_t *nmp; 2231 uint_t alg_len; 2232 boolean_t need_sadb_dump; 2233 uint_t sub_dl_cap = isub->dl_cap; 2234 ill_ipsec_capab_t **ill_capab; 2235 uint64_t ill_capab_flag; 2236 uint8_t *capend, *ciphend; 2237 boolean_t sadb_resync; 2238 2239 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2240 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2241 2242 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2243 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2244 ill_capab_flag = ILL_CAPAB_AH; 2245 } else { 2246 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2247 ill_capab_flag = ILL_CAPAB_ESP; 2248 } 2249 2250 /* 2251 * If the ill capability structure exists, then this incoming 2252 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2253 * If this is so, then we'd need to resynchronize the SADB 2254 * after re-enabling the offloaded ciphers. 
2255 */ 2256 sadb_resync = (*ill_capab != NULL); 2257 2258 /* 2259 * Note: range checks here are not absolutely sufficient to 2260 * make us robust against malformed messages sent by drivers; 2261 * this is in keeping with the rest of IP's dlpi handling. 2262 * (Remember, it's coming from something else in the kernel 2263 * address space) 2264 */ 2265 2266 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2267 if (capend > mp->b_wptr) { 2268 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2269 "malformed sub-capability too long for mblk"); 2270 return; 2271 } 2272 2273 /* 2274 * There are two types of acks we process here: 2275 * 1. acks in reply to a (first form) generic capability req 2276 * (no ENABLE flag set) 2277 * 2. acks in reply to an ENABLE capability req. 2278 * (ENABLE flag set) 2279 * 2280 * We process the subcapability passed as argument as follows: 2281 * 1 do initializations 2282 * 1.1 initialize nmp = NULL 2283 * 1.2 set need_sadb_dump to B_FALSE 2284 * 2 for each cipher in subcapability: 2285 * 2.1 if ENABLE flag is set: 2286 * 2.1.1 update per-ill ipsec capabilities info 2287 * 2.1.2 set need_sadb_dump to B_TRUE 2288 * 2.2 if ENABLE flag is not set: 2289 * 2.2.1 if nmp is NULL: 2290 * 2.2.1.1 allocate and initialize nmp 2291 * 2.2.1.2 init current pos in nmp 2292 * 2.2.2 copy current cipher to current pos in nmp 2293 * 2.2.3 set ENABLE flag in nmp 2294 * 2.2.4 update current pos 2295 * 3 if nmp is not equal to NULL, send enable request 2296 * 3.1 send capability request 2297 * 4 if need_sadb_dump is B_TRUE 2298 * 4.1 enable promiscuous on/off notifications 2299 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2300 * AH or ESP SAs to the interface. 2301 */ 2302 2303 nmp = NULL; 2304 oalg = NULL; 2305 need_sadb_dump = B_FALSE; 2306 icip = (dl_capab_ipsec_t *)(isub + 1); 2307 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2308 2309 nciphers = icip->cip_nciphers; 2310 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2311 2312 if (ciphend > capend) { 2313 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2314 "too many ciphers for sub-capability len"); 2315 return; 2316 } 2317 2318 for (cipher = 0; cipher < nciphers; cipher++) { 2319 alg_len = sizeof (dl_capab_ipsec_alg_t); 2320 2321 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2322 /* 2323 * TBD: when we provide a way to disable capabilities 2324 * from above, need to manage the request-pending state 2325 * and fail if we were not expecting this ACK.
2326 */ 2327 IPSECHW_DEBUG(IPSECHW_CAPAB, 2328 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2329 2330 /* 2331 * Update IPsec capabilities for this ill 2332 */ 2333 2334 if (*ill_capab == NULL) { 2335 IPSECHW_DEBUG(IPSECHW_CAPAB, 2336 ("ill_capability_ipsec_ack: " 2337 "allocating ipsec_capab for ill\n")); 2338 *ill_capab = ill_ipsec_capab_alloc(); 2339 2340 if (*ill_capab == NULL) { 2341 cmn_err(CE_WARN, 2342 "ill_capability_ipsec_ack: " 2343 "could not enable IPsec Hardware " 2344 "acceleration for %s (ENOMEM)\n", 2345 ill->ill_name); 2346 return; 2347 } 2348 } 2349 2350 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2351 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2352 2353 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2354 cmn_err(CE_WARN, 2355 "ill_capability_ipsec_ack: " 2356 "malformed IPsec algorithm id %d", 2357 ialg->alg_prim); 2358 continue; 2359 } 2360 2361 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2362 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2363 ialg->alg_prim); 2364 } else { 2365 ipsec_capab_algparm_t *alp; 2366 2367 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2368 ialg->alg_prim); 2369 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2370 ialg->alg_prim)) { 2371 cmn_err(CE_WARN, 2372 "ill_capability_ipsec_ack: " 2373 "no space for IPsec alg id %d", 2374 ialg->alg_prim); 2375 continue; 2376 } 2377 alp = &((*ill_capab)->encr_algparm[ 2378 ialg->alg_prim]); 2379 alp->minkeylen = ialg->alg_minbits; 2380 alp->maxkeylen = ialg->alg_maxbits; 2381 } 2382 ill->ill_capabilities |= ill_capab_flag; 2383 /* 2384 * indicate that a capability was enabled, which 2385 * will be used below to kick off a SADB dump 2386 * to the ill. 2387 */ 2388 need_sadb_dump = B_TRUE; 2389 } else { 2390 IPSECHW_DEBUG(IPSECHW_CAPAB, 2391 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2392 ialg->alg_prim)); 2393 2394 if (nmp == NULL) { 2395 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2396 if (nmp == NULL) { 2397 /* 2398 * Sending the PROMISC_ON/OFF 2399 * notification request failed. 2400 * We cannot enable the algorithms 2401 * since the Provider will not 2402 * notify IP of promiscuous mode 2403 * changes, which could lead 2404 * to leakage of packets. 2405 */ 2406 cmn_err(CE_WARN, 2407 "ill_capability_ipsec_ack: " 2408 "could not enable IPsec Hardware " 2409 "acceleration for %s (ENOMEM)\n", 2410 ill->ill_name); 2411 return; 2412 } 2413 /* ptr to current output alg specifier */ 2414 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2415 } 2416 2417 /* 2418 * Copy current alg specifier, set ENABLE 2419 * flag, and advance to next output alg. 2420 * For now we enable all IPsec capabilities. 2421 */ 2422 ASSERT(oalg != NULL); 2423 bcopy(ialg, oalg, alg_len); 2424 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2425 nmp->b_wptr += alg_len; 2426 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2427 } 2428 2429 /* move to next input algorithm specifier */ 2430 ialg = (dl_capab_ipsec_alg_t *) 2431 ((char *)ialg + alg_len); 2432 } 2433 2434 if (nmp != NULL) 2435 /* 2436 * nmp points to a DL_CAPABILITY_REQ message to enable 2437 * IPsec hardware acceleration. 2438 */ 2439 ill_dlpi_send(ill, nmp); 2440 2441 if (need_sadb_dump) 2442 /* 2443 * An acknowledgement corresponding to a request to 2444 * enable acceleration was received, notify SADB.
2445 */ 2446 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2447 } 2448 2449 /* 2450 * Given an mblk with enough space in it, create sub-capability entries for 2451 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2452 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2453 * in preparation for the reset DL_CAPABILITY_REQ message. 2454 */ 2455 static void 2456 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2457 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2458 { 2459 dl_capab_ipsec_t *oipsec; 2460 dl_capab_ipsec_alg_t *oalg; 2461 dl_capability_sub_t *dl_subcap; 2462 int i, k; 2463 2464 ASSERT(nciphers > 0); 2465 ASSERT(ill_cap != NULL); 2466 ASSERT(mp != NULL); 2467 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2468 2469 /* dl_capability_sub_t for "stype" */ 2470 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2471 dl_subcap->dl_cap = stype; 2472 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2473 mp->b_wptr += sizeof (dl_capability_sub_t); 2474 2475 /* dl_capab_ipsec_t for "stype" */ 2476 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2477 oipsec->cip_version = 1; 2478 oipsec->cip_nciphers = nciphers; 2479 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2480 2481 /* create entries for "stype" AUTH ciphers */ 2482 for (i = 0; i < ill_cap->algs_size; i++) { 2483 for (k = 0; k < BITSPERBYTE; k++) { 2484 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2485 continue; 2486 2487 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2488 bzero((void *)oalg, sizeof (*oalg)); 2489 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2490 oalg->alg_prim = k + (BITSPERBYTE * i); 2491 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2492 } 2493 } 2494 /* create entries for "stype" ENCR ciphers */ 2495 for (i = 0; i < ill_cap->algs_size; i++) { 2496 for (k = 0; k < BITSPERBYTE; k++) { 2497 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2498 continue; 2499 2500 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2501 bzero((void *)oalg, sizeof (*oalg)); 2502 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2503 oalg->alg_prim = k + (BITSPERBYTE * i); 2504 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2505 } 2506 } 2507 } 2508 2509 /* 2510 * Macro to count the number of 1s in a byte (8-bit word). The total count 2511 * is accumulated into the passed-in argument (sum). We could use SPARCv9's 2512 * POPC instruction, but our macro is more flexible for an arbitrary length 2513 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2514 * 256 bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2515 * stays that way, we can reduce the number of iterations required.
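 *
 * A worked example (illustrative only): COUNT_1S(0xAC, sum), with
 * 0xAC = binary 10101100, proceeds as
 *
 *	x = (0xAC & 0x55) + ((0xAC >> 1) & 0x55) = 0x04 + 0x54 = 0x58
 *	x = (0x58 & 0x33) + ((0x58 >> 2) & 0x33) = 0x10 + 0x12 = 0x22
 *	sum += (0x22 & 0xf) + ((0x22 >> 4) & 0xf) = 2 + 2 = 4
 *
 * i.e. the four 1-bits of 0xAC are first counted in bit-pairs, then
 * in nibbles, and finally summed.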
2516 */ 2517 #define COUNT_1S(val, sum) { \ 2518 uint8_t x = val & 0xff; \ 2519 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2520 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2521 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2522 } 2523 2524 /* ARGSUSED */ 2525 static void 2526 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2527 { 2528 mblk_t *mp; 2529 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2530 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2531 uint64_t ill_capabilities = ill->ill_capabilities; 2532 int ah_cnt = 0, esp_cnt = 0; 2533 int ah_len = 0, esp_len = 0; 2534 int i, size = 0; 2535 2536 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2537 return; 2538 2539 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2540 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2541 2542 /* Find out the number of ciphers for AH */ 2543 if (cap_ah != NULL) { 2544 for (i = 0; i < cap_ah->algs_size; i++) { 2545 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2546 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2547 } 2548 if (ah_cnt > 0) { 2549 size += sizeof (dl_capability_sub_t) + 2550 sizeof (dl_capab_ipsec_t); 2551 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2552 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2553 size += ah_len; 2554 } 2555 } 2556 2557 /* Find out the number of ciphers for ESP */ 2558 if (cap_esp != NULL) { 2559 for (i = 0; i < cap_esp->algs_size; i++) { 2560 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2561 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2562 } 2563 if (esp_cnt > 0) { 2564 size += sizeof (dl_capability_sub_t) + 2565 sizeof (dl_capab_ipsec_t); 2566 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2567 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2568 size += esp_len; 2569 } 2570 } 2571 2572 if (size == 0) { 2573 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2574 "there's nothing to reset\n")); 2575 return; 2576 } 2577 2578 mp = allocb(size, BPRI_HI); 2579 if (mp == NULL) { 2580 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2581 "request to disable IPSEC Hardware Acceleration\n")); 2582 return; 2583 } 2584 2585 /* 2586 * Clear the capability flags for IPSec HA but retain the ill 2587 * capability structures since it's possible that another thread 2588 * is still referring to them. The structures only get deallocated 2589 * when we destroy the ill. 2590 * 2591 * Various places check the flags to see if the ill is capable of 2592 * hardware acceleration, and by clearing them we ensure that new 2593 * outbound IPSec packets are sent down encrypted. 2594 */ 2595 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2596 2597 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2598 if (ah_cnt > 0) { 2599 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2600 cap_ah, mp); 2601 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2602 } 2603 2604 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2605 if (esp_cnt > 0) { 2606 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2607 cap_esp, mp); 2608 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2609 } 2610 2611 /* 2612 * At this point we've composed a bunch of sub-capabilities to be 2613 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2614 * by the caller. Upon receiving this reset message, the driver 2615 * must stop inbound decryption (by destroying all inbound SAs) 2616 * and let the corresponding packets come in encrypted. 
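 *
 * Schematically (a sketch; actual sizes depend on the advertised
 * algorithms), the sub-capability portion composed above looks like
 *
 *	[dl_capability_sub_t AH ][dl_capab_ipsec_t][alg][alg]...
 *	[dl_capability_sub_t ESP][dl_capab_ipsec_t][alg][alg]...
 *
 * with DL_CAPAB_ALG_ENABLE cleared in every alg entry, since
 * ill_fill_ipsec_reset() bzero()s each entry before filling it in.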
2617 */ 2618 2619 if (*sc_mp != NULL) 2620 linkb(*sc_mp, mp); 2621 else 2622 *sc_mp = mp; 2623 } 2624 2625 static void 2626 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2627 boolean_t encapsulated) 2628 { 2629 boolean_t legacy = B_FALSE; 2630 2631 /* 2632 * If this DL_CAPABILITY_ACK came in as a response to our "reset" 2633 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just 2634 * instructed the driver to disable its advertised capabilities, 2635 * so there's no point in accepting any response at this moment. 2636 */ 2637 if (ill->ill_capab_state == IDMS_UNKNOWN) 2638 return; 2639 2640 /* 2641 * Note that only the following two sub-capabilities may be 2642 * considered as "legacy", since their original definitions 2643 * do not incorporate the dl_mid_t module ID token, and hence 2644 * may require the use of the wrapper sub-capability. 2645 */ 2646 switch (subp->dl_cap) { 2647 case DL_CAPAB_IPSEC_AH: 2648 case DL_CAPAB_IPSEC_ESP: 2649 legacy = B_TRUE; 2650 break; 2651 } 2652 2653 /* 2654 * For legacy sub-capabilities which don't incorporate a queue_t 2655 * pointer in their structures, discard them if we detect that 2656 * there are intermediate modules in between IP and the driver. 2657 */ 2658 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2659 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2660 "%d discarded; %d module(s) present below IP\n", 2661 subp->dl_cap, ill->ill_lmod_cnt)); 2662 return; 2663 } 2664 2665 switch (subp->dl_cap) { 2666 case DL_CAPAB_IPSEC_AH: 2667 case DL_CAPAB_IPSEC_ESP: 2668 ill_capability_ipsec_ack(ill, mp, subp); 2669 break; 2670 case DL_CAPAB_MDT: 2671 ill_capability_mdt_ack(ill, mp, subp); 2672 break; 2673 case DL_CAPAB_HCKSUM: 2674 ill_capability_hcksum_ack(ill, mp, subp); 2675 break; 2676 case DL_CAPAB_ZEROCOPY: 2677 ill_capability_zerocopy_ack(ill, mp, subp); 2678 break; 2679 case DL_CAPAB_POLL: 2680 if (!SOFT_RINGS_ENABLED()) 2681 ill_capability_dls_ack(ill, mp, subp); 2682 break; 2683 case DL_CAPAB_SOFT_RING: 2684 if (SOFT_RINGS_ENABLED()) 2685 ill_capability_dls_ack(ill, mp, subp); 2686 break; 2687 default: 2688 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2689 subp->dl_cap)); 2690 } 2691 } 2692 2693 /* 2694 * As part of negotiating polling capability, the driver tells us 2695 * the default (or normal) blanking interval and packet threshold 2696 * (the receive timer fires if the blanking interval is reached or 2697 * the packet threshold is reached). 2698 * 2699 * As part of manipulating the polling interval, we always use our 2700 * estimated interval (avg service time * number of packets queued 2701 * on the squeue) but we try to blank for a minimum of 2702 * rr_normal_blank_time * rr_max_blank_ratio. We disable the 2703 * packet threshold during this time. When we are not in polling mode 2704 * we typically set the blank interval lower, rr_normal_blank_time * 2705 * rr_min_blank_ratio, but raise the packet count by a ratio of 2706 * rr_min_pkt_cnt_ratio so that we still get chains if 2707 * possible, although over a shorter interval. 2708 */ 2709 #define RR_MAX_BLANK_RATIO 20 2710 #define RR_MIN_BLANK_RATIO 10 2711 #define RR_MAX_PKT_CNT_RATIO 3 2712 #define RR_MIN_PKT_CNT_RATIO 3 2713 2714 /* 2715 * These can be tuned via /etc/system.
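 *
 * An illustrative example (assumed driver values, not from any spec):
 * if a driver advertises mrf_normal_blank_time of 128 usecs and
 * mrf_normal_pkt_count of 8, ill_ring_add() below records
 *
 *	rr_max_blank_time = 128 * rr_max_blank_ratio (20) = 2560 usecs
 *	rr_min_blank_time = 128 * rr_min_blank_ratio (10) = 1280 usecs
 *	rr_max_pkt_cnt	  =   8 * rr_max_pkt_cnt_ratio (3) =   24 packets
 *	rr_min_pkt_cnt	  =   8 * rr_min_pkt_cnt_ratio (3) =   24 packets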
2716 */ 2717 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; 2718 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; 2719 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; 2720 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; 2721 2722 static mac_resource_handle_t 2723 ill_ring_add(void *arg, mac_resource_t *mrp) 2724 { 2725 ill_t *ill = (ill_t *)arg; 2726 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2727 ill_rx_ring_t *rx_ring; 2728 int ip_rx_index; 2729 2730 ASSERT(mrp != NULL); 2731 if (mrp->mr_type != MAC_RX_FIFO) { 2732 return (NULL); 2733 } 2734 ASSERT(ill != NULL); 2735 ASSERT(ill->ill_dls_capab != NULL); 2736 2737 mutex_enter(&ill->ill_lock); 2738 for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { 2739 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; 2740 ASSERT(rx_ring != NULL); 2741 2742 if (rx_ring->rr_ring_state == ILL_RING_FREE) { 2743 time_t normal_blank_time = 2744 mrfp->mrf_normal_blank_time; 2745 uint_t normal_pkt_cnt = 2746 mrfp->mrf_normal_pkt_count; 2747 2748 bzero(rx_ring, sizeof (ill_rx_ring_t)); 2749 2750 rx_ring->rr_blank = mrfp->mrf_blank; 2751 rx_ring->rr_handle = mrfp->mrf_arg; 2752 rx_ring->rr_ill = ill; 2753 rx_ring->rr_normal_blank_time = normal_blank_time; 2754 rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; 2755 2756 rx_ring->rr_max_blank_time = 2757 normal_blank_time * rr_max_blank_ratio; 2758 rx_ring->rr_min_blank_time = 2759 normal_blank_time * rr_min_blank_ratio; 2760 rx_ring->rr_max_pkt_cnt = 2761 normal_pkt_cnt * rr_max_pkt_cnt_ratio; 2762 rx_ring->rr_min_pkt_cnt = 2763 normal_pkt_cnt * rr_min_pkt_cnt_ratio; 2764 2765 rx_ring->rr_ring_state = ILL_RING_INUSE; 2766 mutex_exit(&ill->ill_lock); 2767 2768 DTRACE_PROBE2(ill__ring__add, (void *), ill, 2769 (int), ip_rx_index); 2770 return ((mac_resource_handle_t)rx_ring); 2771 } 2772 } 2773 2774 /* 2775 * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If 2776 * we have devices which can overwhelm this limit, ILL_MAX_RINGS 2777 * should be made configurable. Meanwhile this causes no panic, 2778 * because the driver will pass ip_input a NULL handle, which makes 2779 * IP allocate the default squeue; polling mode will then not 2780 * be used for this ring.
2781 */ 2782 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2783 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2784 2785 mutex_exit(&ill->ill_lock); 2786 return (NULL); 2787 } 2788 2789 static boolean_t 2790 ill_capability_dls_init(ill_t *ill) 2791 { 2792 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2793 conn_t *connp; 2794 size_t sz; 2795 2796 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2797 if (ill_dls == NULL) { 2798 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2799 "soft_ring enabled for ill=%s (%p) but data " 2800 "structs uninitialized\n", ill->ill_name, 2801 (void *)ill); 2802 } 2803 return (B_TRUE); 2804 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2805 if (ill_dls == NULL) { 2806 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2807 "polling enabled for ill=%s (%p) but data " 2808 "structs uninitialized\n", ill->ill_name, 2809 (void *)ill); 2810 } 2811 return (B_TRUE); 2812 } 2813 2814 if (ill_dls != NULL) { 2815 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2816 /* Soft_Ring or polling is being re-enabled */ 2817 2818 connp = ill_dls->ill_unbind_conn; 2819 ASSERT(rx_ring != NULL); 2820 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2821 bzero((void *)rx_ring, 2822 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2823 ill_dls->ill_ring_tbl = rx_ring; 2824 ill_dls->ill_unbind_conn = connp; 2825 return (B_TRUE); 2826 } 2827 2828 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 2829 return (B_FALSE); 2830 2831 sz = sizeof (ill_dls_capab_t); 2832 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2833 2834 ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2835 if (ill_dls == NULL) { 2836 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2837 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2838 (void *)ill); 2839 CONN_DEC_REF(connp); 2840 return (B_FALSE); 2841 } 2842 2843 /* Allocate space to hold ring table */ 2844 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2845 ill->ill_dls_capab = ill_dls; 2846 ill_dls->ill_unbind_conn = connp; 2847 return (B_TRUE); 2848 } 2849 2850 /* 2851 * ill_capability_dls_disable: disable soft_ring and/or polling 2852 * capability. Since any of the rings might already be in use, need 2853 * to call ipsq_clean_all() which gets behind the squeue to disable 2854 * direct calls if necessary. 
2855 */ 2856 static void 2857 ill_capability_dls_disable(ill_t *ill) 2858 { 2859 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2860 2861 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2862 ipsq_clean_all(ill); 2863 ill_dls->ill_tx = NULL; 2864 ill_dls->ill_tx_handle = NULL; 2865 ill_dls->ill_dls_change_status = NULL; 2866 ill_dls->ill_dls_bind = NULL; 2867 ill_dls->ill_dls_unbind = NULL; 2868 } 2869 2870 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2871 } 2872 2873 static void 2874 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2875 dl_capability_sub_t *isub) 2876 { 2877 uint_t size; 2878 uchar_t *rptr; 2879 dl_capab_dls_t dls, *odls; 2880 ill_dls_capab_t *ill_dls; 2881 mblk_t *nmp = NULL; 2882 dl_capability_req_t *ocap; 2883 uint_t sub_dl_cap = isub->dl_cap; 2884 2885 if (!ill_capability_dls_init(ill)) 2886 return; 2887 ill_dls = ill->ill_dls_capab; 2888 2889 /* Copy locally to get the members aligned */ 2890 bcopy((void *)idls, (void *)&dls, 2891 sizeof (dl_capab_dls_t)); 2892 2893 /* Get the tx function and handle from dld */ 2894 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2895 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2896 2897 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2898 ill_dls->ill_dls_change_status = 2899 (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2900 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2901 ill_dls->ill_dls_unbind = 2902 (ip_dls_unbind_t)dls.dls_ring_unbind; 2903 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2904 } 2905 2906 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2907 isub->dl_length; 2908 2909 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2910 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2911 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2912 ill->ill_name, (void *)ill); 2913 return; 2914 } 2915 2916 /* initialize dl_capability_req_t */ 2917 rptr = nmp->b_rptr; 2918 ocap = (dl_capability_req_t *)rptr; 2919 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2920 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2921 rptr += sizeof (dl_capability_req_t); 2922 2923 /* initialize dl_capability_sub_t */ 2924 bcopy(isub, rptr, sizeof (*isub)); 2925 rptr += sizeof (*isub); 2926 2927 odls = (dl_capab_dls_t *)rptr; 2928 rptr += sizeof (dl_capab_dls_t); 2929 2930 /* initialize dl_capab_dls_t to be sent down */ 2931 dls.dls_rx_handle = (uintptr_t)ill; 2932 dls.dls_rx = (uintptr_t)ip_input; 2933 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2934 2935 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2936 dls.dls_ring_cnt = ip_soft_rings_cnt; 2937 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2938 dls.dls_flags = SOFT_RING_ENABLE; 2939 } else { 2940 dls.dls_flags = POLL_ENABLE; 2941 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2942 "to enable polling\n", ill->ill_name)); 2943 } 2944 bcopy((void *)&dls, (void *)odls, 2945 sizeof (dl_capab_dls_t)); 2946 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2947 /* 2948 * nmp points to a DL_CAPABILITY_REQ message to 2949 * enable either soft_ring or polling 2950 */ 2951 ill_dlpi_send(ill, nmp); 2952 } 2953 2954 static void 2955 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2956 { 2957 mblk_t *mp; 2958 dl_capab_dls_t *idls; 2959 dl_capability_sub_t *dl_subcap; 2960 int size; 2961 2962 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2963 return; 2964 2965 ASSERT(ill->ill_dls_capab != NULL); 2966 2967 size = sizeof (*dl_subcap) + sizeof (*idls); 2968 2969 mp = allocb(size, BPRI_HI); 2970 if (mp 
== NULL) { 2971 ip1dbg(("ill_capability_dls_reset: unable to allocate " 2972 "request to disable soft_ring\n")); 2973 return; 2974 } 2975 2976 mp->b_wptr = mp->b_rptr + size; 2977 2978 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2979 dl_subcap->dl_length = sizeof (*idls); 2980 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2981 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; 2982 else 2983 dl_subcap->dl_cap = DL_CAPAB_POLL; 2984 2985 idls = (dl_capab_dls_t *)(dl_subcap + 1); 2986 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2987 idls->dls_flags = SOFT_RING_DISABLE; 2988 else 2989 idls->dls_flags = POLL_DISABLE; 2990 2991 if (*sc_mp != NULL) 2992 linkb(*sc_mp, mp); 2993 else 2994 *sc_mp = mp; 2995 } 2996 2997 /* 2998 * Process a soft_ring/poll capability negotiation ack received 2999 * from a DLS Provider. isub must point to the sub-capability 3000 * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. 3001 */ 3002 static void 3003 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3004 { 3005 dl_capab_dls_t *idls; 3006 uint_t sub_dl_cap = isub->dl_cap; 3007 uint8_t *capend; 3008 3009 ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || 3010 sub_dl_cap == DL_CAPAB_POLL); 3011 3012 if (ill->ill_isv6) 3013 return; 3014 3015 /* 3016 * Note: range checks here are not absolutely sufficient to 3017 * make us robust against malformed messages sent by drivers; 3018 * this is in keeping with the rest of IP's dlpi handling. 3019 * (Remember, it's coming from something else in the kernel 3020 * address space) 3021 */ 3022 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3023 if (capend > mp->b_wptr) { 3024 cmn_err(CE_WARN, "ill_capability_dls_ack: " 3025 "malformed sub-capability too long for mblk"); 3026 return; 3027 } 3028 3029 /* 3030 * There are two types of acks we process here: 3031 * 1. acks in reply to a (first form) generic capability req 3032 * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) 3033 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE 3034 * capability req. 3035 */ 3036 idls = (dl_capab_dls_t *)(isub + 1); 3037 3038 if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { 3039 ip1dbg(("ill_capability_dls_ack: mid token for dls " 3040 "capability isn't as expected; pass-thru " 3041 "module(s) detected, discarding capability\n")); 3042 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 3043 /* 3044 * This is a capability renegotiation case. 3045 * The interface had better be unusable at this 3046 * point; otherwise bad things will happen 3047 * if we disable direct calls on a running 3048 * and up interface. 3049 */ 3050 ill_capability_dls_disable(ill); 3051 } 3052 return; 3053 } 3054 3055 switch (idls->dls_flags) { 3056 default: 3057 /* Disable if unknown flag */ 3058 case SOFT_RING_DISABLE: 3059 case POLL_DISABLE: 3060 ill_capability_dls_disable(ill); 3061 break; 3062 case SOFT_RING_CAPABLE: 3063 case POLL_CAPABLE: 3064 /* 3065 * If the capability was already enabled, it's safe 3066 * to disable it first to get rid of stale information 3067 * and then start enabling it again.
3068 */ 3069 ill_capability_dls_disable(ill); 3070 ill_capability_dls_capable(ill, idls, isub); 3071 break; 3072 case SOFT_RING_ENABLE: 3073 case POLL_ENABLE: 3074 mutex_enter(&ill->ill_lock); 3075 if (sub_dl_cap == DL_CAPAB_SOFT_RING && 3076 !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { 3077 ASSERT(ill->ill_dls_capab != NULL); 3078 ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; 3079 } 3080 if (sub_dl_cap == DL_CAPAB_POLL && 3081 !(ill->ill_capabilities & ILL_CAPAB_POLL)) { 3082 ASSERT(ill->ill_dls_capab != NULL); 3083 ill->ill_capabilities |= ILL_CAPAB_POLL; 3084 ip1dbg(("ill_capability_dls_ack: interface %s " 3085 "has enabled polling\n", ill->ill_name)); 3086 } 3087 mutex_exit(&ill->ill_lock); 3088 break; 3089 } 3090 } 3091 3092 /* 3093 * Process a hardware checksum offload capability negotiation ack received 3094 * from a DLS Provider. isub must point to the sub-capability (DL_CAPAB_HCKSUM) 3095 * of a DL_CAPABILITY_ACK message. 3096 */ 3097 static void 3098 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3099 { 3100 dl_capability_req_t *ocap; 3101 dl_capab_hcksum_t *ihck, *ohck; 3102 ill_hcksum_capab_t **ill_hcksum; 3103 mblk_t *nmp = NULL; 3104 uint_t sub_dl_cap = isub->dl_cap; 3105 uint8_t *capend; 3106 3107 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 3108 3109 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 3110 3111 /* 3112 * Note: range checks here are not absolutely sufficient to 3113 * make us robust against malformed messages sent by drivers; 3114 * this is in keeping with the rest of IP's dlpi handling. 3115 * (Remember, it's coming from something else in the kernel 3116 * address space) 3117 */ 3118 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3119 if (capend > mp->b_wptr) { 3120 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3121 "malformed sub-capability too long for mblk"); 3122 return; 3123 } 3124 3125 /* 3126 * There are two types of acks we process here: 3127 * 1. acks in reply to a (first form) generic capability req 3128 * (no ENABLE flag set) 3129 * 2. acks in reply to an ENABLE capability req.
3130 * (ENABLE flag set) 3131 */ 3132 ihck = (dl_capab_hcksum_t *)(isub + 1); 3133 3134 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3135 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3136 "unsupported hardware checksum " 3137 "sub-capability (version %d, expected %d)", 3138 ihck->hcksum_version, HCKSUM_VERSION_1); 3139 return; 3140 } 3141 3142 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3143 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3144 "checksum capability isn't as expected; pass-thru " 3145 "module(s) detected, discarding capability\n")); 3146 return; 3147 } 3148 3149 #define CURR_HCKSUM_CAPAB \ 3150 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3151 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3152 3153 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3154 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3155 /* do ENABLE processing */ 3156 if (*ill_hcksum == NULL) { 3157 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3158 KM_NOSLEEP); 3159 3160 if (*ill_hcksum == NULL) { 3161 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3162 "could not enable hcksum version %d " 3163 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3164 ill->ill_name); 3165 return; 3166 } 3167 } 3168 3169 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3170 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3171 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3172 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3173 "has enabled hardware checksumming\n ", 3174 ill->ill_name)); 3175 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3176 /* 3177 * Enabling hardware checksum offload 3178 * Currently IP supports {TCP,UDP}/IPv4 3179 * partial and full cksum offload and 3180 * IPv4 header checksum offload. 3181 * Allocate new mblk which will 3182 * contain a new capability request 3183 * to enable hardware checksum offload. 3184 */ 3185 uint_t size; 3186 uchar_t *rptr; 3187 3188 size = sizeof (dl_capability_req_t) + 3189 sizeof (dl_capability_sub_t) + isub->dl_length; 3190 3191 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3192 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3193 "could not enable hardware cksum for %s (ENOMEM)\n", 3194 ill->ill_name); 3195 return; 3196 } 3197 3198 rptr = nmp->b_rptr; 3199 /* initialize dl_capability_req_t */ 3200 ocap = (dl_capability_req_t *)nmp->b_rptr; 3201 ocap->dl_sub_offset = 3202 sizeof (dl_capability_req_t); 3203 ocap->dl_sub_length = 3204 sizeof (dl_capability_sub_t) + 3205 isub->dl_length; 3206 nmp->b_rptr += sizeof (dl_capability_req_t); 3207 3208 /* initialize dl_capability_sub_t */ 3209 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3210 nmp->b_rptr += sizeof (*isub); 3211 3212 /* initialize dl_capab_hcksum_t */ 3213 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3214 bcopy(ihck, ohck, sizeof (*ihck)); 3215 3216 nmp->b_rptr = rptr; 3217 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3218 3219 /* Set ENABLE flag */ 3220 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3221 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3222 3223 /* 3224 * nmp points to a DL_CAPABILITY_REQ message to enable 3225 * hardware checksum acceleration. 
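 *
 * Schematically (a sketch of the message composed above), nmp is
 *
 *	[dl_capability_req_t][dl_capability_sub_t][dl_capab_hcksum_t]
 *
 * where the dl_capab_hcksum_t is a copy of the driver's offer with
 * hcksum_txflags masked down to CURR_HCKSUM_CAPAB and HCKSUM_ENABLE
 * set.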
3226 */ 3227 ill_dlpi_send(ill, nmp); 3228 } else { 3229 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3230 "advertised %x hardware checksum capability flags\n", 3231 ill->ill_name, ihck->hcksum_txflags)); 3232 } 3233 } 3234 3235 static void 3236 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3237 { 3238 mblk_t *mp; 3239 dl_capab_hcksum_t *hck_subcap; 3240 dl_capability_sub_t *dl_subcap; 3241 int size; 3242 3243 if (!ILL_HCKSUM_CAPABLE(ill)) 3244 return; 3245 3246 ASSERT(ill->ill_hcksum_capab != NULL); 3247 /* 3248 * Clear the capability flag for hardware checksum offload but 3249 * retain the ill_hcksum_capab structure since it's possible that 3250 * another thread is still referring to it. The structure only 3251 * gets deallocated when we destroy the ill. 3252 */ 3253 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3254 3255 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3256 3257 mp = allocb(size, BPRI_HI); 3258 if (mp == NULL) { 3259 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3260 "request to disable hardware checksum offload\n")); 3261 return; 3262 } 3263 3264 mp->b_wptr = mp->b_rptr + size; 3265 3266 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3267 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3268 dl_subcap->dl_length = sizeof (*hck_subcap); 3269 3270 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3271 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3272 hck_subcap->hcksum_txflags = 0; 3273 3274 if (*sc_mp != NULL) 3275 linkb(*sc_mp, mp); 3276 else 3277 *sc_mp = mp; 3278 } 3279 3280 static void 3281 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3282 { 3283 mblk_t *nmp = NULL; 3284 dl_capability_req_t *oc; 3285 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3286 ill_zerocopy_capab_t **ill_zerocopy_capab; 3287 uint_t sub_dl_cap = isub->dl_cap; 3288 uint8_t *capend; 3289 3290 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3291 3292 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 3293 3294 /* 3295 * Note: range checks here are not absolutely sufficient to 3296 * make us robust against malformed messages sent by drivers; 3297 * this is in keeping with the rest of IP's dlpi handling. 
3298 * (Remember, it's coming from something else in the kernel 3299 * address space) 3300 */ 3301 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3302 if (capend > mp->b_wptr) { 3303 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3304 "malformed sub-capability too long for mblk"); 3305 return; 3306 } 3307 3308 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3309 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3310 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3311 "unsupported ZEROCOPY sub-capability (version %d, " 3312 "expected %d)", zc_ic->zerocopy_version, 3313 ZEROCOPY_VERSION_1); 3314 return; 3315 } 3316 3317 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3318 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3319 "capability isn't as expected; pass-thru module(s) " 3320 "detected, discarding capability\n")); 3321 return; 3322 } 3323 3324 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3325 if (*ill_zerocopy_capab == NULL) { 3326 *ill_zerocopy_capab = 3327 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3328 KM_NOSLEEP); 3329 3330 if (*ill_zerocopy_capab == NULL) { 3331 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3332 "could not enable Zero-copy version %d " 3333 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3334 ill->ill_name); 3335 return; 3336 } 3337 } 3338 3339 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3340 "supports Zero-copy version %d\n", ill->ill_name, 3341 ZEROCOPY_VERSION_1)); 3342 3343 (*ill_zerocopy_capab)->ill_zerocopy_version = 3344 zc_ic->zerocopy_version; 3345 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3346 zc_ic->zerocopy_flags; 3347 3348 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3349 } else { 3350 uint_t size; 3351 uchar_t *rptr; 3352 3353 size = sizeof (dl_capability_req_t) + 3354 sizeof (dl_capability_sub_t) + 3355 sizeof (dl_capab_zerocopy_t); 3356 3357 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3358 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3359 "could not enable zerocopy for %s (ENOMEM)\n", 3360 ill->ill_name); 3361 return; 3362 } 3363 3364 rptr = nmp->b_rptr; 3365 /* initialize dl_capability_req_t */ 3366 oc = (dl_capability_req_t *)rptr; 3367 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3368 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3369 sizeof (dl_capab_zerocopy_t); 3370 rptr += sizeof (dl_capability_req_t); 3371 3372 /* initialize dl_capability_sub_t */ 3373 bcopy(isub, rptr, sizeof (*isub)); 3374 rptr += sizeof (*isub); 3375 3376 /* initialize dl_capab_zerocopy_t */ 3377 zc_oc = (dl_capab_zerocopy_t *)rptr; 3378 *zc_oc = *zc_ic; 3379 3380 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3381 "to enable zero-copy version %d\n", ill->ill_name, 3382 ZEROCOPY_VERSION_1)); 3383 3384 /* set VMSAFE_MEM flag */ 3385 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3386 3387 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3388 ill_dlpi_send(ill, nmp); 3389 } 3390 } 3391 3392 static void 3393 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3394 { 3395 mblk_t *mp; 3396 dl_capab_zerocopy_t *zerocopy_subcap; 3397 dl_capability_sub_t *dl_subcap; 3398 int size; 3399 3400 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3401 return; 3402 3403 ASSERT(ill->ill_zerocopy_capab != NULL); 3404 /* 3405 * Clear the capability flag for Zero-copy but retain the 3406 * ill_zerocopy_capab structure since it's possible that another 3407 * thread is still referring to it. The structure only gets 3408 * deallocated when we destroy the ill. 
3409 */ 3410 ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; 3411 3412 size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 3413 3414 mp = allocb(size, BPRI_HI); 3415 if (mp == NULL) { 3416 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " 3417 "request to disable Zero-copy\n")); 3418 return; 3419 } 3420 3421 mp->b_wptr = mp->b_rptr + size; 3422 3423 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3424 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 3425 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 3426 3427 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 3428 zerocopy_subcap->zerocopy_version = 3429 ill->ill_zerocopy_capab->ill_zerocopy_version; 3430 zerocopy_subcap->zerocopy_flags = 0; 3431 3432 if (*sc_mp != NULL) 3433 linkb(*sc_mp, mp); 3434 else 3435 *sc_mp = mp; 3436 } 3437 3438 /* 3439 * Consume a new-style hardware capabilities negotiation ack. 3440 * Called from ip_rput_dlpi_writer(). 3441 */ 3442 void 3443 ill_capability_ack(ill_t *ill, mblk_t *mp) 3444 { 3445 dl_capability_ack_t *capp; 3446 dl_capability_sub_t *subp, *endp; 3447 3448 if (ill->ill_capab_state == IDMS_INPROGRESS) 3449 ill->ill_capab_state = IDMS_OK; 3450 3451 capp = (dl_capability_ack_t *)mp->b_rptr; 3452 3453 if (capp->dl_sub_length == 0) 3454 /* no new-style capabilities */ 3455 return; 3456 3457 /* make sure the driver supplied correct dl_sub_length */ 3458 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3459 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3460 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3461 return; 3462 } 3463 3464 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3465 /* 3466 * There are sub-capabilities. Process the ones we know about. 3467 * Loop until we don't have room for another sub-cap header. 3468 */ 3469 for (subp = SC(capp, capp->dl_sub_offset), 3470 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3471 subp <= endp; 3472 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3473 3474 switch (subp->dl_cap) { 3475 case DL_CAPAB_ID_WRAPPER: 3476 ill_capability_id_ack(ill, mp, subp); 3477 break; 3478 default: 3479 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3480 break; 3481 } 3482 } 3483 #undef SC 3484 } 3485 3486 /* 3487 * This routine is called to scan the fragmentation reassembly table for 3488 * the specified ILL for any packets that are starting to smell. 3489 * dead_interval is the maximum time in seconds that will be tolerated. It 3490 * will either be the value specified in ip_g_frag_timeout, or zero if the 3491 * ILL is shutting down and it is time to blow everything off. 3492 * 3493 * It returns the number of seconds (as a time_t) that the next frag timer 3494 * should be scheduled for, 0 meaning that the timer doesn't need to be 3495 * re-started. Note that the method of calculating next_timeout isn't 3496 * entirely accurate since time will flow between the time we grab 3497 * current_time and the time we schedule the next timeout. This isn't a 3498 * big problem since this is the timer for sending ICMP reassembly time 3499 * exceeded messages, and it doesn't have to be exactly accurate. 3500 * 3501 * This function is 3502 * sometimes called as writer, although this is not required.
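 *
 * A worked example (illustrative): with a dead_interval of 60 seconds,
 * a fragment chain whose ipf_timestamp is 45 seconds old has
 * frag_time = 45 < 60, so it survives and contributes a candidate
 * next_timeout of 60 - 45 = 15 seconds; a chain 75 seconds old is
 * torn down and, if its first fragment was seen (ipf_nf_hdr_len != 0),
 * queued for an ICMP "reassembly time exceeded" message.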
3503 */ 3504 time_t 3505 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3506 { 3507 ipfb_t *ipfb; 3508 ipfb_t *endp; 3509 ipf_t *ipf; 3510 ipf_t *ipfnext; 3511 mblk_t *mp; 3512 time_t current_time = gethrestime_sec(); 3513 time_t next_timeout = 0; 3514 uint32_t hdr_length; 3515 mblk_t *send_icmp_head; 3516 mblk_t *send_icmp_head_v6; 3517 3518 ipfb = ill->ill_frag_hash_tbl; 3519 if (ipfb == NULL) 3520 return (0); 3521 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3522 /* Walk the frag hash table. */ 3523 for (; ipfb < endp; ipfb++) { 3524 send_icmp_head = NULL; 3525 send_icmp_head_v6 = NULL; 3526 mutex_enter(&ipfb->ipfb_lock); 3527 while ((ipf = ipfb->ipfb_ipf) != NULL) { 3528 time_t frag_time = current_time - ipf->ipf_timestamp; 3529 time_t frag_timeout; 3530 3531 if (frag_time < dead_interval) { 3532 /* 3533 * There are some outstanding fragments 3534 * that will time out later. Make note of 3535 * the time so that we can reschedule the 3536 * next timeout appropriately. 3537 */ 3538 frag_timeout = dead_interval - frag_time; 3539 if (next_timeout == 0 || 3540 frag_timeout < next_timeout) { 3541 next_timeout = frag_timeout; 3542 } 3543 break; 3544 } 3545 /* Time's up. Get it out of here. */ 3546 hdr_length = ipf->ipf_nf_hdr_len; 3547 ipfnext = ipf->ipf_hash_next; 3548 if (ipfnext) 3549 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3550 *ipf->ipf_ptphn = ipfnext; 3551 mp = ipf->ipf_mp->b_cont; 3552 for (; mp; mp = mp->b_cont) { 3553 /* Extra points for neatness. */ 3554 IP_REASS_SET_START(mp, 0); 3555 IP_REASS_SET_END(mp, 0); 3556 } 3557 mp = ipf->ipf_mp->b_cont; 3558 ill->ill_frag_count -= ipf->ipf_count; 3559 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3560 ipfb->ipfb_count -= ipf->ipf_count; 3561 ASSERT(ipfb->ipfb_frag_pkts > 0); 3562 ipfb->ipfb_frag_pkts--; 3563 /* 3564 * We do not send any icmp message from here because 3565 * we currently are holding the ipfb_lock for this 3566 * hash chain. If we try to send any icmp messages 3567 * from here we may end up via a put back into ip 3568 * trying to get the same lock, causing a recursive 3569 * mutex panic. Instead we build a list and send all 3570 * the icmp messages after we have dropped the lock. 3571 */ 3572 if (ill->ill_isv6) { 3573 BUMP_MIB(ill->ill_ip6_mib, ipv6ReasmFails); 3574 if (hdr_length != 0) { 3575 mp->b_next = send_icmp_head_v6; 3576 send_icmp_head_v6 = mp; 3577 } else { 3578 freemsg(mp); 3579 } 3580 } else { 3581 BUMP_MIB(&ip_mib, ipReasmFails); 3582 if (hdr_length != 0) { 3583 mp->b_next = send_icmp_head; 3584 send_icmp_head = mp; 3585 } else { 3586 freemsg(mp); 3587 } 3588 } 3589 freeb(ipf->ipf_mp); 3590 } 3591 mutex_exit(&ipfb->ipfb_lock); 3592 /* 3593 * Now need to send any icmp messages that we delayed from 3594 * above. 3595 */ 3596 while (send_icmp_head_v6 != NULL) { 3597 mp = send_icmp_head_v6; 3598 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3599 mp->b_next = NULL; 3600 icmp_time_exceeded_v6(ill->ill_wq, mp, 3601 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, B_FALSE); 3602 } 3603 while (send_icmp_head != NULL) { 3604 mp = send_icmp_head; 3605 send_icmp_head = send_icmp_head->b_next; 3606 mp->b_next = NULL; 3607 icmp_time_exceeded(ill->ill_wq, mp, 3608 ICMP_REASSEMBLY_TIME_EXCEEDED); 3609 } 3610 } 3611 /* 3612 * A non-dying ILL will use the return value to decide whether to 3613 * restart the frag timer, and for how long. 3614 */ 3615 return (next_timeout); 3616 } 3617 3618 /* 3619 * This routine is called when the approximate count of mblk memory used 3620 * for the specified ILL has exceeded max_count.
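 *
 * For example (illustrative): successive calls that arrive within
 * ip_min_frag_prune_time msecs of one another free 1, then 2, then 3
 * of the oldest packets from each hash bucket, escalating until the
 * calls space out again and ill_frag_free_num_pkts is reset to zero.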
3621 */ 3622 void 3623 ill_frag_prune(ill_t *ill, uint_t max_count) 3624 { 3625 ipfb_t *ipfb; 3626 ipf_t *ipf; 3627 size_t count; 3628 3629 /* 3630 * If we are here within ip_min_frag_prune_time msecs remove 3631 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3632 * ill_frag_free_num_pkts. 3633 */ 3634 mutex_enter(&ill->ill_lock); 3635 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3636 (ip_min_frag_prune_time != 0 ? 3637 ip_min_frag_prune_time : msec_per_tick)) { 3638 3639 ill->ill_frag_free_num_pkts++; 3640 3641 } else { 3642 ill->ill_frag_free_num_pkts = 0; 3643 } 3644 ill->ill_last_frag_clean_time = lbolt; 3645 mutex_exit(&ill->ill_lock); 3646 3647 /* 3648 * free ill_frag_free_num_pkts oldest packets from each bucket. 3649 */ 3650 if (ill->ill_frag_free_num_pkts != 0) { 3651 int ix; 3652 3653 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3654 ipfb = &ill->ill_frag_hash_tbl[ix]; 3655 mutex_enter(&ipfb->ipfb_lock); 3656 if (ipfb->ipfb_ipf != NULL) { 3657 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3658 ill->ill_frag_free_num_pkts); 3659 } 3660 mutex_exit(&ipfb->ipfb_lock); 3661 } 3662 } 3663 /* 3664 * While the reassembly list for this ILL is too big, prune a fragment 3665 * queue by age, oldest first. Note that the per ILL count is 3666 * approximate, while the per frag hash bucket counts are accurate. 3667 */ 3668 while (ill->ill_frag_count > max_count) { 3669 int ix; 3670 ipfb_t *oipfb = NULL; 3671 uint_t oldest = UINT_MAX; 3672 3673 count = 0; 3674 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3675 ipfb = &ill->ill_frag_hash_tbl[ix]; 3676 mutex_enter(&ipfb->ipfb_lock); 3677 ipf = ipfb->ipfb_ipf; 3678 if (ipf != NULL && ipf->ipf_gen < oldest) { 3679 oldest = ipf->ipf_gen; 3680 oipfb = ipfb; 3681 } 3682 count += ipfb->ipfb_count; 3683 mutex_exit(&ipfb->ipfb_lock); 3684 } 3685 /* Refresh the per ILL count */ 3686 ill->ill_frag_count = count; 3687 if (oipfb == NULL) { 3688 ill->ill_frag_count = 0; 3689 break; 3690 } 3691 if (count <= max_count) 3692 return; /* Somebody beat us to it, nothing to do */ 3693 mutex_enter(&oipfb->ipfb_lock); 3694 ipf = oipfb->ipfb_ipf; 3695 if (ipf != NULL) { 3696 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3697 } 3698 mutex_exit(&oipfb->ipfb_lock); 3699 } 3700 } 3701 3702 /* 3703 * free 'free_cnt' fragmented packets starting at ipf. 3704 */ 3705 void 3706 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3707 { 3708 size_t count; 3709 mblk_t *mp; 3710 mblk_t *tmp; 3711 ipf_t **ipfp = ipf->ipf_ptphn; 3712 3713 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3714 ASSERT(ipfp != NULL); 3715 ASSERT(ipf != NULL); 3716 3717 while (ipf != NULL && free_cnt-- > 0) { 3718 count = ipf->ipf_count; 3719 mp = ipf->ipf_mp; 3720 ipf = ipf->ipf_hash_next; 3721 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3722 IP_REASS_SET_START(tmp, 0); 3723 IP_REASS_SET_END(tmp, 0); 3724 } 3725 ill->ill_frag_count -= count; 3726 ASSERT(ipfb->ipfb_count >= count); 3727 ipfb->ipfb_count -= count; 3728 ASSERT(ipfb->ipfb_frag_pkts > 0); 3729 ipfb->ipfb_frag_pkts--; 3730 freemsg(mp); 3731 BUMP_MIB(&ip_mib, ipReasmFails); 3732 } 3733 3734 if (ipf) 3735 ipf->ipf_ptphn = ipfp; 3736 ipfp[0] = ipf; 3737 } 3738 3739 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3740 "obsolete and may be removed in a future release of Solaris. Use " \ 3741 "ifconfig(1M) to manipulate the forwarding status of an interface." 3742 3743 /* 3744 * For obsolete per-interface forwarding configuration; 3745 * called in response to ND_GET. 
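 *
 * For example (interface name hypothetical), the obsolete variable can
 * be read with ndd(1M):
 *
 *	# ndd -get /dev/ip hme0:ip_forwarding
 *
 * while the supported equivalent manipulates the ILLF_ROUTER flag via
 * ifconfig(1M):
 *
 *	# ifconfig hme0 router		(use -router to disable)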
3746 */ 3747 /* ARGSUSED */ 3748 static int 3749 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3750 { 3751 ill_t *ill = (ill_t *)cp; 3752 3753 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3754 3755 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3756 return (0); 3757 } 3758 3759 /* 3760 * For obsolete per-interface forwarding configuration; 3761 * called in response to ND_SET. 3762 */ 3763 /* ARGSUSED */ 3764 static int 3765 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3766 cred_t *ioc_cr) 3767 { 3768 long value; 3769 int retval; 3770 3771 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3772 3773 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3774 value < 0 || value > 1) { 3775 return (EINVAL); 3776 } 3777 3778 rw_enter(&ill_g_lock, RW_READER); 3779 retval = ill_forward_set(q, mp, (value != 0), cp); 3780 rw_exit(&ill_g_lock); 3781 return (retval); 3782 } 3783 3784 /* 3785 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3786 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3787 * up RTS_IFINFO routing socket messages for each interface whose flags we 3788 * change. 3789 */ 3790 /* ARGSUSED */ 3791 int 3792 ill_forward_set(queue_t *q, mblk_t *mp, boolean_t enable, caddr_t cp) 3793 { 3794 ill_t *ill = (ill_t *)cp; 3795 ill_group_t *illgrp; 3796 3797 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ill_g_lock)); 3798 3799 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3800 (!enable && !(ill->ill_flags & ILLF_ROUTER)) || 3801 (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) 3802 return (EINVAL); 3803 3804 /* 3805 * If the ill is in an IPMP group, set the forwarding policy on all 3806 * members of the group to the same value. 3807 */ 3808 illgrp = ill->ill_group; 3809 if (illgrp != NULL) { 3810 ill_t *tmp_ill; 3811 3812 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 3813 tmp_ill = tmp_ill->ill_group_next) { 3814 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3815 (enable ? "Enabling" : "Disabling"), 3816 (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), 3817 tmp_ill->ill_name)); 3818 mutex_enter(&tmp_ill->ill_lock); 3819 if (enable) 3820 tmp_ill->ill_flags |= ILLF_ROUTER; 3821 else 3822 tmp_ill->ill_flags &= ~ILLF_ROUTER; 3823 mutex_exit(&tmp_ill->ill_lock); 3824 if (tmp_ill->ill_isv6) 3825 ill_set_nce_router_flags(tmp_ill, enable); 3826 /* Notify routing socket listeners of this change. */ 3827 ip_rts_ifmsg(tmp_ill->ill_ipif); 3828 } 3829 } else { 3830 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3831 (enable ? "Enabling" : "Disabling"), 3832 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3833 mutex_enter(&ill->ill_lock); 3834 if (enable) 3835 ill->ill_flags |= ILLF_ROUTER; 3836 else 3837 ill->ill_flags &= ~ILLF_ROUTER; 3838 mutex_exit(&ill->ill_lock); 3839 if (ill->ill_isv6) 3840 ill_set_nce_router_flags(ill, enable); 3841 /* Notify routing socket listeners of this change. */ 3842 ip_rts_ifmsg(ill->ill_ipif); 3843 } 3844 3845 return (0); 3846 } 3847 3848 /* 3849 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3850 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3851 * set or clear. 
3852 */ 3853 static void 3854 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3855 { 3856 ipif_t *ipif; 3857 nce_t *nce; 3858 3859 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3860 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 3861 if (nce != NULL) { 3862 mutex_enter(&nce->nce_lock); 3863 if (enable) 3864 nce->nce_flags |= NCE_F_ISROUTER; 3865 else 3866 nce->nce_flags &= ~NCE_F_ISROUTER; 3867 mutex_exit(&nce->nce_lock); 3868 NCE_REFRELE(nce); 3869 } 3870 } 3871 } 3872 3873 /* 3874 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3875 * for this ill. Make sure the v6/v4 question has been answered about this 3876 * ill. The creation of this ndd variable is only for backwards compatibility. 3877 * The preferred way to control per-interface IP forwarding is through the 3878 * ILLF_ROUTER interface flag. 3879 */ 3880 static int 3881 ill_set_ndd_name(ill_t *ill) 3882 { 3883 char *suffix; 3884 3885 ASSERT(IAM_WRITER_ILL(ill)); 3886 3887 if (ill->ill_isv6) 3888 suffix = ipv6_forward_suffix; 3889 else 3890 suffix = ipv4_forward_suffix; 3891 3892 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 3893 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 3894 /* 3895 * Copies over the '\0'. 3896 * Note that strlen(suffix) is always bounded. 3897 */ 3898 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 3899 strlen(suffix) + 1); 3900 3901 /* 3902 * Use of the nd table requires holding the reader lock. 3903 * Modifying the nd table thru nd_load/nd_unload requires 3904 * the writer lock. 3905 */ 3906 rw_enter(&ip_g_nd_lock, RW_WRITER); 3907 if (!nd_load(&ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 3908 nd_ill_forward_set, (caddr_t)ill)) { 3909 /* 3910 * If the nd_load failed, it only meant that it could not 3911 * allocate a new bunch of room for further NDD expansion. 3912 * Because of that, ill_ndd_name will be set to NULL, and 3913 * this interface is at the mercy of the global ip_forwarding 3914 * variable. 3915 */ 3916 rw_exit(&ip_g_nd_lock); 3917 ill->ill_ndd_name = NULL; 3918 return (ENOMEM); 3919 } 3920 rw_exit(&ip_g_nd_lock); 3921 return (0); 3922 } 3923 3924 /* 3925 * Initializes the context structure and returns the first ill in the list. 3926 * Currently start_list and end_list can have the following values: 3927 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 3928 * IP_V4_G_HEAD Traverse IPV4 list only. 3929 * IP_V6_G_HEAD Traverse IPV6 list only. 3930 */ 3931 3932 /* 3933 * We don't check for CONDEMNED ills here. Caller must do that if 3934 * necessary under the ill lock. 3935 */ 3936 ill_t * 3937 ill_first(int start_list, int end_list, ill_walk_context_t *ctx) 3938 { 3939 ill_if_t *ifp; 3940 ill_t *ill; 3941 avl_tree_t *avl_tree; 3942 3943 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3944 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3945 3946 /* 3947 * setup the lists to search 3948 */ 3949 if (end_list != MAX_G_HEADS) { 3950 ctx->ctx_current_list = start_list; 3951 ctx->ctx_last_list = end_list; 3952 } else { 3953 ctx->ctx_last_list = MAX_G_HEADS - 1; 3954 ctx->ctx_current_list = 0; 3955 } 3956 3957 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3958 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 3959 if (ifp != (ill_if_t *) 3960 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 3961 avl_tree = &ifp->illif_avl_by_ppa; 3962 ill = avl_first(avl_tree); 3963 /* 3964 * ill is guaranteed to be non-NULL, or else ifp would 3965 * not exist.
3966 */ 3967 ASSERT(ill != NULL); 3968 return (ill); 3969 } 3970 ctx->ctx_current_list++; 3971 } 3972 3973 return (NULL); 3974 } 3975 3976 /* 3977 * returns the next ill in the list. ill_first() must have been called 3978 * before calling ill_next() or bad things will happen. 3979 */ 3980 3981 /* 3982 * We don't check for CONDEMNED ills here. Caller must do that if 3983 * necessary under the ill lock. 3984 */ 3985 ill_t * 3986 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 3987 { 3988 ill_if_t *ifp; 3989 ill_t *ill; 3990 3991 3992 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3993 ASSERT(lastill->ill_ifptr != (ill_if_t *) 3994 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)); 3995 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 3996 AVL_AFTER)) != NULL) { 3997 return (ill); 3998 } 3999 4000 /* goto next ill_ifp in the list. */ 4001 ifp = lastill->ill_ifptr->illif_next; 4002 4003 /* make sure not at end of circular list */ 4004 while (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 4005 if (++ctx->ctx_current_list > ctx->ctx_last_list) 4006 return (NULL); 4007 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 4008 } 4009 4010 return (avl_first(&ifp->illif_avl_by_ppa)); 4011 } 4012 4013 /* 4014 * Check interface name for correct format which is name+ppa. 4015 * name can contain characters and digits, the right most digits 4016 * make up the ppa number. use of octal is not allowed, name must contain 4017 * a ppa, return pointer to the start of ppa. 4018 * In case of error return NULL. 4019 */ 4020 static char * 4021 ill_get_ppa_ptr(char *name) 4022 { 4023 int namelen = mi_strlen(name); 4024 4025 int len = namelen; 4026 4027 name += len; 4028 while (len > 0) { 4029 name--; 4030 if (*name < '0' || *name > '9') 4031 break; 4032 len--; 4033 } 4034 4035 /* empty string, all digits, or no trailing digits */ 4036 if (len == 0 || len == (int)namelen) 4037 return (NULL); 4038 4039 name++; 4040 /* check for attempted use of octal */ 4041 if (*name == '0' && len != (int)namelen - 1) 4042 return (NULL); 4043 return (name); 4044 } 4045 4046 /* 4047 * use avl tree to locate the ill. 4048 */ 4049 static ill_t * 4050 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4051 ipsq_func_t func, int *error) 4052 { 4053 char *ppa_ptr = NULL; 4054 int len; 4055 uint_t ppa; 4056 ill_t *ill = NULL; 4057 ill_if_t *ifp; 4058 int list; 4059 ipsq_t *ipsq; 4060 4061 if (error != NULL) 4062 *error = 0; 4063 4064 /* 4065 * get ppa ptr 4066 */ 4067 if (isv6) 4068 list = IP_V6_G_HEAD; 4069 else 4070 list = IP_V4_G_HEAD; 4071 4072 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4073 if (error != NULL) 4074 *error = ENXIO; 4075 return (NULL); 4076 } 4077 4078 len = ppa_ptr - name + 1; 4079 4080 ppa = stoi(&ppa_ptr); 4081 4082 ifp = IP_VX_ILL_G_LIST(list); 4083 4084 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4085 /* 4086 * match is done on len - 1 as the name is not null 4087 * terminated it contains ppa in addition to the interface 4088 * name. 4089 */ 4090 if ((ifp->illif_name_len == len) && 4091 bcmp(ifp->illif_name, name, len - 1) == 0) { 4092 break; 4093 } else { 4094 ifp = ifp->illif_next; 4095 } 4096 } 4097 4098 4099 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4100 /* 4101 * Even the interface type does not exist. 
4102 */ 4103 if (error != NULL) 4104 *error = ENXIO; 4105 return (NULL); 4106 } 4107 4108 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4109 if (ill != NULL) { 4110 /* 4111 * The block comment at the start of ipif_down 4112 * explains the use of the macros used below 4113 */ 4114 GRAB_CONN_LOCK(q); 4115 mutex_enter(&ill->ill_lock); 4116 if (ILL_CAN_LOOKUP(ill)) { 4117 ill_refhold_locked(ill); 4118 mutex_exit(&ill->ill_lock); 4119 RELEASE_CONN_LOCK(q); 4120 return (ill); 4121 } else if (ILL_CAN_WAIT(ill, q)) { 4122 ipsq = ill->ill_phyint->phyint_ipsq; 4123 mutex_enter(&ipsq->ipsq_lock); 4124 mutex_exit(&ill->ill_lock); 4125 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4126 mutex_exit(&ipsq->ipsq_lock); 4127 RELEASE_CONN_LOCK(q); 4128 *error = EINPROGRESS; 4129 return (NULL); 4130 } 4131 mutex_exit(&ill->ill_lock); 4132 RELEASE_CONN_LOCK(q); 4133 } 4134 if (error != NULL) 4135 *error = ENXIO; 4136 return (NULL); 4137 } 4138 4139 /* 4140 * comparison function for use with avl. 4141 */ 4142 static int 4143 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4144 { 4145 uint_t ppa; 4146 uint_t ill_ppa; 4147 4148 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4149 4150 ppa = *((uint_t *)ppa_ptr); 4151 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4152 /* 4153 * We want the ill with the lowest ppa to be on the 4154 * top. 4155 */ 4156 if (ill_ppa < ppa) 4157 return (1); 4158 if (ill_ppa > ppa) 4159 return (-1); 4160 return (0); 4161 } 4162 4163 /* 4164 * remove an interface type from the global list. 4165 */ 4166 static void 4167 ill_delete_interface_type(ill_if_t *interface) 4168 { 4169 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4170 4171 ASSERT(interface != NULL); 4172 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4173 4174 avl_destroy(&interface->illif_avl_by_ppa); 4175 if (interface->illif_ppa_arena != NULL) 4176 vmem_destroy(interface->illif_ppa_arena); 4177 4178 remque(interface); 4179 4180 mi_free(interface); 4181 } 4182 4183 /* 4184 * remove ill from the global list. 4185 */ 4186 static void 4187 ill_glist_delete(ill_t *ill) 4188 { 4189 if (ill == NULL) 4190 return; 4191 4192 rw_enter(&ill_g_lock, RW_WRITER); 4193 /* 4194 * If the ill was never inserted into the AVL tree 4195 * we skip the if branch. 4196 */ 4197 if (ill->ill_ifptr != NULL) { 4198 /* 4199 * remove from AVL tree and free ppa number 4200 */ 4201 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4202 4203 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4204 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4205 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4206 } 4207 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4208 ill_delete_interface_type(ill->ill_ifptr); 4209 } 4210 4211 /* 4212 * Indicate ill is no longer in the list. 4213 */ 4214 ill->ill_ifptr = NULL; 4215 ill->ill_name_length = 0; 4216 ill->ill_name[0] = '\0'; 4217 ill->ill_ppa = UINT_MAX; 4218 } 4219 ill_phyint_free(ill); 4220 rw_exit(&ill_g_lock); 4221 } 4222 4223 /* 4224 * allocate a ppa, if the number of plumbed interfaces of this type are 4225 * less than ill_no_arena do a linear search to find a unused ppa. 4226 * When the number goes beyond ill_no_arena switch to using an arena. 4227 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4228 * is the return value for an error condition, so allocation starts at one 4229 * and is decremented by one. 
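 *
 * In other words, ppa N is represented in the arena as the address N + 1.
 * As a condensed restatement of the code below (shown only to make the
 * off-by-one encoding explicit):
 *
 *	ppa = (int)(uintptr_t)vmem_alloc(arena, 1, VM_NOSLEEP | VM_FIRSTFIT);
 *	if (ppa == 0)
 *		return (EAGAIN);	/- zero means allocation failed -/
 *	ill->ill_ppa = ppa - 1;		/- undo the +1 encoding -/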
4230 */ 4231 static int 4232 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4233 { 4234 ill_t *tmp_ill; 4235 uint_t start, end; 4236 int ppa; 4237 4238 if (ifp->illif_ppa_arena == NULL && 4239 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4240 /* 4241 * Create an arena. 4242 */ 4243 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4244 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4245 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4246 /* allocate what has already been assigned */ 4247 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4248 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4249 tmp_ill, AVL_AFTER)) { 4250 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4251 1, /* size */ 4252 1, /* align/quantum */ 4253 0, /* phase */ 4254 0, /* nocross */ 4255 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), /* minaddr */ 4256 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), /* maxaddr */ 4257 VM_NOSLEEP|VM_FIRSTFIT); 4258 if (ppa == 0) { 4259 ip1dbg(("ill_alloc_ppa: ppa allocation" 4260 " failed while switching")); 4261 vmem_destroy(ifp->illif_ppa_arena); 4262 ifp->illif_ppa_arena = NULL; 4263 break; 4264 } 4265 } 4266 } 4267 4268 if (ifp->illif_ppa_arena != NULL) { 4269 if (ill->ill_ppa == UINT_MAX) { 4270 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4271 1, VM_NOSLEEP|VM_FIRSTFIT); 4272 if (ppa == 0) 4273 return (EAGAIN); 4274 ill->ill_ppa = --ppa; 4275 } else { 4276 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4277 1, /* size */ 4278 1, /* align/quantum */ 4279 0, /* phase */ 4280 0, /* nocross */ 4281 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4282 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4283 VM_NOSLEEP|VM_FIRSTFIT); 4284 /* 4285 * Most likely the allocation failed because 4286 * the requested ppa was in use. 4287 */ 4288 if (ppa == 0) 4289 return (EEXIST); 4290 } 4291 return (0); 4292 } 4293 4294 /* 4295 * No arena is in use and not enough (>ill_no_arena) interfaces have 4296 * been plumbed to create one. Do a linear search to get a unused ppa. 4297 */ 4298 if (ill->ill_ppa == UINT_MAX) { 4299 end = UINT_MAX - 1; 4300 start = 0; 4301 } else { 4302 end = start = ill->ill_ppa; 4303 } 4304 4305 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4306 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4307 if (start++ >= end) { 4308 if (ill->ill_ppa == UINT_MAX) 4309 return (EAGAIN); 4310 else 4311 return (EEXIST); 4312 } 4313 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4314 } 4315 ill->ill_ppa = start; 4316 return (0); 4317 } 4318 4319 /* 4320 * Insert ill into the list of configured ill's. Once this function completes, 4321 * the ill is globally visible and is available through lookups. More precisely 4322 * this happens after the caller drops the ill_g_lock. 
4323 */ 4324 static int 4325 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4326 { 4327 ill_if_t *ill_interface; 4328 avl_index_t where = 0; 4329 int error; 4330 int name_length; 4331 int index; 4332 boolean_t check_length = B_FALSE; 4333 4334 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4335 4336 name_length = mi_strlen(name) + 1; 4337 4338 if (isv6) 4339 index = IP_V6_G_HEAD; 4340 else 4341 index = IP_V4_G_HEAD; 4342 4343 ill_interface = IP_VX_ILL_G_LIST(index); 4344 /* 4345 * Search for the interface type based on the name. 4346 */ 4347 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4348 if ((ill_interface->illif_name_len == name_length) && 4349 (strcmp(ill_interface->illif_name, name) == 0)) { 4350 break; 4351 } 4352 ill_interface = ill_interface->illif_next; 4353 } 4354 4355 /* 4356 * Interface type not found, create one. 4357 */ 4358 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4359 4360 ill_g_head_t ghead; 4361 4362 /* 4363 * allocate ill_if_t structure 4364 */ 4365 4366 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4367 if (ill_interface == NULL) { 4368 return (ENOMEM); 4369 } 4370 4371 4372 4373 (void) strcpy(ill_interface->illif_name, name); 4374 ill_interface->illif_name_len = name_length; 4375 4376 avl_create(&ill_interface->illif_avl_by_ppa, 4377 ill_compare_ppa, sizeof (ill_t), 4378 offsetof(struct ill_s, ill_avl_byppa)); 4379 4380 /* 4381 * link the structure in at the back to maintain the order 4382 * of configuration for ifconfig output. 4383 */ 4384 ghead = ill_g_heads[index]; 4385 insque(ill_interface, ghead.ill_g_list_tail); 4386 4387 } 4388 4389 if (ill->ill_ppa == UINT_MAX) 4390 check_length = B_TRUE; 4391 4392 error = ill_alloc_ppa(ill_interface, ill); 4393 if (error != 0) { 4394 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4395 ill_delete_interface_type(ill->ill_ifptr); 4396 return (error); 4397 } 4398 4399 /* 4400 * When the ppa is chosen by the system, check that there is 4401 * enough space to insert the ppa. If a specific ppa was passed in, 4402 * this check is not required, as the interface name passed in will 4403 * already contain the right ppa. 4404 */ 4405 if (check_length) { 4406 /* 4407 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4408 */ 4409 char buf[sizeof (uint_t) * 3]; 4410 4411 /* 4412 * convert the ppa to a string to calculate the amount of space 4413 * required for it in the name. 4414 */ 4415 numtos(ill->ill_ppa, buf); 4416 4417 /* Do we have enough space to insert the ppa?
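 * As a worked example: with LIFNAMSIZ of 32, the name "eri" and a
 * system-chosen ppa of 4294967294 (UINT_MAX - 1) yield "eri4294967294",
 * i.e. 3 + 10 + 1 (NUL) = 14 bytes, which fits; a 28-character driver
 * prefix with the same ppa would not, and the check below fails with
 * EINVAL.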
*/ 4418 4419 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4420 /* Free ppa and interface type struct */ 4421 if (ill_interface->illif_ppa_arena != NULL) { 4422 vmem_free(ill_interface->illif_ppa_arena, 4423 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4424 } 4425 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4426 0) { 4427 ill_delete_interface_type(ill->ill_ifptr); 4428 } 4429 4430 return (EINVAL); 4431 } 4432 } 4433 4434 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4435 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4436 4437 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4438 &where); 4439 ill->ill_ifptr = ill_interface; 4440 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4441 4442 ill_phyint_reinit(ill); 4443 return (0); 4444 } 4445 4446 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4447 static boolean_t 4448 ipsq_init(ill_t *ill) 4449 { 4450 ipsq_t *ipsq; 4451 4452 /* Init the ipsq and impicitly enter as writer */ 4453 ill->ill_phyint->phyint_ipsq = 4454 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4455 if (ill->ill_phyint->phyint_ipsq == NULL) 4456 return (B_FALSE); 4457 ipsq = ill->ill_phyint->phyint_ipsq; 4458 ipsq->ipsq_phyint_list = ill->ill_phyint; 4459 ill->ill_phyint->phyint_ipsq_next = NULL; 4460 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4461 ipsq->ipsq_refs = 1; 4462 ipsq->ipsq_writer = curthread; 4463 ipsq->ipsq_reentry_cnt = 1; 4464 #ifdef ILL_DEBUG 4465 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH); 4466 #endif 4467 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4468 return (B_TRUE); 4469 } 4470 4471 /* 4472 * ill_init is called by ip_open when a device control stream is opened. 4473 * It does a few initializations, and shoots a DL_INFO_REQ message down 4474 * to the driver. The response is later picked up in ip_rput_dlpi and 4475 * used to set up default mechanisms for talking to the driver. (Always 4476 * called as writer.) 4477 * 4478 * If this function returns error, ip_open will call ip_close which in 4479 * turn will call ill_delete to clean up any memory allocated here that 4480 * is not yet freed. 4481 */ 4482 int 4483 ill_init(queue_t *q, ill_t *ill) 4484 { 4485 int count; 4486 dl_info_req_t *dlir; 4487 mblk_t *info_mp; 4488 uchar_t *frag_ptr; 4489 4490 /* 4491 * The ill is initialized to zero by mi_alloc*(). In addition 4492 * some fields already contain valid values, initialized in 4493 * ip_open(), before we reach here. 4494 */ 4495 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4496 4497 ill->ill_rq = q; 4498 ill->ill_wq = WR(q); 4499 4500 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4501 BPRI_HI); 4502 if (info_mp == NULL) 4503 return (ENOMEM); 4504 4505 /* 4506 * Allocate sufficient space to contain our fragment hash table and 4507 * the device name. 
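 *
 * The single allocation is carved up as follows (a sketch of the layout
 * implied by the assignments below, not a separate structure):
 *
 *	frag_ptr ->  +-------------------------------------+
 *	             | ipfb_t[ILL_FRAG_HASH_TBL_COUNT]     | frag hash table
 *	ill_name ->  +-------------------------------------+
 *	             | interface name, plus room for the   |
 *	             | ndd forwarding-variable suffix      |
 *	             +-------------------------------------+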
4508 */ 4509 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4510 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4511 if (frag_ptr == NULL) { 4512 freemsg(info_mp); 4513 return (ENOMEM); 4514 } 4515 ill->ill_frag_ptr = frag_ptr; 4516 ill->ill_frag_free_num_pkts = 0; 4517 ill->ill_last_frag_clean_time = 0; 4518 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4519 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4520 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4521 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4522 NULL, MUTEX_DEFAULT, NULL); 4523 } 4524 4525 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4526 if (ill->ill_phyint == NULL) { 4527 freemsg(info_mp); 4528 mi_free(frag_ptr); 4529 return (ENOMEM); 4530 } 4531 4532 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4533 /* 4534 * For now pretend this is a v4 ill. We need to set phyint_ill* 4535 * at this point for the following reason: if we can't enter the 4536 * ipsq at some point and cv_wait, the writer that wakes us up 4537 * tries to locate us using the list of all phyints in an ipsq 4538 * and the ills reachable from the phyint thru phyint_ill*. 4539 * If we don't set it now, we risk a missed wakeup. 4540 */ 4541 ill->ill_phyint->phyint_illv4 = ill; 4542 ill->ill_ppa = UINT_MAX; 4543 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4544 4545 if (!ipsq_init(ill)) { 4546 freemsg(info_mp); 4547 mi_free(frag_ptr); 4548 mi_free(ill->ill_phyint); 4549 return (ENOMEM); 4550 } 4551 4552 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4553 4554 4555 /* Frag queue limit stuff */ 4556 ill->ill_frag_count = 0; 4557 ill->ill_ipf_gen = 0; 4558 4559 ill->ill_global_timer = INFINITY; 4560 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4561 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4562 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4563 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4564 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4565 4566 /* 4567 * Initialize IPv6 configuration variables. The IP module is always 4568 * opened as an IPv4 module. Instead of tracking down the cases where 4569 * it switches to do IPv6, we just initialize the IPv6 configuration 4570 * here for convenience; this has no effect until the ill is set to do 4571 * IPv6. 4572 */ 4573 ill->ill_reachable_time = ND_REACHABLE_TIME; 4574 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4575 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4576 ill->ill_max_buf = ND_MAX_Q; 4577 ill->ill_refcnt = 0; 4578 4579 /* Send down the Info Request to the driver. */ 4580 info_mp->b_datap->db_type = M_PCPROTO; 4581 dlir = (dl_info_req_t *)info_mp->b_rptr; 4582 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4583 dlir->dl_primitive = DL_INFO_REQ; 4584 4585 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4586 4587 qprocson(q); 4588 ill_dlpi_send(ill, info_mp); 4589 4590 return (0); 4591 } 4592 4593 /* 4594 * ill_dls_info 4595 * creates datalink socket info from the device.
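 * The interface name and the link-layer address are packed back to back
 * in sdl_data; with illustrative values only:
 *
 *	sdl_nlen = 4	sdl_data[0..3] = "hme0" (no NUL in the packed form)
 *	sdl_alen = 6	sdl_data[4..9] = 00:11:22:33:44:55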
4596 */ 4597 int 4598 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4599 { 4600 size_t length; 4601 ill_t *ill = ipif->ipif_ill; 4602 4603 sdl->sdl_family = AF_LINK; 4604 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4605 sdl->sdl_type = ipif->ipif_type; 4606 (void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4607 length = mi_strlen(sdl->sdl_data); 4608 ASSERT(length < 256); 4609 sdl->sdl_nlen = (uchar_t)length; 4610 sdl->sdl_alen = ill->ill_phys_addr_length; 4611 mutex_enter(&ill->ill_lock); 4612 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) { 4613 bcopy(ill->ill_phys_addr, &sdl->sdl_data[length], 4614 ill->ill_phys_addr_length); 4615 } 4616 mutex_exit(&ill->ill_lock); 4617 sdl->sdl_slen = 0; 4618 return (sizeof (struct sockaddr_dl)); 4619 } 4620 4621 /* 4622 * ill_xarp_info 4623 * creates xarp info from the device. 4624 */ 4625 static int 4626 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4627 { 4628 sdl->sdl_family = AF_LINK; 4629 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4630 sdl->sdl_type = ill->ill_type; 4631 (void) ipif_get_name(ill->ill_ipif, sdl->sdl_data, 4632 sizeof (sdl->sdl_data)); 4633 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4634 sdl->sdl_alen = ill->ill_phys_addr_length; 4635 sdl->sdl_slen = 0; 4636 return (sdl->sdl_nlen); 4637 } 4638 4639 static int 4640 loopback_kstat_update(kstat_t *ksp, int rw) 4641 { 4642 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 4643 4644 if (rw == KSTAT_WRITE) 4645 return (EACCES); 4646 kn[0].value.ui32 = loopback_packets; 4647 kn[1].value.ui32 = loopback_packets; 4648 return (0); 4649 } 4650 4651 4652 /* 4653 * Has this ifindex been plumbed already? 4654 */ 4655 static boolean_t 4656 phyint_exists(uint_t index) 4657 { 4658 phyint_t *phyi; 4659 4660 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4661 /* 4662 * Indexes are stored in the phyint - a common structure 4663 * to both IPv4 and IPv6. 4664 */ 4665 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4666 (void *) &index, NULL); 4667 return (phyi != NULL); 4668 } 4669 4670 /* 4671 * Assign a unique interface index for the phyint. 4672 */ 4673 static boolean_t 4674 phyint_assign_ifindex(phyint_t *phyi) 4675 { 4676 uint_t starting_index; 4677 4678 ASSERT(phyi->phyint_ifindex == 0); 4679 if (!ill_index_wrap) { 4680 phyi->phyint_ifindex = ill_index++; 4681 if (ill_index == 0) { 4682 /* Reached the uint_t limit; wrap next time */ 4683 ill_index_wrap = B_TRUE; 4684 } 4685 return (B_TRUE); 4686 } 4687 4688 /* 4689 * Start reusing unused indexes. Note that we hold the ill_g_lock 4690 * at this point and don't want to call any function that attempts 4691 * to get the lock again. 4692 */ 4693 starting_index = ill_index++; 4694 for (; ill_index != starting_index; ill_index++) { 4695 if (ill_index != 0 && !phyint_exists(ill_index)) { 4696 /* found unused index - use it */ 4697 phyi->phyint_ifindex = ill_index; 4698 return (B_TRUE); 4699 } 4700 } 4701 4702 /* 4703 * All interface indices are in use. 4704 */ 4705 return (B_FALSE); 4706 } 4707 4708 /* 4709 * Return a pointer to the ill which matches the supplied name. Note that 4710 * the ill name length includes the null termination character. (May be 4711 * called as writer.) 4712 * If do_alloc is set and the interface is "lo0", it will be created 4713 * automatically. Cannot bump up the reference count on condemned ills, so 4714 * duplicate detection can't be done using this function.
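 *
 * A typical non-blocking use (a sketch; error handling elided) is:
 *
 *	ill = ill_lookup_on_name("lo0", B_TRUE, isv6, NULL, NULL, NULL,
 *	    NULL, &did_alloc);
 *	if (ill != NULL) {
 *		... use the ill ...
 *		ill_refrele(ill);
 *	}
 *
 * Passing NULL for q, mp and func means the caller is unwilling to wait,
 * so EINPROGRESS is never returned in that mode.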
4715 */ 4716 ill_t * 4717 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4718 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc) 4719 { 4720 ill_t *ill; 4721 ipif_t *ipif; 4722 kstat_named_t *kn; 4723 boolean_t isloopback; 4724 ipsq_t *old_ipsq; 4725 4726 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4727 4728 rw_enter(&ill_g_lock, RW_READER); 4729 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4730 rw_exit(&ill_g_lock); 4731 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4732 return (ill); 4733 4734 /* 4735 * Couldn't find it. Does this happen to be a lookup for the 4736 * loopback device and are we allowed to allocate it? 4737 */ 4738 if (!isloopback || !do_alloc) 4739 return (NULL); 4740 4741 rw_enter(&ill_g_lock, RW_WRITER); 4742 4743 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4744 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4745 rw_exit(&ill_g_lock); 4746 return (ill); 4747 } 4748 4749 /* Create the loopback device on demand */ 4750 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4751 sizeof (ipif_loopback_name), BPRI_MED)); 4752 if (ill == NULL) 4753 goto done; 4754 4755 *ill = ill_null; 4756 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4757 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4758 if (ill->ill_phyint == NULL) 4759 goto done; 4760 4761 if (isv6) 4762 ill->ill_phyint->phyint_illv6 = ill; 4763 else 4764 ill->ill_phyint->phyint_illv4 = ill; 4765 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4766 ill->ill_max_frag = IP_LOOPBACK_MTU; 4767 /* Add room for tcp+ip headers */ 4768 if (isv6) { 4769 ill->ill_isv6 = B_TRUE; 4770 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4771 if (!ill_allocate_mibs(ill)) 4772 goto done; 4773 } else { 4774 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4775 } 4776 ill->ill_max_mtu = ill->ill_max_frag; 4777 /* 4778 * ipif_loopback_name can't be pointed at directly because its used 4779 * by both the ipv4 and ipv6 interfaces. When the ill is removed 4780 * from the glist, ill_glist_delete() sets the first character of 4781 * ill_name to '\0'. 4782 */ 4783 ill->ill_name = (char *)ill + sizeof (*ill); 4784 (void) strcpy(ill->ill_name, ipif_loopback_name); 4785 ill->ill_name_length = sizeof (ipif_loopback_name); 4786 /* Set ill_name_set for ill_phyint_reinit to work properly */ 4787 4788 ill->ill_global_timer = INFINITY; 4789 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4790 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4791 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4792 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4793 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4794 4795 /* No resolver here. */ 4796 ill->ill_net_type = IRE_LOOPBACK; 4797 4798 /* Initialize the ipsq */ 4799 if (!ipsq_init(ill)) 4800 goto done; 4801 4802 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 4803 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 4804 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 4805 #ifdef ILL_DEBUG 4806 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 4807 #endif 4808 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 4809 if (ipif == NULL) 4810 goto done; 4811 4812 ill->ill_flags = ILLF_MULTICAST; 4813 4814 /* Set up default loopback address and mask. 
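 * For IPv4 this yields the v4-mapped form of 127.0.0.1 (::ffff:127.0.0.1)
 * with the class A netmask 255.0.0.0, making the subnet the mapped
 * 127.0.0.0; for IPv6 it is simply ::1 with an all-ones (/128) mask.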
*/ 4815 if (!isv6) { 4816 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4817 4818 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4819 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4820 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4821 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4822 ipif->ipif_v6subnet); 4823 ill->ill_flags |= ILLF_IPV4; 4824 } else { 4825 ipif->ipif_v6lcl_addr = ipv6_loopback; 4826 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4827 ipif->ipif_v6net_mask = ipv6_all_ones; 4828 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4829 ipif->ipif_v6subnet); 4830 ill->ill_flags |= ILLF_IPV6; 4831 } 4832 4833 /* 4834 * Chain us in at the end of the ill list. hold the ill 4835 * before we make it globally visible. 1 for the lookup. 4836 */ 4837 ill->ill_refcnt = 0; 4838 ill_refhold(ill); 4839 4840 ill->ill_frag_count = 0; 4841 ill->ill_frag_free_num_pkts = 0; 4842 ill->ill_last_frag_clean_time = 0; 4843 4844 old_ipsq = ill->ill_phyint->phyint_ipsq; 4845 4846 if (ill_glist_insert(ill, "lo", isv6) != 0) 4847 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4848 4849 /* Let SCTP know so that it can add this to its list */ 4850 sctp_update_ill(ill, SCTP_ILL_INSERT); 4851 4852 /* Let SCTP know about this IPIF, so that it can add it to its list */ 4853 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 4854 4855 /* 4856 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 4857 */ 4858 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 4859 /* Loopback ills aren't in any IPMP group */ 4860 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 4861 ipsq_delete(old_ipsq); 4862 } 4863 4864 /* 4865 * Delay this till the ipif is allocated as ipif_allocate 4866 * de-references ill_phyint for getting the ifindex. We 4867 * can't do this before ipif_allocate because ill_phyint_reinit 4868 * -> phyint_assign_ifindex expects ipif to be present. 4869 */ 4870 mutex_enter(&ill->ill_phyint->phyint_lock); 4871 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 4872 mutex_exit(&ill->ill_phyint->phyint_lock); 4873 4874 if (loopback_ksp == NULL) { 4875 /* Export loopback interface statistics */ 4876 loopback_ksp = kstat_create("lo", 0, ipif_loopback_name, "net", 4877 KSTAT_TYPE_NAMED, 2, 0); 4878 if (loopback_ksp != NULL) { 4879 loopback_ksp->ks_update = loopback_kstat_update; 4880 kn = KSTAT_NAMED_PTR(loopback_ksp); 4881 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4882 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4883 kstat_install(loopback_ksp); 4884 } 4885 } 4886 4887 if (error != NULL) 4888 *error = 0; 4889 *did_alloc = B_TRUE; 4890 rw_exit(&ill_g_lock); 4891 return (ill); 4892 done: 4893 if (ill != NULL) { 4894 if (ill->ill_phyint != NULL) { 4895 ipsq_t *ipsq; 4896 4897 ipsq = ill->ill_phyint->phyint_ipsq; 4898 if (ipsq != NULL) 4899 kmem_free(ipsq, sizeof (ipsq_t)); 4900 mi_free(ill->ill_phyint); 4901 } 4902 ill_free_mib(ill); 4903 mi_free(ill); 4904 } 4905 rw_exit(&ill_g_lock); 4906 if (error != NULL) 4907 *error = ENOMEM; 4908 return (NULL); 4909 } 4910 4911 /* 4912 * Return a pointer to the ill which matches the index and IP version type. 
4913 */ 4914 ill_t * 4915 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 4916 ipsq_func_t func, int *err) 4917 { 4918 ill_t *ill; 4919 ipsq_t *ipsq; 4920 phyint_t *phyi; 4921 4922 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 4923 (q != NULL && mp != NULL && func != NULL && err != NULL)); 4924 4925 if (err != NULL) 4926 *err = 0; 4927 4928 /* 4929 * Indexes are stored in the phyint - a common structure 4930 * to both IPv4 and IPv6. 4931 */ 4932 rw_enter(&ill_g_lock, RW_READER); 4933 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4934 (void *) &index, NULL); 4935 if (phyi != NULL) { 4936 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 4937 if (ill != NULL) { 4938 /* 4939 * The block comment at the start of ipif_down 4940 * explains the use of the macros used below 4941 */ 4942 GRAB_CONN_LOCK(q); 4943 mutex_enter(&ill->ill_lock); 4944 if (ILL_CAN_LOOKUP(ill)) { 4945 ill_refhold_locked(ill); 4946 mutex_exit(&ill->ill_lock); 4947 RELEASE_CONN_LOCK(q); 4948 rw_exit(&ill_g_lock); 4949 return (ill); 4950 } else if (ILL_CAN_WAIT(ill, q)) { 4951 ipsq = ill->ill_phyint->phyint_ipsq; 4952 mutex_enter(&ipsq->ipsq_lock); 4953 rw_exit(&ill_g_lock); 4954 mutex_exit(&ill->ill_lock); 4955 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4956 mutex_exit(&ipsq->ipsq_lock); 4957 RELEASE_CONN_LOCK(q); 4958 *err = EINPROGRESS; 4959 return (NULL); 4960 } 4961 RELEASE_CONN_LOCK(q); 4962 mutex_exit(&ill->ill_lock); 4963 } 4964 } 4965 rw_exit(&ill_g_lock); 4966 if (err != NULL) 4967 *err = ENXIO; 4968 return (NULL); 4969 } 4970 4971 /* 4972 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4973 * that gives a running thread a reference to the ill. This reference must be 4974 * released by the thread when it is done accessing the ill and related 4975 * objects. ill_refcnt can not be used to account for static references 4976 * such as other structures pointing to an ill. Callers must generally 4977 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4978 * or be sure that the ill is not being deleted or changing state before 4979 * calling the refhold functions. A non-zero ill_refcnt ensures that the 4980 * ill won't change any of its critical state such as address, netmask etc. 4981 */ 4982 void 4983 ill_refhold(ill_t *ill) 4984 { 4985 mutex_enter(&ill->ill_lock); 4986 ill->ill_refcnt++; 4987 ILL_TRACE_REF(ill); 4988 mutex_exit(&ill->ill_lock); 4989 } 4990 4991 void 4992 ill_refhold_locked(ill_t *ill) 4993 { 4994 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4995 ill->ill_refcnt++; 4996 ILL_TRACE_REF(ill); 4997 } 4998 4999 int 5000 ill_check_and_refhold(ill_t *ill) 5001 { 5002 mutex_enter(&ill->ill_lock); 5003 if (ILL_CAN_LOOKUP(ill)) { 5004 ill_refhold_locked(ill); 5005 mutex_exit(&ill->ill_lock); 5006 return (0); 5007 } 5008 mutex_exit(&ill->ill_lock); 5009 return (ILL_LOOKUP_FAILED); 5010 } 5011 5012 /* 5013 * Must not be called while holding any locks. Otherwise if this is 5014 * the last reference to be released, there is a chance of recursive mutex 5015 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5016 * to restart an ioctl. 
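 *
 * The usual hold/release discipline (a sketch only) is:
 *
 *	ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL);
 *	if (ill != NULL) {
 *		... access ill fields ...
 *		ill_refrele(ill);	/- with no locks held -/
 *	}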
5017 */ 5018 void 5019 ill_refrele(ill_t *ill) 5020 { 5021 mutex_enter(&ill->ill_lock); 5022 ASSERT(ill->ill_refcnt != 0); 5023 ill->ill_refcnt--; 5024 ILL_UNTRACE_REF(ill); 5025 if (ill->ill_refcnt != 0) { 5026 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5027 mutex_exit(&ill->ill_lock); 5028 return; 5029 } 5030 5031 /* Drops the ill_lock */ 5032 ipif_ill_refrele_tail(ill); 5033 } 5034 5035 /* 5036 * Obtain a weak reference count on the ill. This reference ensures the 5037 * ill won't be freed, but the ill may change any of its critical state 5038 * such as netmask, address etc. Returns an error if the ill has started 5039 * closing. 5040 */ 5041 boolean_t 5042 ill_waiter_inc(ill_t *ill) 5043 { 5044 mutex_enter(&ill->ill_lock); 5045 if (ill->ill_state_flags & ILL_CONDEMNED) { 5046 mutex_exit(&ill->ill_lock); 5047 return (B_FALSE); 5048 } 5049 ill->ill_waiters++; 5050 mutex_exit(&ill->ill_lock); 5051 return (B_TRUE); 5052 } 5053 5054 void 5055 ill_waiter_dcr(ill_t *ill) 5056 { 5057 mutex_enter(&ill->ill_lock); 5058 ill->ill_waiters--; 5059 if (ill->ill_waiters == 0) 5060 cv_broadcast(&ill->ill_cv); 5061 mutex_exit(&ill->ill_lock); 5062 } 5063 5064 /* 5065 * Named Dispatch routine to produce a formatted report on all ILLs. 5066 * This report is accessed by using the ndd utility to "get" ND variable 5067 * "ip_ill_status". 5068 */ 5069 /* ARGSUSED */ 5070 int 5071 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5072 { 5073 ill_t *ill; 5074 ill_walk_context_t ctx; 5075 5076 (void) mi_mpprintf(mp, 5077 "ILL " MI_COL_HDRPAD_STR 5078 /* 01234567[89ABCDEF] */ 5079 "rq " MI_COL_HDRPAD_STR 5080 /* 01234567[89ABCDEF] */ 5081 "wq " MI_COL_HDRPAD_STR 5082 /* 01234567[89ABCDEF] */ 5083 "upcnt mxfrg err name"); 5084 /* 12345 12345 123 xxxxxxxx */ 5085 5086 rw_enter(&ill_g_lock, RW_READER); 5087 ill = ILL_START_WALK_ALL(&ctx); 5088 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5089 (void) mi_mpprintf(mp, 5090 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5091 "%05u %05u %03d %s", 5092 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5093 ill->ill_ipif_up_count, 5094 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5095 } 5096 rw_exit(&ill_g_lock); 5097 5098 return (0); 5099 } 5100 5101 /* 5102 * Named Dispatch routine to produce a formatted report on all IPIFs. 5103 * This report is accessed by using the ndd utility to "get" ND variable 5104 * "ip_ipif_status". 
5105 */ 5106 /* ARGSUSED */ 5107 int 5108 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5109 { 5110 char buf1[INET6_ADDRSTRLEN]; 5111 char buf2[INET6_ADDRSTRLEN]; 5112 char buf3[INET6_ADDRSTRLEN]; 5113 char buf4[INET6_ADDRSTRLEN]; 5114 char buf5[INET6_ADDRSTRLEN]; 5115 char buf6[INET6_ADDRSTRLEN]; 5116 char buf[LIFNAMSIZ]; 5117 ill_t *ill; 5118 ipif_t *ipif; 5119 nv_t *nvp; 5120 uint64_t flags; 5121 zoneid_t zoneid; 5122 ill_walk_context_t ctx; 5123 5124 (void) mi_mpprintf(mp, 5125 "IPIF metric mtu in/out/forward name zone flags...\n" 5126 "\tlocal address\n" 5127 "\tsrc address\n" 5128 "\tsubnet\n" 5129 "\tmask\n" 5130 "\tbroadcast\n" 5131 "\tp-p-dst"); 5132 5133 ASSERT(q->q_next == NULL); 5134 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5135 5136 rw_enter(&ill_g_lock, RW_READER); 5137 ill = ILL_START_WALK_ALL(&ctx); 5138 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5139 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 5140 if (zoneid != GLOBAL_ZONEID && 5141 zoneid != ipif->ipif_zoneid && 5142 ipif->ipif_zoneid != ALL_ZONES) 5143 continue; 5144 (void) mi_mpprintf(mp, 5145 MI_COL_PTRFMT_STR 5146 "%04u %05u %u/%u/%u %s %d", 5147 (void *)ipif, 5148 ipif->ipif_metric, ipif->ipif_mtu, 5149 ipif->ipif_ib_pkt_count, 5150 ipif->ipif_ob_pkt_count, 5151 ipif->ipif_fo_pkt_count, 5152 ipif_get_name(ipif, buf, sizeof (buf)), 5153 ipif->ipif_zoneid); 5154 5155 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5156 ipif->ipif_ill->ill_phyint->phyint_flags; 5157 5158 /* Tack on text strings for any flags. */ 5159 nvp = ipif_nv_tbl; 5160 for (; nvp < A_END(ipif_nv_tbl); nvp++) { 5161 if (nvp->nv_value & flags) 5162 (void) mi_mpprintf_nr(mp, " %s", 5163 nvp->nv_name); 5164 } 5165 (void) mi_mpprintf(mp, 5166 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s", 5167 inet_ntop(AF_INET6, 5168 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)), 5169 inet_ntop(AF_INET6, 5170 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)), 5171 inet_ntop(AF_INET6, 5172 &ipif->ipif_v6subnet, buf3, sizeof (buf3)), 5173 inet_ntop(AF_INET6, 5174 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)), 5175 inet_ntop(AF_INET6, 5176 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)), 5177 inet_ntop(AF_INET6, 5178 &ipif->ipif_v6pp_dst_addr, 5179 buf6, sizeof (buf6))); 5180 } 5181 } 5182 rw_exit(&ill_g_lock); 5183 return (0); 5184 } 5185 5186 /* 5187 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5188 * driver. We construct best-guess defaults for lower level information that 5189 * we need. If an interface is brought up without injection of any overriding 5190 * information from outside, we have to be ready to go with these defaults. 5191 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5192 * we primarily want the dl_provider_style. 5193 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND, 5194 * at which point we assume the other part of the information is valid. 5195 */ 5196 void 5197 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5198 { 5199 uchar_t *brdcst_addr; 5200 uint_t brdcst_addr_length, phys_addr_length; 5201 t_scalar_t sap_length; 5202 dl_info_ack_t *dlia; 5203 ip_m_t *ipm; 5204 dl_qos_cl_sel1_t *sel1; 5205 5206 ASSERT(IAM_WRITER_ILL(ill)); 5207 5208 /* 5209 * Until the ill is fully up, ILL_CHANGING will be set and 5210 * the ill is not globally visible. So no need for a lock.
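 *
 * As a concrete (illustrative) DL_VERSION_2 example: an Ethernet driver
 * typically reports dl_addr_length = 8 and dl_sap_length = -2, so the
 * physical address length computed below is 8 - ABS(-2) = 6 octets; the
 * negative sap length indicates that the SAP trails the physical address
 * within a DLSAP address.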
5211 */ 5212 dlia = (dl_info_ack_t *)mp->b_rptr; 5213 ill->ill_mactype = dlia->dl_mac_type; 5214 5215 ipm = ip_m_lookup(dlia->dl_mac_type); 5216 if (ipm == NULL) { 5217 ipm = ip_m_lookup(DL_OTHER); 5218 ASSERT(ipm != NULL); 5219 } 5220 ill->ill_media = ipm; 5221 5222 /* 5223 * When the new DLPI stuff is ready we'll pull lengths 5224 * from dlia. 5225 */ 5226 if (dlia->dl_version == DL_VERSION_2) { 5227 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5228 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5229 brdcst_addr_length); 5230 if (brdcst_addr == NULL) { 5231 brdcst_addr_length = 0; 5232 } 5233 sap_length = dlia->dl_sap_length; 5234 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5235 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5236 brdcst_addr_length, sap_length, phys_addr_length)); 5237 } else { 5238 brdcst_addr_length = 6; 5239 brdcst_addr = ip_six_byte_all_ones; 5240 sap_length = -2; 5241 phys_addr_length = brdcst_addr_length; 5242 } 5243 5244 ill->ill_bcast_addr_length = brdcst_addr_length; 5245 ill->ill_phys_addr_length = phys_addr_length; 5246 ill->ill_sap_length = sap_length; 5247 ill->ill_max_frag = dlia->dl_max_sdu; 5248 ill->ill_max_mtu = ill->ill_max_frag; 5249 5250 ill->ill_type = ipm->ip_m_type; 5251 5252 if (!ill->ill_dlpi_style_set) { 5253 if (dlia->dl_provider_style == DL_STYLE2) 5254 ill->ill_needs_attach = 1; 5255 5256 /* 5257 * Allocate the first ipif on this ill. We don't delay it 5258 * further, as ioctl handling assumes at least one ipif is 5259 * present. 5260 * 5261 * At this point we don't know whether the ill is v4 or v6. 5262 * We will know this when the SIOCSLIFNAME happens and 5263 * the correct value for ill_isv6 will be assigned in 5264 * ipif_set_values(). We need to hold the ill lock and 5265 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5266 * the wakeup. 5267 */ 5268 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5269 dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); 5270 mutex_enter(&ill->ill_lock); 5271 ASSERT(ill->ill_dlpi_style_set == 0); 5272 ill->ill_dlpi_style_set = 1; 5273 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5274 cv_broadcast(&ill->ill_cv); 5275 mutex_exit(&ill->ill_lock); 5276 freemsg(mp); 5277 return; 5278 } 5279 ASSERT(ill->ill_ipif != NULL); 5280 /* 5281 * We know whether it is IPv4 or IPv6 now, as this is the 5282 * second DL_INFO_ACK we are receiving in response to the 5283 * DL_INFO_REQ sent in ipif_set_values. 5284 */ 5285 if (ill->ill_isv6) 5286 ill->ill_sap = IP6_DL_SAP; 5287 else 5288 ill->ill_sap = IP_DL_SAP; 5289 /* 5290 * Set ipif_mtu which is used to set the IRE's 5291 * ire_max_frag value. The driver could have sent 5292 * a different mtu from what it sent last time. No 5293 * need to call ipif_mtu_change because IREs have 5294 * not yet been created. 5295 */ 5296 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5297 /* 5298 * Clear all the flags that were set based on ill_bcast_addr_length 5299 * and ill_phys_addr_length (in ipif_set_values) as these could have 5300 * changed now and we need to re-evaluate. 5301 */ 5302 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5303 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5304 5305 /* 5306 * Free ill_resolver_mp and ill_bcast_mp as things could have 5307 * changed now.
5308 */ 5309 if (ill->ill_bcast_addr_length == 0) { 5310 if (ill->ill_resolver_mp != NULL) 5311 freemsg(ill->ill_resolver_mp); 5312 if (ill->ill_bcast_mp != NULL) 5313 freemsg(ill->ill_bcast_mp); 5314 if (ill->ill_flags & ILLF_XRESOLV) 5315 ill->ill_net_type = IRE_IF_RESOLVER; 5316 else 5317 ill->ill_net_type = IRE_IF_NORESOLVER; 5318 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5319 ill->ill_phys_addr_length, 5320 ill->ill_sap, 5321 ill->ill_sap_length); 5322 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5323 5324 if (ill->ill_isv6) 5325 /* 5326 * Note: xresolv interfaces will eventually need NOARP 5327 * set here as well, but that will require those 5328 * external resolvers to have some knowledge of 5329 * that flag and act appropriately. Not to be changed 5330 * at present. 5331 */ 5332 ill->ill_flags |= ILLF_NONUD; 5333 else 5334 ill->ill_flags |= ILLF_NOARP; 5335 5336 if (ill->ill_phys_addr_length == 0) { 5337 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 5338 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5339 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL; 5340 } else { 5341 /* pt-pt supports multicast. */ 5342 ill->ill_flags |= ILLF_MULTICAST; 5343 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5344 } 5345 } 5346 } else { 5347 ill->ill_net_type = IRE_IF_RESOLVER; 5348 if (ill->ill_bcast_mp != NULL) 5349 freemsg(ill->ill_bcast_mp); 5350 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5351 ill->ill_bcast_addr_length, ill->ill_sap, 5352 ill->ill_sap_length); 5353 /* 5354 * Later detect lack of DLPI driver multicast 5355 * capability by catching DL_ENABMULTI errors in 5356 * ip_rput_dlpi. 5357 */ 5358 ill->ill_flags |= ILLF_MULTICAST; 5359 if (!ill->ill_isv6) 5360 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5361 } 5362 /* By default an interface does not support any CoS marking */ 5363 ill->ill_flags &= ~ILLF_COS_ENABLED; 5364 5365 /* 5366 * If we get QoS information in DL_INFO_ACK, the device supports 5367 * some form of CoS marking, set ILLF_COS_ENABLED. 5368 */ 5369 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5370 dlia->dl_qos_length); 5371 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5372 ill->ill_flags |= ILLF_COS_ENABLED; 5373 } 5374 5375 /* Clear any previous error indication. */ 5376 ill->ill_error = 0; 5377 freemsg(mp); 5378 } 5379 5380 /* 5381 * Perform various checks to verify that an address would make sense as a 5382 * local, remote, or subnet interface address. 5383 */ 5384 static boolean_t 5385 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5386 { 5387 ipaddr_t net_mask; 5388 5389 /* 5390 * Don't allow all zeroes, all ones or experimental address, but allow 5391 * all ones netmask. 
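 *
 * For example, with a /24 subnet mask, 192.168.1.0 (the subnet address,
 * addr == (addr & net_mask)) and 192.168.1.255 (the subnet broadcast,
 * addr == (addr | ~net_mask)) are both rejected, while host addresses in
 * between pass; class D (multicast) addresses always fail.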
5392 */ 5393 if ((net_mask = ip_net_mask(addr)) == 0) 5394 return (B_FALSE); 5395 /* A given netmask overrides the "guess" netmask */ 5396 if (subnet_mask != 0) 5397 net_mask = subnet_mask; 5398 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5399 (addr == (addr | ~net_mask)))) { 5400 return (B_FALSE); 5401 } 5402 if (CLASSD(addr)) 5403 return (B_FALSE); 5404 5405 return (B_TRUE); 5406 } 5407 5408 /* 5409 * ipif_lookup_group 5410 * Returns a held ipif. 5411 */ 5412 ipif_t * 5413 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid) 5414 { 5415 ire_t *ire; 5416 ipif_t *ipif; 5417 5418 ire = ire_lookup_multi(group, zoneid); 5419 if (ire == NULL) 5420 return (NULL); 5421 ipif = ire->ire_ipif; 5422 ipif_refhold(ipif); 5423 ire_refrele(ire); 5424 return (ipif); 5425 } 5426 5427 /* 5428 * Look for an ipif with the specified interface address and destination. 5429 * The destination address is used only for matching point-to-point interfaces. 5430 */ 5431 ipif_t * 5432 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5433 ipsq_func_t func, int *error) 5434 { 5435 ipif_t *ipif; 5436 ill_t *ill; 5437 ill_walk_context_t ctx; 5438 ipsq_t *ipsq; 5439 5440 if (error != NULL) 5441 *error = 0; 5442 5443 /* 5444 * First match all the point-to-point interfaces 5445 * before looking at non-point-to-point interfaces. 5446 * This is done to avoid returning a non-point-to-point 5447 * ipif instead of an unnumbered point-to-point ipif. 5448 */ 5449 rw_enter(&ill_g_lock, RW_READER); 5450 ill = ILL_START_WALK_V4(&ctx); 5451 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5452 GRAB_CONN_LOCK(q); 5453 mutex_enter(&ill->ill_lock); 5454 for (ipif = ill->ill_ipif; ipif != NULL; 5455 ipif = ipif->ipif_next) { 5456 /* Allow the ipif to be down */ 5457 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5458 (ipif->ipif_lcl_addr == if_addr) && 5459 (ipif->ipif_pp_dst_addr == dst)) { 5460 /* 5461 * The block comment at the start of ipif_down 5462 * explains the use of the macros used below 5463 */ 5464 if (IPIF_CAN_LOOKUP(ipif)) { 5465 ipif_refhold_locked(ipif); 5466 mutex_exit(&ill->ill_lock); 5467 RELEASE_CONN_LOCK(q); 5468 rw_exit(&ill_g_lock); 5469 return (ipif); 5470 } else if (IPIF_CAN_WAIT(ipif, q)) { 5471 ipsq = ill->ill_phyint->phyint_ipsq; 5472 mutex_enter(&ipsq->ipsq_lock); 5473 mutex_exit(&ill->ill_lock); 5474 rw_exit(&ill_g_lock); 5475 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5476 ill); 5477 mutex_exit(&ipsq->ipsq_lock); 5478 RELEASE_CONN_LOCK(q); 5479 *error = EINPROGRESS; 5480 return (NULL); 5481 } 5482 } 5483 } 5484 mutex_exit(&ill->ill_lock); 5485 RELEASE_CONN_LOCK(q); 5486 } 5487 rw_exit(&ill_g_lock); 5488 5489 /* lookup the ipif based on interface address */ 5490 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error); 5491 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5492 return (ipif); 5493 } 5494 5495 /* 5496 * Look for an ipif with the specified address. For point-to-point links 5497 * we look for matches on either the destination address or the local 5498 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5499 * is set. 5500 * Matches on a specific ill if match_ill is set.
5501 */ 5502 ipif_t * 5503 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 5504 mblk_t *mp, ipsq_func_t func, int *error) 5505 { 5506 ipif_t *ipif; 5507 ill_t *ill; 5508 boolean_t ptp = B_FALSE; 5509 ipsq_t *ipsq; 5510 ill_walk_context_t ctx; 5511 5512 if (error != NULL) 5513 *error = 0; 5514 5515 rw_enter(&ill_g_lock, RW_READER); 5516 /* 5517 * Repeat twice: first based on local addresses, and 5518 * the second time for point-to-point. 5519 */ 5520 repeat: 5521 ill = ILL_START_WALK_V4(&ctx); 5522 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5523 if (match_ill != NULL && ill != match_ill) { 5524 continue; 5525 } 5526 GRAB_CONN_LOCK(q); 5527 mutex_enter(&ill->ill_lock); 5528 for (ipif = ill->ill_ipif; ipif != NULL; 5529 ipif = ipif->ipif_next) { 5530 if (zoneid != ALL_ZONES && 5531 zoneid != ipif->ipif_zoneid && 5532 ipif->ipif_zoneid != ALL_ZONES) 5533 continue; 5534 /* Allow the ipif to be down */ 5535 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5536 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5537 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5538 (ipif->ipif_pp_dst_addr == addr))) { 5539 /* 5540 * The block comment at the start of ipif_down 5541 * explains the use of the macros used below 5542 */ 5543 if (IPIF_CAN_LOOKUP(ipif)) { 5544 ipif_refhold_locked(ipif); 5545 mutex_exit(&ill->ill_lock); 5546 RELEASE_CONN_LOCK(q); 5547 rw_exit(&ill_g_lock); 5548 return (ipif); 5549 } else if (IPIF_CAN_WAIT(ipif, q)) { 5550 ipsq = ill->ill_phyint->phyint_ipsq; 5551 mutex_enter(&ipsq->ipsq_lock); 5552 mutex_exit(&ill->ill_lock); 5553 rw_exit(&ill_g_lock); 5554 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5555 ill); 5556 mutex_exit(&ipsq->ipsq_lock); 5557 RELEASE_CONN_LOCK(q); 5558 *error = EINPROGRESS; 5559 return (NULL); 5560 } 5561 } 5562 } 5563 mutex_exit(&ill->ill_lock); 5564 RELEASE_CONN_LOCK(q); 5565 } 5566 5567 /* Now try the ptp case */ 5568 if (ptp) { 5569 rw_exit(&ill_g_lock); 5570 if (error != NULL) 5571 *error = ENXIO; 5572 return (NULL); 5573 } 5574 ptp = B_TRUE; 5575 goto repeat; 5576 } 5577 5578 /* 5579 * Look for an ipif that matches the specified remote address, i.e. the 5580 * ipif that would receive the specified packet. 5581 * First look for directly connected interfaces and then do a recursive 5582 * IRE lookup and pick the first ipif corresponding to the source address in the 5583 * ire. 5584 * Returns: held ipif 5585 */ 5586 ipif_t * 5587 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 5588 { 5589 ipif_t *ipif; 5590 ire_t *ire; 5591 5592 ASSERT(!ill->ill_isv6); 5593 5594 /* 5595 * Someone could be changing this ipif currently, or change it 5596 * after we return this. Thus a few packets could use the old 5597 * values. However, structure updates/creates (ire, ilg, ilm etc.) 5598 * will atomically be updated or cleaned up with the new value, 5599 * so we don't need a lock to check the flags or other attrs below.
5600 */ 5601 mutex_enter(&ill->ill_lock); 5602 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5603 if (!IPIF_CAN_LOOKUP(ipif)) 5604 continue; 5605 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 5606 ipif->ipif_zoneid != ALL_ZONES) 5607 continue; 5608 /* Allow the ipif to be down */ 5609 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 5610 if ((ipif->ipif_pp_dst_addr == addr) || 5611 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 5612 ipif->ipif_lcl_addr == addr)) { 5613 ipif_refhold_locked(ipif); 5614 mutex_exit(&ill->ill_lock); 5615 return (ipif); 5616 } 5617 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 5618 ipif_refhold_locked(ipif); 5619 mutex_exit(&ill->ill_lock); 5620 return (ipif); 5621 } 5622 } 5623 mutex_exit(&ill->ill_lock); 5624 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 5625 NULL, MATCH_IRE_RECURSIVE); 5626 if (ire != NULL) { 5627 /* 5628 * The callers of this function want to know the 5629 * interface on which they have to send the replies 5630 * back. For IRE_CACHES that have ire_stq and ire_ipif 5631 * derived from different ills, we really don't care 5632 * what we return here. 5633 */ 5634 ipif = ire->ire_ipif; 5635 if (ipif != NULL) { 5636 ipif_refhold(ipif); 5637 ire_refrele(ire); 5638 return (ipif); 5639 } 5640 ire_refrele(ire); 5641 } 5642 /* Pick the first interface */ 5643 ipif = ipif_get_next_ipif(NULL, ill); 5644 return (ipif); 5645 } 5646 5647 /* 5648 * This func does not prevent refcnt from increasing. But if 5649 * the caller has taken steps to that effect, then this func 5650 * can be used to determine whether the ill has become quiescent. 5651 */ 5652 boolean_t 5653 ill_is_quiescent(ill_t *ill) 5654 { 5655 ipif_t *ipif; 5656 5657 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5658 5659 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5660 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5661 return (B_FALSE); 5662 } 5663 } 5664 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 5665 ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 || 5666 ill->ill_mrtun_refcnt != 0) { 5667 return (B_FALSE); 5668 } 5669 return (B_TRUE); 5670 } 5671 5672 /* 5673 * This func does not prevent refcnt from increasing. But if 5674 * the caller has taken steps to that effect, then this func 5675 * can be used to determine whether the ipif has become quiescent. 5676 */ 5677 static boolean_t 5678 ipif_is_quiescent(ipif_t *ipif) 5679 { 5680 ill_t *ill; 5681 5682 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5683 5684 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5685 return (B_FALSE); 5686 } 5687 5688 ill = ipif->ipif_ill; 5689 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 5690 ill->ill_logical_down) { 5691 return (B_TRUE); 5692 } 5693 5694 /* This is the last ipif going down or being deleted on this ill */ 5695 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 5696 return (B_FALSE); 5697 } 5698 5699 return (B_TRUE); 5700 } 5701 5702 /* 5703 * This func does not prevent refcnt from increasing. But if 5704 * the caller has taken steps to that effect, then this func 5705 * can be used to determine whether the ipifs marked with IPIF_MOVING 5706 * have become quiescent and can be moved in a failover/failback.
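 *
 * For instance, a failover path that has already arranged for the
 * refcnts to stop increasing might poll along these lines (a sketch
 * only, not the actual code; the real implementation parks the
 * operation via ipsq_waitfor/ILL_MOVE_OK, as handled in
 * ipif_ill_refrele_tail() below):
 *
 *	mutex_enter(&ill->ill_lock);
 *	while (ill_quiescent_to_move(ill) != NULL)
 *		cv_wait(&ill->ill_cv, &ill->ill_lock);
 *	mutex_exit(&ill->ill_lock);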
5707 */ 5708 static ipif_t * 5709 ill_quiescent_to_move(ill_t *ill) 5710 { 5711 ipif_t *ipif; 5712 5713 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5714 5715 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5716 if (ipif->ipif_state_flags & IPIF_MOVING) { 5717 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5718 return (ipif); 5719 } 5720 } 5721 } 5722 return (NULL); 5723 } 5724 5725 /* 5726 * The ipif/ill/ire has been refreled. Do the tail processing. 5727 * Determine if the ipif or ill in question has become quiescent and if so 5728 * wake up close and/or restart any queued pending ioctl that is waiting 5729 * for the ipif_down (or ill_down). 5730 */ 5731 void 5732 ipif_ill_refrele_tail(ill_t *ill) 5733 { 5734 mblk_t *mp; 5735 conn_t *connp; 5736 ipsq_t *ipsq; 5737 ipif_t *ipif; 5738 5739 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5740 5741 if ((ill->ill_state_flags & ILL_CONDEMNED) && 5742 ill_is_quiescent(ill)) { 5743 /* ill_close may be waiting */ 5744 cv_broadcast(&ill->ill_cv); 5745 } 5746 5747 /* ipsq can't change because ill_lock is held */ 5748 ipsq = ill->ill_phyint->phyint_ipsq; 5749 if (ipsq->ipsq_waitfor == 0) { 5750 /* Not waiting for anything, just return. */ 5751 mutex_exit(&ill->ill_lock); 5752 return; 5753 } 5754 ASSERT(ipsq->ipsq_pending_mp != NULL && 5755 ipsq->ipsq_pending_ipif != NULL); 5756 /* 5757 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 5758 * Last ipif going down needs to down the ill, so ill_ire_cnt must 5759 * be zero for restarting an ioctl that ends up downing the ill. 5760 */ 5761 ipif = ipsq->ipsq_pending_ipif; 5762 if (ipif->ipif_ill != ill) { 5763 /* The ioctl is pending on some other ill. */ 5764 mutex_exit(&ill->ill_lock); 5765 return; 5766 } 5767 5768 switch (ipsq->ipsq_waitfor) { 5769 case IPIF_DOWN: 5770 case IPIF_FREE: 5771 if (!ipif_is_quiescent(ipif)) { 5772 mutex_exit(&ill->ill_lock); 5773 return; 5774 } 5775 break; 5776 5777 case ILL_DOWN: 5778 case ILL_FREE: 5779 /* 5780 * case ILL_FREE arises only for loopback. Otherwise ill_delete 5781 * waits synchronously in ip_close, and no message is queued in 5782 * ipsq_pending_mp at all in this case. 5783 */ 5784 if (!ill_is_quiescent(ill)) { 5785 mutex_exit(&ill->ill_lock); 5786 return; 5787 } 5788 5789 break; 5790 5791 case ILL_MOVE_OK: 5792 if (ill_quiescent_to_move(ill) != NULL) { 5793 mutex_exit(&ill->ill_lock); 5794 return; 5795 } 5796 5797 break; 5798 default: 5799 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", 5800 (void *)ipsq, ipsq->ipsq_waitfor); 5801 } 5802 5803 /* 5804 * Incr refcnt for the qwriter_ip call below which 5805 * does a refrele 5806 */ 5807 ill_refhold_locked(ill); 5808 mutex_exit(&ill->ill_lock); 5809 5810 mp = ipsq_pending_mp_get(ipsq, &connp); 5811 ASSERT(mp != NULL); 5812 5813 switch (mp->b_datap->db_type) { 5814 case M_ERROR: 5815 case M_HANGUP: 5816 (void) qwriter_ip(NULL, ill, ill->ill_rq, mp, 5817 ipif_all_down_tail, CUR_OP, B_TRUE); 5818 return; 5819 5820 case M_IOCTL: 5821 case M_IOCDATA: 5822 (void) qwriter_ip(NULL, ill, 5823 (connp != NULL ?
CONNP_TO_WQ(connp) : ill->ill_wq), mp, 5824 ip_reprocess_ioctl, CUR_OP, B_TRUE); 5825 return; 5826 5827 default: 5828 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5829 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5830 } 5831 } 5832 5833 #ifdef ILL_DEBUG 5834 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5835 void 5836 th_trace_rrecord(th_trace_t *th_trace) 5837 { 5838 tr_buf_t *tr_buf; 5839 uint_t lastref; 5840 5841 lastref = th_trace->th_trace_lastref; 5842 lastref++; 5843 if (lastref == TR_BUF_MAX) 5844 lastref = 0; 5845 th_trace->th_trace_lastref = lastref; 5846 tr_buf = &th_trace->th_trbuf[lastref]; 5847 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH); 5848 } 5849 5850 th_trace_t * 5851 th_trace_ipif_lookup(ipif_t *ipif) 5852 { 5853 int bucket_id; 5854 th_trace_t *th_trace; 5855 5856 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5857 5858 bucket_id = IP_TR_HASH(curthread); 5859 ASSERT(bucket_id < IP_TR_HASH_MAX); 5860 5861 for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL; 5862 th_trace = th_trace->th_next) { 5863 if (th_trace->th_id == curthread) 5864 return (th_trace); 5865 } 5866 return (NULL); 5867 } 5868 5869 void 5870 ipif_trace_ref(ipif_t *ipif) 5871 { 5872 int bucket_id; 5873 th_trace_t *th_trace; 5874 5875 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5876 5877 if (ipif->ipif_trace_disable) 5878 return; 5879 5880 /* 5881 * Attempt to locate the trace buffer for the curthread. 5882 * If it does not exist, then allocate a new trace buffer 5883 * and link it in list of trace bufs for this ipif, at the head 5884 */ 5885 th_trace = th_trace_ipif_lookup(ipif); 5886 if (th_trace == NULL) { 5887 bucket_id = IP_TR_HASH(curthread); 5888 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 5889 KM_NOSLEEP); 5890 if (th_trace == NULL) { 5891 ipif->ipif_trace_disable = B_TRUE; 5892 ipif_trace_cleanup(ipif); 5893 return; 5894 } 5895 th_trace->th_id = curthread; 5896 th_trace->th_next = ipif->ipif_trace[bucket_id]; 5897 th_trace->th_prev = &ipif->ipif_trace[bucket_id]; 5898 if (th_trace->th_next != NULL) 5899 th_trace->th_next->th_prev = &th_trace->th_next; 5900 ipif->ipif_trace[bucket_id] = th_trace; 5901 } 5902 ASSERT(th_trace->th_refcnt >= 0 && 5903 th_trace->th_refcnt < TR_BUF_MAX -1); 5904 th_trace->th_refcnt++; 5905 th_trace_rrecord(th_trace); 5906 } 5907 5908 void 5909 ipif_untrace_ref(ipif_t *ipif) 5910 { 5911 th_trace_t *th_trace; 5912 5913 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5914 5915 if (ipif->ipif_trace_disable) 5916 return; 5917 th_trace = th_trace_ipif_lookup(ipif); 5918 ASSERT(th_trace != NULL); 5919 ASSERT(th_trace->th_refcnt > 0); 5920 5921 th_trace->th_refcnt--; 5922 th_trace_rrecord(th_trace); 5923 } 5924 5925 th_trace_t * 5926 th_trace_ill_lookup(ill_t *ill) 5927 { 5928 th_trace_t *th_trace; 5929 int bucket_id; 5930 5931 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5932 5933 bucket_id = IP_TR_HASH(curthread); 5934 ASSERT(bucket_id < IP_TR_HASH_MAX); 5935 5936 for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL; 5937 th_trace = th_trace->th_next) { 5938 if (th_trace->th_id == curthread) 5939 return (th_trace); 5940 } 5941 return (NULL); 5942 } 5943 5944 void 5945 ill_trace_ref(ill_t *ill) 5946 { 5947 int bucket_id; 5948 th_trace_t *th_trace; 5949 5950 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5951 if (ill->ill_trace_disable) 5952 return; 5953 /* 5954 * Attempt to locate the trace buffer for the curthread. 
5955 * If it does not exist, then allocate a new trace buffer 5956 * and link it in list of trace bufs for this ill, at the head 5957 */ 5958 th_trace = th_trace_ill_lookup(ill); 5959 if (th_trace == NULL) { 5960 bucket_id = IP_TR_HASH(curthread); 5961 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 5962 KM_NOSLEEP); 5963 if (th_trace == NULL) { 5964 ill->ill_trace_disable = B_TRUE; 5965 ill_trace_cleanup(ill); 5966 return; 5967 } 5968 th_trace->th_id = curthread; 5969 th_trace->th_next = ill->ill_trace[bucket_id]; 5970 th_trace->th_prev = &ill->ill_trace[bucket_id]; 5971 if (th_trace->th_next != NULL) 5972 th_trace->th_next->th_prev = &th_trace->th_next; 5973 ill->ill_trace[bucket_id] = th_trace; 5974 } 5975 ASSERT(th_trace->th_refcnt >= 0 && 5976 th_trace->th_refcnt < TR_BUF_MAX - 1); 5977 5978 th_trace->th_refcnt++; 5979 th_trace_rrecord(th_trace); 5980 } 5981 5982 void 5983 ill_untrace_ref(ill_t *ill) 5984 { 5985 th_trace_t *th_trace; 5986 5987 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5988 5989 if (ill->ill_trace_disable) 5990 return; 5991 th_trace = th_trace_ill_lookup(ill); 5992 ASSERT(th_trace != NULL); 5993 ASSERT(th_trace->th_refcnt > 0); 5994 5995 th_trace->th_refcnt--; 5996 th_trace_rrecord(th_trace); 5997 } 5998 5999 /* 6000 * Verify that this thread has no refs to the ipif and free 6001 * the trace buffers 6002 */ 6003 /* ARGSUSED */ 6004 void 6005 ipif_thread_exit(ipif_t *ipif, void *dummy) 6006 { 6007 th_trace_t *th_trace; 6008 6009 mutex_enter(&ipif->ipif_ill->ill_lock); 6010 6011 th_trace = th_trace_ipif_lookup(ipif); 6012 if (th_trace == NULL) { 6013 mutex_exit(&ipif->ipif_ill->ill_lock); 6014 return; 6015 } 6016 ASSERT(th_trace->th_refcnt == 0); 6017 /* unlink th_trace and free it */ 6018 *th_trace->th_prev = th_trace->th_next; 6019 if (th_trace->th_next != NULL) 6020 th_trace->th_next->th_prev = th_trace->th_prev; 6021 th_trace->th_next = NULL; 6022 th_trace->th_prev = NULL; 6023 kmem_free(th_trace, sizeof (th_trace_t)); 6024 6025 mutex_exit(&ipif->ipif_ill->ill_lock); 6026 } 6027 6028 /* 6029 * Verify that this thread has no refs to the ill and free 6030 * the trace buffers 6031 */ 6032 /* ARGSUSED */ 6033 void 6034 ill_thread_exit(ill_t *ill, void *dummy) 6035 { 6036 th_trace_t *th_trace; 6037 6038 mutex_enter(&ill->ill_lock); 6039 6040 th_trace = th_trace_ill_lookup(ill); 6041 if (th_trace == NULL) { 6042 mutex_exit(&ill->ill_lock); 6043 return; 6044 } 6045 ASSERT(th_trace->th_refcnt == 0); 6046 /* unlink th_trace and free it */ 6047 *th_trace->th_prev = th_trace->th_next; 6048 if (th_trace->th_next != NULL) 6049 th_trace->th_next->th_prev = th_trace->th_prev; 6050 th_trace->th_next = NULL; 6051 th_trace->th_prev = NULL; 6052 kmem_free(th_trace, sizeof (th_trace_t)); 6053 6054 mutex_exit(&ill->ill_lock); 6055 } 6056 #endif 6057 6058 #ifdef ILL_DEBUG 6059 void 6060 ip_thread_exit(void) 6061 { 6062 ill_t *ill; 6063 ipif_t *ipif; 6064 ill_walk_context_t ctx; 6065 6066 rw_enter(&ill_g_lock, RW_READER); 6067 ill = ILL_START_WALK_ALL(&ctx); 6068 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6069 for (ipif = ill->ill_ipif; ipif != NULL; 6070 ipif = ipif->ipif_next) { 6071 ipif_thread_exit(ipif, NULL); 6072 } 6073 ill_thread_exit(ill, NULL); 6074 } 6075 rw_exit(&ill_g_lock); 6076 6077 ire_walk(ire_thread_exit, NULL); 6078 ndp_walk_common(&ndp4, NULL, nce_thread_exit, NULL, B_FALSE); 6079 ndp_walk_common(&ndp6, NULL, nce_thread_exit, NULL, B_FALSE); 6080 } 6081 6082 /* 6083 * Called when ipif is unplumbed or when memory alloc fails 6084 */ 6085 void 6086 
ipif_trace_cleanup(ipif_t *ipif) 6087 { 6088 int i; 6089 th_trace_t *th_trace; 6090 th_trace_t *th_trace_next; 6091 6092 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6093 for (th_trace = ipif->ipif_trace[i]; th_trace != NULL; 6094 th_trace = th_trace_next) { 6095 th_trace_next = th_trace->th_next; 6096 kmem_free(th_trace, sizeof (th_trace_t)); 6097 } 6098 ipif->ipif_trace[i] = NULL; 6099 } 6100 } 6101 6102 /* 6103 * Called when ill is unplumbed or when memory alloc fails 6104 */ 6105 void 6106 ill_trace_cleanup(ill_t *ill) 6107 { 6108 int i; 6109 th_trace_t *th_trace; 6110 th_trace_t *th_trace_next; 6111 6112 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6113 for (th_trace = ill->ill_trace[i]; th_trace != NULL; 6114 th_trace = th_trace_next) { 6115 th_trace_next = th_trace->th_next; 6116 kmem_free(th_trace, sizeof (th_trace_t)); 6117 } 6118 ill->ill_trace[i] = NULL; 6119 } 6120 } 6121 6122 #else 6123 void ip_thread_exit(void) {} 6124 #endif 6125 6126 void 6127 ipif_refhold_locked(ipif_t *ipif) 6128 { 6129 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6130 ipif->ipif_refcnt++; 6131 IPIF_TRACE_REF(ipif); 6132 } 6133 6134 void 6135 ipif_refhold(ipif_t *ipif) 6136 { 6137 ill_t *ill; 6138 6139 ill = ipif->ipif_ill; 6140 mutex_enter(&ill->ill_lock); 6141 ipif->ipif_refcnt++; 6142 IPIF_TRACE_REF(ipif); 6143 mutex_exit(&ill->ill_lock); 6144 } 6145 6146 /* 6147 * Must not be called while holding any locks. Otherwise if this is 6148 * the last reference to be released there is a chance of recursive mutex 6149 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6150 * to restart an ioctl. 6151 */ 6152 void 6153 ipif_refrele(ipif_t *ipif) 6154 { 6155 ill_t *ill; 6156 6157 ill = ipif->ipif_ill; 6158 6159 mutex_enter(&ill->ill_lock); 6160 ASSERT(ipif->ipif_refcnt != 0); 6161 ipif->ipif_refcnt--; 6162 IPIF_UNTRACE_REF(ipif); 6163 if (ipif->ipif_refcnt != 0) { 6164 mutex_exit(&ill->ill_lock); 6165 return; 6166 } 6167 6168 /* Drops the ill_lock */ 6169 ipif_ill_refrele_tail(ill); 6170 } 6171 6172 ipif_t * 6173 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6174 { 6175 ipif_t *ipif; 6176 6177 mutex_enter(&ill->ill_lock); 6178 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 6179 ipif != NULL; ipif = ipif->ipif_next) { 6180 if (!IPIF_CAN_LOOKUP(ipif)) 6181 continue; 6182 ipif_refhold_locked(ipif); 6183 mutex_exit(&ill->ill_lock); 6184 return (ipif); 6185 } 6186 mutex_exit(&ill->ill_lock); 6187 return (NULL); 6188 } 6189 6190 /* 6191 * TODO: make this table extendible at run time 6192 * Return a pointer to the mac type info for 'mac_type' 6193 */ 6194 static ip_m_t * 6195 ip_m_lookup(t_uscalar_t mac_type) 6196 { 6197 ip_m_t *ipm; 6198 6199 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6200 if (ipm->ip_m_mac_type == mac_type) 6201 return (ipm); 6202 return (NULL); 6203 } 6204 6205 /* 6206 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6207 * ipif_arg is passed in to associate it with the correct interface. 6208 * We may need to restart this operation if the ipif cannot be looked up 6209 * due to an exclusive operation that is currently in progress. 
The restart 6210 * entry point is specified by 'func'. 6211 */ 6212 int 6213 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6214 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6215 ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp, 6216 ipsq_func_t func, struct rtsa_s *sp) 6217 { 6218 ire_t *ire; 6219 ire_t *gw_ire = NULL; 6220 ipif_t *ipif = NULL; 6221 boolean_t ipif_refheld = B_FALSE; 6222 uint_t type; 6223 int match_flags = MATCH_IRE_TYPE; 6224 int error; 6225 tsol_gc_t *gc = NULL; 6226 tsol_gcgrp_t *gcgrp = NULL; 6227 boolean_t gcgrp_xtraref = B_FALSE; 6228 6229 ip1dbg(("ip_rt_add:")); 6230 6231 if (ire_arg != NULL) 6232 *ire_arg = NULL; 6233 6234 /* 6235 * If this is the case of RTF_HOST being set, then we set the netmask 6236 * to all ones (regardless of whether one was supplied). 6237 */ 6238 if (flags & RTF_HOST) 6239 mask = IP_HOST_MASK; 6240 6241 /* 6242 * Prevent routes with a zero gateway from being created (since 6243 * interfaces can currently be plumbed and brought up with no assigned 6244 * address). 6245 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0. 6246 */ 6247 if (gw_addr == 0 && src_ipif == NULL) 6248 return (ENETUNREACH); 6249 /* 6250 * Get the ipif, if any, corresponding to the gw_addr 6251 */ 6252 if (gw_addr != 0) { 6253 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, 6254 &error); 6255 if (ipif != NULL) { 6256 if (IS_VNI(ipif->ipif_ill)) { 6257 ipif_refrele(ipif); 6258 return (EINVAL); 6259 } 6260 ipif_refheld = B_TRUE; 6261 } else if (error == EINPROGRESS) { 6262 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6263 return (EINPROGRESS); 6264 } else { 6265 error = 0; 6266 } 6267 } 6268 6269 if (ipif != NULL) { 6270 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6271 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6272 } else { 6273 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6274 } 6275 6276 /* 6277 * GateD will attempt to create routes with a loopback interface 6278 * address as the gateway and with RTF_GATEWAY set. We allow 6279 * these routes to be added, but create them as interface routes 6280 * since the gateway is an interface address. 6281 */ 6282 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6283 flags &= ~RTF_GATEWAY; 6284 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6285 mask == IP_HOST_MASK) { 6286 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6287 ALL_ZONES, NULL, match_flags); 6288 if (ire != NULL) { 6289 ire_refrele(ire); 6290 if (ipif_refheld) 6291 ipif_refrele(ipif); 6292 return (EEXIST); 6293 } 6294 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x " 6295 "for 0x%x\n", (void *)ipif, 6296 ipif->ipif_ire_type, 6297 ntohl(ipif->ipif_lcl_addr))); 6298 ire = ire_create( 6299 (uchar_t *)&dst_addr, /* dest address */ 6300 (uchar_t *)&mask, /* mask */ 6301 (uchar_t *)&ipif->ipif_src_addr, 6302 NULL, /* no gateway */ 6303 NULL, 6304 &ipif->ipif_mtu, 6305 NULL, 6306 ipif->ipif_rq, /* recv-from queue */ 6307 NULL, /* no send-to queue */ 6308 ipif->ipif_ire_type, /* LOOPBACK */ 6309 NULL, 6310 ipif, 6311 NULL, 6312 0, 6313 0, 6314 0, 6315 (ipif->ipif_flags & IPIF_PRIVATE) ?
6316 RTF_PRIVATE : 0, 6317 &ire_uinfo_null, 6318 NULL, 6319 NULL); 6320 6321 if (ire == NULL) { 6322 if (ipif_refheld) 6323 ipif_refrele(ipif); 6324 return (ENOMEM); 6325 } 6326 error = ire_add(&ire, q, mp, func, B_FALSE); 6327 if (error == 0) 6328 goto save_ire; 6329 if (ipif_refheld) 6330 ipif_refrele(ipif); 6331 return (error); 6332 6333 } 6334 } 6335 6336 /* 6337 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6338 * and the gateway address provided is one of the system's interface 6339 * addresses. By using the routing socket interface and supplying an 6340 * RTA_IFP sockaddr with an interface index, an alternate method of 6341 * specifying an interface route to be created is available which uses 6342 * the interface index that specifies the outgoing interface rather than 6343 * the address of an outgoing interface (which may not be able to 6344 * uniquely identify an interface). When coupled with the RTF_GATEWAY 6345 * flag, routes can be specified which not only specify the next-hop to 6346 * be used when routing to a certain prefix, but also which outgoing 6347 * interface should be used. 6348 * 6349 * Previously, interfaces would have unique addresses assigned to them 6350 * and so the address assigned to a particular interface could be used 6351 * to identify a particular interface. One exception to this was the 6352 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6353 * 6354 * With the advent of IPv6 and its link-local addresses, this 6355 * restriction was relaxed and interfaces could share addresses between 6356 * themselves. In fact, typically all of the link-local interfaces on 6357 * an IPv6 node or router will have the same link-local address. In 6358 * order to differentiate between these interfaces, the use of an 6359 * interface index is necessary and this index can be carried inside a 6360 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6361 * of using the interface index, however, is that all of the ipif's that 6362 * are part of an ill have the same index and so the RTA_IFP sockaddr 6363 * cannot be used to differentiate between ipif's (or logical 6364 * interfaces) that belong to the same ill (physical interface). 6365 * 6366 * For example, in the following case involving IPv4 interfaces and 6367 * logical interfaces 6368 * 6369 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6370 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6371 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6372 * 6373 * the ipif's corresponding to each of these interface routes can be 6374 * uniquely identified by the "gateway" (actually interface address). 6375 * 6376 * In this case involving multiple IPv6 default routes to a particular 6377 * link-local gateway, the use of RTA_IFP is necessary to specify which 6378 * default route is of interest: 6379 * 6380 * default fe80::123:4567:89ab:cdef U if0 6381 * default fe80::123:4567:89ab:cdef U if1 6382 */ 6383 6384 /* RTF_GATEWAY not set */ 6385 if (!(flags & RTF_GATEWAY)) { 6386 queue_t *stq; 6387 queue_t *rfq = NULL; 6388 ill_t *in_ill = NULL; 6389 6390 if (sp != NULL) { 6391 ip2dbg(("ip_rt_add: gateway security attributes " 6392 "cannot be set with interface route\n")); 6393 if (ipif_refheld) 6394 ipif_refrele(ipif); 6395 return (EINVAL); 6396 } 6397 6398 /* 6399 * As the interface index specified with the RTA_IFP sockaddr is 6400 * the same for all ipif's off of an ill, the matching logic 6401 * below uses MATCH_IRE_ILL if such an index was specified. 
6402 * This means that routes sharing the same prefix when added 6403 * using a RTA_IFP sockaddr must have distinct interface 6404 * indices (namely, they must be on distinct ill's). 6405 * 6406 * On the other hand, since the gateway address will usually be 6407 * different for each ipif on the system, the matching logic 6408 * uses MATCH_IRE_IPIF in the case of a traditional interface 6409 * route. This means that interface routes for the same prefix 6410 * can be created if they belong to distinct ipif's and if a 6411 * RTA_IFP sockaddr is not present. 6412 */ 6413 if (ipif_arg != NULL) { 6414 if (ipif_refheld) { 6415 ipif_refrele(ipif); 6416 ipif_refheld = B_FALSE; 6417 } 6418 ipif = ipif_arg; 6419 match_flags |= MATCH_IRE_ILL; 6420 } else { 6421 /* 6422 * Check the ipif corresponding to the gw_addr 6423 */ 6424 if (ipif == NULL) 6425 return (ENETUNREACH); 6426 match_flags |= MATCH_IRE_IPIF; 6427 } 6428 ASSERT(ipif != NULL); 6429 /* 6430 * If src_ipif is not NULL, we have to create 6431 * an ire with non-null ire_in_ill value 6432 */ 6433 if (src_ipif != NULL) { 6434 in_ill = src_ipif->ipif_ill; 6435 } 6436 6437 /* 6438 * We check for an existing entry at this point. 6439 * 6440 * Since a netmask isn't passed in via the ioctl interface 6441 * (SIOCADDRT), we don't check for a matching netmask in that 6442 * case. 6443 */ 6444 if (!ioctl_msg) 6445 match_flags |= MATCH_IRE_MASK; 6446 if (src_ipif != NULL) { 6447 /* Look up in the special table */ 6448 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6449 ipif, src_ipif->ipif_ill, match_flags); 6450 } else { 6451 ire = ire_ftable_lookup(dst_addr, mask, 0, 6452 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6453 NULL, match_flags); 6454 } 6455 if (ire != NULL) { 6456 ire_refrele(ire); 6457 if (ipif_refheld) 6458 ipif_refrele(ipif); 6459 return (EEXIST); 6460 } 6461 6462 if (src_ipif != NULL) { 6463 /* 6464 * Create the special ire for the IRE table 6465 * which hangs out of ire_in_ill. This ire 6466 * is in-between IRE_CACHE and IRE_INTERFACE. 6467 * Thus rfq is non-NULL. 6468 */ 6469 rfq = ipif->ipif_rq; 6470 } 6471 /* Create the usual interface ires */ 6472 6473 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6474 ? ipif->ipif_rq : ipif->ipif_wq; 6475 6476 /* 6477 * Create a copy of the IRE_LOOPBACK, 6478 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6479 * the modified address and netmask. 6480 */ 6481 ire = ire_create( 6482 (uchar_t *)&dst_addr, 6483 (uint8_t *)&mask, 6484 (uint8_t *)&ipif->ipif_src_addr, 6485 NULL, 6486 NULL, 6487 &ipif->ipif_mtu, 6488 NULL, 6489 rfq, 6490 stq, 6491 ipif->ipif_net_type, 6492 ipif->ipif_resolver_mp, 6493 ipif, 6494 in_ill, 6495 0, 6496 0, 6497 0, 6498 flags, 6499 &ire_uinfo_null, 6500 NULL, 6501 NULL); 6502 if (ire == NULL) { 6503 if (ipif_refheld) 6504 ipif_refrele(ipif); 6505 return (ENOMEM); 6506 } 6507 6508 /* 6509 * Some software (for example, GateD and Sun Cluster) attempts 6510 * to create (what amount to) IRE_PREFIX routes with the 6511 * loopback address as the gateway. This is primarily done to 6512 * set up prefixes with the RTF_REJECT flag set (for example, 6513 * when generating aggregate routes.) 6514 * 6515 * If the IRE type (as defined by ipif->ipif_net_type) is 6516 * IRE_LOOPBACK, then we map the request into a 6517 * IRE_IF_NORESOLVER. 6518 * 6519 * Needless to say, the real IRE_LOOPBACK is NOT created by this 6520 * routine, but rather using ire_create() directly. 
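 *
 * As an illustration (assuming route(1M) syntax; the exact options a
 * routing daemon passes may differ), a request such as
 *
 *	route add 198.51.100.0 -netmask 255.255.255.0 127.0.0.1 -reject
 *
 * arrives here with the loopback address as the gateway, and ends up
 * installing an IRE_IF_NORESOLVER prefix route that carries RTF_REJECT
 * rather than a second IRE_LOOPBACK.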
6521 * 6522 */ 6523 if (ipif->ipif_net_type == IRE_LOOPBACK) 6524 ire->ire_type = IRE_IF_NORESOLVER; 6525 6526 error = ire_add(&ire, q, mp, func, B_FALSE); 6527 if (error == 0) 6528 goto save_ire; 6529 6530 /* 6531 * In the result of failure, ire_add() will have already 6532 * deleted the ire in question, so there is no need to 6533 * do that here. 6534 */ 6535 if (ipif_refheld) 6536 ipif_refrele(ipif); 6537 return (error); 6538 } 6539 if (ipif_refheld) { 6540 ipif_refrele(ipif); 6541 ipif_refheld = B_FALSE; 6542 } 6543 6544 if (src_ipif != NULL) { 6545 /* RTA_SRCIFP is not supported on RTF_GATEWAY */ 6546 ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n")); 6547 return (EINVAL); 6548 } 6549 /* 6550 * Get an interface IRE for the specified gateway. 6551 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 6552 * gateway, it is currently unreachable and we fail the request 6553 * accordingly. 6554 */ 6555 ipif = ipif_arg; 6556 if (ipif_arg != NULL) 6557 match_flags |= MATCH_IRE_ILL; 6558 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 6559 ALL_ZONES, 0, NULL, match_flags); 6560 if (gw_ire == NULL) 6561 return (ENETUNREACH); 6562 6563 /* 6564 * We create one of three types of IREs as a result of this request 6565 * based on the netmask. A netmask of all ones (which is automatically 6566 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 6567 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 6568 * created. Otherwise, an IRE_PREFIX route is created for the 6569 * destination prefix. 6570 */ 6571 if (mask == IP_HOST_MASK) 6572 type = IRE_HOST; 6573 else if (mask == 0) 6574 type = IRE_DEFAULT; 6575 else 6576 type = IRE_PREFIX; 6577 6578 /* check for a duplicate entry */ 6579 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 6580 NULL, ALL_ZONES, 0, NULL, 6581 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW); 6582 if (ire != NULL) { 6583 ire_refrele(gw_ire); 6584 ire_refrele(ire); 6585 return (EEXIST); 6586 } 6587 6588 /* Security attribute exists */ 6589 if (sp != NULL) { 6590 tsol_gcgrp_addr_t ga; 6591 6592 /* find or create the gateway credentials group */ 6593 ga.ga_af = AF_INET; 6594 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 6595 6596 /* we hold reference to it upon success */ 6597 gcgrp = gcgrp_lookup(&ga, B_TRUE); 6598 if (gcgrp == NULL) { 6599 ire_refrele(gw_ire); 6600 return (ENOMEM); 6601 } 6602 6603 /* 6604 * Create and add the security attribute to the group; a 6605 * reference to the group is made upon allocating a new 6606 * entry successfully. If it finds an already-existing 6607 * entry for the security attribute in the group, it simply 6608 * returns it and no new reference is made to the group. 6609 */ 6610 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 6611 if (gc == NULL) { 6612 /* release reference held by gcgrp_lookup */ 6613 GCGRP_REFRELE(gcgrp); 6614 ire_refrele(gw_ire); 6615 return (ENOMEM); 6616 } 6617 } 6618 6619 /* Create the IRE. */ 6620 ire = ire_create( 6621 (uchar_t *)&dst_addr, /* dest address */ 6622 (uchar_t *)&mask, /* mask */ 6623 /* src address assigned by the caller? */ 6624 (uchar_t *)(((src_addr != INADDR_ANY) && 6625 (flags & RTF_SETSRC)) ? 
&src_addr : NULL), 6626 (uchar_t *)&gw_addr, /* gateway address */ 6627 NULL, /* no in-srcaddress */ 6628 &gw_ire->ire_max_frag, 6629 NULL, /* no Fast Path header */ 6630 NULL, /* no recv-from queue */ 6631 NULL, /* no send-to queue */ 6632 (ushort_t)type, /* IRE type */ 6633 NULL, 6634 ipif_arg, 6635 NULL, 6636 0, 6637 0, 6638 0, 6639 flags, 6640 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 6641 gc, /* security attribute */ 6642 NULL); 6643 /* 6644 * The ire holds a reference to the 'gc' and the 'gc' holds a 6645 * reference to the 'gcgrp'. We can now release the extra reference 6646 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 6647 */ 6648 if (gcgrp_xtraref) 6649 GCGRP_REFRELE(gcgrp); 6650 if (ire == NULL) { 6651 if (gc != NULL) 6652 GC_REFRELE(gc); 6653 ire_refrele(gw_ire); 6654 return (ENOMEM); 6655 } 6656 6657 /* 6658 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 6659 * SunOS socket code does, but do we really want to allow 0.0.0.0? 6660 */ 6661 6662 /* Add the new IRE. */ 6663 error = ire_add(&ire, q, mp, func, B_FALSE); 6664 if (error != 0) { 6665 /* 6666 * In the event of failure, ire_add() will have already 6667 * deleted the ire in question, so there is no need to 6668 * do that here. 6669 */ 6670 ire_refrele(gw_ire); 6671 return (error); 6672 } 6673 6674 if (flags & RTF_MULTIRT) { 6675 /* 6676 * Invoke the CGTP (multirouting) filtering module 6677 * to add the dst address in the filtering database. 6678 * Replicated inbound packets coming from that address 6679 * will be filtered to discard the duplicates. 6680 * It is not necessary to call the CGTP filter hook 6681 * when the dst address is a broadcast or multicast, 6682 * because an IP source address cannot be a broadcast 6683 * or a multicast. 6684 */ 6685 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 6686 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 6687 if (ire_dst != NULL) { 6688 ip_cgtp_bcast_add(ire, ire_dst); 6689 ire_refrele(ire_dst); 6690 goto save_ire; 6691 } 6692 if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr)) { 6693 int res = ip_cgtp_filter_ops->cfo_add_dest_v4( 6694 ire->ire_addr, 6695 ire->ire_gateway_addr, 6696 ire->ire_src_addr, 6697 gw_ire->ire_src_addr); 6698 if (res != 0) { 6699 ire_refrele(gw_ire); 6700 ire_delete(ire); 6701 return (res); 6702 } 6703 } 6704 } 6705 6706 /* 6707 * Now that the prefix IRE entry has been created, delete any 6708 * existing gateway IRE cache entries as well as any IRE caches 6709 * using the gateway, and force them to be created through 6710 * ip_newroute. 6711 */ 6712 if (gc != NULL) { 6713 ASSERT(gcgrp != NULL); 6714 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES); 6715 } 6716 6717 save_ire: 6718 if (gw_ire != NULL) { 6719 ire_refrele(gw_ire); 6720 } 6721 /* 6722 * We do not do save_ire for routes added with the RTA_SRCIFP 6723 * flag. Such routes are only added and deleted by mipagent. 6724 * So, for simplicity of design, we refrain from saving 6725 * ires that are created with a srcif value. This may change 6726 * in the future if we find more usage of the srcifp feature. 6727 */ 6728 if (ipif != NULL && src_ipif == NULL) { 6729 /* 6730 * Save enough information so that we can recreate the IRE if 6731 * the interface goes down and then up. The metrics associated 6732 * with the route will be saved as well when rts_setmetrics() is 6733 * called after the IRE has been created. In the case where 6734 * memory cannot be allocated, none of this information will be 6735 * saved.
6736 */ 6737 ipif_save_ire(ipif, ire); 6738 } 6739 if (ioctl_msg) 6740 ip_rts_rtmsg(RTM_OLDADD, ire, 0); 6741 if (ire_arg != NULL) { 6742 /* 6743 * Store the ire that was successfully added into where ire_arg 6744 * points to so that callers don't have to look it up 6745 * themselves (but they are responsible for ire_refrele()ing 6746 * the ire when they are finished with it). 6747 */ 6748 *ire_arg = ire; 6749 } else { 6750 ire_refrele(ire); /* Held in ire_add */ 6751 } 6752 if (ipif_refheld) 6753 ipif_refrele(ipif); 6754 return (0); 6755 } 6756 6757 /* 6758 * ip_rt_delete is called to delete an IPv4 route. 6759 * ipif_arg is passed in to associate it with the correct interface. 6760 * src_ipif is passed to associate the incoming interface of the packet. 6761 * We may need to restart this operation if the ipif cannot be looked up 6762 * due to an exclusive operation that is currently in progress. The restart 6763 * entry point is specified by 'func' 6764 */ 6765 /* ARGSUSED4 */ 6766 int 6767 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6768 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6769 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func) 6770 { 6771 ire_t *ire = NULL; 6772 ipif_t *ipif; 6773 boolean_t ipif_refheld = B_FALSE; 6774 uint_t type; 6775 uint_t match_flags = MATCH_IRE_TYPE; 6776 int err = 0; 6777 6778 ip1dbg(("ip_rt_delete:")); 6779 /* 6780 * If this is the case of RTF_HOST being set, then we set the netmask 6781 * to all ones. Otherwise, we use the netmask if one was supplied. 6782 */ 6783 if (flags & RTF_HOST) { 6784 mask = IP_HOST_MASK; 6785 match_flags |= MATCH_IRE_MASK; 6786 } else if (rtm_addrs & RTA_NETMASK) { 6787 match_flags |= MATCH_IRE_MASK; 6788 } 6789 6790 /* 6791 * Note that RTF_GATEWAY is never set on a delete, therefore 6792 * we check if the gateway address is one of our interfaces first, 6793 * and fall back on RTF_GATEWAY routes. 6794 * 6795 * This makes it possible to delete an original 6796 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 6797 * 6798 * As the interface index specified with the RTA_IFP sockaddr is the 6799 * same for all ipif's off of an ill, the matching logic below uses 6800 * MATCH_IRE_ILL if such an index was specified. This means a route 6801 * sharing the same prefix and interface index as the the route 6802 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 6803 * is specified in the request. 6804 * 6805 * On the other hand, since the gateway address will usually be 6806 * different for each ipif on the system, the matching logic 6807 * uses MATCH_IRE_IPIF in the case of a traditional interface 6808 * route. This means that interface routes for the same prefix can be 6809 * uniquely identified if they belong to distinct ipif's and if a 6810 * RTA_IFP sockaddr is not present. 6811 * 6812 * For more detail on specifying routes by gateway address and by 6813 * interface index, see the comments in ip_rt_add(). 6814 * gw_addr could be zero in some cases when both RTA_SRCIFP and 6815 * RTA_IFP are specified. If RTA_SRCIFP is specified and both 6816 * RTA_IFP and gateway_addr are NULL/zero, then delete will not 6817 * succeed. 6818 */ 6819 if (src_ipif != NULL) { 6820 if (ipif_arg == NULL && gw_addr != 0) { 6821 ipif_arg = ipif_lookup_interface(gw_addr, dst_addr, 6822 q, mp, func, &err); 6823 if (ipif_arg != NULL) 6824 ipif_refheld = B_TRUE; 6825 } 6826 if (ipif_arg == NULL) { 6827 err = (err == EINPROGRESS) ? 
err : ESRCH; 6828 return (err); 6829 } 6830 ipif = ipif_arg; 6831 } else { 6832 ipif = ipif_lookup_interface(gw_addr, dst_addr, 6833 q, mp, func, &err); 6834 if (ipif != NULL) 6835 ipif_refheld = B_TRUE; 6836 else if (err == EINPROGRESS) 6837 return (err); 6838 else 6839 err = 0; 6840 } 6841 if (ipif != NULL) { 6842 if (ipif_arg != NULL) { 6843 if (ipif_refheld) { 6844 ipif_refrele(ipif); 6845 ipif_refheld = B_FALSE; 6846 } 6847 ipif = ipif_arg; 6848 match_flags |= MATCH_IRE_ILL; 6849 } else { 6850 match_flags |= MATCH_IRE_IPIF; 6851 } 6852 if (src_ipif != NULL) { 6853 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6854 ipif, src_ipif->ipif_ill, match_flags); 6855 } else { 6856 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 6857 ire = ire_ctable_lookup(dst_addr, 0, 6858 IRE_LOOPBACK, ipif, ALL_ZONES, NULL, 6859 match_flags); 6860 } 6861 if (ire == NULL) { 6862 ire = ire_ftable_lookup(dst_addr, mask, 0, 6863 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6864 NULL, match_flags); 6865 } 6866 } 6867 } 6868 6869 if (ire == NULL) { 6870 /* 6871 * At this point, the gateway address is not one of our own 6872 * addresses or a matching interface route was not found. We 6873 * set the IRE type to lookup based on whether 6874 * this is a host route, a default route or just a prefix. 6875 * 6876 * If an ipif_arg was passed in, then the lookup is based on an 6877 * interface index so MATCH_IRE_ILL is added to match_flags. 6878 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 6879 * set as the route being looked up is not a traditional 6880 * interface route. 6881 * Since we do not add a gateway route with srcipif, we don't 6882 * expect to find it either. 6883 */ 6884 if (src_ipif != NULL) { 6885 if (ipif_refheld) 6886 ipif_refrele(ipif); 6887 return (ESRCH); 6888 } else { 6889 match_flags &= ~MATCH_IRE_IPIF; 6890 match_flags |= MATCH_IRE_GW; 6891 if (ipif_arg != NULL) 6892 match_flags |= MATCH_IRE_ILL; 6893 if (mask == IP_HOST_MASK) 6894 type = IRE_HOST; 6895 else if (mask == 0) 6896 type = IRE_DEFAULT; 6897 else 6898 type = IRE_PREFIX; 6899 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, 6900 ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags); 6901 if (ire == NULL && type == IRE_HOST) { 6902 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, 6903 IRE_HOST_REDIRECT, ipif_arg, NULL, 6904 ALL_ZONES, 0, NULL, match_flags); 6905 } 6906 } 6907 } 6908 6909 if (ipif_refheld) 6910 ipif_refrele(ipif); 6911 6912 /* ipif is not refheld anymore */ 6913 if (ire == NULL) 6914 return (ESRCH); 6915 6916 if (ire->ire_flags & RTF_MULTIRT) { 6917 /* 6918 * Invoke the CGTP (multirouting) filtering module 6919 * to remove the dst address from the filtering database. 6920 * Packets coming from that address will no longer be 6921 * filtered to remove duplicates. 6922 */ 6923 if (ip_cgtp_filter_ops != NULL) { 6924 err = ip_cgtp_filter_ops->cfo_del_dest_v4(ire->ire_addr, 6925 ire->ire_gateway_addr); 6926 } 6927 ip_cgtp_bcast_delete(ire); 6928 } 6929 6930 ipif = ire->ire_ipif; 6931 /* 6932 * Removing from ipif_saved_ire_mp is not necessary 6933 * when src_ipif is non-NULL; ip_rt_add does not 6934 * save ires when src_ipif is non-NULL. 6935 */ 6936 if (ipif != NULL && src_ipif == NULL) { 6937 ipif_remove_ire(ipif, ire); 6938 } 6939 if (ioctl_msg) 6940 ip_rts_rtmsg(RTM_OLDDEL, ire, 0); 6941 ire_delete(ire); 6942 ire_refrele(ire); 6943 return (err); 6944 } 6945 6946 /* 6947 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
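 *
 * For reference, the ioctl originates in userland roughly as follows
 * (a sketch under assumed usage, not code from this file; real callers
 * also set flags such as RTF_HOST as appropriate and check errors):
 *
 *	struct rtentry rt;
 *	struct sockaddr_in *sin;
 *
 *	(void) memset(&rt, 0, sizeof (rt));
 *	sin = (struct sockaddr_in *)&rt.rt_dst;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.0");
 *	sin = (struct sockaddr_in *)&rt.rt_gateway;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	if (ioctl(s, SIOCADDRT, &rt) < 0)
 *		perror("SIOCADDRT");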
6948 */ 6949 /* ARGSUSED */ 6950 int 6951 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6952 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6953 { 6954 ipaddr_t dst_addr; 6955 ipaddr_t gw_addr; 6956 ipaddr_t mask; 6957 int error = 0; 6958 mblk_t *mp1; 6959 struct rtentry *rt; 6960 ipif_t *ipif = NULL; 6961 6962 ip1dbg(("ip_siocaddrt:")); 6963 /* Existence of mp1 verified in ip_wput_nondata */ 6964 mp1 = mp->b_cont->b_cont; 6965 rt = (struct rtentry *)mp1->b_rptr; 6966 6967 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6968 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6969 6970 /* 6971 * If the RTF_HOST flag is on, this is a request to assign a gateway 6972 * to a particular host address. In this case, we set the netmask to 6973 * all ones for the particular destination address. Otherwise, 6974 * determine the netmask to be used based on dst_addr and the interfaces 6975 * in use. 6976 */ 6977 if (rt->rt_flags & RTF_HOST) { 6978 mask = IP_HOST_MASK; 6979 } else { 6980 /* 6981 * Note that ip_subnet_mask returns a zero mask in the case of 6982 * default (an all-zeroes address). 6983 */ 6984 mask = ip_subnet_mask(dst_addr, &ipif); 6985 } 6986 6987 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6988 NULL, B_TRUE, q, mp, ip_process_ioctl, NULL); 6989 if (ipif != NULL) 6990 ipif_refrele(ipif); 6991 return (error); 6992 } 6993 6994 /* 6995 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 6996 */ 6997 /* ARGSUSED */ 6998 int 6999 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7000 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7001 { 7002 ipaddr_t dst_addr; 7003 ipaddr_t gw_addr; 7004 ipaddr_t mask; 7005 int error; 7006 mblk_t *mp1; 7007 struct rtentry *rt; 7008 ipif_t *ipif = NULL; 7009 7010 ip1dbg(("ip_siocdelrt:")); 7011 /* Existence of mp1 verified in ip_wput_nondata */ 7012 mp1 = mp->b_cont->b_cont; 7013 rt = (struct rtentry *)mp1->b_rptr; 7014 7015 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7016 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7017 7018 /* 7019 * If the RTF_HOST flag is on, this is a request to delete a gateway 7020 * to a particular host address. In this case, we set the netmask to 7021 * all ones for the particular destination address. Otherwise, 7022 * determine the netmask to be used based on dst_addr and the interfaces 7023 * in use. 7024 */ 7025 if (rt->rt_flags & RTF_HOST) { 7026 mask = IP_HOST_MASK; 7027 } else { 7028 /* 7029 * Note that ip_subnet_mask returns a zero mask in the case of 7030 * default (an all-zeroes address). 7031 */ 7032 mask = ip_subnet_mask(dst_addr, &ipif); 7033 } 7034 7035 error = ip_rt_delete(dst_addr, mask, gw_addr, 7036 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL, 7037 B_TRUE, q, mp, ip_process_ioctl); 7038 if (ipif != NULL) 7039 ipif_refrele(ipif); 7040 return (error); 7041 } 7042 7043 /* 7044 * Enqueue the mp onto the ipsq, chained by b_next. 7045 * b_prev stores the function to be executed later, and b_queue the queue 7046 * where this mp originated. 
7047 */ 7048 void 7049 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7050 ill_t *pending_ill) 7051 { 7052 conn_t *connp = NULL; 7053 7054 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7055 ASSERT(func != NULL); 7056 7057 mp->b_queue = q; 7058 mp->b_prev = (void *)func; 7059 mp->b_next = NULL; 7060 7061 switch (type) { 7062 case CUR_OP: 7063 if (ipsq->ipsq_mptail != NULL) { 7064 ASSERT(ipsq->ipsq_mphead != NULL); 7065 ipsq->ipsq_mptail->b_next = mp; 7066 } else { 7067 ASSERT(ipsq->ipsq_mphead == NULL); 7068 ipsq->ipsq_mphead = mp; 7069 } 7070 ipsq->ipsq_mptail = mp; 7071 break; 7072 7073 case NEW_OP: 7074 if (ipsq->ipsq_xopq_mptail != NULL) { 7075 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7076 ipsq->ipsq_xopq_mptail->b_next = mp; 7077 } else { 7078 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7079 ipsq->ipsq_xopq_mphead = mp; 7080 } 7081 ipsq->ipsq_xopq_mptail = mp; 7082 break; 7083 default: 7084 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7085 } 7086 7087 if (CONN_Q(q) && pending_ill != NULL) { 7088 connp = Q_TO_CONN(q); 7089 7090 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7091 connp->conn_oper_pending_ill = pending_ill; 7092 } 7093 } 7094 7095 /* 7096 * Return the mp at the head of the ipsq. After emptying the ipsq, 7097 * look at the next ioctl if the current ioctl is complete. Otherwise 7098 * return; we will resume when we complete the current ioctl. 7099 * The current ioctl will wait till it gets a response from the 7100 * driver below. 7101 */ 7102 static mblk_t * 7103 ipsq_dq(ipsq_t *ipsq) 7104 { 7105 mblk_t *mp; 7106 7107 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7108 7109 mp = ipsq->ipsq_mphead; 7110 if (mp != NULL) { 7111 ipsq->ipsq_mphead = mp->b_next; 7112 if (ipsq->ipsq_mphead == NULL) 7113 ipsq->ipsq_mptail = NULL; 7114 mp->b_next = NULL; 7115 return (mp); 7116 } 7117 if (ipsq->ipsq_current_ipif != NULL) 7118 return (NULL); 7119 mp = ipsq->ipsq_xopq_mphead; 7120 if (mp != NULL) { 7121 ipsq->ipsq_xopq_mphead = mp->b_next; 7122 if (ipsq->ipsq_xopq_mphead == NULL) 7123 ipsq->ipsq_xopq_mptail = NULL; 7124 mp->b_next = NULL; 7125 return (mp); 7126 } 7127 return (NULL); 7128 } 7129 7130 /* 7131 * Enter the ipsq corresponding to ill, by waiting synchronously till 7132 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7133 * will have to drain completely before ipsq_enter returns success. 7134 * ipsq_current_ipif will be set if some exclusive ioctl is in progress, 7135 * and the ipsq_exit logic will start the next enqueued ioctl after 7136 * completion of the current ioctl. If 'force' is used, we don't wait 7137 * for the enqueued ioctls. This is needed when a conn_close wants to 7138 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7139 * of an ill can also use this option. But we don't use it currently. 7140 */ 7141 #define ENTER_SQ_WAIT_TICKS 100 7142 boolean_t 7143 ipsq_enter(ill_t *ill, boolean_t force) 7144 { 7145 ipsq_t *ipsq; 7146 boolean_t waited_enough = B_FALSE; 7147 7148 /* 7149 * Holding the ill_lock prevents <ill-ipsq> assocs from changing. 7150 * Since the <ill-ipsq> assocs could change while we wait for the 7151 * writer, it is easier to wait on a fixed global rather than try to 7152 * cv_wait on a changing ipsq.
7153 */ 7154 mutex_enter(&ill->ill_lock); 7155 for (;;) { 7156 if (ill->ill_state_flags & ILL_CONDEMNED) { 7157 mutex_exit(&ill->ill_lock); 7158 return (B_FALSE); 7159 } 7160 7161 ipsq = ill->ill_phyint->phyint_ipsq; 7162 mutex_enter(&ipsq->ipsq_lock); 7163 if (ipsq->ipsq_writer == NULL && 7164 (ipsq->ipsq_current_ipif == NULL || waited_enough)) { 7165 break; 7166 } else if (ipsq->ipsq_writer != NULL) { 7167 mutex_exit(&ipsq->ipsq_lock); 7168 cv_wait(&ill->ill_cv, &ill->ill_lock); 7169 } else { 7170 mutex_exit(&ipsq->ipsq_lock); 7171 if (force) { 7172 (void) cv_timedwait(&ill->ill_cv, 7173 &ill->ill_lock, 7174 lbolt + ENTER_SQ_WAIT_TICKS); 7175 waited_enough = B_TRUE; 7176 continue; 7177 } else { 7178 cv_wait(&ill->ill_cv, &ill->ill_lock); 7179 } 7180 } 7181 } 7182 7183 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); 7184 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7185 ipsq->ipsq_writer = curthread; 7186 ipsq->ipsq_reentry_cnt++; 7187 #ifdef ILL_DEBUG 7188 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7189 #endif 7190 mutex_exit(&ipsq->ipsq_lock); 7191 mutex_exit(&ill->ill_lock); 7192 return (B_TRUE); 7193 } 7194 7195 /* 7196 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7197 * certain critical operations like plumbing (i.e. most set ioctls), 7198 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP 7199 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per 7200 * IPMP group. The ipsq serializes exclusive ioctls issued by applications 7201 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple 7202 * threads executing in the ipsq. Responses from the driver pertain to the 7203 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated 7204 * as part of bringing up the interface) and are enqueued in ipsq_mphead. 7205 * 7206 * If a thread, already the writer, does not want to reenter the ipsq, it 7207 * must ensure that neither the specified reentry point (to be called later 7208 * when the ipsq is empty) nor any code path starting from that reentry 7209 * point ever tries to enter the ipsq again. Otherwise it can lead 7210 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 7211 * When the thread that is currently exclusive finishes, it (ipsq_exit) 7212 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls 7213 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit 7214 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 7215 * ioctl if the current ioctl has completed. If the current ioctl is still 7216 * in progress it simply returns. The current ioctl could be waiting for 7217 * a response from another module (arp or the driver) or could be waiting for 7218 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp 7219 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the 7220 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 7221 * ipsq_current_ipif is clear which happens only on ioctl completion. 7222 */ 7223 7224 /* 7225 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of 7226 * ipif or ill can be specified). The caller ensures ipif or ill is valid by 7227 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7228 * for later completion.
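 *
 * A typical exclusive-ioctl path uses it along these lines (sketch
 * only; the queued mp is later reprocessed via 'func' when the current
 * operation completes):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return (EINPROGRESS);
 *	(perform the exclusive operation, then ipsq_exit(ipsq, ...))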
7229 */ 7230 ipsq_t * 7231 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7232 ipsq_func_t func, int type, boolean_t reentry_ok) 7233 { 7234 ipsq_t *ipsq; 7235 7236 /* Only 1 of ipif or ill can be specified */ 7237 ASSERT((ipif != NULL) ^ (ill != NULL)); 7238 if (ipif != NULL) 7239 ill = ipif->ipif_ill; 7240 7241 /* 7242 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 7243 * ipsq of an ill can't change when ill_lock is held. 7244 */ 7245 GRAB_CONN_LOCK(q); 7246 mutex_enter(&ill->ill_lock); 7247 ipsq = ill->ill_phyint->phyint_ipsq; 7248 mutex_enter(&ipsq->ipsq_lock); 7249 7250 /* 7251 * 1. Enter the ipsq if we are already writer and reentry is ok. 7252 * (Note: If the caller does not specify reentry_ok then neither 7253 * 'func' nor any of its callees must ever attempt to enter the ipsq 7254 * again; otherwise it can lead to an infinite loop.) 7255 * 2. Enter the ipsq if there is no current writer and this attempted 7256 * entry is part of the current ioctl or operation 7257 * 3. Enter the ipsq if there is no current writer and this is a new 7258 * ioctl (or operation) and the ioctl (or operation) queue is 7259 * empty and there is no ioctl (or operation) currently in progress 7260 */ 7261 if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || 7262 (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && 7263 ipsq->ipsq_current_ipif == NULL))) || 7264 (ipsq->ipsq_writer == curthread && reentry_ok)) { 7265 /* Success. */ 7266 ipsq->ipsq_reentry_cnt++; 7267 ipsq->ipsq_writer = curthread; 7268 mutex_exit(&ipsq->ipsq_lock); 7269 mutex_exit(&ill->ill_lock); 7270 RELEASE_CONN_LOCK(q); 7271 #ifdef ILL_DEBUG 7272 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7273 #endif 7274 return (ipsq); 7275 } 7276 7277 ipsq_enq(ipsq, q, mp, func, type, ill); 7278 7279 mutex_exit(&ipsq->ipsq_lock); 7280 mutex_exit(&ill->ill_lock); 7281 RELEASE_CONN_LOCK(q); 7282 return (NULL); 7283 } 7284 7285 /* 7286 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of 7287 * ipif or ill can be specified). The caller ensures ipif or ill is valid by 7288 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7289 * for later completion. 7290 * 7291 * This function does a refrele on the ipif/ill. 7292 */ 7293 void 7294 qwriter_ip(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7295 ipsq_func_t func, int type, boolean_t reentry_ok) 7296 { 7297 ipsq_t *ipsq; 7298 7299 ipsq = ipsq_try_enter(ipif, ill, q, mp, func, type, reentry_ok); 7300 /* 7301 * Caller must have done a refhold on the ipif. ipif_refrele 7302 * happens on the passed ipif. We can do this since we are 7303 * either already exclusive or we won't access ipif henceforth. Both 7304 * this func and the caller will just return if ipsq_try_enter 7305 * fails above. This is needed because func needs to 7306 * see the correct refcount. E.g. removeif can work only then. 7307 */ 7308 if (ipif != NULL) 7309 ipif_refrele(ipif); 7310 else 7311 ill_refrele(ill); 7312 if (ipsq != NULL) { 7313 (*func)(ipsq, q, mp, NULL); 7314 ipsq_exit(ipsq, B_TRUE, B_TRUE); 7315 } 7316 } 7317 7318 /* 7319 * If there are more than ILL_GRP_CNT ills in a group, 7320 * we use kmem alloc'd buffers, else use the stack 7321 */ 7322 #define ILL_GRP_CNT 14 7323 /* 7324 * Drain the ipsq, if there are messages on it, and then leave the ipsq. 7325 * Called by a thread that is currently exclusive on this ipsq.
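 *
 * The common pairing is with ipsq_enter() or a successful
 * ipsq_try_enter(), e.g. (a sketch; callers pass B_TRUE for both timer
 * arguments unless they are themselves the igmp/mld timeout handler):
 *
 *	if (ipsq_enter(ill, B_FALSE)) {
 *		(exclusive work on the ill)
 *		ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE);
 *	}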
7326 */ 7327 void 7328 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer) 7329 { 7330 queue_t *q; 7331 mblk_t *mp; 7332 ipsq_func_t func; 7333 int next; 7334 ill_t **ill_list = NULL; 7335 size_t ill_list_size = 0; 7336 int cnt = 0; 7337 boolean_t need_ipsq_free = B_FALSE; 7338 7339 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7340 mutex_enter(&ipsq->ipsq_lock); 7341 ASSERT(ipsq->ipsq_reentry_cnt >= 1); 7342 if (ipsq->ipsq_reentry_cnt != 1) { 7343 ipsq->ipsq_reentry_cnt--; 7344 mutex_exit(&ipsq->ipsq_lock); 7345 return; 7346 } 7347 7348 mp = ipsq_dq(ipsq); 7349 while (mp != NULL) { 7350 again: 7351 mutex_exit(&ipsq->ipsq_lock); 7352 func = (ipsq_func_t)mp->b_prev; 7353 q = (queue_t *)mp->b_queue; 7354 mp->b_prev = NULL; 7355 mp->b_queue = NULL; 7356 7357 /* 7358 * If 'q' is a conn queue, it is valid, since we did a 7359 * refhold on the connp at the start of the ioctl. 7360 * If 'q' is an ill queue, it is valid, since close of an 7361 * ill will clean up the 'ipsq'. 7362 */ 7363 (*func)(ipsq, q, mp, NULL); 7364 7365 mutex_enter(&ipsq->ipsq_lock); 7366 mp = ipsq_dq(ipsq); 7367 } 7368 7369 mutex_exit(&ipsq->ipsq_lock); 7370 7371 /* 7372 * Need to grab the locks in the right order. Need to 7373 * atomically check (under ipsq_lock) that there are no 7374 * messages before relinquishing the ipsq. Also need to 7375 * atomically wake up waiters on ill_cv while holding ill_lock. 7376 * Holding ill_g_lock ensures that ipsq list of ills is stable. 7377 * If we need to call ill_split_ipsq and change <ill-ipsq> we need 7378 * to grab ill_g_lock as writer. 7379 */ 7380 rw_enter(&ill_g_lock, ipsq->ipsq_split ? RW_WRITER : RW_READER); 7381 7382 /* ipsq_refs can't change while ill_g_lock is held as reader */ 7383 if (ipsq->ipsq_refs != 0) { 7384 /* At most 2 ills v4/v6 per phyint */ 7385 cnt = ipsq->ipsq_refs << 1; 7386 ill_list_size = cnt * sizeof (ill_t *); 7387 /* 7388 * If memory allocation fails, we will do the split 7389 * the next time ipsq_exit is called for whatever reason. 7390 * As long as the ipsq_split flag is set the need to 7391 * split is remembered. 7392 */ 7393 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 7394 if (ill_list != NULL) 7395 cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); 7396 } 7397 mutex_enter(&ipsq->ipsq_lock); 7398 mp = ipsq_dq(ipsq); 7399 if (mp != NULL) { 7400 /* oops, some message has landed up, we can't get out */ 7401 if (ill_list != NULL) 7402 ill_unlock_ills(ill_list, cnt); 7403 rw_exit(&ill_g_lock); 7404 if (ill_list != NULL) 7405 kmem_free(ill_list, ill_list_size); 7406 ill_list = NULL; 7407 ill_list_size = 0; 7408 cnt = 0; 7409 goto again; 7410 } 7411 7412 /* 7413 * Split only if no ioctl is pending and if memory alloc succeeded 7414 * above. 7415 */ 7416 if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && 7417 ill_list != NULL) { 7418 /* 7419 * No new ill can join this ipsq since we are holding the 7420 * ill_g_lock. Hence ill_split_ipsq can safely traverse the 7421 * ipsq. ill_split_ipsq may fail due to memory shortage. 7422 * If so we will retry on the next ipsq_exit. 7423 */ 7424 ipsq->ipsq_split = ill_split_ipsq(ipsq); 7425 } 7426 7427 /* 7428 * We are holding the ipsq lock, hence no new messages can 7429 * land up on the ipsq, and there are no messages currently. 7430 * Now safe to get out. Wake up waiters and relinquish ipsq 7431 * atomically while holding ill locks.
7500 7501 /* 7502 * The ill is closing. Flush all messages on the ipsq that originated 7503 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead 7504 * for this ill since ipsq_enter could not have entered until then. 7505 * New messages can't be queued since the CONDEMNED flag is set. 7506 */ 7507 static void 7508 ipsq_flush(ill_t *ill) 7509 { 7510 queue_t *q; 7511 mblk_t *prev; 7512 mblk_t *mp; 7513 mblk_t *mp_next; 7514 ipsq_t *ipsq; 7515 7516 ASSERT(IAM_WRITER_ILL(ill)); 7517 ipsq = ill->ill_phyint->phyint_ipsq; 7518 /* 7519 * Flush any messages sent up by the driver. 7520 */ 7521 mutex_enter(&ipsq->ipsq_lock); 7522 for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { 7523 mp_next = mp->b_next; 7524 q = mp->b_queue; 7525 if (q == ill->ill_rq || q == ill->ill_wq) { 7526 /* Remove the mp from the ipsq */ 7527 if (prev == NULL) 7528 ipsq->ipsq_mphead = mp->b_next; 7529 else 7530 prev->b_next = mp->b_next; 7531 if (ipsq->ipsq_mptail == mp) { 7532 ASSERT(mp_next == NULL); 7533 ipsq->ipsq_mptail = prev; 7534 } 7535 inet_freemsg(mp); 7536 } else { 7537 prev = mp; 7538 } 7539 } 7540 mutex_exit(&ipsq->ipsq_lock); 7541 (void) ipsq_pending_mp_cleanup(ill, NULL); 7542 ipsq_xopq_mp_cleanup(ill, NULL); 7543 ill_pending_mp_cleanup(ill); 7544 }
7545 7546 /* 7547 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock. 7548 * The real cleanup happens behind the squeue via the ip_squeue_clean function, 7549 * but we need to protect ourselves from two threads trying to clean up at the 7550 * same time (possible with one port going down for aggr and someone tearing 7551 * down the entire aggr simultaneously). So we use ill_inuse_ref protected by 7552 * ill_lock to indicate when the cleanup has started (1 ref) and when the 7553 * cleanup is done (0 ref). When a new ring gets assigned to an squeue, we 7554 * start by putting 2 refs on ill_inuse_ref. 7555 */ 7556 static void 7557 ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) 7558 { 7559 conn_t *connp; 7560 squeue_t *sqp; 7561 mblk_t *mp; 7562 7563 ASSERT(rx_ring != NULL); 7564 7565 /* Just clean one squeue */ 7566 mutex_enter(&ill->ill_lock); 7567 /* 7568 * Reset the ILL_SOFT_RING_ASSIGN bit so that 7569 * ip_squeue_soft_ring_affinity() will not go 7570 * ahead with assigning rings. 7571 */ 7572 ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; 7573 while (rx_ring->rr_ring_state == ILL_RING_INPROC) 7574 /* Some operations pending on the ring. Wait */ 7575 cv_wait(&ill->ill_cv, &ill->ill_lock); 7576 7577 if (rx_ring->rr_ring_state != ILL_RING_INUSE) { 7578 /* 7579 * Someone is already trying to clean 7580 * this squeue, or it has already been cleaned. 7581 */ 7582 mutex_exit(&ill->ill_lock); 7583 return; 7584 } 7585 sqp = rx_ring->rr_sqp; 7586 7587 if (sqp == NULL) { 7588 /* 7589 * The rx_ring never had a squeue assigned to it. 7590 * We are under ill_lock so we can clean it up 7591 * here itself since no one can get to it. 7592 */ 7593 rx_ring->rr_blank = NULL; 7594 rx_ring->rr_handle = NULL; 7595 rx_ring->rr_sqp = NULL; 7596 rx_ring->rr_ring_state = ILL_RING_FREE; 7597 mutex_exit(&ill->ill_lock); 7598 return; 7599 } 7600 7601 /* Set the state to indicate that it is being cleaned */ 7602 rx_ring->rr_ring_state = ILL_RING_BEING_FREED; 7603 ASSERT(sqp != NULL); 7604 mutex_exit(&ill->ill_lock); 7605 7606 /* 7607 * Use the preallocated ill_unbind_conn for this purpose 7608 */ 7609 connp = ill->ill_dls_capab->ill_unbind_conn; 7610 mp = &connp->conn_tcp->tcp_closemp; 7611 CONN_INC_REF(connp); 7612 squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); 7613 7614 mutex_enter(&ill->ill_lock); 7615 while (rx_ring->rr_ring_state != ILL_RING_FREE) 7616 cv_wait(&ill->ill_cv, &ill->ill_lock); 7617 7618 mutex_exit(&ill->ill_lock); 7619 } 7620 7621 static void 7622 ipsq_clean_all(ill_t *ill) 7623 { 7624 int idx; 7625 7626 /* 7627 * No need to clean if poll_capab isn't set for this ill 7628 */ 7629 if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) 7630 return; 7631 7632 for (idx = 0; idx < ILL_MAX_RINGS; idx++) { 7633 ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; 7634 ipsq_clean_ring(ill, ipr); 7635 } 7636 7637 ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); 7638 }
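/*
 * Summary of the rr_ring_state transitions relied on above (a sketch;
 * the producers of these states live with the squeue code, e.g.
 * ip_squeue_clean()):
 *
 *	ILL_RING_INPROC		assignment in progress; cleanup waits.
 *	ILL_RING_INUSE		assigned to an squeue; ipsq_clean_ring()
 *				moves it to ILL_RING_BEING_FREED.
 *	ILL_RING_BEING_FREED	ip_squeue_clean() runs behind the squeue
 *				and eventually marks it ILL_RING_FREE.
 *	ILL_RING_FREE		cleaned up (or never assigned); reusable.
 */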
7639 7640 /* ARGSUSED */ 7641 int 7642 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7643 ip_ioctl_cmd_t *ipip, void *ifreq) 7644 { 7645 ill_t *ill; 7646 struct lifreq *lifr = (struct lifreq *)ifreq; 7647 boolean_t isv6; 7648 conn_t *connp; 7649 7650 connp = Q_TO_CONN(q); 7651 isv6 = connp->conn_af_isv6; 7652 /* 7653 * Set original index. 7654 * Failover and failback move logical interfaces 7655 * from one physical interface to another. The 7656 * original index indicates the parent of a logical 7657 * interface, in other words, the physical interface 7658 * the logical interface will be moved back to on 7659 * failback. 7660 */ 7661 7662 /* 7663 * Don't allow the original index to be changed 7664 * for non-failover addresses, autoconfigured 7665 * addresses, or IPv6 link local addresses. 7666 */ 7667 if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != 0) || 7668 (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { 7669 return (EINVAL); 7670 } 7671 /* 7672 * The new original index must be in use by some 7673 * physical interface. 7674 */ 7675 ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, 7676 NULL, NULL); 7677 if (ill == NULL) 7678 return (ENXIO); 7679 ill_refrele(ill); 7680 7681 ipif->ipif_orig_ifindex = lifr->lifr_index; 7682 /* 7683 * When this ipif gets failed back, don't 7684 * preserve the original id, as it is no 7685 * longer applicable. 7686 */ 7687 ipif->ipif_orig_ipifid = 0; 7688 /* 7689 * For IPv4, change the original index of any 7690 * multicast addresses associated with the 7691 * ipif to the new value. 7692 */ 7693 if (!isv6) { 7694 ilm_t *ilm; 7695 7696 mutex_enter(&ipif->ipif_ill->ill_lock); 7697 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 7698 ilm = ilm->ilm_next) { 7699 if (ilm->ilm_ipif == ipif) { 7700 ilm->ilm_orig_ifindex = lifr->lifr_index; 7701 } 7702 } 7703 mutex_exit(&ipif->ipif_ill->ill_lock); 7704 } 7705 return (0); 7706 }
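/*
 * Illustrative userland usage (a sketch; 's' is any AF_INET socket,
 * and the interface name and index are hypothetical):
 *
 *	struct lifreq lifr;
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	lifr.lifr_index = 2;
 *	if (ioctl(s, SIOCSLIFOINDEX, (caddr_t)&lifr) < 0)
 *		... SIOCGLIFOINDEX reads the value back the same way ...
 */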
7707 7708 /* ARGSUSED */ 7709 int 7710 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7711 ip_ioctl_cmd_t *ipip, void *ifreq) 7712 { 7713 struct lifreq *lifr = (struct lifreq *)ifreq; 7714 7715 /* 7716 * Get the original interface index, i.e. the one 7717 * before FAILOVER if it ever happened. 7718 */ 7719 lifr->lifr_index = ipif->ipif_orig_ifindex; 7720 return (0); 7721 } 7722 7723 /* 7724 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls; 7725 * refhold and return the associated ipif. 7726 */ 7727 int 7728 ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func) 7729 { 7730 boolean_t exists; 7731 struct iftun_req *ta; 7732 ipif_t *ipif; 7733 ill_t *ill; 7734 boolean_t isv6; 7735 mblk_t *mp1; 7736 int error; 7737 conn_t *connp; 7738 7739 /* Existence verified in ip_wput_nondata */ 7740 mp1 = mp->b_cont->b_cont; 7741 ta = (struct iftun_req *)mp1->b_rptr; 7742 /* 7743 * Null terminate the string to protect against buffer 7744 * overrun. String was generated by user code and may not 7745 * be trusted. 7746 */ 7747 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 7748 7749 connp = Q_TO_CONN(q); 7750 isv6 = connp->conn_af_isv6; 7751 7752 /* Disallow implicit create */ 7753 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 7754 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 7755 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error); 7756 if (ipif == NULL) 7757 return (error); 7758 7759 if (ipif->ipif_id != 0) { 7760 /* 7761 * We really don't want to set/get tunnel parameters 7762 * on virtual tunnel interfaces. Only allow the 7763 * base tunnel to do these. 7764 */ 7765 ipif_refrele(ipif); 7766 return (EINVAL); 7767 } 7768 7769 /* 7770 * Send down to tunnel mod for ioctl processing. 7771 * Will finish ioctl in ip_rput_other(). 7772 */ 7773 ill = ipif->ipif_ill; 7774 if (ill->ill_net_type == IRE_LOOPBACK) { 7775 ipif_refrele(ipif); 7776 return (EOPNOTSUPP); 7777 } 7778 7779 if (ill->ill_wq == NULL) { 7780 ipif_refrele(ipif); 7781 return (ENXIO); 7782 } 7783 /* 7784 * Mark the ioctl as coming from an IPv6 interface for 7785 * tun's convenience. 7786 */ 7787 if (ill->ill_isv6) 7788 ta->ifta_flags |= 0x80000000; 7789 *ipifp = ipif; 7790 return (0); 7791 }
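/*
 * Illustrative userland usage (a sketch; the tunnel name is hypothetical
 * and only fields referenced above are shown):
 *
 *	struct iftun_req ta;
 *
 *	bzero(&ta, sizeof (ta));
 *	(void) strlcpy(ta.ifta_lifr_name, "ip.tun0",
 *	    sizeof (ta.ifta_lifr_name));
 *	if (ioctl(s, SIOCGTUNPARAM, (caddr_t)&ta) < 0)
 *		...
 */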
7792 7793 /* 7794 * Parse an ifreq or lifreq struct coming down ioctls; refhold 7795 * and return the associated ipif. 7796 * Return value: 7797 * Non-zero: An error has occurred. ci may not be filled out. 7798 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and 7799 * a held ipif in ci.ci_ipif. 7800 */ 7801 int 7802 ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags, 7803 cmd_info_t *ci, ipsq_func_t func) 7804 { 7805 sin_t *sin; 7806 sin6_t *sin6; 7807 char *name; 7808 struct ifreq *ifr; 7809 struct lifreq *lifr; 7810 ipif_t *ipif = NULL; 7811 ill_t *ill; 7812 conn_t *connp; 7813 boolean_t isv6; 7814 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7815 boolean_t exists; 7816 int err; 7817 mblk_t *mp1; 7818 zoneid_t zoneid; 7819 7820 if (q->q_next != NULL) { 7821 ill = (ill_t *)q->q_ptr; 7822 isv6 = ill->ill_isv6; 7823 connp = NULL; 7824 zoneid = ALL_ZONES; 7825 } else { 7826 ill = NULL; 7827 connp = Q_TO_CONN(q); 7828 isv6 = connp->conn_af_isv6; 7829 zoneid = connp->conn_zoneid; 7830 if (zoneid == GLOBAL_ZONEID) { 7831 /* global zone can access ipifs in all zones */ 7832 zoneid = ALL_ZONES; 7833 } 7834 } 7835 7836 /* Has been checked in ip_wput_nondata */ 7837 mp1 = mp->b_cont->b_cont; 7838 7839 7840 if (cmd_type == IF_CMD) { 7841 /* This is an old style SIOC[GS]IF* command */ 7842 ifr = (struct ifreq *)mp1->b_rptr; 7843 /* 7844 * Null terminate the string to protect against buffer 7845 * overrun. String was generated by user code and may not 7846 * be trusted. 7847 */ 7848 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7849 sin = (sin_t *)&ifr->ifr_addr; 7850 name = ifr->ifr_name; 7851 ci->ci_sin = sin; 7852 ci->ci_sin6 = NULL; 7853 ci->ci_lifr = (struct lifreq *)ifr; 7854 } else { 7855 /* This is a new style SIOC[GS]LIF* command */ 7856 ASSERT(cmd_type == LIF_CMD); 7857 lifr = (struct lifreq *)mp1->b_rptr; 7858 /* 7859 * Null terminate the string to protect against buffer 7860 * overrun. String was generated by user code and may not 7861 * be trusted. 7862 */ 7863 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7864 name = lifr->lifr_name; 7865 sin = (sin_t *)&lifr->lifr_addr; 7866 sin6 = (sin6_t *)&lifr->lifr_addr; 7867 if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) { 7868 (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, 7869 LIFNAMSIZ); 7870 } 7871 ci->ci_sin = sin; 7872 ci->ci_sin6 = sin6; 7873 ci->ci_lifr = lifr; 7874 } 7875 7876 7877 if (iocp->ioc_cmd == SIOCSLIFNAME) { 7878 /* 7879 * The ioctl must fail if it comes down 7880 * a conn stream. 7881 */ 7882 if (ill == NULL) { 7883 /* 7884 * Not an ill queue, so fail the 7885 * ioctl with ENXIO. 7886 */ 7887 return (ENXIO); 7888 } 7889 ipif = ill->ill_ipif; 7890 ipif_refhold(ipif); 7891 } else { 7892 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 7893 &exists, isv6, zoneid, 7894 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err); 7895 if (ipif == NULL) { 7896 if (err == EINPROGRESS) 7897 return (err); 7898 if (iocp->ioc_cmd == SIOCLIFFAILOVER || 7899 iocp->ioc_cmd == SIOCLIFFAILBACK) { 7900 /* 7901 * Need to try both v4 and v6 since this 7902 * ioctl can come down either v4 or v6 7903 * socket. The lifreq.lifr_family passed 7904 * down by this ioctl is AF_UNSPEC. 7905 */ 7906 ipif = ipif_lookup_on_name(name, 7907 mi_strlen(name), B_FALSE, &exists, !isv6, 7908 zoneid, (connp == NULL) ? q : 7909 CONNP_TO_WQ(connp), mp, func, &err); 7910 if (err == EINPROGRESS) 7911 return (err); 7912 } 7913 err = 0; /* Ensure we don't use it below */ 7914 } 7915 } 7916 7917 /* 7918 * Old style [GS]IFCMD does not admit IPv6 ipif 7919 */ 7920 if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) { 7921 ipif_refrele(ipif); 7922 return (ENXIO); 7923 } 7924 7925 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7926 name[0] == '\0') { 7927 /* 7928 * Handle a SIOC?IF* ioctl with a null name 7929 * during plumb (on the ill queue before the I_PLINK). 7930 */ 7931 ipif = ill->ill_ipif; 7932 ipif_refhold(ipif); 7933 } 7934 7935 if (ipif == NULL) 7936 return (ENXIO); 7937 7938 /* 7939 * Allow only GET operations if this ipif has been created 7940 * temporarily due to a MOVE operation. 7941 */ 7942 if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) { 7943 ipif_refrele(ipif); 7944 return (EINVAL); 7945 } 7946 7947 ci->ci_ipif = ipif; 7948 return (0); 7949 } 7950 7951 /* 7952 * Return the total number of ipifs. 7953 */ 7954 static uint_t 7955 ip_get_numifs(zoneid_t zoneid) 7956 { 7957 uint_t numifs = 0; 7958 ill_t *ill; 7959 ill_walk_context_t ctx; 7960 ipif_t *ipif; 7961 7962 rw_enter(&ill_g_lock, RW_READER); 7963 ill = ILL_START_WALK_V4(&ctx); 7964 7965 while (ill != NULL) { 7966 for (ipif = ill->ill_ipif; ipif != NULL; 7967 ipif = ipif->ipif_next) { 7968 if (ipif->ipif_zoneid == zoneid || 7969 ipif->ipif_zoneid == ALL_ZONES) 7970 numifs++; 7971 } 7972 ill = ill_next(&ctx, ill); 7973 } 7974 rw_exit(&ill_g_lock); 7975 return (numifs); 7976 } 7977 7978 /* 7979 * Return the number of logical interfaces (lifs) that match the given address family, flags, and zoneid.
7980 */ 7981 static uint_t 7982 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid) 7983 { 7984 uint_t numifs = 0; 7985 ill_t *ill; 7986 ipif_t *ipif; 7987 ill_walk_context_t ctx; 7988 7989 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7990 7991 rw_enter(&ill_g_lock, RW_READER); 7992 if (family == AF_INET) 7993 ill = ILL_START_WALK_V4(&ctx); 7994 else if (family == AF_INET6) 7995 ill = ILL_START_WALK_V6(&ctx); 7996 else 7997 ill = ILL_START_WALK_ALL(&ctx); 7998 7999 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8000 for (ipif = ill->ill_ipif; ipif != NULL; 8001 ipif = ipif->ipif_next) { 8002 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8003 !(lifn_flags & LIFC_NOXMIT)) 8004 continue; 8005 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8006 !(lifn_flags & LIFC_TEMPORARY)) 8007 continue; 8008 if (((ipif->ipif_flags & 8009 (IPIF_NOXMIT|IPIF_NOLOCAL| 8010 IPIF_DEPRECATED)) || 8011 (ill->ill_phyint->phyint_flags & 8012 PHYI_LOOPBACK) || 8013 !(ipif->ipif_flags & IPIF_UP)) && 8014 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8015 continue; 8016 8017 if (zoneid != ipif->ipif_zoneid && 8018 ipif->ipif_zoneid != ALL_ZONES && 8019 (zoneid != GLOBAL_ZONEID || 8020 !(lifn_flags & LIFC_ALLZONES))) 8021 continue; 8022 8023 numifs++; 8024 } 8025 } 8026 rw_exit(&ill_g_lock); 8027 return (numifs); 8028 } 8029 8030 uint_t 8031 ip_get_lifsrcofnum(ill_t *ill) 8032 { 8033 uint_t numifs = 0; 8034 ill_t *ill_head = ill; 8035 8036 /* 8037 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 8038 * other thread may be trying to relink the ILLs in this usesrc group 8039 * and adjusting the ill_usesrc_grp_next pointers 8040 */ 8041 rw_enter(&ill_g_usesrc_lock, RW_READER); 8042 if ((ill->ill_usesrc_ifindex == 0) && 8043 (ill->ill_usesrc_grp_next != NULL)) { 8044 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8045 ill = ill->ill_usesrc_grp_next) 8046 numifs++; 8047 } 8048 rw_exit(&ill_g_usesrc_lock); 8049 8050 return (numifs); 8051 } 8052 8053 /* Null values are passed in for ipif, sin, and ifreq */ 8054 /* ARGSUSED */ 8055 int 8056 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8057 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8058 { 8059 int *nump; 8060 8061 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8062 8063 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8064 nump = (int *)mp->b_cont->b_cont->b_rptr; 8065 8066 *nump = ip_get_numifs(Q_TO_CONN(q)->conn_zoneid); 8067 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8068 return (0); 8069 } 8070 8071 /* Null values are passed in for ipif, sin, and ifreq */ 8072 /* ARGSUSED */ 8073 int 8074 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8075 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8076 { 8077 struct lifnum *lifn; 8078 mblk_t *mp1; 8079 8080 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8081 8082 /* Existence checked in ip_wput_nondata */ 8083 mp1 = mp->b_cont->b_cont; 8084 8085 lifn = (struct lifnum *)mp1->b_rptr; 8086 switch (lifn->lifn_family) { 8087 case AF_UNSPEC: 8088 case AF_INET: 8089 case AF_INET6: 8090 break; 8091 default: 8092 return (EAFNOSUPPORT); 8093 } 8094 8095 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8096 Q_TO_CONN(q)->conn_zoneid); 8097 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8098 return (0); 8099 } 8100 8101 /* ARGSUSED */ 8102 int 8103 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8104 mblk_t *mp, 
ip_ioctl_cmd_t *ipip, void *ifreq) 8105 { 8106 STRUCT_HANDLE(ifconf, ifc); 8107 mblk_t *mp1; 8108 struct iocblk *iocp; 8109 struct ifreq *ifr; 8110 ill_walk_context_t ctx; 8111 ill_t *ill; 8112 ipif_t *ipif; 8113 struct sockaddr_in *sin; 8114 int32_t ifclen; 8115 zoneid_t zoneid; 8116 8117 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8118 8119 ip1dbg(("ip_sioctl_get_ifconf")); 8120 /* Existence verified in ip_wput_nondata */ 8121 mp1 = mp->b_cont->b_cont; 8122 iocp = (struct iocblk *)mp->b_rptr; 8123 zoneid = Q_TO_CONN(q)->conn_zoneid; 8124 8125 /* 8126 * The original SIOCGIFCONF passed in a struct ifconf which specified 8127 * the user buffer address and length into which the list of struct 8128 * ifreqs was to be copied. Since AT&T Streams does not seem to 8129 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8130 * the SIOCGIFCONF operation was redefined to simply provide 8131 * a large output buffer into which we are supposed to jam the ifreq 8132 * array. The same ioctl command code was used, despite the fact that 8133 * both the applications and the kernel code had to change, thus making 8134 * it impossible to support both interfaces. 8135 * 8136 * For reasons not good enough to try to explain, the following 8137 * algorithm is used for deciding what to do with one of these: 8138 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8139 * form with the output buffer coming down as the continuation message. 8140 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8141 * and we have to copy in the ifconf structure to find out how big the 8142 * output buffer is and where to copy out to. Sure no problem... 8143 * 8144 */ 8145 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8146 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8147 int numifs = 0; 8148 size_t ifc_bufsize; 8149 8150 /* 8151 * Must be (better be!) continuation of a TRANSPARENT 8152 * IOCTL. We just copied in the ifconf structure. 8153 */ 8154 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8155 (struct ifconf *)mp1->b_rptr); 8156 8157 /* 8158 * Allocate a buffer to hold requested information. 8159 * 8160 * If ifc_len is larger than what is needed, we only 8161 * allocate what we will use. 8162 * 8163 * If ifc_len is smaller than what is needed, return 8164 * EINVAL. 8165 * 8166 * XXX: the ill_t structure can have 2 counters, for 8167 * v4 and v6 (not just ill_ipif_up_count) to store the 8168 * number of interfaces for a device, so we don't need 8169 * to count them here... 8170 */ 8171 numifs = ip_get_numifs(zoneid); 8172 8173 ifclen = STRUCT_FGET(ifc, ifc_len); 8174 ifc_bufsize = numifs * sizeof (struct ifreq); 8175 if (ifc_bufsize > ifclen) { 8176 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8177 /* old behaviour */ 8178 return (EINVAL); 8179 } else { 8180 ifc_bufsize = ifclen; 8181 } 8182 } 8183 8184 mp1 = mi_copyout_alloc(q, mp, 8185 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8186 if (mp1 == NULL) 8187 return (ENOMEM); 8188 8189 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8190 } 8191 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8192 /* 8193 * the SIOCGIFCONF ioctl only knows about 8194 * IPv4 addresses, so don't try to tell 8195 * it about interfaces with IPv6-only 8196 * addresses. (Last parm 'isv6' is B_FALSE) 8197 */ 8198 8199 ifr = (struct ifreq *)mp1->b_rptr; 8200 8201 rw_enter(&ill_g_lock, RW_READER); 8202 ill = ILL_START_WALK_V4(&ctx); 8203 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8204 for (ipif = ill->ill_ipif; ipif; 8205 ipif = ipif->ipif_next) { 8206 if (zoneid != ipif->ipif_zoneid && 8207 ipif->ipif_zoneid != ALL_ZONES) 8208 continue; 8209 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8210 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8211 /* old behaviour */ 8212 rw_exit(&ill_g_lock); 8213 return (EINVAL); 8214 } else { 8215 goto if_copydone; 8216 } 8217 } 8218 (void) ipif_get_name(ipif, 8219 ifr->ifr_name, 8220 sizeof (ifr->ifr_name)); 8221 sin = (sin_t *)&ifr->ifr_addr; 8222 *sin = sin_null; 8223 sin->sin_family = AF_INET; 8224 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8225 ifr++; 8226 } 8227 } 8228 if_copydone: 8229 rw_exit(&ill_g_lock); 8230 mp1->b_wptr = (uchar_t *)ifr; 8231 8232 if (STRUCT_BUF(ifc) != NULL) { 8233 STRUCT_FSET(ifc, ifc_len, 8234 (int)((uchar_t *)ifr - mp1->b_rptr)); 8235 } 8236 return (0); 8237 }
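/*
 * Illustrative userland usage of SIOCGIFCONF (a sketch; the 32-entry
 * buffer size is an arbitrary assumption):
 *
 *	struct ifconf ifc;
 *	struct ifreq buf[32];
 *	int i, n;
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = (caddr_t)buf;
 *	if (ioctl(s, SIOCGIFCONF, (caddr_t)&ifc) < 0)
 *		...
 *	n = ifc.ifc_len / sizeof (struct ifreq);
 *	for (i = 0; i < n; i++)
 *		... buf[i].ifr_name, buf[i].ifr_addr ...
 */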
8238 8239 /* 8240 * Get the interfaces using the address hosted on the interface passed in, 8241 * as a source address. 8242 */ 8243 /* ARGSUSED */ 8244 int 8245 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8246 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8247 { 8248 mblk_t *mp1; 8249 ill_t *ill, *ill_head; 8250 ipif_t *ipif, *orig_ipif; 8251 int numlifs = 0; 8252 size_t lifs_bufsize, lifsmaxlen; 8253 struct lifreq *lifr; 8254 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8255 uint_t ifindex; 8256 zoneid_t zoneid; 8257 int err = 0; 8258 boolean_t isv6 = B_FALSE; 8259 struct sockaddr_in *sin; 8260 struct sockaddr_in6 *sin6; 8261 8262 STRUCT_HANDLE(lifsrcof, lifs); 8263 8264 ASSERT(q->q_next == NULL); 8265 8266 zoneid = Q_TO_CONN(q)->conn_zoneid; 8267 8268 /* Existence verified in ip_wput_nondata */ 8269 mp1 = mp->b_cont->b_cont; 8270 8271 /* 8272 * Must be (better be!) continuation of a TRANSPARENT 8273 * IOCTL. We just copied in the lifsrcof structure. 8274 */ 8275 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8276 (struct lifsrcof *)mp1->b_rptr); 8277 8278 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8279 return (EINVAL); 8280 8281 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8282 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8283 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8284 ip_process_ioctl, &err); 8285 if (ipif == NULL) { 8286 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8287 ifindex)); 8288 return (err); 8289 } 8290 8291 8292 /* Allocate a buffer to hold requested information */ 8293 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8294 lifs_bufsize = numlifs * sizeof (struct lifreq); 8295 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8296 /* The actual size needed is always returned in lifs_len */ 8297 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8298 8299 /* If the amount we need is more than what is passed in, abort */ 8300 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8301 ipif_refrele(ipif); 8302 return (0); 8303 } 8304 8305 mp1 = mi_copyout_alloc(q, mp, 8306 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8307 if (mp1 == NULL) { 8308 ipif_refrele(ipif); 8309 return (ENOMEM); 8310 } 8311 8312 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8313 bzero(mp1->b_rptr, lifs_bufsize); 8314 8315 lifr = (struct lifreq *)mp1->b_rptr; 8316 8317 ill = ill_head = ipif->ipif_ill; 8318 orig_ipif = ipif; 8319 8320 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8321 rw_enter(&ill_g_usesrc_lock, RW_READER); 8322 rw_enter(&ill_g_lock, RW_READER); 8323 8324 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8325 for (; (ill != NULL) && (ill != ill_head); 8326 ill = ill->ill_usesrc_grp_next) { 8327 8328 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8329 break; 8330 8331 ipif = ill->ill_ipif; 8332 (void) ipif_get_name(ipif, 8333 lifr->lifr_name, sizeof (lifr->lifr_name)); 8334 if (ipif->ipif_isv6) { 8335 sin6 = (sin6_t *)&lifr->lifr_addr; 8336 *sin6 = sin6_null; 8337 sin6->sin6_family = AF_INET6; 8338 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8339 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8340 &ipif->ipif_v6net_mask); 8341 } else { 8342 sin = (sin_t *)&lifr->lifr_addr; 8343 *sin = sin_null; 8344 sin->sin_family = AF_INET; 8345 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8346 lifr->lifr_addrlen = ip_mask_to_plen( 8347 ipif->ipif_net_mask); 8348 } 8349 lifr++; 8350 } 8351 rw_exit(&ill_g_usesrc_lock); 8352 rw_exit(&ill_g_lock); 8353 ipif_refrele(orig_ipif); 8354 mp1->b_wptr = (uchar_t *)lifr; 8355 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8356 8357 return (0); 8358 }
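/*
 * Illustrative userland usage of the SIOCGLIFNUM (handled above) and
 * SIOCGLIFCONF (handled below) pair (a sketch; error handling elided):
 *
 *	struct lifnum lifn;
 *	struct lifconf lifc;
 *
 *	lifn.lifn_family = AF_UNSPEC;
 *	lifn.lifn_flags = 0;
 *	(void) ioctl(s, SIOCGLIFNUM, (caddr_t)&lifn);
 *	lifc.lifc_family = AF_UNSPEC;
 *	lifc.lifc_flags = 0;
 *	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
 *	lifc.lifc_buf = malloc(lifc.lifc_len);
 *	(void) ioctl(s, SIOCGLIFCONF, (caddr_t)&lifc);
 */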
8359 8360 /* ARGSUSED */ 8361 int 8362 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8363 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8364 { 8365 mblk_t *mp1; 8366 int list; 8367 ill_t *ill; 8368 ipif_t *ipif; 8369 int flags; 8370 int numlifs = 0; 8371 size_t lifc_bufsize; 8372 struct lifreq *lifr; 8373 sa_family_t family; 8374 struct sockaddr_in *sin; 8375 struct sockaddr_in6 *sin6; 8376 ill_walk_context_t ctx; 8377 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8378 int32_t lifclen; 8379 zoneid_t zoneid; 8380 STRUCT_HANDLE(lifconf, lifc); 8381 8382 ip1dbg(("ip_sioctl_get_lifconf")); 8383 8384 ASSERT(q->q_next == NULL); 8385 8386 zoneid = Q_TO_CONN(q)->conn_zoneid; 8387 8388 /* Existence verified in ip_wput_nondata */ 8389 mp1 = mp->b_cont->b_cont; 8390 8391 /* 8392 * An extended version of SIOCGIFCONF that takes an 8393 * additional address family and flags field. 8394 * AF_UNSPEC retrieves both IPv4 and IPv6.
8395 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8396 * interfaces are omitted. 8397 * Similarly, IPIF_TEMPORARY interfaces are omitted 8398 * unless LIFC_TEMPORARY is specified. 8399 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8400 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8401 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8402 * has priority over LIFC_NOXMIT. 8403 */ 8404 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8405 8406 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8407 return (EINVAL); 8408 8409 /* 8410 * Must be (better be!) continuation of a TRANSPARENT 8411 * IOCTL. We just copied in the lifconf structure. 8412 */ 8413 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8414 8415 family = STRUCT_FGET(lifc, lifc_family); 8416 flags = STRUCT_FGET(lifc, lifc_flags); 8417 8418 switch (family) { 8419 case AF_UNSPEC: 8420 /* 8421 * walk all ILL's. 8422 */ 8423 list = MAX_G_HEADS; 8424 break; 8425 case AF_INET: 8426 /* 8427 * walk only IPV4 ILL's. 8428 */ 8429 list = IP_V4_G_HEAD; 8430 break; 8431 case AF_INET6: 8432 /* 8433 * walk only IPV6 ILL's. 8434 */ 8435 list = IP_V6_G_HEAD; 8436 break; 8437 default: 8438 return (EAFNOSUPPORT); 8439 } 8440 8441 /* 8442 * Allocate a buffer to hold requested information. 8443 * 8444 * If lifc_len is larger than what is needed, we only 8445 * allocate what we will use. 8446 * 8447 * If lifc_len is smaller than what is needed, return 8448 * EINVAL. 8449 */ 8450 numlifs = ip_get_numlifs(family, flags, zoneid); 8451 lifc_bufsize = numlifs * sizeof (struct lifreq); 8452 lifclen = STRUCT_FGET(lifc, lifc_len); 8453 if (lifc_bufsize > lifclen) { 8454 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8455 return (EINVAL); 8456 else 8457 lifc_bufsize = lifclen; 8458 } 8459 8460 mp1 = mi_copyout_alloc(q, mp, 8461 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8462 if (mp1 == NULL) 8463 return (ENOMEM); 8464 8465 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8466 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8467 8468 lifr = (struct lifreq *)mp1->b_rptr; 8469 8470 rw_enter(&ill_g_lock, RW_READER); 8471 ill = ill_first(list, list, &ctx); 8472 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8473 for (ipif = ill->ill_ipif; ipif != NULL; 8474 ipif = ipif->ipif_next) { 8475 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8476 !(flags & LIFC_NOXMIT)) 8477 continue; 8478 8479 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8480 !(flags & LIFC_TEMPORARY)) 8481 continue; 8482 8483 if (((ipif->ipif_flags & 8484 (IPIF_NOXMIT|IPIF_NOLOCAL| 8485 IPIF_DEPRECATED)) || 8486 (ill->ill_phyint->phyint_flags & 8487 PHYI_LOOPBACK) || 8488 !(ipif->ipif_flags & IPIF_UP)) && 8489 (flags & LIFC_EXTERNAL_SOURCE)) 8490 continue; 8491 8492 if (zoneid != ipif->ipif_zoneid && 8493 ipif->ipif_zoneid != ALL_ZONES && 8494 (zoneid != GLOBAL_ZONEID || 8495 !(flags & LIFC_ALLZONES))) 8496 continue; 8497 8498 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8499 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8500 rw_exit(&ill_g_lock); 8501 return (EINVAL); 8502 } else { 8503 goto lif_copydone; 8504 } 8505 } 8506 8507 (void) ipif_get_name(ipif, 8508 lifr->lifr_name, 8509 sizeof (lifr->lifr_name)); 8510 if (ipif->ipif_isv6) { 8511 sin6 = (sin6_t *)&lifr->lifr_addr; 8512 *sin6 = sin6_null; 8513 sin6->sin6_family = AF_INET6; 8514 sin6->sin6_addr = 8515 ipif->ipif_v6lcl_addr; 8516 lifr->lifr_addrlen = 8517 ip_mask_to_plen_v6( 8518 &ipif->ipif_v6net_mask); 8519 } else { 8520 sin = (sin_t *)&lifr->lifr_addr; 8521 *sin = sin_null; 8522 sin->sin_family = AF_INET; 8523 
sin->sin_addr.s_addr = 8524 ipif->ipif_lcl_addr; 8525 lifr->lifr_addrlen = 8526 ip_mask_to_plen( 8527 ipif->ipif_net_mask); 8528 } 8529 lifr++; 8530 } 8531 } 8532 lif_copydone: 8533 rw_exit(&ill_g_lock); 8534 8535 mp1->b_wptr = (uchar_t *)lifr; 8536 if (STRUCT_BUF(lifc) != NULL) { 8537 STRUCT_FSET(lifc, lifc_len, 8538 (int)((uchar_t *)lifr - mp1->b_rptr)); 8539 } 8540 return (0); 8541 } 8542 8543 /* ARGSUSED */ 8544 int 8545 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 8546 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8547 { 8548 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8549 ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 8550 return (0); 8551 } 8552 8553 static void 8554 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8555 { 8556 ip6_asp_t *table; 8557 size_t table_size; 8558 mblk_t *data_mp; 8559 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8560 8561 /* These two ioctls are I_STR only */ 8562 if (iocp->ioc_count == TRANSPARENT) { 8563 miocnak(q, mp, 0, EINVAL); 8564 return; 8565 } 8566 8567 data_mp = mp->b_cont; 8568 if (data_mp == NULL) { 8569 /* The user passed us a NULL argument */ 8570 table = NULL; 8571 table_size = iocp->ioc_count; 8572 } else { 8573 /* 8574 * The user provided a table. The stream head 8575 * may have copied in the user data in chunks, 8576 * so make sure everything is pulled up 8577 * properly. 8578 */ 8579 if (MBLKL(data_mp) < iocp->ioc_count) { 8580 mblk_t *new_data_mp; 8581 if ((new_data_mp = msgpullup(data_mp, -1)) == 8582 NULL) { 8583 miocnak(q, mp, 0, ENOMEM); 8584 return; 8585 } 8586 freemsg(data_mp); 8587 data_mp = new_data_mp; 8588 mp->b_cont = data_mp; 8589 } 8590 table = (ip6_asp_t *)data_mp->b_rptr; 8591 table_size = iocp->ioc_count; 8592 } 8593 8594 switch (iocp->ioc_cmd) { 8595 case SIOCGIP6ADDRPOLICY: 8596 iocp->ioc_rval = ip6_asp_get(table, table_size); 8597 if (iocp->ioc_rval == -1) 8598 iocp->ioc_error = EINVAL; 8599 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8600 else if (table != NULL && 8601 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8602 ip6_asp_t *src = table; 8603 ip6_asp32_t *dst = (void *)table; 8604 int count = table_size / sizeof (ip6_asp_t); 8605 int i; 8606 8607 /* 8608 * We need to do an in-place shrink of the array 8609 * to match the alignment attributes of the 8610 * 32-bit ABI that will be looking at it. 8611 */ 8612 /* LINTED: logical expression always true: op "||" */ 8613 ASSERT(sizeof (*src) > sizeof (*dst)); 8614 for (i = 1; i < count; i++) 8615 bcopy(src + i, dst + i, sizeof (*dst)); 8616 } 8617 #endif 8618 break; 8619 8620 case SIOCSIP6ADDRPOLICY: 8621 ASSERT(mp->b_prev == NULL); 8622 mp->b_prev = (void *)q; 8623 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8624 /* 8625 * We pass in the datamodel here so that the ip6_asp_replace() 8626 * routine can handle converting from 32-bit to native formats 8627 * where necessary. 8628 * 8629 * A better way to handle this might be to convert the inbound 8630 * data structure here, and hang it off a new 'mp'; thus the 8631 * ip6_asp_replace() logic would always be dealing with native 8632 * format data structures. 8633 * 8634 * (An even simpler way to handle these ioctls is to just 8635 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8636 * and just recompile everything that depends on it.) 8637 */ 8638 #endif 8639 ip6_asp_replace(mp, table, table_size, B_FALSE, 8640 iocp->ioc_flag & IOC_MODELS); 8641 return; 8642 } 8643 8644 DB_TYPE(mp) = (iocp->ioc_error == 0) ?
M_IOCACK : M_IOCNAK; 8645 qreply(q, mp); 8646 } 8647 8648 static void 8649 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8650 { 8651 mblk_t *data_mp; 8652 struct dstinforeq *dir; 8653 uint8_t *end, *cur; 8654 in6_addr_t *daddr, *saddr; 8655 ipaddr_t v4daddr; 8656 ire_t *ire; 8657 char *slabel, *dlabel; 8658 boolean_t isipv4; 8659 int match_ire; 8660 ill_t *dst_ill; 8661 ipif_t *src_ipif, *ire_ipif; 8662 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8663 zoneid_t zoneid; 8664 8665 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8666 zoneid = Q_TO_CONN(q)->conn_zoneid; 8667 8668 /* 8669 * This ioctl is I_STR only, and must have a 8670 * data mblk following the M_IOCTL mblk. 8671 */ 8672 data_mp = mp->b_cont; 8673 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8674 miocnak(q, mp, 0, EINVAL); 8675 return; 8676 } 8677 8678 if (MBLKL(data_mp) < iocp->ioc_count) { 8679 mblk_t *new_data_mp; 8680 8681 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8682 miocnak(q, mp, 0, ENOMEM); 8683 return; 8684 } 8685 freemsg(data_mp); 8686 data_mp = new_data_mp; 8687 mp->b_cont = data_mp; 8688 } 8689 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8690 8691 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8692 end - cur >= sizeof (struct dstinforeq); 8693 cur += sizeof (struct dstinforeq)) { 8694 dir = (struct dstinforeq *)cur; 8695 daddr = &dir->dir_daddr; 8696 saddr = &dir->dir_saddr; 8697 8698 /* 8699 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8700 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8701 * and ipif_select_source[_v6]() do not. 8702 */ 8703 dir->dir_dscope = ip_addr_scope_v6(daddr); 8704 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence); 8705 8706 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 8707 if (isipv4) { 8708 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 8709 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 8710 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8711 } else { 8712 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 8713 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8714 } 8715 if (ire == NULL) { 8716 dir->dir_dreachable = 0; 8717 8718 /* move on to next dst addr */ 8719 continue; 8720 } 8721 dir->dir_dreachable = 1; 8722 8723 ire_ipif = ire->ire_ipif; 8724 if (ire_ipif == NULL) 8725 goto next_dst; 8726 8727 /* 8728 * We expect to get back an interface ire or a 8729 * gateway ire cache entry. For both types, the 8730 * output interface is ire_ipif->ipif_ill. 8731 */ 8732 dst_ill = ire_ipif->ipif_ill; 8733 dir->dir_dmactype = dst_ill->ill_mactype; 8734 8735 if (isipv4) { 8736 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 8737 } else { 8738 src_ipif = ipif_select_source_v6(dst_ill, 8739 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 8740 zoneid); 8741 } 8742 if (src_ipif == NULL) 8743 goto next_dst; 8744 8745 *saddr = src_ipif->ipif_v6lcl_addr; 8746 dir->dir_sscope = ip_addr_scope_v6(saddr); 8747 slabel = ip6_asp_lookup(saddr, NULL); 8748 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 8749 dir->dir_sdeprecated = 8750 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 8751 ipif_refrele(src_ipif); 8752 next_dst: 8753 ire_refrele(ire); 8754 } 8755 miocack(q, mp, iocp->ioc_count, 0); 8756 } 8757 8758 8759 /* 8760 * Check if this is an address assigned to this machine. 8761 * Skips interfaces that are down by using ire checks. 8762 * Translates mapped addresses to v4 addresses and then 8763 * treats them as such, returning true if the v4 address 8764 * associated with this mapped address is configured. 
8765 * Note: Applications will have to be careful what they do 8766 * with the response; use of mapped addresses limits 8767 * what can be done with the socket, especially with 8768 * respect to socket options and ioctls - neither IPv4 8769 * options nor IPv6 sticky options/ancillary data options 8770 * may be used. 8771 */ 8772 /* ARGSUSED */ 8773 int 8774 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8775 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8776 { 8777 struct sioc_addrreq *sia; 8778 sin_t *sin; 8779 ire_t *ire; 8780 mblk_t *mp1; 8781 zoneid_t zoneid; 8782 8783 ip1dbg(("ip_sioctl_tmyaddr")); 8784 8785 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8786 zoneid = Q_TO_CONN(q)->conn_zoneid; 8787 8788 /* Existence verified in ip_wput_nondata */ 8789 mp1 = mp->b_cont->b_cont; 8790 sia = (struct sioc_addrreq *)mp1->b_rptr; 8791 sin = (sin_t *)&sia->sa_addr; 8792 switch (sin->sin_family) { 8793 case AF_INET6: { 8794 sin6_t *sin6 = (sin6_t *)sin; 8795 8796 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8797 ipaddr_t v4_addr; 8798 8799 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8800 v4_addr); 8801 ire = ire_ctable_lookup(v4_addr, 0, 8802 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8803 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8804 } else { 8805 in6_addr_t v6addr; 8806 8807 v6addr = sin6->sin6_addr; 8808 ire = ire_ctable_lookup_v6(&v6addr, 0, 8809 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8810 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8811 } 8812 break; 8813 } 8814 case AF_INET: { 8815 ipaddr_t v4addr; 8816 8817 v4addr = sin->sin_addr.s_addr; 8818 ire = ire_ctable_lookup(v4addr, 0, 8819 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8820 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8821 break; 8822 } 8823 default: 8824 return (EAFNOSUPPORT); 8825 } 8826 if (ire != NULL) { 8827 sia->sa_res = 1; 8828 ire_refrele(ire); 8829 } else { 8830 sia->sa_res = 0; 8831 } 8832 return (0); 8833 } 8834 8835 /* 8836 * Check if this is an address assigned on-link, i.e., a neighbor, 8837 * and make sure it's reachable from the current zone. 8838 * Returns true for my addresses as well. 8839 * Translates mapped addresses to v4 addresses and then 8840 * treats them as such, returning true if the v4 address 8841 * associated with this mapped address is configured. 8842 * Note: Applications will have to be careful what they do 8843 * with the response; use of mapped addresses limits 8844 * what can be done with the socket, especially with 8845 * respect to socket options and ioctls - neither IPv4 8846 * options nor IPv6 sticky options/ancillary data options 8847 * may be used. 8848 */ 8849 /* ARGSUSED */ 8850 int 8851 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8852 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8853 { 8854 struct sioc_addrreq *sia; 8855 sin_t *sin; 8856 mblk_t *mp1; 8857 ire_t *ire = NULL; 8858 zoneid_t zoneid; 8859 8860 ip1dbg(("ip_sioctl_tonlink")); 8861 8862 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8863 zoneid = Q_TO_CONN(q)->conn_zoneid; 8864 8865 /* Existence verified in ip_wput_nondata */ 8866 mp1 = mp->b_cont->b_cont; 8867 sia = (struct sioc_addrreq *)mp1->b_rptr; 8868 sin = (sin_t *)&sia->sa_addr; 8869 8870 /* 8871 * Match addresses with a zero gateway field to avoid 8872 * routes going through a router. 8873 * Exclude broadcast and multicast addresses. 8874 */ 8875 switch (sin->sin_family) { 8876 case AF_INET6: { 8877 sin6_t *sin6 = (sin6_t *)sin; 8878 8879 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8880 ipaddr_t v4_addr; 8881 8882 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8883 v4_addr); 8884 if (!CLASSD(v4_addr)) { 8885 ire = ire_route_lookup(v4_addr, 0, 0, 0, 8886 NULL, NULL, zoneid, NULL, 8887 MATCH_IRE_GW); 8888 } 8889 } else { 8890 in6_addr_t v6addr; 8891 in6_addr_t v6gw; 8892 8893 v6addr = sin6->sin6_addr; 8894 v6gw = ipv6_all_zeros; 8895 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8896 ire = ire_route_lookup_v6(&v6addr, 0, 8897 &v6gw, 0, NULL, NULL, zoneid, 8898 NULL, MATCH_IRE_GW); 8899 } 8900 } 8901 break; 8902 } 8903 case AF_INET: { 8904 ipaddr_t v4addr; 8905 8906 v4addr = sin->sin_addr.s_addr; 8907 if (!CLASSD(v4addr)) { 8908 ire = ire_route_lookup(v4addr, 0, 0, 0, 8909 NULL, NULL, zoneid, NULL, 8910 MATCH_IRE_GW); 8911 } 8912 break; 8913 } 8914 default: 8915 return (EAFNOSUPPORT); 8916 } 8917 sia->sa_res = 0; 8918 if (ire != NULL) { 8919 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 8920 IRE_LOCAL|IRE_LOOPBACK)) { 8921 sia->sa_res = 1; 8922 } 8923 ire_refrele(ire); 8924 } 8925 return (0); 8926 }
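/*
 * Illustrative userland usage of SIOCTMYADDR/SIOCTONLINK (a sketch;
 * the address is hypothetical and error handling is elided):
 *
 *	struct sioc_addrreq sar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sar.sa_addr;
 *
 *	bzero(&sar, sizeof (sar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.1");
 *	(void) ioctl(s, SIOCTMYADDR, (caddr_t)&sar);
 *	if (sar.sa_res)
 *		... the address is configured on this node ...
 */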
8927 8928 /* 8929 * TBD: implement when kernel maintains a list of site prefixes. 8930 */ 8931 /* ARGSUSED */ 8932 int 8933 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8934 ip_ioctl_cmd_t *ipip, void *ifreq) 8935 { 8936 return (ENXIO); 8937 } 8938 8939 /* ARGSUSED */ 8940 int 8941 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8942 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8943 { 8944 ill_t *ill; 8945 mblk_t *mp1; 8946 conn_t *connp; 8947 boolean_t success; 8948 8949 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 8950 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 8951 /* ioctl comes down on a conn */ 8952 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8953 connp = Q_TO_CONN(q); 8954 8955 mp->b_datap->db_type = M_IOCTL; 8956 8957 /* 8958 * Send down a copy. (copymsg does not copy b_next/b_prev). 8959 * The original mp contains contaminated b_next values due to 'mi', 8960 * which is needed to do the mi_copy_done. Unfortunately if we 8961 * send down the original mblk itself and if we are popped due to 8962 * an unplumb before the response comes back from the tunnel, 8963 * the streamhead (which does a freemsg) will see this contaminated 8964 * message and the assertion in freemsg about non-null b_next/b_prev 8965 * will panic a DEBUG kernel.
8966 */ 8967 mp1 = copymsg(mp); 8968 if (mp1 == NULL) 8969 return (ENOMEM); 8970 8971 ill = ipif->ipif_ill; 8972 mutex_enter(&connp->conn_lock); 8973 mutex_enter(&ill->ill_lock); 8974 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 8975 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 8976 mp, 0); 8977 } else { 8978 success = ill_pending_mp_add(ill, connp, mp); 8979 } 8980 mutex_exit(&ill->ill_lock); 8981 mutex_exit(&connp->conn_lock); 8982 8983 if (success) { 8984 ip1dbg(("sending down tunparam request ")); 8985 putnext(ill->ill_wq, mp1); 8986 return (EINPROGRESS); 8987 } else { 8988 /* The conn has started closing */ 8989 freemsg(mp1); 8990 return (EINTR); 8991 } 8992 } 8993 8994 static int 8995 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, 8996 boolean_t x_arp_ioctl, boolean_t if_arp_ioctl) 8997 { 8998 mblk_t *mp1; 8999 mblk_t *mp2; 9000 mblk_t *pending_mp; 9001 ipaddr_t ipaddr; 9002 area_t *area; 9003 struct iocblk *iocp; 9004 conn_t *connp; 9005 struct arpreq *ar; 9006 struct xarpreq *xar; 9007 boolean_t success; 9008 int flags, alength; 9009 char *lladdr; 9010 9011 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9012 connp = Q_TO_CONN(q); 9013 9014 iocp = (struct iocblk *)mp->b_rptr; 9015 /* 9016 * ill has already been set depending on whether 9017 * bsd style or interface style ioctl. 9018 */ 9019 ASSERT(ill != NULL); 9020 9021 /* 9022 * Is this one of the new SIOC*XARP ioctls? 9023 */ 9024 if (x_arp_ioctl) { 9025 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9026 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9027 ar = NULL; 9028 9029 flags = xar->xarp_flags; 9030 lladdr = LLADDR(&xar->xarp_ha); 9031 /* 9032 * Validate against user's link layer address length 9033 * input and name and addr length limits. 9034 */ 9035 alength = ill->ill_phys_addr_length; 9036 if (iocp->ioc_cmd == SIOCSXARP) { 9037 if (alength != xar->xarp_ha.sdl_alen || 9038 (alength + xar->xarp_ha.sdl_nlen > 9039 sizeof (xar->xarp_ha.sdl_data))) 9040 return (EINVAL); 9041 } 9042 } else { 9043 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9044 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9045 xar = NULL; 9046 9047 flags = ar->arp_flags; 9048 lladdr = ar->arp_ha.sa_data; 9049 /* 9050 * Theoretically, the sa_family could tell us what link 9051 * layer type this operation is trying to deal with. By 9052 * common usage AF_UNSPEC means ethernet. We'll assume 9053 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9054 * for now. Our new SIOC*XARP ioctls can be used more 9055 * generally. 9056 * 9057 * If the underlying media happens to have a non 6 byte 9058 * address, arp module will fail set/get, but the del 9059 * operation will succeed. 9060 */ 9061 alength = 6; 9062 if ((iocp->ioc_cmd != SIOCDARP) && 9063 (alength != ill->ill_phys_addr_length)) { 9064 return (EINVAL); 9065 } 9066 } 9067 9068 /* 9069 * We are going to pass up to ARP a packet chain that looks 9070 * like: 9071 * 9072 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9073 * 9074 * Get a copy of the original IOCTL mblk to head the chain, 9075 * to be sent up (in mp1). Also get another copy to store 9076 * in the ill_pending_mp list, for matching the response 9077 * when it comes back from ARP. 
9078 */ 9079 mp1 = copyb(mp); 9080 pending_mp = copymsg(mp); 9081 if (mp1 == NULL || pending_mp == NULL) { 9082 if (mp1 != NULL) 9083 freeb(mp1); 9084 if (pending_mp != NULL) 9085 inet_freemsg(pending_mp); 9086 return (ENOMEM); 9087 } 9088 9089 ipaddr = sin->sin_addr.s_addr; 9090 9091 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9092 (caddr_t)&ipaddr); 9093 if (mp2 == NULL) { 9094 freeb(mp1); 9095 inet_freemsg(pending_mp); 9096 return (ENOMEM); 9097 } 9098 /* Put together the chain. */ 9099 mp1->b_cont = mp2; 9100 mp1->b_datap->db_type = M_IOCTL; 9101 mp2->b_cont = mp; 9102 mp2->b_datap->db_type = M_DATA; 9103 9104 iocp = (struct iocblk *)mp1->b_rptr; 9105 9106 /* 9107 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9108 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9109 * cp_private field (or cp_rval on 32-bit systems) in place of the 9110 * ioc_count field; set ioc_count to be correct. 9111 */ 9112 iocp->ioc_count = MBLKL(mp1->b_cont); 9113 9114 /* 9115 * Set the proper command in the ARP message. 9116 * Convert the SIOC{G|S|D}ARP calls into our 9117 * AR_ENTRY_xxx calls. 9118 */ 9119 area = (area_t *)mp2->b_rptr; 9120 switch (iocp->ioc_cmd) { 9121 case SIOCDARP: 9122 case SIOCDXARP: 9123 /* 9124 * We defer deleting the corresponding IRE until 9125 * we return from arp. 9126 */ 9127 area->area_cmd = AR_ENTRY_DELETE; 9128 area->area_proto_mask_offset = 0; 9129 break; 9130 case SIOCGARP: 9131 case SIOCGXARP: 9132 area->area_cmd = AR_ENTRY_SQUERY; 9133 area->area_proto_mask_offset = 0; 9134 break; 9135 case SIOCSARP: 9136 case SIOCSXARP: { 9137 /* 9138 * Delete the corresponding ire to make sure IP will 9139 * pick up any change from arp. 9140 */ 9141 if (!if_arp_ioctl) { 9142 (void) ip_ire_clookup_and_delete(ipaddr, NULL); 9143 break; 9144 } else { 9145 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9146 if (ipif != NULL) { 9147 (void) ip_ire_clookup_and_delete(ipaddr, ipif); 9148 ipif_refrele(ipif); 9149 } 9150 break; 9151 } 9152 } 9153 } 9154 iocp->ioc_cmd = area->area_cmd; 9155 9156 /* 9157 * Before sending 'mp' to ARP, we have to clear the b_next 9158 * and b_prev. Otherwise if STREAMS encounters such a message 9159 * in freemsg() (because ARP can close any time) it can cause 9160 * a panic. But mi code needs the b_next and b_prev values of 9161 * mp->b_cont to complete the ioctl. So we store them here 9162 * in pending_mp->b_cont, and restore them in ip_sioctl_iocack() 9163 * when the response comes down from ARP. 9164 */ 9165 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9166 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9167 mp->b_cont->b_next = NULL; 9168 mp->b_cont->b_prev = NULL; 9169 9170 mutex_enter(&connp->conn_lock); 9171 mutex_enter(&ill->ill_lock); 9172 /* conn has not yet started closing, hence this can't fail */ 9173 success = ill_pending_mp_add(ill, connp, pending_mp); 9174 ASSERT(success); 9175 mutex_exit(&ill->ill_lock); 9176 mutex_exit(&connp->conn_lock); 9177 9178 /* 9179 * Fill in the rest of the ARP operation fields. 9180 */ 9181 area->area_hw_addr_length = alength; 9182 bcopy(lladdr, 9183 (char *)area + area->area_hw_addr_offset, 9184 area->area_hw_addr_length); 9185 /* Translate the flags. */ 9186 if (flags & ATF_PERM) 9187 area->area_flags |= ACE_F_PERMANENT; 9188 if (flags & ATF_PUBL) 9189 area->area_flags |= ACE_F_PUBLISH; 9190 if (flags & ATF_AUTHORITY) 9191 area->area_flags |= ACE_F_AUTHORITY; 9192 9193 /* 9194 * Up to ARP it goes. The response will come 9195 * back in ip_wput as an M_IOCACK message, and 9196 * will be handed to ip_sioctl_iocack for 9197 * completion. 9198 */ 9199 putnext(ill->ill_rq, mp1); 9200 return (EINPROGRESS); 9201 }
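/*
 * Illustrative userland usage of the extended ARP ioctls handled below
 * (a sketch; the address is hypothetical, and sdl_nlen == 0 selects the
 * BSD-style lookup described in ip_sioctl_xarp()):
 *
 *	struct xarpreq xar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&xar.xarp_pa;
 *
 *	bzero(&xar, sizeof (xar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.2");
 *	xar.xarp_ha.sdl_family = AF_LINK;
 *	if (ioctl(s, SIOCGXARP, (caddr_t)&xar) < 0)
 *		...
 */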
9202 9203 /* ARGSUSED */ 9204 int 9205 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9206 ip_ioctl_cmd_t *ipip, void *ifreq) 9207 { 9208 struct xarpreq *xar; 9209 boolean_t isv6; 9210 mblk_t *mp1; 9211 int err; 9212 conn_t *connp; 9213 int ifnamelen; 9214 ire_t *ire = NULL; 9215 ill_t *ill = NULL; 9216 struct sockaddr_in *sin; 9217 boolean_t if_arp_ioctl = B_FALSE; 9218 9219 /* ioctl comes down on a conn */ 9220 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9221 connp = Q_TO_CONN(q); 9222 isv6 = connp->conn_af_isv6; 9223 9224 /* Existence verified in ip_wput_nondata */ 9225 mp1 = mp->b_cont->b_cont; 9226 9227 ASSERT(MBLKL(mp1) >= sizeof (*xar)); 9228 xar = (struct xarpreq *)mp1->b_rptr; 9229 sin = (sin_t *)&xar->xarp_pa; 9230 9231 if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) || 9232 (xar->xarp_pa.ss_family != AF_INET)) 9233 return (ENXIO); 9234 9235 ifnamelen = xar->xarp_ha.sdl_nlen; 9236 if (ifnamelen != 0) { 9237 char *cptr, cval; 9238 9239 if (ifnamelen >= LIFNAMSIZ) 9240 return (EINVAL); 9241 9242 /* 9243 * Instead of bcopying a bunch of bytes, 9244 * null-terminate the string in-situ. 9245 */ 9246 cptr = xar->xarp_ha.sdl_data + ifnamelen; 9247 cval = *cptr; 9248 *cptr = '\0'; 9249 ill = ill_lookup_on_name(xar->xarp_ha.sdl_data, 9250 B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl, 9251 &err, NULL); 9252 *cptr = cval; 9253 if (ill == NULL) 9254 return (err); 9255 if (ill->ill_net_type != IRE_IF_RESOLVER) { 9256 ill_refrele(ill); 9257 return (ENXIO); 9258 } 9259 9260 if_arp_ioctl = B_TRUE; 9261 } else { 9262 /* 9263 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves 9264 * as an extended BSD ioctl. The kernel uses the IP address 9265 * to figure out the network interface. 9266 */ 9267 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9268 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9269 ((ill = ire_to_ill(ire)) == NULL) || 9270 (ill->ill_net_type != IRE_IF_RESOLVER)) { 9271 if (ire != NULL) 9272 ire_refrele(ire); 9273 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9274 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9275 NULL, MATCH_IRE_TYPE); 9276 if ((ire == NULL) || 9277 ((ill = ire_to_ill(ire)) == NULL)) { 9278 if (ire != NULL) 9279 ire_refrele(ire); 9280 return (ENXIO); 9281 } 9282 } 9283 ASSERT(ire != NULL && ill != NULL); 9284 } 9285 9286 err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl); 9287 if (if_arp_ioctl) 9288 ill_refrele(ill); 9289 if (ire != NULL) 9290 ire_refrele(ire); 9291 9292 return (err); 9293 } 9294 9295 /* 9296 * ARP IOCTLs. 9297 * How does IP get in the business of fronting ARP configuration/queries? 9298 * Well, it's like this: the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9299 * are by tradition passed in through a datagram socket. That lands in IP. 9300 * As it happens, this is just as well since the interface is quite crude in 9301 * that it passes in no information about protocol or hardware types, or 9302 * interface association. After making the protocol assumption, IP is in 9303 * the position to look up the name of the ILL, which ARP will need, and 9304 * format a request that can be handled by ARP. The request is passed up 9305 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9306 * back a response. ARP supports its own set of more general IOCTLs, in 9307 * case anyone is interested. 9308 */
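/*
 * Illustrative userland usage of the traditional interface described
 * above (a sketch; the address is hypothetical):
 *
 *	struct arpreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.arp_pa;
 *
 *	bzero(&ar, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.2");
 *	if (ioctl(s, SIOCGARP, (caddr_t)&ar) == 0)
 *		... ar.arp_ha.sa_data holds the hardware address ...
 */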
9309 /* ARGSUSED */ 9310 int 9311 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9312 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9313 { 9314 struct arpreq *ar; 9315 struct sockaddr_in *sin; 9316 ire_t *ire; 9317 boolean_t isv6; 9318 mblk_t *mp1; 9319 int err; 9320 conn_t *connp; 9321 ill_t *ill; 9322 9323 /* ioctl comes down on a conn */ 9324 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9325 connp = Q_TO_CONN(q); 9326 isv6 = connp->conn_af_isv6; 9327 if (isv6) 9328 return (ENXIO); 9329 9330 /* Existence verified in ip_wput_nondata */ 9331 mp1 = mp->b_cont->b_cont; 9332 9333 ar = (struct arpreq *)mp1->b_rptr; 9334 sin = (sin_t *)&ar->arp_pa; 9335 9336 /* 9337 * We need to let ARP know on which interface the IP 9338 * address has an ARP mapping. In the IPMP case, a 9339 * simple forwarding table lookup will return the 9340 * IRE_IF_RESOLVER for the first interface in the group, 9341 * which might not be the interface on which the 9342 * requested IP address was resolved due to the ill 9343 * selection algorithm (see ip_newroute_get_dst_ill()). 9344 * So we do a cache table lookup first: if the IRE cache 9345 * entry for the IP address is still there, it will 9346 * contain the ill pointer for the right interface, so 9347 * we use that. If the cache entry has been flushed, we 9348 * fall back to the forwarding table lookup. This should 9349 * be rare enough since IRE cache entries have a longer 9350 * life expectancy than ARP cache entries. 9351 */ 9352 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9353 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9354 ((ill = ire_to_ill(ire)) == NULL)) { 9355 if (ire != NULL) 9356 ire_refrele(ire); 9357 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9358 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9359 NULL, MATCH_IRE_TYPE); 9360 if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) { 9361 if (ire != NULL) 9362 ire_refrele(ire); 9363 return (ENXIO); 9364 } 9365 } 9366 ASSERT(ire != NULL && ill != NULL); 9367 9368 err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE); 9369 ire_refrele(ire); 9370 return (err); 9371 }
/*
 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
 * atomically set/clear the muxids. Also complete the ioctl by acking or
 * naking it. Note that the code is structured such that the link type,
 * whether it's persistent or not, is treated equally. ifconfig(1M) and
 * its clones use the persistent link, while pppd(1M) and perhaps many
 * other daemons may use a non-persistent link. When combined with some
 * ill_t states, linking and unlinking lower streams may be used as
 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
 */
/* ARGSUSED */
void
ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
    mblk_t *mp1;
    mblk_t *mp2;
    struct linkblk *li;
    queue_t *ipwq;
    char *name;
    struct qinit *qinfo;
    struct ipmx_s *ipmxp;
    ill_t *ill = NULL;
    struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    int err = 0;
    boolean_t entered_ipsq = B_FALSE;
    boolean_t islink;
    queue_t *dwq = NULL;

    ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK ||
        iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK);

    islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ?
        B_TRUE : B_FALSE;

    mp1 = mp->b_cont;	/* This is the linkblk info */
    li = (struct linkblk *)mp1->b_rptr;

    /*
     * ARP has added this special mblk, and the utility is asking us
     * to perform consistency checks, and also atomically set the
     * muxid. Ifconfig is an example. It achieves this by using
     * /dev/arp as the mux to plink the arp stream, and pushes arp on
     * to the /dev/udp[6] stream for use as the mux when plinking the IP
     * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c
     * and other comments in this routine for more details.
     */
    mp2 = mp1->b_cont;	/* This is added by ARP */

    /*
     * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than
     * ifconfig which didn't push ARP on top of the dummy mux, we won't
     * get the special mblk above. For backward compatibility, we just
     * return success. The utility will use SIOCSLIFMUXID to store
     * the muxids. This is not atomic, and can leave the streams
     * unplumbable if the utility is interrupted before it does the
     * SIOCSLIFMUXID.
     */
    if (mp2 == NULL) {
        /*
         * At this point we don't know whether or not this is the
         * IP module stream or the ARP device stream. We need to
         * walk the lower stream in order to find this out, since
         * the capability negotiation is done only on the IP module
         * stream. An IP module instance is identified by the module
         * name IP, a non-null q_next, and its wput not being ip_lwput.
         * STREAMS ensures that the lower stream (l_qbot) will not
         * vanish until this ioctl completes. So we can safely walk
         * the stream or refer to the q_ptr.
         */
        ipwq = li->l_qbot;
        while (ipwq != NULL) {
            qinfo = ipwq->q_qinfo;
            name = qinfo->qi_minfo->mi_idname;
            if (name != NULL && name[0] != '\0' &&
                (strcmp(name, ip_mod_info.mi_idname) == 0) &&
                ((void *)(qinfo->qi_putp) != (void *)ip_lwput) &&
                (ipwq->q_next != NULL)) {
                break;
            }
            ipwq = ipwq->q_next;
        }
        /*
         * This looks like an IP module stream, so trigger
         * the capability reset or re-negotiation if necessary.
         */
        if (ipwq != NULL) {
            ill = ipwq->q_ptr;
            ASSERT(ill != NULL);

            if (ipsq == NULL) {
                ipsq = ipsq_try_enter(NULL, ill, q, mp,
                    ip_sioctl_plink, NEW_OP, B_TRUE);
                if (ipsq == NULL)
                    return;
                entered_ipsq = B_TRUE;
            }
            ASSERT(IAM_WRITER_ILL(ill));
            /*
             * Store the upper read queue of the module
             * immediately below IP, and count the total
             * number of lower modules. Do this only
             * for the I_PLINK or I_LINK event.
             */
            ill->ill_lmod_rq = NULL;
            ill->ill_lmod_cnt = 0;
            if (islink && (dwq = ipwq->q_next) != NULL) {
                ill->ill_lmod_rq = RD(dwq);

                while (dwq != NULL) {
                    ill->ill_lmod_cnt++;
                    dwq = dwq->q_next;
                }
            }
            /*
             * There's no point in resetting or re-negotiating if
             * we are not bound to the driver, so only do this if
             * the DLPI state is idle (up); we assume such state
             * since ill_ipif_up_count gets incremented in
             * ipif_up_done(), which is after we are bound to the
             * driver. Note that in the case of logical
             * interfaces, IP won't rebind to the driver unless
             * the ill_ipif_up_count is 0, meaning that all other
             * IP interfaces (including the main ipif) are in the
             * down state. Because of this, we use such a counter
             * as an indicator, instead of relying on the IPIF_UP
             * flag, which is per ipif instance.
             */
            if (ill->ill_ipif_up_count > 0) {
                if (islink)
                    ill_capability_probe(ill);
                else
                    ill_capability_reset(ill);
            }
        }
        goto done;
    }
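    /*
     * Illustrative sketch of the resulting plumbing (an assumption
     * drawn from the comments in this routine, simplified):
     *
     *	/dev/udp (mux)                 /dev/arp (mux)
     *	    |  I_PLINK                     |  I_PLINK
     *	    v                              v
     *	 arp -> ip -> driver            arp -> driver
     *	 ill_ip_muxid = l_index         ill_arp_muxid = l_index
     *
     * The muxids stored on the ill are what SIOCGLIFMUXID reports and
     * what an unplumb uses to issue the matching I_PUNLINKs.
     */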
    /*
     * This is an I_{P}LINK sent down by ifconfig on
     * /dev/arp. ARP has appended this last (3rd) mblk,
     * giving more info. STREAMS ensures that the lower
     * stream (l_qbot) will not vanish until this ioctl
     * completes. So we can safely walk the stream or refer
     * to the q_ptr.
     */
    ipmxp = (struct ipmx_s *)mp2->b_rptr;
    if (ipmxp->ipmx_arpdev_stream) {
        /*
         * The operation is occurring on the arp-device
         * stream.
         */
        ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE,
            q, mp, ip_sioctl_plink, &err, NULL);
        if (ill == NULL) {
            if (err == EINPROGRESS) {
                return;
            } else {
                err = EINVAL;
                goto done;
            }
        }

        if (ipsq == NULL) {
            ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
                NEW_OP, B_TRUE);
            if (ipsq == NULL) {
                ill_refrele(ill);
                return;
            }
            entered_ipsq = B_TRUE;
        }
        ASSERT(IAM_WRITER_ILL(ill));
        ill_refrele(ill);
        /*
         * To ensure consistency between IP and ARP,
         * the following LIFO scheme is used in
         * plink/punlink. (IP first, ARP last.)
         * This is because the muxids are stored
         * in the IP stream on the ill.
         *
         * I_{P}LINK: ifconfig plinks the IP stream before
         * plinking the ARP stream. On an arp-dev
         * stream, IP checks that it is not yet
         * plinked, and it also checks that the
         * corresponding IP stream is already plinked.
         *
         * I_{P}UNLINK: ifconfig punlinks the ARP stream
         * before punlinking the IP stream. IP does
         * not allow punlink of the IP stream unless
         * the arp stream has been punlinked.
         */
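        /*
         * Illustrative timeline of the LIFO rule above (the muxid
         * values are hypothetical):
         *
         *	plumb:    I_PLINK IP stream     ill_ip_muxid = 5
         *	          I_PLINK ARP stream    ill_arp_muxid = 6
         *	unplumb:  I_PUNLINK ARP stream  ill_arp_muxid = 0
         *	          I_PUNLINK IP stream   ill_ip_muxid = 0
         *
         * The consistency checks below enforce exactly this ordering.
         */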
        if ((islink &&
            (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) ||
            (!islink &&
            ill->ill_arp_muxid != li->l_index)) {
            err = EINVAL;
            goto done;
        }
        if (islink) {
            ill->ill_arp_muxid = li->l_index;
        } else {
            ill->ill_arp_muxid = 0;
        }
    } else {
        /*
         * This must be the IP module stream with or
         * without arp. Walk the stream and locate the
         * IP module. An IP module instance is
         * identified by the module name IP, a non-null
         * q_next, and its wput not being ip_lwput.
         */
        ipwq = li->l_qbot;
        while (ipwq != NULL) {
            qinfo = ipwq->q_qinfo;
            name = qinfo->qi_minfo->mi_idname;
            if (name != NULL && name[0] != '\0' &&
                (strcmp(name, ip_mod_info.mi_idname) == 0) &&
                ((void *)(qinfo->qi_putp) != (void *)ip_lwput) &&
                (ipwq->q_next != NULL)) {
                break;
            }
            ipwq = ipwq->q_next;
        }
        if (ipwq != NULL) {
            ill = ipwq->q_ptr;
            ASSERT(ill != NULL);

            if (ipsq == NULL) {
                ipsq = ipsq_try_enter(NULL, ill, q, mp,
                    ip_sioctl_plink, NEW_OP, B_TRUE);
                if (ipsq == NULL)
                    return;
                entered_ipsq = B_TRUE;
            }
            ASSERT(IAM_WRITER_ILL(ill));
            /*
             * Return an error if ill_ip_muxid is
             * non-zero and the command is I_{P}LINK.
             * If the command is I_{P}UNLINK, return
             * an error if the arp-devstr is not
             * yet punlinked.
             */
            if ((islink && ill->ill_ip_muxid != 0) ||
                (!islink && ill->ill_arp_muxid != 0)) {
                err = EINVAL;
                goto done;
            }
            ill->ill_lmod_rq = NULL;
            ill->ill_lmod_cnt = 0;
            if (islink) {
                /*
                 * Store the upper read queue of the module
                 * immediately below IP, and count the total
                 * number of lower modules.
                 */
                if ((dwq = ipwq->q_next) != NULL) {
                    ill->ill_lmod_rq = RD(dwq);

                    while (dwq != NULL) {
                        ill->ill_lmod_cnt++;
                        dwq = dwq->q_next;
                    }
                }
                ill->ill_ip_muxid = li->l_index;
            } else {
                ill->ill_ip_muxid = 0;
            }

            /*
             * See the comments above about resetting/re-
             * negotiating driver sub-capabilities.
             */
            if (ill->ill_ipif_up_count > 0) {
                if (islink)
                    ill_capability_probe(ill);
                else
                    ill_capability_reset(ill);
            }
        }
    }
done:
    iocp->ioc_count = 0;
    iocp->ioc_error = err;
    if (err == 0)
        mp->b_datap->db_type = M_IOCACK;
    else
        mp->b_datap->db_type = M_IOCNAK;
    qreply(q, mp);

    /* Conn was refheld in ip_sioctl_copyin_setup */
    if (CONN_Q(q))
        CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
    if (entered_ipsq)
        ipsq_exit(ipsq, B_TRUE, B_TRUE);
}

/*
 * Search for the ioctl command in the ioctl tables and return a pointer
 * to the ioctl command information. The ioctl command tables are
 * static and fully populated at compile time.
 */
ip_ioctl_cmd_t *
ip_sioctl_lookup(int ioc_cmd)
{
    int index;
    ip_ioctl_cmd_t *ipip;
    ip_ioctl_cmd_t *ipip_end;

    if (ioc_cmd == IPI_DONTCARE)
        return (NULL);

    /*
     * Do a 2 step search. First search the indexed table
     * based on the least significant byte of the ioctl cmd.
     * If we don't find a match, then search the misc table
     * serially.
     */
    index = ioc_cmd & 0xFF;
    if (index < ip_ndx_ioctl_count) {
        ipip = &ip_ndx_ioctl_table[index];
        if (ipip->ipi_cmd == ioc_cmd) {
            /* Found a match in the ndx table */
            return (ipip);
        }
    }

    /* Search the misc table */
    ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
    for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
        if (ipip->ipi_cmd == ioc_cmd)
            /* Found a match in the misc table */
            return (ipip);
    }

    return (NULL);
}

/*
 * Wrapper function for resuming deferred ioctl processing.
 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
 */
/* ARGSUSED */
void
ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
    void *dummy_arg)
{
    ip_sioctl_copyin_setup(q, mp);
}

/*
 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message
 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle
 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
 * We establish here the size of the block to be copied in. mi_copyin
 * arranges for this to happen, and processing continues in ip_wput with
 * an M_IOCDATA message.
 */
void
ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
{
    int copyin_size;
    struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
    ip_ioctl_cmd_t *ipip;
    cred_t *cr;

    ipip = ip_sioctl_lookup(iocp->ioc_cmd);
    if (ipip == NULL) {
        /*
         * The ioctl is not one we understand or own.
9744 * Pass it along to be processed down stream, 9745 * if this is a module instance of IP, else nak 9746 * the ioctl. 9747 */ 9748 if (q->q_next == NULL) { 9749 goto nak; 9750 } else { 9751 putnext(q, mp); 9752 return; 9753 } 9754 } 9755 9756 /* 9757 * If this is deferred, then we will do all the checks when we 9758 * come back. 9759 */ 9760 if ((iocp->ioc_cmd == SIOCGDSTINFO || 9761 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup()) { 9762 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 9763 return; 9764 } 9765 9766 /* 9767 * Only allow a very small subset of IP ioctls on this stream if 9768 * IP is a module and not a driver. Allowing ioctls to be processed 9769 * in this case may cause assert failures or data corruption. 9770 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 9771 * ioctls allowed on an IP module stream, after which this stream 9772 * normally becomes a multiplexor (at which time the stream head 9773 * will fail all ioctls). 9774 */ 9775 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 9776 if (ipip->ipi_flags & IPI_PASS_DOWN) { 9777 /* 9778 * Pass common Streams ioctls which the IP 9779 * module does not own or consume along to 9780 * be processed down stream. 9781 */ 9782 putnext(q, mp); 9783 return; 9784 } else { 9785 goto nak; 9786 } 9787 } 9788 9789 /* Make sure we have ioctl data to process. */ 9790 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 9791 goto nak; 9792 9793 /* 9794 * Prefer dblk credential over ioctl credential; some synthesized 9795 * ioctls have kcred set because there's no way to crhold() 9796 * a credential in some contexts. (ioc_cr is not crfree() by 9797 * the framework; the caller of ioctl needs to hold the reference 9798 * for the duration of the call). 9799 */ 9800 cr = DB_CREDDEF(mp, iocp->ioc_cr); 9801 9802 /* Make sure normal users don't send down privileged ioctls */ 9803 if ((ipip->ipi_flags & IPI_PRIV) && 9804 (cr != NULL) && secpolicy_net_config(cr, B_TRUE) != 0) { 9805 /* We checked the privilege earlier but log it here */ 9806 miocnak(q, mp, 0, secpolicy_net_config(cr, B_FALSE)); 9807 return; 9808 } 9809 9810 /* 9811 * The ioctl command tables can only encode fixed length 9812 * ioctl data. If the length is variable, the table will 9813 * encode the length as zero. Such special cases are handled 9814 * below in the switch. 9815 */ 9816 if (ipip->ipi_copyin_size != 0) { 9817 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 9818 return; 9819 } 9820 9821 switch (iocp->ioc_cmd) { 9822 case O_SIOCGIFCONF: 9823 case SIOCGIFCONF: 9824 /* 9825 * This IOCTL is hilarious. See comments in 9826 * ip_sioctl_get_ifconf for the story. 
9827 */ 9828 if (iocp->ioc_count == TRANSPARENT) 9829 copyin_size = SIZEOF_STRUCT(ifconf, 9830 iocp->ioc_flag); 9831 else 9832 copyin_size = iocp->ioc_count; 9833 mi_copyin(q, mp, NULL, copyin_size); 9834 return; 9835 9836 case O_SIOCGLIFCONF: 9837 case SIOCGLIFCONF: 9838 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 9839 mi_copyin(q, mp, NULL, copyin_size); 9840 return; 9841 9842 case SIOCGLIFSRCOF: 9843 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 9844 mi_copyin(q, mp, NULL, copyin_size); 9845 return; 9846 case SIOCGIP6ADDRPOLICY: 9847 ip_sioctl_ip6addrpolicy(q, mp); 9848 ip6_asp_table_refrele(); 9849 return; 9850 9851 case SIOCSIP6ADDRPOLICY: 9852 ip_sioctl_ip6addrpolicy(q, mp); 9853 return; 9854 9855 case SIOCGDSTINFO: 9856 ip_sioctl_dstinfo(q, mp); 9857 ip6_asp_table_refrele(); 9858 return; 9859 9860 case I_PLINK: 9861 case I_PUNLINK: 9862 case I_LINK: 9863 case I_UNLINK: 9864 /* 9865 * We treat non-persistent link similarly as the persistent 9866 * link case, in terms of plumbing/unplumbing, as well as 9867 * dynamic re-plumbing events indicator. See comments 9868 * in ip_sioctl_plink() for more. 9869 * 9870 * Request can be enqueued in the 'ipsq' while waiting 9871 * to become exclusive. So bump up the conn ref. 9872 */ 9873 if (CONN_Q(q)) 9874 CONN_INC_REF(Q_TO_CONN(q)); 9875 ip_sioctl_plink(NULL, q, mp, NULL); 9876 return; 9877 9878 case ND_GET: 9879 case ND_SET: 9880 /* 9881 * Use of the nd table requires holding the reader lock. 9882 * Modifying the nd table thru nd_load/nd_unload requires 9883 * the writer lock. 9884 */ 9885 rw_enter(&ip_g_nd_lock, RW_READER); 9886 if (nd_getset(q, ip_g_nd, mp)) { 9887 rw_exit(&ip_g_nd_lock); 9888 9889 if (iocp->ioc_error) 9890 iocp->ioc_count = 0; 9891 mp->b_datap->db_type = M_IOCACK; 9892 qreply(q, mp); 9893 return; 9894 } 9895 rw_exit(&ip_g_nd_lock); 9896 /* 9897 * We don't understand this subioctl of ND_GET / ND_SET. 
 * Maybe it is intended for some driver / module below us.
         */
        if (q->q_next) {
            putnext(q, mp);
        } else {
            iocp->ioc_error = ENOENT;
            mp->b_datap->db_type = M_IOCNAK;
            iocp->ioc_count = 0;
            qreply(q, mp);
        }
        return;

    case IP_IOCTL:
        ip_wput_ioctl(q, mp);
        return;
    default:
        cmn_err(CE_PANIC, "should not happen ");
    }
nak:
    if (mp->b_cont != NULL) {
        freemsg(mp->b_cont);
        mp->b_cont = NULL;
    }
    iocp->ioc_error = EINVAL;
    mp->b_datap->db_type = M_IOCNAK;
    iocp->ioc_count = 0;
    qreply(q, mp);
}

/* ip_wput hands off ARP IOCTL responses to us */
void
ip_sioctl_iocack(queue_t *q, mblk_t *mp)
{
    struct arpreq *ar;
    struct xarpreq *xar;
    area_t *area;
    mblk_t *area_mp;
    struct iocblk *iocp;
    mblk_t *orig_ioc_mp, *tmp;
    struct iocblk *orig_iocp;
    ill_t *ill;
    conn_t *connp = NULL;
    uint_t ioc_id;
    mblk_t *pending_mp;
    int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
    int *flagsp;
    char *storage = NULL;
    sin_t *sin;
    ipaddr_t addr;
    int err;

    ill = q->q_ptr;
    ASSERT(ill != NULL);

    /*
     * We should get back from ARP a packet chain that looks like:
     * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
     */
    if (!(area_mp = mp->b_cont) ||
        (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) ||
        !(orig_ioc_mp = area_mp->b_cont) ||
        !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) {
        freemsg(mp);
        return;
    }

    orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr;

    tmp = (orig_ioc_mp->b_cont)->b_cont;
    if ((orig_iocp->ioc_cmd == SIOCGXARP) ||
        (orig_iocp->ioc_cmd == SIOCSXARP) ||
        (orig_iocp->ioc_cmd == SIOCDXARP)) {
        x_arp_ioctl = B_TRUE;
        xar = (struct xarpreq *)tmp->b_rptr;
        sin = (sin_t *)&xar->xarp_pa;
        flagsp = &xar->xarp_flags;
        storage = xar->xarp_ha.sdl_data;
        if (xar->xarp_ha.sdl_nlen != 0)
            ifx_arp_ioctl = B_TRUE;
    } else {
        ar = (struct arpreq *)tmp->b_rptr;
        sin = (sin_t *)&ar->arp_pa;
        flagsp = &ar->arp_flags;
        storage = ar->arp_ha.sa_data;
    }

    iocp = (struct iocblk *)mp->b_rptr;

    /*
     * Pick out the originating queue based on the ioc_id.
     */
    ioc_id = iocp->ioc_id;
    pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
    if (pending_mp == NULL) {
        ASSERT(connp == NULL);
        inet_freemsg(mp);
        return;
    }
    ASSERT(connp != NULL);
    q = CONNP_TO_WQ(connp);

    /* Uncouple the internally generated IOCTL from the original one */
    area = (area_t *)area_mp->b_rptr;
    area_mp->b_cont = NULL;

    /*
     * Restore the b_next and b_prev used by mi code. This is needed
     * to complete the ioctl using mi* functions. We stored them in
     * the pending mp prior to sending the request to ARP.
     */
    orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
    orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
    inet_freemsg(pending_mp);

    /*
     * We're done if there was an error or if this is not an SIOCG{X}ARP.
     * Catch the case where there is an IRE_CACHE but no entry in the
     * arp table.
     */
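    /*
     * (Illustrative note: a neighbor can have a live IRE_CACHE entry
     * after its ARP entry has timed out. The AR_ENTRY_SQUERY handling
     * below completes the SIOCG{X}ARP from the cached IRE in that case
     * instead of failing the ioctl outright.)
     */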
    addr = sin->sin_addr.s_addr;
    if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) {
        ire_t *ire;
        dl_unitdata_req_t *dlup;
        mblk_t *llmp;
        int addr_len;
        ill_t *ipsqill = NULL;

        if (ifx_arp_ioctl) {
            /*
             * There's no need to lookup the ill, since
             * we've already done that when we started
             * processing the ioctl and sent the message
             * to ARP on that ill. So use the ill that
             * is stored in q->q_ptr.
             */
            ipsqill = ill;
            ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
                ipsqill->ill_ipif, ALL_ZONES,
                NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);
        } else {
            ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
                NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
            if (ire != NULL)
                ipsqill = ire_to_ill(ire);
        }

        if ((x_arp_ioctl) && (ipsqill != NULL))
            storage += ill_xarp_info(&xar->xarp_ha, ipsqill);

        if (ire != NULL) {
            /*
             * Since the ire obtained from the cache table is
             * used for mac addr copying below, treat an
             * incomplete ire as if we never found it.
             */
            if (ire->ire_nce != NULL &&
                ire->ire_nce->nce_state != ND_REACHABLE) {
                ire_refrele(ire);
                ire = NULL;
                ipsqill = NULL;
                goto errack;
            }
            *flagsp = ATF_INUSE;
            llmp = (ire->ire_nce != NULL ?
                ire->ire_nce->nce_res_mp : NULL);
            if (llmp != NULL && ipsqill != NULL) {
                uchar_t *macaddr;

                addr_len = ipsqill->ill_phys_addr_length;
                if (x_arp_ioctl && ((addr_len +
                    ipsqill->ill_name_length) >
                    sizeof (xar->xarp_ha.sdl_data))) {
                    ire_refrele(ire);
                    freemsg(mp);
                    ip_ioctl_finish(q, orig_ioc_mp,
                        EINVAL, NO_COPYOUT, NULL, NULL);
                    return;
                }
                *flagsp |= ATF_COM;
                dlup = (dl_unitdata_req_t *)llmp->b_rptr;
                if (ipsqill->ill_sap_length < 0)
                    macaddr = llmp->b_rptr +
                        dlup->dl_dest_addr_offset;
                else
                    macaddr = llmp->b_rptr +
                        dlup->dl_dest_addr_offset +
                        ipsqill->ill_sap_length;
                /*
                 * For SIOCGARP, MAC address length
                 * validation has already been done
                 * before the ioctl was issued to ARP to
                 * allow it to progress only on 6 byte
                 * addressable (ethernet like) media. Thus
                 * the mac address copying can not overwrite
                 * the sa_data area below.
                 */
                bcopy(macaddr, storage, addr_len);
            }
            /* Ditch the internal IOCTL. */
            freemsg(mp);
            ire_refrele(ire);
            ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL);
            return;
        }
    }

    /*
     * Delete the corresponding IRE_CACHE if any.
     * Reset the error if there was one (in case there was no entry
     * in arp).
     */
    if (iocp->ioc_cmd == AR_ENTRY_DELETE) {
        ipif_t *ipintf = NULL;

        if (ifx_arp_ioctl) {
            /*
             * There's no need to lookup the ill, since
             * we've already done that when we started
             * processing the ioctl and sent the message
             * to ARP on that ill. So use the ill that
             * is stored in q->q_ptr.
             */
            ipintf = ill->ill_ipif;
        }
        if (ip_ire_clookup_and_delete(addr, ipintf)) {
            /*
             * The address in "addr" may be an entry for a
             * router. If that's true, then any off-net
             * IRE_CACHE entries that go through the router
             * with address "addr" must be clobbered. Use
             * ire_walk to achieve this goal.
             */
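            /*
             * Example (illustrative addresses): if 10.0.0.1 is a
             * router whose ARP entry was just deleted, an off-net
             * IRE_CACHE for, say, 192.168.5.7 that uses 10.0.0.1 as
             * its gateway must go too, which is what
             * ire_delete_cache_gw() does for each IRE the walk
             * visits.
             */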
            if (ifx_arp_ioctl)
                ire_walk_ill_v4(MATCH_IRE_ILL, 0,
                    ire_delete_cache_gw, (char *)&addr, ill);
            else
                ire_walk_v4(ire_delete_cache_gw, (char *)&addr,
                    ALL_ZONES);
            iocp->ioc_error = 0;
        }
    }
errack:
    if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
        err = iocp->ioc_error;
        freemsg(mp);
        ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL, NULL);
        return;
    }

    /*
     * Completion of an SIOCG{X}ARP. Translate the information from
     * the area_t into the struct {x}arpreq.
     */
    if (x_arp_ioctl) {
        storage += ill_xarp_info(&xar->xarp_ha, ill);
        if ((ill->ill_phys_addr_length + ill->ill_name_length) >
            sizeof (xar->xarp_ha.sdl_data)) {
            freemsg(mp);
            ip_ioctl_finish(q, orig_ioc_mp, EINVAL,
                NO_COPYOUT, NULL, NULL);
            return;
        }
    }
    *flagsp = ATF_INUSE;
    if (area->area_flags & ACE_F_PERMANENT)
        *flagsp |= ATF_PERM;
    if (area->area_flags & ACE_F_PUBLISH)
        *flagsp |= ATF_PUBL;
    if (area->area_flags & ACE_F_AUTHORITY)
        *flagsp |= ATF_AUTHORITY;
    if (area->area_hw_addr_length != 0) {
        *flagsp |= ATF_COM;
        /*
         * For SIOCGARP, MAC address length validation has
         * already been done before the ioctl was issued to ARP
         * to allow it to progress only on 6 byte addressable
         * (ethernet like) media. Thus the mac address copying
         * can not overwrite the sa_data area below.
         */
        bcopy((char *)area + area->area_hw_addr_offset,
            storage, area->area_hw_addr_length);
    }

    /* Ditch the internal IOCTL. */
    freemsg(mp);
    /* Complete the original. */
    ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL);
}

/*
 * Create a new logical interface. If ipif_id is zero (i.e. not a logical
 * interface) create the next available logical interface for this
 * physical interface.
 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
 * ipif with the specified name.
 *
 * If the address family is not AF_UNSPEC then set the address as well.
 *
 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
 *
 * Executed as a writer on the ill or ill group.
 * So no lock is needed to traverse the ipif chain, or examine the
 * phyint flags.
 */
/* ARGSUSED */
int
ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
{
    mblk_t *mp1;
    struct lifreq *lifr;
    boolean_t isv6;
    boolean_t exists;
    char *name;
    char *endp;
    char *cp;
    int namelen;
    ipif_t *ipif;
    long id;
    ipsq_t *ipsq;
    ill_t *ill;
    sin_t *sin;
    int err = 0;
    boolean_t found_sep = B_FALSE;
    conn_t *connp;
    zoneid_t zoneid;
    int orig_ifindex = 0;

    ip1dbg(("ip_sioctl_addif\n"));
    /* Existence of mp1 has been checked in ip_wput_nondata */
    mp1 = mp->b_cont->b_cont;
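    /*
     * Illustrative sketch (not part of this file) of the userland
     * request that arrives here; the interface name is hypothetical:
     *
     *	struct lifreq lifr;
     *	int s = socket(AF_INET, SOCK_DGRAM, 0);
     *
     *	(void) memset(&lifr, 0, sizeof (lifr));
     *	(void) strlcpy(lifr.lifr_name, "hme0",
     *	    sizeof (lifr.lifr_name));
     *	lifr.lifr_addr.ss_family = AF_UNSPEC;
     *	if (ioctl(s, SIOCLIFADDIF, &lifr) == 0) {
     *		... lifr.lifr_name now reads back as, e.g., "hme0:1",
     *		as the code below fills in the created name ...
     *	}
     */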
    /*
     * Null-terminate the string to protect against buffer
     * overrun. The string was generated by user code and may not
     * be trusted.
     */
    lifr = (struct lifreq *)mp1->b_rptr;
    lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
    name = lifr->lifr_name;
    ASSERT(CONN_Q(q));
    connp = Q_TO_CONN(q);
    isv6 = connp->conn_af_isv6;
    zoneid = connp->conn_zoneid;
    namelen = mi_strlen(name);
    if (namelen == 0)
        return (EINVAL);

    exists = B_FALSE;
    if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
        (mi_strcmp(name, ipif_loopback_name) == 0)) {
        /*
         * Allow creating lo0 using SIOCLIFADDIF.
         * There can't be any other writer thread, so we can pass
         * null below for the last 4 args to ipif_lookup_on_name.
         */
        ipif = ipif_lookup_on_name(lifr->lifr_name, namelen,
            B_TRUE, &exists, isv6, zoneid, NULL, NULL, NULL, NULL);
        /* Prevent any further action */
        if (ipif == NULL) {
            return (ENOBUFS);
        } else if (!exists) {
            /* We created the ipif now and as writer */
            ipif_refrele(ipif);
            return (0);
        } else {
            ill = ipif->ipif_ill;
            ill_refhold(ill);
            ipif_refrele(ipif);
        }
    } else {
        /* Look for a colon in the name. */
        endp = &name[namelen];
        for (cp = endp; --cp > name; ) {
            if (*cp == IPIF_SEPARATOR_CHAR) {
                found_sep = B_TRUE;
                /*
                 * Reject any non-decimal aliases for plumbing
                 * of logical interfaces. Aliases with leading
                 * zeroes are also rejected as they introduce
                 * ambiguity in the naming of the interfaces.
                 * Comparing with "0" takes care of all such
                 * cases.
                 */
                if ((strncmp("0", cp+1, 1)) == 0)
                    return (EINVAL);

                if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
                    id <= 0 || *endp != '\0') {
                    return (EINVAL);
                }
                *cp = '\0';
                break;
            }
        }
        ill = ill_lookup_on_name(name, B_FALSE, isv6,
            CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL);
        if (found_sep)
            *cp = IPIF_SEPARATOR_CHAR;
        if (ill == NULL)
            return (err);
    }

    ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
        B_TRUE);

    /*
     * Release the refhold due to the lookup, now that we are excl
     * or we are just returning
     */
    ill_refrele(ill);

    if (ipsq == NULL)
        return (EINPROGRESS);

    /*
     * If the interface is failed, inactive or offlined, look for a working
     * interface in the ill group and create the ipif there. If we can't
     * find a good interface, create the ipif anyway so that in.mpathd can
     * move it to the first repaired interface.
     */
    if ((ill->ill_phyint->phyint_flags &
        (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
        ill->ill_phyint->phyint_groupname_len != 0) {
        phyint_t *phyi;
        char *groupname = ill->ill_phyint->phyint_groupname;

        /*
         * We're looking for a working interface, but it doesn't matter
         * if it's up or down; so instead of following the group lists,
         * we look at each physical interface and compare the groupname.
         * We're only interested in interfaces with IPv4 (resp. IPv6)
         * plumbed when we're adding an IPv4 (resp. IPv6) ipif.
         * Otherwise we create the ipif on the failed interface.
10332 */ 10333 rw_enter(&ill_g_lock, RW_READER); 10334 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 10335 for (; phyi != NULL; 10336 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 10337 phyi, AVL_AFTER)) { 10338 if (phyi->phyint_groupname_len == 0) 10339 continue; 10340 ASSERT(phyi->phyint_groupname != NULL); 10341 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10342 !(phyi->phyint_flags & 10343 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10344 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10345 (phyi->phyint_illv4 != NULL))) { 10346 break; 10347 } 10348 } 10349 rw_exit(&ill_g_lock); 10350 10351 if (phyi != NULL) { 10352 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10353 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10354 phyi->phyint_illv4); 10355 } 10356 } 10357 10358 /* 10359 * We are now exclusive on the ipsq, so an ill move will be serialized 10360 * before or after us. 10361 */ 10362 ASSERT(IAM_WRITER_ILL(ill)); 10363 ASSERT(ill->ill_move_in_progress == B_FALSE); 10364 10365 if (found_sep && orig_ifindex == 0) { 10366 /* Now see if there is an IPIF with this unit number. */ 10367 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 10368 if (ipif->ipif_id == id) { 10369 err = EEXIST; 10370 goto done; 10371 } 10372 } 10373 } 10374 10375 /* 10376 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10377 * of lo0. We never come here when we plumb lo0:0. It 10378 * happens in ipif_lookup_on_name. 10379 * The specified unit number is ignored when we create the ipif on a 10380 * different interface. However, we save it in ipif_orig_ipifid below so 10381 * that the ipif fails back to the right position. 10382 */ 10383 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 10384 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10385 err = ENOBUFS; 10386 goto done; 10387 } 10388 10389 /* Return created name with ioctl */ 10390 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10391 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10392 ip1dbg(("created %s\n", lifr->lifr_name)); 10393 10394 /* Set address */ 10395 sin = (sin_t *)&lifr->lifr_addr; 10396 if (sin->sin_family != AF_UNSPEC) { 10397 err = ip_sioctl_addr(ipif, sin, q, mp, 10398 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10399 } 10400 10401 /* Set ifindex and unit number for failback */ 10402 if (err == 0 && orig_ifindex != 0) { 10403 ipif->ipif_orig_ifindex = orig_ifindex; 10404 if (found_sep) { 10405 ipif->ipif_orig_ipifid = id; 10406 } 10407 } 10408 10409 done: 10410 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10411 return (err); 10412 } 10413 10414 /* 10415 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10416 * interface) delete it based on the IP address (on this physical interface). 10417 * Otherwise delete it based on the ipif_id. 10418 * Also, special handling to allow a removeif of lo0. 10419 */ 10420 /* ARGSUSED */ 10421 int 10422 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10423 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10424 { 10425 conn_t *connp; 10426 ill_t *ill = ipif->ipif_ill; 10427 boolean_t success; 10428 10429 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10430 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10431 ASSERT(IAM_WRITER_IPIF(ipif)); 10432 10433 connp = Q_TO_CONN(q); 10434 /* 10435 * Special case for unplumbing lo0 (the loopback physical interface). 10436 * If unplumbing lo0, the incoming address structure has been 10437 * initialized to all zeros. 
When unplumbing lo0, all its logical
     * interfaces must be removed too.
     *
     * Note that this interface may be called to remove a specific
     * loopback logical interface (eg, lo0:1). But in that case
     * ipif->ipif_id != 0, so the code path for that case is the
     * same as for any other interface (meaning it skips the code directly
     * below).
     */
    if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
        if (sin->sin_family == AF_UNSPEC &&
            (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
            /*
             * Mark it condemned. No new ref. will be made to ill.
             */
            mutex_enter(&ill->ill_lock);
            ill->ill_state_flags |= ILL_CONDEMNED;
            for (ipif = ill->ill_ipif; ipif != NULL;
                ipif = ipif->ipif_next) {
                ipif->ipif_state_flags |= IPIF_CONDEMNED;
            }
            mutex_exit(&ill->ill_lock);

            ipif = ill->ill_ipif;
            /* unplumb the loopback interface */
            ill_delete(ill);
            mutex_enter(&connp->conn_lock);
            mutex_enter(&ill->ill_lock);
            ASSERT(ill->ill_group == NULL);

            /* Are any references to this ill active */
            if (ill_is_quiescent(ill)) {
                mutex_exit(&ill->ill_lock);
                mutex_exit(&connp->conn_lock);
                ill_delete_tail(ill);
                mi_free(ill);
                return (0);
            }
            success = ipsq_pending_mp_add(connp, ipif,
                CONNP_TO_WQ(connp), mp, ILL_FREE);
            mutex_exit(&connp->conn_lock);
            mutex_exit(&ill->ill_lock);
            if (success)
                return (EINPROGRESS);
            else
                return (EINTR);
        }
    }

    /*
     * We are exclusive on the ipsq, so an ill move will be serialized
     * before or after us.
     */
    ASSERT(ill->ill_move_in_progress == B_FALSE);

    if (ipif->ipif_id == 0) {
        /* Find based on address */
        if (ipif->ipif_isv6) {
            sin6_t *sin6;

            if (sin->sin_family != AF_INET6)
                return (EAFNOSUPPORT);

            sin6 = (sin6_t *)sin;
            /* We are a writer, so we should be able to lookup */
            ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
                ill, ALL_ZONES, NULL, NULL, NULL, NULL);
            if (ipif == NULL) {
                /*
                 * Maybe the address is on another interface in
                 * the same IPMP group? We check this below.
                 */
                ipif = ipif_lookup_addr_v6(&sin6->sin6_addr,
                    NULL, ALL_ZONES, NULL, NULL, NULL, NULL);
            }
        } else {
            ipaddr_t addr;

            if (sin->sin_family != AF_INET)
                return (EAFNOSUPPORT);

            addr = sin->sin_addr.s_addr;
            /* We are a writer, so we should be able to lookup */
            ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL,
                NULL, NULL, NULL);
            if (ipif == NULL) {
                /*
                 * Maybe the address is on another interface in
                 * the same IPMP group? We check this below.
                 */
                ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES,
                    NULL, NULL, NULL, NULL);
            }
        }
        if (ipif == NULL) {
            return (EADDRNOTAVAIL);
        }
        /*
         * When the address to be removed is hosted on a different
         * interface, we check if the interface is in the same IPMP
         * group as the specified one; if so we proceed with the
         * removal.
         * ill->ill_group is NULL when the ill is down, so we have to
         * compare the group names instead.
         */
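        /*
         * Example (illustrative names): with hme0 and hme1 both in
         * IPMP group "mpgroup", a SIOCLIFREMOVEIF naming hme0 but
         * carrying an address hosted on hme1:2 still removes hme1:2,
         * because the group names match.
         */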
        if (ipif->ipif_ill != ill &&
            (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 ||
            ill->ill_phyint->phyint_groupname_len == 0 ||
            mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname,
            ill->ill_phyint->phyint_groupname) != 0)) {
            ipif_refrele(ipif);
            return (EADDRNOTAVAIL);
        }

        /* This is a writer */
        ipif_refrele(ipif);
    }

    /*
     * Cannot delete instance zero since it is tied to the ill.
     */
    if (ipif->ipif_id == 0)
        return (EBUSY);

    mutex_enter(&ill->ill_lock);
    ipif->ipif_state_flags |= IPIF_CONDEMNED;
    mutex_exit(&ill->ill_lock);

    ipif_free(ipif);

    mutex_enter(&connp->conn_lock);
    mutex_enter(&ill->ill_lock);

    /* Are any references to this ipif active */
    if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) {
        mutex_exit(&ill->ill_lock);
        mutex_exit(&connp->conn_lock);
        ipif_non_duplicate(ipif);
        ipif_down_tail(ipif);
        ipif_free_tail(ipif);
        return (0);
    }
    success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
        IPIF_FREE);
    mutex_exit(&ill->ill_lock);
    mutex_exit(&connp->conn_lock);
    if (success)
        return (EINPROGRESS);
    else
        return (EINTR);
}

/*
 * Restart the removeif ioctl. The refcnt has gone down to 0.
 * The ipif is already condemned, so it can't be found through lookups.
 */
/* ARGSUSED */
int
ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
{
    ill_t *ill;

    ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
        ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
    if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
        ill = ipif->ipif_ill;
        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) &&
            (ill->ill_state_flags & ILL_CONDEMNED));
        ill_delete_tail(ill);
        mi_free(ill);
        return (0);
    }

    ill = ipif->ipif_ill;
    ASSERT(IAM_WRITER_IPIF(ipif));
    ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);

    ipif_non_duplicate(ipif);
    ipif_down_tail(ipif);
    ipif_free_tail(ipif);

    ILL_UNMARK_CHANGING(ill);
    return (0);
}

/*
 * Set the local interface address.
 * Allow an address of all zero when the interface is down.
 */
/* ARGSUSED */
int
ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
{
    int err = 0;
    in6_addr_t v6addr;
    boolean_t need_up = B_FALSE;

    ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
        ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

    ASSERT(IAM_WRITER_IPIF(ipif));

    if (ipif->ipif_isv6) {
        sin6_t *sin6;
        ill_t *ill;
        phyint_t *phyi;

        if (sin->sin_family != AF_INET6)
            return (EAFNOSUPPORT);

        sin6 = (sin6_t *)sin;
        v6addr = sin6->sin6_addr;
        ill = ipif->ipif_ill;
        phyi = ill->ill_phyint;

        /*
         * Enforce that true multicast interfaces have a link-local
         * address for logical unit 0.
10658 */ 10659 if (ipif->ipif_id == 0 && 10660 (ill->ill_flags & ILLF_MULTICAST) && 10661 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 10662 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 10663 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 10664 return (EADDRNOTAVAIL); 10665 } 10666 10667 /* 10668 * up interfaces shouldn't have the unspecified address 10669 * unless they also have the IPIF_NOLOCAL flags set and 10670 * have a subnet assigned. 10671 */ 10672 if ((ipif->ipif_flags & IPIF_UP) && 10673 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 10674 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 10675 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 10676 return (EADDRNOTAVAIL); 10677 } 10678 10679 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10680 return (EADDRNOTAVAIL); 10681 } else { 10682 ipaddr_t addr; 10683 10684 if (sin->sin_family != AF_INET) 10685 return (EAFNOSUPPORT); 10686 10687 addr = sin->sin_addr.s_addr; 10688 10689 /* Allow 0 as the local address. */ 10690 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10691 return (EADDRNOTAVAIL); 10692 10693 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10694 } 10695 10696 10697 /* 10698 * Even if there is no change we redo things just to rerun 10699 * ipif_set_default. 10700 */ 10701 if (ipif->ipif_flags & IPIF_UP) { 10702 /* 10703 * Setting a new local address, make sure 10704 * we have net and subnet bcast ire's for 10705 * the old address if we need them. 10706 */ 10707 if (!ipif->ipif_isv6) 10708 ipif_check_bcast_ires(ipif); 10709 /* 10710 * If the interface is already marked up, 10711 * we call ipif_down which will take care 10712 * of ditching any IREs that have been set 10713 * up based on the old interface address. 10714 */ 10715 err = ipif_logical_down(ipif, q, mp); 10716 if (err == EINPROGRESS) 10717 return (err); 10718 ipif_down_tail(ipif); 10719 need_up = 1; 10720 } 10721 10722 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 10723 return (err); 10724 } 10725 10726 int 10727 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10728 boolean_t need_up) 10729 { 10730 in6_addr_t v6addr; 10731 ipaddr_t addr; 10732 sin6_t *sin6; 10733 int err = 0; 10734 ill_t *ill = ipif->ipif_ill; 10735 boolean_t need_dl_down; 10736 boolean_t need_arp_down; 10737 10738 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 10739 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10740 ASSERT(IAM_WRITER_IPIF(ipif)); 10741 10742 /* Must cancel any pending timer before taking the ill_lock */ 10743 if (ipif->ipif_recovery_id != 0) 10744 (void) untimeout(ipif->ipif_recovery_id); 10745 ipif->ipif_recovery_id = 0; 10746 10747 if (ipif->ipif_isv6) { 10748 sin6 = (sin6_t *)sin; 10749 v6addr = sin6->sin6_addr; 10750 } else { 10751 addr = sin->sin_addr.s_addr; 10752 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10753 } 10754 mutex_enter(&ill->ill_lock); 10755 ipif->ipif_v6lcl_addr = v6addr; 10756 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 10757 ipif->ipif_v6src_addr = ipv6_all_zeros; 10758 } else { 10759 ipif->ipif_v6src_addr = v6addr; 10760 } 10761 ipif->ipif_addr_ready = 0; 10762 10763 /* 10764 * If the interface was previously marked as a duplicate, then since 10765 * we've now got a "new" address, it should no longer be considered a 10766 * duplicate -- even if the "new" address is the same as the old one. 10767 * Note that if all ipifs are down, we may have a pending ARP down 10768 * event to handle. This is because we want to recover from duplicates 10769 * and thus delay tearing down ARP until the duplicates have been 10770 * removed or disabled. 
10771 */ 10772 need_dl_down = need_arp_down = B_FALSE; 10773 if (ipif->ipif_flags & IPIF_DUPLICATE) { 10774 need_arp_down = !need_up; 10775 ipif->ipif_flags &= ~IPIF_DUPLICATE; 10776 if (--ill->ill_ipif_dup_count == 0 && !need_up && 10777 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 10778 need_dl_down = B_TRUE; 10779 } 10780 } 10781 10782 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 10783 !ill->ill_is_6to4tun) { 10784 queue_t *wqp = ill->ill_wq; 10785 10786 /* 10787 * The local address of this interface is a 6to4 address, 10788 * check if this interface is in fact a 6to4 tunnel or just 10789 * an interface configured with a 6to4 address. We are only 10790 * interested in the former. 10791 */ 10792 if (wqp != NULL) { 10793 while ((wqp->q_next != NULL) && 10794 (wqp->q_next->q_qinfo != NULL) && 10795 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 10796 10797 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 10798 == TUN6TO4_MODID) { 10799 /* set for use in IP */ 10800 ill->ill_is_6to4tun = 1; 10801 break; 10802 } 10803 wqp = wqp->q_next; 10804 } 10805 } 10806 } 10807 10808 ipif_set_default(ipif); 10809 mutex_exit(&ill->ill_lock); 10810 10811 if (need_up) { 10812 /* 10813 * Now bring the interface back up. If this 10814 * is the only IPIF for the ILL, ipif_up 10815 * will have to re-bind to the device, so 10816 * we may get back EINPROGRESS, in which 10817 * case, this IOCTL will get completed in 10818 * ip_rput_dlpi when we see the DL_BIND_ACK. 10819 */ 10820 err = ipif_up(ipif, q, mp); 10821 } else { 10822 /* 10823 * Update the IPIF list in SCTP, ipif_up_done() will do it 10824 * if need_up is true. 10825 */ 10826 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10827 } 10828 10829 if (need_dl_down) 10830 ill_dl_down(ill); 10831 if (need_arp_down) 10832 ipif_arp_down(ipif); 10833 10834 return (err); 10835 } 10836 10837 10838 /* 10839 * Restart entry point to restart the address set operation after the 10840 * refcounts have dropped to zero. 10841 */ 10842 /* ARGSUSED */ 10843 int 10844 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10845 ip_ioctl_cmd_t *ipip, void *ifreq) 10846 { 10847 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 10848 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10849 ASSERT(IAM_WRITER_IPIF(ipif)); 10850 ipif_down_tail(ipif); 10851 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 10852 } 10853 10854 /* ARGSUSED */ 10855 int 10856 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10857 ip_ioctl_cmd_t *ipip, void *if_req) 10858 { 10859 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 10860 struct lifreq *lifr = (struct lifreq *)if_req; 10861 10862 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 10863 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10864 /* 10865 * The net mask and address can't change since we have a 10866 * reference to the ipif. So no lock is necessary. 10867 */ 10868 if (ipif->ipif_isv6) { 10869 *sin6 = sin6_null; 10870 sin6->sin6_family = AF_INET6; 10871 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 10872 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10873 lifr->lifr_addrlen = 10874 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10875 } else { 10876 *sin = sin_null; 10877 sin->sin_family = AF_INET; 10878 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 10879 if (ipip->ipi_cmd_type == LIF_CMD) { 10880 lifr->lifr_addrlen = 10881 ip_mask_to_plen(ipif->ipif_net_mask); 10882 } 10883 } 10884 return (0); 10885 } 10886 10887 /* 10888 * Set the destination address for a pt-pt interface. 
10889 */ 10890 /* ARGSUSED */ 10891 int 10892 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10893 ip_ioctl_cmd_t *ipip, void *if_req) 10894 { 10895 int err = 0; 10896 in6_addr_t v6addr; 10897 boolean_t need_up = B_FALSE; 10898 10899 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 10900 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10901 ASSERT(IAM_WRITER_IPIF(ipif)); 10902 10903 if (ipif->ipif_isv6) { 10904 sin6_t *sin6; 10905 10906 if (sin->sin_family != AF_INET6) 10907 return (EAFNOSUPPORT); 10908 10909 sin6 = (sin6_t *)sin; 10910 v6addr = sin6->sin6_addr; 10911 10912 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10913 return (EADDRNOTAVAIL); 10914 } else { 10915 ipaddr_t addr; 10916 10917 if (sin->sin_family != AF_INET) 10918 return (EAFNOSUPPORT); 10919 10920 addr = sin->sin_addr.s_addr; 10921 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10922 return (EADDRNOTAVAIL); 10923 10924 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10925 } 10926 10927 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 10928 return (0); /* No change */ 10929 10930 if (ipif->ipif_flags & IPIF_UP) { 10931 /* 10932 * If the interface is already marked up, 10933 * we call ipif_down which will take care 10934 * of ditching any IREs that have been set 10935 * up based on the old pp dst address. 10936 */ 10937 err = ipif_logical_down(ipif, q, mp); 10938 if (err == EINPROGRESS) 10939 return (err); 10940 ipif_down_tail(ipif); 10941 need_up = B_TRUE; 10942 } 10943 /* 10944 * could return EINPROGRESS. If so ioctl will complete in 10945 * ip_rput_dlpi_writer 10946 */ 10947 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 10948 return (err); 10949 } 10950 10951 static int 10952 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10953 boolean_t need_up) 10954 { 10955 in6_addr_t v6addr; 10956 ill_t *ill = ipif->ipif_ill; 10957 int err = 0; 10958 boolean_t need_dl_down; 10959 boolean_t need_arp_down; 10960 10961 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 10962 ipif->ipif_id, (void *)ipif)); 10963 10964 /* Must cancel any pending timer before taking the ill_lock */ 10965 if (ipif->ipif_recovery_id != 0) 10966 (void) untimeout(ipif->ipif_recovery_id); 10967 ipif->ipif_recovery_id = 0; 10968 10969 if (ipif->ipif_isv6) { 10970 sin6_t *sin6; 10971 10972 sin6 = (sin6_t *)sin; 10973 v6addr = sin6->sin6_addr; 10974 } else { 10975 ipaddr_t addr; 10976 10977 addr = sin->sin_addr.s_addr; 10978 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10979 } 10980 mutex_enter(&ill->ill_lock); 10981 /* Set point to point destination address. */ 10982 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10983 /* 10984 * Allow this as a means of creating logical 10985 * pt-pt interfaces on top of e.g. an Ethernet. 10986 * XXX Undocumented HACK for testing. 10987 * pt-pt interfaces are created with NUD disabled. 10988 */ 10989 ipif->ipif_flags |= IPIF_POINTOPOINT; 10990 ipif->ipif_flags &= ~IPIF_BROADCAST; 10991 if (ipif->ipif_isv6) 10992 ill->ill_flags |= ILLF_NONUD; 10993 } 10994 10995 /* 10996 * If the interface was previously marked as a duplicate, then since 10997 * we've now got a "new" address, it should no longer be considered a 10998 * duplicate -- even if the "new" address is the same as the old one. 10999 * Note that if all ipifs are down, we may have a pending ARP down 11000 * event to handle. 
    need_dl_down = need_arp_down = B_FALSE;
    if (ipif->ipif_flags & IPIF_DUPLICATE) {
        need_arp_down = !need_up;
        ipif->ipif_flags &= ~IPIF_DUPLICATE;
        if (--ill->ill_ipif_dup_count == 0 && !need_up &&
            ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
            need_dl_down = B_TRUE;
        }
    }

    /* Set the new address. */
    ipif->ipif_v6pp_dst_addr = v6addr;
    /* Make sure subnet tracks pp_dst */
    ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
    mutex_exit(&ill->ill_lock);

    if (need_up) {
        /*
         * Now bring the interface back up. If this
         * is the only IPIF for the ILL, ipif_up
         * will have to re-bind to the device, so
         * we may get back EINPROGRESS, in which
         * case, this IOCTL will get completed in
         * ip_rput_dlpi when we see the DL_BIND_ACK.
         */
        err = ipif_up(ipif, q, mp);
    }

    if (need_dl_down)
        ill_dl_down(ill);

    if (need_arp_down)
        ipif_arp_down(ipif);
    return (err);
}

/*
 * Restart entry point to restart the destination address set operation
 * after the refcounts have dropped to zero.
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
    ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
        ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
    ipif_down_tail(ipif);
    return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
}

/* ARGSUSED */
int
ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
    sin6_t *sin6 = (struct sockaddr_in6 *)sin;

    ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
        ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
    /*
     * Get the point to point destination address. The addresses can't
     * change since we hold a reference to the ipif.
     */
    if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
        return (EADDRNOTAVAIL);

    if (ipif->ipif_isv6) {
        ASSERT(ipip->ipi_cmd_type == LIF_CMD);
        *sin6 = sin6_null;
        sin6->sin6_family = AF_INET6;
        sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
    } else {
        *sin = sin_null;
        sin->sin_family = AF_INET;
        sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
    }
    return (0);
}
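/*
 * Illustrative sketch (not part of this file): reading back the
 * destination address set above; the tunnel name is hypothetical.
 *
 *	struct lifreq lifr;
 *	struct sockaddr_in *sin;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "ip.tun0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFDSTADDR, &lifr) == 0) {
 *		sin = (struct sockaddr_in *)&lifr.lifr_dstaddr;
 *		... sin->sin_addr holds the peer address ...
 *	}
 */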
/*
 * TODO (IPMP): make this function return the active/inactive state so
 * that the caller can set it once atomically, instead of performing
 * multiple mutex_enter/mutex_exit pairs.
 */
/*
 * This function either sets or clears the IFF_INACTIVE flag.
 *
 * As long as there are some addresses or multicast memberships on the
 * IPv4 or IPv6 interface of the "phyi" that do not belong to it, we
 * will consider the interface to be ACTIVE (clear IFF_INACTIVE), i.e.
 * the interface will be used for outbound packets.
 *
 * The caller needs to verify the validity of setting IFF_INACTIVE.
 */
static void
phyint_inactive(phyint_t *phyi)
{
    ill_t *ill_v4;
    ill_t *ill_v6;
    ipif_t *ipif;
    ilm_t *ilm;

    ill_v4 = phyi->phyint_illv4;
    ill_v6 = phyi->phyint_illv6;

    /*
     * No need for a lock while traversing the list, since we are
     * a writer.
     */
    if (ill_v4 != NULL) {
        ASSERT(IAM_WRITER_ILL(ill_v4));
        for (ipif = ill_v4->ill_ipif; ipif != NULL;
            ipif = ipif->ipif_next) {
            if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
                mutex_enter(&phyi->phyint_lock);
                phyi->phyint_flags &= ~PHYI_INACTIVE;
                mutex_exit(&phyi->phyint_lock);
                return;
            }
        }
        for (ilm = ill_v4->ill_ilm; ilm != NULL;
            ilm = ilm->ilm_next) {
            if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
                mutex_enter(&phyi->phyint_lock);
                phyi->phyint_flags &= ~PHYI_INACTIVE;
                mutex_exit(&phyi->phyint_lock);
                return;
            }
        }
    }
    if (ill_v6 != NULL) {
        ill_v6 = phyi->phyint_illv6;
        for (ipif = ill_v6->ill_ipif; ipif != NULL;
            ipif = ipif->ipif_next) {
            if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
                mutex_enter(&phyi->phyint_lock);
                phyi->phyint_flags &= ~PHYI_INACTIVE;
                mutex_exit(&phyi->phyint_lock);
                return;
            }
        }
        for (ilm = ill_v6->ill_ilm; ilm != NULL;
            ilm = ilm->ilm_next) {
            if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
                mutex_enter(&phyi->phyint_lock);
                phyi->phyint_flags &= ~PHYI_INACTIVE;
                mutex_exit(&phyi->phyint_lock);
                return;
            }
        }
    }
    mutex_enter(&phyi->phyint_lock);
    phyi->phyint_flags |= PHYI_INACTIVE;
    mutex_exit(&phyi->phyint_lock);
}

/*
 * This function is called only when the phyint flags change. Currently
 * called from ip_sioctl_flags. We re-do the broadcast nomination so
 * that we can select a good ill.
 */
static void
ip_redo_nomination(phyint_t *phyi)
{
    ill_t *ill_v4;

    ill_v4 = phyi->phyint_illv4;

    if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
        ASSERT(IAM_WRITER_ILL(ill_v4));
        if (ill_v4->ill_group->illgrp_ill_count > 1)
            ill_nominate_bcast_rcv(ill_v4->ill_group);
    }
}

/*
 * Heuristic to check if an ill is INACTIVE.
 * Checks whether the ill has an ipif with a usable IP address.
 *
 * Return values:
 *	B_TRUE - ill is INACTIVE; has no usable ipif
 *	B_FALSE - ill is not INACTIVE; ill has at least one usable ipif
 */
static boolean_t
ill_is_inactive(ill_t *ill)
{
    ipif_t *ipif;

    /* Check whether it is in an IPMP group */
    if (ill->ill_phyint->phyint_groupname == NULL)
        return (B_FALSE);

    if (ill->ill_ipif_up_count == 0)
        return (B_TRUE);

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        uint64_t flags = ipif->ipif_flags;

        /*
         * This ipif is usable if it is IPIF_UP and not a
         * dedicated test address. A dedicated test address
         * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED
         * (note in particular that V6 test addresses are
         * link-local data addresses and thus are marked
         * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
         */
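        /*
         * Example (illustrative): an ipif marked only
         * IPIF_UP|IPIF_NOFAILOVER (a V6 test address) keeps the ill
         * active, whereas one marked
         * IPIF_UP|IPIF_NOFAILOVER|IPIF_DEPRECATED (a V4 test address)
         * does not count as usable.
         */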
        if ((flags & IPIF_UP) &&
            ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) !=
            (IPIF_DEPRECATED|IPIF_NOFAILOVER)))
            return (B_FALSE);
    }
    return (B_TRUE);
}

/*
 * Set interface flags.
 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT,
 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST,
 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE.
 *
 * NOTE : We really don't enforce that ipif_id zero should be used
 *	  for setting any flags other than IFF_LOGINT_FLAGS. This
 *	  is because applications generally do SIOCGLIFFLAGS, OR
 *	  in the new flags (that affect the logical interface) and do a
 *	  SIOCSLIFFLAGS. Thus, "flags" below could contain bits other
 *	  than IFF_LOGINT_FLAGS. One could check whether "turn_on" -
 *	  the flags that will be turned on - is correct with respect to
 *	  ipif_id 0. For backward compatibility reasons, it is not done.
 */
/* ARGSUSED */
int
ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
    uint64_t turn_on;
    uint64_t turn_off;
    int err;
    boolean_t need_up = B_FALSE;
    phyint_t *phyi;
    ill_t *ill;
    uint64_t intf_flags;
    boolean_t phyint_flags_modified = B_FALSE;
    uint64_t flags;
    struct ifreq *ifr;
    struct lifreq *lifr;
    boolean_t set_linklocal = B_FALSE;
    boolean_t zero_source = B_FALSE;

    ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
        ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

    ASSERT(IAM_WRITER_IPIF(ipif));

    ill = ipif->ipif_ill;
    phyi = ill->ill_phyint;

    if (ipip->ipi_cmd_type == IF_CMD) {
        ifr = (struct ifreq *)if_req;
        flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
    } else {
        lifr = (struct lifreq *)if_req;
        flags = lifr->lifr_flags;
    }

    intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;

    /*
     * Have the flags been set correctly up to now?
     */
    ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
    ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
    ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
    /*
     * Compare the new flags to the old, and partition
     * into those coming on and those going off.
     * For the 16 bit command keep the bits above bit 16 unchanged.
     */
    if (ipip->ipi_cmd == SIOCSIFFLAGS)
        flags |= intf_flags & ~0xFFFF;

    /*
     * First check which bits will change and then which will
     * go on and off.
     */
    turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE;
    if (!turn_on)
        return (0);	/* No change */

    turn_off = intf_flags & turn_on;
    turn_on ^= turn_off;
    err = 0;

    /*
     * Don't allow any bits belonging to the logical interface
     * to be set or cleared on the replacement ipif that was
     * created temporarily during a MOVE.
     */
    if (ipif->ipif_replace_zero &&
        ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) {
        return (EINVAL);
    }

    /*
     * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on
     * IPv6 interfaces.
     */
    if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6))
        return (EINVAL);
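    /*
     * Illustrative sketch (not part of this file) of the usual
     * read-modify-write sequence described in the NOTE above; the
     * interface name and flag are hypothetical:
     *
     *	struct lifreq lifr;
     *	int s = socket(AF_INET, SOCK_DGRAM, 0);
     *
     *	(void) memset(&lifr, 0, sizeof (lifr));
     *	(void) strlcpy(lifr.lifr_name, "hme0",
     *	    sizeof (lifr.lifr_name));
     *	(void) ioctl(s, SIOCGLIFFLAGS, &lifr);
     *	lifr.lifr_flags |= IFF_PRIVATE;
     *	(void) ioctl(s, SIOCSLIFFLAGS, &lifr);
     */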
11311 	/*
11312 	 * Don't allow the IFF_ROUTER flag to be turned on on loopback
11313 	 * interfaces. It makes no sense in that context.
11314 	 */
11315 	if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
11316 		return (EINVAL);
11317 
11318 	if (flags & (IFF_NOLOCAL|IFF_ANYCAST))
11319 		zero_source = B_TRUE;
11320 
11321 	/*
11322 	 * For IPv6 ipif_id 0, don't allow the interface to be up without
11323 	 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
11324 	 * If the link local address isn't set, and can be set, it will get
11325 	 * set later on in this function.
11326 	 */
11327 	if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
11328 	    (flags & IFF_UP) && !zero_source &&
11329 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
11330 		if (ipif_cant_setlinklocal(ipif))
11331 			return (EINVAL);
11332 		set_linklocal = B_TRUE;
11333 	}
11334 
11335 	/*
11336 	 * ILL cannot be part of a usesrc group and an IPMP group at the
11337 	 * same time. No need to grab ill_g_usesrc_lock here, see
11338 	 * synchronization notes in ip.c
11339 	 */
11340 	if (turn_on & PHYI_STANDBY &&
11341 	    ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
11342 		return (EINVAL);
11343 	}
11344 
11345 	/*
11346 	 * If we modify physical interface flags, we'll potentially need to
11347 	 * send up two routing socket messages for the changes (one for the
11348 	 * IPv4 ill, and another for the IPv6 ill). Note that here.
11349 	 */
11350 	if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
11351 		phyint_flags_modified = B_TRUE;
11352 
11353 	/*
11354 	 * If we are setting or clearing FAILED or STANDBY or OFFLINE,
11355 	 * we need to flush the IRE_CACHES belonging to this ill.
11356 	 * We handle this case here without doing the DOWN/UP dance
11357 	 * like it is done for other flags. If some other flags are
11358 	 * being turned on/off with FAILED/STANDBY/OFFLINE, the code
11359 	 * below will handle it by bringing it down and then
11360 	 * bringing it UP.
11361 	 */
11362 	if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) {
11363 		ill_t *ill_v4, *ill_v6;
11364 
11365 		ill_v4 = phyi->phyint_illv4;
11366 		ill_v6 = phyi->phyint_illv6;
11367 
11368 		/*
11369 		 * First set the INACTIVE flag if needed. Then delete the ires.
11370 		 * ire_add will atomically prevent creating new IRE_CACHEs
11371 		 * unless the hidden flag is set.
11372 		 * PHYI_FAILED and PHYI_INACTIVE are exclusive
11373 		 */
11374 		if ((turn_on & PHYI_FAILED) &&
11375 		    ((intf_flags & PHYI_STANDBY) || !ipmp_enable_failback)) {
11376 			/* Reset PHYI_INACTIVE when PHYI_FAILED is being set */
11377 			phyi->phyint_flags &= ~PHYI_INACTIVE;
11378 		}
11379 		if ((turn_off & PHYI_FAILED) &&
11380 		    ((intf_flags & PHYI_STANDBY) ||
11381 		    (!ipmp_enable_failback && ill_is_inactive(ill)))) {
11382 			phyint_inactive(phyi);
11383 		}
11384 
11385 		if (turn_on & PHYI_STANDBY) {
11386 			/*
11387 			 * We implicitly set INACTIVE only when STANDBY is set.
11388 			 * INACTIVE is also set on non-STANDBY phyint when user
11389 			 * disables FAILBACK using configuration file.
11390 			 * Do not allow STANDBY to be set on such an INACTIVE
11391 			 * phyint
11392 			 */
11393 			if (phyi->phyint_flags & PHYI_INACTIVE)
11394 				return (EINVAL);
11395 			if (!(phyi->phyint_flags & PHYI_FAILED))
11396 				phyint_inactive(phyi);
11397 		}
11398 		if (turn_off & PHYI_STANDBY) {
11399 			if (ipmp_enable_failback) {
11400 				/*
11401 				 * Reset PHYI_INACTIVE.
11402 */ 11403 phyi->phyint_flags &= ~PHYI_INACTIVE; 11404 } else if (ill_is_inactive(ill) && 11405 !(phyi->phyint_flags & PHYI_FAILED)) { 11406 /* 11407 * Need to set INACTIVE, when user sets 11408 * STANDBY on a non-STANDBY phyint and 11409 * later resets STANDBY 11410 */ 11411 phyint_inactive(phyi); 11412 } 11413 } 11414 /* 11415 * We should always send up a message so that the 11416 * daemons come to know of it. Note that the zeroth 11417 * interface can be down and the check below for IPIF_UP 11418 * will not make sense as we are actually setting 11419 * a phyint flag here. We assume that the ipif used 11420 * is always the zeroth ipif. (ip_rts_ifmsg does not 11421 * send up any message for non-zero ipifs). 11422 */ 11423 phyint_flags_modified = B_TRUE; 11424 11425 if (ill_v4 != NULL) { 11426 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11427 IRE_CACHE, ill_stq_cache_delete, 11428 (char *)ill_v4, ill_v4); 11429 illgrp_reset_schednext(ill_v4); 11430 } 11431 if (ill_v6 != NULL) { 11432 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11433 IRE_CACHE, ill_stq_cache_delete, 11434 (char *)ill_v6, ill_v6); 11435 illgrp_reset_schednext(ill_v6); 11436 } 11437 } 11438 11439 /* 11440 * If ILLF_ROUTER changes, we need to change the ip forwarding 11441 * status of the interface and, if the interface is part of an IPMP 11442 * group, all other interfaces that are part of the same IPMP 11443 * group. 11444 */ 11445 if ((turn_on | turn_off) & ILLF_ROUTER) { 11446 (void) ill_forward_set(q, mp, ((turn_on & ILLF_ROUTER) != 0), 11447 (caddr_t)ill); 11448 } 11449 11450 /* 11451 * If the interface is not UP and we are not going to 11452 * bring it UP, record the flags and return. When the 11453 * interface comes UP later, the right actions will be 11454 * taken. 11455 */ 11456 if (!(ipif->ipif_flags & IPIF_UP) && 11457 !(turn_on & IPIF_UP)) { 11458 /* Record new flags in their respective places. */ 11459 mutex_enter(&ill->ill_lock); 11460 mutex_enter(&ill->ill_phyint->phyint_lock); 11461 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11462 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11463 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11464 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11465 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11466 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11467 mutex_exit(&ill->ill_lock); 11468 mutex_exit(&ill->ill_phyint->phyint_lock); 11469 11470 /* 11471 * We do the broadcast and nomination here rather 11472 * than waiting for a FAILOVER/FAILBACK to happen. In 11473 * the case of FAILBACK from INACTIVE standby to the 11474 * interface that has been repaired, PHYI_FAILED has not 11475 * been cleared yet. If there are only two interfaces in 11476 * that group, all we have is a FAILED and INACTIVE 11477 * interface. If we do the nomination soon after a failback, 11478 * the broadcast nomination code would select the 11479 * INACTIVE interface for receiving broadcasts as FAILED is 11480 * not yet cleared. As we don't want STANDBY/INACTIVE to 11481 * receive broadcast packets, we need to redo nomination 11482 * when the FAILED is cleared here. Thus, in general we 11483 * always do the nomination here for FAILED, STANDBY 11484 * and OFFLINE. 
11485 		 */
11486 		if (((turn_on | turn_off) &
11487 		    (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) {
11488 			ip_redo_nomination(phyi);
11489 		}
11490 		if (phyint_flags_modified) {
11491 			if (phyi->phyint_illv4 != NULL) {
11492 				ip_rts_ifmsg(phyi->phyint_illv4->
11493 				    ill_ipif);
11494 			}
11495 			if (phyi->phyint_illv6 != NULL) {
11496 				ip_rts_ifmsg(phyi->phyint_illv6->
11497 				    ill_ipif);
11498 			}
11499 		}
11500 		return (0);
11501 	} else if (set_linklocal || zero_source) {
11502 		mutex_enter(&ill->ill_lock);
11503 		if (set_linklocal)
11504 			ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
11505 		if (zero_source)
11506 			ipif->ipif_state_flags |= IPIF_ZERO_SOURCE;
11507 		mutex_exit(&ill->ill_lock);
11508 	}
11509 
11510 	/*
11511 	 * Disallow IPv6 interfaces coming up that have the unspecified address,
11512 	 * or point-to-point interfaces with an unspecified destination. We do
11513 	 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
11514 	 * have a subnet assigned, which is how in.ndpd currently manages its
11515 	 * onlink prefix list when no addresses are configured with those
11516 	 * prefixes.
11517 	 */
11518 	if (ipif->ipif_isv6 &&
11519 	    ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
11520 	    (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
11521 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
11522 	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11523 	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
11524 		return (EINVAL);
11525 	}
11526 
11527 	/*
11528 	 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
11529 	 * from being brought up.
11530 	 */
11531 	if (!ipif->ipif_isv6 &&
11532 	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
11533 	    ipif->ipif_pp_dst_addr == INADDR_ANY)) {
11534 		return (EINVAL);
11535 	}
11536 
11537 	/*
11538 	 * The only flag changes that we currently take specific action on
11539 	 * are IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL,
11540 	 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and
11541 	 * IPIF_PREFERRED. This is done by bringing the ipif down, changing
11542 	 * the flags and bringing it back up again.
11543 	 */
11544 	if ((turn_on|turn_off) &
11545 	    (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
11546 	    ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) {
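		/*
		 * Illustrative summary of the down/up dance performed in
		 * this block (a sketch of the control flow, assuming no
		 * errors along the way):
		 *
		 *	ipif_check_bcast_ires()	keep bcast ires valid first
		 *	ipif_down()		may return EINPROGRESS, in
		 *				which case the ioctl resumes
		 *				in ip_sioctl_flags_restart()
		 *	ipif_down_tail()	once the ipif is quiescent
		 *	ip_sioctl_flags_tail()	flags actually changed here,
		 *				followed by ipif_up() if
		 *				need_up is recorded below
		 */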
11547 		/*
11548 		 * Taking this ipif down, make sure we have
11549 		 * valid net and subnet bcast ire's for other
11550 		 * logical interfaces, if we need them.
11551 		 */
11552 		if (!ipif->ipif_isv6)
11553 			ipif_check_bcast_ires(ipif);
11554 
11555 		if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
11556 		    !(turn_off & IPIF_UP)) {
11557 			need_up = B_TRUE;
11558 			if (ipif->ipif_flags & IPIF_UP)
11559 				ill->ill_logical_down = 1;
11560 			turn_on &= ~IPIF_UP;
11561 		}
11562 		err = ipif_down(ipif, q, mp);
11563 		ip1dbg(("ipif_down returns %d err ", err));
11564 		if (err == EINPROGRESS)
11565 			return (err);
11566 		ipif_down_tail(ipif);
11567 	}
11568 	return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up));
11569 }
11570 
11571 static int
11572 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp,
11573     boolean_t need_up)
11574 {
11575 	ill_t	*ill;
11576 	phyint_t *phyi;
11577 	uint64_t turn_on;
11578 	uint64_t turn_off;
11579 	uint64_t intf_flags;
11580 	boolean_t phyint_flags_modified = B_FALSE;
11581 	int	err = 0;
11582 	boolean_t set_linklocal = B_FALSE;
11583 	boolean_t zero_source = B_FALSE;
11584 
11585 	ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
11586 	    ipif->ipif_ill->ill_name, ipif->ipif_id));
11587 
11588 	ASSERT(IAM_WRITER_IPIF(ipif));
11589 
11590 	ill = ipif->ipif_ill;
11591 	phyi = ill->ill_phyint;
11592 
11593 	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
11594 	turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP);
11595 
11596 	turn_off = intf_flags & turn_on;
11597 	turn_on ^= turn_off;
11598 
11599 	if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))
11600 		phyint_flags_modified = B_TRUE;
11601 
11602 	/*
11603 	 * Now we change the flags. Track current value of
11604 	 * other flags in their respective places.
11605 	 */
11606 	mutex_enter(&ill->ill_lock);
11607 	mutex_enter(&phyi->phyint_lock);
11608 	ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
11609 	ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
11610 	ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
11611 	ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
11612 	phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
11613 	phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
11614 	if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
11615 		set_linklocal = B_TRUE;
11616 		ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
11617 	}
11618 	if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) {
11619 		zero_source = B_TRUE;
11620 		ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE;
11621 	}
11622 	mutex_exit(&ill->ill_lock);
11623 	mutex_exit(&phyi->phyint_lock);
11624 
11625 	if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)))
11626 		ip_redo_nomination(phyi);
11627 
11628 	if (set_linklocal)
11629 		(void) ipif_setlinklocal(ipif);
11630 
11631 	if (zero_source)
11632 		ipif->ipif_v6src_addr = ipv6_all_zeros;
11633 	else
11634 		ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
11635 
11636 	if (need_up) {
11637 		/*
11638 		 * XXX ipif_up really does not know whether any phyint flags
11639 		 * were modified or not. So, it sends up information in
11640 		 * only one routing socket message. As we don't bring up
11641 		 * the interface and also set STANDBY/FAILED simultaneously
11642 		 * it should be okay.
11643 		 */
11644 		err = ipif_up(ipif, q, mp);
11645 	} else {
11646 		/*
11647 		 * Make sure routing socket sees all changes to the flags.
11648 		 * ipif_up_done* handles this when we use ipif_up.
11649 		 */
11650 		if (phyint_flags_modified) {
11651 			if (phyi->phyint_illv4 != NULL) {
11652 				ip_rts_ifmsg(phyi->phyint_illv4->
11653 				    ill_ipif);
11654 			}
11655 			if (phyi->phyint_illv6 != NULL) {
11656 				ip_rts_ifmsg(phyi->phyint_illv6->
11657 				    ill_ipif);
11658 			}
11659 		} else {
11660 			ip_rts_ifmsg(ipif);
11661 		}
11662 	}
11663 	return (err);
11664 }
11665 
11666 /*
11667  * Restart entry point to restart the flags operation after the
11668  * refcounts have dropped to zero.
11669  */
11670 /* ARGSUSED */
11671 int
11672 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11673     ip_ioctl_cmd_t *ipip, void *if_req)
11674 {
11675 	int err;
11676 	struct ifreq *ifr = (struct ifreq *)if_req;
11677 	struct lifreq *lifr = (struct lifreq *)if_req;
11678 
11679 	ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
11680 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11681 
11682 	ipif_down_tail(ipif);
11683 	if (ipip->ipi_cmd_type == IF_CMD) {
11684 		/*
11685 		 * Since ip_sioctl_flags expects an int and ifr_flags
11686 		 * is a short we need to cast ifr_flags into an int
11687 		 * to avoid having sign extension cause bits to get
11688 		 * set that should not be.
11689 		 */
11690 		err = ip_sioctl_flags_tail(ipif,
11691 		    (uint64_t)(ifr->ifr_flags & 0x0000ffff),
11692 		    q, mp, B_TRUE);
11693 	} else {
11694 		err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags,
11695 		    q, mp, B_TRUE);
11696 	}
11697 	return (err);
11698 }
11699 
11700 /* ARGSUSED */
11701 int
11702 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11703     ip_ioctl_cmd_t *ipip, void *if_req)
11704 {
11705 	/*
11706 	 * Have the flags been set correctly till now?
11707 	 */
11708 	ill_t *ill = ipif->ipif_ill;
11709 	phyint_t *phyi = ill->ill_phyint;
11710 
11711 	ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
11712 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11713 	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
11714 	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
11715 	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
11716 
11717 	/*
11718 	 * Need a lock since some flags can be set even when there are
11719 	 * references to the ipif.
11720 	 */
11721 	mutex_enter(&ill->ill_lock);
11722 	if (ipip->ipi_cmd_type == IF_CMD) {
11723 		struct ifreq *ifr = (struct ifreq *)if_req;
11724 
11725 		/* Get interface flags (low 16 only). */
11726 		ifr->ifr_flags = ((ipif->ipif_flags |
11727 		    ill->ill_flags | phyi->phyint_flags) & 0xffff);
11728 	} else {
11729 		struct lifreq *lifr = (struct lifreq *)if_req;
11730 
11731 		/* Get interface flags. */
11732 		lifr->lifr_flags = ipif->ipif_flags |
11733 		    ill->ill_flags | phyi->phyint_flags;
11734 	}
11735 	mutex_exit(&ill->ill_lock);
11736 	return (0);
11737 }
11738 
11739 /* ARGSUSED */
11740 int
11741 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11742     ip_ioctl_cmd_t *ipip, void *if_req)
11743 {
11744 	int mtu;
11745 	int ip_min_mtu;
11746 	struct ifreq	*ifr;
11747 	struct lifreq *lifr;
11748 	ire_t	*ire;
11749 
11750 	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
11751 	    ipif->ipif_id, (void *)ipif));
11752 	if (ipip->ipi_cmd_type == IF_CMD) {
11753 		ifr = (struct ifreq *)if_req;
11754 		mtu = ifr->ifr_metric;
11755 	} else {
11756 		lifr = (struct lifreq *)if_req;
11757 		mtu = lifr->lifr_mtu;
11758 	}
11759 
11760 	if (ipif->ipif_isv6)
11761 		ip_min_mtu = IPV6_MIN_MTU;
11762 	else
11763 		ip_min_mtu = IP_MIN_MTU;
11764 
11765 	if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu)
11766 		return (EINVAL);
11767 
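	/*
	 * Concrete bounds for the check above (illustrative, assuming the
	 * usual values IP_MIN_MTU 68 and IPV6_MIN_MTU 1280 and a typical
	 * 1500-byte Ethernet ill_max_frag): a v4 ipif accepts
	 * 68 <= mtu <= 1500, a v6 ipif accepts 1280 <= mtu <= 1500;
	 * anything outside those ranges draws EINVAL above.
	 */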
11768 	/*
11769 	 * Change the MTU size in all relevant ire's.
11770 	 * MTU change vs. new ire creation - protocol below.
11771 	 * First change ipif_mtu and the ire_max_frag of the
11772 	 * interface ire. Then do an ire walk and change the
11773 	 * ire_max_frag of all affected ires. During ire_add
11774 	 * under the bucket lock, set the ire_max_frag of the
11775 	 * new ire being created from the ipif/ire from which
11776 	 * it is being derived. If an mtu change happens after
11777 	 * the ire is added, the new ire will be cleaned up.
11778 	 * Conversely if the mtu change happens before the ire
11779 	 * is added, ire_add will see the new value of the mtu.
11780 	 */
11781 	ipif->ipif_mtu = mtu;
11782 	ipif->ipif_flags |= IPIF_FIXEDMTU;
11783 
11784 	if (ipif->ipif_isv6)
11785 		ire = ipif_to_ire_v6(ipif);
11786 	else
11787 		ire = ipif_to_ire(ipif);
11788 	if (ire != NULL) {
11789 		ire->ire_max_frag = ipif->ipif_mtu;
11790 		ire_refrele(ire);
11791 	}
11792 	if (ipif->ipif_flags & IPIF_UP) {
11793 		if (ipif->ipif_isv6)
11794 			ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES);
11795 		else
11796 			ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES);
11797 	}
11798 	/* Update the MTU in SCTP's list */
11799 	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
11800 	return (0);
11801 }
11802 
11803 /* Get interface MTU. */
11804 /* ARGSUSED */
11805 int
11806 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11807     ip_ioctl_cmd_t *ipip, void *if_req)
11808 {
11809 	struct ifreq	*ifr;
11810 	struct lifreq	*lifr;
11811 
11812 	ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
11813 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11814 	if (ipip->ipi_cmd_type == IF_CMD) {
11815 		ifr = (struct ifreq *)if_req;
11816 		ifr->ifr_metric = ipif->ipif_mtu;
11817 	} else {
11818 		lifr = (struct lifreq *)if_req;
11819 		lifr->lifr_mtu = ipif->ipif_mtu;
11820 	}
11821 	return (0);
11822 }
11823 
11824 /* Set interface broadcast address. */
11825 /* ARGSUSED2 */
11826 int
11827 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11828     ip_ioctl_cmd_t *ipip, void *if_req)
11829 {
11830 	ipaddr_t addr;
11831 	ire_t	*ire;
11832 
11833 	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
11834 	    ipif->ipif_id));
11835 
11836 	ASSERT(IAM_WRITER_IPIF(ipif));
11837 	if (!(ipif->ipif_flags & IPIF_BROADCAST))
11838 		return (EADDRNOTAVAIL);
11839 
11840 	ASSERT(!(ipif->ipif_isv6));	/* No IPv6 broadcast */
11841 
11842 	if (sin->sin_family != AF_INET)
11843 		return (EAFNOSUPPORT);
11844 
11845 	addr = sin->sin_addr.s_addr;
11846 	if (ipif->ipif_flags & IPIF_UP) {
11847 		/*
11848 		 * If we are already up, make sure the new
11849 		 * broadcast address makes sense. If it does,
11850 		 * there should be an IRE for it already.
11851 		 * Don't match on ipif, only on the ill
11852 		 * since we are sharing these now. Don't use
11853 		 * MATCH_IRE_ILL_GROUP as we are looking for
11854 		 * the broadcast ire on this ill and each ill
11855 		 * in the group has its own broadcast ire.
11856 		 */
11857 		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST,
11858 		    ipif, ALL_ZONES, NULL,
11859 		    (MATCH_IRE_ILL | MATCH_IRE_TYPE));
11860 		if (ire == NULL) {
11861 			return (EINVAL);
11862 		} else {
11863 			ire_refrele(ire);
11864 		}
11865 	}
11866 	/*
11867 	 * Changing the broadcast addr for this ipif.
11868 	 * Make sure we have valid net and subnet bcast
11869 	 * ire's for other logical interfaces, if needed.
11870 	 */
11871 	if (addr != ipif->ipif_brd_addr)
11872 		ipif_check_bcast_ires(ipif);
11873 	IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
11874 	return (0);
11875 }
11876 
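/*
 * For reference, the get/set handlers in this file are reached through
 * ordinary socket ioctls carrying a struct lifreq. A hedged userland
 * sketch (illustrative only, not part of this file; the interface name
 * is a placeholder and error handling is elided):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <strings.h>
 *	#include <unistd.h>
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_mtu = 1400;
 *	(void) ioctl(s, SIOCSLIFMTU, &lifr);	   handled by ip_sioctl_mtu()
 *	(void) ioctl(s, SIOCGLIFMTU, &lifr);	   ip_sioctl_get_mtu()
 *	(void) ioctl(s, SIOCGLIFBRDADDR, &lifr);   ip_sioctl_get_brdaddr()
 */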
11877 /* Get interface broadcast address. */
11878 /* ARGSUSED */
11879 int
11880 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11881     ip_ioctl_cmd_t *ipip, void *if_req)
11882 {
11883 	ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
11884 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11885 	if (!(ipif->ipif_flags & IPIF_BROADCAST))
11886 		return (EADDRNOTAVAIL);
11887 
11888 	/* IPIF_BROADCAST not possible with IPv6 */
11889 	ASSERT(!ipif->ipif_isv6);
11890 	*sin = sin_null;
11891 	sin->sin_family = AF_INET;
11892 	sin->sin_addr.s_addr = ipif->ipif_brd_addr;
11893 	return (0);
11894 }
11895 
11896 /*
11897  * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
11898  */
11899 /* ARGSUSED */
11900 int
11901 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11902     ip_ioctl_cmd_t *ipip, void *if_req)
11903 {
11904 	int	err = 0;
11905 	in6_addr_t	v6mask;
11906 
11907 	ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
11908 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11909 
11910 	ASSERT(IAM_WRITER_IPIF(ipif));
11911 
11912 	if (ipif->ipif_isv6) {
11913 		sin6_t *sin6;
11914 
11915 		if (sin->sin_family != AF_INET6)
11916 			return (EAFNOSUPPORT);
11917 
11918 		sin6 = (sin6_t *)sin;
11919 		v6mask = sin6->sin6_addr;
11920 	} else {
11921 		ipaddr_t mask;
11922 
11923 		if (sin->sin_family != AF_INET)
11924 			return (EAFNOSUPPORT);
11925 
11926 		mask = sin->sin_addr.s_addr;
11927 		V4MASK_TO_V6(mask, v6mask);
11928 	}
11929 
11930 	/*
11931 	 * No big deal if the interface isn't already up, or the mask
11932 	 * isn't really changing, or this is pt-pt.
11933 	 */
11934 	if (!(ipif->ipif_flags & IPIF_UP) ||
11935 	    IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
11936 	    (ipif->ipif_flags & IPIF_POINTOPOINT)) {
11937 		ipif->ipif_v6net_mask = v6mask;
11938 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11939 			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
11940 			    ipif->ipif_v6net_mask,
11941 			    ipif->ipif_v6subnet);
11942 		}
11943 		return (0);
11944 	}
11945 	/*
11946 	 * Make sure we have valid net and subnet broadcast ire's
11947 	 * for the old netmask, if needed by other logical interfaces.
11948 	 */
11949 	if (!ipif->ipif_isv6)
11950 		ipif_check_bcast_ires(ipif);
11951 
11952 	err = ipif_logical_down(ipif, q, mp);
11953 	if (err == EINPROGRESS)
11954 		return (err);
11955 	ipif_down_tail(ipif);
11956 	err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
11957 	return (err);
11958 }
11959 
11960 static int
11961 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
11962 {
11963 	in6_addr_t	v6mask;
11964 	int err = 0;
11965 
11966 	ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
11967 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11968 
11969 	if (ipif->ipif_isv6) {
11970 		sin6_t *sin6;
11971 
11972 		sin6 = (sin6_t *)sin;
11973 		v6mask = sin6->sin6_addr;
11974 	} else {
11975 		ipaddr_t mask;
11976 
11977 		mask = sin->sin_addr.s_addr;
11978 		V4MASK_TO_V6(mask, v6mask);
11979 	}
11980 
11981 	ipif->ipif_v6net_mask = v6mask;
11982 	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11983 		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
11984 		    ipif->ipif_v6subnet);
11985 	}
11986 	err = ipif_up(ipif, q, mp);
11987 
11988 	if (err == 0 || err == EINPROGRESS) {
11989 		/*
11990 		 * The interface must be DL_BOUND if this packet has to
11991 		 * go out on the wire. Since we only go through a logical
11992 		 * down and are bound with the driver during an internal
11993 		 * down/up that is satisfied.
11994 		 */
11995 		if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
11996 			/* Potentially broadcast an address mask reply.
*/ 11997 ipif_mask_reply(ipif); 11998 } 11999 } 12000 return (err); 12001 } 12002 12003 /* ARGSUSED */ 12004 int 12005 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12006 ip_ioctl_cmd_t *ipip, void *if_req) 12007 { 12008 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12009 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12010 ipif_down_tail(ipif); 12011 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12012 } 12013 12014 /* Get interface net mask. */ 12015 /* ARGSUSED */ 12016 int 12017 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12018 ip_ioctl_cmd_t *ipip, void *if_req) 12019 { 12020 struct lifreq *lifr = (struct lifreq *)if_req; 12021 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12022 12023 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12024 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12025 12026 /* 12027 * net mask can't change since we have a reference to the ipif. 12028 */ 12029 if (ipif->ipif_isv6) { 12030 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12031 *sin6 = sin6_null; 12032 sin6->sin6_family = AF_INET6; 12033 sin6->sin6_addr = ipif->ipif_v6net_mask; 12034 lifr->lifr_addrlen = 12035 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12036 } else { 12037 *sin = sin_null; 12038 sin->sin_family = AF_INET; 12039 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12040 if (ipip->ipi_cmd_type == LIF_CMD) { 12041 lifr->lifr_addrlen = 12042 ip_mask_to_plen(ipif->ipif_net_mask); 12043 } 12044 } 12045 return (0); 12046 } 12047 12048 /* ARGSUSED */ 12049 int 12050 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12051 ip_ioctl_cmd_t *ipip, void *if_req) 12052 { 12053 12054 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12055 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12056 /* 12057 * Set interface metric. We don't use this for 12058 * anything but we keep track of it in case it is 12059 * important to routing applications or such. 12060 */ 12061 if (ipip->ipi_cmd_type == IF_CMD) { 12062 struct ifreq *ifr; 12063 12064 ifr = (struct ifreq *)if_req; 12065 ipif->ipif_metric = ifr->ifr_metric; 12066 } else { 12067 struct lifreq *lifr; 12068 12069 lifr = (struct lifreq *)if_req; 12070 ipif->ipif_metric = lifr->lifr_metric; 12071 } 12072 return (0); 12073 } 12074 12075 12076 /* ARGSUSED */ 12077 int 12078 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12079 ip_ioctl_cmd_t *ipip, void *if_req) 12080 { 12081 12082 /* Get interface metric. */ 12083 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12084 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12085 if (ipip->ipi_cmd_type == IF_CMD) { 12086 struct ifreq *ifr; 12087 12088 ifr = (struct ifreq *)if_req; 12089 ifr->ifr_metric = ipif->ipif_metric; 12090 } else { 12091 struct lifreq *lifr; 12092 12093 lifr = (struct lifreq *)if_req; 12094 lifr->lifr_metric = ipif->ipif_metric; 12095 } 12096 12097 return (0); 12098 } 12099 12100 /* ARGSUSED */ 12101 int 12102 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12103 ip_ioctl_cmd_t *ipip, void *if_req) 12104 { 12105 12106 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12107 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12108 /* 12109 * Set the muxid returned from I_PLINK. 
12100 /* ARGSUSED */
12101 int
12102 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12103     ip_ioctl_cmd_t *ipip, void *if_req)
12104 {
12105 
12106 	ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
12107 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12108 	/*
12109 	 * Set the muxid returned from I_PLINK.
12110 	 */
12111 	if (ipip->ipi_cmd_type == IF_CMD) {
12112 		struct ifreq *ifr = (struct ifreq *)if_req;
12113 
12114 		ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid;
12115 		ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid;
12116 	} else {
12117 		struct lifreq *lifr = (struct lifreq *)if_req;
12118 
12119 		ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid;
12120 		ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid;
12121 	}
12122 	return (0);
12123 }
12124 
12125 /* ARGSUSED */
12126 int
12127 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12128     ip_ioctl_cmd_t *ipip, void *if_req)
12129 {
12130 
12131 	ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
12132 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12133 	/*
12134 	 * Get the muxid saved in ill for I_PUNLINK.
12135 	 */
12136 	if (ipip->ipi_cmd_type == IF_CMD) {
12137 		struct ifreq *ifr = (struct ifreq *)if_req;
12138 
12139 		ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
12140 		ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
12141 	} else {
12142 		struct lifreq *lifr = (struct lifreq *)if_req;
12143 
12144 		lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
12145 		lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
12146 	}
12147 	return (0);
12148 }
12149 
12150 /*
12151  * Set the subnet prefix. Does not modify the broadcast address.
12152  */
12153 /* ARGSUSED */
12154 int
12155 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12156     ip_ioctl_cmd_t *ipip, void *if_req)
12157 {
12158 	int err = 0;
12159 	in6_addr_t v6addr;
12160 	in6_addr_t v6mask;
12161 	boolean_t need_up = B_FALSE;
12162 	int addrlen;
12163 
12164 	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
12165 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12166 
12167 	ASSERT(IAM_WRITER_IPIF(ipif));
12168 	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
12169 
12170 	if (ipif->ipif_isv6) {
12171 		sin6_t *sin6;
12172 
12173 		if (sin->sin_family != AF_INET6)
12174 			return (EAFNOSUPPORT);
12175 
12176 		sin6 = (sin6_t *)sin;
12177 		v6addr = sin6->sin6_addr;
12178 		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
12179 			return (EADDRNOTAVAIL);
12180 	} else {
12181 		ipaddr_t addr;
12182 
12183 		if (sin->sin_family != AF_INET)
12184 			return (EAFNOSUPPORT);
12185 
12186 		addr = sin->sin_addr.s_addr;
12187 		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
12188 			return (EADDRNOTAVAIL);
12189 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
12190 		/* Add 96 bits */
12191 		addrlen += IPV6_ABITS - IP_ABITS;
12192 	}
12193 
12194 	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
12195 		return (EINVAL);
12196 
12197 	/* Check if bits in the address are set past the mask */
12198 	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
12199 		return (EINVAL);
12200 
12201 	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
12202 	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
12203 		return (0);	/* No change */
12204 
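	/*
	 * Worked example of the v4 normalization above (illustrative):
	 * a request for 192.168.5.0 with lifr_addrlen 24 is stored
	 * v4-mapped as ::ffff:192.168.5.0 and the prefix length becomes
	 * 24 + (IPV6_ABITS - IP_ABITS) = 24 + 96 = 120, so the single
	 * V6_MASK_EQ test covers both families. 192.168.5.1 with
	 * addrlen 24 has bits set past the mask and draws EINVAL above.
	 */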
12205 	if (ipif->ipif_flags & IPIF_UP) {
12206 		/*
12207 		 * If the interface is already marked up,
12208 		 * we call ipif_down which will take care
12209 		 * of ditching any IREs that have been set
12210 		 * up based on the old interface address.
12211 		 */
12212 		err = ipif_logical_down(ipif, q, mp);
12213 		if (err == EINPROGRESS)
12214 			return (err);
12215 		ipif_down_tail(ipif);
12216 		need_up = B_TRUE;
12217 	}
12218 
12219 	err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
12220 	return (err);
12221 }
12222 
12223 static int
12224 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
12225     queue_t *q, mblk_t *mp, boolean_t need_up)
12226 {
12227 	ill_t	*ill = ipif->ipif_ill;
12228 	int	err = 0;
12229 
12230 	ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
12231 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12232 
12233 	/* Set the new address. */
12234 	mutex_enter(&ill->ill_lock);
12235 	ipif->ipif_v6net_mask = v6mask;
12236 	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
12237 		V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
12238 		    ipif->ipif_v6subnet);
12239 	}
12240 	mutex_exit(&ill->ill_lock);
12241 
12242 	if (need_up) {
12243 		/*
12244 		 * Now bring the interface back up. If this
12245 		 * is the only IPIF for the ILL, ipif_up
12246 		 * will have to re-bind to the device, so
12247 		 * we may get back EINPROGRESS, in which
12248 		 * case, this IOCTL will get completed in
12249 		 * ip_rput_dlpi when we see the DL_BIND_ACK.
12250 		 */
12251 		err = ipif_up(ipif, q, mp);
12252 		if (err == EINPROGRESS)
12253 			return (err);
12254 	}
12255 	return (err);
12256 }
12257 
12258 /* ARGSUSED */
12259 int
12260 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12261     ip_ioctl_cmd_t *ipip, void *if_req)
12262 {
12263 	int addrlen;
12264 	in6_addr_t v6addr;
12265 	in6_addr_t v6mask;
12266 	struct lifreq *lifr = (struct lifreq *)if_req;
12267 
12268 	ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
12269 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12270 	ipif_down_tail(ipif);
12271 
12272 	addrlen = lifr->lifr_addrlen;
12273 	if (ipif->ipif_isv6) {
12274 		sin6_t *sin6;
12275 
12276 		sin6 = (sin6_t *)sin;
12277 		v6addr = sin6->sin6_addr;
12278 	} else {
12279 		ipaddr_t addr;
12280 
12281 		addr = sin->sin_addr.s_addr;
12282 		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
12283 		addrlen += IPV6_ABITS - IP_ABITS;
12284 	}
12285 	(void) ip_plen_to_mask_v6(addrlen, &v6mask);
12286 
12287 	return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
12288 }
12289 
12290 /* ARGSUSED */
12291 int
12292 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12293     ip_ioctl_cmd_t *ipip, void *if_req)
12294 {
12295 	struct lifreq *lifr = (struct lifreq *)if_req;
12296 	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
12297 
12298 	ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
12299 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12300 	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
12301 
12302 	if (ipif->ipif_isv6) {
12303 		*sin6 = sin6_null;
12304 		sin6->sin6_family = AF_INET6;
12305 		sin6->sin6_addr = ipif->ipif_v6subnet;
12306 		lifr->lifr_addrlen =
12307 		    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
12308 	} else {
12309 		*sin = sin_null;
12310 		sin->sin_family = AF_INET;
12311 		sin->sin_addr.s_addr = ipif->ipif_subnet;
12312 		lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
12313 	}
12314 	return (0);
12315 }
12316 
12317 /*
12318  * Set the IPv6 address token.
12319 */ 12320 /* ARGSUSED */ 12321 int 12322 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12323 ip_ioctl_cmd_t *ipi, void *if_req) 12324 { 12325 ill_t *ill = ipif->ipif_ill; 12326 int err; 12327 in6_addr_t v6addr; 12328 in6_addr_t v6mask; 12329 boolean_t need_up = B_FALSE; 12330 int i; 12331 sin6_t *sin6 = (sin6_t *)sin; 12332 struct lifreq *lifr = (struct lifreq *)if_req; 12333 int addrlen; 12334 12335 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12336 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12337 ASSERT(IAM_WRITER_IPIF(ipif)); 12338 12339 addrlen = lifr->lifr_addrlen; 12340 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12341 if (ipif->ipif_id != 0) 12342 return (EINVAL); 12343 12344 if (!ipif->ipif_isv6) 12345 return (EINVAL); 12346 12347 if (addrlen > IPV6_ABITS) 12348 return (EINVAL); 12349 12350 v6addr = sin6->sin6_addr; 12351 12352 /* 12353 * The length of the token is the length from the end. To get 12354 * the proper mask for this, compute the mask of the bits not 12355 * in the token; ie. the prefix, and then xor to get the mask. 12356 */ 12357 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12358 return (EINVAL); 12359 for (i = 0; i < 4; i++) { 12360 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12361 } 12362 12363 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12364 ill->ill_token_length == addrlen) 12365 return (0); /* No change */ 12366 12367 if (ipif->ipif_flags & IPIF_UP) { 12368 err = ipif_logical_down(ipif, q, mp); 12369 if (err == EINPROGRESS) 12370 return (err); 12371 ipif_down_tail(ipif); 12372 need_up = B_TRUE; 12373 } 12374 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12375 return (err); 12376 } 12377 12378 static int 12379 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12380 mblk_t *mp, boolean_t need_up) 12381 { 12382 in6_addr_t v6addr; 12383 in6_addr_t v6mask; 12384 ill_t *ill = ipif->ipif_ill; 12385 int i; 12386 int err = 0; 12387 12388 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12389 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12390 v6addr = sin6->sin6_addr; 12391 /* 12392 * The length of the token is the length from the end. To get 12393 * the proper mask for this, compute the mask of the bits not 12394 * in the token; ie. the prefix, and then xor to get the mask. 12395 */ 12396 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12397 for (i = 0; i < 4; i++) 12398 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12399 12400 mutex_enter(&ill->ill_lock); 12401 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12402 ill->ill_token_length = addrlen; 12403 mutex_exit(&ill->ill_lock); 12404 12405 if (need_up) { 12406 /* 12407 * Now bring the interface back up. If this 12408 * is the only IPIF for the ILL, ipif_up 12409 * will have to re-bind to the device, so 12410 * we may get back EINPROGRESS, in which 12411 * case, this IOCTL will get completed in 12412 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12413 		 */
12414 		err = ipif_up(ipif, q, mp);
12415 		if (err == EINPROGRESS)
12416 			return (err);
12417 	}
12418 	return (err);
12419 }
12420 
12421 /* ARGSUSED */
12422 int
12423 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12424     ip_ioctl_cmd_t *ipi, void *if_req)
12425 {
12426 	ill_t *ill;
12427 	sin6_t *sin6 = (sin6_t *)sin;
12428 	struct lifreq *lifr = (struct lifreq *)if_req;
12429 
12430 	ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
12431 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12432 	if (ipif->ipif_id != 0)
12433 		return (EINVAL);
12434 
12435 	ill = ipif->ipif_ill;
12436 	if (!ill->ill_isv6)
12437 		return (ENXIO);
12438 
12439 	*sin6 = sin6_null;
12440 	sin6->sin6_family = AF_INET6;
12441 	ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
12442 	sin6->sin6_addr = ill->ill_token;
12443 	lifr->lifr_addrlen = ill->ill_token_length;
12444 	return (0);
12445 }
12446 
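/*
 * Worked example of the token mask computation used in ip_sioctl_token()
 * and ip_sioctl_token_tail() above (illustrative): for addrlen 64,
 * ip_plen_to_mask_v6(128 - 64) yields the prefix mask
 * ffff:ffff:ffff:ffff::; XORing each 32-bit word with 0xffffffff inverts
 * it to ::ffff:ffff:ffff:ffff, the mask of the token bits themselves.
 * V6_MASK_COPY then merges the low 64 bits of the token into ill_token
 * under that mask.
 */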
12447 /*
12448  * Set (hardware) link specific information that might override
12449  * what was acquired through the DL_INFO_ACK.
12450  * The logic is as follows.
12451  *
12452  * become exclusive
12453  * set CHANGING flag
12454  * change mtu on affected IREs
12455  * clear CHANGING flag
12456  *
12457  * An ire add that occurs before the CHANGING flag is set will have its mtu
12458  * changed by the ip_sioctl_lnkinfo.
12459  *
12460  * During the time the CHANGING flag is set, no new ires will be added to the
12461  * bucket, and ire add will fail (due to the CHANGING flag).
12462  *
12463  * An ire add that occurs after the CHANGING flag is set will have the right mtu
12464  * before it is added to the bucket.
12465  *
12466  * Obviously only 1 thread can set the CHANGING flag and we need to become
12467  * exclusive to set the flag.
12468  */
12469 /* ARGSUSED */
12470 int
12471 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12472     ip_ioctl_cmd_t *ipi, void *if_req)
12473 {
12474 	ill_t		*ill = ipif->ipif_ill;
12475 	ipif_t		*nipif;
12476 	int		ip_min_mtu;
12477 	boolean_t	mtu_walk = B_FALSE;
12478 	struct lifreq	*lifr = (struct lifreq *)if_req;
12479 	lif_ifinfo_req_t *lir;
12480 	ire_t		*ire;
12481 
12482 	ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
12483 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12484 	lir = &lifr->lifr_ifinfo;
12485 	ASSERT(IAM_WRITER_IPIF(ipif));
12486 
12487 	/* Only allow for logical unit zero i.e. not on "le0:17" */
12488 	if (ipif->ipif_id != 0)
12489 		return (EINVAL);
12490 
12491 	/* Set interface MTU. */
12492 	if (ipif->ipif_isv6)
12493 		ip_min_mtu = IPV6_MIN_MTU;
12494 	else
12495 		ip_min_mtu = IP_MIN_MTU;
12496 
12497 	/*
12498 	 * Verify values before we set anything. Allow zero to
12499 	 * mean unspecified.
12500 	 */
12501 	if (lir->lir_maxmtu != 0 &&
12502 	    (lir->lir_maxmtu > ill->ill_max_frag ||
12503 	    lir->lir_maxmtu < ip_min_mtu))
12504 		return (EINVAL);
12505 	if (lir->lir_reachtime != 0 &&
12506 	    lir->lir_reachtime > ND_MAX_REACHTIME)
12507 		return (EINVAL);
12508 	if (lir->lir_reachretrans != 0 &&
12509 	    lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
12510 		return (EINVAL);
12511 
12512 	mutex_enter(&ill->ill_lock);
12513 	ill->ill_state_flags |= ILL_CHANGING;
12514 	for (nipif = ill->ill_ipif; nipif != NULL;
12515 	    nipif = nipif->ipif_next) {
12516 		nipif->ipif_state_flags |= IPIF_CHANGING;
12517 	}
12518 
12519 	mutex_exit(&ill->ill_lock);
12520 
12521 	if (lir->lir_maxmtu != 0) {
12522 		ill->ill_max_mtu = lir->lir_maxmtu;
12523 		ill->ill_mtu_userspecified = 1;
12524 		mtu_walk = B_TRUE;
12525 	}
12526 
12527 	if (lir->lir_reachtime != 0)
12528 		ill->ill_reachable_time = lir->lir_reachtime;
12529 
12530 	if (lir->lir_reachretrans != 0)
12531 		ill->ill_reachable_retrans_time = lir->lir_reachretrans;
12532 
12533 	ill->ill_max_hops = lir->lir_maxhops;
12534 
12535 	ill->ill_max_buf = ND_MAX_Q;
12536 
12537 	if (mtu_walk) {
12538 		/*
12539 		 * Set the MTU on all ipifs associated with this ill except
12540 		 * for those whose MTU was fixed via SIOCSLIFMTU.
12541 		 */
12542 		for (nipif = ill->ill_ipif; nipif != NULL;
12543 		    nipif = nipif->ipif_next) {
12544 			if (nipif->ipif_flags & IPIF_FIXEDMTU)
12545 				continue;
12546 
12547 			nipif->ipif_mtu = ill->ill_max_mtu;
12548 
12549 			if (!(nipif->ipif_flags & IPIF_UP))
12550 				continue;
12551 
12552 			if (nipif->ipif_isv6)
12553 				ire = ipif_to_ire_v6(nipif);
12554 			else
12555 				ire = ipif_to_ire(nipif);
12556 			if (ire != NULL) {
12557 				ire->ire_max_frag = nipif->ipif_mtu;
12558 				ire_refrele(ire);
12559 			}
12560 			if (ill->ill_isv6) {
12561 				ire_walk_ill_v6(MATCH_IRE_ILL, 0,
12562 				    ipif_mtu_change, (char *)nipif,
12563 				    ill);
12564 			} else {
12565 				ire_walk_ill_v4(MATCH_IRE_ILL, 0,
12566 				    ipif_mtu_change, (char *)nipif,
12567 				    ill);
12568 			}
12569 		}
12570 	}
12571 
12572 	mutex_enter(&ill->ill_lock);
12573 	for (nipif = ill->ill_ipif; nipif != NULL;
12574 	    nipif = nipif->ipif_next) {
12575 		nipif->ipif_state_flags &= ~IPIF_CHANGING;
12576 	}
12577 	ILL_UNMARK_CHANGING(ill);
12578 	mutex_exit(&ill->ill_lock);
12579 
12580 	return (0);
12581 }
12582 
12583 /* ARGSUSED */
12584 int
12585 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12586     ip_ioctl_cmd_t *ipi, void *if_req)
12587 {
12588 	struct lif_ifinfo_req *lir;
12589 	ill_t *ill = ipif->ipif_ill;
12590 
12591 	ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
12592 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
12593 	if (ipif->ipif_id != 0)
12594 		return (EINVAL);
12595 
12596 	lir = &((struct lifreq *)if_req)->lifr_ifinfo;
12597 	lir->lir_maxhops = ill->ill_max_hops;
12598 	lir->lir_reachtime = ill->ill_reachable_time;
12599 	lir->lir_reachretrans = ill->ill_reachable_retrans_time;
12600 	lir->lir_maxmtu = ill->ill_max_mtu;
12601 
12602 	return (0);
12603 }
12604 
12605 /*
12606  * Return best guess as to the subnet mask for the specified address.
12607  * Based on the subnet masks for all the configured interfaces.
12608  *
12609  * We end up returning a zero mask in the case of default, multicast or
12610  * experimental.
12611 */ 12612 static ipaddr_t 12613 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp) 12614 { 12615 ipaddr_t net_mask; 12616 ill_t *ill; 12617 ipif_t *ipif; 12618 ill_walk_context_t ctx; 12619 ipif_t *fallback_ipif = NULL; 12620 12621 net_mask = ip_net_mask(addr); 12622 if (net_mask == 0) { 12623 *ipifp = NULL; 12624 return (0); 12625 } 12626 12627 /* Let's check to see if this is maybe a local subnet route. */ 12628 /* this function only applies to IPv4 interfaces */ 12629 rw_enter(&ill_g_lock, RW_READER); 12630 ill = ILL_START_WALK_V4(&ctx); 12631 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 12632 mutex_enter(&ill->ill_lock); 12633 for (ipif = ill->ill_ipif; ipif != NULL; 12634 ipif = ipif->ipif_next) { 12635 if (!IPIF_CAN_LOOKUP(ipif)) 12636 continue; 12637 if (!(ipif->ipif_flags & IPIF_UP)) 12638 continue; 12639 if ((ipif->ipif_subnet & net_mask) == 12640 (addr & net_mask)) { 12641 /* 12642 * Don't trust pt-pt interfaces if there are 12643 * other interfaces. 12644 */ 12645 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 12646 if (fallback_ipif == NULL) { 12647 ipif_refhold_locked(ipif); 12648 fallback_ipif = ipif; 12649 } 12650 continue; 12651 } 12652 12653 /* 12654 * Fine. Just assume the same net mask as the 12655 * directly attached subnet interface is using. 12656 */ 12657 ipif_refhold_locked(ipif); 12658 mutex_exit(&ill->ill_lock); 12659 rw_exit(&ill_g_lock); 12660 if (fallback_ipif != NULL) 12661 ipif_refrele(fallback_ipif); 12662 *ipifp = ipif; 12663 return (ipif->ipif_net_mask); 12664 } 12665 } 12666 mutex_exit(&ill->ill_lock); 12667 } 12668 rw_exit(&ill_g_lock); 12669 12670 *ipifp = fallback_ipif; 12671 return ((fallback_ipif != NULL) ? 12672 fallback_ipif->ipif_net_mask : net_mask); 12673 } 12674 12675 /* 12676 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 12677 */ 12678 static void 12679 ip_wput_ioctl(queue_t *q, mblk_t *mp) 12680 { 12681 IOCP iocp; 12682 ipft_t *ipft; 12683 ipllc_t *ipllc; 12684 mblk_t *mp1; 12685 cred_t *cr; 12686 int error = 0; 12687 conn_t *connp; 12688 12689 ip1dbg(("ip_wput_ioctl")); 12690 iocp = (IOCP)mp->b_rptr; 12691 mp1 = mp->b_cont; 12692 if (mp1 == NULL) { 12693 iocp->ioc_error = EINVAL; 12694 mp->b_datap->db_type = M_IOCNAK; 12695 iocp->ioc_count = 0; 12696 qreply(q, mp); 12697 return; 12698 } 12699 12700 /* 12701 * These IOCTLs provide various control capabilities to 12702 * upstream agents such as ULPs and processes. There 12703 * are currently two such IOCTLs implemented. They 12704 * are used by TCP to provide update information for 12705 * existing IREs and to forcibly delete an IRE for a 12706 * host that is not responding, thereby forcing an 12707 * attempt at a new route. 12708 */ 12709 iocp->ioc_error = EINVAL; 12710 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 12711 goto done; 12712 12713 ipllc = (ipllc_t *)mp1->b_rptr; 12714 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 12715 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 12716 break; 12717 } 12718 /* 12719 * prefer credential from mblk over ioctl; 12720 * see ip_sioctl_copyin_setup 12721 */ 12722 cr = DB_CREDDEF(mp, iocp->ioc_cr); 12723 12724 /* 12725 * Refhold the conn in case the request gets queued up in some lookup 12726 */ 12727 ASSERT(CONN_Q(q)); 12728 connp = Q_TO_CONN(q); 12729 CONN_INC_REF(connp); 12730 if (ipft->ipft_pfi && 12731 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 12732 pullupmsg(mp1, ipft->ipft_min_size))) { 12733 error = (*ipft->ipft_pfi)(q, 12734 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? 
mp : mp1, cr);
12735 	}
12736 	if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
12737 		/*
12738 		 * CONN_OPER_PENDING_DONE happens in the function called
12739 		 * through ipft_pfi above.
12740 		 */
12741 		return;
12742 	}
12743 
12744 	CONN_OPER_PENDING_DONE(connp);
12745 	if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
12746 		freemsg(mp);
12747 		return;
12748 	}
12749 	iocp->ioc_error = error;
12750 
12751 done:
12752 	mp->b_datap->db_type = M_IOCACK;
12753 	if (iocp->ioc_error)
12754 		iocp->ioc_count = 0;
12755 	qreply(q, mp);
12756 }
12757 
12758 /*
12759  * Lookup an ipif using the sequence id (ipif_seqid)
12760  */
12761 ipif_t *
12762 ipif_lookup_seqid(ill_t *ill, uint_t seqid)
12763 {
12764 	ipif_t *ipif;
12765 
12766 	ASSERT(MUTEX_HELD(&ill->ill_lock));
12767 
12768 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12769 		if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif))
12770 			return (ipif);
12771 	}
12772 	return (NULL);
12773 }
12774 
12775 uint64_t ipif_g_seqid;
12776 
12777 /*
12778  * Assign a unique id for the ipif. This is used later when we send
12779  * IREs to ARP for resolution where we initialize ire_ipif_seqid
12780  * to the value pointed to by ire_ipif->ipif_seqid. Later when the
12781  * IRE is added, we verify that ipif has not disappeared.
12782  */
12783 
12784 static void
12785 ipif_assign_seqid(ipif_t *ipif)
12786 {
12787 	ipif->ipif_seqid = atomic_add_64_nv(&ipif_g_seqid, 1);
12788 }
12789 
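/*
 * A sketch of how the seqid is consumed (summarizing the comment above;
 * the details live in ire_add()): an IRE records ire_ipif->ipif_seqid at
 * creation time, and when the IRE is finally added the stored value is
 * compared against the current ipif, so that an ipif which was deleted
 * and re-created in the meantime (same address, new seqid) is detected
 * and the stale IRE is discarded rather than added.
 */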
12790 /*
12791  * Insert the ipif, so that the list of ipifs on the ill will be sorted
12792  * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
12793  * be inserted into the first space available in the list. The value of
12794  * ipif_id will then be set to the appropriate value for its position.
12795  */
12796 static int
12797 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock)
12798 {
12799 	ill_t *ill;
12800 	ipif_t *tipif;
12801 	ipif_t **tipifp;
12802 	int id;
12803 
12804 	ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
12805 	    IAM_WRITER_IPIF(ipif));
12806 
12807 	ill = ipif->ipif_ill;
12808 	ASSERT(ill != NULL);
12809 
12810 	/*
12811 	 * In the case of lo0:0 we already hold the ill_g_lock.
12812 	 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
12813 	 * ipif_insert. Another such caller is ipif_move.
12814 	 */
12815 	if (acquire_g_lock)
12816 		rw_enter(&ill_g_lock, RW_WRITER);
12817 	if (acquire_ill_lock)
12818 		mutex_enter(&ill->ill_lock);
12819 	id = ipif->ipif_id;
12820 	tipifp = &(ill->ill_ipif);
12821 	if (id == -1) {	/* need to find a real id */
12822 		id = 0;
12823 		while ((tipif = *tipifp) != NULL) {
12824 			ASSERT(tipif->ipif_id >= id);
12825 			if (tipif->ipif_id != id)
12826 				break; /* non-consecutive id */
12827 			id++;
12828 			tipifp = &(tipif->ipif_next);
12829 		}
12830 		/* limit number of logical interfaces */
12831 		if (id >= ip_addrs_per_if) {
12832 			if (acquire_ill_lock)
12833 				mutex_exit(&ill->ill_lock);
12834 			if (acquire_g_lock)
12835 				rw_exit(&ill_g_lock);
12836 			return (-1);
12837 		}
12838 		ipif->ipif_id = id; /* assign new id */
12839 	} else if (id < ip_addrs_per_if) {
12840 		/* we have a real id; insert ipif in the right place */
12841 		while ((tipif = *tipifp) != NULL) {
12842 			ASSERT(tipif->ipif_id != id);
12843 			if (tipif->ipif_id > id)
12844 				break; /* found correct location */
12845 			tipifp = &(tipif->ipif_next);
12846 		}
12847 	} else {
12848 		if (acquire_ill_lock)
12849 			mutex_exit(&ill->ill_lock);
12850 		if (acquire_g_lock)
12851 			rw_exit(&ill_g_lock);
12852 		return (-1);
12853 	}
12854 
12855 	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
12856 
12857 	ipif->ipif_next = tipif;
12858 	*tipifp = ipif;
12859 	if (acquire_ill_lock)
12860 		mutex_exit(&ill->ill_lock);
12861 	if (acquire_g_lock)
12862 		rw_exit(&ill_g_lock);
12863 	return (0);
12864 }
12865 
12866 /*
12867  * Allocate and initialize a new interface control structure. (Always
12868  * called as writer.)
12869  * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
12870  * is not part of the global linked list of ills. ipif_seqid is unique
12871  * in the system and to preserve the uniqueness, it is assigned only
12872  * when ill becomes part of the global list. At that point ill will
12873  * have a name. If it doesn't get assigned here, it will get assigned
12874  * in ipif_set_values() as part of SIOCSLIFNAME processing.
12875  * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
12876  * the interface flags or any other information from the DL_INFO_ACK for
12877  * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
12878  * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
12879  * second DL_INFO_ACK comes in from the driver.
12880 */ 12881 static ipif_t * 12882 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) 12883 { 12884 ipif_t *ipif; 12885 phyint_t *phyi; 12886 12887 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 12888 ill->ill_name, id, (void *)ill)); 12889 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 12890 12891 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 12892 return (NULL); 12893 *ipif = ipif_zero; /* start clean */ 12894 12895 ipif->ipif_ill = ill; 12896 ipif->ipif_id = id; /* could be -1 */ 12897 ipif->ipif_zoneid = GLOBAL_ZONEID; 12898 12899 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 12900 12901 ipif->ipif_refcnt = 0; 12902 ipif->ipif_saved_ire_cnt = 0; 12903 12904 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { 12905 mi_free(ipif); 12906 return (NULL); 12907 } 12908 /* -1 id should have been replaced by real id */ 12909 id = ipif->ipif_id; 12910 ASSERT(id >= 0); 12911 12912 if (ill->ill_name[0] != '\0') { 12913 ipif_assign_seqid(ipif); 12914 if (ill->ill_phyint->phyint_ifindex != 0) 12915 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 12916 } 12917 /* 12918 * Keep a copy of original id in ipif_orig_ipifid. Failback 12919 * will attempt to restore the original id. The SIOCSLIFOINDEX 12920 * ioctl sets ipif_orig_ipifid to zero. 12921 */ 12922 ipif->ipif_orig_ipifid = id; 12923 12924 /* 12925 * We grab the ill_lock and phyint_lock to protect the flag changes. 12926 * The ipif is still not up and can't be looked up until the 12927 * ioctl completes and the IPIF_CHANGING flag is cleared. 12928 */ 12929 mutex_enter(&ill->ill_lock); 12930 mutex_enter(&ill->ill_phyint->phyint_lock); 12931 /* 12932 * Set the running flag when logical interface zero is created. 12933 * For subsequent logical interfaces, a DLPI link down 12934 * notification message may have cleared the running flag to 12935 * indicate the link is down, so we shouldn't just blindly set it. 12936 */ 12937 if (id == 0) 12938 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 12939 ipif->ipif_ire_type = ire_type; 12940 phyi = ill->ill_phyint; 12941 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 12942 12943 if (ipif->ipif_isv6) { 12944 ill->ill_flags |= ILLF_IPV6; 12945 } else { 12946 ipaddr_t inaddr_any = INADDR_ANY; 12947 12948 ill->ill_flags |= ILLF_IPV4; 12949 12950 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 12951 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12952 &ipif->ipif_v6lcl_addr); 12953 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12954 &ipif->ipif_v6src_addr); 12955 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12956 &ipif->ipif_v6subnet); 12957 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12958 &ipif->ipif_v6net_mask); 12959 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12960 &ipif->ipif_v6brd_addr); 12961 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12962 &ipif->ipif_v6pp_dst_addr); 12963 } 12964 12965 /* 12966 * Don't set the interface flags etc. now, will do it in 12967 * ip_ll_subnet_defaults. 12968 */ 12969 if (!initialize) { 12970 mutex_exit(&ill->ill_lock); 12971 mutex_exit(&ill->ill_phyint->phyint_lock); 12972 return (ipif); 12973 } 12974 ipif->ipif_mtu = ill->ill_max_mtu; 12975 12976 if (ill->ill_bcast_addr_length != 0) { 12977 /* 12978 * Later detect lack of DLPI driver multicast 12979 * capability by catching DL_ENABMULTI errors in 12980 * ip_rput_dlpi. 
12981 		 */
12982 		ill->ill_flags |= ILLF_MULTICAST;
12983 		if (!ipif->ipif_isv6)
12984 			ipif->ipif_flags |= IPIF_BROADCAST;
12985 	} else {
12986 		if (ill->ill_net_type != IRE_LOOPBACK) {
12987 			if (ipif->ipif_isv6)
12988 				/*
12989 				 * Note: xresolv interfaces will eventually need
12990 				 * NOARP set here as well, but that will require
12991 				 * those external resolvers to have some
12992 				 * knowledge of that flag and act appropriately.
12993 				 * Not to be changed at present.
12994 				 */
12995 				ill->ill_flags |= ILLF_NONUD;
12996 			else
12997 				ill->ill_flags |= ILLF_NOARP;
12998 		}
12999 		if (ill->ill_phys_addr_length == 0) {
13000 			if (ill->ill_media &&
13001 			    ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
13002 				ipif->ipif_flags |= IPIF_NOXMIT;
13003 				phyi->phyint_flags |= PHYI_VIRTUAL;
13004 			} else {
13005 				/* pt-pt supports multicast. */
13006 				ill->ill_flags |= ILLF_MULTICAST;
13007 				if (ill->ill_net_type == IRE_LOOPBACK) {
13008 					phyi->phyint_flags |=
13009 					    (PHYI_LOOPBACK | PHYI_VIRTUAL);
13010 				} else {
13011 					ipif->ipif_flags |= IPIF_POINTOPOINT;
13012 				}
13013 			}
13014 		}
13015 	}
13016 	mutex_exit(&ill->ill_lock);
13017 	mutex_exit(&ill->ill_phyint->phyint_lock);
13018 	return (ipif);
13019 }
13020 
13021 /*
13022  * If appropriate, send a message up to the resolver to delete the entry
13023  * for the address of this interface, which is going out of business.
13024  * (Always called as writer).
13025  *
13026  * NOTE : We need to check for NULL mps as some of the fields are
13027  *	  initialized only for some interface types. See ipif_resolver_up()
13028  *	  for details.
13029  */
13030 void
13031 ipif_arp_down(ipif_t *ipif)
13032 {
13033 	mblk_t	*mp;
13034 	ill_t	*ill = ipif->ipif_ill;
13035 
13036 	ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13037 	ASSERT(IAM_WRITER_IPIF(ipif));
13038 
13039 	/* Delete the mapping for the local address */
13040 	mp = ipif->ipif_arp_del_mp;
13041 	if (mp != NULL) {
13042 		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13043 		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13044 		putnext(ill->ill_rq, mp);
13045 		ipif->ipif_arp_del_mp = NULL;
13046 	}
13047 
13048 	/*
13049 	 * If this is the last ipif that is going down and there are no
13050 	 * duplicate addresses we may yet attempt to re-probe, then we need to
13051 	 * clean up ARP completely.
13052 	 */
13053 	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
13054 
13055 		/* Send up AR_INTERFACE_DOWN message */
13056 		mp = ill->ill_arp_down_mp;
13057 		if (mp != NULL) {
13058 			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13059 			    *(unsigned *)mp->b_rptr, ill->ill_name,
13060 			    ipif->ipif_id));
13061 			putnext(ill->ill_rq, mp);
13062 			ill->ill_arp_down_mp = NULL;
13063 		}
13064 
13065 		/* Tell ARP to delete the multicast mappings */
13066 		mp = ill->ill_arp_del_mapping_mp;
13067 		if (mp != NULL) {
13068 			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13069 			    *(unsigned *)mp->b_rptr, ill->ill_name,
13070 			    ipif->ipif_id));
13071 			putnext(ill->ill_rq, mp);
13072 			ill->ill_arp_del_mapping_mp = NULL;
13073 		}
13074 	}
13075 }
13076 
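/*
 * Summary of the teardown in ipif_arp_down() above (a sketch): at most
 * three pre-allocated messages are sent to ARP, in this order:
 *
 *	ipif_arp_del_mp		delete the local address entry
 *	ill_arp_down_mp		AR_INTERFACE_DOWN, sent only when the
 *				last ipif (and last duplicate) goes away
 *	ill_arp_del_mapping_mp	delete the multicast mappings
 *
 * All three are allocated up front (see ipif_resolver_up() and
 * ipif_arp_setup_multicast()) so that bringing an interface down never
 * needs to allocate memory.
 */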
13077 /*
13078  * This function sets up the multicast mappings in ARP. When ipif_resolver_up
13079  * calls this function, it passes a non-NULL arp_add_mapping_mp indicating
13080  * that it wants the add_mp allocated in this function to be returned
13081  * without sending it to arp. When ip_rput_dlpi_writer calls this to
13082  * just re-do the multicast, it wants us to send the add_mp to ARP also.
13083  * ipif_resolver_up does not want us to do the "add" i.e. sending to ARP,
13084  * as it does an ipif_arp_down after calling this function - which will
13085  * remove what we add here.
13086  *
13087  * Returns -1 on failures and 0 on success.
13088  */
13089 int
13090 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
13091 {
13092 	mblk_t	*del_mp = NULL;
13093 	mblk_t *add_mp = NULL;
13094 	mblk_t *mp;
13095 	ill_t	*ill = ipif->ipif_ill;
13096 	phyint_t *phyi = ill->ill_phyint;
13097 	ipaddr_t addr, mask, extract_mask = 0;
13098 	arma_t	*arma;
13099 	uint8_t *maddr, *bphys_addr;
13100 	uint32_t hw_start;
13101 	dl_unitdata_req_t *dlur;
13102 
13103 	ASSERT(IAM_WRITER_IPIF(ipif));
13104 	if (ipif->ipif_flags & IPIF_POINTOPOINT)
13105 		return (0);
13106 
13107 	/*
13108 	 * Delete the existing mapping from ARP. Normally ipif_down
13109 	 * -> ipif_arp_down should send this up to ARP. The only
13110 	 * reason we would find this is when we are switching from
13111 	 * Multicast to Broadcast where we did not do a down.
13112 	 */
13113 	mp = ill->ill_arp_del_mapping_mp;
13114 	if (mp != NULL) {
13115 		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13116 		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13117 		putnext(ill->ill_rq, mp);
13118 		ill->ill_arp_del_mapping_mp = NULL;
13119 	}
13120 
13121 	if (arp_add_mapping_mp != NULL)
13122 		*arp_add_mapping_mp = NULL;
13123 
13124 	/*
13125 	 * Check that the address is not too long for the constant
13126 	 * length reserved in the template arma_t.
13127 	 */
13128 	if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
13129 		return (-1);
13130 
13131 	/* Add mapping mblk */
13132 	addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
13133 	mask = (ipaddr_t)htonl(IN_CLASSD_NET);
13134 	add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
13135 	    (caddr_t)&addr);
13136 	if (add_mp == NULL)
13137 		return (-1);
13138 	arma = (arma_t *)add_mp->b_rptr;
13139 	maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
13140 	bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
13141 	arma->arma_hw_addr_length = ill->ill_phys_addr_length;
13142 
13143 	/*
13144 	 * Determine the broadcast address.
13145 	 */
13146 	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
13147 	if (ill->ill_sap_length < 0)
13148 		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
13149 	else
13150 		bphys_addr = (uchar_t *)dlur +
13151 		    dlur->dl_dest_addr_offset + ill->ill_sap_length;
13152 	/*
13153 	 * Check PHYI_MULTI_BCAST and length of physical
13154 	 * address to determine if we use the mapping or the
13155 	 * broadcast address.
13156 	 */
13157 	if (!(phyi->phyint_flags & PHYI_MULTI_BCAST))
13158 		if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length,
13159 		    bphys_addr, maddr, &hw_start, &extract_mask))
13160 			phyi->phyint_flags |= PHYI_MULTI_BCAST;
13161 
13162 	if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
13163 	    (ill->ill_flags & ILLF_MULTICAST)) {
13164 		/* Make sure this will not match the "exact" entry.
*/ 13165 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13166 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13167 (caddr_t)&addr); 13168 if (del_mp == NULL) { 13169 freemsg(add_mp); 13170 return (-1); 13171 } 13172 bcopy(&extract_mask, (char *)arma + 13173 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13174 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13175 /* Use link-layer broadcast address for MULTI_BCAST */ 13176 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13177 ip2dbg(("ipif_arp_setup_multicast: adding" 13178 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13179 } else { 13180 arma->arma_hw_mapping_start = hw_start; 13181 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13182 " ARP setup for %s\n", ill->ill_name)); 13183 } 13184 } else { 13185 freemsg(add_mp); 13186 ASSERT(del_mp == NULL); 13187 /* It is neither MULTICAST nor MULTI_BCAST */ 13188 return (0); 13189 } 13190 ASSERT(add_mp != NULL && del_mp != NULL); 13191 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13192 ill->ill_arp_del_mapping_mp = del_mp; 13193 if (arp_add_mapping_mp != NULL) { 13194 /* The caller just wants the mblks allocated */ 13195 *arp_add_mapping_mp = add_mp; 13196 } else { 13197 /* The caller wants us to send it to arp */ 13198 putnext(ill->ill_rq, add_mp); 13199 } 13200 return (0); 13201 } 13202 13203 /* 13204 * Get the resolver set up for a new interface address. 13205 * (Always called as writer.) 13206 * Called both for IPv4 and IPv6 interfaces, 13207 * though it only sets up the resolver for v6 13208 * if it's an xresolv interface (one using an external resolver). 13209 * Honors ILLF_NOARP. 13210 * The enumerated value res_act is used to tune the behavior. 13211 * If set to Res_act_initial, then we set up all the resolver 13212 * structures for a new interface. If set to Res_act_move, then 13213 * we just send an AR_ENTRY_ADD message up to ARP for IPv4 13214 * interfaces; this is called by ip_rput_dlpi_writer() to handle 13215 * asynchronous hardware address change notification. If set to 13216 * Res_act_defend, then we tell ARP that it needs to send a single 13217 * gratuitous message in defense of the address. 13218 * Returns error on failure. 13219 */ 13220 int 13221 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 13222 { 13223 caddr_t addr; 13224 mblk_t *arp_up_mp = NULL; 13225 mblk_t *arp_down_mp = NULL; 13226 mblk_t *arp_add_mp = NULL; 13227 mblk_t *arp_del_mp = NULL; 13228 mblk_t *arp_add_mapping_mp = NULL; 13229 mblk_t *arp_del_mapping_mp = NULL; 13230 ill_t *ill = ipif->ipif_ill; 13231 uchar_t *area_p = NULL; 13232 uchar_t *ared_p = NULL; 13233 int err = ENOMEM; 13234 boolean_t was_dup; 13235 13236 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 13237 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 13238 ASSERT(IAM_WRITER_IPIF(ipif)); 13239 13240 was_dup = B_FALSE; 13241 if (res_act == Res_act_initial) { 13242 ipif->ipif_addr_ready = 0; 13243 /* 13244 * We're bringing an interface up here. There's no way that we 13245 * should need to shut down ARP now. 
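		 * Accordingly, if this ipif had been marked as a duplicate,
		 * the IPIF_DUPLICATE flag is cleared and ill_ipif_dup_count
		 * decremented just below, with was_dup recording that this
		 * bringup is recovering a previously duplicate address.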
13246 */ 13247 mutex_enter(&ill->ill_lock); 13248 if (ipif->ipif_flags & IPIF_DUPLICATE) { 13249 ipif->ipif_flags &= ~IPIF_DUPLICATE; 13250 ill->ill_ipif_dup_count--; 13251 was_dup = B_TRUE; 13252 } 13253 mutex_exit(&ill->ill_lock); 13254 } 13255 if (ipif->ipif_recovery_id != 0) 13256 (void) untimeout(ipif->ipif_recovery_id); 13257 ipif->ipif_recovery_id = 0; 13258 if (ill->ill_net_type != IRE_IF_RESOLVER) { 13259 ipif->ipif_addr_ready = 1; 13260 return (0); 13261 } 13262 /* NDP will set the ipif_addr_ready flag when it's ready */ 13263 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 13264 return (0); 13265 13266 if (ill->ill_isv6) { 13267 /* 13268 * External resolver for IPv6 13269 */ 13270 ASSERT(res_act == Res_act_initial); 13271 if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 13272 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 13273 area_p = (uchar_t *)&ip6_area_template; 13274 ared_p = (uchar_t *)&ip6_ared_template; 13275 } 13276 } else { 13277 /* 13278 * IPv4 arp case. If the ARP stream has already started 13279 * closing, fail this request for ARP bringup. Else 13280 * record the fact that an ARP bringup is pending. 13281 */ 13282 mutex_enter(&ill->ill_lock); 13283 if (ill->ill_arp_closing) { 13284 mutex_exit(&ill->ill_lock); 13285 err = EINVAL; 13286 goto failed; 13287 } else { 13288 if (ill->ill_ipif_up_count == 0 && 13289 ill->ill_ipif_dup_count == 0 && !was_dup) 13290 ill->ill_arp_bringup_pending = 1; 13291 mutex_exit(&ill->ill_lock); 13292 } 13293 if (ipif->ipif_lcl_addr != INADDR_ANY) { 13294 addr = (caddr_t)&ipif->ipif_lcl_addr; 13295 area_p = (uchar_t *)&ip_area_template; 13296 ared_p = (uchar_t *)&ip_ared_template; 13297 } 13298 } 13299 13300 /* 13301 * Add an entry for the local address in ARP only if it 13302 * is not UNNUMBERED and the address is not INADDR_ANY. 13303 */ 13304 if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { 13305 area_t *area; 13306 13307 /* Now ask ARP to publish our address. */ 13308 arp_add_mp = ill_arp_alloc(ill, area_p, addr); 13309 if (arp_add_mp == NULL) 13310 goto failed; 13311 area = (area_t *)arp_add_mp->b_rptr; 13312 if (res_act != Res_act_initial) { 13313 /* 13314 * Copy the new hardware address and length into 13315 * arp_add_mp to be sent to ARP. 13316 */ 13317 area->area_hw_addr_length = 13318 ill->ill_phys_addr_length; 13319 bcopy((char *)ill->ill_phys_addr, 13320 ((char *)area + area->area_hw_addr_offset), 13321 area->area_hw_addr_length); 13322 } 13323 13324 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | 13325 ACE_F_MYADDR; 13326 13327 if (res_act == Res_act_defend) { 13328 area->area_flags |= ACE_F_DEFEND; 13329 /* 13330 * If we're just defending our address now, then 13331 * there's no need to set up ARP multicast mappings. 13332 * The publish command is enough. 13333 */ 13334 goto done; 13335 } 13336 13337 if (res_act != Res_act_initial) 13338 goto arp_setup_multicast; 13339 13340 /* 13341 * Allocate an ARP deletion message so we know we can tell ARP 13342 * when the interface goes down. 13343 */ 13344 arp_del_mp = ill_arp_alloc(ill, ared_p, addr); 13345 if (arp_del_mp == NULL) 13346 goto failed; 13347 13348 } else { 13349 if (res_act != Res_act_initial) 13350 goto done; 13351 } 13352 /* 13353 * Need to bring up ARP or setup multicast mapping only 13354 * when the first interface is coming UP. 
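	 * That is, the bringup/setup below happens only when
	 * ill_ipif_up_count and ill_ipif_dup_count are both zero and this
	 * ipif is not a recovered duplicate (was_dup), as the test that
	 * follows checks.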
13355 */ 13356 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 13357 was_dup) { 13358 goto done; 13359 } 13360 13361 /* 13362 * Allocate an ARP down message (to be saved) and an ARP up 13363 * message. 13364 */ 13365 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 13366 if (arp_down_mp == NULL) 13367 goto failed; 13368 13369 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 13370 if (arp_up_mp == NULL) 13371 goto failed; 13372 13373 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13374 goto done; 13375 13376 arp_setup_multicast: 13377 /* 13378 * Setup the multicast mappings. This function initializes 13379 * ill_arp_del_mapping_mp also. This does not need to be done for 13380 * IPv6. 13381 */ 13382 if (!ill->ill_isv6) { 13383 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 13384 if (err != 0) 13385 goto failed; 13386 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 13387 ASSERT(arp_add_mapping_mp != NULL); 13388 } 13389 13390 done: 13391 if (arp_del_mp != NULL) { 13392 ASSERT(ipif->ipif_arp_del_mp == NULL); 13393 ipif->ipif_arp_del_mp = arp_del_mp; 13394 } 13395 if (arp_down_mp != NULL) { 13396 ASSERT(ill->ill_arp_down_mp == NULL); 13397 ill->ill_arp_down_mp = arp_down_mp; 13398 } 13399 if (arp_del_mapping_mp != NULL) { 13400 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13401 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 13402 } 13403 if (arp_up_mp != NULL) { 13404 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 13405 ill->ill_name, ipif->ipif_id)); 13406 putnext(ill->ill_rq, arp_up_mp); 13407 } 13408 if (arp_add_mp != NULL) { 13409 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 13410 ill->ill_name, ipif->ipif_id)); 13411 /* 13412 * If it's an extended ARP implementation, then we'll wait to 13413 * hear that DAD has finished before using the interface. 13414 */ 13415 if (!ill->ill_arp_extend) 13416 ipif->ipif_addr_ready = 1; 13417 putnext(ill->ill_rq, arp_add_mp); 13418 } else { 13419 ipif->ipif_addr_ready = 1; 13420 } 13421 if (arp_add_mapping_mp != NULL) { 13422 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 13423 ill->ill_name, ipif->ipif_id)); 13424 putnext(ill->ill_rq, arp_add_mapping_mp); 13425 } 13426 if (res_act != Res_act_initial) 13427 return (0); 13428 13429 if (ill->ill_flags & ILLF_NOARP) 13430 err = ill_arp_off(ill); 13431 else 13432 err = ill_arp_on(ill); 13433 if (err != 0) { 13434 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); 13435 freemsg(ipif->ipif_arp_del_mp); 13436 freemsg(ill->ill_arp_down_mp); 13437 freemsg(ill->ill_arp_del_mapping_mp); 13438 ipif->ipif_arp_del_mp = NULL; 13439 ill->ill_arp_down_mp = NULL; 13440 ill->ill_arp_del_mapping_mp = NULL; 13441 return (err); 13442 } 13443 return ((ill->ill_ipif_up_count != 0 || was_dup || 13444 ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); 13445 13446 failed: 13447 ip1dbg(("ipif_resolver_up: FAILED\n")); 13448 freemsg(arp_add_mp); 13449 freemsg(arp_del_mp); 13450 freemsg(arp_add_mapping_mp); 13451 freemsg(arp_up_mp); 13452 freemsg(arp_down_mp); 13453 ill->ill_arp_bringup_pending = 0; 13454 return (err); 13455 } 13456 13457 /* 13458 * This routine restarts IPv4 duplicate address detection (DAD) when a link has 13459 * just gone back up. 
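 *
 * Mechanically, the restart is just a re-publish of the address: the
 * AR_ENTRY_ADD built below from ip_area_template carries
 * ACE_F_UNVERIFIED in its area_flags, which tells ARP to treat the
 * address as unverified and probe for duplicates again.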
13460 */ 13461 static void 13462 ipif_arp_start_dad(ipif_t *ipif) 13463 { 13464 ill_t *ill = ipif->ipif_ill; 13465 mblk_t *arp_add_mp; 13466 area_t *area; 13467 13468 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 13469 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13470 ipif->ipif_lcl_addr == INADDR_ANY || 13471 (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 13472 (char *)&ipif->ipif_lcl_addr)) == NULL) { 13473 /* 13474 * If we can't contact ARP for some reason, that's not really a 13475 * problem. Just send out the routing socket notification that 13476 * DAD completion would have done, and continue. 13477 */ 13478 ipif_mask_reply(ipif); 13479 ip_rts_ifmsg(ipif); 13480 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 13481 sctp_update_ipif(ipif, SCTP_IPIF_UP); 13482 ipif->ipif_addr_ready = 1; 13483 return; 13484 } 13485 13486 /* Setting the 'unverified' flag restarts DAD */ 13487 area = (area_t *)arp_add_mp->b_rptr; 13488 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 13489 ACE_F_UNVERIFIED; 13490 putnext(ill->ill_rq, arp_add_mp); 13491 } 13492 13493 static void 13494 ipif_ndp_start_dad(ipif_t *ipif) 13495 { 13496 nce_t *nce; 13497 13498 nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); 13499 if (nce == NULL) 13500 return; 13501 13502 if (!ndp_restart_dad(nce)) { 13503 /* 13504 * If we can't restart DAD for some reason, that's not really a 13505 * problem. Just send out the routing socket notification that 13506 * DAD completion would have done, and continue. 13507 */ 13508 ip_rts_ifmsg(ipif); 13509 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 13510 sctp_update_ipif(ipif, SCTP_IPIF_UP); 13511 ipif->ipif_addr_ready = 1; 13512 } 13513 NCE_REFRELE(nce); 13514 } 13515 13516 /* 13517 * Restart duplicate address detection on all interfaces on the given ill. 13518 * 13519 * This is called when an interface transitions from down to up 13520 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 13521 * 13522 * Note that since the underlying physical link has transitioned, we must cause 13523 * at least one routing socket message to be sent here, either via DAD 13524 * completion or just by default on the first ipif. (If we don't do this, then 13525 * in.mpathd will see long delays when doing link-based failure recovery.) 13526 */ 13527 void 13528 ill_restart_dad(ill_t *ill, boolean_t went_up) 13529 { 13530 ipif_t *ipif; 13531 13532 if (ill == NULL) 13533 return; 13534 13535 /* 13536 * If layer two doesn't support duplicate address detection, then just 13537 * send the routing socket message now and be done with it. 13538 */ 13539 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 13540 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 13541 ip_rts_ifmsg(ill->ill_ipif); 13542 return; 13543 } 13544 13545 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13546 if (went_up) { 13547 if (ipif->ipif_flags & IPIF_UP) { 13548 if (ill->ill_isv6) 13549 ipif_ndp_start_dad(ipif); 13550 else 13551 ipif_arp_start_dad(ipif); 13552 } else if (ill->ill_isv6 && 13553 (ipif->ipif_flags & IPIF_DUPLICATE)) { 13554 /* 13555 * For IPv4, the ARP module itself will 13556 * automatically start the DAD process when it 13557 * sees DL_NOTE_LINK_UP. We respond to the 13558 * AR_CN_READY at the completion of that task. 13559 * For IPv6, we must kick off the bring-up 13560 * process now. 
13561 */ 13562 ndp_do_recovery(ipif); 13563 } else { 13564 /* 13565 * Unfortunately, the first ipif is "special" 13566 * and represents the underlying ill in the 13567 * routing socket messages. Thus, when this 13568 * one ipif is down, we must still notify so 13569 * that the user knows the IFF_RUNNING status 13570 * change. (If the first ipif is up, then 13571 * we'll handle eventual routing socket 13572 * notification via DAD completion.) 13573 */ 13574 if (ipif == ill->ill_ipif) 13575 ip_rts_ifmsg(ill->ill_ipif); 13576 } 13577 } else { 13578 /* 13579 * After link down, we'll need to send a new routing 13580 * message when the link comes back, so clear 13581 * ipif_addr_ready. 13582 */ 13583 ipif->ipif_addr_ready = 0; 13584 } 13585 } 13586 13587 /* 13588 * If we've torn down links, then notify the user right away. 13589 */ 13590 if (!went_up) 13591 ip_rts_ifmsg(ill->ill_ipif); 13592 } 13593 13594 /* 13595 * Wakeup all threads waiting to enter the ipsq, and sleeping 13596 * on any of the ills in this ipsq. The ill_lock of the ill 13597 * must be held so that waiters don't miss wakeups 13598 */ 13599 static void 13600 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) 13601 { 13602 phyint_t *phyint; 13603 13604 phyint = ipsq->ipsq_phyint_list; 13605 while (phyint != NULL) { 13606 if (phyint->phyint_illv4) { 13607 if (!caller_holds_lock) 13608 mutex_enter(&phyint->phyint_illv4->ill_lock); 13609 ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 13610 cv_broadcast(&phyint->phyint_illv4->ill_cv); 13611 if (!caller_holds_lock) 13612 mutex_exit(&phyint->phyint_illv4->ill_lock); 13613 } 13614 if (phyint->phyint_illv6) { 13615 if (!caller_holds_lock) 13616 mutex_enter(&phyint->phyint_illv6->ill_lock); 13617 ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 13618 cv_broadcast(&phyint->phyint_illv6->ill_cv); 13619 if (!caller_holds_lock) 13620 mutex_exit(&phyint->phyint_illv6->ill_lock); 13621 } 13622 phyint = phyint->phyint_ipsq_next; 13623 } 13624 } 13625 13626 static ipsq_t * 13627 ipsq_create(char *groupname) 13628 { 13629 ipsq_t *ipsq; 13630 13631 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 13632 ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 13633 if (ipsq == NULL) { 13634 return (NULL); 13635 } 13636 13637 if (groupname != NULL) 13638 (void) strcpy(ipsq->ipsq_name, groupname); 13639 else 13640 ipsq->ipsq_name[0] = '\0'; 13641 13642 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); 13643 ipsq->ipsq_flags |= IPSQ_GROUP; 13644 ipsq->ipsq_next = ipsq_g_head; 13645 ipsq_g_head = ipsq; 13646 return (ipsq); 13647 } 13648 13649 /* 13650 * Return an ipsq correspoding to the groupname. If 'create' is true 13651 * allocate a new ipsq if one does not exist. Usually an ipsq is associated 13652 * uniquely with an IPMP group. However during IPMP groupname operations, 13653 * multiple IPMP groups may be associated with a single ipsq. But no 13654 * IPMP group can be associated with more than 1 ipsq at any time. 13655 * For example 13656 * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs 13657 * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 13658 * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 13659 * 13660 * Now the command ifconfig hme3 group mpk17-84 results in the temporary 13661 * status shown below during the execution of the above command. 13662 * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 13663 * 13664 * After the completion of the above groupname command we return to the stable 13665 * state shown below. 
13666 * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 13667 * hme4 mpk17-85 ipsq2 mpk17-85 1 13668 * 13669 * Because of the above, we don't search based on the ipsq_name since that 13670 * would miss the correct ipsq during certain windows as shown above. 13671 * The ipsq_name is only used during split of an ipsq to return the ipsq to its 13672 * natural state. 13673 */ 13674 static ipsq_t * 13675 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq) 13676 { 13677 ipsq_t *ipsq; 13678 int group_len; 13679 phyint_t *phyint; 13680 13681 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 13682 13683 group_len = strlen(groupname); 13684 ASSERT(group_len != 0); 13685 group_len++; 13686 13687 for (ipsq = ipsq_g_head; ipsq != NULL; ipsq = ipsq->ipsq_next) { 13688 /* 13689 * When an ipsq is being split, and ill_split_ipsq 13690 * calls this function, we exclude it from being considered. 13691 */ 13692 if (ipsq == exclude_ipsq) 13693 continue; 13694 13695 /* 13696 * Compare against the ipsq_name. The groupname change happens 13697 * in 2 phases. The 1st phase merges the from group into 13698 * the to group's ipsq, by calling ill_merge_groups and restarts 13699 * the ioctl. The 2nd phase then locates the ipsq again thru 13700 * ipsq_name. At this point the phyint_groupname has not been 13701 * updated. 13702 */ 13703 if ((group_len == strlen(ipsq->ipsq_name) + 1) && 13704 (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { 13705 /* 13706 * Verify that an ipmp groupname is exactly 13707 * part of 1 ipsq and is not found in any other 13708 * ipsq. 13709 */ 13710 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) == 13711 NULL); 13712 return (ipsq); 13713 } 13714 13715 /* 13716 * Comparison against ipsq_name alone is not sufficient. 13717 * In the case when groups are currently being 13718 * merged, the ipsq could hold other IPMP groups temporarily. 13719 * so we walk the phyint list and compare against the 13720 * phyint_groupname as well. 13721 */ 13722 phyint = ipsq->ipsq_phyint_list; 13723 while (phyint != NULL) { 13724 if ((group_len == phyint->phyint_groupname_len) && 13725 (bcmp(phyint->phyint_groupname, groupname, 13726 group_len) == 0)) { 13727 /* 13728 * Verify that an ipmp groupname is exactly 13729 * part of 1 ipsq and is not found in any other 13730 * ipsq. 13731 */ 13732 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) 13733 == NULL); 13734 return (ipsq); 13735 } 13736 phyint = phyint->phyint_ipsq_next; 13737 } 13738 } 13739 if (create) 13740 ipsq = ipsq_create(groupname); 13741 return (ipsq); 13742 } 13743 13744 static void 13745 ipsq_delete(ipsq_t *ipsq) 13746 { 13747 ipsq_t *nipsq; 13748 ipsq_t *pipsq = NULL; 13749 13750 /* 13751 * We don't hold the ipsq lock, but we are sure no new 13752 * messages can land up, since the ipsq_refs is zero. 13753 * i.e. this ipsq is unnamed and no phyint or phyint group 13754 * is associated with this ipsq. (Lookups are based on ill_name 13755 * or phyint_group_name) 13756 */ 13757 ASSERT(ipsq->ipsq_refs == 0); 13758 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); 13759 ASSERT(ipsq->ipsq_pending_mp == NULL); 13760 if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { 13761 /* 13762 * This is not the ipsq of an IPMP group. 13763 */ 13764 kmem_free(ipsq, sizeof (ipsq_t)); 13765 return; 13766 } 13767 13768 rw_enter(&ill_g_lock, RW_WRITER); 13769 13770 /* 13771 * Locate the ipsq before we can remove it from 13772 * the singly linked list of ipsq's. 
13773 */ 13774 for (nipsq = ipsq_g_head; nipsq != NULL; nipsq = nipsq->ipsq_next) { 13775 if (nipsq == ipsq) { 13776 break; 13777 } 13778 pipsq = nipsq; 13779 } 13780 13781 ASSERT(nipsq == ipsq); 13782 13783 /* unlink ipsq from the list */ 13784 if (pipsq != NULL) 13785 pipsq->ipsq_next = ipsq->ipsq_next; 13786 else 13787 ipsq_g_head = ipsq->ipsq_next; 13788 kmem_free(ipsq, sizeof (ipsq_t)); 13789 rw_exit(&ill_g_lock); 13790 } 13791 13792 static void 13793 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, 13794 queue_t *q) 13795 13796 { 13797 13798 ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); 13799 ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); 13800 ASSERT(old_ipsq->ipsq_pending_ipif == NULL); 13801 ASSERT(old_ipsq->ipsq_pending_mp == NULL); 13802 ASSERT(current_mp != NULL); 13803 13804 ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, 13805 NEW_OP, NULL); 13806 13807 ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && 13808 new_ipsq->ipsq_xopq_mphead != NULL); 13809 13810 /* 13811 * move from old ipsq to the new ipsq. 13812 */ 13813 new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; 13814 if (old_ipsq->ipsq_xopq_mphead != NULL) 13815 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; 13816 13817 old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; 13818 } 13819 13820 void 13821 ill_group_cleanup(ill_t *ill) 13822 { 13823 ill_t *ill_v4; 13824 ill_t *ill_v6; 13825 ipif_t *ipif; 13826 13827 ill_v4 = ill->ill_phyint->phyint_illv4; 13828 ill_v6 = ill->ill_phyint->phyint_illv6; 13829 13830 if (ill_v4 != NULL) { 13831 mutex_enter(&ill_v4->ill_lock); 13832 for (ipif = ill_v4->ill_ipif; ipif != NULL; 13833 ipif = ipif->ipif_next) { 13834 IPIF_UNMARK_MOVING(ipif); 13835 } 13836 ill_v4->ill_up_ipifs = B_FALSE; 13837 mutex_exit(&ill_v4->ill_lock); 13838 } 13839 13840 if (ill_v6 != NULL) { 13841 mutex_enter(&ill_v6->ill_lock); 13842 for (ipif = ill_v6->ill_ipif; ipif != NULL; 13843 ipif = ipif->ipif_next) { 13844 IPIF_UNMARK_MOVING(ipif); 13845 } 13846 ill_v6->ill_up_ipifs = B_FALSE; 13847 mutex_exit(&ill_v6->ill_lock); 13848 } 13849 } 13850 /* 13851 * This function is called when an ill has had a change in its group status 13852 * to bring up all the ipifs that were up before the change. 13853 */ 13854 int 13855 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 13856 { 13857 ipif_t *ipif; 13858 ill_t *ill_v4; 13859 ill_t *ill_v6; 13860 ill_t *from_ill; 13861 int err = 0; 13862 13863 13864 ASSERT(IAM_WRITER_ILL(ill)); 13865 13866 /* 13867 * Except for ipif_state_flags and ill_state_flags the other 13868 * fields of the ipif/ill that are modified below are protected 13869 * implicitly since we are a writer. We would have tried to down 13870 * even an ipif that was already down, in ill_down_ipifs. So we 13871 * just blindly clear the IPIF_CHANGING flag here on all ipifs. 13872 */ 13873 ill_v4 = ill->ill_phyint->phyint_illv4; 13874 ill_v6 = ill->ill_phyint->phyint_illv6; 13875 if (ill_v4 != NULL) { 13876 ill_v4->ill_up_ipifs = B_TRUE; 13877 for (ipif = ill_v4->ill_ipif; ipif != NULL; 13878 ipif = ipif->ipif_next) { 13879 mutex_enter(&ill_v4->ill_lock); 13880 ipif->ipif_state_flags &= ~IPIF_CHANGING; 13881 IPIF_UNMARK_MOVING(ipif); 13882 mutex_exit(&ill_v4->ill_lock); 13883 if (ipif->ipif_was_up) { 13884 if (!(ipif->ipif_flags & IPIF_UP)) 13885 err = ipif_up(ipif, q, mp); 13886 ipif->ipif_was_up = B_FALSE; 13887 if (err != 0) { 13888 /* 13889 * Can there be any other error ? 
13890 */ 13891 ASSERT(err == EINPROGRESS); 13892 return (err); 13893 } 13894 } 13895 } 13896 mutex_enter(&ill_v4->ill_lock); 13897 ill_v4->ill_state_flags &= ~ILL_CHANGING; 13898 mutex_exit(&ill_v4->ill_lock); 13899 ill_v4->ill_up_ipifs = B_FALSE; 13900 if (ill_v4->ill_move_in_progress) { 13901 ASSERT(ill_v4->ill_move_peer != NULL); 13902 ill_v4->ill_move_in_progress = B_FALSE; 13903 from_ill = ill_v4->ill_move_peer; 13904 from_ill->ill_move_in_progress = B_FALSE; 13905 from_ill->ill_move_peer = NULL; 13906 mutex_enter(&from_ill->ill_lock); 13907 from_ill->ill_state_flags &= ~ILL_CHANGING; 13908 mutex_exit(&from_ill->ill_lock); 13909 if (ill_v6 == NULL) { 13910 if (from_ill->ill_phyint->phyint_flags & 13911 PHYI_STANDBY) { 13912 phyint_inactive(from_ill->ill_phyint); 13913 } 13914 if (ill_v4->ill_phyint->phyint_flags & 13915 PHYI_STANDBY) { 13916 phyint_inactive(ill_v4->ill_phyint); 13917 } 13918 } 13919 ill_v4->ill_move_peer = NULL; 13920 } 13921 } 13922 13923 if (ill_v6 != NULL) { 13924 ill_v6->ill_up_ipifs = B_TRUE; 13925 for (ipif = ill_v6->ill_ipif; ipif != NULL; 13926 ipif = ipif->ipif_next) { 13927 mutex_enter(&ill_v6->ill_lock); 13928 ipif->ipif_state_flags &= ~IPIF_CHANGING; 13929 IPIF_UNMARK_MOVING(ipif); 13930 mutex_exit(&ill_v6->ill_lock); 13931 if (ipif->ipif_was_up) { 13932 if (!(ipif->ipif_flags & IPIF_UP)) 13933 err = ipif_up(ipif, q, mp); 13934 ipif->ipif_was_up = B_FALSE; 13935 if (err != 0) { 13936 /* 13937 * Can there be any other error ? 13938 */ 13939 ASSERT(err == EINPROGRESS); 13940 return (err); 13941 } 13942 } 13943 } 13944 mutex_enter(&ill_v6->ill_lock); 13945 ill_v6->ill_state_flags &= ~ILL_CHANGING; 13946 mutex_exit(&ill_v6->ill_lock); 13947 ill_v6->ill_up_ipifs = B_FALSE; 13948 if (ill_v6->ill_move_in_progress) { 13949 ASSERT(ill_v6->ill_move_peer != NULL); 13950 ill_v6->ill_move_in_progress = B_FALSE; 13951 from_ill = ill_v6->ill_move_peer; 13952 from_ill->ill_move_in_progress = B_FALSE; 13953 from_ill->ill_move_peer = NULL; 13954 mutex_enter(&from_ill->ill_lock); 13955 from_ill->ill_state_flags &= ~ILL_CHANGING; 13956 mutex_exit(&from_ill->ill_lock); 13957 if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 13958 phyint_inactive(from_ill->ill_phyint); 13959 } 13960 if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { 13961 phyint_inactive(ill_v6->ill_phyint); 13962 } 13963 ill_v6->ill_move_peer = NULL; 13964 } 13965 } 13966 return (0); 13967 } 13968 13969 /* 13970 * bring down all the approriate ipifs. 13971 */ 13972 /* ARGSUSED */ 13973 static void 13974 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) 13975 { 13976 ipif_t *ipif; 13977 13978 ASSERT(IAM_WRITER_ILL(ill)); 13979 13980 /* 13981 * Except for ipif_state_flags the other fields of the ipif/ill that 13982 * are modified below are protected implicitly since we are a writer 13983 */ 13984 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13985 if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) 13986 continue; 13987 if (index == 0 || index == ipif->ipif_orig_ifindex) { 13988 /* 13989 * We go through the ipif_down logic even if the ipif 13990 * is already down, since routes can be added based 13991 * on down ipifs. Going through ipif_down once again 13992 * will delete any IREs created based on these routes. 13993 */ 13994 if (ipif->ipif_flags & IPIF_UP) 13995 ipif->ipif_was_up = B_TRUE; 13996 /* 13997 * If called with chk_nofailover true ipif is moving. 
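			 * It is therefore marked IPIF_MOVING (in addition
			 * to IPIF_CHANGING) below.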
13998 */ 13999 mutex_enter(&ill->ill_lock); 14000 if (chk_nofailover) { 14001 ipif->ipif_state_flags |= 14002 IPIF_MOVING | IPIF_CHANGING; 14003 } else { 14004 ipif->ipif_state_flags |= IPIF_CHANGING; 14005 } 14006 mutex_exit(&ill->ill_lock); 14007 /* 14008 * Need to re-create net/subnet bcast ires if 14009 * they are dependent on ipif. 14010 */ 14011 if (!ipif->ipif_isv6) 14012 ipif_check_bcast_ires(ipif); 14013 (void) ipif_logical_down(ipif, NULL, NULL); 14014 ipif_non_duplicate(ipif); 14015 ipif_down_tail(ipif); 14016 /* 14017 * We don't do ipif_multicast_down for IPv4 in 14018 * ipif_down. We need to set this so that 14019 * ipif_multicast_up will join the 14020 * ALLHOSTS_GROUP on to_ill. 14021 */ 14022 ipif->ipif_multicast_up = B_FALSE; 14023 } 14024 } 14025 } 14026 14027 #define IPSQ_INC_REF(ipsq) { \ 14028 ASSERT(RW_WRITE_HELD(&ill_g_lock)); \ 14029 (ipsq)->ipsq_refs++; \ 14030 } 14031 14032 #define IPSQ_DEC_REF(ipsq) { \ 14033 ASSERT(RW_WRITE_HELD(&ill_g_lock)); \ 14034 (ipsq)->ipsq_refs--; \ 14035 if ((ipsq)->ipsq_refs == 0) \ 14036 (ipsq)->ipsq_name[0] = '\0'; \ 14037 } 14038 14039 /* 14040 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14041 * new_ipsq. 14042 */ 14043 static void 14044 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq) 14045 { 14046 phyint_t *phyint; 14047 phyint_t *next_phyint; 14048 14049 /* 14050 * To change the ipsq of an ill, we need to hold the ill_g_lock as 14051 * writer and the ill_lock of the ill in question. Also the dest 14052 * ipsq can't vanish while we hold the ill_g_lock as writer. 14053 */ 14054 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14055 14056 phyint = cur_ipsq->ipsq_phyint_list; 14057 cur_ipsq->ipsq_phyint_list = NULL; 14058 while (phyint != NULL) { 14059 next_phyint = phyint->phyint_ipsq_next; 14060 IPSQ_DEC_REF(cur_ipsq); 14061 phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; 14062 new_ipsq->ipsq_phyint_list = phyint; 14063 IPSQ_INC_REF(new_ipsq); 14064 phyint->phyint_ipsq = new_ipsq; 14065 phyint = next_phyint; 14066 } 14067 } 14068 14069 #define SPLIT_SUCCESS 0 14070 #define SPLIT_NOT_NEEDED 1 14071 #define SPLIT_FAILED 2 14072 14073 int 14074 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry) 14075 { 14076 ipsq_t *newipsq = NULL; 14077 14078 /* 14079 * Assertions denote pre-requisites for changing the ipsq of 14080 * a phyint 14081 */ 14082 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14083 /* 14084 * <ill-phyint> assocs can't change while ill_g_lock 14085 * is held as writer. See ill_phyint_reinit() 14086 */ 14087 ASSERT(phyint->phyint_illv4 == NULL || 14088 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14089 ASSERT(phyint->phyint_illv6 == NULL || 14090 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14091 14092 if ((phyint->phyint_groupname_len != 14093 (strlen(cur_ipsq->ipsq_name) + 1) || 14094 bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, 14095 phyint->phyint_groupname_len) != 0)) { 14096 /* 14097 * Once we fail in creating a new ipsq due to memory shortage, 14098 * don't attempt to create new ipsq again, based on another 14099 * phyint, since we want all phyints belonging to an IPMP group 14100 * to be in the same ipsq even in the event of mem alloc fails. 
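		 * Concretely, once need_retry is set, the lookup below is
		 * made with create == B_FALSE:
		 *	ip_ipsq_lookup(phyint->phyint_groupname, B_FALSE,
		 *	    cur_ipsq);
		 * so no further allocation is attempted on this pass.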
14101 */ 14102 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, 14103 cur_ipsq); 14104 if (newipsq == NULL) { 14105 /* Memory allocation failure */ 14106 return (SPLIT_FAILED); 14107 } else { 14108 /* ipsq_refs protected by ill_g_lock (writer) */ 14109 IPSQ_DEC_REF(cur_ipsq); 14110 phyint->phyint_ipsq = newipsq; 14111 phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; 14112 newipsq->ipsq_phyint_list = phyint; 14113 IPSQ_INC_REF(newipsq); 14114 return (SPLIT_SUCCESS); 14115 } 14116 } 14117 return (SPLIT_NOT_NEEDED); 14118 } 14119 14120 /* 14121 * The ill locks of the phyint and the ill_g_lock (writer) must be held 14122 * to do this split 14123 */ 14124 static int 14125 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq) 14126 { 14127 ipsq_t *newipsq; 14128 14129 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14130 /* 14131 * <ill-phyint> assocs can't change while ill_g_lock 14132 * is held as writer. See ill_phyint_reinit() 14133 */ 14134 14135 ASSERT(phyint->phyint_illv4 == NULL || 14136 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14137 ASSERT(phyint->phyint_illv6 == NULL || 14138 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14139 14140 if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 14141 phyint->phyint_illv4: phyint->phyint_illv6)) { 14142 /* 14143 * ipsq_init failed due to no memory 14144 * caller will use the same ipsq 14145 */ 14146 return (SPLIT_FAILED); 14147 } 14148 14149 /* ipsq_ref is protected by ill_g_lock (writer) */ 14150 IPSQ_DEC_REF(cur_ipsq); 14151 14152 /* 14153 * This is a new ipsq that is unknown to the world. 14154 * So we don't need to hold ipsq_lock, 14155 */ 14156 newipsq = phyint->phyint_ipsq; 14157 newipsq->ipsq_writer = NULL; 14158 newipsq->ipsq_reentry_cnt--; 14159 ASSERT(newipsq->ipsq_reentry_cnt == 0); 14160 #ifdef ILL_DEBUG 14161 newipsq->ipsq_depth = 0; 14162 #endif 14163 14164 return (SPLIT_SUCCESS); 14165 } 14166 14167 /* 14168 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14169 * ipsq's representing their individual groups or themselves. Return 14170 * whether split needs to be retried again later. 14171 */ 14172 static boolean_t 14173 ill_split_ipsq(ipsq_t *cur_ipsq) 14174 { 14175 phyint_t *phyint; 14176 phyint_t *next_phyint; 14177 int error; 14178 boolean_t need_retry = B_FALSE; 14179 14180 phyint = cur_ipsq->ipsq_phyint_list; 14181 cur_ipsq->ipsq_phyint_list = NULL; 14182 while (phyint != NULL) { 14183 next_phyint = phyint->phyint_ipsq_next; 14184 /* 14185 * 'created' will tell us whether the callee actually 14186 * created an ipsq. Lack of memory may force the callee 14187 * to return without creating an ipsq. 14188 */ 14189 if (phyint->phyint_groupname == NULL) { 14190 error = ill_split_to_own_ipsq(phyint, cur_ipsq); 14191 } else { 14192 error = ill_split_to_grp_ipsq(phyint, cur_ipsq, 14193 need_retry); 14194 } 14195 14196 switch (error) { 14197 case SPLIT_FAILED: 14198 need_retry = B_TRUE; 14199 /* FALLTHRU */ 14200 case SPLIT_NOT_NEEDED: 14201 /* 14202 * Keep it on the list. 14203 */ 14204 phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; 14205 cur_ipsq->ipsq_phyint_list = phyint; 14206 break; 14207 case SPLIT_SUCCESS: 14208 break; 14209 default: 14210 ASSERT(0); 14211 } 14212 14213 phyint = next_phyint; 14214 } 14215 return (need_retry); 14216 } 14217 14218 /* 14219 * given an ipsq 'ipsq' lock all ills associated with this ipsq. 14220 * and return the ills in the list. This list will be 14221 * needed to unlock all the ills later on by the caller. 
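 *
 * A typical lock/unlock pairing, as in ill_merge_groups() below (sketch):
 *
 *	rw_enter(&ill_g_lock, RW_WRITER);
 *	cnt = ill_lock_ipsq_ills(ipsq, list, list_max);
 *	(change the <ill-ipsq> associations)
 *	ill_unlock_ills(list, cnt);
 *	rw_exit(&ill_g_lock);
 *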
14222 * The <ill-ipsq> associations could change between the 14223 * lock and unlock. Hence the unlock can't traverse the 14224 * ipsq to get the list of ills. 14225 */ 14226 static int 14227 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) 14228 { 14229 int cnt = 0; 14230 phyint_t *phyint; 14231 14232 /* 14233 * The caller holds ill_g_lock to ensure that the ill memberships 14234 * of the ipsq don't change 14235 */ 14236 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 14237 14238 phyint = ipsq->ipsq_phyint_list; 14239 while (phyint != NULL) { 14240 if (phyint->phyint_illv4 != NULL) { 14241 ASSERT(cnt < list_max); 14242 list[cnt++] = phyint->phyint_illv4; 14243 } 14244 if (phyint->phyint_illv6 != NULL) { 14245 ASSERT(cnt < list_max); 14246 list[cnt++] = phyint->phyint_illv6; 14247 } 14248 phyint = phyint->phyint_ipsq_next; 14249 } 14250 ill_lock_ills(list, cnt); 14251 return (cnt); 14252 } 14253 14254 void 14255 ill_lock_ills(ill_t **list, int cnt) 14256 { 14257 int i; 14258 14259 if (cnt > 1) { 14260 boolean_t try_again; 14261 do { 14262 try_again = B_FALSE; 14263 for (i = 0; i < cnt - 1; i++) { 14264 if (list[i] < list[i + 1]) { 14265 ill_t *tmp; 14266 14267 /* swap the elements */ 14268 tmp = list[i]; 14269 list[i] = list[i + 1]; 14270 list[i + 1] = tmp; 14271 try_again = B_TRUE; 14272 } 14273 } 14274 } while (try_again); 14275 } 14276 14277 for (i = 0; i < cnt; i++) { 14278 if (i == 0) { 14279 if (list[i] != NULL) 14280 mutex_enter(&list[i]->ill_lock); 14281 else 14282 return; 14283 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14284 mutex_enter(&list[i]->ill_lock); 14285 } 14286 } 14287 } 14288 14289 void 14290 ill_unlock_ills(ill_t **list, int cnt) 14291 { 14292 int i; 14293 14294 for (i = 0; i < cnt; i++) { 14295 if ((i == 0) && (list[i] != NULL)) { 14296 mutex_exit(&list[i]->ill_lock); 14297 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14298 mutex_exit(&list[i]->ill_lock); 14299 } 14300 } 14301 } 14302 14303 /* 14304 * Merge all the ills from 1 ipsq group into another ipsq group. 14305 * The source ipsq group is specified by the ipsq associated with 14306 * 'from_ill'. The destination ipsq group is specified by the ipsq 14307 * associated with 'to_ill' or 'groupname' respectively. 14308 * Note that ipsq itself does not have a reference count mechanism 14309 * and functions don't look up an ipsq and pass it around. Instead 14310 * functions pass around an ill or groupname, and the ipsq is looked 14311 * up from the ill or groupname and the required operation performed 14312 * atomically with the lookup on the ipsq. 14313 */ 14314 static int 14315 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, 14316 queue_t *q) 14317 { 14318 ipsq_t *old_ipsq; 14319 ipsq_t *new_ipsq; 14320 ill_t **ill_list; 14321 int cnt; 14322 size_t ill_list_size; 14323 boolean_t became_writer_on_new_sq = B_FALSE; 14324 14325 /* Exactly 1 of 'to_ill' and groupname can be specified. */ 14326 ASSERT((to_ill != NULL) ^ (groupname != NULL)); 14327 14328 /* 14329 * Need to hold ill_g_lock as writer and also the ill_lock to 14330 * change the <ill-ipsq> assoc of an ill. Need to hold the 14331 * ipsq_lock to prevent new messages from landing on an ipsq. 
14332 */ 14333 rw_enter(&ill_g_lock, RW_WRITER); 14334 14335 old_ipsq = from_ill->ill_phyint->phyint_ipsq; 14336 if (groupname != NULL) 14337 new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL); 14338 else { 14339 new_ipsq = to_ill->ill_phyint->phyint_ipsq; 14340 } 14341 14342 ASSERT(old_ipsq != NULL && new_ipsq != NULL); 14343 14344 /* 14345 * both groups are on the same ipsq. 14346 */ 14347 if (old_ipsq == new_ipsq) { 14348 rw_exit(&ill_g_lock); 14349 return (0); 14350 } 14351 14352 cnt = old_ipsq->ipsq_refs << 1; 14353 ill_list_size = cnt * sizeof (ill_t *); 14354 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 14355 if (ill_list == NULL) { 14356 rw_exit(&ill_g_lock); 14357 return (ENOMEM); 14358 } 14359 cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); 14360 14361 /* Need ipsq lock to enque messages on new ipsq or to become writer */ 14362 mutex_enter(&new_ipsq->ipsq_lock); 14363 if ((new_ipsq->ipsq_writer == NULL && 14364 new_ipsq->ipsq_current_ipif == NULL) || 14365 (new_ipsq->ipsq_writer == curthread)) { 14366 new_ipsq->ipsq_writer = curthread; 14367 new_ipsq->ipsq_reentry_cnt++; 14368 became_writer_on_new_sq = B_TRUE; 14369 } 14370 14371 /* 14372 * We are holding ill_g_lock as writer and all the ill locks of 14373 * the old ipsq. So the old_ipsq can't be looked up, and hence no new 14374 * message can land up on the old ipsq even though we don't hold the 14375 * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. 14376 */ 14377 ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); 14378 14379 /* 14380 * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. 14381 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> 14382 * assocs. till we release the ill_g_lock, and hence it can't vanish. 14383 */ 14384 ill_merge_ipsq(old_ipsq, new_ipsq); 14385 14386 /* 14387 * Mark the new ipsq as needing a split since it is currently 14388 * being shared by more than 1 IPMP group. The split will 14389 * occur at the end of ipsq_exit 14390 */ 14391 new_ipsq->ipsq_split = B_TRUE; 14392 14393 /* Now release all the locks */ 14394 mutex_exit(&new_ipsq->ipsq_lock); 14395 ill_unlock_ills(ill_list, cnt); 14396 rw_exit(&ill_g_lock); 14397 14398 kmem_free(ill_list, ill_list_size); 14399 14400 /* 14401 * If we succeeded in becoming writer on the new ipsq, then 14402 * drain the new ipsq and start processing all enqueued messages 14403 * including the current ioctl we are processing which is either 14404 * a set groupname or failover/failback. 14405 */ 14406 if (became_writer_on_new_sq) 14407 ipsq_exit(new_ipsq, B_TRUE, B_TRUE); 14408 14409 /* 14410 * syncq has been changed and all the messages have been moved. 14411 */ 14412 mutex_enter(&old_ipsq->ipsq_lock); 14413 old_ipsq->ipsq_current_ipif = NULL; 14414 mutex_exit(&old_ipsq->ipsq_lock); 14415 return (EINPROGRESS); 14416 } 14417 14418 /* 14419 * Delete and add the loopback copy and non-loopback copy of 14420 * the BROADCAST ire corresponding to ill and addr. Used to 14421 * group broadcast ires together when ill becomes part of 14422 * a group. 14423 * 14424 * This function is also called when ill is leaving the group 14425 * so that the ires belonging to the group gets re-grouped. 
 */
static void
ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
{
	ire_t *ire, *nire, *nire_next, *ire_head = NULL;
	ire_t **ire_ptpn = &ire_head;

	/*
	 * The loopback and non-loopback IREs are inserted in the order in
	 * which they're found, on the basis that they are correctly ordered
	 * (loopback first).
	 */
	for (;;) {
		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
		    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);
		if (ire == NULL)
			break;

		/*
		 * We are passing in KM_SLEEP because it is not easy to
		 * go back to a sane state in case of memory failure.
		 */
		nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
		ASSERT(nire != NULL);
		bzero(nire, sizeof (ire_t));
		/*
		 * Don't use ire_max_frag directly since we don't
		 * hold on to 'ire' until we add the new ire 'nire' and
		 * we don't want the new ire to have a dangling reference
		 * to 'ire'. The ire_max_frag of a broadcast ire must
		 * be in sync with the ipif_mtu of the associated ipif.
		 * E.g., this happens as a result of SIOCSLIFNAME,
		 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by
		 * the driver. A change in ire_max_frag triggered as
		 * a result of path MTU discovery, due to an
		 * IP_IOC_IRE_ADVISE_NOREPLY from the transport, or due to
		 * a "route change -mtu" command does not apply to
		 * broadcast ires.
		 *
		 * XXX We need a recovery strategy here if ire_init fails.
		 */
		if (ire_init(nire,
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ire->ire_mask,
		    (uchar_t *)&ire->ire_src_addr,
		    (uchar_t *)&ire->ire_gateway_addr,
		    (uchar_t *)&ire->ire_in_src_addr,
		    ire->ire_stq == NULL ? &ip_loopback_mtu :
		    &ire->ire_ipif->ipif_mtu,
		    (ire->ire_nce != NULL ? ire->ire_nce->nce_fp_mp : NULL),
		    ire->ire_rfq,
		    ire->ire_stq,
		    ire->ire_type,
		    (ire->ire_nce != NULL ? ire->ire_nce->nce_res_mp : NULL),
		    ire->ire_ipif,
		    ire->ire_in_ill,
		    ire->ire_cmask,
		    ire->ire_phandle,
		    ire->ire_ihandle,
		    ire->ire_flags,
		    &ire->ire_uinfo,
		    NULL,
		    NULL) == NULL) {
			cmn_err(CE_PANIC, "ire_init() failed");
		}
		ire_delete(ire);
		ire_refrele(ire);

		/*
		 * The newly created IREs are inserted at the tail of the list
		 * starting with ire_head. As we've just allocated them no one
		 * knows about them so it's safe.
		 */
		*ire_ptpn = nire;
		ire_ptpn = &nire->ire_next;
	}

	for (nire = ire_head; nire != NULL; nire = nire_next) {
		int error;
		ire_t *oire;
		/* unlink the IRE from our list before calling ire_add() */
		nire_next = nire->ire_next;
		nire->ire_next = NULL;

		/* ire_add adds the ire at the right place in the list */
		oire = nire;
		error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
		ASSERT(error == 0);
		ASSERT(oire == nire);
		ire_refrele(nire);	/* Held in ire_add */
	}
}

/*
 * This function is usually called when an ill is inserted in
 * a group and all the ipifs are already UP. As all the ipifs
 * are already UP, the broadcast ires have already been created
 * and been inserted. But, ire_add_v4 would not have grouped properly.
14523 * We need to re-group for the benefit of ip_wput_ire which 14524 * expects BROADCAST ires to be grouped properly to avoid sending 14525 * more than one copy of the broadcast packet per group. 14526 * 14527 * NOTE : We don't check for ill_ipif_up_count to be non-zero here 14528 * because when ipif_up_done ends up calling this, ires have 14529 * already been added before illgrp_insert i.e before ill_group 14530 * has been initialized. 14531 */ 14532 static void 14533 ill_group_bcast_for_xmit(ill_t *ill) 14534 { 14535 ill_group_t *illgrp; 14536 ipif_t *ipif; 14537 ipaddr_t addr; 14538 ipaddr_t net_mask; 14539 ipaddr_t subnet_netmask; 14540 14541 illgrp = ill->ill_group; 14542 14543 /* 14544 * This function is called even when an ill is deleted from 14545 * the group. Hence, illgrp could be null. 14546 */ 14547 if (illgrp != NULL && illgrp->illgrp_ill_count == 1) 14548 return; 14549 14550 /* 14551 * Delete all the BROADCAST ires matching this ill and add 14552 * them back. This time, ire_add_v4 should take care of 14553 * grouping them with others because ill is part of the 14554 * group. 14555 */ 14556 ill_bcast_delete_and_add(ill, 0); 14557 ill_bcast_delete_and_add(ill, INADDR_BROADCAST); 14558 14559 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14560 14561 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14562 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14563 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14564 } else { 14565 net_mask = htonl(IN_CLASSA_NET); 14566 } 14567 addr = net_mask & ipif->ipif_subnet; 14568 ill_bcast_delete_and_add(ill, addr); 14569 ill_bcast_delete_and_add(ill, ~net_mask | addr); 14570 14571 subnet_netmask = ipif->ipif_net_mask; 14572 addr = ipif->ipif_subnet; 14573 ill_bcast_delete_and_add(ill, addr); 14574 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); 14575 } 14576 } 14577 14578 /* 14579 * This function is called from illgrp_delete when ill is being deleted 14580 * from the group. 14581 * 14582 * As ill is not there in the group anymore, any address belonging 14583 * to this ill should be cleared of IRE_MARK_NORECV. 14584 */ 14585 static void 14586 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) 14587 { 14588 ire_t *ire; 14589 irb_t *irb; 14590 14591 ASSERT(ill->ill_group == NULL); 14592 14593 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 14594 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL); 14595 14596 if (ire != NULL) { 14597 /* 14598 * IPMP and plumbing operations are serialized on the ipsq, so 14599 * no one will insert or delete a broadcast ire under our feet. 14600 */ 14601 irb = ire->ire_bucket; 14602 rw_enter(&irb->irb_lock, RW_READER); 14603 ire_refrele(ire); 14604 14605 for (; ire != NULL; ire = ire->ire_next) { 14606 if (ire->ire_addr != addr) 14607 break; 14608 if (ire_to_ill(ire) != ill) 14609 continue; 14610 14611 ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); 14612 ire->ire_marks &= ~IRE_MARK_NORECV; 14613 } 14614 rw_exit(&irb->irb_lock); 14615 } 14616 } 14617 14618 /* 14619 * This function must be called only after the broadcast ires 14620 * have been grouped together. For a given address addr, nominate 14621 * only one of the ires whose interface is not FAILED or OFFLINE. 14622 * 14623 * This is also called when an ipif goes down, so that we can nominate 14624 * a different ire with the same address for receiving. 
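 *
 * The invariant established for each broadcast address can be pictured
 * as (illustrative):
 *
 *	first usable ire	IRE_MARK_NORECV clear	receives
 *	all other ires		IRE_MARK_NORECV set	dropped by ip_rput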
14625 */ 14626 static void 14627 ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr) 14628 { 14629 irb_t *irb; 14630 ire_t *ire; 14631 ire_t *ire1; 14632 ire_t *save_ire; 14633 ire_t **irep = NULL; 14634 boolean_t first = B_TRUE; 14635 ire_t *clear_ire = NULL; 14636 ire_t *start_ire = NULL; 14637 ire_t *new_lb_ire; 14638 ire_t *new_nlb_ire; 14639 boolean_t new_lb_ire_used = B_FALSE; 14640 boolean_t new_nlb_ire_used = B_FALSE; 14641 uint64_t match_flags; 14642 uint64_t phyi_flags; 14643 boolean_t fallback = B_FALSE; 14644 14645 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, 14646 NULL, MATCH_IRE_TYPE); 14647 /* 14648 * We may not be able to find some ires if a previous 14649 * ire_create failed. This happens when an ipif goes 14650 * down and we are unable to create BROADCAST ires due 14651 * to memory failure. Thus, we have to check for NULL 14652 * below. This should handle the case for LOOPBACK, 14653 * POINTOPOINT and interfaces with some POINTOPOINT 14654 * logicals for which there are no BROADCAST ires. 14655 */ 14656 if (ire == NULL) 14657 return; 14658 /* 14659 * Currently IRE_BROADCASTS are deleted when an ipif 14660 * goes down which runs exclusively. Thus, setting 14661 * IRE_MARK_RCVD should not race with ire_delete marking 14662 * IRE_MARK_CONDEMNED. We grab the lock below just to 14663 * be consistent with other parts of the code that walks 14664 * a given bucket. 14665 */ 14666 save_ire = ire; 14667 irb = ire->ire_bucket; 14668 new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 14669 if (new_lb_ire == NULL) { 14670 ire_refrele(ire); 14671 return; 14672 } 14673 new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 14674 if (new_nlb_ire == NULL) { 14675 ire_refrele(ire); 14676 kmem_cache_free(ire_cache, new_lb_ire); 14677 return; 14678 } 14679 IRB_REFHOLD(irb); 14680 rw_enter(&irb->irb_lock, RW_WRITER); 14681 /* 14682 * Get to the first ire matching the address and the 14683 * group. If the address does not match we are done 14684 * as we could not find the IRE. If the address matches 14685 * we should get to the first one matching the group. 14686 */ 14687 while (ire != NULL) { 14688 if (ire->ire_addr != addr || 14689 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 14690 break; 14691 } 14692 ire = ire->ire_next; 14693 } 14694 match_flags = PHYI_FAILED | PHYI_INACTIVE; 14695 start_ire = ire; 14696 redo: 14697 while (ire != NULL && ire->ire_addr == addr && 14698 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 14699 /* 14700 * The first ire for any address within a group 14701 * should always be the one with IRE_MARK_NORECV cleared 14702 * so that ip_wput_ire can avoid searching for one. 14703 * Note down the insertion point which will be used 14704 * later. 14705 */ 14706 if (first && (irep == NULL)) 14707 irep = ire->ire_ptpn; 14708 /* 14709 * PHYI_FAILED is set when the interface fails. 14710 * This interface might have become good, but the 14711 * daemon has not yet detected. We should still 14712 * not receive on this. PHYI_OFFLINE should never 14713 * be picked as this has been offlined and soon 14714 * be removed. 
14715 */ 14716 phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; 14717 if (phyi_flags & PHYI_OFFLINE) { 14718 ire->ire_marks |= IRE_MARK_NORECV; 14719 ire = ire->ire_next; 14720 continue; 14721 } 14722 if (phyi_flags & match_flags) { 14723 ire->ire_marks |= IRE_MARK_NORECV; 14724 ire = ire->ire_next; 14725 if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == 14726 PHYI_INACTIVE) { 14727 fallback = B_TRUE; 14728 } 14729 continue; 14730 } 14731 if (first) { 14732 /* 14733 * We will move this to the front of the list later 14734 * on. 14735 */ 14736 clear_ire = ire; 14737 ire->ire_marks &= ~IRE_MARK_NORECV; 14738 } else { 14739 ire->ire_marks |= IRE_MARK_NORECV; 14740 } 14741 first = B_FALSE; 14742 ire = ire->ire_next; 14743 } 14744 /* 14745 * If we never nominated anybody, try nominating at least 14746 * an INACTIVE, if we found one. Do it only once though. 14747 */ 14748 if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && 14749 fallback) { 14750 match_flags = PHYI_FAILED; 14751 ire = start_ire; 14752 irep = NULL; 14753 goto redo; 14754 } 14755 ire_refrele(save_ire); 14756 14757 /* 14758 * irep non-NULL indicates that we entered the while loop 14759 * above. If clear_ire is at the insertion point, we don't 14760 * have to do anything. clear_ire will be NULL if all the 14761 * interfaces are failed. 14762 * 14763 * We cannot unlink and reinsert the ire at the right place 14764 * in the list since there can be other walkers of this bucket. 14765 * Instead we delete and recreate the ire 14766 */ 14767 if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { 14768 ire_t *clear_ire_stq = NULL; 14769 mblk_t *fp_mp = NULL, *res_mp = NULL; 14770 14771 bzero(new_lb_ire, sizeof (ire_t)); 14772 if (clear_ire->ire_nce != NULL) { 14773 fp_mp = clear_ire->ire_nce->nce_fp_mp; 14774 res_mp = clear_ire->ire_nce->nce_res_mp; 14775 } 14776 /* XXX We need a recovery strategy here. */ 14777 if (ire_init(new_lb_ire, 14778 (uchar_t *)&clear_ire->ire_addr, 14779 (uchar_t *)&clear_ire->ire_mask, 14780 (uchar_t *)&clear_ire->ire_src_addr, 14781 (uchar_t *)&clear_ire->ire_gateway_addr, 14782 (uchar_t *)&clear_ire->ire_in_src_addr, 14783 &clear_ire->ire_max_frag, 14784 fp_mp, 14785 clear_ire->ire_rfq, 14786 clear_ire->ire_stq, 14787 clear_ire->ire_type, 14788 res_mp, 14789 clear_ire->ire_ipif, 14790 clear_ire->ire_in_ill, 14791 clear_ire->ire_cmask, 14792 clear_ire->ire_phandle, 14793 clear_ire->ire_ihandle, 14794 clear_ire->ire_flags, 14795 &clear_ire->ire_uinfo, 14796 NULL, 14797 NULL) == NULL) 14798 cmn_err(CE_PANIC, "ire_init() failed"); 14799 if (clear_ire->ire_stq == NULL) { 14800 ire_t *ire_next = clear_ire->ire_next; 14801 if (ire_next != NULL && 14802 ire_next->ire_stq != NULL && 14803 ire_next->ire_addr == clear_ire->ire_addr && 14804 ire_next->ire_ipif->ipif_ill == 14805 clear_ire->ire_ipif->ipif_ill) { 14806 clear_ire_stq = ire_next; 14807 14808 bzero(new_nlb_ire, sizeof (ire_t)); 14809 if (clear_ire_stq->ire_nce != NULL) { 14810 fp_mp = 14811 clear_ire_stq->ire_nce->nce_fp_mp; 14812 res_mp = 14813 clear_ire_stq->ire_nce->nce_res_mp; 14814 } else { 14815 fp_mp = res_mp = NULL; 14816 } 14817 /* XXX We need a recovery strategy here. 
*/ 14818 if (ire_init(new_nlb_ire, 14819 (uchar_t *)&clear_ire_stq->ire_addr, 14820 (uchar_t *)&clear_ire_stq->ire_mask, 14821 (uchar_t *)&clear_ire_stq->ire_src_addr, 14822 (uchar_t *)&clear_ire_stq->ire_gateway_addr, 14823 (uchar_t *)&clear_ire_stq->ire_in_src_addr, 14824 &clear_ire_stq->ire_max_frag, 14825 fp_mp, 14826 clear_ire_stq->ire_rfq, 14827 clear_ire_stq->ire_stq, 14828 clear_ire_stq->ire_type, 14829 res_mp, 14830 clear_ire_stq->ire_ipif, 14831 clear_ire_stq->ire_in_ill, 14832 clear_ire_stq->ire_cmask, 14833 clear_ire_stq->ire_phandle, 14834 clear_ire_stq->ire_ihandle, 14835 clear_ire_stq->ire_flags, 14836 &clear_ire_stq->ire_uinfo, 14837 NULL, 14838 NULL) == NULL) 14839 cmn_err(CE_PANIC, "ire_init() failed"); 14840 } 14841 } 14842 14843 /* 14844 * Delete the ire. We can't call ire_delete() since 14845 * we are holding the bucket lock. We can't release the 14846 * bucket lock since we can't allow irep to change. So just 14847 * mark it CONDEMNED. The IRB_REFRELE will delete the 14848 * ire from the list and do the refrele. 14849 */ 14850 clear_ire->ire_marks |= IRE_MARK_CONDEMNED; 14851 irb->irb_marks |= IRB_MARK_CONDEMNED; 14852 14853 if (clear_ire_stq != NULL) { 14854 ire_fastpath_list_delete( 14855 (ill_t *)clear_ire_stq->ire_stq->q_ptr, 14856 clear_ire_stq); 14857 clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; 14858 } 14859 14860 /* 14861 * Also take care of otherfields like ib/ob pkt count 14862 * etc. Need to dup them. ditto in ill_bcast_delete_and_add 14863 */ 14864 14865 /* Add the new ire's. Insert at *irep */ 14866 new_lb_ire->ire_bucket = clear_ire->ire_bucket; 14867 ire1 = *irep; 14868 if (ire1 != NULL) 14869 ire1->ire_ptpn = &new_lb_ire->ire_next; 14870 new_lb_ire->ire_next = ire1; 14871 /* Link the new one in. */ 14872 new_lb_ire->ire_ptpn = irep; 14873 membar_producer(); 14874 *irep = new_lb_ire; 14875 new_lb_ire_used = B_TRUE; 14876 BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted); 14877 new_lb_ire->ire_bucket->irb_ire_cnt++; 14878 new_lb_ire->ire_ipif->ipif_ire_cnt++; 14879 14880 if (clear_ire_stq != NULL) { 14881 new_nlb_ire->ire_bucket = clear_ire->ire_bucket; 14882 irep = &new_lb_ire->ire_next; 14883 /* Add the new ire. Insert at *irep */ 14884 ire1 = *irep; 14885 if (ire1 != NULL) 14886 ire1->ire_ptpn = &new_nlb_ire->ire_next; 14887 new_nlb_ire->ire_next = ire1; 14888 /* Link the new one in. */ 14889 new_nlb_ire->ire_ptpn = irep; 14890 membar_producer(); 14891 *irep = new_nlb_ire; 14892 new_nlb_ire_used = B_TRUE; 14893 BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted); 14894 new_nlb_ire->ire_bucket->irb_ire_cnt++; 14895 new_nlb_ire->ire_ipif->ipif_ire_cnt++; 14896 ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++; 14897 } 14898 } 14899 rw_exit(&irb->irb_lock); 14900 if (!new_lb_ire_used) 14901 kmem_cache_free(ire_cache, new_lb_ire); 14902 if (!new_nlb_ire_used) 14903 kmem_cache_free(ire_cache, new_nlb_ire); 14904 IRB_REFRELE(irb); 14905 } 14906 14907 /* 14908 * Whenever an ipif goes down we have to renominate a different 14909 * broadcast ire to receive. Whenever an ipif comes up, we need 14910 * to make sure that we have only one nominated to receive. 
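 * The renomination below walks the same set of broadcast addresses
 * that ill_group_bcast_for_xmit() handles: 0.0.0.0, 255.255.255.255,
 * the network address and its broadcast, and the subnet address and
 * its broadcast, computed as in
 *	addr = net_mask & ipif->ipif_subnet;	(network)
 *	net_addr = ~net_mask | addr;		(network broadcast)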
 */
static void
ipif_renominate_bcast(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	ipaddr_t subnet_addr;
	ipaddr_t net_addr;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_netmask;
	ipaddr_t addr;
	ill_group_t *illgrp;

	illgrp = ill->ill_group;
	/*
	 * If this is the last ipif going down, it might take
	 * the ill out of the group. In that case ipif_down ->
	 * illgrp_delete takes care of doing the nomination.
	 * ipif_down does not call this function in that case.
	 */
	ASSERT(illgrp != NULL);

	/* There could not have been any ires associated with this */
	if (ipif->ipif_subnet == 0)
		return;

	ill_mark_bcast(illgrp, 0);
	ill_mark_bcast(illgrp, INADDR_BROADCAST);

	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET);
	}
	addr = net_mask & ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	net_addr = ~net_mask | addr;
	ill_mark_bcast(illgrp, net_addr);

	subnet_netmask = ipif->ipif_net_mask;
	addr = ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	subnet_addr = ~subnet_netmask | addr;
	ill_mark_bcast(illgrp, subnet_addr);
}

/*
 * Whenever we form or delete ill groups, we need to nominate one set of
 * BROADCAST ires for receiving in the group.
 *
 * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires
 * have been added, but ill_ipif_up_count is 0. Thus, we don't assert
 * for ill_ipif_up_count to be non-zero. This is the only case where
 * ill_ipif_up_count is zero and we would still find the ires.
 *
 * 2) When ip_sioctl_group_name/illgrp_insert calls this function, at least
 * one ipif is UP and we just have to do the nomination.
 *
 * 3) When ill_handoff_responsibility calls us, some ill has been removed
 * from the group. So, we have to do the nomination.
 *
 * Because of (3), there could be just one ill in the group. But we still
 * have to nominate, as IRE_MARK_NORECV may have been set on this ill's
 * ires. Thus, this function does not optimize when there is only one ill,
 * as that would not be correct for (3).
 */
static void
ill_nominate_bcast_rcv(ill_group_t *illgrp)
{
	ill_t *ill;
	ipif_t *ipif;
	ipaddr_t subnet_addr;
	ipaddr_t prev_subnet_addr = 0;
	ipaddr_t net_addr;
	ipaddr_t prev_net_addr = 0;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_netmask;
	ipaddr_t addr;

	/*
	 * When the last member is leaving, there is nothing to
	 * nominate.
	 */
	if (illgrp->illgrp_ill_count == 0) {
		ASSERT(illgrp->illgrp_ill == NULL);
		return;
	}

	ill = illgrp->illgrp_ill;
	ASSERT(!ill->ill_isv6);
	/*
	 * We assume that ires with the same address and belonging to the
	 * same group have been grouped together. Nominating a *single*
	 * ill in the group for sending and receiving broadcast is done
	 * by making sure that the first BROADCAST ire (which will be
	 * the one returned by ire_ctable_lookup for ip_rput and the
	 * one that will be used in ip_wput_ire) will be the one that
	 * will not have IRE_MARK_NORECV set.
15011  *
15012  * 1) ip_rput checks and discards packets received on ires marked
15013  *    with IRE_MARK_NORECV. Thus, we don't send up duplicate
15014  *    broadcast packets. We need to clear IRE_MARK_NORECV on the
15015  *    first ire in the group for every broadcast address in the group.
15016  *    ip_rput will accept packets only on the first ire, i.e. only
15017  *    one copy for the whole group.
15018  *
15019  * 2) ip_wput_ire needs to send out just one copy of the broadcast
15020  *    packet for the whole group. It needs to send out on the ill
15021  *    whose ire has not been marked with IRE_MARK_NORECV. If it sends
15022  *    on the one marked with IRE_MARK_NORECV, ip_rput will accept
15023  *    the copy echoed back on the other port where the ire is not marked
15024  *    with IRE_MARK_NORECV.
15025  *
15026  * Note that we just need to have the first IRE either loopback or
15027  * non-loopback (either of them may not exist if ire_create failed
15028  * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will
15029  * always hit the first one and hence will always accept one copy.
15030  *
15031  * We have a broadcast ire per ill for all the unique prefixes
15032  * hosted on that ill. As we don't have a way of knowing the
15033  * unique prefixes on a given ill and hence in the whole group,
15034  * we just call ill_mark_bcast on all the prefixes that exist
15035  * in the group. For the common case of one prefix, the code
15036  * below optimizes by remembering the last address used for
15037  * marking. In the case of multiple prefixes, this will still
15038  * optimize depending on the order of the prefixes.
15039  *
15040  * The only addresses unique across the whole group are 0.0.0.0 and
15041  * 255.255.255.255, and thus we call only once. ill_mark_bcast enables
15042  * the first ire in the bucket for receiving and disables the
15043  * others.
15044  */
15045 	ill_mark_bcast(illgrp, 0);
15046 	ill_mark_bcast(illgrp, INADDR_BROADCAST);
15047 	for (; ill != NULL; ill = ill->ill_group_next) {
15048 
15049 		for (ipif = ill->ill_ipif; ipif != NULL;
15050 		    ipif = ipif->ipif_next) {
15051 
15052 			if (!(ipif->ipif_flags & IPIF_UP) ||
15053 			    ipif->ipif_subnet == 0) {
15054 				continue;
15055 			}
15056 			if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
15057 			    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
15058 				net_mask = ip_net_mask(ipif->ipif_lcl_addr);
15059 			} else {
15060 				net_mask = htonl(IN_CLASSA_NET);
15061 			}
15062 			addr = net_mask & ipif->ipif_subnet;
15063 			if (prev_net_addr == 0 || prev_net_addr != addr) {
15064 				ill_mark_bcast(illgrp, addr);
15065 				net_addr = ~net_mask | addr;
15066 				ill_mark_bcast(illgrp, net_addr);
15067 			}
15068 			prev_net_addr = addr;
15069 
15070 			subnet_netmask = ipif->ipif_net_mask;
15071 			addr = ipif->ipif_subnet;
15072 			if (prev_subnet_addr == 0 ||
15073 			    prev_subnet_addr != addr) {
15074 				ill_mark_bcast(illgrp, addr);
15075 				subnet_addr = ~subnet_netmask | addr;
15076 				ill_mark_bcast(illgrp, subnet_addr);
15077 			}
15078 			prev_subnet_addr = addr;
15079 		}
15080 	}
15081 }
15082 
15083 /*
15084  * This function is called while forming ill groups.
15085  *
15086  * Currently, we handle only allmulti groups. We want to join
15087  * allmulti on only one of the ills in the group. In the future,
15088  * when we have link aggregation, we may have to join normal
15089  * multicast groups on multiple ills as the switch does inbound load
15090  * balancing. The following are the functions that call this
15091  * function:
15092  *
15093  * 1) ill_recover_multicast : Interface is coming back UP.
15094  *    When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
15095  *    will call ill_recover_multicast to recover all the multicast
15096  *    groups. We need to make sure that only one member is joined
15097  *    in the ill group.
15098  *
15099  * 2) ip_addmulti/ip_addmulti_v6 : ill groups have already been formed.
15100  *    Somebody is joining allmulti. We need to make sure that only one
15101  *    member is joined in the group.
15102  *
15103  * 3) illgrp_insert : If allmulti has already been joined, we need to make
15104  *    sure that only one member is joined in the group.
15105  *
15106  * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving
15107  *    allmulti whom we have nominated. We need to pick some other ill.
15108  *
15109  * 5) illgrp_delete : The ill we nominated is leaving the group,
15110  *    we need to pick a new ill to join the group.
15111  *
15112  * For (1), (2), (5) - we just have to check whether there is
15113  * a good ill joined in the group. If we could not find any ill
15114  * joined in the group, we should join.
15115  *
15116  * For (4), the one that was nominated to receive left the group.
15117  * There could be nobody joined in the group when this function is
15118  * called.
15119  *
15120  * For (3) - we need to explicitly check whether there are multiple
15121  * ills joined in the group.
15122  *
15123  * For simplicity, we don't differentiate any of the above cases. We
15124  * just leave the group if it is joined on any of them and join on
15125  * the first good ill.
15126  */
15127 int
15128 ill_nominate_mcast_rcv(ill_group_t *illgrp)
15129 {
15130 	ilm_t *ilm;
15131 	ill_t *ill;
15132 	ill_t *fallback_inactive_ill = NULL;
15133 	ill_t *fallback_failed_ill = NULL;
15134 	int ret = 0;
15135 
15136 	/*
15137 	 * Leave allmulti on all the ills and start fresh.
15138 	 */
15139 	for (ill = illgrp->illgrp_ill; ill != NULL;
15140 	    ill = ill->ill_group_next) {
15141 		if (ill->ill_join_allmulti)
15142 			(void) ip_leave_allmulti(ill->ill_ipif);
15143 	}
15144 
15145 	/*
15146 	 * Choose a good ill. Fall back to an inactive or failed one if
15147 	 * none is available. We need to fall back to FAILED in the
15148 	 * case where we have 2 interfaces in a group - where
15149 	 * one of them is failed and the other is a good one and
15150 	 * the good one (not marked inactive) is leaving the group.
15151 	 */
15152 	ret = 0;
15153 	for (ill = illgrp->illgrp_ill; ill != NULL;
15154 	    ill = ill->ill_group_next) {
15155 		/* Never pick an offline interface */
15156 		if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
15157 			continue;
15158 
15159 		if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
15160 			fallback_failed_ill = ill;
15161 			continue;
15162 		}
15163 		if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
15164 			fallback_inactive_ill = ill;
15165 			continue;
15166 		}
15167 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
15168 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
15169 				ret = ip_join_allmulti(ill->ill_ipif);
15170 				/*
15171 				 * ip_join_allmulti can fail because of memory
15172 				 * failures. So, make sure we join at least
15173 				 * on one ill.
15174 				 */
15175 				if (ill->ill_join_allmulti)
15176 					return (0);
15177 			}
15178 		}
15179 	}
15180 	if (ret != 0) {
15181 		/*
15182 		 * If we tried nominating above and failed to do so,
15183 		 * return the error. We might have tried multiple times.
15184 		 * But, return the latest error.
15185 */ 15186 return (ret); 15187 } 15188 if ((ill = fallback_inactive_ill) != NULL) { 15189 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15190 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15191 ret = ip_join_allmulti(ill->ill_ipif); 15192 return (ret); 15193 } 15194 } 15195 } else if ((ill = fallback_failed_ill) != NULL) { 15196 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15197 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15198 ret = ip_join_allmulti(ill->ill_ipif); 15199 return (ret); 15200 } 15201 } 15202 } 15203 return (0); 15204 } 15205 15206 /* 15207 * This function is called from illgrp_delete after it is 15208 * deleted from the group to reschedule responsibilities 15209 * to a different ill. 15210 */ 15211 static void 15212 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 15213 { 15214 ilm_t *ilm; 15215 ipif_t *ipif; 15216 ipaddr_t subnet_addr; 15217 ipaddr_t net_addr; 15218 ipaddr_t net_mask = 0; 15219 ipaddr_t subnet_netmask; 15220 ipaddr_t addr; 15221 15222 ASSERT(ill->ill_group == NULL); 15223 /* 15224 * Broadcast Responsibility: 15225 * 15226 * 1. If this ill has been nominated for receiving broadcast 15227 * packets, we need to find a new one. Before we find a new 15228 * one, we need to re-group the ires that are part of this new 15229 * group (assumed by ill_nominate_bcast_rcv). We do this by 15230 * calling ill_group_bcast_for_xmit(ill) which will do the right 15231 * thing for us. 15232 * 15233 * 2. If this ill was not nominated for receiving broadcast 15234 * packets, we need to clear the IRE_MARK_NORECV flag 15235 * so that we continue to send up broadcast packets. 15236 */ 15237 if (!ill->ill_isv6) { 15238 /* 15239 * Case 1 above : No optimization here. Just redo the 15240 * nomination. 15241 */ 15242 ill_group_bcast_for_xmit(ill); 15243 ill_nominate_bcast_rcv(illgrp); 15244 15245 /* 15246 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 15247 */ 15248 ill_clear_bcast_mark(ill, 0); 15249 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 15250 15251 for (ipif = ill->ill_ipif; ipif != NULL; 15252 ipif = ipif->ipif_next) { 15253 15254 if (!(ipif->ipif_flags & IPIF_UP) || 15255 ipif->ipif_subnet == 0) { 15256 continue; 15257 } 15258 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15259 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15260 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15261 } else { 15262 net_mask = htonl(IN_CLASSA_NET); 15263 } 15264 addr = net_mask & ipif->ipif_subnet; 15265 ill_clear_bcast_mark(ill, addr); 15266 15267 net_addr = ~net_mask | addr; 15268 ill_clear_bcast_mark(ill, net_addr); 15269 15270 subnet_netmask = ipif->ipif_net_mask; 15271 addr = ipif->ipif_subnet; 15272 ill_clear_bcast_mark(ill, addr); 15273 15274 subnet_addr = ~subnet_netmask | addr; 15275 ill_clear_bcast_mark(ill, subnet_addr); 15276 } 15277 } 15278 15279 /* 15280 * Multicast Responsibility. 15281 * 15282 * If we have joined allmulti on this one, find a new member 15283 * in the group to join allmulti. As this ill is already part 15284 * of allmulti, we don't have to join on this one. 15285 * 15286 * If we have not joined allmulti on this one, there is no 15287 * responsibility to handoff. But we need to take new 15288 * responsibility i.e, join allmulti on this one if we need 15289 * to. 
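 *
 * For example (an assumed scenario): hme0 leaves the group
 * {hme0, hme1}. If hme0 was the nominated ill (ill_join_allmulti
 * set), ill_nominate_mcast_rcv below picks a new member, e.g. hme1.
 * If hme1 was the nominated one but an application had joined
 * allmulti through hme0, hme0 can no longer rely on hme1 and joins
 * allmulti itself.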
15290 */ 15291 if (ill->ill_join_allmulti) { 15292 (void) ill_nominate_mcast_rcv(illgrp); 15293 } else { 15294 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15295 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15296 (void) ip_join_allmulti(ill->ill_ipif); 15297 break; 15298 } 15299 } 15300 } 15301 15302 /* 15303 * We intentionally do the flushing of IRE_CACHES only matching 15304 * on the ill and not on groups. Note that we are already deleted 15305 * from the group. 15306 * 15307 * This will make sure that all IRE_CACHES whose stq is pointing 15308 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 15309 * deleted and IRE_CACHES that are not pointing at this ill will 15310 * be left alone. 15311 */ 15312 if (ill->ill_isv6) { 15313 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15314 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15315 } else { 15316 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15317 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15318 } 15319 15320 /* 15321 * Some conn may have cached one of the IREs deleted above. By removing 15322 * the ire reference, we clean up the extra reference to the ill held in 15323 * ire->ire_stq. 15324 */ 15325 ipcl_walk(conn_cleanup_stale_ire, NULL); 15326 15327 /* 15328 * Re-do source address selection for all the members in the 15329 * group, if they borrowed source address from one of the ipifs 15330 * in this ill. 15331 */ 15332 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15333 if (ill->ill_isv6) { 15334 ipif_update_other_ipifs_v6(ipif, illgrp); 15335 } else { 15336 ipif_update_other_ipifs(ipif, illgrp); 15337 } 15338 } 15339 } 15340 15341 /* 15342 * Delete the ill from the group. The caller makes sure that it is 15343 * in a group and it okay to delete from the group. So, we always 15344 * delete here. 15345 */ 15346 static void 15347 illgrp_delete(ill_t *ill) 15348 { 15349 ill_group_t *illgrp; 15350 ill_group_t *tmpg; 15351 ill_t *tmp_ill; 15352 15353 /* 15354 * Reset illgrp_ill_schednext if it was pointing at us. 15355 * We need to do this before we set ill_group to NULL. 15356 */ 15357 rw_enter(&ill_g_lock, RW_WRITER); 15358 mutex_enter(&ill->ill_lock); 15359 15360 illgrp_reset_schednext(ill); 15361 15362 illgrp = ill->ill_group; 15363 15364 /* Delete the ill from illgrp. */ 15365 if (illgrp->illgrp_ill == ill) { 15366 illgrp->illgrp_ill = ill->ill_group_next; 15367 } else { 15368 tmp_ill = illgrp->illgrp_ill; 15369 while (tmp_ill->ill_group_next != ill) { 15370 tmp_ill = tmp_ill->ill_group_next; 15371 ASSERT(tmp_ill != NULL); 15372 } 15373 tmp_ill->ill_group_next = ill->ill_group_next; 15374 } 15375 ill->ill_group = NULL; 15376 ill->ill_group_next = NULL; 15377 15378 illgrp->illgrp_ill_count--; 15379 mutex_exit(&ill->ill_lock); 15380 rw_exit(&ill_g_lock); 15381 15382 /* 15383 * As this ill is leaving the group, we need to hand off 15384 * the responsibilities to the other ills in the group, if 15385 * this ill had some responsibilities. 
15386 */ 15387 15388 ill_handoff_responsibility(ill, illgrp); 15389 15390 rw_enter(&ill_g_lock, RW_WRITER); 15391 15392 if (illgrp->illgrp_ill_count == 0) { 15393 15394 ASSERT(illgrp->illgrp_ill == NULL); 15395 if (ill->ill_isv6) { 15396 if (illgrp == illgrp_head_v6) { 15397 illgrp_head_v6 = illgrp->illgrp_next; 15398 } else { 15399 tmpg = illgrp_head_v6; 15400 while (tmpg->illgrp_next != illgrp) { 15401 tmpg = tmpg->illgrp_next; 15402 ASSERT(tmpg != NULL); 15403 } 15404 tmpg->illgrp_next = illgrp->illgrp_next; 15405 } 15406 } else { 15407 if (illgrp == illgrp_head_v4) { 15408 illgrp_head_v4 = illgrp->illgrp_next; 15409 } else { 15410 tmpg = illgrp_head_v4; 15411 while (tmpg->illgrp_next != illgrp) { 15412 tmpg = tmpg->illgrp_next; 15413 ASSERT(tmpg != NULL); 15414 } 15415 tmpg->illgrp_next = illgrp->illgrp_next; 15416 } 15417 } 15418 mutex_destroy(&illgrp->illgrp_lock); 15419 mi_free(illgrp); 15420 } 15421 rw_exit(&ill_g_lock); 15422 15423 /* 15424 * Even though the ill is out of the group its not necessary 15425 * to set ipsq_split as TRUE as the ipifs could be down temporarily 15426 * We will split the ipsq when phyint_groupname is set to NULL. 15427 */ 15428 15429 /* 15430 * Send a routing sockets message if we are deleting from 15431 * groups with names. 15432 */ 15433 if (ill->ill_phyint->phyint_groupname_len != 0) 15434 ip_rts_ifmsg(ill->ill_ipif); 15435 } 15436 15437 /* 15438 * Re-do source address selection. This is normally called when 15439 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 15440 * ipif comes up. 15441 */ 15442 void 15443 ill_update_source_selection(ill_t *ill) 15444 { 15445 ipif_t *ipif; 15446 15447 ASSERT(IAM_WRITER_ILL(ill)); 15448 15449 if (ill->ill_group != NULL) 15450 ill = ill->ill_group->illgrp_ill; 15451 15452 for (; ill != NULL; ill = ill->ill_group_next) { 15453 for (ipif = ill->ill_ipif; ipif != NULL; 15454 ipif = ipif->ipif_next) { 15455 if (ill->ill_isv6) 15456 ipif_recreate_interface_routes_v6(NULL, ipif); 15457 else 15458 ipif_recreate_interface_routes(NULL, ipif); 15459 } 15460 } 15461 } 15462 15463 /* 15464 * Insert ill in a group headed by illgrp_head. The caller can either 15465 * pass a groupname in which case we search for a group with the 15466 * same name to insert in or pass a group to insert in. This function 15467 * would only search groups with names. 15468 * 15469 * NOTE : The caller should make sure that there is at least one ipif 15470 * UP on this ill so that illgrp_scheduler can pick this ill 15471 * for outbound packets. If ill_ipif_up_count is zero, we have 15472 * already sent a DL_UNBIND to the driver and we don't want to 15473 * send anymore packets. We don't assert for ipif_up_count 15474 * to be greater than zero, because ipif_up_done wants to call 15475 * this function before bumping up the ipif_up_count. See 15476 * ipif_up_done() for details. 15477 */ 15478 int 15479 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 15480 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 15481 { 15482 ill_group_t *illgrp; 15483 ill_t *prev_ill; 15484 phyint_t *phyi; 15485 15486 ASSERT(ill->ill_group == NULL); 15487 15488 rw_enter(&ill_g_lock, RW_WRITER); 15489 mutex_enter(&ill->ill_lock); 15490 15491 if (groupname != NULL) { 15492 /* 15493 * Look for a group with a matching groupname to insert. 
15494  */
15495 		for (illgrp = *illgrp_head; illgrp != NULL;
15496 		    illgrp = illgrp->illgrp_next) {
15497 
15498 			ill_t *tmp_ill;
15499 
15500 			/*
15501 			 * If we have an ill_group_t in the list which has
15502 			 * no ill_t assigned then we must be in the process of
15503 			 * removing this group. We skip this as illgrp_delete()
15504 			 * will remove it from the list.
15505 			 */
15506 			if ((tmp_ill = illgrp->illgrp_ill) == NULL) {
15507 				ASSERT(illgrp->illgrp_ill_count == 0);
15508 				continue;
15509 			}
15510 
15511 			ASSERT(tmp_ill->ill_phyint != NULL);
15512 			phyi = tmp_ill->ill_phyint;
15513 			/*
15514 			 * Look only at groups which have names.
15515 			 */
15516 			if (phyi->phyint_groupname_len == 0)
15517 				continue;
15518 			/*
15519 			 * Names are stored in the phyint common to both
15520 			 * IPv4 and IPv6.
15521 			 */
15522 			if (mi_strcmp(phyi->phyint_groupname,
15523 			    groupname) == 0) {
15524 				break;
15525 			}
15526 		}
15527 	} else {
15528 		/*
15529 		 * If the caller passes in a NULL "grp_to_insert", we
15530 		 * allocate one below and insert this singleton.
15531 		 */
15532 		illgrp = grp_to_insert;
15533 	}
15534 
15535 	ill->ill_group_next = NULL;
15536 
15537 	if (illgrp == NULL) {
15538 		illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t));
15539 		if (illgrp == NULL) {
			mutex_exit(&ill->ill_lock);
			rw_exit(&ill_g_lock);
15540 			return (ENOMEM);
15541 		}
15542 		illgrp->illgrp_next = *illgrp_head;
15543 		*illgrp_head = illgrp;
15544 		illgrp->illgrp_ill = ill;
15545 		illgrp->illgrp_ill_count = 1;
15546 		ill->ill_group = illgrp;
15547 		/*
15548 		 * Used in illgrp_scheduler to protect multiple threads
15549 		 * from traversing the list.
15550 		 */
15551 		mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0);
15552 	} else {
15553 		ASSERT(ill->ill_net_type ==
15554 		    illgrp->illgrp_ill->ill_net_type);
15555 		ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type);
15556 
15557 		/* Insert ill at tail of this group */
15558 		prev_ill = illgrp->illgrp_ill;
15559 		while (prev_ill->ill_group_next != NULL)
15560 			prev_ill = prev_ill->ill_group_next;
15561 		prev_ill->ill_group_next = ill;
15562 		ill->ill_group = illgrp;
15563 		illgrp->illgrp_ill_count++;
15564 		/*
15565 		 * Inherit group properties. Currently only forwarding
15566 		 * is the property we try to keep the same with all the
15567 		 * ills. When there are more, we will abstract this into
15568 		 * a function.
15569 		 */
15570 		ill->ill_flags &= ~ILLF_ROUTER;
15571 		ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER);
15572 	}
15573 	mutex_exit(&ill->ill_lock);
15574 	rw_exit(&ill_g_lock);
15575 
15576 	/*
15577 	 * 1) When ipif_up_done() calls this function, ipif_up_count
15578 	 *    may be zero as it has not yet been bumped. But the ires
15579 	 *    have already been added. So, we do the nomination here
15580 	 *    itself. But, when ip_sioctl_groupname calls this, it checks
15581 	 *    for ill_ipif_up_count != 0. Thus we don't check for
15582 	 *    ill_ipif_up_count here while nominating broadcast ires for
15583 	 *    receive.
15584 	 *
15585 	 * 2) Similarly, we need to call ill_group_bcast_for_xmit here
15586 	 *    to group them properly as ire_add() has already happened
15587 	 *    in the ipif_up_done() case. For the ip_sioctl_groupname/
15588 	 *    illgrp_insert case, we need to do it here anyway.
15589 	 */
15590 	if (!ill->ill_isv6) {
15591 		ill_group_bcast_for_xmit(ill);
15592 		ill_nominate_bcast_rcv(illgrp);
15593 	}
15594 
15595 	if (!ipif_is_coming_up) {
15596 		/*
15597 		 * When ipif_up_done() calls this function, the multicast
15598 		 * groups have not been joined yet. So, there is no point in
15599 		 * nomination. ip_join_allmulti will handle groups when
15600 		 * ill_recover_multicast is called from ipif_up_done() later.
15601 */ 15602 (void) ill_nominate_mcast_rcv(illgrp); 15603 /* 15604 * ipif_up_done calls ill_update_source_selection 15605 * anyway. Moreover, we don't want to re-create 15606 * interface routes while ipif_up_done() still has reference 15607 * to them. Refer to ipif_up_done() for more details. 15608 */ 15609 ill_update_source_selection(ill); 15610 } 15611 15612 /* 15613 * Send a routing sockets message if we are inserting into 15614 * groups with names. 15615 */ 15616 if (groupname != NULL) 15617 ip_rts_ifmsg(ill->ill_ipif); 15618 return (0); 15619 } 15620 15621 /* 15622 * Return the first phyint matching the groupname. There could 15623 * be more than one when there are ill groups. 15624 * 15625 * Needs work: called only from ip_sioctl_groupname 15626 */ 15627 static phyint_t * 15628 phyint_lookup_group(char *groupname) 15629 { 15630 phyint_t *phyi; 15631 15632 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 15633 /* 15634 * Group names are stored in the phyint - a common structure 15635 * to both IPv4 and IPv6. 15636 */ 15637 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 15638 for (; phyi != NULL; 15639 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 15640 phyi, AVL_AFTER)) { 15641 if (phyi->phyint_groupname_len == 0) 15642 continue; 15643 ASSERT(phyi->phyint_groupname != NULL); 15644 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) 15645 return (phyi); 15646 } 15647 return (NULL); 15648 } 15649 15650 15651 15652 /* 15653 * MT notes on creation and deletion of IPMP groups 15654 * 15655 * Creation and deletion of IPMP groups introduce the need to merge or 15656 * split the associated serialization objects i.e the ipsq's. Normally all 15657 * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled 15658 * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during 15659 * the execution of the SIOCSLIFGROUPNAME command the picture changes. There 15660 * is a need to change the <ill-ipsq> association and we have to operate on both 15661 * the source and destination IPMP groups. For eg. attempting to set the 15662 * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to 15663 * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the 15664 * source or destination IPMP group are mapped to a single ipsq for executing 15665 * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. 15666 * The <ill-ipsq> mapping is restored back to normal at a later point. This is 15667 * termed as a split of the ipsq. The converse of the merge i.e. a split of the 15668 * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname 15669 * occurred on the ipsq, then the ipsq_split flag is set. This indicates the 15670 * ipsq has to be examined for redoing the <ill-ipsq> associations. 15671 * 15672 * In the above example the ioctl handling code locates the current ipsq of hme0 15673 * which is ipsq(mpk17-84). It then enters the above ipsq immediately or 15674 * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates 15675 * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into 15676 * the destination ipsq. If the destination ipsq is not busy, it also enters 15677 * the destination ipsq exclusively. Now the actual groupname setting operation 15678 * can proceed. If the destination ipsq is busy, the operation is enqueued 15679 * on the destination (merged) ipsq and will be handled in the unwind from 15680 * ipsq_exit. 
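 *
 * For reference, a minimal user-level sketch of driving this path
 * (an assumed example, not part of this file):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strncpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	(void) strncpy(lifr.lifr_groupname, "mpk17-85",
 *	    sizeof (lifr.lifr_groupname));
 *	if (ioctl(s, SIOCSLIFGROUPNAME, (caddr_t)&lifr) < 0)
 *		perror("SIOCSLIFGROUPNAME");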
15681  *
15682  * To prevent other threads from accessing the ill while the group name
15683  * change is in progress, we bring down the ipifs, which also removes the
15684  * ill from the group. The group is changed in phyint and when the first
15685  * ipif on the ill is brought up, the ill is inserted into the right IPMP
15686  * group by illgrp_insert.
15687  */
15688 /* ARGSUSED */
15689 int
15690 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15691     ip_ioctl_cmd_t *ipip, void *ifreq)
15692 {
15693 	int i;
15694 	char *tmp;
15695 	int namelen;
15696 	ill_t *ill = ipif->ipif_ill;
15697 	ill_t *ill_v4, *ill_v6;
15698 	int err = 0;
15699 	phyint_t *phyi;
15700 	phyint_t *phyi_tmp;
15701 	struct lifreq *lifr;
15702 	mblk_t *mp1;
15703 	char *groupname;
15704 	ipsq_t *ipsq;
15705 
15706 	ASSERT(IAM_WRITER_IPIF(ipif));
15707 
15708 	/* Existence verified in ip_wput_nondata */
15709 	mp1 = mp->b_cont->b_cont;
15710 	lifr = (struct lifreq *)mp1->b_rptr;
15711 	groupname = lifr->lifr_groupname;
15712 
15713 	if (ipif->ipif_id != 0)
15714 		return (EINVAL);
15715 
15716 	phyi = ill->ill_phyint;
15717 	ASSERT(phyi != NULL);
15718 
15719 	if (phyi->phyint_flags & PHYI_VIRTUAL)
15720 		return (EINVAL);
15721 
15722 	tmp = groupname;
15723 	for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
15724 		;
15725 
15726 	if (i == LIFNAMSIZ) {
15727 		/* no null termination */
15728 		return (EINVAL);
15729 	}
15730 
15731 	/*
15732 	 * Calculate the namelen exclusive of the null
15733 	 * termination character.
15734 	 */
15735 	namelen = tmp - groupname;
15736 
15737 	ill_v4 = phyi->phyint_illv4;
15738 	ill_v6 = phyi->phyint_illv6;
15739 
15740 	/*
15741 	 * An ill cannot be part of a usesrc group and an IPMP group at the
15742 	 * same time. No need to grab the ill_g_usesrc_lock here; see
15743 	 * synchronization notes in ip.c
15744 	 */
15745 	if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
15746 		return (EINVAL);
15747 	}
15748 
15749 	/*
15750 	 * Mark the ill as changing.
15751 	 * This should queue all new requests on the syncq.
15752 	 */
15753 	GRAB_ILL_LOCKS(ill_v4, ill_v6);
15754 
15755 	if (ill_v4 != NULL)
15756 		ill_v4->ill_state_flags |= ILL_CHANGING;
15757 	if (ill_v6 != NULL)
15758 		ill_v6->ill_state_flags |= ILL_CHANGING;
15759 	RELEASE_ILL_LOCKS(ill_v4, ill_v6);
15760 
15761 	if (namelen == 0) {
15762 		/*
15763 		 * A null string means remove this interface from the
15764 		 * existing group.
15765 		 */
15766 		if (phyi->phyint_groupname_len == 0) {
15767 			/*
15768 			 * Never was in a group.
15769 			 */
15770 			err = 0;
15771 			goto done;
15772 		}
15773 
15774 		/*
15775 		 * IPv4 or IPv6 may be temporarily out of the group when all
15776 		 * the ipifs are down. Thus, we need to check for ill_group to
15777 		 * be non-NULL.
15778  */
15779 		if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
15780 			ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
15781 			mutex_enter(&ill_v4->ill_lock);
15782 			if (!ill_is_quiescent(ill_v4)) {
15783 				/*
15784 				 * ipsq_pending_mp_add will not fail since
15785 				 * connp is NULL
15786 				 */
15787 				(void) ipsq_pending_mp_add(NULL,
15788 				    ill_v4->ill_ipif, q, mp, ILL_DOWN);
15789 				mutex_exit(&ill_v4->ill_lock);
15790 				err = EINPROGRESS;
15791 				goto done;
15792 			}
15793 			mutex_exit(&ill_v4->ill_lock);
15794 		}
15795 
15796 		if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
15797 			ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
15798 			mutex_enter(&ill_v6->ill_lock);
15799 			if (!ill_is_quiescent(ill_v6)) {
15800 				(void) ipsq_pending_mp_add(NULL,
15801 				    ill_v6->ill_ipif, q, mp, ILL_DOWN);
15802 				mutex_exit(&ill_v6->ill_lock);
15803 				err = EINPROGRESS;
15804 				goto done;
15805 			}
15806 			mutex_exit(&ill_v6->ill_lock);
15807 		}
15808 
15809 		rw_enter(&ill_g_lock, RW_WRITER);
15810 		GRAB_ILL_LOCKS(ill_v4, ill_v6);
15811 		mutex_enter(&phyi->phyint_lock);
15812 		ASSERT(phyi->phyint_groupname != NULL);
15813 		mi_free(phyi->phyint_groupname);
15814 		phyi->phyint_groupname = NULL;
15815 		phyi->phyint_groupname_len = 0;
15816 		mutex_exit(&phyi->phyint_lock);
15817 		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
15818 		rw_exit(&ill_g_lock);
15819 		err = ill_up_ipifs(ill, q, mp);
15820 
15821 		/*
15822 		 * Set the split flag so that the ipsq can be split.
15823 		 */
15824 		mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
15825 		phyi->phyint_ipsq->ipsq_split = B_TRUE;
15826 		mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
15827 
15828 	} else {
15829 		if (phyi->phyint_groupname_len != 0) {
15830 			ASSERT(phyi->phyint_groupname != NULL);
15831 			/* Are we inserting in the same group ? */
15832 			if (mi_strcmp(groupname,
15833 			    phyi->phyint_groupname) == 0) {
15834 				err = 0;
15835 				goto done;
15836 			}
15837 		}
15838 
15839 		rw_enter(&ill_g_lock, RW_READER);
15840 		/*
15841 		 * Merge the ipsqs for the groups.
15842 		 * This check is here as multiple groups/ills might be
15843 		 * sharing the same ipsq.
15844 		 * If we have to merge then the operation is restarted
15845 		 * on the new ipsq.
15846 		 */
15847 		ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL);
15848 		if (phyi->phyint_ipsq != ipsq) {
15849 			rw_exit(&ill_g_lock);
15850 			err = ill_merge_groups(ill, NULL, groupname, mp, q);
15851 			goto done;
15852 		}
15853 		/*
15854 		 * Running exclusive on the new ipsq.
15855 		 */
15856 
15857 		ASSERT(ipsq != NULL);
15858 		ASSERT(ipsq->ipsq_writer == curthread);
15859 
15860 		/*
15861 		 * Check whether the ill_type and ill_net_type match before
15862 		 * we allocate any memory so that the cleanup is easier.
15863 		 *
15864 		 * We can't group dissimilar ones as we can't load-spread
15865 		 * packets across the group because of potential link-level
15866 		 * header differences.
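 * For example (assumed): an Ethernet ill and a token ring ill could
 * not be grouped, since spreading one load across both would require
 * building a different link-level header on each.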
15867 */ 15868 phyi_tmp = phyint_lookup_group(groupname); 15869 if (phyi_tmp != NULL) { 15870 if ((ill_v4 != NULL && 15871 phyi_tmp->phyint_illv4 != NULL) && 15872 ((ill_v4->ill_net_type != 15873 phyi_tmp->phyint_illv4->ill_net_type) || 15874 (ill_v4->ill_type != 15875 phyi_tmp->phyint_illv4->ill_type))) { 15876 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15877 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15878 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15879 rw_exit(&ill_g_lock); 15880 return (EINVAL); 15881 } 15882 if ((ill_v6 != NULL && 15883 phyi_tmp->phyint_illv6 != NULL) && 15884 ((ill_v6->ill_net_type != 15885 phyi_tmp->phyint_illv6->ill_net_type) || 15886 (ill_v6->ill_type != 15887 phyi_tmp->phyint_illv6->ill_type))) { 15888 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15889 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15890 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15891 rw_exit(&ill_g_lock); 15892 return (EINVAL); 15893 } 15894 } 15895 15896 rw_exit(&ill_g_lock); 15897 15898 /* 15899 * bring down all v4 ipifs. 15900 */ 15901 if (ill_v4 != NULL) { 15902 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 15903 } 15904 15905 /* 15906 * bring down all v6 ipifs. 15907 */ 15908 if (ill_v6 != NULL) { 15909 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 15910 } 15911 15912 /* 15913 * make sure all ipifs are down and there are no active 15914 * references. Call to ipsq_pending_mp_add will not fail 15915 * since connp is NULL. 15916 */ 15917 if (ill_v4 != NULL) { 15918 mutex_enter(&ill_v4->ill_lock); 15919 if (!ill_is_quiescent(ill_v4)) { 15920 (void) ipsq_pending_mp_add(NULL, 15921 ill_v4->ill_ipif, q, mp, ILL_DOWN); 15922 mutex_exit(&ill_v4->ill_lock); 15923 err = EINPROGRESS; 15924 goto done; 15925 } 15926 mutex_exit(&ill_v4->ill_lock); 15927 } 15928 15929 if (ill_v6 != NULL) { 15930 mutex_enter(&ill_v6->ill_lock); 15931 if (!ill_is_quiescent(ill_v6)) { 15932 (void) ipsq_pending_mp_add(NULL, 15933 ill_v6->ill_ipif, q, mp, ILL_DOWN); 15934 mutex_exit(&ill_v6->ill_lock); 15935 err = EINPROGRESS; 15936 goto done; 15937 } 15938 mutex_exit(&ill_v6->ill_lock); 15939 } 15940 15941 /* 15942 * allocate including space for null terminator 15943 * before we insert. 15944 */ 15945 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 15946 if (tmp == NULL) 15947 return (ENOMEM); 15948 15949 rw_enter(&ill_g_lock, RW_WRITER); 15950 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15951 mutex_enter(&phyi->phyint_lock); 15952 if (phyi->phyint_groupname_len != 0) { 15953 ASSERT(phyi->phyint_groupname != NULL); 15954 mi_free(phyi->phyint_groupname); 15955 } 15956 15957 /* 15958 * setup the new group name. 15959 */ 15960 phyi->phyint_groupname = tmp; 15961 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 15962 phyi->phyint_groupname_len = namelen + 1; 15963 mutex_exit(&phyi->phyint_lock); 15964 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15965 rw_exit(&ill_g_lock); 15966 15967 err = ill_up_ipifs(ill, q, mp); 15968 } 15969 15970 done: 15971 /* 15972 * normally ILL_CHANGING is cleared in ill_up_ipifs. 
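 * If the operation was parked with ipsq_pending_mp_add above (err ==
 * EINPROGRESS), the flag is intentionally left set here; it is cleared
 * when the restarted ioctl finally completes in ill_up_ipifs.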
15973 */ 15974 if (err != EINPROGRESS) { 15975 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15976 if (ill_v4 != NULL) 15977 ill_v4->ill_state_flags &= ~ILL_CHANGING; 15978 if (ill_v6 != NULL) 15979 ill_v6->ill_state_flags &= ~ILL_CHANGING; 15980 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15981 } 15982 return (err); 15983 } 15984 15985 /* ARGSUSED */ 15986 int 15987 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 15988 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15989 { 15990 ill_t *ill; 15991 phyint_t *phyi; 15992 struct lifreq *lifr; 15993 mblk_t *mp1; 15994 15995 /* Existence verified in ip_wput_nondata */ 15996 mp1 = mp->b_cont->b_cont; 15997 lifr = (struct lifreq *)mp1->b_rptr; 15998 ill = ipif->ipif_ill; 15999 phyi = ill->ill_phyint; 16000 16001 lifr->lifr_groupname[0] = '\0'; 16002 /* 16003 * ill_group may be null if all the interfaces 16004 * are down. But still, the phyint should always 16005 * hold the name. 16006 */ 16007 if (phyi->phyint_groupname_len != 0) { 16008 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 16009 phyi->phyint_groupname_len); 16010 } 16011 16012 return (0); 16013 } 16014 16015 16016 typedef struct conn_move_s { 16017 ill_t *cm_from_ill; 16018 ill_t *cm_to_ill; 16019 int cm_ifindex; 16020 } conn_move_t; 16021 16022 /* 16023 * ipcl_walk function for moving conn_multicast_ill for a given ill. 16024 */ 16025 static void 16026 conn_move(conn_t *connp, caddr_t arg) 16027 { 16028 conn_move_t *connm; 16029 int ifindex; 16030 int i; 16031 ill_t *from_ill; 16032 ill_t *to_ill; 16033 ilg_t *ilg; 16034 ilm_t *ret_ilm; 16035 16036 connm = (conn_move_t *)arg; 16037 ifindex = connm->cm_ifindex; 16038 from_ill = connm->cm_from_ill; 16039 to_ill = connm->cm_to_ill; 16040 16041 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ 16042 16043 /* All multicast fields protected by conn_lock */ 16044 mutex_enter(&connp->conn_lock); 16045 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 16046 if ((connp->conn_outgoing_ill == from_ill) && 16047 (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { 16048 connp->conn_outgoing_ill = to_ill; 16049 connp->conn_incoming_ill = to_ill; 16050 } 16051 16052 /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ 16053 16054 if ((connp->conn_multicast_ill == from_ill) && 16055 (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { 16056 connp->conn_multicast_ill = connm->cm_to_ill; 16057 } 16058 16059 /* Change IP_XMIT_IF associations */ 16060 if ((connp->conn_xmit_if_ill == from_ill) && 16061 (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) { 16062 connp->conn_xmit_if_ill = to_ill; 16063 } 16064 /* 16065 * Change the ilg_ill to point to the new one. This assumes 16066 * ilm_move_v6 has moved the ilms to new_ill and the driver 16067 * has been told to receive packets on this interface. 16068 * ilm_move_v6 FAILBACKS all the ilms successfully always. 16069 * But when doing a FAILOVER, it might fail with ENOMEM and so 16070 * some ilms may not have moved. We check to see whether 16071 * the ilms have moved to to_ill. We can't check on from_ill 16072 * as in the process of moving, we could have split an ilm 16073 * in to two - which has the same orig_ifindex and v6group. 16074 * 16075 * For IPv4, ilg_ipif moves implicitly. The code below really 16076 * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
16077  */
16078 	for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
16079 		ilg = &connp->conn_ilg[i];
16080 		if ((ilg->ilg_ill == from_ill) &&
16081 		    (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
16082 			/* ifindex != 0 indicates failback */
16083 			if (ifindex != 0) {
16084 				connp->conn_ilg[i].ilg_ill = to_ill;
16085 				continue;
16086 			}
16087 
16088 			ret_ilm = ilm_lookup_ill_index_v6(to_ill,
16089 			    &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
16090 			    connp->conn_zoneid);
16091 
16092 			if (ret_ilm != NULL)
16093 				connp->conn_ilg[i].ilg_ill = to_ill;
16094 		}
16095 	}
16096 	mutex_exit(&connp->conn_lock);
16097 }
16098 
16099 static void
16100 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
16101 {
16102 	conn_move_t connm;
16103 
16104 	connm.cm_from_ill = from_ill;
16105 	connm.cm_to_ill = to_ill;
16106 	connm.cm_ifindex = ifindex;
16107 
16108 	ipcl_walk(conn_move, (caddr_t)&connm);
16109 }
16110 
16111 /*
16112  * ilm has been moved from from_ill to to_ill.
16113  * Send DL_DISABMULTI_REQ on from_ill and DL_ENABMULTI_REQ on to_ill,
16114  * as appropriate.
16115  *
16116  * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
16117  *	  the code there de-references ipif_ill to get the ill to
16118  *	  send multicast requests. It does not work as ipif is on its
16119  *	  move and already moved when this function is called.
16120  *	  Thus, we need to use from_ill and to_ill to send down multicast
16121  *	  requests.
16122  */
16123 static void
16124 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
16125 {
16126 	ipif_t *ipif;
16127 	ilm_t *ilm;
16128 
16129 	/*
16130 	 * See whether we need to send down DL_ENABMULTI_REQ on
16131 	 * to_ill as ilm has just been added.
16132 	 */
16133 	ASSERT(IAM_WRITER_ILL(to_ill));
16134 	ASSERT(IAM_WRITER_ILL(from_ill));
16135 
16136 	ILM_WALKER_HOLD(to_ill);
16137 	for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
16138 
16139 		if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
16140 			continue;
16141 		/*
16142 		 * No locks held; the ill/ipif cannot disappear as long
16143 		 * as we are the writer.
16144 		 */
16145 		ipif = to_ill->ill_ipif;
16146 		/*
16147 		 * No need to hold any lock as we are the writer and this
16148 		 * can only be changed by a writer.
16149 		 */
16150 		ilm->ilm_is_new = B_FALSE;
16151 
16152 		if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
16153 		    ipif->ipif_flags & IPIF_POINTOPOINT) {
16154 			ip1dbg(("ilm_send_multicast_reqs: to_ill not "
16155 			    "resolver\n"));
16156 			continue;		/* Must be IRE_IF_NORESOLVER */
16157 		}
16158 
16159 
16160 		if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
16161 			ip1dbg(("ilm_send_multicast_reqs: "
16162 			    "to_ill MULTI_BCAST\n"));
16163 			goto from;
16164 		}
16165 
16166 		if (to_ill->ill_isv6)
16167 			mld_joingroup(ilm);
16168 		else
16169 			igmp_joingroup(ilm);
16170 
16171 		if (to_ill->ill_ipif_up_count == 0) {
16172 			/*
16173 			 * Nobody is there. All multicast addresses will be
16174 			 * re-joined when we get the DL_BIND_ACK bringing the
16175 			 * interface up.
16176 			 */
16177 			ilm->ilm_notify_driver = B_FALSE;
16178 			ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
16179 			goto from;
16180 		}
16181 
16182 		/*
16183 		 * For the allmulti address, we want to join on only one
16184 		 * interface. Checking for ilm_numentries_v6 is not correct
16185 		 * as you may find an ilm with a zero address on to_ill,
16186 		 * but we may not have nominated to_ill for receiving.
Thus, if we have 16187 * nominated from_ill (ill_join_allmulti is set), nominate 16188 * only if to_ill is not already nominated (to_ill normally 16189 * should not have been nominated if "from_ill" has already 16190 * been nominated. As we don't prevent failovers from happening 16191 * across groups, we don't assert). 16192 */ 16193 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 16194 /* 16195 * There is no need to hold ill locks as we are 16196 * writer on both ills and when ill_join_allmulti 16197 * is changed the thread is always a writer. 16198 */ 16199 if (from_ill->ill_join_allmulti && 16200 !to_ill->ill_join_allmulti) { 16201 (void) ip_join_allmulti(to_ill->ill_ipif); 16202 } 16203 } else if (ilm->ilm_notify_driver) { 16204 16205 /* 16206 * This is a newly moved ilm so we need to tell the 16207 * driver about the new group. There can be more than 16208 * one ilm's for the same group in the list each with a 16209 * different orig_ifindex. We have to inform the driver 16210 * once. In ilm_move_v[4,6] we only set the flag 16211 * ilm_notify_driver for the first ilm. 16212 */ 16213 16214 (void) ip_ll_send_enabmulti_req(to_ill, 16215 &ilm->ilm_v6addr); 16216 } 16217 16218 ilm->ilm_notify_driver = B_FALSE; 16219 16220 /* 16221 * See whether we need to send down DL_DISABMULTI_REQ on 16222 * from_ill as ilm has just been removed. 16223 */ 16224 from: 16225 ipif = from_ill->ill_ipif; 16226 if (from_ill->ill_net_type != IRE_IF_RESOLVER || 16227 ipif->ipif_flags & IPIF_POINTOPOINT) { 16228 ip1dbg(("ilm_send_multicast_reqs: " 16229 "from_ill not resolver\n")); 16230 continue; /* Must be IRE_IF_NORESOLVER */ 16231 } 16232 16233 if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 16234 ip1dbg(("ilm_send_multicast_reqs: " 16235 "from_ill MULTI_BCAST\n")); 16236 continue; 16237 } 16238 16239 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 16240 if (from_ill->ill_join_allmulti) 16241 (void) ip_leave_allmulti(from_ill->ill_ipif); 16242 } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { 16243 (void) ip_ll_send_disabmulti_req(from_ill, 16244 &ilm->ilm_v6addr); 16245 } 16246 } 16247 ILM_WALKER_RELE(to_ill); 16248 } 16249 16250 /* 16251 * This function is called when all multicast memberships needs 16252 * to be moved from "from_ill" to "to_ill" for IPv6. This function is 16253 * called only once unlike the IPv4 counterpart where it is called after 16254 * every logical interface is moved. The reason is due to multicast 16255 * memberships are joined using an interface address in IPv4 while in 16256 * IPv6, interface index is used. 16257 */ 16258 static void 16259 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) 16260 { 16261 ilm_t *ilm; 16262 ilm_t *ilm_next; 16263 ilm_t *new_ilm; 16264 ilm_t **ilmp; 16265 int count; 16266 char buf[INET6_ADDRSTRLEN]; 16267 in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; 16268 16269 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16270 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16271 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 16272 16273 if (ifindex == 0) { 16274 /* 16275 * Form the solicited node mcast address which is used later. 
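 * For illustration (an assumed address): with ipif_v6lcl_addr
 * fe80::1:2:3:4, the OR below folds the low-order 32 bits into
 * ipv6_solicited_node_mcast (ff02::1:ff00:0), yielding the
 * solicited-node group ff02::1:ff03:4.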
16276 		 */
16277 		ipif_t *ipif;
16278 
16279 		ipif = from_ill->ill_ipif;
16280 		ASSERT(ipif->ipif_id == 0);
16281 
16282 		ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
16283 	}
16284 
16285 	ilmp = &from_ill->ill_ilm;
16286 	for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
16287 		ilm_next = ilm->ilm_next;
16288 
16289 		if (ilm->ilm_flags & ILM_DELETED) {
16290 			ilmp = &ilm->ilm_next;
16291 			continue;
16292 		}
16293 
16294 		new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr,
16295 		    ilm->ilm_orig_ifindex, ilm->ilm_zoneid);
16296 		ASSERT(ilm->ilm_orig_ifindex != 0);
16297 		if (ilm->ilm_orig_ifindex == ifindex) {
16298 			/*
16299 			 * We are failing back multicast memberships.
16300 			 * If the same ilm exists in to_ill, it means somebody
16301 			 * has joined the same group there e.g. ff02::1
16302 			 * is joined within the kernel when the interfaces
16303 			 * came UP.
16304 			 */
16305 			ASSERT(ilm->ilm_ipif == NULL);
16306 			if (new_ilm != NULL) {
16307 				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
16308 				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
16309 				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
16310 					new_ilm->ilm_is_new = B_TRUE;
16311 				}
16312 			} else {
16313 				/*
16314 				 * Check if we can just move the ilm.
16315 				 */
16316 				if (from_ill->ill_ilm_walker_cnt != 0) {
16317 					/*
16318 					 * We have walkers, so we cannot move
16319 					 * the ilm; allocate a new ilm instead.
16320 					 * This (old) ilm will be marked
16321 					 * ILM_DELETED at the end of the loop
16322 					 * and will be freed when the
16323 					 * last walker exits.
16324 					 */
16325 					new_ilm = (ilm_t *)mi_zalloc
16326 					    (sizeof (ilm_t));
16327 					if (new_ilm == NULL) {
16328 						ip0dbg(("ilm_move_v6: "
16329 						    "FAILBACK of IPv6"
16330 						    " multicast address %s : "
16331 						    "from %s to"
16332 						    " %s failed : ENOMEM \n",
16333 						    inet_ntop(AF_INET6,
16334 						    &ilm->ilm_v6addr, buf,
16335 						    sizeof (buf)),
16336 						    from_ill->ill_name,
16337 						    to_ill->ill_name));
16338 
16339 						ilmp = &ilm->ilm_next;
16340 						continue;
16341 					}
16342 					*new_ilm = *ilm;
16343 					/*
16344 					 * We don't want new_ilm linked to
16345 					 * ilm's filter list.
16346 					 */
16347 					new_ilm->ilm_filter = NULL;
16348 				} else {
16349 					/*
16350 					 * No walkers, so we can move the ilm.
16351 					 * Let's take it out of the list.
16352 					 */
16353 					*ilmp = ilm->ilm_next;
16354 					ilm->ilm_next = NULL;
16355 					new_ilm = ilm;
16356 				}
16357 
16358 				/*
16359 				 * If this is the first ilm for the group,
16360 				 * set ilm_notify_driver so that we notify the
16361 				 * driver in ilm_send_multicast_reqs.
16362 				 */
16363 				if (ilm_lookup_ill_v6(to_ill,
16364 				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16365 					new_ilm->ilm_notify_driver = B_TRUE;
16366 
16367 				new_ilm->ilm_ill = to_ill;
16368 				/* Add to the to_ill's list */
16369 				new_ilm->ilm_next = to_ill->ill_ilm;
16370 				to_ill->ill_ilm = new_ilm;
16371 				/*
16372 				 * Set the flag so that mld_joingroup is
16373 				 * called in ilm_send_multicast_reqs().
16374 				 */
16375 				new_ilm->ilm_is_new = B_TRUE;
16376 			}
16377 			goto bottom;
16378 		} else if (ifindex != 0) {
16379 			/*
16380 			 * If this is FAILBACK (ifindex != 0) and the ifindex
16381 			 * has not matched above, look at the next ilm.
16382 			 */
16383 			ilmp = &ilm->ilm_next;
16384 			continue;
16385 		}
16386 		/*
16387 		 * If we are here, it means ifindex is 0. Failover
16388 		 * everything.
16389 		 *
16390 		 * We need to handle the solicited node mcast address
16391 		 * and the all_nodes mcast address differently as they
16392 		 * are joined within the kernel (ipif_multicast_up)
16393 		 * and potentially from the userland. We are called
16394 		 * after the ipifs of from_ill have been moved.
16395 * If we still find ilms on ill with solicited node 16396 * mcast address or all_nodes mcast address, it must 16397 * belong to the UP interface that has not moved e.g. 16398 * ipif_id 0 with the link local prefix does not move. 16399 * We join this on the new ill accounting for all the 16400 * userland memberships so that applications don't 16401 * see any failure. 16402 * 16403 * We need to make sure that we account only for the 16404 * solicited node and all node multicast addresses 16405 * that was brought UP on these. In the case of 16406 * a failover from A to B, we might have ilms belonging 16407 * to A (ilm_orig_ifindex pointing at A) on B accounting 16408 * for the membership from the userland. If we are failing 16409 * over from B to C now, we will find the ones belonging 16410 * to A on B. These don't account for the ill_ipif_up_count. 16411 * They just move from B to C. The check below on 16412 * ilm_orig_ifindex ensures that. 16413 */ 16414 if ((ilm->ilm_orig_ifindex == 16415 from_ill->ill_phyint->phyint_ifindex) && 16416 (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || 16417 IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, 16418 &ilm->ilm_v6addr))) { 16419 ASSERT(ilm->ilm_refcnt > 0); 16420 count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; 16421 /* 16422 * For indentation reasons, we are not using a 16423 * "else" here. 16424 */ 16425 if (count == 0) { 16426 ilmp = &ilm->ilm_next; 16427 continue; 16428 } 16429 ilm->ilm_refcnt -= count; 16430 if (new_ilm != NULL) { 16431 /* 16432 * Can find one with the same 16433 * ilm_orig_ifindex, if we are failing 16434 * over to a STANDBY. This happens 16435 * when somebody wants to join a group 16436 * on a STANDBY interface and we 16437 * internally join on a different one. 16438 * If we had joined on from_ill then, a 16439 * failover now will find a new ilm 16440 * with this index. 16441 */ 16442 ip1dbg(("ilm_move_v6: FAILOVER, found" 16443 " new ilm on %s, group address %s\n", 16444 to_ill->ill_name, 16445 inet_ntop(AF_INET6, 16446 &ilm->ilm_v6addr, buf, 16447 sizeof (buf)))); 16448 new_ilm->ilm_refcnt += count; 16449 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16450 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16451 new_ilm->ilm_is_new = B_TRUE; 16452 } 16453 } else { 16454 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 16455 if (new_ilm == NULL) { 16456 ip0dbg(("ilm_move_v6: FAILOVER of IPv6" 16457 " multicast address %s : from %s to" 16458 " %s failed : ENOMEM \n", 16459 inet_ntop(AF_INET6, 16460 &ilm->ilm_v6addr, buf, 16461 sizeof (buf)), from_ill->ill_name, 16462 to_ill->ill_name)); 16463 ilmp = &ilm->ilm_next; 16464 continue; 16465 } 16466 *new_ilm = *ilm; 16467 new_ilm->ilm_filter = NULL; 16468 new_ilm->ilm_refcnt = count; 16469 new_ilm->ilm_timer = INFINITY; 16470 new_ilm->ilm_rtx.rtx_timer = INFINITY; 16471 new_ilm->ilm_is_new = B_TRUE; 16472 /* 16473 * If the to_ill has not joined this 16474 * group we need to tell the driver in 16475 * ill_send_multicast_reqs. 
16476 				 */
16477 				if (ilm_lookup_ill_v6(to_ill,
16478 				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16479 					new_ilm->ilm_notify_driver = B_TRUE;
16480 
16481 				new_ilm->ilm_ill = to_ill;
16482 				/* Add to the to_ill's list */
16483 				new_ilm->ilm_next = to_ill->ill_ilm;
16484 				to_ill->ill_ilm = new_ilm;
16485 				ASSERT(new_ilm->ilm_ipif == NULL);
16486 			}
16487 			if (ilm->ilm_refcnt == 0) {
16488 				goto bottom;
16489 			} else {
16490 				new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16491 				CLEAR_SLIST(new_ilm->ilm_filter);
16492 				ilmp = &ilm->ilm_next;
16493 			}
16494 			continue;
16495 		} else {
16496 			/*
16497 			 * ifindex = 0 means move everything pointing at
16498 			 * from_ill. We are doing this because the ill has
16499 			 * either FAILED or become INACTIVE.
16500 			 *
16501 			 * As we would like to move things later back to
16502 			 * from_ill, we want to retain the identity of this
16503 			 * ilm. Thus, we don't blindly increment the reference
16504 			 * count on the ilms matching the address alone. We
16505 			 * need to match on the ilm_orig_index also. new_ilm
16506 			 * was obtained by matching ilm_orig_index also.
16507 			 */
16508 			if (new_ilm != NULL) {
16509 				/*
16510 				 * This is possible only if a previous restore
16511 				 * was incomplete, i.e. restore to
16512 				 * ilm_orig_ifindex left some ilms because
16513 				 * of some failures. Thus when we are failing
16514 				 * again, we might find our old friends there.
16515 				 */
16516 				ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
16517 				    " on %s, group address %s\n",
16518 				    to_ill->ill_name,
16519 				    inet_ntop(AF_INET6,
16520 				    &ilm->ilm_v6addr, buf,
16521 				    sizeof (buf))));
16522 				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
16523 				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
16524 				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
16525 					new_ilm->ilm_is_new = B_TRUE;
16526 				}
16527 			} else {
16528 				if (from_ill->ill_ilm_walker_cnt != 0) {
16529 					new_ilm = (ilm_t *)
16530 					    mi_zalloc(sizeof (ilm_t));
16531 					if (new_ilm == NULL) {
16532 						ip0dbg(("ilm_move_v6: "
16533 						    "FAILOVER of IPv6"
16534 						    " multicast address %s : "
16535 						    "from %s to"
16536 						    " %s failed : ENOMEM \n",
16537 						    inet_ntop(AF_INET6,
16538 						    &ilm->ilm_v6addr, buf,
16539 						    sizeof (buf)),
16540 						    from_ill->ill_name,
16541 						    to_ill->ill_name));
16542 
16543 						ilmp = &ilm->ilm_next;
16544 						continue;
16545 					}
16546 					*new_ilm = *ilm;
16547 					new_ilm->ilm_filter = NULL;
16548 				} else {
16549 					*ilmp = ilm->ilm_next;
16550 					new_ilm = ilm;
16551 				}
16552 				/*
16553 				 * If the to_ill has not joined this
16554 				 * group we need to tell the driver in
16555 				 * ilm_send_multicast_reqs.
16556 				 */
16557 				if (ilm_lookup_ill_v6(to_ill,
16558 				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16559 					new_ilm->ilm_notify_driver = B_TRUE;
16560 
16561 				/* Add to the to_ill's list */
16562 				new_ilm->ilm_next = to_ill->ill_ilm;
16563 				to_ill->ill_ilm = new_ilm;
16564 				ASSERT(ilm->ilm_ipif == NULL);
16565 				new_ilm->ilm_ill = to_ill;
16566 				new_ilm->ilm_is_new = B_TRUE;
16567 			}
16568 
16569 		}
16570 
16571 bottom:
16572 		/*
16573 		 * Revert multicast filter state to (EXCLUDE, NULL).
16574 		 * new_ilm->ilm_is_new should already be set if needed.
16575 		 */
16576 		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16577 		CLEAR_SLIST(new_ilm->ilm_filter);
16578 		/*
16579 		 * We allocated/got a new ilm; free the old one.
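		 * When walkers are active (ill_ilm_walker_cnt != 0) we
		 * cannot unlink it here; it is only marked ILM_DELETED
		 * below and, as noted above, is freed when the last
		 * walker exits.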
16580 */ 16581 if (new_ilm != ilm) { 16582 if (from_ill->ill_ilm_walker_cnt == 0) { 16583 *ilmp = ilm->ilm_next; 16584 ilm->ilm_next = NULL; 16585 FREE_SLIST(ilm->ilm_filter); 16586 FREE_SLIST(ilm->ilm_pendsrcs); 16587 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 16588 FREE_SLIST(ilm->ilm_rtx.rtx_block); 16589 mi_free((char *)ilm); 16590 } else { 16591 ilm->ilm_flags |= ILM_DELETED; 16592 from_ill->ill_ilm_cleanup_reqd = 1; 16593 ilmp = &ilm->ilm_next; 16594 } 16595 } 16596 } 16597 } 16598 16599 /* 16600 * Move all the multicast memberships to to_ill. Called when 16601 * an ipif moves from "from_ill" to "to_ill". This function is slightly 16602 * different from IPv6 counterpart as multicast memberships are associated 16603 * with ills in IPv6. This function is called after every ipif is moved 16604 * unlike IPv6, where it is moved only once. 16605 */ 16606 static void 16607 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) 16608 { 16609 ilm_t *ilm; 16610 ilm_t *ilm_next; 16611 ilm_t *new_ilm; 16612 ilm_t **ilmp; 16613 16614 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16615 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16616 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 16617 16618 ilmp = &from_ill->ill_ilm; 16619 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 16620 ilm_next = ilm->ilm_next; 16621 16622 if (ilm->ilm_flags & ILM_DELETED) { 16623 ilmp = &ilm->ilm_next; 16624 continue; 16625 } 16626 16627 ASSERT(ilm->ilm_ipif != NULL); 16628 16629 if (ilm->ilm_ipif != ipif) { 16630 ilmp = &ilm->ilm_next; 16631 continue; 16632 } 16633 16634 if (V4_PART_OF_V6(ilm->ilm_v6addr) == 16635 htonl(INADDR_ALLHOSTS_GROUP)) { 16636 /* 16637 * We joined this in ipif_multicast_up 16638 * and we never did an ipif_multicast_down 16639 * for IPv4. If nobody else from the userland 16640 * has reference, we free the ilm, and later 16641 * when this ipif comes up on the new ill, 16642 * we will join this again. 16643 */ 16644 if (--ilm->ilm_refcnt == 0) 16645 goto delete_ilm; 16646 16647 new_ilm = ilm_lookup_ipif(ipif, 16648 V4_PART_OF_V6(ilm->ilm_v6addr)); 16649 if (new_ilm != NULL) { 16650 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16651 /* 16652 * We still need to deal with the from_ill. 16653 */ 16654 new_ilm->ilm_is_new = B_TRUE; 16655 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16656 CLEAR_SLIST(new_ilm->ilm_filter); 16657 goto delete_ilm; 16658 } 16659 /* 16660 * If we could not find one e.g. ipif is 16661 * still down on to_ill, we add this ilm 16662 * on ill_new to preserve the reference 16663 * count. 16664 */ 16665 } 16666 /* 16667 * When ipifs move, ilms always move with it 16668 * to the NEW ill. Thus we should never be 16669 * able to find ilm till we really move it here. 
16670 		 */
16671 		ASSERT(ilm_lookup_ipif(ipif,
16672 		    V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);
16673 
16674 		if (from_ill->ill_ilm_walker_cnt != 0) {
16675 			new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
16676 			if (new_ilm == NULL) {
16677 				char buf[INET6_ADDRSTRLEN];
16678 				ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
16679 				    " multicast address %s : "
16680 				    "from %s to"
16681 				    " %s failed : ENOMEM \n",
16682 				    inet_ntop(AF_INET,
16683 				    &ilm->ilm_v6addr, buf,
16684 				    sizeof (buf)),
16685 				    from_ill->ill_name,
16686 				    to_ill->ill_name));
16687 
16688 				ilmp = &ilm->ilm_next;
16689 				continue;
16690 			}
16691 			*new_ilm = *ilm;
16692 			/* We don't want new_ilm linked to ilm's filter list */
16693 			new_ilm->ilm_filter = NULL;
16694 		} else {
16695 			/* Remove from the list */
16696 			*ilmp = ilm->ilm_next;
16697 			new_ilm = ilm;
16698 		}
16699 
16700 		/*
16701 		 * If we have never joined this group on the to_ill
16702 		 * make sure we tell the driver.
16703 		 */
16704 		if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
16705 		    ALL_ZONES) == NULL)
16706 			new_ilm->ilm_notify_driver = B_TRUE;
16707 
16708 		/* Add to the to_ill's list */
16709 		new_ilm->ilm_next = to_ill->ill_ilm;
16710 		to_ill->ill_ilm = new_ilm;
16711 		new_ilm->ilm_is_new = B_TRUE;
16712 
16713 		/*
16714 		 * Revert multicast filter state to (EXCLUDE, NULL)
16715 		 */
16716 		new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16717 		CLEAR_SLIST(new_ilm->ilm_filter);
16718 
16719 		/*
16720 		 * Delete only if we have allocated a new ilm.
16721 		 */
16722 		if (new_ilm != ilm) {
16723 delete_ilm:
16724 			if (from_ill->ill_ilm_walker_cnt == 0) {
16725 				/* Remove from the list */
16726 				*ilmp = ilm->ilm_next;
16727 				ilm->ilm_next = NULL;
16728 				FREE_SLIST(ilm->ilm_filter);
16729 				FREE_SLIST(ilm->ilm_pendsrcs);
16730 				FREE_SLIST(ilm->ilm_rtx.rtx_allow);
16731 				FREE_SLIST(ilm->ilm_rtx.rtx_block);
16732 				mi_free((char *)ilm);
16733 			} else {
16734 				ilm->ilm_flags |= ILM_DELETED;
16735 				from_ill->ill_ilm_cleanup_reqd = 1;
16736 				ilmp = &ilm->ilm_next;
16737 			}
16738 		}
16739 	}
16740 }
16741 
16742 static uint_t
16743 ipif_get_id(ill_t *ill, uint_t id)
16744 {
16745 	uint_t unit;
16746 	ipif_t *tipif;
16747 	boolean_t found = B_FALSE;
16748 
16749 	/*
16750 	 * During failback, we want to go back to the same id
16751 	 * instead of the smallest id so that the original
16752 	 * configuration is maintained. id is non-zero in that
16753 	 * case.
16754 	 */
16755 	if (id != 0) {
16756 		/*
16757 		 * While failing back, if we still have an ipif with
16758 		 * MAX_ADDRS_PER_IF, it means this will be replaced
16759 		 * as soon as we return from this function. It was
16760 		 * set to MAX_ADDRS_PER_IF by the caller so that
16761 		 * we can choose the smallest id. Thus we return zero
16762 		 * in that case, ignoring the hint.
16763 		 */
16764 		if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
16765 			return (0);
16766 		for (tipif = ill->ill_ipif; tipif != NULL;
16767 		    tipif = tipif->ipif_next) {
16768 			if (tipif->ipif_id == id) {
16769 				found = B_TRUE;
16770 				break;
16771 			}
16772 		}
16773 		/*
16774 		 * If somebody already plumbed another logical interface
16775 		 * with the same id, the hint cannot be used below.
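		 * For example (assumed): when failing hme0:2 back, id is
		 * 2; if no other logical interface took id 2 in the
		 * meantime, we return 2 and the original configuration
		 * is preserved. Otherwise we fall through and pick the
		 * smallest free id.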
16776 */
16777 if (!found)
16778 return (id);
16779 }
16780 for (unit = 0; unit <= ip_addrs_per_if; unit++) {
16781 found = B_FALSE;
16782 for (tipif = ill->ill_ipif; tipif != NULL;
16783 tipif = tipif->ipif_next) {
16784 if (tipif->ipif_id == unit) {
16785 found = B_TRUE;
16786 break;
16787 }
16788 }
16789 if (!found)
16790 break;
16791 }
16792 return (unit);
16793 }
16794
16795 /* ARGSUSED */
16796 static int
16797 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
16798 ipif_t **rep_ipif_ptr)
16799 {
16800 ill_t *from_ill;
16801 ipif_t *rep_ipif;
16802 ipif_t **ipifp;
16803 uint_t unit;
16804 int err = 0;
16805 ipif_t *to_ipif;
16806 struct iocblk *iocp;
16807 boolean_t failback_cmd;
16808 boolean_t remove_ipif;
16809 int rc;
16810
16811 ASSERT(IAM_WRITER_ILL(to_ill));
16812 ASSERT(IAM_WRITER_IPIF(ipif));
16813
16814 iocp = (struct iocblk *)mp->b_rptr;
16815 failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
16816 remove_ipif = B_FALSE;
16817
16818 from_ill = ipif->ipif_ill;
16819
16820 ASSERT(MUTEX_HELD(&to_ill->ill_lock));
16821 ASSERT(MUTEX_HELD(&from_ill->ill_lock));
16822 ASSERT(RW_WRITE_HELD(&ill_g_lock));
16823
16824 /*
16825 * Don't move LINK LOCAL addresses as they are tied to the
16826 * physical interface.
16827 */
16828 if (from_ill->ill_isv6 &&
16829 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
16830 ipif->ipif_was_up = B_FALSE;
16831 IPIF_UNMARK_MOVING(ipif);
16832 return (0);
16833 }
16834
16835 /*
16836 * We set the ipif_id to maximum so that the search for
16837 * ipif_id will pick the lowest number, i.e. 0, in the
16838 * following 2 cases:
16839 *
16840 * 1) We have a replacement ipif at the head of to_ill.
16841 * We can't remove it yet as we can exceed ip_addrs_per_if
16842 * on to_ill and hence the MOVE might fail. We want to
16843 * remove it only if we could move the ipif. Thus, by
16844 * setting it to the MAX value, we make the search in
16845 * ipif_get_id return the zeroth id.
16846 *
16847 * 2) When DR pulls out the NIC and re-plumbs the interface,
16848 * we might just have a zero address plumbed on the ipif
16849 * with zero id in the case of IPv4. We remove that while
16850 * doing the failback. We want to remove it only if we
16851 * could move the ipif. Thus, by setting it to the MAX
16852 * value, we make the search in ipif_get_id return the
16853 * zeroth id.
16854 *
16855 * Both (1) and (2) are done only when we are moving
16856 * an ipif (either due to failover/failback) which originally
16857 * belonged to this interface, i.e. the ipif_orig_ifindex is
16858 * the same as to_ill's ifindex. This is needed so that
16859 * FAILOVER from A -> B (A failed) followed by FAILOVER
16860 * from B -> A (B is being removed from the group) and
16861 * FAILBACK from A -> B restores the original configuration.
16862 * Without the check for orig_ifindex, the second FAILOVER
16863 * could make the ipif belonging to B replace A's zeroth
16864 * ipif, with the subsequent failback re-creating the replacement
16865 * ipif again.
16866 *
16867 * NOTE : We created the replacement ipif when we did a
16868 * FAILOVER (See below). We could check for FAILBACK and
16869 * then look for the replacement ipif to be removed. But we don't
16870 * want to do that because we want to allow the possibility
16871 * of a FAILOVER from A -> B (which creates the replacement ipif),
16872 * followed by a *FAILOVER* from B -> A instead of a FAILBACK
16873 * from B -> A.
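 *
 * To illustrate the mechanism: with a replacement ipif (id 0) at the
 * head of to_ill, temporarily setting to_ipif->ipif_id to
 * MAX_ADDRS_PER_IF below makes ipif_get_id() treat slot 0 as free, so
 * the moved ipif can land on id 0; the replacement ipif itself is
 * removed further down once the MOVE is certain to proceed.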
16874 */
16875 to_ipif = to_ill->ill_ipif;
16876 if ((to_ill->ill_phyint->phyint_ifindex ==
16877 ipif->ipif_orig_ifindex) &&
16878 IPIF_REPL_CHECK(to_ipif, failback_cmd)) {
16879 ASSERT(to_ipif->ipif_id == 0);
16880 remove_ipif = B_TRUE;
16881 to_ipif->ipif_id = MAX_ADDRS_PER_IF;
16882 }
16883 /*
16884 * Find the lowest logical unit number on the to_ill.
16885 * If we are failing back, try to get the original id
16886 * rather than the lowest one so that the original
16887 * configuration is maintained.
16888 *
16889 * XXX need a better scheme for this.
16890 */
16891 if (failback_cmd) {
16892 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
16893 } else {
16894 unit = ipif_get_id(to_ill, 0);
16895 }
16896
16897 /* Reset back to zero in case we fail below */
16898 if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
16899 to_ipif->ipif_id = 0;
16900
16901 if (unit == ip_addrs_per_if) {
16902 ipif->ipif_was_up = B_FALSE;
16903 IPIF_UNMARK_MOVING(ipif);
16904 return (EINVAL);
16905 }
16906
16907 /*
16908 * ipif is ready to move from "from_ill" to "to_ill".
16909 *
16910 * 1) If we are moving an ipif with id zero, create a
16911 * replacement ipif for this ipif on from_ill. If this fails,
16912 * fail the MOVE operation.
16913 *
16914 * 2) Remove the replacement ipif on to_ill if any.
16915 * We could remove the replacement ipif only when we are moving
16916 * the ipif with id zero. But what if somebody already
16917 * unplumbed it? Thus we always remove it if it is present.
16918 * We want to do it only if we are sure we are going to
16919 * move the ipif to to_ill, which is why there are no
16920 * returns due to error till the ipif is linked to to_ill.
16921 * Note that the first ipif that we failback will always
16922 * be zero if it is present.
16923 */
16924 if (ipif->ipif_id == 0) {
16925 ipaddr_t inaddr_any = INADDR_ANY;
16926
16927 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
16928 if (rep_ipif == NULL) {
16929 ipif->ipif_was_up = B_FALSE;
16930 IPIF_UNMARK_MOVING(ipif);
16931 return (ENOMEM);
16932 }
16933 *rep_ipif = ipif_zero;
16934 /*
16935 * Before we put the ipif on the list, store the addresses
16936 * as mapped addresses as some of the ioctls, e.g. SIOCGIFADDR,
16937 * assume so. This logic is not any different from what
16938 * ipif_allocate does.
16939 */
16940 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16941 &rep_ipif->ipif_v6lcl_addr);
16942 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16943 &rep_ipif->ipif_v6src_addr);
16944 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16945 &rep_ipif->ipif_v6subnet);
16946 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16947 &rep_ipif->ipif_v6net_mask);
16948 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16949 &rep_ipif->ipif_v6brd_addr);
16950 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
16951 &rep_ipif->ipif_v6pp_dst_addr);
16952 /*
16953 * We mark it IPIF_NOFAILOVER so that it can never
16954 * move.
16955 */
16956 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
16957 rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE;
16958 rep_ipif->ipif_replace_zero = B_TRUE;
16959 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
16960 MUTEX_DEFAULT, NULL);
16961 rep_ipif->ipif_id = 0;
16962 rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
16963 rep_ipif->ipif_ill = from_ill;
16964 rep_ipif->ipif_orig_ifindex =
16965 from_ill->ill_phyint->phyint_ifindex;
16966 /* Insert at head */
16967 rep_ipif->ipif_next = from_ill->ill_ipif;
16968 from_ill->ill_ipif = rep_ipif;
16969 /*
16970 * We don't really care to let apps know about
16971 * this interface.
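 * (Contrast this with the remove_ipif case below, where the removed
 * replacement ipif is handed back via rep_ipif_ptr so that ill_move()
 * can generate an RTM_DELETE routing socket message for it.)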
16972 */
16973 }
16974
16975 if (remove_ipif) {
16976 /*
16977 * We set the id to a max value above for this case to get
16978 * id zero. ASSERT that we did get one.
16979 */
16980 ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
16981 rep_ipif = to_ipif;
16982 to_ill->ill_ipif = rep_ipif->ipif_next;
16983 rep_ipif->ipif_next = NULL;
16984 /*
16985 * If some apps have scanned and found this interface,
16986 * it is time to let them know, so that they can
16987 * delete it.
16988 */
16989
16990 *rep_ipif_ptr = rep_ipif;
16991 }
16992
16993 /* Get it out of the ILL interface list. */
16994 ipifp = &ipif->ipif_ill->ill_ipif;
16995 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
16996 if (*ipifp == ipif) {
16997 *ipifp = ipif->ipif_next;
16998 break;
16999 }
17000 }
17001
17002 /* Assign the new ill */
17003 ipif->ipif_ill = to_ill;
17004 ipif->ipif_id = unit;
17005 /* id has already been checked */
17006 rc = ipif_insert(ipif, B_FALSE, B_FALSE);
17007 ASSERT(rc == 0);
17008 /* Let SCTP update its list */
17009 sctp_move_ipif(ipif, from_ill, to_ill);
17010 /*
17011 * Handle the failover and failback of ipif_t between
17012 * ill_t that have differing maximum mtu values.
17013 */
17014 if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
17015 if (ipif->ipif_saved_mtu == 0) {
17016 /*
17017 * As this ipif_t is moving to an ill_t
17018 * that has a lower ill_max_mtu, its
17019 * ipif_mtu needs to be saved so it can
17020 * be restored during failback or during
17021 * failover to an ill_t which has a
17022 * higher ill_max_mtu.
17023 */
17024 ipif->ipif_saved_mtu = ipif->ipif_mtu;
17025 ipif->ipif_mtu = to_ill->ill_max_mtu;
17026 } else {
17027 /*
17028 * The ipif_t is, once again, moving to
17029 * an ill_t that has a lower maximum mtu
17030 * value.
17031 */
17032 ipif->ipif_mtu = to_ill->ill_max_mtu;
17033 }
17034 } else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
17035 ipif->ipif_saved_mtu != 0) {
17036 /*
17037 * The mtu of this ipif_t had to be reduced
17038 * during an earlier failover; this is an
17039 * opportunity for it to be increased (either as
17040 * part of another failover or a failback).
17041 */
17042 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
17043 ipif->ipif_mtu = ipif->ipif_saved_mtu;
17044 ipif->ipif_saved_mtu = 0;
17045 } else {
17046 ipif->ipif_mtu = to_ill->ill_max_mtu;
17047 }
17048 }
17049
17050 /*
17051 * We preserve all the other fields of the ipif including
17052 * ipif_saved_ire_mp. The routes that are saved here will
17053 * be recreated on the new interface and back on the old
17054 * interface when we move back.
17055 */
17056 ASSERT(ipif->ipif_arp_del_mp == NULL);
17057
17058 return (err);
17059 }
17060
17061 static int
17062 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
17063 int ifindex, ipif_t **rep_ipif_ptr)
17064 {
17065 ipif_t *mipif;
17066 ipif_t *ipif_next;
17067 int err;
17068
17069 /*
17070 * We don't really try to MOVE back things if some of the
17071 * operations fail. The daemon will take care of moving again
17072 * later on.
17073 */
17074 for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
17075 ipif_next = mipif->ipif_next;
17076 if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
17077 (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {
17078
17079 err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);
17080
17081 /*
17082 * When the MOVE fails, it is the job of the
17083 * application to take care of this properly,
17084 * i.e. try again if it is ENOMEM.
17085 */
17086 if (mipif->ipif_ill != from_ill) {
17087 /*
17088 * The ipif has moved.
17089 *
17090 * Move the multicast memberships associated
17091 * with this ipif to the new ill. For IPv6, we
17092 * do it once after all the ipifs are moved
17093 * (in ill_move) as they are not associated
17094 * with ipifs.
17095 *
17096 * We need to move the ilms as the ipif has
17097 * already been moved to a new ill, even
17098 * in the case of errors. If we don't move
17099 * them now, neither ilm_free(ipif) will find
17100 * the ilm when somebody unplumbs this ipif,
17101 * nor will ilm_delete(ilm) be able to find
17102 * the ilm.
17103 */
17104 if (!from_ill->ill_isv6)
17105 ilm_move_v4(from_ill, to_ill, mipif);
17106 }
17107
17108 if (err != 0)
17109 return (err);
17110 }
17111 }
17112 return (0);
17113 }
17114
17115 static int
17116 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
17117 {
17118 int ifindex;
17119 int err;
17120 struct iocblk *iocp;
17121 ipif_t *ipif;
17122 ipif_t *rep_ipif_ptr = NULL;
17123 ipif_t *from_ipif = NULL;
17124 boolean_t check_rep_if = B_FALSE;
17125
17126 iocp = (struct iocblk *)mp->b_rptr;
17127 if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
17128 /*
17129 * Move everything pointing at from_ill to to_ill.
17130 * We achieve this by passing in 0 as ifindex.
17131 */
17132 ifindex = 0;
17133 } else {
17134 /*
17135 * Move everything pointing at from_ill whose original
17136 * ifindex (of connp, ipif, ilm) points at to_ill's ifindex.
17137 * We achieve this by passing in ifindex rather than 0.
17138 * Multicast vifs, ilgs move implicitly because ipifs move.
17139 */
17140 ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
17141 ifindex = to_ill->ill_phyint->phyint_ifindex;
17142 }
17143
17144 /*
17145 * Determine if there is at least one ipif that would move from
17146 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
17147 * ipif (if it exists) on the to_ill would be consumed as a result of
17148 * the move, in which case we need to quiesce the replacement ipif also.
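 * (The replacement ipif, if it exists, is always the ipif with id 0
 * at the head of to_ill; see the IPIF_REPL_CHECK() usage below and in
 * ipif_move() above.)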
17149 */
17150 for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
17151 from_ipif = from_ipif->ipif_next) {
17152 if (((ifindex == 0) ||
17153 (ifindex == from_ipif->ipif_orig_ifindex)) &&
17154 !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
17155 check_rep_if = B_TRUE;
17156 break;
17157 }
17158 }
17159
17160
17161 ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);
17162
17163 GRAB_ILL_LOCKS(from_ill, to_ill);
17164 if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
17165 (void) ipsq_pending_mp_add(NULL, ipif, q,
17166 mp, ILL_MOVE_OK);
17167 RELEASE_ILL_LOCKS(from_ill, to_ill);
17168 return (EINPROGRESS);
17169 }
17170
17171 /* Check if the replacement ipif is quiescent to delete */
17172 if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
17173 (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
17174 to_ill->ill_ipif->ipif_state_flags |=
17175 IPIF_MOVING | IPIF_CHANGING;
17176 if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
17177 (void) ipsq_pending_mp_add(NULL, ipif, q,
17178 mp, ILL_MOVE_OK);
17179 RELEASE_ILL_LOCKS(from_ill, to_ill);
17180 return (EINPROGRESS);
17181 }
17182 }
17183 RELEASE_ILL_LOCKS(from_ill, to_ill);
17184
17185 ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
17186 rw_enter(&ill_g_lock, RW_WRITER);
17187 GRAB_ILL_LOCKS(from_ill, to_ill);
17188 err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);
17189
17190 /* ilm_move is done inside ipif_move for IPv4 */
17191 if (err == 0 && from_ill->ill_isv6)
17192 ilm_move_v6(from_ill, to_ill, ifindex);
17193
17194 RELEASE_ILL_LOCKS(from_ill, to_ill);
17195 rw_exit(&ill_g_lock);
17196
17197 /*
17198 * Send rts messages and multicast messages.
17199 */
17200 if (rep_ipif_ptr != NULL) {
17201 ip_rts_ifmsg(rep_ipif_ptr);
17202 ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
17203 IPIF_TRACE_CLEANUP(rep_ipif_ptr);
17204 mi_free(rep_ipif_ptr);
17205 }
17206
17207 conn_move_ill(from_ill, to_ill, ifindex);
17208
17209 return (err);
17210 }
17211
17212 /*
17213 * Used to extract arguments for FAILOVER/FAILBACK ioctls.
17214 * Also checks for the validity of the arguments.
17215 * Note: We are already exclusive inside the from group.
17216 * It is up to the caller to release the refcnt on the to_ill's.
17217 */
17218 static int
17219 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
17220 ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
17221 {
17222 int dst_index;
17223 ipif_t *ipif_v4, *ipif_v6;
17224 struct lifreq *lifr;
17225 mblk_t *mp1;
17226 boolean_t exists;
17227 sin_t *sin;
17228 int err = 0;
17229
17230 if ((mp1 = mp->b_cont) == NULL)
17231 return (EPROTO);
17232
17233 if ((mp1 = mp1->b_cont) == NULL)
17234 return (EPROTO);
17235
17236 lifr = (struct lifreq *)mp1->b_rptr;
17237 sin = (sin_t *)&lifr->lifr_addr;
17238
17239 /*
17240 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6
17241 * specific operations.
17242 */
17243 if (sin->sin_family != AF_UNSPEC)
17244 return (EINVAL);
17245
17246 /*
17247 * Get the ipif with id 0. We are writer on the from ill. So we can pass
17248 * NULLs for the last 4 args and we know the lookup won't fail
17249 * with EINPROGRESS.
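 * (Passing NULL for the q, mp and func arguments means the lookup
 * will not enqueue the operation for restart; see the IPIF_CAN_WAIT()
 * handling in ipif_lookup_on_name() below.)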
17250 */
17251 ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
17252 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
17253 ALL_ZONES, NULL, NULL, NULL, NULL);
17254 ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
17255 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
17256 ALL_ZONES, NULL, NULL, NULL, NULL);
17257
17258 if (ipif_v4 == NULL && ipif_v6 == NULL)
17259 return (ENXIO);
17260
17261 if (ipif_v4 != NULL) {
17262 ASSERT(ipif_v4->ipif_refcnt != 0);
17263 if (ipif_v4->ipif_id != 0) {
17264 err = EINVAL;
17265 goto done;
17266 }
17267
17268 ASSERT(IAM_WRITER_IPIF(ipif_v4));
17269 *ill_from_v4 = ipif_v4->ipif_ill;
17270 }
17271
17272 if (ipif_v6 != NULL) {
17273 ASSERT(ipif_v6->ipif_refcnt != 0);
17274 if (ipif_v6->ipif_id != 0) {
17275 err = EINVAL;
17276 goto done;
17277 }
17278
17279 ASSERT(IAM_WRITER_IPIF(ipif_v6));
17280 *ill_from_v6 = ipif_v6->ipif_ill;
17281 }
17282
17283 err = 0;
17284 dst_index = lifr->lifr_movetoindex;
17285 *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
17286 q, mp, ip_process_ioctl, &err);
17287 if (err != 0) {
17288 /*
17289 * There could be only v6.
17290 */
17291 if (err != ENXIO)
17292 goto done;
17293 err = 0;
17294 }
17295
17296 *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
17297 q, mp, ip_process_ioctl, &err);
17298 if (err != 0) {
17299 if (err != ENXIO)
17300 goto done;
17301 if (*ill_to_v4 == NULL) {
17302 err = ENXIO;
17303 goto done;
17304 }
17305 err = 0;
17306 }
17307
17308 /*
17309 * If we have something to MOVE, i.e. "from" is not NULL,
17310 * "to" should be non-NULL.
17311 */
17312 if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
17313 (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
17314 err = EINVAL;
17315 }
17316
17317 done:
17318 if (ipif_v4 != NULL)
17319 ipif_refrele(ipif_v4);
17320 if (ipif_v6 != NULL)
17321 ipif_refrele(ipif_v6);
17322 return (err);
17323 }
17324
17325 /*
17326 * FAILOVER and FAILBACK are modelled as MOVE operations.
17327 *
17328 * We don't check whether the MOVE is within the same group or
17329 * not, because this ioctl can be used as a generic mechanism
17330 * to failover from interface A to B, though things will function
17331 * only if they are really part of the same group. Moreover,
17332 * all ipifs may be down and hence temporarily out of the group.
17333 *
17334 * ipif's that need to be moved are first brought down; V4 ipifs are brought
17335 * down first and then V6. For each we wait for the ipif's to become quiescent.
17336 * Bringing down the ipifs ensures that all ires pointing to these ipifs
17337 * have been deleted and there are no active references. Once quiescent the
17338 * ipif's are moved and brought up on the new ill.
17339 *
17340 * Normally the source ill and destination ill belong to the same IPMP group
17341 * and hence the same ipsq_t. In the event they don't belong to the
17342 * same group, the two ipsq's are first merged into one ipsq - that of the
17343 * to_ill. The multicast memberships on the source and destination ill cannot
17344 * change during the move operation since multicast joins/leaves also have to
17345 * execute on the same ipsq and are hence serialized.
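 *
 * As a rough illustration (a hedged sketch, not code from this file),
 * a userland daemon such as in.mpathd would drive this path roughly
 * as follows, assuming s is an open AF_INET socket and that "hme0"
 * and "hme1" are hypothetical interfaces in the same IPMP group
 * (sin_family must be AF_UNSPEC; ip_extract_move_args() rejects
 * anything else with EINVAL):
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;
 *	lifr.lifr_movetoindex = if_nametoindex("hme1");
 *	(void) ioctl(s, SIOCLIFFAILOVER, (caddr_t)&lifr);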
17346 */
17347 /* ARGSUSED */
17348 int
17349 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
17350 ip_ioctl_cmd_t *ipip, void *ifreq)
17351 {
17352 ill_t *ill_to_v4 = NULL;
17353 ill_t *ill_to_v6 = NULL;
17354 ill_t *ill_from_v4 = NULL;
17355 ill_t *ill_from_v6 = NULL;
17356 int err = 0;
17357
17358 /*
17359 * Set up the from and to ill's; we can get EINPROGRESS only for
17360 * the to_ill's.
17361 */
17362 err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
17363 &ill_to_v4, &ill_to_v6);
17364
17365 if (err != 0) {
17366 ip0dbg(("ip_sioctl_move: extract args failed\n"));
17367 goto done;
17368 }
17369
17370 /*
17371 * Nothing to do.
17372 */
17373 if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
17374 goto done;
17375 }
17376
17377 /*
17378 * Nothing to do.
17379 */
17380 if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
17381 goto done;
17382 }
17383
17384 /*
17385 * Mark the ill as changing.
17386 * The ILL_CHANGING flag is cleared when the ipif's are brought up
17387 * in ill_up_ipifs; in case of error it is cleared below.
17388 */
17389
17390 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
17391 if (ill_from_v4 != NULL)
17392 ill_from_v4->ill_state_flags |= ILL_CHANGING;
17393 if (ill_from_v6 != NULL)
17394 ill_from_v6->ill_state_flags |= ILL_CHANGING;
17395 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
17396
17397 /*
17398 * Make sure that both src and dst are
17399 * in the same syncq group. If not, make it happen.
17400 * We are not holding any locks because we are the writer
17401 * on the from_ipsq and we will hold locks in ill_merge_groups
17402 * to protect to_ipsq against changing.
17403 */
17404 if (ill_from_v4 != NULL) {
17405 if (ill_from_v4->ill_phyint->phyint_ipsq !=
17406 ill_to_v4->ill_phyint->phyint_ipsq) {
17407 err = ill_merge_groups(ill_from_v4, ill_to_v4,
17408 NULL, mp, q);
17409 goto err_ret;
17410
17411 }
17412 ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
17413 } else {
17414
17415 if (ill_from_v6->ill_phyint->phyint_ipsq !=
17416 ill_to_v6->ill_phyint->phyint_ipsq) {
17417 err = ill_merge_groups(ill_from_v6, ill_to_v6,
17418 NULL, mp, q);
17419 goto err_ret;
17420
17421 }
17422 ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
17423 }
17424
17425 /*
17426 * Now that the ipsq's have been merged and we are the writer,
17427 * let's mark to_ill as changing as well.
17428 */
17429
17430 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
17431 if (ill_to_v4 != NULL)
17432 ill_to_v4->ill_state_flags |= ILL_CHANGING;
17433 if (ill_to_v6 != NULL)
17434 ill_to_v6->ill_state_flags |= ILL_CHANGING;
17435 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
17436
17437 /*
17438 * It's ok for us to proceed with the move even if
17439 * ill_pending_mp is non-null on one of the from ill's, as the reply
17440 * should not be looking at the ipif, it should only care about the
17441 * ill itself.
17442 */
17443
17444 /*
17445 * Let's move ipv4 first.
17446 */
17447 if (ill_from_v4 != NULL) {
17448 ASSERT(IAM_WRITER_ILL(ill_to_v4));
17449 ill_from_v4->ill_move_in_progress = B_TRUE;
17450 ill_to_v4->ill_move_in_progress = B_TRUE;
17451 ill_to_v4->ill_move_peer = ill_from_v4;
17452 ill_from_v4->ill_move_peer = ill_to_v4;
17453 err = ill_move(ill_from_v4, ill_to_v4, q, mp);
17454 }
17455
17456 /*
17457 * Now let's move ipv6.
17458 */
17459 if (err == 0 && ill_from_v6 != NULL) {
17460 ASSERT(IAM_WRITER_ILL(ill_to_v6));
17461 ill_from_v6->ill_move_in_progress = B_TRUE;
17462 ill_to_v6->ill_move_in_progress = B_TRUE;
17463 ill_to_v6->ill_move_peer = ill_from_v6;
17464 ill_from_v6->ill_move_peer = ill_to_v6;
17465 err = ill_move(ill_from_v6, ill_to_v6, q, mp);
17466 }
17467
17468 err_ret:
17469 /*
17470 * EINPROGRESS means we are waiting for the ipif's that need to be
17471 * moved to become quiescent.
17472 */
17473 if (err == EINPROGRESS) {
17474 goto done;
17475 }
17476
17477 /*
17478 * If err is set, ill_up_ipifs will not be called, so
17479 * let's clear the flags.
17480 */
17481
17482 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
17483 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
17484 /*
17485 * Some of the clearing may be redundant, but it is simpler
17486 * not to make any extra checks.
17487 */
17488 if (ill_from_v6 != NULL) {
17489 ill_from_v6->ill_move_in_progress = B_FALSE;
17490 ill_from_v6->ill_move_peer = NULL;
17491 ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
17492 }
17493 if (ill_from_v4 != NULL) {
17494 ill_from_v4->ill_move_in_progress = B_FALSE;
17495 ill_from_v4->ill_move_peer = NULL;
17496 ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
17497 }
17498 if (ill_to_v6 != NULL) {
17499 ill_to_v6->ill_move_in_progress = B_FALSE;
17500 ill_to_v6->ill_move_peer = NULL;
17501 ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
17502 }
17503 if (ill_to_v4 != NULL) {
17504 ill_to_v4->ill_move_in_progress = B_FALSE;
17505 ill_to_v4->ill_move_peer = NULL;
17506 ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
17507 }
17508
17509 /*
17510 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
17511 * Do this always to maintain proper state, i.e. even in case of errors.
17512 * As phyint_inactive looks at both v4 and v6 interfaces,
17513 * we need not call it on both the v4 and v6 interfaces.
17514 */
17515 if (ill_from_v4 != NULL) {
17516 if ((ill_from_v4->ill_phyint->phyint_flags &
17517 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
17518 phyint_inactive(ill_from_v4->ill_phyint);
17519 }
17520 } else if (ill_from_v6 != NULL) {
17521 if ((ill_from_v6->ill_phyint->phyint_flags &
17522 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
17523 phyint_inactive(ill_from_v6->ill_phyint);
17524 }
17525 }
17526
17527 if (ill_to_v4 != NULL) {
17528 if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
17529 ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
17530 }
17531 } else if (ill_to_v6 != NULL) {
17532 if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
17533 ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
17534 }
17535 }
17536
17537 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
17538 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
17539
17540 no_err:
17541 /*
17542 * Let's bring the interfaces up on the to_ill.
17543 */
17544 if (err == 0) {
17545 err = ill_up_ipifs(ill_to_v4 == NULL ?
ill_to_v6:ill_to_v4,
17546 q, mp);
17547 }
17548
17549 if (err == 0) {
17550 if (ill_from_v4 != NULL && ill_to_v4 != NULL)
17551 ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);
17552
17553 if (ill_from_v6 != NULL && ill_to_v6 != NULL)
17554 ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
17555 }
17556 done:
17557
17558 if (ill_to_v4 != NULL) {
17559 ill_refrele(ill_to_v4);
17560 }
17561 if (ill_to_v6 != NULL) {
17562 ill_refrele(ill_to_v6);
17563 }
17564
17565 return (err);
17566 }
17567
17568 static void
17569 ill_dl_down(ill_t *ill)
17570 {
17571 /*
17572 * The ill is down; unbind but stay attached since we're still
17573 * associated with a PPA.
17574 */
17575 mblk_t *mp = ill->ill_unbind_mp;
17576
17577 ill->ill_unbind_mp = NULL;
17578 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
17579 if (mp != NULL) {
17580 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
17581 dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
17582 ill->ill_name));
17583 mutex_enter(&ill->ill_lock);
17584 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
17585 mutex_exit(&ill->ill_lock);
17586 ill_dlpi_send(ill, mp);
17587 }
17588
17589 /*
17590 * Toss all of our multicast memberships. We could keep them, but
17591 * then we'd have to do bookkeeping of any joins and leaves performed
17592 * by the application while the interface is down (we can't just
17593 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
17594 * on a downed interface).
17595 */
17596 ill_leave_multicast(ill);
17597
17598 mutex_enter(&ill->ill_lock);
17599 ill->ill_dl_up = 0;
17600 mutex_exit(&ill->ill_lock);
17601 }
17602
17603 void
17604 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
17605 {
17606 union DL_primitives *dlp;
17607 t_uscalar_t prim;
17608
17609 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
17610
17611 dlp = (union DL_primitives *)mp->b_rptr;
17612 prim = dlp->dl_primitive;
17613
17614 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
17615 dlpi_prim_str(prim), prim, ill->ill_name));
17616
17617 switch (prim) {
17618 case DL_PHYS_ADDR_REQ:
17619 {
17620 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
17621 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
17622 break;
17623 }
17624 case DL_BIND_REQ:
17625 mutex_enter(&ill->ill_lock);
17626 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
17627 mutex_exit(&ill->ill_lock);
17628 break;
17629 }
17630
17631 ill->ill_dlpi_pending = prim;
17632
17633 /*
17634 * Some drivers send M_FLUSH up to IP as part of the unbind
17635 * request. When this M_FLUSH is sent back to the driver,
17636 * this can go after we send the detach request if the
17637 * M_FLUSH ends up in IP's syncq. To avoid that, we reply
17638 * to the M_FLUSH in ip_rput and locally generate another
17639 * M_FLUSH for correctness. This will get freed in
17640 * ip_wput_nondata.
17641 */
17642 if (prim == DL_UNBIND_REQ)
17643 (void) putnextctl1(ill->ill_rq, M_FLUSH, FLUSHRW);
17644
17645 putnext(ill->ill_wq, mp);
17646 }
17647
17648 /*
17649 * Send a DLPI control message to the driver but make sure there
17650 * is only one outstanding message. Uses ill_dlpi_pending to tell
17651 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
17652 * when an ACK or a NAK is received to process the next queued message.
17653 *
17654 * We don't protect ill_dlpi_pending with any lock.
This is okay as
17655 * every place where it's accessed, ip is exclusive while accessing
17656 * ill_dlpi_pending, except when this function is called from ill_init().
17657 */
17658 void
17659 ill_dlpi_send(ill_t *ill, mblk_t *mp)
17660 {
17661 mblk_t **mpp;
17662
17663 ASSERT(IAM_WRITER_ILL(ill));
17664 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
17665
17666 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
17667 /* Must queue message. Tail insertion */
17668 mpp = &ill->ill_dlpi_deferred;
17669 while (*mpp != NULL)
17670 mpp = &((*mpp)->b_next);
17671
17672 ip1dbg(("ill_dlpi_send: deferring request for %s\n",
17673 ill->ill_name));
17674
17675 *mpp = mp;
17676 return;
17677 }
17678
17679 ill_dlpi_dispatch(ill, mp);
17680 }
17681
17682 /*
17683 * Called when a DLPI control message has been acked or nacked, to
17684 * send down the next queued message (if any).
17685 */
17686 void
17687 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
17688 {
17689 mblk_t *mp;
17690
17691 ASSERT(IAM_WRITER_ILL(ill));
17692
17693 ASSERT(prim != DL_PRIM_INVAL);
17694 if (ill->ill_dlpi_pending != prim) {
17695 if (ill->ill_dlpi_pending == DL_PRIM_INVAL) {
17696 (void) mi_strlog(ill->ill_rq, 1,
17697 SL_CONSOLE|SL_ERROR|SL_TRACE,
17698 "ill_dlpi_done: unsolicited ack for %s from %s\n",
17699 dlpi_prim_str(prim), ill->ill_name);
17700 } else {
17701 (void) mi_strlog(ill->ill_rq, 1,
17702 SL_CONSOLE|SL_ERROR|SL_TRACE,
17703 "ill_dlpi_done: unexpected ack for %s from %s "
17704 "(expecting ack for %s)\n",
17705 dlpi_prim_str(prim), ill->ill_name,
17706 dlpi_prim_str(ill->ill_dlpi_pending));
17707 }
17708 return;
17709 }
17710
17711 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
17712 dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
17713
17714 if ((mp = ill->ill_dlpi_deferred) == NULL) {
17715 ill->ill_dlpi_pending = DL_PRIM_INVAL;
17716 return;
17717 }
17718
17719 ill->ill_dlpi_deferred = mp->b_next;
17720 mp->b_next = NULL;
17721
17722 ill_dlpi_dispatch(ill, mp);
17723 }
17724
17725 void
17726 conn_delete_ire(conn_t *connp, caddr_t arg)
17727 {
17728 ipif_t *ipif = (ipif_t *)arg;
17729 ire_t *ire;
17730
17731 /*
17732 * Look at the cached ires on conns which have pointers to ipifs.
17733 * We just call ire_refrele which clears up the reference
17734 * to the ire. Called when a conn closes. Also called from ipif_free
17735 * to cleanup indirect references to the stale ipif via the cached ire.
17736 */
17737 mutex_enter(&connp->conn_lock);
17738 ire = connp->conn_ire_cache;
17739 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
17740 connp->conn_ire_cache = NULL;
17741 mutex_exit(&connp->conn_lock);
17742 IRE_REFRELE_NOTR(ire);
17743 return;
17744 }
17745 mutex_exit(&connp->conn_lock);
17746
17747 }
17748
17749 /*
17750 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number
17751 * of IREs. Those IREs may have been previously cached in the conn structure.
17752 * This ipcl_walk() walker function releases all references to such IREs based
17753 * on the condemned flag.
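 * It is typically invoked as ipcl_walk(conn_cleanup_stale_ire, NULL),
 * as is done from ipif_down() below, after the relevant ires have
 * been deleted and marked IRE_MARK_CONDEMNED.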
17754 */
17755 /* ARGSUSED */
17756 void
17757 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
17758 {
17759 ire_t *ire;
17760
17761 mutex_enter(&connp->conn_lock);
17762 ire = connp->conn_ire_cache;
17763 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
17764 connp->conn_ire_cache = NULL;
17765 mutex_exit(&connp->conn_lock);
17766 IRE_REFRELE_NOTR(ire);
17767 return;
17768 }
17769 mutex_exit(&connp->conn_lock);
17770 }
17771
17772 /*
17773 * Take down a specific interface, but don't lose any information about it.
17774 * Also delete the interface from its interface group (ifgrp).
17775 * (Always called as writer.)
17776 * This function goes through the down sequence even if the interface is
17777 * already down. There are 2 reasons:
17778 * a. Currently we permit interface routes that depend on down interfaces
17779 * to be added. This behaviour itself is questionable. However it appears
17780 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
17781 * time. We go thru the cleanup in order to remove these routes.
17782 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
17783 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
17784 * down, but we need to clean up, i.e. do ill_dl_down and
17785 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
17786 *
17787 * IP-MT notes:
17788 *
17789 * Model of reference to interfaces.
17790 *
17791 * The following members in ipif_t track references to the ipif.
17792 * int ipif_refcnt; Active reference count
17793 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif
17794 * The following members in ill_t track references to the ill.
17795 * int ill_refcnt; active refcnt
17796 * uint_t ill_ire_cnt; Number of ires referencing ill
17797 * uint_t ill_nce_cnt; Number of nces referencing ill
17798 *
17799 * Reference to an ipif or ill can be obtained in any of the following ways.
17800 *
17801 * Through the ipif_lookup_* / ill_lookup_* lookup functions
17802 * Pointers to ipif / ill from other data structures viz ire and conn.
17803 * Implicit reference to the ipif / ill by holding a reference to the ire.
17804 *
17805 * The ipif/ill lookup functions return a reference-held ipif / ill.
17806 * ipif_refcnt and ill_refcnt track the reference counts respectively.
17807 * This is a purely dynamic reference count associated with threads holding
17808 * references to the ipif / ill. Pointers from other structures do not
17809 * count towards this reference count.
17810 *
17811 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the
17812 * ipif/ill. This is incremented whenever a new ire is created referencing the
17813 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is
17814 * actually added to the ire hash table. The count is decremented in
17815 * ire_inactive where the ire is destroyed.
17816 *
17817 * nce's reference ill's thru nce_ill and the count of nce's associated with
17818 * an ill is recorded in ill_nce_cnt. This is incremented atomically in
17819 * ndp_add() where the nce is actually added to the table. Similarly it is
17820 * decremented in ndp_inactive where the nce is destroyed.
17821 *
17822 * Flow of ioctls involving interface down/up
17823 *
17824 * The following is the sequence of an attempt to set some critical flags on an
17825 * up interface.
17826 * ip_sioctl_flags
17827 * ipif_down
17828 * wait for ipif to be quiescent
17829 * ipif_down_tail
17830 * ip_sioctl_flags_tail
17831 *
17832 * All set ioctls that involve down/up sequence would have a skeleton similar
17833 * to the above. All the *tail functions are called after the refcounts have
17834 * dropped to the appropriate values.
17835 *
17836 * The mechanism to quiesce an ipif is as follows.
17837 *
17838 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
17839 * on the ipif. Callers either pass a flag requesting wait or the lookup
17840 * functions will return NULL.
17841 *
17842 * Delete all ires referencing this ipif
17843 *
17844 * Any thread attempting to do an ipif_refhold on an ipif that has been
17845 * obtained thru a cached pointer will first make sure that
17846 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
17847 * increment the refcount.
17848 *
17849 * The above guarantees that the ipif refcount will eventually come down to
17850 * zero and the ipif will quiesce, once all threads that currently hold a
17851 * reference to the ipif refrelease the ipif. The ipif is quiescent after the
17852 * ipif_refcount has dropped to zero and all ire's associated with this ipif
17853 * have also been ire_inactive'd, i.e. when ipif_ire_cnt and ipif_refcnt both
17854 * drop to zero.
17855 *
17856 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
17857 *
17858 * Threads trying to lookup an ipif or ill can pass a flag requesting
17859 * wait and restart if the ipif / ill cannot be looked up currently.
17860 * E.g. bind and route operations (e.g. route add / delete) cannot return
17861 * failure if the ipif is currently undergoing an exclusive operation, and
17862 * hence pass the flag. The mblk is then enqueued in the ipsq and the operation
17863 * is restarted by ipsq_exit() when the currently exclusive ioctl completes.
17864 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
17865 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
17866 * change while the ill_lock is held. Before dropping the ill_lock we acquire
17867 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
17868 * until we release the ipsq_lock, even though the ill/ipif state flags
17869 * can change after we drop the ill_lock.
17870 *
17871 * An attempt to send out a packet using an ipif that is currently
17872 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
17873 * operation and restart it later when the exclusive condition on the ipif ends.
17874 * This is an example of not passing the wait flag to the lookup functions. For
17875 * example an attempt to refhold and use conn->conn_multicast_ipif and send
17876 * out a multicast packet on that ipif will fail while the ipif is
17877 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
17878 * currently IPIF_CHANGING will also fail.
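 *
 * To make the lookup rule above concrete, a minimal sketch of the
 * pattern used by the lookup functions (cf. ipif_lookup_on_name()
 * below) is:
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (IPIF_CAN_LOOKUP(ipif)) {
 *		ipif_refhold_locked(ipif);
 *		mutex_exit(&ill->ill_lock);
 *		... use the ipif, then ipif_refrele(ipif) ...
 *	} else {
 *		... fail, or enqueue the operation and wait ...
 *		mutex_exit(&ill->ill_lock);
 *	}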
17879 */
17880 int
17881 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
17882 {
17883 ill_t *ill = ipif->ipif_ill;
17884 phyint_t *phyi;
17885 conn_t *connp;
17886 boolean_t success;
17887 boolean_t ipif_was_up = B_FALSE;
17888
17889 ASSERT(IAM_WRITER_IPIF(ipif));
17890
17891 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
17892
17893 if (ipif->ipif_flags & IPIF_UP) {
17894 mutex_enter(&ill->ill_lock);
17895 ipif->ipif_flags &= ~IPIF_UP;
17896 ASSERT(ill->ill_ipif_up_count > 0);
17897 --ill->ill_ipif_up_count;
17898 mutex_exit(&ill->ill_lock);
17899 ipif_was_up = B_TRUE;
17900 /* Update status in SCTP's list */
17901 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
17902 }
17903
17904 /*
17905 * Blow away v6 memberships we established in ipif_multicast_up(); the
17906 * v4 ones are left alone (as is the ipif_multicast_up flag, so we
17907 * know not to rejoin when the interface is brought back up).
17908 */
17909 if (ipif->ipif_isv6)
17910 ipif_multicast_down(ipif);
17911 /*
17912 * Remove from the mapping for __sin6_src_id. We insert only
17913 * when the address is not INADDR_ANY. As IPv4 addresses are
17914 * stored as mapped addresses, we need to check for mapped
17915 * INADDR_ANY also.
17916 */
17917 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
17918 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
17919 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
17920 int err;
17921
17922 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
17923 ipif->ipif_zoneid);
17924 if (err != 0) {
17925 ip0dbg(("ipif_down: srcid_remove %d\n", err));
17926 }
17927 }
17928
17929 /*
17930 * Before we delete the ill from the group (if any), we need
17931 * to make sure that we delete all the routes dependent on
17932 * this and also any ipifs dependent on this ipif for
17933 * source address. We need to do this before we delete the ill from
17934 * the group because
17935 *
17936 * 1) ipif_down_delete_ire de-references ill->ill_group.
17937 *
17938 * 2) ipif_update_other_ipifs needs to walk the whole group
17939 * for re-doing source address selection. Note that
17940 * ipif_select_source[_v6] called from
17941 * ipif_update_other_ipifs[_v6] will not pick this ipif
17942 * because we have already marked it down here, i.e. cleared
17943 * IPIF_UP.
17944 */
17945 if (ipif->ipif_isv6)
17946 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);
17947 else
17948 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);
17949
17950 /*
17951 * These also need to be saved and restored when the
17952 * ipif is brought down and up.
17953 */
17954 mutex_enter(&ire_mrtun_lock);
17955 if (ire_mrtun_count != 0) {
17956 mutex_exit(&ire_mrtun_lock);
17957 ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire,
17958 (char *)ipif, NULL);
17959 } else {
17960 mutex_exit(&ire_mrtun_lock);
17961 }
17962
17963 mutex_enter(&ire_srcif_table_lock);
17964 if (ire_srcif_table_count > 0) {
17965 mutex_exit(&ire_srcif_table_lock);
17966 ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif);
17967 } else {
17968 mutex_exit(&ire_srcif_table_lock);
17969 }
17970
17971 /*
17972 * Cleaning up the conn_ire_cache or conns must be done only after the
17973 * ires have been deleted above. Otherwise a thread could end up
17974 * caching an ire in a conn after we have finished the cleanup of the
17975 * conn. The caching is done after making sure that the ire is not yet
17976 * condemned.
Also documented in the block comment above ip_output.
17977 */
17978 ipcl_walk(conn_cleanup_stale_ire, NULL);
17979 /* Also, delete the ires cached in SCTP */
17980 sctp_ire_cache_flush(ipif);
17981
17982 /* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */
17983 nattymod_clean_ipif(ipif);
17984
17985 /*
17986 * Update any other ipifs which have used "our" local address as
17987 * a source address. This entails removing and recreating IRE_INTERFACE
17988 * entries for such ipifs.
17989 */
17990 if (ipif->ipif_isv6)
17991 ipif_update_other_ipifs_v6(ipif, ill->ill_group);
17992 else
17993 ipif_update_other_ipifs(ipif, ill->ill_group);
17994
17995 if (ipif_was_up) {
17996 /*
17997 * Check whether it is the last ipif to leave this group.
17998 * If this is the last ipif to leave, we should remove
17999 * this ill from the group as ipif_select_source will not
18000 * be able to find any useful ipifs if this ill is selected
18001 * for load balancing.
18002 *
18003 * For nameless groups, we should call ifgrp_delete if this
18004 * belongs to some group. As this ipif is going down, we may
18005 * need to reconstruct groups.
18006 */
18007 phyi = ill->ill_phyint;
18008 /*
18009 * If the phyint_groupname_len is 0, it may or may not
18010 * be in the nameless group. If the phyint_groupname_len is
18011 * not 0, then this ill should be part of some group.
18012 * As we always insert this ill in the group if
18013 * phyint_groupname_len is not zero when the first ipif
18014 * comes up (in ipif_up_done), it should be in a group
18015 * when the namelen is not 0.
18016 *
18017 * NOTE: When we delete the ill from the group, it will
18018 * blow away all the IRE_CACHES pointing either at this ipif or
18019 * ill_wq (illgrp_cache_delete does this). Thus, no IREs
18020 * should be pointing at this ill.
18021 */
18022 ASSERT(phyi->phyint_groupname_len == 0 ||
18023 (phyi->phyint_groupname != NULL && ill->ill_group != NULL));
18024
18025 if (phyi->phyint_groupname_len != 0) {
18026 if (ill->ill_ipif_up_count == 0)
18027 illgrp_delete(ill);
18028 }
18029
18030 /*
18031 * If we have deleted some of the broadcast ires associated
18032 * with this ipif, we need to re-nominate somebody else if
18033 * the ires that we deleted were the nominated ones.
18034 */
18035 if (ill->ill_group != NULL && !ill->ill_isv6)
18036 ipif_renominate_bcast(ipif);
18037 }
18038
18039 /*
18040 * Take down the neighbor-discovery or arp entries for this interface.
18041 */
18042 ipif_ndp_down(ipif);
18043
18044 /*
18045 * If mp is NULL the caller will wait for the appropriate refcnt.
18046 * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down
18047 * and ill_delete -> ipif_free -> ipif_down
18048 */
18049 if (mp == NULL) {
18050 ASSERT(q == NULL);
18051 return (0);
18052 }
18053
18054 if (CONN_Q(q)) {
18055 connp = Q_TO_CONN(q);
18056 mutex_enter(&connp->conn_lock);
18057 } else {
18058 connp = NULL;
18059 }
18060 mutex_enter(&ill->ill_lock);
18061 /*
18062 * Are there any ire's pointing to this ipif that are still active?
18063 * If this is the last ipif going down, are there any ire's pointing
18064 * to this ill that are still active?
18065 */
18066 if (ipif_is_quiescent(ipif)) {
18067 mutex_exit(&ill->ill_lock);
18068 if (connp != NULL)
18069 mutex_exit(&connp->conn_lock);
18070 return (0);
18071 }
18072
18073 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
18074 ill->ill_name, (void *)ill));
18075 /*
18076 * Enqueue the mp atomically in ipsq_pending_mp.
When the refcount
18077 * drops down, the operation will be restarted by ipif_ill_refrele_tail
18078 * which in turn is called by the last refrele on the ipif/ill/ire.
18079 */
18080 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
18081 if (!success) {
18082 /* The conn is closing. So just return */
18083 ASSERT(connp != NULL);
18084 mutex_exit(&ill->ill_lock);
18085 mutex_exit(&connp->conn_lock);
18086 return (EINTR);
18087 }
18088
18089 mutex_exit(&ill->ill_lock);
18090 if (connp != NULL)
18091 mutex_exit(&connp->conn_lock);
18092 return (EINPROGRESS);
18093 }
18094
18095 void
18096 ipif_down_tail(ipif_t *ipif)
18097 {
18098 ill_t *ill = ipif->ipif_ill;
18099
18100 /*
18101 * Skip any loopback interface (null wq).
18102 * If this is the last logical interface on the ill,
18103 * have ill_dl_down tell the driver we are gone (unbind).
18104 * Note that lun 0 can ipif_down even though
18105 * there are other logical units that are up.
18106 * This occurs e.g. when we change a "significant" IFF_ flag.
18107 */
18108 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
18109 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
18110 ill->ill_dl_up) {
18111 ill_dl_down(ill);
18112 }
18113 ill->ill_logical_down = 0;
18114
18115 /*
18116 * This has to be done after removing the routes in ipif_down_delete_ire.
18117 */
18118 if (ipif->ipif_isv6) {
18119 if (ill->ill_flags & ILLF_XRESOLV)
18120 ipif_arp_down(ipif);
18121 } else {
18122 ipif_arp_down(ipif);
18123 }
18124
18125 ip_rts_ifmsg(ipif);
18126 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif);
18127 }
18128
18129 /*
18130 * Bring the interface logically down without bringing the physical interface
18131 * down, e.g. when the netmask is changed. This avoids long-lasting link
18132 * negotiations between an ethernet interface and certain switches.
18133 */
18134 static int
18135 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
18136 {
18137 /*
18138 * The ill_logical_down flag is a transient flag. It is set here
18139 * and is cleared once the down has completed in ipif_down_tail.
18140 * This flag does not indicate whether the ill stream is in the
18141 * DL_BOUND state with the driver. Instead this flag is used by
18142 * ipif_down_tail to determine whether to DL_UNBIND the stream with
18143 * the driver. The state of the ill stream i.e. whether it is
18144 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
18145 */
18146 ipif->ipif_ill->ill_logical_down = 1;
18147 return (ipif_down(ipif, q, mp));
18148 }
18149
18150 /*
18151 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP.
18152 * Whether or not the usesrc client ILL is already part of a usesrc group,
18153 * an ire_stq with the matching usesrc client ILL will
18154 * locate the IRE's that need to be deleted. We want IREs to be created
18155 * with the new source address.
18156 */
18157 static void
18158 ipif_delete_cache_ire(ire_t *ire, char *ill_arg)
18159 {
18160 ill_t *ucill = (ill_t *)ill_arg;
18161
18162 ASSERT(IAM_WRITER_ILL(ucill));
18163
18164 if (ire->ire_stq == NULL)
18165 return;
18166
18167 if ((ire->ire_type == IRE_CACHE) &&
18168 ((ill_t *)ire->ire_stq->q_ptr == ucill))
18169 ire_delete(ire);
18170 }
18171
18172 /*
18173 * ire_walk routine to delete every IRE dependent on the interface
18174 * address that is going down. (Always called as writer.)
18175 * Works for both v4 and v6.
18176 * In addition to checking for ire_ipif matches, it also checks for
18177 * IRE_CACHE entries which have the same source address as the
18178 * disappearing ipif, since ipif_select_source might have picked
18179 * that source. Note that ipif_down/ipif_update_other_ipifs takes
18180 * care of any IRE_INTERFACE with the disappearing source address.
18181 */
18182 static void
18183 ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
18184 {
18185 ipif_t *ipif = (ipif_t *)ipif_arg;
18186 ill_t *ire_ill;
18187 ill_t *ipif_ill;
18188
18189 ASSERT(IAM_WRITER_IPIF(ipif));
18190 if (ire->ire_ipif == NULL)
18191 return;
18192
18193 /*
18194 * For IPv4, we derive source addresses for an IRE from ipif's
18195 * belonging to the same IPMP group as the IRE's outgoing
18196 * interface. If an IRE's outgoing interface isn't in the
18197 * same IPMP group as a particular ipif, then that ipif
18198 * couldn't have been used as a source address for this IRE.
18199 *
18200 * For IPv6, source addresses are only restricted to the IPMP group
18201 * if the IRE is for a link-local address or a multicast address.
18202 * Otherwise, source addresses for an IRE can be chosen from
18203 * interfaces other than the outgoing interface for that IRE.
18204 *
18205 * For source address selection details, see ipif_select_source()
18206 * and ipif_select_source_v6().
18207 */
18208 if (ire->ire_ipversion == IPV4_VERSION ||
18209 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) ||
18210 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
18211 ire_ill = ire->ire_ipif->ipif_ill;
18212 ipif_ill = ipif->ipif_ill;
18213
18214 if (ire_ill->ill_group != ipif_ill->ill_group) {
18215 return;
18216 }
18217 }
18218
18219
18220 if (ire->ire_ipif != ipif) {
18221 /*
18222 * Look for a matching source address.
18223 */
18224 if (ire->ire_type != IRE_CACHE)
18225 return;
18226 if (ipif->ipif_flags & IPIF_NOLOCAL)
18227 return;
18228
18229 if (ire->ire_ipversion == IPV4_VERSION) {
18230 if (ire->ire_src_addr != ipif->ipif_src_addr)
18231 return;
18232 } else {
18233 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
18234 &ipif->ipif_v6lcl_addr))
18235 return;
18236 }
18237 ire_delete(ire);
18238 return;
18239 }
18240 /*
18241 * ire_delete() will do an ire_flush_cache which will delete
18242 * all ire_ipif matches
18243 */
18244 ire_delete(ire);
18245 }
18246
18247 /*
18248 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when
18249 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or
18250 * 2) when an interface is brought up or down (on that ill).
18251 * This ensures that the IRE_CACHE entries don't retain stale source
18252 * address selection results.
18253 */
18254 void
18255 ill_ipif_cache_delete(ire_t *ire, char *ill_arg)
18256 {
18257 ill_t *ill = (ill_t *)ill_arg;
18258 ill_t *ipif_ill;
18259
18260 ASSERT(IAM_WRITER_ILL(ill));
18261 /*
18262 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
18263 * Hence this should be IRE_CACHE.
18264 */
18265 ASSERT(ire->ire_type == IRE_CACHE);
18266
18267 /*
18268 * We are called for IRE_CACHES whose ire_ipif matches ill.
18269 * We are only interested in IRE_CACHES that have borrowed
18270 * the source address from ill_arg, e.g. ipif_up_done[_v6],
18271 * for which we need to look at whether ire_ipif->ipif_ill matches
18272 * ill.
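 * (An IRE_CACHE whose ire_ipif sits on any ill in the same IPMP
 * group as ill may have borrowed our source address, hence the
 * ill_group comparison below.)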
18273 */
18274 ASSERT(ire->ire_ipif != NULL);
18275 ipif_ill = ire->ire_ipif->ipif_ill;
18276 if (ipif_ill == ill || (ill->ill_group != NULL &&
18277 ipif_ill->ill_group == ill->ill_group)) {
18278 ire_delete(ire);
18279 }
18280 }
18281
18282 /*
18283 * Delete all the ires whose stq references ill_arg.
18284 */
18285 static void
18286 ill_stq_cache_delete(ire_t *ire, char *ill_arg)
18287 {
18288 ill_t *ill = (ill_t *)ill_arg;
18289 ill_t *ire_ill;
18290
18291 ASSERT(IAM_WRITER_ILL(ill));
18292 /*
18293 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
18294 * Hence this should be IRE_CACHE.
18295 */
18296 ASSERT(ire->ire_type == IRE_CACHE);
18297
18298 /*
18299 * We are called for IRE_CACHES whose ire_stq and ire_ipif
18300 * match ill. We are only interested in IRE_CACHES that
18301 * have ire_stq->q_ptr pointing at ill_arg. Thus we do the
18302 * filtering here.
18303 */
18304 ire_ill = (ill_t *)ire->ire_stq->q_ptr;
18305
18306 if (ire_ill == ill)
18307 ire_delete(ire);
18308 }
18309
18310 /*
18311 * This is called when an ill leaves the group. We want to delete
18312 * all IRE_CACHES whose stq is pointing at ill_wq or whose ire_ipif is
18313 * pointing at ill.
18314 */
18315 static void
18316 illgrp_cache_delete(ire_t *ire, char *ill_arg)
18317 {
18318 ill_t *ill = (ill_t *)ill_arg;
18319
18320 ASSERT(IAM_WRITER_ILL(ill));
18321 ASSERT(ill->ill_group == NULL);
18322 /*
18323 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4.
18324 * Hence this should be IRE_CACHE.
18325 */
18326 ASSERT(ire->ire_type == IRE_CACHE);
18327 /*
18328 * We are called for IRE_CACHES whose ire_stq and ire_ipif
18329 * match ill. We are interested in both.
18330 */
18331 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) ||
18332 (ire->ire_ipif->ipif_ill == ill));
18333
18334 ire_delete(ire);
18335 }
18336
18337 /*
18338 * Initiate deallocate of an IPIF. Always called as writer. Called by
18339 * ill_delete or ip_sioctl_removeif.
18340 */
18341 static void
18342 ipif_free(ipif_t *ipif)
18343 {
18344 ASSERT(IAM_WRITER_IPIF(ipif));
18345
18346 if (ipif->ipif_recovery_id != 0)
18347 (void) untimeout(ipif->ipif_recovery_id);
18348 ipif->ipif_recovery_id = 0;
18349
18350 /* Remove conn references */
18351 reset_conn_ipif(ipif);
18352
18353 /*
18354 * Make sure we have valid net and subnet broadcast ire's for the
18355 * other ipif's which share them with this ipif.
18356 */
18357 if (!ipif->ipif_isv6)
18358 ipif_check_bcast_ires(ipif);
18359
18360 /*
18361 * Take down the interface. We can be called either from ill_delete
18362 * or from ip_sioctl_removeif.
18363 */
18364 (void) ipif_down(ipif, NULL, NULL);
18365
18366 rw_enter(&ill_g_lock, RW_WRITER);
18367 /* Remove pointers to this ill in the multicast routing tables */
18368 reset_mrt_vif_ipif(ipif);
18369 rw_exit(&ill_g_lock);
18370 }
18371
18372 static void
18373 ipif_free_tail(ipif_t *ipif)
18374 {
18375 mblk_t *mp;
18376 ipif_t **ipifp;
18377
18378 /*
18379 * Free state for additional IRE_IF_[NO]RESOLVER ire's.
18380 */
18381 mutex_enter(&ipif->ipif_saved_ire_lock);
18382 mp = ipif->ipif_saved_ire_mp;
18383 ipif->ipif_saved_ire_mp = NULL;
18384 mutex_exit(&ipif->ipif_saved_ire_lock);
18385 freemsg(mp);
18386
18387 /*
18388 * Need to hold both ill_g_lock and ill_lock while
18389 * inserting or removing an ipif from the linked list
18390 * of ipifs hanging off the ill.
18391 */
18392 rw_enter(&ill_g_lock, RW_WRITER);
18393 /*
18394 * Remove all multicast memberships on the interface now.
18395 * This removes IPv4 multicast memberships joined within
18396 * the kernel, as ipif_down does not do ipif_multicast_down
18397 * for IPv4. IPv6 is not handled here, as the multicast memberships
18398 * are based on ill and not on ipif.
18399 */
18400 ilm_free(ipif);
18401
18402 /*
18403 * Since we held the ill_g_lock while doing the ilm_free above,
18404 * we can assert the ilms were really deleted and not just marked
18405 * ILM_DELETED.
18406 */
18407 ASSERT(ilm_walk_ipif(ipif) == 0);
18408
18409
18410 IPIF_TRACE_CLEANUP(ipif);
18411
18412 /* Ask SCTP to take it out of its list */
18413 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
18414
18415 mutex_enter(&ipif->ipif_ill->ill_lock);
18416 /* Get it out of the ILL interface list. */
18417 ipifp = &ipif->ipif_ill->ill_ipif;
18418 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
18419 if (*ipifp == ipif) {
18420 *ipifp = ipif->ipif_next;
18421 break;
18422 }
18423 }
18424
18425 mutex_exit(&ipif->ipif_ill->ill_lock);
18426 rw_exit(&ill_g_lock);
18427
18428 mutex_destroy(&ipif->ipif_saved_ire_lock);
18429
18430 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
18431
18432 /* Free the memory. */
18433 mi_free((char *)ipif);
18434 }
18435
18436 /*
18437 * Returns an ipif name in the form "ill_name:unit" if ipif_id is not zero,
18438 * "ill_name" otherwise.
18439 */
18440 char *
18441 ipif_get_name(const ipif_t *ipif, char *buf, int len)
18442 {
18443 char lbuf[32];
18444 char *name;
18445 size_t name_len;
18446
18447 buf[0] = '\0';
18448 if (!ipif)
18449 return (buf);
18450 name = ipif->ipif_ill->ill_name;
18451 name_len = ipif->ipif_ill->ill_name_length;
18452 if (ipif->ipif_id != 0) {
18453 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
18454 ipif->ipif_id);
18455 name = lbuf;
18456 name_len = mi_strlen(name) + 1;
18457 }
18458 len -= 1;
18459 buf[len] = '\0';
18460 len = MIN(len, name_len);
18461 bcopy(name, buf, len);
18462 return (buf);
18463 }
18464
18465 /*
18466 * Find an IPIF based on the name passed in. Names can be of the
18467 * form <phys> (e.g., le0) or <phys>:<#> (e.g., le0:1).
18468 * The <phys> string can have forms like <dev><#> (e.g., le0),
18469 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3).
18470 * When there is no colon, the implied unit id is zero. <phys> must
18471 * correspond to the name of an ILL. (May be called as writer.)
18472 */
18473 static ipif_t *
18474 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
18475 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q,
18476 mblk_t *mp, ipsq_func_t func, int *error)
18477 {
18478 char *cp;
18479 char *endp;
18480 long id;
18481 ill_t *ill;
18482 ipif_t *ipif;
18483 uint_t ire_type;
18484 boolean_t did_alloc = B_FALSE;
18485 ipsq_t *ipsq;
18486
18487 if (error != NULL)
18488 *error = 0;
18489
18490 /*
18491 * If the caller wants us to create the ipif, make sure we have a
18492 * valid zoneid.
18493 */
18494 ASSERT(!do_alloc || zoneid != ALL_ZONES);
18495
18496 if (namelen == 0) {
18497 if (error != NULL)
18498 *error = ENXIO;
18499 return (NULL);
18500 }
18501
18502 *exists = B_FALSE;
18503 /* Look for a colon in the name. */
18504 endp = &name[namelen];
18505 for (cp = endp; --cp > name; ) {
18506 if (*cp == IPIF_SEPARATOR_CHAR)
18507 break;
18508 }
18509
18510 if (*cp == IPIF_SEPARATOR_CHAR) {
18511 /*
18512 * Reject any non-decimal aliases for logical
18513 * interfaces.
		 * Aliases with leading zeroes
		 * are also rejected as they introduce ambiguity
		 * in the naming of the interfaces.
		 * In order to conform to existing semantics,
		 * and to not break any programs/scripts relying
		 * on that behaviour, if<0>:0 is considered to be
		 * a valid interface.
		 *
		 * If the alias has two or more digits and the first
		 * is zero, fail.
		 */
		if (&cp[2] < endp && cp[1] == '0')
			return (NULL);
	}

	if (cp <= name) {
		cp = endp;
	} else {
		*cp = '\0';
	}

	/*
	 * Look up the ILL, based on the portion of the name
	 * before the colon. ill_lookup_on_name returns a held ill.
	 * did_alloc is temporary, to check whether the ill already
	 * exists. If so, ill_lookup_on_name will clear it.
	 */
	ill = ill_lookup_on_name(name, do_alloc, isv6,
	    q, mp, func, error, &did_alloc);
	if (cp != endp)
		*cp = IPIF_SEPARATOR_CHAR;
	if (ill == NULL)
		return (NULL);

	/* Establish the unit number in the name. */
	id = 0;
	if (cp < endp && *endp == '\0') {
		/* If there was a colon, the unit number follows. */
		cp++;
		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
			ill_refrele(ill);
			if (error != NULL)
				*error = ENXIO;
			return (NULL);
		}
	}

	GRAB_CONN_LOCK(q);
	mutex_enter(&ill->ill_lock);
	/* Now see if there is an IPIF with this unit number. */
	for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) {
		if (ipif->ipif_id == id) {
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				mutex_exit(&ill->ill_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				if (error != NULL)
					*error = ENXIO;
				return (NULL);
			}
			/*
			 * The block comment at the start of ipif_down
			 * explains the use of the macros used below.
			 */
			if (IPIF_CAN_LOOKUP(ipif)) {
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				if (!did_alloc)
					*exists = B_TRUE;
				/*
				 * Drop locks before calling ill_refrele
				 * since it can potentially call into
				 * ipif_ill_refrele_tail which can end up
				 * in trying to acquire any lock.
				 */
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				return (ipif);
			} else if (IPIF_CAN_WAIT(ipif, q)) {
				ipsq = ill->ill_phyint->phyint_ipsq;
				mutex_enter(&ipsq->ipsq_lock);
				mutex_exit(&ill->ill_lock);
				ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
				mutex_exit(&ipsq->ipsq_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				*error = EINPROGRESS;
				return (NULL);
			}
		}
	}
	RELEASE_CONN_LOCK(q);

	if (!do_alloc) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		if (error != NULL)
			*error = ENXIO;
		return (NULL);
	}

	/*
	 * If none found, atomically allocate and return a new one.
	 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
	 * to support "receive only" use of lo0:1 etc. as is still done
	 * below as an initial guess.
	 * However, this is now likely to be overridden later in
	 * ipif_up_done() when we know for sure what address has been
	 * configured on the interface, since we might have more than one
	 * loopback interface with a loopback address, e.g. in the case of
	 * zones, and all the interfaces with loopback addresses need to be
	 * marked IRE_LOOPBACK.
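	 *
	 * A sketch of that final classification, for orientation only
	 * (this mirrors the code later in ipif_up_done() and is not part
	 * of the lookup logic here):
	 *
	 *	if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
	 *	    htonl(INADDR_LOOPBACK))
	 *		ipif->ipif_ire_type = IRE_LOOPBACK;
	 *	else
	 *		ipif->ipif_ire_type = IRE_LOCAL;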
	 */
	if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
		ire_type = IRE_LOOPBACK;
	else
		ire_type = IRE_LOCAL;
	ipif = ipif_allocate(ill, id, ire_type, B_TRUE);
	if (ipif != NULL)
		ipif_refhold_locked(ipif);
	else if (error != NULL)
		*error = ENOMEM;
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}

/*
 * This routine is called whenever a new address comes up on an ipif. If
 * we are configured to respond to address mask requests, then we are supposed
 * to broadcast an address mask reply at this time. This routine is also
 * called if we are already up, but a netmask change is made. This is legal
 * but might not make the system manager very popular. (May be called
 * as writer.)
 */
void
ipif_mask_reply(ipif_t *ipif)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	mblk_t	*mp;

#define	REPLY_LEN	(sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)

	if (!ip_respond_to_address_mask_broadcast)
		return;

	/* ICMP mask reply is IPv4 only */
	ASSERT(!ipif->ipif_isv6);
	/* ICMP mask reply is not for a loopback interface */
	ASSERT(ipif->ipif_ill->ill_wq != NULL);

	mp = allocb(REPLY_LEN, BPRI_HI);
	if (mp == NULL)
		return;
	mp->b_wptr = mp->b_rptr + REPLY_LEN;

	ipha = (ipha_t *)mp->b_rptr;
	bzero(ipha, REPLY_LEN);
	*ipha = icmp_ipha;
	ipha->ipha_ttl = ip_broadcast_ttl;
	ipha->ipha_src = ipif->ipif_src_addr;
	ipha->ipha_dst = ipif->ipif_brd_addr;
	ipha->ipha_length = htons(REPLY_LEN);
	ipha->ipha_ident = 0;

	icmph = (icmph_t *)&ipha[1];
	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
	if (icmph->icmph_checksum == 0)
		icmph->icmph_checksum = 0xffff;

	put(ipif->ipif_wq, mp);

#undef	REPLY_LEN
}

/*
 * When the mtu in the ipif changes, we call this routine through ire_walk
 * to update all the relevant IREs.
 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
 */
static void
ipif_mtu_change(ire_t *ire, char *ipif_arg)
{
	ipif_t *ipif = (ipif_t *)ipif_arg;

	if (ire->ire_stq == NULL || ire->ire_ipif != ipif)
		return;
	ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET);
}

/*
 * When the mtu in the ill changes, we call this routine through ire_walk
 * to update all the relevant IREs.
 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
 */
void
ill_mtu_change(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill)
		return;
	ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
}

/*
 * Join the ipif specific multicast groups.
 * Must be called after a mapping has been set up in the resolver. (Always
 * called as writer.)
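 *
 * As an illustration (a sketch derived from the v6 branch below, not
 * additional logic): for local address fe80::1:2:3:4 the all-hosts group
 * joined is ff02::1, and the solicited-node group is built by folding the
 * low 32 bits of the local address into ipv6_solicited_node_mcast
 * (ff02::1:ff00:0):
 *
 *	grp = ipv6_solicited_node_mcast;
 *	grp.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
 *
 * which yields ff02::1:ff03:4 here; only the low 24 bits of the address
 * vary, as the template already carries the leading 0xff byte.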
 */
void
ipif_multicast_up(ipif_t *ipif)
{
	int err, index;
	ill_t *ill;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	index = ill->ill_phyint->phyint_ifindex;

	ip1dbg(("ipif_multicast_up\n"));
	if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up)
		return;

	if (ipif->ipif_isv6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
			return;

		/* Join the all hosts multicast address */
		ip1dbg(("ipif_multicast_up - addmulti\n"));
		/*
		 * Passing B_TRUE means we have to join the multicast
		 * membership on this interface even though this is
		 * FAILED. If we join on a different one in the group,
		 * we will not be able to delete the membership later
		 * as we currently don't track where we join when we
		 * join within the kernel, unlike applications where
		 * we have ilg/ilg_orig_index. See ip_addmulti_v6
		 * for more on this.
		 */
		err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index,
		    ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
		if (err != 0) {
			ip0dbg(("ipif_multicast_up: "
			    "all_hosts_mcast failed %d\n",
			    err));
			return;
		}
		/*
		 * Enable multicast for the solicited node multicast address
		 */
		if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
			in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;

			ipv6_multi.s6_addr32[3] |=
			    ipif->ipif_v6lcl_addr.s6_addr32[3];

			err = ip_addmulti_v6(&ipv6_multi, ill, index,
			    ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE,
			    NULL);
			if (err != 0) {
				ip0dbg(("ipif_multicast_up: solicited MC"
				    " failed %d\n", err));
				(void) ip_delmulti_v6(&ipv6_all_hosts_mcast,
				    ill, ill->ill_phyint->phyint_ifindex,
				    ipif->ipif_zoneid, B_TRUE, B_TRUE);
				return;
			}
		}
	} else {
		if (ipif->ipif_lcl_addr == INADDR_ANY)
			return;

		/* Join the all hosts multicast address */
		ip1dbg(("ipif_multicast_up - addmulti\n"));
		err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif,
		    ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
		if (err) {
			ip0dbg(("ipif_multicast_up: failed %d\n", err));
			return;
		}
	}
	ipif->ipif_multicast_up = 1;
}

/*
 * Blow away any IPv6 multicast groups that we joined in ipif_multicast_up();
 * any explicit memberships are blown away in ill_leave_multicast() when the
 * ill is brought down.
 */
static void
ipif_multicast_down(ipif_t *ipif)
{
	int err;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_multicast_down\n"));
	if (!ipif->ipif_multicast_up)
		return;

	ASSERT(ipif->ipif_isv6);

	ip1dbg(("ipif_multicast_down - delmulti\n"));

	/*
	 * Leave the all hosts multicast address. Similar to ip_addmulti_v6,
	 * we should look for ilms on this ill rather than the ones that have
	 * been failed over here. They are here temporarily. As
	 * ipif_multicast_up has joined on this ill, we should delete only
	 * from this ill.
	 */
	err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
	    ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid,
	    B_TRUE, B_TRUE);
	if (err != 0) {
		ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n",
		    err));
	}
	/*
	 * Disable multicast for the solicited node multicast address
	 */
	if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
		in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;

		ipv6_multi.s6_addr32[3] |=
		    ipif->ipif_v6lcl_addr.s6_addr32[3];

		err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill,
		    ipif->ipif_ill->ill_phyint->phyint_ifindex,
		    ipif->ipif_zoneid, B_TRUE, B_TRUE);

		if (err != 0) {
			ip0dbg(("ipif_multicast_down: sol MC failed %d\n",
			    err));
		}
	}

	ipif->ipif_multicast_up = 0;
}

/*
 * Used when an interface comes up to recreate any extra routes on this
 * interface.
 */
static ire_t **
ipif_recover_ire(ipif_t *ipif)
{
	mblk_t	*mp;
	ire_t	**ipif_saved_irep;
	ire_t	**irep;

	ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name,
	    ipif->ipif_id));

	mutex_enter(&ipif->ipif_saved_ire_lock);
	ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) *
	    ipif->ipif_saved_ire_cnt, KM_NOSLEEP);
	if (ipif_saved_irep == NULL) {
		mutex_exit(&ipif->ipif_saved_ire_lock);
		return (NULL);
	}

	irep = ipif_saved_irep;
	for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
		ire_t		*ire;
		queue_t		*rfq;
		queue_t		*stq;
		ifrt_t		*ifrt;
		uchar_t		*src_addr;
		uchar_t		*gateway_addr;
		mblk_t		*resolver_mp;
		ushort_t	type;

		/*
		 * When the ire was initially created and then added in
		 * ip_rt_add(), it was created either using ipif->ipif_net_type
		 * in the case of a traditional interface route, or as one of
		 * the IRE_OFFSUBNET types (with the exception of
		 * IRE_HOST_REDIRECT which is created by icmp_redirect() and
		 * which we don't need to save or recover). In the case where
		 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update
		 * the ire_type to IRE_IF_NORESOLVER before calling ire_add()
		 * to satisfy software like GateD and Sun Cluster which create
		 * routes using the loopback interface's address as a
		 * gateway.
		 *
		 * As ifrt->ifrt_type reflects the already updated ire_type and
		 * since ire_create() expects that IRE_IF_NORESOLVER will have
		 * a valid nce_res_mp field (which doesn't make sense for an
		 * IRE_LOOPBACK), ire_create() will be called in the same way
		 * here as in ip_rt_add(), namely using ipif->ipif_net_type when
		 * the route looks like a traditional interface route (where
		 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using
		 * the saved ifrt->ifrt_type. This means that in the case where
		 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by
		 * ire_create() will be an IRE_LOOPBACK; it will then be turned
		 * into an IRE_IF_NORESOLVER and then added by ire_add().
		 */
		ifrt = (ifrt_t *)mp->b_rptr;
		if (ifrt->ifrt_type & IRE_INTERFACE) {
			rfq = NULL;
			stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
			    ? ipif->ipif_rq : ipif->ipif_wq;
			src_addr = (ifrt->ifrt_flags & RTF_SETSRC) ?
			    (uint8_t *)&ifrt->ifrt_src_addr :
			    (uint8_t *)&ipif->ipif_src_addr;
			gateway_addr = NULL;
			resolver_mp = ipif->ipif_resolver_mp;
			type = ipif->ipif_net_type;
		} else if (ifrt->ifrt_type & IRE_BROADCAST) {
			/* Recover multiroute broadcast IRE. */
			rfq = ipif->ipif_rq;
			stq = ipif->ipif_wq;
			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
			    ? (uint8_t *)&ifrt->ifrt_src_addr
			    : (uint8_t *)&ipif->ipif_src_addr;
			gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
			resolver_mp = ipif->ipif_bcast_mp;
			type = ifrt->ifrt_type;
		} else {
			rfq = NULL;
			stq = NULL;
			src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
			    ? (uint8_t *)&ifrt->ifrt_src_addr : NULL;
			gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
			resolver_mp = NULL;
			type = ifrt->ifrt_type;
		}

		/*
		 * Create a copy of the IRE with the saved address and netmask.
		 */
		ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for "
		    "0x%x/0x%x\n",
		    ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type,
		    ntohl(ifrt->ifrt_addr),
		    ntohl(ifrt->ifrt_mask)));
		ire = ire_create(
		    (uint8_t *)&ifrt->ifrt_addr,
		    (uint8_t *)&ifrt->ifrt_mask,
		    src_addr,
		    gateway_addr,
		    NULL,
		    &ifrt->ifrt_max_frag,
		    NULL,
		    rfq,
		    stq,
		    type,
		    resolver_mp,
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    ifrt->ifrt_flags,
		    &ifrt->ifrt_iulp_info,
		    NULL,
		    NULL);

		if (ire == NULL) {
			mutex_exit(&ipif->ipif_saved_ire_lock);
			kmem_free(ipif_saved_irep,
			    ipif->ipif_saved_ire_cnt * sizeof (ire_t *));
			return (NULL);
		}

		/*
		 * Some software (for example, GateD and Sun Cluster) attempts
		 * to create (what amount to) IRE_PREFIX routes with the
		 * loopback address as the gateway. This is primarily done to
		 * set up prefixes with the RTF_REJECT flag set (for example,
		 * when generating aggregate routes.)
		 *
		 * If the IRE type (as defined by ipif->ipif_net_type) is
		 * IRE_LOOPBACK, then we map the request into an
		 * IRE_IF_NORESOLVER.
		 */
		if (ipif->ipif_net_type == IRE_LOOPBACK)
			ire->ire_type = IRE_IF_NORESOLVER;
		/*
		 * The ire is held by ire_add; it will be refrele'd
		 * towards the end of ipif_up_done.
		 */
		(void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
		*irep = ire;
		irep++;
		ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire));
	}
	mutex_exit(&ipif->ipif_saved_ire_lock);
	return (ipif_saved_irep);
}

/*
 * Used to set the netmask and broadcast address to default values when the
 * interface is brought up. (Always called as writer.)
 */
static void
ipif_set_default(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	if (!ipif->ipif_isv6) {
		/*
		 * Interface holds an IPv4 address. Default
		 * mask is the natural netmask.
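		 *
		 * For orientation only (assumes the classful behaviour
		 * of ip_net_mask(); not additional logic):
		 *
		 *	10.1.2.3    -> 255.0.0.0      (class A)
		 *	172.16.1.2  -> 255.255.0.0    (class B)
		 *	192.168.1.2 -> 255.255.255.0  (class C)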
		 */
		if (!ipif->ipif_net_mask) {
			ipaddr_t	v4mask;

			v4mask = ip_net_mask(ipif->ipif_lcl_addr);
			V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
		}
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
		/*
		 * NOTE: SunOS 4.X does this even if the broadcast address
		 * has been already set thus we do the same here.
		 */
		if (ipif->ipif_flags & IPIF_BROADCAST) {
			ipaddr_t	v4addr;

			v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
			IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
		}
	} else {
		/*
		 * Interface holds an IPv6-only address. Default
		 * mask is all-ones.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
			ipif->ipif_v6net_mask = ipv6_all_ones;
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
	}
}

/*
 * Return 0 if this address can be used as local address without causing
 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
 * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
 * Special checks are needed to allow the same IPv6 link-local address
 * on different ills.
 * TODO: allowing the same site-local address on different ill's.
 */
int
ip_addr_availability_check(ipif_t *new_ipif)
{
	in6_addr_t our_v6addr;
	ill_t	*ill;
	ipif_t	*ipif;
	ill_walk_context_t ctx;

	ASSERT(IAM_WRITER_IPIF(new_ipif));
	ASSERT(MUTEX_HELD(&ip_addr_avail_lock));
	ASSERT(RW_READ_HELD(&ill_g_lock));

	new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
	if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
		return (0);

	our_v6addr = new_ipif->ipif_v6lcl_addr;

	if (new_ipif->ipif_isv6)
		ill = ILL_START_WALK_V6(&ctx);
	else
		ill = ILL_START_WALK_V4(&ctx);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if ((ipif == new_ipif) ||
			    !(ipif->ipif_flags & IPIF_UP) ||
			    (ipif->ipif_flags & IPIF_UNNUMBERED))
				continue;
			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    &our_v6addr)) {
				if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
					new_ipif->ipif_flags |= IPIF_UNNUMBERED;
				else if (ipif->ipif_flags & IPIF_POINTOPOINT)
					ipif->ipif_flags |= IPIF_UNNUMBERED;
				else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) &&
				    new_ipif->ipif_ill != ill)
					continue;
				else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) &&
				    new_ipif->ipif_ill != ill)
					continue;
				else if (new_ipif->ipif_zoneid !=
				    ipif->ipif_zoneid &&
				    ipif->ipif_zoneid != ALL_ZONES &&
				    (ill->ill_phyint->phyint_flags &
				    PHYI_LOOPBACK))
					continue;
				else if (new_ipif->ipif_ill == ill)
					return (EADDRINUSE);
				else
					return (EADDRNOTAVAIL);
			}
		}
	}

	return (0);
}

/*
 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
 * IREs for the ipif.
 * When the routine returns EINPROGRESS then mp has been consumed and
 * the ioctl will be acked from ip_rput_dlpi.
 */
static int
ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	ill_t	*ill = ipif->ipif_ill;
	boolean_t isv6 = ipif->ipif_isv6;
	int	err = 0;
	boolean_t success;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));

	/* Shouldn't get here if it is already up. */
	if (ipif->ipif_flags & IPIF_UP)
		return (EALREADY);

	/* Skip arp/ndp for any loopback interface. */
	if (ill->ill_wq != NULL) {
		conn_t *connp = Q_TO_CONN(q);
		ipsq_t	*ipsq = ill->ill_phyint->phyint_ipsq;

		if (!ill->ill_dl_up) {
			/*
			 * ill_dl_up is not yet set, i.e. we have yet to
			 * DL_BIND with the driver and this is the first
			 * logical interface on the ill to become "up".
			 * Tell the driver to get going (via DL_BIND_REQ).
			 * Note that changing "significant" IFF_ flags
			 * (address/netmask etc.) causes a down/up dance,
			 * but does not cause an unbind (DL_UNBIND) with
			 * the driver.
			 */
			return (ill_dl_up(ill, ipif, mp, q));
		}

		/*
		 * ipif_resolver_up may end up sending an
		 * AR_INTERFACE_UP message to ARP, which would, in
		 * turn send a DLPI message to the driver. ioctls are
		 * serialized and so we cannot send more than one
		 * interface up message at a time. If ipif_resolver_up
		 * does send an interface up message to ARP, we get
		 * EINPROGRESS and we will complete in ip_arp_done.
		 */

		ASSERT(connp != NULL);
		ASSERT(ipsq->ipsq_pending_mp == NULL);
		mutex_enter(&connp->conn_lock);
		mutex_enter(&ill->ill_lock);
		success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		if (!success)
			return (EINTR);

		/*
		 * Crank up IPv6 neighbor discovery.
		 * Unlike ARP, this should complete when
		 * ipif_ndp_up returns. However, for
		 * ILLF_XRESOLV interfaces we also send an
		 * AR_INTERFACE_UP to the external resolver.
		 * That ioctl will complete in ip_rput.
		 */
		if (isv6) {
			err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr,
			    B_FALSE);
			if (err != 0) {
				if (err != EINPROGRESS)
					mp = ipsq_pending_mp_get(ipsq, &connp);
				return (err);
			}
		}
		/* Now, ARP */
		err = ipif_resolver_up(ipif, Res_act_initial);
		if (err == EINPROGRESS) {
			/* We will complete it in ip_arp_done */
			return (err);
		}
		mp = ipsq_pending_mp_get(ipsq, &connp);
		ASSERT(mp != NULL);
		if (err != 0)
			return (err);
	} else {
		/*
		 * Interfaces without underlying hardware don't do duplicate
		 * address detection.
		 */
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_addr_ready = 1;
	}
	return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
}

/*
 * Perform a bind for the physical device.
 * When the routine returns EINPROGRESS then mp has been consumed and
 * the ioctl will be acked from ip_rput_dlpi.
 * Allocate an unbind message and save it until ipif_down.
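 *
 * A rough sketch of the exchange this kicks off (for orientation only;
 * the authoritative handling is in ip_rput_dlpi):
 *
 *	IP                              driver
 *	DL_BIND_REQ(sap, DL_CLDLS)  ->
 *	                            <-  DL_BIND_ACK or DL_ERROR_ACK
 *
 * The DL_UNBIND_REQ allocated here is saved in ill_unbind_mp and is
 * replayed later by ill_dl_down().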
 */
static int
ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	mblk_t	*areq_mp = NULL;
	mblk_t	*bind_mp = NULL;
	mblk_t	*unbind_mp = NULL;
	conn_t	*connp;
	boolean_t success;

	ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
	ASSERT(IAM_WRITER_ILL(ill));

	ASSERT(mp != NULL);

	/* Create a resolver cookie for ARP */
	if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) {
		areq_t		*areq;
		uint16_t	sap_addr;

		areq_mp = ill_arp_alloc(ill,
		    (uchar_t *)&ip_areq_template, 0);
		if (areq_mp == NULL) {
			return (ENOMEM);
		}
		freemsg(ill->ill_resolver_mp);
		ill->ill_resolver_mp = areq_mp;
		areq = (areq_t *)areq_mp->b_rptr;
		sap_addr = ill->ill_sap;
		bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr));
		/*
		 * Wait till we call ill_pending_mp_add to determine
		 * the success before we free the ill_resolver_mp and
		 * attach areq_mp in its place.
		 */
	}
	bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
	    DL_BIND_REQ);
	if (bind_mp == NULL)
		goto bad;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;

	unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
	if (unbind_mp == NULL)
		goto bad;

	/*
	 * Record state needed to complete this operation when the
	 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
	 */
	if (WR(q)->q_next == NULL) {
		connp = Q_TO_CONN(q);
		mutex_enter(&connp->conn_lock);
	} else {
		connp = NULL;
	}
	mutex_enter(&ipif->ipif_ill->ill_lock);
	success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
	mutex_exit(&ipif->ipif_ill->ill_lock);
	if (connp != NULL)
		mutex_exit(&connp->conn_lock);
	if (!success)
		goto bad;

	/*
	 * Save the unbind message for ill_dl_down(); it will be consumed when
	 * the interface goes down.
	 */
	ASSERT(ill->ill_unbind_mp == NULL);
	ill->ill_unbind_mp = unbind_mp;

	ill_dlpi_send(ill, bind_mp);
	/* Send down link-layer capabilities probe if not already done. */
	ill_capability_probe(ill);

	/*
	 * Sysid used to rely on the fact that netboots set domainname
	 * and the like. Now that miniroot boots aren't strictly netboots
	 * and miniroot network configuration is driven from userland
	 * these things still need to be set. This situation can be detected
	 * by comparing the interface being configured here to the one
	 * dhcack was set to reference by the boot loader. Once sysid is
	 * converted to use dhcp_ipc_getinfo() this call can go away.
	 */
	if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) &&
	    (strcmp(ill->ill_name, dhcack) == 0) &&
	    (strlen(srpc_domain) == 0)) {
		if (dhcpinit() != 0)
			cmn_err(CE_WARN, "no cached dhcp response");
	}

	/*
	 * This operation will complete in ip_rput_dlpi with either
	 * a DL_BIND_ACK or DL_ERROR_ACK.
	 */
	return (EINPROGRESS);
bad:
	ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
	/*
	 * We don't have to check for possible removal from illgrp
	 * as we have not yet inserted in illgrp.
	 * For groups without names, this ipif is still not UP and hence
	 * this could not have possibly had any influence in forming
	 * groups.
	 */

	if (bind_mp != NULL)
		freemsg(bind_mp);
	if (unbind_mp != NULL)
		freemsg(unbind_mp);
	return (ENOMEM);
}

uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;

/*
 * DLPI and ARP are up.
 * Create all the IREs associated with an interface and bring up multicast.
 * Set the interface flag and finish other initialization
 * that potentially had to be deferred to after DL_BIND_ACK.
 */
int
ipif_up_done(ipif_t *ipif)
{
	ire_t	*ire_array[20];
	ire_t	**irep = ire_array;
	ire_t	**irep1;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_mask, route_mask;
	ill_t	*ill = ipif->ipif_ill;
	queue_t	*stq;
	ipif_t	*src_ipif;
	ipif_t	*tmp_ipif;
	boolean_t flush_ire_cache = B_TRUE;
	int	err = 0;
	phyint_t *phyi;
	ire_t	**ipif_saved_irep = NULL;
	int	ipif_saved_ire_cnt;
	int	cnt;
	boolean_t src_ipif_held = B_FALSE;
	boolean_t ire_added = B_FALSE;
	boolean_t loopback = B_FALSE;

	ip1dbg(("ipif_up_done(%s:%u)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id));
	/* Check if this is a loopback interface */
	if (ipif->ipif_ill->ill_wq == NULL)
		loopback = B_TRUE;

	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	/*
	 * If all other interfaces for this ill are down or DEPRECATED,
	 * or otherwise unsuitable for source address selection, remove
	 * any IRE_CACHE entries for this ill to make sure source
	 * address selection gets to take this new ipif into account.
	 * No need to hold ill_lock while traversing the ipif list since
	 * we are the writer.
	 */
	for (tmp_ipif = ill->ill_ipif; tmp_ipif;
	    tmp_ipif = tmp_ipif->ipif_next) {
		if (((tmp_ipif->ipif_flags &
		    (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
		    !(tmp_ipif->ipif_flags & IPIF_UP)) ||
		    (tmp_ipif == ipif))
			continue;
		/* first usable pre-existing interface */
		flush_ire_cache = B_FALSE;
		break;
	}
	if (flush_ire_cache)
		ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
		    IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);

	/*
	 * Figure out which way the send-to queue should go. Only
	 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK
	 * should show up here.
	 */
	switch (ill->ill_net_type) {
	case IRE_IF_RESOLVER:
		stq = ill->ill_rq;
		break;
	case IRE_IF_NORESOLVER:
	case IRE_LOOPBACK:
		stq = ill->ill_wq;
		break;
	default:
		return (EINVAL);
	}

	if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) {
		/*
		 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
		 * ipif_lookup_on_name(), but in the case of zones we can have
		 * several loopback addresses on lo0. So all the interfaces
		 * with loopback addresses need to be marked IRE_LOOPBACK.
		 */
		if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
		    htonl(INADDR_LOOPBACK))
			ipif->ipif_ire_type = IRE_LOOPBACK;
		else
			ipif->ipif_ire_type = IRE_LOCAL;
	}

	if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
		/*
		 * Can't use our source address.
		 * Select a different source address for the
		 * IRE_INTERFACE and IRE_LOCAL.
		 */
		src_ipif = ipif_select_source(ipif->ipif_ill,
		    ipif->ipif_subnet, ipif->ipif_zoneid);
		if (src_ipif == NULL)
			src_ipif = ipif;	/* Last resort */
		else
			src_ipif_held = B_TRUE;
	} else {
		src_ipif = ipif;
	}

	/* Create all the IREs associated with this interface */
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {

		/*
		 * If we're on a labeled system then make sure that zone-
		 * private addresses have proper remote host database entries.
		 */
		if (is_system_labeled() &&
		    ipif->ipif_ire_type != IRE_LOOPBACK &&
		    !tsol_check_interface_address(ipif))
			return (EINVAL);

		/* Register the source address for __sin6_src_id */
		err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid);
		if (err != 0) {
			ip0dbg(("ipif_up_done: srcid_insert %d\n", err));
			return (err);
		}

		/* If the interface address is set, create the local IRE. */
		ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n",
		    (void *)ipif,
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr)));
		*irep++ = ire_create(
		    (uchar_t *)&ipif->ipif_lcl_addr,	/* dest address */
		    (uchar_t *)&ip_g_all_ones,		/* mask */
		    (uchar_t *)&src_ipif->ipif_src_addr, /* source address */
		    NULL,				/* no gateway */
		    NULL,
		    &ip_loopback_mtuplus,		/* max frag size */
		    NULL,
		    ipif->ipif_rq,			/* recv-from queue */
		    NULL,				/* no send-to queue */
		    ipif->ipif_ire_type,		/* LOCAL or LOOPBACK */
		    NULL,
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    (ipif->ipif_flags & IPIF_PRIVATE) ?
		    RTF_PRIVATE : 0,
		    &ire_uinfo_null,
		    NULL,
		    NULL);
	} else {
		ip1dbg((
		    "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n",
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr),
		    (uint_t)ipif->ipif_flags));
	}
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET);	/* fallback */
	}

	subnet_mask = ipif->ipif_net_mask;

	/*
	 * If mask was not specified, use natural netmask of
	 * interface address. Also, store this mask back into the
	 * ipif struct.
	 */
	if (subnet_mask == 0) {
		subnet_mask = net_mask;
		V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}

	/*
	 * Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate.
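	 *
	 * For example (illustration only): a point-to-point ipif gets a
	 * host route to the peer (route_mask == IP_HOST_MASK, i.e.
	 * 255.255.255.255), while a broadcast-capable ipif with subnet
	 * 10.0.5.0/255.255.255.0 gets an interface route covering that
	 * whole prefix, using the send-to queue selected above.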
	 */
	if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
	    ipif->ipif_subnet != INADDR_ANY) {
		/* ipif_subnet is ipif_pp_dst_addr for pt-pt */

		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			route_mask = IP_HOST_MASK;
		} else {
			route_mask = subnet_mask;
		}

		ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p "
		    "creating if IRE ill_net_type 0x%x for 0x%x\n",
		    (void *)ipif, (void *)ill,
		    ill->ill_net_type,
		    ntohl(ipif->ipif_subnet)));
		*irep++ = ire_create(
		    (uchar_t *)&ipif->ipif_subnet,	/* dest address */
		    (uchar_t *)&route_mask,		/* mask */
		    (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
		    NULL,				/* no gateway */
		    NULL,
		    &ipif->ipif_mtu,			/* max frag */
		    NULL,
		    NULL,				/* no recv queue */
		    stq,				/* send-to queue */
		    ill->ill_net_type,			/* IF_[NO]RESOLVER */
		    ill->ill_resolver_mp,		/* xmit header */
		    ipif,
		    NULL,
		    0,
		    0,
		    0,
		    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
		    &ire_uinfo_null,
		    NULL,
		    NULL);
	}

	/*
	 * If the interface address is set, create the broadcast IREs.
	 *
	 * ire_create_bcast checks if the proposed new IRE matches
	 * any existing IRE's with the same physical interface (ILL).
	 * This should get rid of duplicates.
	 * ire_create_bcast also checks IPIF_NOXMIT and in that case
	 * does not create any broadcast ires.
	 */
	if ((ipif->ipif_subnet != INADDR_ANY) &&
	    (ipif->ipif_flags & IPIF_BROADCAST)) {
		ipaddr_t addr;

		ip1dbg(("ipif_up_done: creating broadcast IRE\n"));
		irep = ire_check_and_create_bcast(ipif, 0, irep,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep,
		    (MATCH_IRE_TYPE | MATCH_IRE_ILL));

		/*
		 * For backward compatibility, we need to create net
		 * broadcast ire's based on the old "IP address class
		 * system." The reason is that some old machines only
		 * respond to these class-derived net broadcasts.
		 *
		 * But we should not create these net broadcast ire's if
		 * the subnet_mask is shorter than the IP address class based
		 * derived netmask. Otherwise, we may create a net
		 * broadcast address which is the same as an IP address
		 * on the subnet. Then TCP will refuse to talk to that
		 * address.
		 *
		 * Nor do we need IRE_BROADCAST ire's for the interface
		 * with the netmask as 0xFFFFFFFF, as the IRE_LOCAL for that
		 * interface is already created. Creating these broadcast
		 * ire's will only create confusion as the "addr" is going
		 * to be the same as that of the IP address of the interface.
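		 *
		 * Worked example (illustration only): for 192.168.1.10
		 * with subnet_mask 255.255.255.192 (/26), the class-C
		 * net_mask 255.255.255.0 is shorter, so the class-derived
		 * net broadcasts 192.168.1.0 and 192.168.1.255 are created
		 * below in addition to the subnet forms 192.168.1.0 and
		 * 192.168.1.63. Were subnet_mask 255.255.240.0 (/20), the
		 * net_mask would no longer be shorter and the class-derived
		 * forms would be skipped.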
		 */
		if (net_mask < subnet_mask) {
			addr = net_mask & ipif->ipif_subnet;
			irep = ire_check_and_create_bcast(ipif, addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
			irep = ire_check_and_create_bcast(ipif,
			    ~net_mask | addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		}

		if (subnet_mask != 0xFFFFFFFF) {
			addr = ipif->ipif_subnet;
			irep = ire_check_and_create_bcast(ipif, addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
			irep = ire_check_and_create_bcast(ipif,
			    ~subnet_mask | addr, irep,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL));
		}
	}

	ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	/* If an earlier ire_create failed, get out now */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 == NULL) {
			ip1dbg(("ipif_up_done: NULL ire found in "
			    "ire_array\n"));
			err = ENOMEM;
			goto bad;
		}
	}

	/*
	 * Need to atomically check for ip_addr_availability_check
	 * under ip_addr_avail_lock, and if it fails go to bad and also
	 * remove from the group. The ill_g_lock is grabbed as reader
	 * just to make sure no new ills or new ipifs are being added
	 * to the system while we are checking the uniqueness of addresses.
	 */
	rw_enter(&ill_g_lock, RW_READER);
	mutex_enter(&ip_addr_avail_lock);
	/* Mark it up, and increment counters. */
	ill->ill_ipif_up_count++;
	ipif->ipif_flags |= IPIF_UP;
	err = ip_addr_availability_check(ipif);
	mutex_exit(&ip_addr_avail_lock);
	rw_exit(&ill_g_lock);

	if (err != 0) {
		/*
		 * Our address may already be up on the same ill. In this case,
		 * the ARP entry for our ipif replaced the one for the other
		 * ipif. So we don't want to delete it (otherwise the other
		 * ipif would be unable to send packets).
		 * ip_addr_availability_check() identifies this case for us and
		 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
		 * which is the expected error code.
		 */
		if (err == EADDRINUSE) {
			freemsg(ipif->ipif_arp_del_mp);
			ipif->ipif_arp_del_mp = NULL;
			err = EADDRNOTAVAIL;
		}
		ill->ill_ipif_up_count--;
		ipif->ipif_flags &= ~IPIF_UP;
		goto bad;
	}

	/*
	 * Add in all newly created IREs. ire_create_bcast() has
	 * already checked for duplicates of the IRE_BROADCAST type.
	 * We want to add before we call illgrp_insert which wants
	 * to know whether IRE_IF_RESOLVER exists or not.
	 *
	 * NOTE : We refrele the ire though we may branch to "bad"
	 *	later on where we do ire_delete. This is okay
	 *	because nobody can delete it as we are running
	 *	exclusively.
	 */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock)));
		/*
		 * Refheld by ire_add; refrele'd towards the end of the
		 * function.
		 */
		(void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
	}
	ire_added = B_TRUE;
	/*
	 * Form groups if possible.
	 *
	 * If we are supposed to be in a ill_group with a name, insert it
	 * now as we know that at least one ipif is UP. Otherwise form
	 * nameless groups.
	 *
	 * If ip_enable_group_ifs is set and ipif address is not 0, insert
	 * this ipif into the appropriate interface group, or create a
	 * new one.
	 * If this is already in a nameless group, we try to form
	 * a bigger group looking at other ills potentially sharing this
	 * ipif's prefix.
	 */
	phyi = ill->ill_phyint;
	if (phyi->phyint_groupname_len != 0) {
		ASSERT(phyi->phyint_groupname != NULL);
		if (ill->ill_ipif_up_count == 1) {
			ASSERT(ill->ill_group == NULL);
			err = illgrp_insert(&illgrp_head_v4, ill,
			    phyi->phyint_groupname, NULL, B_TRUE);
			if (err != 0) {
				ip1dbg(("ipif_up_done: illgrp allocation "
				    "failed, error %d\n", err));
				goto bad;
			}
		}
		ASSERT(ill->ill_group != NULL);
	}

	/*
	 * When this is part of a group, we need to make sure that
	 * any broadcast ires created because of this ipif coming
	 * UP get marked/cleared with IRE_MARK_NORECV appropriately
	 * so that we don't receive duplicate broadcast packets.
	 */
	if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0)
		ipif_renominate_bcast(ipif);

	/* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
	ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
	ipif_saved_irep = ipif_recover_ire(ipif);

	if (!loopback) {
		/*
		 * If the broadcast address has been set, make sure it makes
		 * sense based on the interface address.
		 * Only match on ill since we are sharing broadcast addresses.
		 */
		if ((ipif->ipif_brd_addr != INADDR_ANY) &&
		    (ipif->ipif_flags & IPIF_BROADCAST)) {
			ire_t	*ire;

			ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0,
			    IRE_BROADCAST, ipif, ALL_ZONES,
			    NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL));

			if (ire == NULL) {
				/*
				 * If there isn't a matching broadcast IRE,
				 * revert to the default for this netmask.
				 */
				ipif->ipif_v6brd_addr = ipv6_all_zeros;
				mutex_enter(&ipif->ipif_ill->ill_lock);
				ipif_set_default(ipif);
				mutex_exit(&ipif->ipif_ill->ill_lock);
			} else {
				ire_refrele(ire);
			}
		}

	}

	/* This is the first interface on this ill */
	if (ill->ill_ipif_up_count == 1 && !loopback) {
		/*
		 * Need to recover all multicast memberships in the driver.
		 * This had to be deferred until we had attached.
		 */
		ill_recover_multicast(ill);
	}
	/* Join the allhosts multicast address */
	ipif_multicast_up(ipif);

	if (!loopback) {
		/*
		 * See whether anybody else would benefit from the
		 * new ipif that we added. We always call this, rather
		 * than only when adding a non-IPIF_NOLOCAL/DEPRECATED/
		 * ANYCAST ipif, for the benefit of illgrp_insert (done
		 * above), which does not do source address selection
		 * itself as it does not want to re-create the interface
		 * routes that we are holding references to here.
		 */
		ill_update_source_selection(ill);
	}

	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 != NULL) {
			/* was held in ire_add */
			ire_refrele(*irep1);
		}
	}

	cnt = ipif_saved_ire_cnt;
	for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
		if (*irep1 != NULL) {
			/* was held in ire_add */
			ire_refrele(*irep1);
		}
	}

	if (!loopback && ipif->ipif_addr_ready) {
		/* Broadcast an address mask reply. */
		ipif_mask_reply(ipif);
	}
	if (ipif_saved_irep != NULL) {
		kmem_free(ipif_saved_irep,
		    ipif_saved_ire_cnt * sizeof (ire_t *));
	}
	if (src_ipif_held)
		ipif_refrele(src_ipif);

	/*
	 * This had to be deferred until we had bound. Tell routing sockets and
	 * others that this interface is up if it looks like the address has
	 * been validated. Otherwise, if it isn't ready yet, wait for
	 * duplicate address detection to do its thing.
	 */
	if (ipif->ipif_addr_ready) {
		ip_rts_ifmsg(ipif);
		ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
		/* Let SCTP update the status for this ipif */
		sctp_update_ipif(ipif, SCTP_IPIF_UP);
	}
	return (0);

bad:
	ip1dbg(("ipif_up_done: FAILED\n"));
	/*
	 * We don't have to bother removing from ill groups because
	 *
	 * 1) For groups with names, we insert only when the first ipif
	 *    comes up. In that case if it fails, it will not be in any
	 *    group. So, we need not try to remove for that case.
	 *
	 * 2) For groups without names, either we tried to insert ipif_ill
	 *    in a group as singleton or found some other group to become
	 *    a bigger group. For the former, if it fails we don't have
	 *    anything to do as ipif_ill is not in the group and for the
	 *    latter, there are no failures in illgrp_insert/illgrp_delete
	 *    (ENOMEM can't occur for this. Check illgrp_insert).
	 */
	while (irep > ire_array) {
		irep--;
		if (*irep != NULL) {
			ire_delete(*irep);
			if (ire_added)
				ire_refrele(*irep);
		}
	}
	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid);

	if (ipif_saved_irep != NULL) {
		kmem_free(ipif_saved_irep,
		    ipif_saved_ire_cnt * sizeof (ire_t *));
	}
	if (src_ipif_held)
		ipif_refrele(src_ipif);

	ipif_arp_down(ipif);
	return (err);
}

/*
 * Turn off ARP, as indicated by the ILLF_NOARP flag.
 */
static int
ill_arp_off(ill_t *ill)
{
	mblk_t	*arp_off_mp = NULL;
	mblk_t	*arp_on_mp = NULL;

	ip1dbg(("ill_arp_off(%s)\n", ill->ill_name));

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);

	/*
	 * If the on message is still around we've already done
	 * an arp_off without doing an arp_on, thus there is no
	 * work needed.
	 */
	if (ill->ill_arp_on_mp != NULL)
		return (0);

	/*
	 * Allocate an ARP on message (to be saved) and an ARP off message.
	 */
	arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0);
	if (!arp_off_mp)
		return (ENOMEM);

	arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0);
	if (!arp_on_mp)
		goto failed;

	ASSERT(ill->ill_arp_on_mp == NULL);
	ill->ill_arp_on_mp = arp_on_mp;

	/* Send an AR_INTERFACE_OFF request */
	putnext(ill->ill_rq, arp_off_mp);
	return (0);
failed:

	if (arp_off_mp)
		freemsg(arp_off_mp);
	return (ENOMEM);
}

/*
 * Turn on ARP by turning off the ILLF_NOARP flag.
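 *
 * Sketch of the off/on pairing (descriptive only): ill_arp_off above
 * allocates and saves an "ARP on" mblk in ill_arp_on_mp while sending the
 * "off" request downstream; ill_arp_on simply replays that saved mblk.
 * Hence a second arp_off without an intervening arp_on is a no-op, as the
 * ill_arp_on_mp check in ill_arp_off shows.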
19918 */ 19919 static int 19920 ill_arp_on(ill_t *ill) 19921 { 19922 mblk_t *mp; 19923 19924 ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); 19925 19926 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 19927 19928 ASSERT(IAM_WRITER_ILL(ill)); 19929 /* 19930 * Send an AR_INTERFACE_ON request if we have already done 19931 * an arp_off (which allocated the message). 19932 */ 19933 if (ill->ill_arp_on_mp != NULL) { 19934 mp = ill->ill_arp_on_mp; 19935 ill->ill_arp_on_mp = NULL; 19936 putnext(ill->ill_rq, mp); 19937 } 19938 return (0); 19939 } 19940 19941 /* 19942 * Called after either deleting ill from the group or when setting 19943 * FAILED or STANDBY on the interface. 19944 */ 19945 static void 19946 illgrp_reset_schednext(ill_t *ill) 19947 { 19948 ill_group_t *illgrp; 19949 ill_t *save_ill; 19950 19951 ASSERT(IAM_WRITER_ILL(ill)); 19952 /* 19953 * When called from illgrp_delete, ill_group will be non-NULL. 19954 * But when called from ip_sioctl_flags, it could be NULL if 19955 * somebody is setting FAILED/INACTIVE on some interface which 19956 * is not part of a group. 19957 */ 19958 illgrp = ill->ill_group; 19959 if (illgrp == NULL) 19960 return; 19961 if (illgrp->illgrp_ill_schednext != ill) 19962 return; 19963 19964 illgrp->illgrp_ill_schednext = NULL; 19965 save_ill = ill; 19966 /* 19967 * Choose a good ill to be the next one for 19968 * outbound traffic. As the flags FAILED/STANDBY is 19969 * not yet marked when called from ip_sioctl_flags, 19970 * we check for ill separately. 19971 */ 19972 for (ill = illgrp->illgrp_ill; ill != NULL; 19973 ill = ill->ill_group_next) { 19974 if ((ill != save_ill) && 19975 !(ill->ill_phyint->phyint_flags & 19976 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { 19977 illgrp->illgrp_ill_schednext = ill; 19978 return; 19979 } 19980 } 19981 } 19982 19983 /* 19984 * Given an ill, find the next ill in the group to be scheduled. 19985 * (This should be called by ip_newroute() before ire_create().) 19986 * The passed in ill may be pulled out of the group, after we have picked 19987 * up a different outgoing ill from the same group. However ire add will 19988 * atomically check this. 19989 */ 19990 ill_t * 19991 illgrp_scheduler(ill_t *ill) 19992 { 19993 ill_t *retill; 19994 ill_group_t *illgrp; 19995 int illcnt; 19996 int i; 19997 uint64_t flags; 19998 19999 /* 20000 * We don't use a lock to check for the ill_group. If this ill 20001 * is currently being inserted we may end up just returning this 20002 * ill itself. That is ok. 20003 */ 20004 if (ill->ill_group == NULL) { 20005 ill_refhold(ill); 20006 return (ill); 20007 } 20008 20009 /* 20010 * Grab the ill_g_lock as reader to make sure we are dealing with 20011 * a set of stable ills. No ill can be added or deleted or change 20012 * group while we hold the reader lock. 20013 */ 20014 rw_enter(&ill_g_lock, RW_READER); 20015 if ((illgrp = ill->ill_group) == NULL) { 20016 rw_exit(&ill_g_lock); 20017 ill_refhold(ill); 20018 return (ill); 20019 } 20020 20021 illcnt = illgrp->illgrp_ill_count; 20022 mutex_enter(&illgrp->illgrp_lock); 20023 retill = illgrp->illgrp_ill_schednext; 20024 20025 if (retill == NULL) 20026 retill = illgrp->illgrp_ill; 20027 20028 /* 20029 * We do a circular search beginning at illgrp_ill_schednext 20030 * or illgrp_ill. We don't check the flags against the ill lock 20031 * since it can change anytime. The ire creation will be atomic 20032 * and will fail if the ill is FAILED or OFFLINE. 
	 */
	for (i = 0; i < illcnt; i++) {
		flags = retill->ill_phyint->phyint_flags;

		if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) &&
		    ILL_CAN_LOOKUP(retill)) {
			illgrp->illgrp_ill_schednext = retill->ill_group_next;
			ill_refhold(retill);
			break;
		}
		retill = retill->ill_group_next;
		if (retill == NULL)
			retill = illgrp->illgrp_ill;
	}
	mutex_exit(&illgrp->illgrp_lock);
	rw_exit(&ill_g_lock);

	return (i == illcnt ? NULL : retill);
}

/*
 * Checks for availability of a usable source address (if there is one) when
 * the destination ILL has the ill_usesrc_ifindex pointing to another ILL.
 * Note this selection is done regardless of the destination.
 */
boolean_t
ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
{
	uint_t	ifindex;
	ipif_t	*ipif = NULL;
	ill_t	*uill;
	boolean_t isv6;

	ASSERT(ill != NULL);

	isv6 = ill->ill_isv6;
	ifindex = ill->ill_usesrc_ifindex;
	if (ifindex != 0) {
		uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL,
		    NULL);
		if (uill == NULL)
			return (B_FALSE);
		mutex_enter(&uill->ill_lock);
		for (ipif = uill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (!IPIF_CAN_LOOKUP(ipif))
				continue;
			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
				continue;
			if (!(ipif->ipif_flags & IPIF_UP))
				continue;
			if (ipif->ipif_zoneid != zoneid)
				continue;
			if ((isv6 &&
			    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) ||
			    (ipif->ipif_lcl_addr == INADDR_ANY))
				continue;
			mutex_exit(&uill->ill_lock);
			ill_refrele(uill);
			return (B_TRUE);
		}
		mutex_exit(&uill->ill_lock);
		ill_refrele(uill);
	}
	return (B_FALSE);
}

/*
 * Determine the best source address given a destination address and an ill.
 * Prefers non-deprecated over deprecated but will return a deprecated
 * address if there is no other choice. If there is a usable source address
 * on the interface pointed to by ill_usesrc_ifindex then that is given
 * first preference.
 *
 * Returns NULL if there is no suitable source address for the ill.
 * This only occurs when there is no valid source address for the ill.
 */
ipif_t *
ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
{
	ipif_t	*ipif;
	ipif_t	*ipif_dep = NULL;	/* Fallback to deprecated */
	ipif_t	*ipif_arr[MAX_IPIF_SELECT_SOURCE];
	int	index = 0;
	boolean_t wrapped = B_FALSE;
	boolean_t same_subnet_only = B_FALSE;
	boolean_t ipif_same_found, ipif_other_found;
	boolean_t specific_found;
	ill_t	*till, *usill = NULL;
	tsol_tpc_t *src_rhtp, *dst_rhtp;

	if (ill->ill_usesrc_ifindex != 0) {
		usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, B_FALSE,
		    NULL, NULL, NULL, NULL);
		if (usill != NULL)
			ill = usill;	/* Select source from usesrc ILL */
		else
			return (NULL);
	}

	/*
	 * If we're dealing with an unlabeled destination on a labeled system,
	 * make sure that we ignore source addresses that are incompatible with
	 * the destination's default label. That destination's default label
	 * must dominate the minimum label on the source address.
	 */
	dst_rhtp = NULL;
	if (is_system_labeled()) {
		dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
		if (dst_rhtp == NULL)
			return (NULL);
		if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
			TPC_RELE(dst_rhtp);
			dst_rhtp = NULL;
		}
	}

	/*
	 * Holds the ill_g_lock as reader. This makes sure that no ipif/ill
	 * can be deleted. But an ipif/ill can get CONDEMNED any time.
	 * After selecting the right ipif, under ill_lock make sure ipif is
	 * not condemned, and increment refcnt. If ipif is CONDEMNED,
	 * we retry. Inside the loop we still need to check for CONDEMNED,
	 * but not under a lock.
	 */
	rw_enter(&ill_g_lock, RW_READER);

retry:
	till = ill;
	ipif_arr[0] = NULL;

	if (till->ill_group != NULL)
		till = till->ill_group->illgrp_ill;

	/*
	 * Choose one good source address from each ill across the group.
	 * If possible choose a source address in the same subnet as
	 * the destination address.
	 *
	 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE.
	 * This is okay because of the following.
	 *
	 *    If PHYI_FAILED is set and we still have non-deprecated
	 *    addresses, it means the addresses have not yet been
	 *    failed over to a different interface. We potentially
	 *    select them to create IRE_CACHES, which will be later
	 *    flushed when the addresses move over.
	 *
	 *    If PHYI_INACTIVE is set and we still have non-deprecated
	 *    addresses, it means either the user has configured them
	 *    or PHYI_INACTIVE has not been cleared after the addresses
	 *    have been moved over. For the former, in.mpathd does a failover
	 *    when the interface becomes INACTIVE and hence we should
	 *    not find them. Once INACTIVE is set, we don't allow them
	 *    to create logical interfaces anymore. For the latter, a
	 *    flush will happen when INACTIVE is cleared which will
	 *    flush the IRE_CACHES.
	 *
	 *    If PHYI_OFFLINE is set, all the addresses will be failed
	 *    over soon. We potentially select them to create IRE_CACHEs,
	 *    which will be later flushed when the addresses move over.
	 *
	 * NOTE : As ipif_select_source is called to borrow a source address
	 * for an ipif that is part of a group, source address selection
	 * will be re-done whenever the group changes, i.e. on either an
	 * insertion into or a deletion from the group.
	 *
	 * Fill ipif_arr[] with source addresses, using these rules:
	 *
	 *	1. At most one source address from a given ill ends up
	 *	   in ipif_arr[] -- that is, at most one of the ipif's
	 *	   associated with a given ill ends up in ipif_arr[].
	 *
	 *	2. If there is at least one non-deprecated ipif in the
	 *	   IPMP group with a source address on the same subnet as
	 *	   our destination, then fill ipif_arr[] only with
	 *	   source addresses on the same subnet as our destination.
	 *	   Note that because of (1), only the first
	 *	   non-deprecated ipif found with a source address
	 *	   matching the destination ends up in ipif_arr[].
	 *
	 *	3. Otherwise, fill ipif_arr[] with non-deprecated source
	 *	   addresses not in the same subnet as our destination.
	 *	   Again, because of (1), only the first off-subnet source
	 *	   address will be chosen.
	 *
	 *	4. If there are no non-deprecated ipifs, then just use
	 *	   the source address associated with the last deprecated
	 *	   one we find that happens to be on the same subnet,
	 *	   otherwise the first one not in the same subnet.
	 */
	specific_found = B_FALSE;
	for (; till != NULL; till = till->ill_group_next) {
		ipif_same_found = B_FALSE;
		ipif_other_found = B_FALSE;
		for (ipif = till->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (!IPIF_CAN_LOOKUP(ipif))
				continue;
			/* Always skip NOLOCAL and ANYCAST interfaces */
			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
				continue;
			if (!(ipif->ipif_flags & IPIF_UP) ||
			    !ipif->ipif_addr_ready)
				continue;
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			/*
			 * Interfaces with 0.0.0.0 address are allowed to be
			 * UP, but are not valid as source addresses.
			 */
			if (ipif->ipif_lcl_addr == INADDR_ANY)
				continue;

			/*
			 * Check compatibility of local address for
			 * destination's default label if we're on a labeled
			 * system. Incompatible addresses can't be used at
			 * all.
			 */
			if (dst_rhtp != NULL) {
				boolean_t incompat;

				src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
				    IPV4_VERSION, B_FALSE);
				if (src_rhtp == NULL)
					continue;
				incompat =
				    src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
				    src_rhtp->tpc_tp.tp_doi !=
				    dst_rhtp->tpc_tp.tp_doi ||
				    (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
				    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
				    !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
				    src_rhtp->tpc_tp.tp_sl_set_cipso));
				TPC_RELE(src_rhtp);
				if (incompat)
					continue;
			}

			/*
			 * We prefer not to use all-zones addresses, if we
			 * can avoid it, as they pose problems with unlabeled
			 * destinations.
20278 */ 20279 if (ipif->ipif_zoneid != ALL_ZONES) { 20280 if (!specific_found && 20281 (!same_subnet_only || 20282 (ipif->ipif_net_mask & dst) == 20283 ipif->ipif_subnet)) { 20284 index = 0; 20285 specific_found = B_TRUE; 20286 ipif_other_found = B_FALSE; 20287 } 20288 } else { 20289 if (specific_found) 20290 continue; 20291 } 20292 if (ipif->ipif_flags & IPIF_DEPRECATED) { 20293 if (ipif_dep == NULL || 20294 (ipif->ipif_net_mask & dst) == 20295 ipif->ipif_subnet) 20296 ipif_dep = ipif; 20297 continue; 20298 } 20299 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 20300 /* found a source address in the same subnet */ 20301 if (!same_subnet_only) { 20302 same_subnet_only = B_TRUE; 20303 index = 0; 20304 } 20305 ipif_same_found = B_TRUE; 20306 } else { 20307 if (same_subnet_only || ipif_other_found) 20308 continue; 20309 ipif_other_found = B_TRUE; 20310 } 20311 ipif_arr[index++] = ipif; 20312 if (index == MAX_IPIF_SELECT_SOURCE) { 20313 wrapped = B_TRUE; 20314 index = 0; 20315 } 20316 if (ipif_same_found) 20317 break; 20318 } 20319 } 20320 20321 if (ipif_arr[0] == NULL) { 20322 ipif = ipif_dep; 20323 } else { 20324 if (wrapped) 20325 index = MAX_IPIF_SELECT_SOURCE; 20326 ipif = ipif_arr[ipif_rand() % index]; 20327 ASSERT(ipif != NULL); 20328 } 20329 20330 if (ipif != NULL) { 20331 mutex_enter(&ipif->ipif_ill->ill_lock); 20332 if (!IPIF_CAN_LOOKUP(ipif)) { 20333 mutex_exit(&ipif->ipif_ill->ill_lock); 20334 goto retry; 20335 } 20336 ipif_refhold_locked(ipif); 20337 mutex_exit(&ipif->ipif_ill->ill_lock); 20338 } 20339 20340 rw_exit(&ill_g_lock); 20341 if (usill != NULL) 20342 ill_refrele(usill); 20343 if (dst_rhtp != NULL) 20344 TPC_RELE(dst_rhtp); 20345 20346 #ifdef DEBUG 20347 if (ipif == NULL) { 20348 char buf1[INET6_ADDRSTRLEN]; 20349 20350 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 20351 ill->ill_name, 20352 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 20353 } else { 20354 char buf1[INET6_ADDRSTRLEN]; 20355 char buf2[INET6_ADDRSTRLEN]; 20356 20357 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 20358 ipif->ipif_ill->ill_name, 20359 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 20360 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 20361 buf2, sizeof (buf2)))); 20362 } 20363 #endif /* DEBUG */ 20364 return (ipif); 20365 } 20366 20367 20368 /* 20369 * If old_ipif is not NULL, see if ipif was derived from old 20370 * ipif and if so, recreate the interface route by re-doing 20371 * source address selection. This happens when ipif_down -> 20372 * ipif_update_other_ipifs calls us. 20373 * 20374 * If old_ipif is NULL, just redo the source address selection 20375 * if needed. This happens when illgrp_insert or ipif_up_done 20376 * calls us. 20377 */ 20378 static void 20379 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 20380 { 20381 ire_t *ire; 20382 ire_t *ipif_ire; 20383 queue_t *stq; 20384 ipif_t *nipif; 20385 ill_t *ill; 20386 boolean_t need_rele = B_FALSE; 20387 20388 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 20389 ASSERT(IAM_WRITER_IPIF(ipif)); 20390 20391 ill = ipif->ipif_ill; 20392 if (!(ipif->ipif_flags & 20393 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 20394 /* 20395 * Can't possibly have borrowed the source 20396 * from old_ipif. 20397 */ 20398 return; 20399 } 20400 20401 /* 20402 * Is there any work to be done? No work if the address 20403 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 20404 * ipif_select_source() does not borrow addresses from 20405 * NOLOCAL and ANYCAST interfaces). 
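 *
 * For reference, the usual calling pattern for ipif_select_source()
 * (a sketch only, mirroring the call made further below; the hold
 * returned by a successful selection must be dropped with
 * ipif_refrele()):
 *
 *	ipif_t *src = ipif_select_source(ill, dst, zoneid);
 *	if (src != NULL) {
 *		... use src->ipif_src_addr ...
 *		ipif_refrele(src);
 *	}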
20406 */
20407 if ((old_ipif != NULL) &&
20408 ((old_ipif->ipif_lcl_addr == INADDR_ANY) ||
20409 (old_ipif->ipif_ill->ill_wq == NULL) ||
20410 (old_ipif->ipif_flags &
20411 (IPIF_NOLOCAL|IPIF_ANYCAST)))) {
20412 return;
20413 }
20414
20415 /*
20416 * Perform the same checks as when creating the
20417 * IRE_INTERFACE in ipif_up_done.
20418 */
20419 if (!(ipif->ipif_flags & IPIF_UP))
20420 return;
20421
20422 if ((ipif->ipif_flags & IPIF_NOXMIT) ||
20423 (ipif->ipif_subnet == INADDR_ANY))
20424 return;
20425
20426 ipif_ire = ipif_to_ire(ipif);
20427 if (ipif_ire == NULL)
20428 return;
20429
20430 /*
20431 * We know that ipif uses some other source for its
20432 * IRE_INTERFACE. Is it using the source of this
20433 * old_ipif?
20434 */
20435 if (old_ipif != NULL &&
20436 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) {
20437 ire_refrele(ipif_ire);
20438 return;
20439 }
20440 if (ip_debug > 2) {
20441 /* ip1dbg */
20442 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for"
20443 " src %s\n", AF_INET, &ipif_ire->ire_src_addr);
20444 }
20445
20446 stq = ipif_ire->ire_stq;
20447
20448 /*
20449 * Can't use our source address. Select a different
20450 * source address for the IRE_INTERFACE.
20451 */
20452 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid);
20453 if (nipif == NULL) {
20454 /* Last resort - all ipifs have IPIF_NOLOCAL */
20455 nipif = ipif;
20456 } else {
20457 need_rele = B_TRUE;
20458 }
20459
20460 ire = ire_create(
20461 (uchar_t *)&ipif->ipif_subnet, /* dest pref */
20462 (uchar_t *)&ipif->ipif_net_mask, /* mask */
20463 (uchar_t *)&nipif->ipif_src_addr, /* src addr */
20464 NULL, /* no gateway */
20465 NULL,
20466 &ipif->ipif_mtu, /* max frag */
20467 NULL, /* fast path header */
20468 NULL, /* no recv from queue */
20469 stq, /* send-to queue */
20470 ill->ill_net_type, /* IF_[NO]RESOLVER */
20471 ill->ill_resolver_mp, /* xmit header */
20472 ipif,
20473 NULL,
20474 0,
20475 0,
20476 0,
20477 0,
20478 &ire_uinfo_null,
20479 NULL,
20480 NULL);
20481
20482 if (ire != NULL) {
20483 ire_t *ret_ire;
20484 int error;
20485
20486 /*
20487 * We don't need ipif_ire anymore. We need to delete
20488 * before we add so that ire_add does not detect
20489 * duplicates.
20490 */
20491 ire_delete(ipif_ire);
20492 ret_ire = ire;
20493 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
20494 ASSERT(error == 0);
20495 ASSERT(ire == ret_ire);
20496 /* Held in ire_add */
20497 ire_refrele(ret_ire);
20498 }
20499 /*
20500 * Either we are falling through from above or could not
20501 * allocate a replacement.
20502 */
20503 ire_refrele(ipif_ire);
20504 if (need_rele)
20505 ipif_refrele(nipif);
20506 }
20507
20508 /*
20509 * This old_ipif is going away.
20510 *
20511 * Determine if any other ipifs are using our address (our
20512 * ipif_lcl_addr) as the source for their IRE_INTERFACE; this is
20513 * possible for ipifs that are IPIF_NOLOCAL, IPIF_ANYCAST, or
20514 * IPIF_DEPRECATED. Find the IRE_INTERFACE for such ipifs and recreate
20515 * them to use a different source address following the rules in
20516 * ipif_up_done.
20517 *
20518 * This function takes an illgrp as an argument so that illgrp_delete
20519 * can call this to update source address even after deleting the
20520 * old_ipif->ipif_ill from the ill group.
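 *
 * The per-ipif borrow test is the flags check at the top of
 * ipif_recreate_interface_routes() above (sketch):
 *
 *	if (ipif->ipif_flags &
 *	    (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) {
 *		... this ipif may have borrowed the source
 *		... address of its IRE_INTERFACE
 *	}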
20521 */
20522 static void
20523 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
20524 {
20525 ipif_t *ipif;
20526 ill_t *ill;
20527 char buf[INET6_ADDRSTRLEN];
20528
20529 ASSERT(IAM_WRITER_IPIF(old_ipif));
20530 ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));
20531
20532 ill = old_ipif->ipif_ill;
20533
20534 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
20535 ill->ill_name,
20536 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
20537 buf, sizeof (buf))));
20538 /*
20539 * If this is part of a group, look at all ills, as ipif_select_source
20540 * borrows a source address across all the ills in the group.
20541 */
20542 if (illgrp != NULL)
20543 ill = illgrp->illgrp_ill;
20544
20545 for (; ill != NULL; ill = ill->ill_group_next) {
20546 for (ipif = ill->ill_ipif; ipif != NULL;
20547 ipif = ipif->ipif_next) {
20548
20549 if (ipif == old_ipif)
20550 continue;
20551
20552 ipif_recreate_interface_routes(old_ipif, ipif);
20553 }
20554 }
20555 }
20556
20557 /* ARGSUSED */
20558 int
20559 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20560 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20561 {
20562 /*
20563 * ill_phyint_reinit merged the v4 and v6 into a single
20564 * ipsq. Could also have become part of an ipmp group in the
20565 * process, and we might not have been able to complete the
20566 * operation in ipif_set_values, if we could not become
20567 * exclusive. If so, restart it here.
20568 */
20569 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
20570 }
20571
20572
20573 /* ARGSUSED */
20574 int
20575 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20576 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20577 {
20578 queue_t *q1 = q;
20579 char *cp;
20580 char interf_name[LIFNAMSIZ];
20581 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
20582
20583 if (!q->q_next) {
20584 ip1dbg((
20585 "if_unitsel: IF_UNITSEL: no q_next\n"));
20586 return (EINVAL);
20587 }
20588
20589 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
20590 return (EALREADY);
20591
20592 do {
20593 q1 = q1->q_next;
20594 } while (q1->q_next);
20595 cp = q1->q_qinfo->qi_minfo->mi_idname;
20596 (void) sprintf(interf_name, "%s%d", cp, ppa);
20597
20598 /*
20599 * Here we are not going to delay the ioack until after
20600 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
20601 * original ioctl message before sending the requests.
20602 */
20603 return (ipif_set_values(q, mp, interf_name, &ppa));
20604 }
20605
20606 /* ARGSUSED */
20607 int
20608 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20609 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20610 {
20611 return (ENXIO);
20612 }
20613
20614 /*
20615 * Net and subnet broadcast ire's are now specific to the particular
20616 * physical interface (ill) and not to any one logical interface (ipif).
20617 * However, if a particular logical interface is being taken down, its
20618 * associated ire's will be taken down as well. Hence, when we go to
20619 * take down or change the local address, broadcast address or netmask
20620 * of a specific logical interface, we must check to make sure that we
20621 * have valid net and subnet broadcast ire's for the other logical
20622 * interfaces which may have been shared with the logical interface
20623 * being brought down or changed.
20624 *
20625 * There is one set of 0.0.0.0 and 255.255.255.255 per ill. Usually it
20626 * is tied to the first interface coming UP.
If that ipif is going down,
20627 * we need to recreate them on the next valid ipif.
20628 *
20629 * Note: assume that the ipif passed in is still up so that its IRE
20630 * entries are still valid.
20631 */
20632 static void
20633 ipif_check_bcast_ires(ipif_t *test_ipif)
20634 {
20635 ipif_t *ipif;
20636 ire_t *test_subnet_ire, *test_net_ire;
20637 ire_t *test_allzero_ire, *test_allone_ire;
20638 ire_t *ire_array[12];
20639 ire_t **irep = &ire_array[0];
20640 ire_t **irep1;
20641
20642 ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask;
20643 ipaddr_t test_net_addr, test_subnet_addr;
20644 ipaddr_t test_net_mask, test_subnet_mask;
20645 boolean_t need_net_bcast_ire = B_FALSE;
20646 boolean_t need_subnet_bcast_ire = B_FALSE;
20647 boolean_t allzero_bcast_ire_created = B_FALSE;
20648 boolean_t allone_bcast_ire_created = B_FALSE;
20649 boolean_t net_bcast_ire_created = B_FALSE;
20650 boolean_t subnet_bcast_ire_created = B_FALSE;
20651
20652 ipif_t *backup_ipif_net = (ipif_t *)NULL;
20653 ipif_t *backup_ipif_subnet = (ipif_t *)NULL;
20654 ipif_t *backup_ipif_allzeros = (ipif_t *)NULL;
20655 ipif_t *backup_ipif_allones = (ipif_t *)NULL;
20656 uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
20657
20658 ASSERT(!test_ipif->ipif_isv6);
20659 ASSERT(IAM_WRITER_IPIF(test_ipif));
20660
20661 /*
20662 * No broadcast IREs for the LOOPBACK interface
20663 * or others such as point to point and IPIF_NOXMIT.
20664 */
20665 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
20666 (test_ipif->ipif_flags & IPIF_NOXMIT))
20667 return;
20668
20669 test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST,
20670 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20671
20672 test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST,
20673 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20674
20675 test_net_mask = ip_net_mask(test_ipif->ipif_subnet);
20676 test_subnet_mask = test_ipif->ipif_net_mask;
20677
20678 /*
20679 * If no net mask set, assume the default based on net class.
20680 */
20681 if (test_subnet_mask == 0)
20682 test_subnet_mask = test_net_mask;
20683
20684 /*
20685 * Check if there is a network broadcast ire associated with this ipif
20686 */
20687 test_net_addr = test_net_mask & test_ipif->ipif_subnet;
20688 test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST,
20689 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20690
20691 /*
20692 * Check if there is a subnet broadcast IRE associated with this ipif
20693 */
20694 test_subnet_addr = test_subnet_mask & test_ipif->ipif_subnet;
20695 test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST,
20696 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20697
20698 /*
20699 * No broadcast ire's associated with this ipif.
20700 */
20701 if ((test_subnet_ire == NULL) && (test_net_ire == NULL) &&
20702 (test_allzero_ire == NULL) && (test_allone_ire == NULL)) {
20703 return;
20704 }
20705
20706 /*
20707 * We have established which bcast ires have to be replaced.
20708 * Next we try to locate ipifs that match those ires.
20709 * The rules are simple: If we find an ipif that matches on the subnet
20710 * address it will also match on the net address, the allzeros and
20711 * allones address. Any ipif that matches only on the net address will
20712 * also match the allzeros and allones addresses.
20713 * The other criterion is the ipif_flags. We look for non-deprecated
20714 * (and non-anycast and non-nolocal) ipifs as the best choice.
20715 * ipifs with check_flags matching (deprecated, etc.) are used only
20716 * if good ipifs are not available. While looping, we save existing
20717 * deprecated ipifs as backup_ipif.
20718 * We loop through all the ipifs for this ill looking for ipifs
20719 * whose broadcast addr matches the ipif passed in, but do not have
20720 * their own broadcast ires. For creating 0.0.0.0 and
20721 * 255.255.255.255, any ipif on this ill will do.
20722 */
20723 for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL;
20724 ipif = ipif->ipif_next) {
20725
20726 ASSERT(!ipif->ipif_isv6);
20727 /*
20728 * Already checked the ipif passed in.
20729 */
20730 if (ipif == test_ipif) {
20731 continue;
20732 }
20733
20734 /*
20735 * We only need to recreate broadcast ires if another ipif in
20736 * the same zone uses them. The new ires must be created in the
20737 * same zone.
20738 */
20739 if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) {
20740 continue;
20741 }
20742
20743 /*
20744 * Only interested in logical interfaces with valid local
20745 * addresses or with the ability to broadcast.
20746 */
20747 if ((ipif->ipif_subnet == 0) ||
20748 !(ipif->ipif_flags & IPIF_BROADCAST) ||
20749 (ipif->ipif_flags & IPIF_NOXMIT) ||
20750 !(ipif->ipif_flags & IPIF_UP)) {
20751 continue;
20752 }
20753 /*
20754 * Check if there is a net broadcast ire for this
20755 * net address. If it turns out that the ipif we are
20756 * about to take down owns this ire, we must make a
20757 * new one because it is potentially going away.
20758 */
20759 if (test_net_ire && (!net_bcast_ire_created)) {
20760 net_mask = ip_net_mask(ipif->ipif_subnet);
20761 net_addr = net_mask & ipif->ipif_subnet;
20762 if (net_addr == test_net_addr) {
20763 need_net_bcast_ire = B_TRUE;
20764 /*
20765 * Use a DEPRECATED ipif only if no good
20766 * ipifs are available. subnet_addr is
20767 * a better match than net_addr.
20768 */
20769 if ((ipif->ipif_flags & check_flags) &&
20770 (backup_ipif_net == NULL)) {
20771 backup_ipif_net = ipif;
20772 }
20773 }
20774 }
20775 /*
20776 * Check if there is a subnet broadcast ire for this
20777 * subnet address. If it turns out that the ipif we are
20778 * about to take down owns this ire, we must make a
20779 * new one because it is potentially going away.
20780 */
20781 if (test_subnet_ire && (!subnet_bcast_ire_created)) {
20782 subnet_mask = ipif->ipif_net_mask;
20783 subnet_addr = ipif->ipif_subnet;
20784 if (subnet_addr == test_subnet_addr) {
20785 need_subnet_bcast_ire = B_TRUE;
20786 if ((ipif->ipif_flags & check_flags) &&
20787 (backup_ipif_subnet == NULL)) {
20788 backup_ipif_subnet = ipif;
20789 }
20790 }
20791 }
20792
20793
20794 /* Short circuit here if this ipif is deprecated */
20795 if (ipif->ipif_flags & check_flags) {
20796 if ((test_allzero_ire != NULL) &&
20797 (!allzero_bcast_ire_created) &&
20798 (backup_ipif_allzeros == NULL)) {
20799 backup_ipif_allzeros = ipif;
20800 }
20801 if ((test_allone_ire != NULL) &&
20802 (!allone_bcast_ire_created) &&
20803 (backup_ipif_allones == NULL)) {
20804 backup_ipif_allones = ipif;
20805 }
20806 continue;
20807 }
20808
20809 /*
20810 * Found an ipif which has the same broadcast ire as the
20811 * ipif passed in and the ipif passed in "owns" the ire.
20812 * Create new broadcast ire's for this broadcast addr.
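 *
 * As a worked example (hypothetical addresses): for an ipif with
 * ipif_subnet 10.1.2.0 and ipif_net_mask 255.255.255.0, the
 * directed subnet broadcast created below is
 *
 *	~subnet_mask | subnet_addr
 *	    == ~255.255.255.0 | 10.1.2.0 == 10.1.2.255
 *
 * and, with the classful mask 255.0.0.0 returned by ip_net_mask(),
 * the net broadcast is ~255.0.0.0 | 10.0.0.0 == 10.255.255.255.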
20813 */ 20814 if (need_net_bcast_ire && !net_bcast_ire_created) { 20815 irep = ire_create_bcast(ipif, net_addr, irep); 20816 irep = ire_create_bcast(ipif, 20817 ~net_mask | net_addr, irep); 20818 net_bcast_ire_created = B_TRUE; 20819 } 20820 if (need_subnet_bcast_ire && !subnet_bcast_ire_created) { 20821 irep = ire_create_bcast(ipif, subnet_addr, irep); 20822 irep = ire_create_bcast(ipif, 20823 ~subnet_mask | subnet_addr, irep); 20824 subnet_bcast_ire_created = B_TRUE; 20825 } 20826 if (test_allzero_ire != NULL && !allzero_bcast_ire_created) { 20827 irep = ire_create_bcast(ipif, 0, irep); 20828 allzero_bcast_ire_created = B_TRUE; 20829 } 20830 if (test_allone_ire != NULL && !allone_bcast_ire_created) { 20831 irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep); 20832 allone_bcast_ire_created = B_TRUE; 20833 } 20834 /* 20835 * Once we have created all the appropriate ires, we 20836 * just break out of this loop to add what we have created. 20837 * This has been indented similar to ire_match_args for 20838 * readability. 20839 */ 20840 if (((test_net_ire == NULL) || 20841 (net_bcast_ire_created)) && 20842 ((test_subnet_ire == NULL) || 20843 (subnet_bcast_ire_created)) && 20844 ((test_allzero_ire == NULL) || 20845 (allzero_bcast_ire_created)) && 20846 ((test_allone_ire == NULL) || 20847 (allone_bcast_ire_created))) { 20848 break; 20849 } 20850 } 20851 20852 /* 20853 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs 20854 * exist. 6 pairs of bcast ires are needed. 20855 * Note - the old ires are deleted in ipif_down. 20856 */ 20857 if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) { 20858 ipif = backup_ipif_net; 20859 irep = ire_create_bcast(ipif, net_addr, irep); 20860 irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep); 20861 net_bcast_ire_created = B_TRUE; 20862 } 20863 if (need_subnet_bcast_ire && !subnet_bcast_ire_created && 20864 backup_ipif_subnet) { 20865 ipif = backup_ipif_subnet; 20866 irep = ire_create_bcast(ipif, subnet_addr, irep); 20867 irep = ire_create_bcast(ipif, 20868 ~subnet_mask | subnet_addr, irep); 20869 subnet_bcast_ire_created = B_TRUE; 20870 } 20871 if (test_allzero_ire != NULL && !allzero_bcast_ire_created && 20872 backup_ipif_allzeros) { 20873 irep = ire_create_bcast(backup_ipif_allzeros, 0, irep); 20874 allzero_bcast_ire_created = B_TRUE; 20875 } 20876 if (test_allone_ire != NULL && !allone_bcast_ire_created && 20877 backup_ipif_allones) { 20878 irep = ire_create_bcast(backup_ipif_allones, 20879 INADDR_BROADCAST, irep); 20880 allone_bcast_ire_created = B_TRUE; 20881 } 20882 20883 /* 20884 * If we can't create all of them, don't add any of them. 20885 * Code in ip_wput_ire and ire_to_ill assumes that we 20886 * always have a non-loopback copy and loopback copy 20887 * for a given address. 
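 *
 * For a hypothetical ipif on subnet 10.1.2.0/24 (class A network),
 * the full set created above would be the six broadcast addresses
 * 0.0.0.0, 255.255.255.255, 10.0.0.0 (net), 10.255.255.255 (net
 * broadcast), 10.1.2.0 (subnet) and 10.1.2.255 (subnet broadcast),
 * each with a non-loopback and a loopback copy -- hence the
 * 12-entry ire_array[] and the "6 pairs" noted above.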
20888 */ 20889 for (irep1 = irep; irep1 > ire_array; ) { 20890 irep1--; 20891 if (*irep1 == NULL) { 20892 ip0dbg(("ipif_check_bcast_ires: can't create " 20893 "IRE_BROADCAST, memory allocation failure\n")); 20894 while (irep > ire_array) { 20895 irep--; 20896 if (*irep != NULL) 20897 ire_delete(*irep); 20898 } 20899 goto bad; 20900 } 20901 } 20902 for (irep1 = irep; irep1 > ire_array; ) { 20903 int error; 20904 20905 irep1--; 20906 error = ire_add(irep1, NULL, NULL, NULL, B_FALSE); 20907 if (error == 0) { 20908 ire_refrele(*irep1); /* Held in ire_add */ 20909 } 20910 } 20911 bad: 20912 if (test_allzero_ire != NULL) 20913 ire_refrele(test_allzero_ire); 20914 if (test_allone_ire != NULL) 20915 ire_refrele(test_allone_ire); 20916 if (test_net_ire != NULL) 20917 ire_refrele(test_net_ire); 20918 if (test_subnet_ire != NULL) 20919 ire_refrele(test_subnet_ire); 20920 } 20921 20922 /* 20923 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 20924 * from lifr_flags and the name from lifr_name. 20925 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 20926 * since ipif_lookup_on_name uses the _isv6 flags when matching. 20927 * Returns EINPROGRESS when mp has been consumed by queueing it on 20928 * ill_pending_mp and the ioctl will complete in ip_rput. 20929 */ 20930 /* ARGSUSED */ 20931 int 20932 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20933 ip_ioctl_cmd_t *ipip, void *if_req) 20934 { 20935 int err; 20936 ill_t *ill; 20937 struct lifreq *lifr = (struct lifreq *)if_req; 20938 20939 ASSERT(ipif != NULL); 20940 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 20941 ASSERT(q->q_next != NULL); 20942 20943 ill = (ill_t *)q->q_ptr; 20944 /* 20945 * If we are not writer on 'q' then this interface exists already 20946 * and previous lookups (ipif_extract_lifreq_cmn) found this ipif. 20947 * So return EALREADY 20948 */ 20949 if (ill != ipif->ipif_ill) 20950 return (EALREADY); 20951 20952 if (ill->ill_name[0] != '\0') 20953 return (EALREADY); 20954 20955 /* 20956 * Set all the flags. Allows all kinds of override. Provide some 20957 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST 20958 * unless there is either multicast/broadcast support in the driver 20959 * or it is a pt-pt link. 20960 */ 20961 if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) { 20962 /* Meaningless to IP thus don't allow them to be set. */ 20963 ip1dbg(("ip_setname: EINVAL 1\n")); 20964 return (EINVAL); 20965 } 20966 /* 20967 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the 20968 * ill_bcast_addr_length info. 20969 */ 20970 if (!ill->ill_needs_attach && 20971 ((lifr->lifr_flags & IFF_MULTICAST) && 20972 !(lifr->lifr_flags & IFF_POINTOPOINT) && 20973 ill->ill_bcast_addr_length == 0)) { 20974 /* Link not broadcast/pt-pt capable i.e. no multicast */ 20975 ip1dbg(("ip_setname: EINVAL 2\n")); 20976 return (EINVAL); 20977 } 20978 if ((lifr->lifr_flags & IFF_BROADCAST) && 20979 ((lifr->lifr_flags & IFF_IPV6) || 20980 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 20981 /* Link not broadcast capable or IPv6 i.e. 
no broadcast */
20982 ip1dbg(("ip_setname: EINVAL 3\n"));
20983 return (EINVAL);
20984 }
20985 if (lifr->lifr_flags & IFF_UP) {
20986 /* Can only be set with SIOCSLIFFLAGS */
20987 ip1dbg(("ip_setname: EINVAL 4\n"));
20988 return (EINVAL);
20989 }
20990 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 &&
20991 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) {
20992 ip1dbg(("ip_setname: EINVAL 5\n"));
20993 return (EINVAL);
20994 }
20995 /*
20996 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces.
20997 */
20998 if ((lifr->lifr_flags & IFF_XRESOLV) &&
20999 !(lifr->lifr_flags & IFF_IPV6) &&
21000 !(ipif->ipif_isv6)) {
21001 ip1dbg(("ip_setname: EINVAL 6\n"));
21002 return (EINVAL);
21003 }
21004
21005 /*
21006 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence
21007 * we have all the flags here. So, we assign rather than OR.
21008 * We can't OR the flags here because we don't want to set
21009 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in
21010 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending
21011 * on the lifr_flags value.
21012 */
21013 /*
21014 * This ill has not been inserted into the global list.
21015 * So we are still single threaded and don't need any lock
21016 */
21017 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS &
21018 ~IFF_DUPLICATE;
21019 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS;
21020 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS;
21021
21022 /* We started off as V4. */
21023 if (ill->ill_flags & ILLF_IPV6) {
21024 ill->ill_phyint->phyint_illv6 = ill;
21025 ill->ill_phyint->phyint_illv4 = NULL;
21026 }
21027 err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa);
21028 return (err);
21029 }
21030
21031 /* ARGSUSED */
21032 int
21033 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21034 ip_ioctl_cmd_t *ipip, void *if_req)
21035 {
21036 /*
21037 * ill_phyint_reinit merged the v4 and v6 into a single
21038 * ipsq. Could also have become part of an ipmp group in the
21039 * process, and we might not have been able to complete the
21040 * slifname in ipif_set_values, if we could not become
21041 * exclusive. If so, restart it here.
21042 */
21043 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
21044 }
21045
21046 /*
21047 * Return a pointer to the ipif which matches the index, IP version type and
21048 * zoneid.
21049 */
21050 ipif_t *
21051 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
21052 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err)
21053 {
21054 ill_t *ill;
21055 ipsq_t *ipsq;
21056 phyint_t *phyi;
21057 ipif_t *ipif;
21058
21059 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
21060 (q != NULL && mp != NULL && func != NULL && err != NULL));
21061
21062 if (err != NULL)
21063 *err = 0;
21064
21065 /*
21066 * Indexes are stored in the phyint - a common structure
21067 * to both IPv4 and IPv6.
21068 */
21069
21070 rw_enter(&ill_g_lock, RW_READER);
21071 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
21072 (void *) &index, NULL);
21073 if (phyi != NULL) {
21074 ill = isv6 ?
phyi->phyint_illv6 : phyi->phyint_illv4;
21075 if (ill == NULL) {
21076 rw_exit(&ill_g_lock);
21077 if (err != NULL)
21078 *err = ENXIO;
21079 return (NULL);
21080 }
21081 GRAB_CONN_LOCK(q);
21082 mutex_enter(&ill->ill_lock);
21083 if (ILL_CAN_LOOKUP(ill)) {
21084 for (ipif = ill->ill_ipif; ipif != NULL;
21085 ipif = ipif->ipif_next) {
21086 if (IPIF_CAN_LOOKUP(ipif) &&
21087 (zoneid == ALL_ZONES ||
21088 zoneid == ipif->ipif_zoneid ||
21089 ipif->ipif_zoneid == ALL_ZONES)) {
21090 ipif_refhold_locked(ipif);
21091 mutex_exit(&ill->ill_lock);
21092 RELEASE_CONN_LOCK(q);
21093 rw_exit(&ill_g_lock);
21094 return (ipif);
21095 }
21096 }
21097 } else if (ILL_CAN_WAIT(ill, q)) {
21098 ipsq = ill->ill_phyint->phyint_ipsq;
21099 mutex_enter(&ipsq->ipsq_lock);
21100 rw_exit(&ill_g_lock);
21101 mutex_exit(&ill->ill_lock);
21102 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
21103 mutex_exit(&ipsq->ipsq_lock);
21104 RELEASE_CONN_LOCK(q);
21105 *err = EINPROGRESS;
21106 return (NULL);
21107 }
21108 mutex_exit(&ill->ill_lock);
21109 RELEASE_CONN_LOCK(q);
21110 }
21111 rw_exit(&ill_g_lock);
21112 if (err != NULL)
21113 *err = ENXIO;
21114 return (NULL);
21115 }
21116
21117 typedef struct conn_change_s {
21118 uint_t cc_old_ifindex;
21119 uint_t cc_new_ifindex;
21120 } conn_change_t;
21121
21122 /*
21123 * ipcl_walk function for changing interface index.
21124 */
21125 static void
21126 conn_change_ifindex(conn_t *connp, caddr_t arg)
21127 {
21128 conn_change_t *connc;
21129 uint_t old_ifindex;
21130 uint_t new_ifindex;
21131 int i;
21132 ilg_t *ilg;
21133
21134 connc = (conn_change_t *)arg;
21135 old_ifindex = connc->cc_old_ifindex;
21136 new_ifindex = connc->cc_new_ifindex;
21137
21138 if (connp->conn_orig_bound_ifindex == old_ifindex)
21139 connp->conn_orig_bound_ifindex = new_ifindex;
21140
21141 if (connp->conn_orig_multicast_ifindex == old_ifindex)
21142 connp->conn_orig_multicast_ifindex = new_ifindex;
21143
21144 if (connp->conn_orig_xmit_ifindex == old_ifindex)
21145 connp->conn_orig_xmit_ifindex = new_ifindex;
21146
21147 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
21148 ilg = &connp->conn_ilg[i];
21149 if (ilg->ilg_orig_ifindex == old_ifindex)
21150 ilg->ilg_orig_ifindex = new_ifindex;
21151 }
21152 }
21153
21154 /*
21155 * Walk all the ipifs and ilms on this ill and change the orig_ifindex
21156 * to new_index if it matches the old_index.
21157 *
21158 * Failovers typically happen within a group of ills. But somebody
21159 * can remove an ill from the group after a failover has happened. If
21160 * we are setting the ifindex after this, we potentially need to
21161 * look at all the ills rather than just the ones in the group.
21162 * We cut down the work by looking at matching ill_net_types
21163 * and ill_types, as we could not possibly have grouped them together.
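 *
 * For reference, the intended call pattern (as used by
 * ip_sioctl_slifindex() below; sketch only):
 *
 *	conn_change_t connc;
 *
 *	connc.cc_old_ifindex = old_index;
 *	connc.cc_new_ifindex = index;
 *	ip_change_ifindex(ill, &connc);
 *	ipcl_walk(conn_change_ifindex, (caddr_t)&connc);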
21164 */ 21165 static void 21166 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 21167 { 21168 ill_t *ill; 21169 ipif_t *ipif; 21170 uint_t old_ifindex; 21171 uint_t new_ifindex; 21172 ilm_t *ilm; 21173 ill_walk_context_t ctx; 21174 21175 old_ifindex = connc->cc_old_ifindex; 21176 new_ifindex = connc->cc_new_ifindex; 21177 21178 rw_enter(&ill_g_lock, RW_READER); 21179 ill = ILL_START_WALK_ALL(&ctx); 21180 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 21181 if ((ill_orig->ill_net_type != ill->ill_net_type) || 21182 (ill_orig->ill_type != ill->ill_type)) { 21183 continue; 21184 } 21185 for (ipif = ill->ill_ipif; ipif != NULL; 21186 ipif = ipif->ipif_next) { 21187 if (ipif->ipif_orig_ifindex == old_ifindex) 21188 ipif->ipif_orig_ifindex = new_ifindex; 21189 } 21190 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 21191 if (ilm->ilm_orig_ifindex == old_ifindex) 21192 ilm->ilm_orig_ifindex = new_ifindex; 21193 } 21194 } 21195 rw_exit(&ill_g_lock); 21196 } 21197 21198 /* 21199 * We first need to ensure that the new index is unique, and 21200 * then carry the change across both v4 and v6 ill representation 21201 * of the physical interface. 21202 */ 21203 /* ARGSUSED */ 21204 int 21205 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21206 ip_ioctl_cmd_t *ipip, void *ifreq) 21207 { 21208 ill_t *ill; 21209 ill_t *ill_other; 21210 phyint_t *phyi; 21211 int old_index; 21212 conn_change_t connc; 21213 struct ifreq *ifr = (struct ifreq *)ifreq; 21214 struct lifreq *lifr = (struct lifreq *)ifreq; 21215 uint_t index; 21216 ill_t *ill_v4; 21217 ill_t *ill_v6; 21218 21219 if (ipip->ipi_cmd_type == IF_CMD) 21220 index = ifr->ifr_index; 21221 else 21222 index = lifr->lifr_index; 21223 21224 /* 21225 * Only allow on physical interface. Also, index zero is illegal. 21226 * 21227 * Need to check for PHYI_FAILED and PHYI_INACTIVE 21228 * 21229 * 1) If PHYI_FAILED is set, a failover could have happened which 21230 * implies a possible failback might have to happen. As failback 21231 * depends on the old index, we should fail setting the index. 21232 * 21233 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 21234 * any addresses or multicast memberships are failed over to 21235 * a non-STANDBY interface. As failback depends on the old 21236 * index, we should fail setting the index for this case also. 21237 * 21238 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 21239 * Be consistent with PHYI_FAILED and fail the ioctl. 21240 */ 21241 ill = ipif->ipif_ill; 21242 phyi = ill->ill_phyint; 21243 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 21244 ipif->ipif_id != 0 || index == 0) { 21245 return (EINVAL); 21246 } 21247 old_index = phyi->phyint_ifindex; 21248 21249 /* If the index is not changing, no work to do */ 21250 if (old_index == index) 21251 return (0); 21252 21253 /* 21254 * Use ill_lookup_on_ifindex to determine if the 21255 * new index is unused and if so allow the change. 21256 */ 21257 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL); 21258 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL); 21259 if (ill_v6 != NULL || ill_v4 != NULL) { 21260 if (ill_v4 != NULL) 21261 ill_refrele(ill_v4); 21262 if (ill_v6 != NULL) 21263 ill_refrele(ill_v6); 21264 return (EBUSY); 21265 } 21266 21267 /* 21268 * The new index is unused. Set it in the phyint. 21269 * Locate the other ill so that we can send a routing 21270 * sockets message. 
21271 */ 21272 if (ill->ill_isv6) { 21273 ill_other = phyi->phyint_illv4; 21274 } else { 21275 ill_other = phyi->phyint_illv6; 21276 } 21277 21278 phyi->phyint_ifindex = index; 21279 21280 connc.cc_old_ifindex = old_index; 21281 connc.cc_new_ifindex = index; 21282 ip_change_ifindex(ill, &connc); 21283 ipcl_walk(conn_change_ifindex, (caddr_t)&connc); 21284 21285 /* Send the routing sockets message */ 21286 ip_rts_ifmsg(ipif); 21287 if (ill_other != NULL) 21288 ip_rts_ifmsg(ill_other->ill_ipif); 21289 21290 return (0); 21291 } 21292 21293 /* ARGSUSED */ 21294 int 21295 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21296 ip_ioctl_cmd_t *ipip, void *ifreq) 21297 { 21298 struct ifreq *ifr = (struct ifreq *)ifreq; 21299 struct lifreq *lifr = (struct lifreq *)ifreq; 21300 21301 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 21302 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21303 /* Get the interface index */ 21304 if (ipip->ipi_cmd_type == IF_CMD) { 21305 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21306 } else { 21307 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21308 } 21309 return (0); 21310 } 21311 21312 /* ARGSUSED */ 21313 int 21314 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21315 ip_ioctl_cmd_t *ipip, void *ifreq) 21316 { 21317 struct lifreq *lifr = (struct lifreq *)ifreq; 21318 21319 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 21320 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21321 /* Get the interface zone */ 21322 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21323 lifr->lifr_zoneid = ipif->ipif_zoneid; 21324 return (0); 21325 } 21326 21327 /* 21328 * Set the zoneid of an interface. 21329 */ 21330 /* ARGSUSED */ 21331 int 21332 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21333 ip_ioctl_cmd_t *ipip, void *ifreq) 21334 { 21335 struct lifreq *lifr = (struct lifreq *)ifreq; 21336 int err = 0; 21337 boolean_t need_up = B_FALSE; 21338 zone_t *zptr; 21339 zone_status_t status; 21340 zoneid_t zoneid; 21341 21342 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21343 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 21344 if (!is_system_labeled()) 21345 return (ENOTSUP); 21346 zoneid = GLOBAL_ZONEID; 21347 } 21348 21349 /* cannot assign instance zero to a non-global zone */ 21350 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 21351 return (ENOTSUP); 21352 21353 /* 21354 * Cannot assign to a zone that doesn't exist or is shutting down. In 21355 * the event of a race with the zone shutdown processing, since IP 21356 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 21357 * interface will be cleaned up even if the zone is shut down 21358 * immediately after the status check. If the interface can't be brought 21359 * down right away, and the zone is shut down before the restart 21360 * function is called, we resolve the possible races by rechecking the 21361 * zone status in the restart function. 21362 */ 21363 if ((zptr = zone_find_by_id(zoneid)) == NULL) 21364 return (EINVAL); 21365 status = zone_status_get(zptr); 21366 zone_rele(zptr); 21367 21368 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 21369 return (EINVAL); 21370 21371 if (ipif->ipif_flags & IPIF_UP) { 21372 /* 21373 * If the interface is already marked up, 21374 * we call ipif_down which will take care 21375 * of ditching any IREs that have been set 21376 * up based on the old interface address. 
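 *
 * For reference, a userland sketch of the ioctl this handler
 * serves (hypothetical interface and zone names; socket setup and
 * error handling elided):
 *
 *	struct lifreq lifr;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	lifr.lifr_zoneid = getzoneidbyname("myzone");
 *	(void) ioctl(s, SIOCSLIFZONE, (caddr_t)&lifr);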
21377 */ 21378 err = ipif_logical_down(ipif, q, mp); 21379 if (err == EINPROGRESS) 21380 return (err); 21381 ipif_down_tail(ipif); 21382 need_up = B_TRUE; 21383 } 21384 21385 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 21386 return (err); 21387 } 21388 21389 static int 21390 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 21391 queue_t *q, mblk_t *mp, boolean_t need_up) 21392 { 21393 int err = 0; 21394 21395 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 21396 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21397 21398 /* Set the new zone id. */ 21399 ipif->ipif_zoneid = zoneid; 21400 21401 /* Update sctp list */ 21402 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 21403 21404 if (need_up) { 21405 /* 21406 * Now bring the interface back up. If this 21407 * is the only IPIF for the ILL, ipif_up 21408 * will have to re-bind to the device, so 21409 * we may get back EINPROGRESS, in which 21410 * case, this IOCTL will get completed in 21411 * ip_rput_dlpi when we see the DL_BIND_ACK. 21412 */ 21413 err = ipif_up(ipif, q, mp); 21414 } 21415 return (err); 21416 } 21417 21418 /* ARGSUSED */ 21419 int 21420 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21421 ip_ioctl_cmd_t *ipip, void *if_req) 21422 { 21423 struct lifreq *lifr = (struct lifreq *)if_req; 21424 zoneid_t zoneid; 21425 zone_t *zptr; 21426 zone_status_t status; 21427 21428 ASSERT(ipif->ipif_id != 0); 21429 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21430 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 21431 zoneid = GLOBAL_ZONEID; 21432 21433 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 21434 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21435 21436 /* 21437 * We recheck the zone status to resolve the following race condition: 21438 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 21439 * 2) hme0:1 is up and can't be brought down right away; 21440 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 21441 * 3) zone "myzone" is halted; the zone status switches to 21442 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 21443 * the interfaces to remove - hme0:1 is not returned because it's not 21444 * yet in "myzone", so it won't be removed; 21445 * 4) the restart function for SIOCSLIFZONE is called; without the 21446 * status check here, we would have hme0:1 in "myzone" after it's been 21447 * destroyed. 21448 * Note that if the status check fails, we need to bring the interface 21449 * back to its state prior to ip_sioctl_slifzone(), hence the call to 21450 * ipif_up_done[_v6](). 
21451 */ 21452 status = ZONE_IS_UNINITIALIZED; 21453 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 21454 status = zone_status_get(zptr); 21455 zone_rele(zptr); 21456 } 21457 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 21458 if (ipif->ipif_isv6) { 21459 (void) ipif_up_done_v6(ipif); 21460 } else { 21461 (void) ipif_up_done(ipif); 21462 } 21463 return (EINVAL); 21464 } 21465 21466 ipif_down_tail(ipif); 21467 21468 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 21469 B_TRUE)); 21470 } 21471 21472 /* ARGSUSED */ 21473 int 21474 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21475 ip_ioctl_cmd_t *ipip, void *ifreq) 21476 { 21477 struct lifreq *lifr = ifreq; 21478 21479 ASSERT(q->q_next == NULL); 21480 ASSERT(CONN_Q(q)); 21481 21482 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 21483 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21484 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 21485 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 21486 21487 return (0); 21488 } 21489 21490 21491 /* Find the previous ILL in this usesrc group */ 21492 static ill_t * 21493 ill_prev_usesrc(ill_t *uill) 21494 { 21495 ill_t *ill; 21496 21497 for (ill = uill->ill_usesrc_grp_next; 21498 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 21499 ill = ill->ill_usesrc_grp_next) 21500 /* do nothing */; 21501 return (ill); 21502 } 21503 21504 /* 21505 * Release all members of the usesrc group. This routine is called 21506 * from ill_delete when the interface being unplumbed is the 21507 * group head. 21508 */ 21509 static void 21510 ill_disband_usesrc_group(ill_t *uill) 21511 { 21512 ill_t *next_ill, *tmp_ill; 21513 ASSERT(RW_WRITE_HELD(&ill_g_usesrc_lock)); 21514 next_ill = uill->ill_usesrc_grp_next; 21515 21516 do { 21517 ASSERT(next_ill != NULL); 21518 tmp_ill = next_ill->ill_usesrc_grp_next; 21519 ASSERT(tmp_ill != NULL); 21520 next_ill->ill_usesrc_grp_next = NULL; 21521 next_ill->ill_usesrc_ifindex = 0; 21522 next_ill = tmp_ill; 21523 } while (next_ill->ill_usesrc_ifindex != 0); 21524 uill->ill_usesrc_grp_next = NULL; 21525 } 21526 21527 /* 21528 * Remove the client usesrc ILL from the list and relink to a new list 21529 */ 21530 int 21531 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 21532 { 21533 ill_t *ill, *tmp_ill; 21534 21535 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 21536 (uill != NULL) && RW_WRITE_HELD(&ill_g_usesrc_lock)); 21537 21538 /* 21539 * Check if the usesrc client ILL passed in is not already 21540 * in use as a usesrc ILL i.e one whose source address is 21541 * in use OR a usesrc ILL is not already in use as a usesrc 21542 * client ILL 21543 */ 21544 if ((ucill->ill_usesrc_ifindex == 0) || 21545 (uill->ill_usesrc_ifindex != 0)) { 21546 return (-1); 21547 } 21548 21549 ill = ill_prev_usesrc(ucill); 21550 ASSERT(ill->ill_usesrc_grp_next != NULL); 21551 21552 /* Remove from the current list */ 21553 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 21554 /* Only two elements in the list */ 21555 ASSERT(ill->ill_usesrc_ifindex == 0); 21556 ill->ill_usesrc_grp_next = NULL; 21557 } else { 21558 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 21559 } 21560 21561 if (ifindex == 0) { 21562 ucill->ill_usesrc_ifindex = 0; 21563 ucill->ill_usesrc_grp_next = NULL; 21564 return (0); 21565 } 21566 21567 ucill->ill_usesrc_ifindex = ifindex; 21568 tmp_ill = uill->ill_usesrc_grp_next; 21569 uill->ill_usesrc_grp_next = ucill; 21570 
ucill->ill_usesrc_grp_next =
21571 (tmp_ill != NULL) ? tmp_ill : uill;
21572 return (0);
21573 }
21574
21575 /*
21576 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
21577 * ip.c for locking details.
21578 */
21579 /* ARGSUSED */
21580 int
21581 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21582 ip_ioctl_cmd_t *ipip, void *ifreq)
21583 {
21584 struct lifreq *lifr = (struct lifreq *)ifreq;
21585 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
21586 ill_flag_changed = B_FALSE;
21587 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
21588 int err = 0, ret;
21589 uint_t ifindex;
21590 phyint_t *us_phyint, *us_cli_phyint;
21591 ipsq_t *ipsq = NULL;
21592
21593 ASSERT(IAM_WRITER_IPIF(ipif));
21594 ASSERT(q->q_next == NULL);
21595 ASSERT(CONN_Q(q));
21596
21597 isv6 = (Q_TO_CONN(q))->conn_af_isv6;
21598 us_cli_phyint = usesrc_cli_ill->ill_phyint;
21599
21600 ASSERT(us_cli_phyint != NULL);
21601
21602 /*
21603 * If the client ILL is being used for IPMP, abort.
21604 * Note, this can be done before ipsq_try_enter since we are already
21605 * exclusive on this ILL.
21606 */
21607 if ((us_cli_phyint->phyint_groupname != NULL) ||
21608 (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
21609 return (EINVAL);
21610 }
21611
21612 ifindex = lifr->lifr_index;
21613 if (ifindex == 0) {
21614 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
21615 /* non usesrc group interface, nothing to reset */
21616 return (0);
21617 }
21618 ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
21619 /* valid reset request */
21620 reset_flg = B_TRUE;
21621 }
21622
21623 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
21624 ip_process_ioctl, &err);
21625
21626 if (usesrc_ill == NULL) {
21627 return (err);
21628 }
21629
21630 /*
21631 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
21632 * group nor can either of the interfaces be used for standby. So
21633 * to guarantee mutual exclusion with ip_sioctl_flags (which sets
21634 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
21635 * we need to be exclusive on the ipsq belonging to the usesrc_ill.
21636 * We are already exclusive on this ipsq, i.e., the ipsq corresponding
21637 * to the usesrc_cli_ill.
21638 */
21639 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
21640 NEW_OP, B_TRUE);
21641 if (ipsq == NULL) {
21642 err = EINPROGRESS;
21643 /* Operation enqueued on the ipsq of the usesrc ILL */
21644 goto done;
21645 }
21646
21647 /* Check if the usesrc_ill is used for IPMP */
21648 us_phyint = usesrc_ill->ill_phyint;
21649 if ((us_phyint->phyint_groupname != NULL) ||
21650 (us_phyint->phyint_flags & PHYI_STANDBY)) {
21651 err = EINVAL;
21652 goto done;
21653 }
21654
21655 /*
21656 * If the client is already in use as a usesrc_ill or a usesrc_ill is
21657 * already a client, then return EINVAL.
21658 */
21659 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
21660 err = EINVAL;
21661 goto done;
21662 }
21663
21664 /*
21665 * If the ill_usesrc_ifindex field is already set to what it needs to
21666 * be then this is a duplicate operation.
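 *
 * For reference, a userland sketch of the ioctl being handled here
 * (hypothetical device names; roughly what "ifconfig hme0 usesrc
 * vni0" issues):
 *
 *	struct lifreq lifr;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_index = if_nametoindex("vni0");
 *	(void) ioctl(s, SIOCSLIFUSESRC, (caddr_t)&lifr);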
21667 */
21668 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
21669 err = 0;
21670 goto done;
21671 }
21672
21673 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
21674 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
21675 usesrc_ill->ill_isv6));
21676
21677 /*
21678 * The next step ensures that no new ires will be created referencing
21679 * the client ill until the ILL_CHANGING flag is cleared. Then
21680 * we go through an ire walk deleting all ire caches that reference
21681 * the client ill. New ires referencing the client ill that are added
21682 * to the ire table before the ILL_CHANGING flag is set will be
21683 * cleaned up by the ire walk below. Attempts to add new ires referencing
21684 * the client ill while the ILL_CHANGING flag is set will fail
21685 * during the ire_add in ire_atomic_start. ire_atomic_start atomically
21686 * checks (under the ill_g_usesrc_lock) that the ire being added
21687 * is not stale, i.e., the ire_stq and ire_ipif are consistent and
21688 * belong to the same usesrc group.
21689 */
21690 mutex_enter(&usesrc_cli_ill->ill_lock);
21691 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
21692 mutex_exit(&usesrc_cli_ill->ill_lock);
21693 ill_flag_changed = B_TRUE;
21694
21695 if (ipif->ipif_isv6)
21696 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
21697 ALL_ZONES);
21698 else
21699 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
21700 ALL_ZONES);
21701
21702 /*
21703 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
21704 * and the ill_usesrc_ifindex fields
21705 */
21706 rw_enter(&ill_g_usesrc_lock, RW_WRITER);
21707
21708 if (reset_flg) {
21709 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
21710 if (ret != 0) {
21711 err = EINVAL;
21712 }
21713 rw_exit(&ill_g_usesrc_lock);
21714 goto done;
21715 }
21716
21717 /*
21718 * Four possibilities to consider:
21719 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
21720 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
21721 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
21722 * 4. Both are part of their respective usesrc groups
21723 */
21724 if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
21725 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
21726 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
21727 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
21728 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
21729 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
21730 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
21731 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
21732 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
21733 /* Insert at head of list */
21734 usesrc_cli_ill->ill_usesrc_grp_next =
21735 usesrc_ill->ill_usesrc_grp_next;
21736 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
21737 } else {
21738 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
21739 ifindex);
21740 if (ret != 0)
21741 err = EINVAL;
21742 }
21743 rw_exit(&ill_g_usesrc_lock);
21744
21745 done:
21746 if (ill_flag_changed) {
21747 mutex_enter(&usesrc_cli_ill->ill_lock);
21748 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
21749 mutex_exit(&usesrc_cli_ill->ill_lock);
21750 }
21751 if (ipsq != NULL)
21752 ipsq_exit(ipsq, B_TRUE, B_TRUE);
21753 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */
21754 ill_refrele(usesrc_ill);
21755 return (err);
21756 }
21757
21758 /*
21759 * Comparison function used by avl.
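 *
 * For reference, how a key-style comparator like this one is
 * consumed (sketch; mirrors the lookup in ipif_lookup_on_ifindex()
 * above):
 *
 *	uint_t index = 2;
 *	phyint_t *phyi;
 *
 *	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
 *	    (void *)&index, NULL);
 *
 * The first argument handed to the comparator is the search key
 * (a pointer to the index), the second is the tree node.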
21760 */ 21761 static int 21762 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 21763 { 21764 21765 uint_t index; 21766 21767 ASSERT(phyip != NULL && index_ptr != NULL); 21768 21769 index = *((uint_t *)index_ptr); 21770 /* 21771 * let the phyint with the lowest index be on top. 21772 */ 21773 if (((phyint_t *)phyip)->phyint_ifindex < index) 21774 return (1); 21775 if (((phyint_t *)phyip)->phyint_ifindex > index) 21776 return (-1); 21777 return (0); 21778 } 21779 21780 /* 21781 * comparison function used by avl. 21782 */ 21783 static int 21784 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 21785 { 21786 ill_t *ill; 21787 int res = 0; 21788 21789 ASSERT(phyip != NULL && name_ptr != NULL); 21790 21791 if (((phyint_t *)phyip)->phyint_illv4) 21792 ill = ((phyint_t *)phyip)->phyint_illv4; 21793 else 21794 ill = ((phyint_t *)phyip)->phyint_illv6; 21795 ASSERT(ill != NULL); 21796 21797 res = strcmp(ill->ill_name, (char *)name_ptr); 21798 if (res > 0) 21799 return (1); 21800 else if (res < 0) 21801 return (-1); 21802 return (0); 21803 } 21804 /* 21805 * This function is called from ill_delete when the ill is being 21806 * unplumbed. We remove the reference from the phyint and we also 21807 * free the phyint when there are no more references to it. 21808 */ 21809 static void 21810 ill_phyint_free(ill_t *ill) 21811 { 21812 phyint_t *phyi; 21813 phyint_t *next_phyint; 21814 ipsq_t *cur_ipsq; 21815 21816 ASSERT(ill->ill_phyint != NULL); 21817 21818 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 21819 phyi = ill->ill_phyint; 21820 ill->ill_phyint = NULL; 21821 /* 21822 * ill_init allocates a phyint always to store the copy 21823 * of flags relevant to phyint. At that point in time, we could 21824 * not assign the name and hence phyint_illv4/v6 could not be 21825 * initialized. Later in ipif_set_values, we assign the name to 21826 * the ill, at which point in time we assign phyint_illv4/v6. 21827 * Thus we don't rely on phyint_illv6 to be initialized always. 21828 */ 21829 if (ill->ill_flags & ILLF_IPV6) { 21830 phyi->phyint_illv6 = NULL; 21831 } else { 21832 phyi->phyint_illv4 = NULL; 21833 } 21834 /* 21835 * ipif_down removes it from the group when the last ipif goes 21836 * down. 21837 */ 21838 ASSERT(ill->ill_group == NULL); 21839 21840 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 21841 return; 21842 21843 /* 21844 * Make sure this phyint was put in the list. 21845 */ 21846 if (phyi->phyint_ifindex > 0) { 21847 avl_remove(&phyint_g_list.phyint_list_avl_by_index, 21848 phyi); 21849 avl_remove(&phyint_g_list.phyint_list_avl_by_name, 21850 phyi); 21851 } 21852 /* 21853 * remove phyint from the ipsq list. 21854 */ 21855 cur_ipsq = phyi->phyint_ipsq; 21856 if (phyi == cur_ipsq->ipsq_phyint_list) { 21857 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 21858 } else { 21859 next_phyint = cur_ipsq->ipsq_phyint_list; 21860 while (next_phyint != NULL) { 21861 if (next_phyint->phyint_ipsq_next == phyi) { 21862 next_phyint->phyint_ipsq_next = 21863 phyi->phyint_ipsq_next; 21864 break; 21865 } 21866 next_phyint = next_phyint->phyint_ipsq_next; 21867 } 21868 ASSERT(next_phyint != NULL); 21869 } 21870 IPSQ_DEC_REF(cur_ipsq); 21871 21872 if (phyi->phyint_groupname_len != 0) { 21873 ASSERT(phyi->phyint_groupname != NULL); 21874 mi_free(phyi->phyint_groupname); 21875 } 21876 mi_free(phyi); 21877 } 21878 21879 /* 21880 * Attach the ill to the phyint structure which can be shared by both 21881 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. 
This 21882 * function is called from ipif_set_values and ill_lookup_on_name (for 21883 * loopback) where we know the name of the ill. We lookup the ill and if 21884 * there is one present already with the name use that phyint. Otherwise 21885 * reuse the one allocated by ill_init. 21886 */ 21887 static void 21888 ill_phyint_reinit(ill_t *ill) 21889 { 21890 boolean_t isv6 = ill->ill_isv6; 21891 phyint_t *phyi_old; 21892 phyint_t *phyi; 21893 avl_index_t where = 0; 21894 ill_t *ill_other = NULL; 21895 ipsq_t *ipsq; 21896 21897 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 21898 21899 phyi_old = ill->ill_phyint; 21900 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 21901 phyi_old->phyint_illv6 == NULL)); 21902 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 21903 phyi_old->phyint_illv4 == NULL)); 21904 ASSERT(phyi_old->phyint_ifindex == 0); 21905 21906 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name, 21907 ill->ill_name, &where); 21908 21909 /* 21910 * 1. We grabbed the ill_g_lock before inserting this ill into 21911 * the global list of ills. So no other thread could have located 21912 * this ill and hence the ipsq of this ill is guaranteed to be empty. 21913 * 2. Now locate the other protocol instance of this ill. 21914 * 3. Now grab both ill locks in the right order, and the phyint lock of 21915 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 21916 * of neither ill can change. 21917 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 21918 * other ill. 21919 * 5. Release all locks. 21920 */ 21921 21922 /* 21923 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 21924 * we are initializing IPv4. 21925 */ 21926 if (phyi != NULL) { 21927 ill_other = (isv6) ? phyi->phyint_illv4 : 21928 phyi->phyint_illv6; 21929 ASSERT(ill_other->ill_phyint != NULL); 21930 ASSERT((isv6 && !ill_other->ill_isv6) || 21931 (!isv6 && ill_other->ill_isv6)); 21932 GRAB_ILL_LOCKS(ill, ill_other); 21933 /* 21934 * We are potentially throwing away phyint_flags which 21935 * could be different from the one that we obtain from 21936 * ill_other->ill_phyint. But it is okay as we are assuming 21937 * that the state maintained within IP is correct. 21938 */ 21939 mutex_enter(&phyi->phyint_lock); 21940 if (isv6) { 21941 ASSERT(phyi->phyint_illv6 == NULL); 21942 phyi->phyint_illv6 = ill; 21943 } else { 21944 ASSERT(phyi->phyint_illv4 == NULL); 21945 phyi->phyint_illv4 = ill; 21946 } 21947 /* 21948 * This is a new ill, currently undergoing SLIFNAME 21949 * So we could not have joined an IPMP group until now. 21950 */ 21951 ASSERT(phyi_old->phyint_ipsq_next == NULL && 21952 phyi_old->phyint_groupname == NULL); 21953 21954 /* 21955 * This phyi_old is going away. Decref ipsq_refs and 21956 * assert it is zero. The ipsq itself will be freed in 21957 * ipsq_exit 21958 */ 21959 ipsq = phyi_old->phyint_ipsq; 21960 IPSQ_DEC_REF(ipsq); 21961 ASSERT(ipsq->ipsq_refs == 0); 21962 /* Get the singleton phyint out of the ipsq list */ 21963 ASSERT(phyi_old->phyint_ipsq_next == NULL); 21964 ipsq->ipsq_phyint_list = NULL; 21965 phyi_old->phyint_illv4 = NULL; 21966 phyi_old->phyint_illv6 = NULL; 21967 mi_free(phyi_old); 21968 } else { 21969 mutex_enter(&ill->ill_lock); 21970 /* 21971 * We don't need to acquire any lock, since 21972 * the ill is not yet visible globally and we 21973 * have not yet released the ill_g_lock. 21974 */ 21975 phyi = phyi_old; 21976 mutex_enter(&phyi->phyint_lock); 21977 /* XXX We need a recovery strategy here. 
*/
21978 if (!phyint_assign_ifindex(phyi))
21979 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
21980
21981 avl_insert(&phyint_g_list.phyint_list_avl_by_name,
21982 (void *)phyi, where);
21983
21984 (void) avl_find(&phyint_g_list.phyint_list_avl_by_index,
21985 &phyi->phyint_ifindex, &where);
21986 avl_insert(&phyint_g_list.phyint_list_avl_by_index,
21987 (void *)phyi, where);
21988 }
21989
21990 /*
21991 * Reassigning ill_phyint automatically reassigns the ipsq also.
21992 * The pending mp is not affected because that is maintained on a per-ill basis.
21993 */
21994 ill->ill_phyint = phyi;
21995
21996 /*
21997 * Keep the index in ipif_orig_ifindex to be used by FAILOVER.
21998 * We do this here as when the first ipif was allocated,
21999 * ipif_allocate does not know the right interface index.
22000 */
22001
22002 ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;
22003 /*
22004 * Now that the phyint's ifindex has been assigned, complete the
22005 * remaining initialization.
22006 */
22007 if (ill->ill_isv6) {
22008 ill->ill_ip6_mib->ipv6IfIndex =
22009 ill->ill_phyint->phyint_ifindex;
22010 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
22011 ill->ill_phyint->phyint_ifindex;
22012 }
22013
22014 RELEASE_ILL_LOCKS(ill, ill_other);
22015 mutex_exit(&phyi->phyint_lock);
22016 }
22017
22018 /*
22019 * Notify any downstream modules of the name of this interface.
22020 * An M_IOCTL is used even though we don't expect a successful reply.
22021 * Any reply message from the driver (presumably an M_IOCNAK) will
22022 * eventually get discarded somewhere upstream. The message format is
22023 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
22024 * to IP.
22025 */
22026 static void
22027 ip_ifname_notify(ill_t *ill, queue_t *q)
22028 {
22029 mblk_t *mp1, *mp2;
22030 struct iocblk *iocp;
22031 struct lifreq *lifr;
22032
22033 mp1 = mkiocb(SIOCSLIFNAME);
22034 if (mp1 == NULL)
22035 return;
22036 mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
22037 if (mp2 == NULL) {
22038 freeb(mp1);
22039 return;
22040 }
22041
22042 mp1->b_cont = mp2;
22043 iocp = (struct iocblk *)mp1->b_rptr;
22044 iocp->ioc_count = sizeof (struct lifreq);
22045
22046 lifr = (struct lifreq *)mp2->b_rptr;
22047 mp2->b_wptr += sizeof (struct lifreq);
22048 bzero(lifr, sizeof (struct lifreq));
22049
22050 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
22051 lifr->lifr_ppa = ill->ill_ppa;
22052 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
22053
22054 putnext(q, mp1);
22055 }
22056
22057 static boolean_t ip_trash_timer_started = B_FALSE;
22058
22059 static int
22060 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
22061 {
22062 int err;
22063
22064 /* Set the obsolete NDD per-interface forwarding name. */
22065 err = ill_set_ndd_name(ill);
22066 if (err != 0) {
22067 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
22068 err);
22069 }
22070
22071 /* Tell downstream modules where they are. */
22072 ip_ifname_notify(ill, q);
22073
22074 /*
22075 * ill_dl_phys returns EINPROGRESS in the usual case.
22076 * Error cases are ENOMEM ...
22077 */
22078 err = ill_dl_phys(ill, ipif, mp, q);
22079
22080 /*
22081 * If there is no IRE expiration timer running, get one started.
22082 * The igmp and mld timers will be triggered by the first multicast membership.
22083 */
22084 if (!ip_trash_timer_started) {
22085 /*
22086 * acquire the lock and check again.

static boolean_t ip_trash_timer_started = B_FALSE;

static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	int err;

	/* Set the obsolete NDD per-interface forwarding name. */
	err = ill_set_ndd_name(ill);
	if (err != 0) {
		cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
		    err);
	}

	/* Tell downstream modules where they are. */
	ip_ifname_notify(ill, q);

	/*
	 * ill_dl_phys returns EINPROGRESS in the usual case.
	 * Error cases are ENOMEM ...
	 */
	err = ill_dl_phys(ill, ipif, mp, q);

	/*
	 * If there is no IRE expiration timer running, get one started.
	 * The igmp and mld slow timers are likewise started below if
	 * they are not already running.
	 */
	if (!ip_trash_timer_started) {
		/* Acquire the lock and check again. */
		mutex_enter(&ip_trash_timer_lock);
		if (!ip_trash_timer_started) {
			ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL,
			    MSEC_TO_TICK(ip_timer_interval));
			ip_trash_timer_started = B_TRUE;
		}
		mutex_exit(&ip_trash_timer_lock);
	}

	if (ill->ill_isv6) {
		mutex_enter(&mld_slowtimeout_lock);
		if (mld_slowtimeout_id == 0) {
			mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&mld_slowtimeout_lock);
	} else {
		mutex_enter(&igmp_slowtimeout_lock);
		if (igmp_slowtimeout_id == 0) {
			igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&igmp_slowtimeout_lock);
	}

	return (err);
}
22187 */ 22188 ipif = ill->ill_ipif; 22189 22190 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 22191 ipif_assign_seqid(ipif); 22192 22193 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 22194 ill->ill_flags |= ILLF_IPV4; 22195 22196 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 22197 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 22198 22199 if (ill->ill_flags & ILLF_IPV6) { 22200 22201 ill->ill_isv6 = B_TRUE; 22202 if (ill->ill_rq != NULL) { 22203 ill->ill_rq->q_qinfo = &rinit_ipv6; 22204 ill->ill_wq->q_qinfo = &winit_ipv6; 22205 } 22206 22207 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 22208 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 22209 ipif->ipif_v6src_addr = ipv6_all_zeros; 22210 ipif->ipif_v6subnet = ipv6_all_zeros; 22211 ipif->ipif_v6net_mask = ipv6_all_zeros; 22212 ipif->ipif_v6brd_addr = ipv6_all_zeros; 22213 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 22214 /* 22215 * point-to-point or Non-mulicast capable 22216 * interfaces won't do NUD unless explicitly 22217 * configured to do so. 22218 */ 22219 if (ipif->ipif_flags & IPIF_POINTOPOINT || 22220 !(ill->ill_flags & ILLF_MULTICAST)) { 22221 ill->ill_flags |= ILLF_NONUD; 22222 } 22223 /* Make sure IPv4 specific flag is not set on IPv6 if */ 22224 if (ill->ill_flags & ILLF_NOARP) { 22225 /* 22226 * Note: xresolv interfaces will eventually need 22227 * NOARP set here as well, but that will require 22228 * those external resolvers to have some 22229 * knowledge of that flag and act appropriately. 22230 * Not to be changed at present. 22231 */ 22232 ill->ill_flags &= ~ILLF_NOARP; 22233 } 22234 /* 22235 * Set the ILLF_ROUTER flag according to the global 22236 * IPv6 forwarding policy. 22237 */ 22238 if (ipv6_forward != 0) 22239 ill->ill_flags |= ILLF_ROUTER; 22240 } else if (ill->ill_flags & ILLF_IPV4) { 22241 ill->ill_isv6 = B_FALSE; 22242 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 22243 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 22244 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 22245 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 22246 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 22247 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 22248 /* 22249 * Set the ILLF_ROUTER flag according to the global 22250 * IPv4 forwarding policy. 22251 */ 22252 if (ip_g_forward != 0) 22253 ill->ill_flags |= ILLF_ROUTER; 22254 } 22255 22256 ASSERT(ill->ill_phyint != NULL); 22257 22258 /* 22259 * The ipv6Ifindex and ipv6IfIcmpIfIndex assignments will 22260 * be completed in ill_glist_insert -> ill_phyint_reinit 22261 */ 22262 if (ill->ill_isv6) { 22263 /* allocate v6 mib */ 22264 if (!ill_allocate_mibs(ill)) 22265 return (ENOMEM); 22266 } 22267 22268 /* 22269 * Pick a default sap until we get the DL_INFO_ACK back from 22270 * the driver. 22271 */ 22272 if (ill->ill_sap == 0) { 22273 if (ill->ill_isv6) 22274 ill->ill_sap = IP6_DL_SAP; 22275 else 22276 ill->ill_sap = IP_DL_SAP; 22277 } 22278 22279 ill->ill_ifname_pending = 1; 22280 ill->ill_ifname_pending_err = 0; 22281 22282 ill_refhold(ill); 22283 rw_enter(&ill_g_lock, RW_WRITER); 22284 if ((error = ill_glist_insert(ill, interf_name, 22285 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 22286 ill->ill_ppa = UINT_MAX; 22287 ill->ill_name[0] = '\0'; 22288 /* 22289 * undo null termination done above. 
22290 */ 22291 ppa_ptr[0] = old_char; 22292 rw_exit(&ill_g_lock); 22293 ill_refrele(ill); 22294 return (error); 22295 } 22296 22297 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 22298 22299 /* 22300 * When we return the buffer pointed to by interf_name should contain 22301 * the same name as in ill_name. 22302 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 22303 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 22304 * so copy full name and update the ppa ptr. 22305 * When ppa passed in != UINT_MAX all values are correct just undo 22306 * null termination, this saves a bcopy. 22307 */ 22308 if (*new_ppa_ptr == UINT_MAX) { 22309 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 22310 *new_ppa_ptr = ill->ill_ppa; 22311 } else { 22312 /* 22313 * undo null termination done above. 22314 */ 22315 ppa_ptr[0] = old_char; 22316 } 22317 22318 /* Let SCTP know about this ILL */ 22319 sctp_update_ill(ill, SCTP_ILL_INSERT); 22320 22321 /* and also about the first ipif */ 22322 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 22323 22324 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP, 22325 B_TRUE); 22326 22327 rw_exit(&ill_g_lock); 22328 ill_refrele(ill); 22329 if (ipsq == NULL) 22330 return (EINPROGRESS); 22331 22332 /* 22333 * Need to set the ipsq_current_ipif now, if we have changed ipsq 22334 * due to the phyint merge in ill_phyint_reinit. 22335 */ 22336 ASSERT(ipsq->ipsq_current_ipif == NULL || 22337 ipsq->ipsq_current_ipif == ipif); 22338 ipsq->ipsq_current_ipif = ipif; 22339 ipsq->ipsq_last_cmd = SIOCSLIFNAME; 22340 error = ipif_set_values_tail(ill, ipif, mp, q); 22341 ipsq_exit(ipsq, B_TRUE, B_TRUE); 22342 if (error != 0 && error != EINPROGRESS) { 22343 /* 22344 * restore previous values 22345 */ 22346 ill->ill_isv6 = B_FALSE; 22347 } 22348 return (error); 22349 } 22350 22351 22352 extern void (*ip_cleanup_func)(void); 22353 22354 void 22355 ipif_init(void) 22356 { 22357 hrtime_t hrt; 22358 int i; 22359 22360 /* 22361 * Can't call drv_getparm here as it is too early in the boot. 22362 * As we use ipif_src_random just for picking a different 22363 * source address everytime, this need not be really random. 22364 */ 22365 hrt = gethrtime(); 22366 ipif_src_random = ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); 22367 22368 for (i = 0; i < MAX_G_HEADS; i++) { 22369 ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ill_g_heads[i]; 22370 ill_g_heads[i].ill_g_list_tail = (ill_if_t *)&ill_g_heads[i]; 22371 } 22372 22373 avl_create(&phyint_g_list.phyint_list_avl_by_index, 22374 ill_phyint_compare_index, 22375 sizeof (phyint_t), 22376 offsetof(struct phyint, phyint_avl_by_index)); 22377 avl_create(&phyint_g_list.phyint_list_avl_by_name, 22378 ill_phyint_compare_name, 22379 sizeof (phyint_t), 22380 offsetof(struct phyint, phyint_avl_by_name)); 22381 22382 ip_cleanup_func = ip_thread_exit; 22383 } 22384 22385 /* 22386 * This is called by ip_rt_add when src_addr value is other than zero. 22387 * src_addr signifies the source address of the incoming packet. For 22388 * reverse tunnel route we need to create a source addr based routing 22389 * table. This routine creates ip_mrtun_table if it's empty and then 22390 * it adds the route entry hashed by source address. It verifies that 22391 * the outgoing interface is always a non-resolver interface (tunnel). 
22392 */ 22393 int 22394 ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg, 22395 ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func) 22396 { 22397 ire_t *ire; 22398 ire_t *save_ire; 22399 ipif_t *ipif; 22400 ill_t *in_ill = NULL; 22401 ill_t *out_ill; 22402 queue_t *stq; 22403 mblk_t *dlureq_mp; 22404 int error; 22405 22406 if (ire_arg != NULL) 22407 *ire_arg = NULL; 22408 ASSERT(in_src_addr != INADDR_ANY); 22409 22410 ipif = ipif_arg; 22411 if (ipif != NULL) { 22412 out_ill = ipif->ipif_ill; 22413 } else { 22414 ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n")); 22415 return (EINVAL); 22416 } 22417 22418 if (src_ipif == NULL) { 22419 ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n")); 22420 return (EINVAL); 22421 } 22422 in_ill = src_ipif->ipif_ill; 22423 22424 /* 22425 * Check for duplicates. We don't need to 22426 * match out_ill, because the uniqueness of 22427 * a route is only dependent on src_addr and 22428 * in_ill. 22429 */ 22430 ire = ire_mrtun_lookup(in_src_addr, in_ill); 22431 if (ire != NULL) { 22432 ire_refrele(ire); 22433 return (EEXIST); 22434 } 22435 if (ipif->ipif_net_type != IRE_IF_NORESOLVER) { 22436 ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n", 22437 ipif->ipif_net_type)); 22438 return (EINVAL); 22439 } 22440 22441 stq = ipif->ipif_wq; 22442 ASSERT(stq != NULL); 22443 22444 /* 22445 * The outgoing interface must be non-resolver 22446 * interface. 22447 */ 22448 dlureq_mp = ill_dlur_gen(NULL, 22449 out_ill->ill_phys_addr_length, out_ill->ill_sap, 22450 out_ill->ill_sap_length); 22451 22452 if (dlureq_mp == NULL) { 22453 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 22454 return (ENOMEM); 22455 } 22456 22457 /* Create the IRE. */ 22458 22459 ire = ire_create( 22460 NULL, /* Zero dst addr */ 22461 NULL, /* Zero mask */ 22462 NULL, /* Zero gateway addr */ 22463 NULL, /* Zero ipif_src addr */ 22464 (uint8_t *)&in_src_addr, /* in_src-addr */ 22465 &ipif->ipif_mtu, 22466 NULL, 22467 NULL, /* rfq */ 22468 stq, 22469 IRE_MIPRTUN, 22470 dlureq_mp, 22471 ipif, 22472 in_ill, 22473 0, 22474 0, 22475 0, 22476 flags, 22477 &ire_uinfo_null, 22478 NULL, 22479 NULL); 22480 22481 if (ire == NULL) { 22482 freeb(dlureq_mp); 22483 return (ENOMEM); 22484 } 22485 ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n", 22486 ire->ire_type)); 22487 save_ire = ire; 22488 ASSERT(save_ire != NULL); 22489 error = ire_add_mrtun(&ire, q, mp, func); 22490 /* 22491 * If ire_add_mrtun() failed, the ire passed in was freed 22492 * so there is no need to do so here. 22493 */ 22494 if (error != 0) { 22495 return (error); 22496 } 22497 22498 /* Duplicate check */ 22499 if (ire != save_ire) { 22500 /* route already exists by now */ 22501 ire_refrele(ire); 22502 return (EEXIST); 22503 } 22504 22505 if (ire_arg != NULL) { 22506 /* 22507 * Store the ire that was just added. the caller 22508 * ip_rts_request responsible for doing ire_refrele() 22509 * on it. 22510 */ 22511 *ire_arg = ire; 22512 } else { 22513 ire_refrele(ire); /* held in ire_add_mrtun */ 22514 } 22515 22516 return (0); 22517 } 22518 22519 /* 22520 * It is called by ip_rt_delete() only when mipagent requests to delete 22521 * a reverse tunnel route that was added by ip_mrtun_rt_add() before. 
22522 */ 22523 22524 int 22525 ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif) 22526 { 22527 ire_t *ire = NULL; 22528 22529 if (in_src_addr == INADDR_ANY) 22530 return (EINVAL); 22531 if (src_ipif == NULL) 22532 return (EINVAL); 22533 22534 /* search if this route exists in the ip_mrtun_table */ 22535 ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill); 22536 if (ire == NULL) { 22537 ip2dbg(("ip_mrtun_rt_delete: ire not found\n")); 22538 return (ESRCH); 22539 } 22540 ire_delete(ire); 22541 ire_refrele(ire); 22542 return (0); 22543 } 22544 22545 /* 22546 * Lookup the ipif corresponding to the onlink destination address. For 22547 * point-to-point interfaces, it matches with remote endpoint destination 22548 * address. For point-to-multipoint interfaces it only tries to match the 22549 * destination with the interface's subnet address. The longest, most specific 22550 * match is found to take care of such rare network configurations like - 22551 * le0: 129.146.1.1/16 22552 * le1: 129.146.2.2/24 22553 * It is used only by SO_DONTROUTE at the moment. 22554 */ 22555 ipif_t * 22556 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid) 22557 { 22558 ipif_t *ipif, *best_ipif; 22559 ill_t *ill; 22560 ill_walk_context_t ctx; 22561 22562 ASSERT(zoneid != ALL_ZONES); 22563 best_ipif = NULL; 22564 22565 rw_enter(&ill_g_lock, RW_READER); 22566 ill = ILL_START_WALK_V4(&ctx); 22567 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 22568 mutex_enter(&ill->ill_lock); 22569 for (ipif = ill->ill_ipif; ipif != NULL; 22570 ipif = ipif->ipif_next) { 22571 if (!IPIF_CAN_LOOKUP(ipif)) 22572 continue; 22573 if (ipif->ipif_zoneid != zoneid && 22574 ipif->ipif_zoneid != ALL_ZONES) 22575 continue; 22576 /* 22577 * Point-to-point case. Look for exact match with 22578 * destination address. 22579 */ 22580 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 22581 if (ipif->ipif_pp_dst_addr == addr) { 22582 ipif_refhold_locked(ipif); 22583 mutex_exit(&ill->ill_lock); 22584 rw_exit(&ill_g_lock); 22585 if (best_ipif != NULL) 22586 ipif_refrele(best_ipif); 22587 return (ipif); 22588 } 22589 } else if (ipif->ipif_subnet == (addr & 22590 ipif->ipif_net_mask)) { 22591 /* 22592 * Point-to-multipoint case. Looping through to 22593 * find the most specific match. If there are 22594 * multiple best match ipif's then prefer ipif's 22595 * that are UP. If there is only one best match 22596 * ipif and it is DOWN we must still return it. 22597 */ 22598 if ((best_ipif == NULL) || 22599 (ipif->ipif_net_mask > 22600 best_ipif->ipif_net_mask) || 22601 ((ipif->ipif_net_mask == 22602 best_ipif->ipif_net_mask) && 22603 ((ipif->ipif_flags & IPIF_UP) && 22604 (!(best_ipif->ipif_flags & IPIF_UP))))) { 22605 ipif_refhold_locked(ipif); 22606 mutex_exit(&ill->ill_lock); 22607 rw_exit(&ill_g_lock); 22608 if (best_ipif != NULL) 22609 ipif_refrele(best_ipif); 22610 best_ipif = ipif; 22611 rw_enter(&ill_g_lock, RW_READER); 22612 mutex_enter(&ill->ill_lock); 22613 } 22614 } 22615 } 22616 mutex_exit(&ill->ill_lock); 22617 } 22618 rw_exit(&ill_g_lock); 22619 return (best_ipif); 22620 } 22621 22622 22623 /* 22624 * Save enough information so that we can recreate the IRE if 22625 * the interface goes down and then up. 
22626 */ 22627 static void 22628 ipif_save_ire(ipif_t *ipif, ire_t *ire) 22629 { 22630 mblk_t *save_mp; 22631 22632 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 22633 if (save_mp != NULL) { 22634 ifrt_t *ifrt; 22635 22636 save_mp->b_wptr += sizeof (ifrt_t); 22637 ifrt = (ifrt_t *)save_mp->b_rptr; 22638 bzero(ifrt, sizeof (ifrt_t)); 22639 ifrt->ifrt_type = ire->ire_type; 22640 ifrt->ifrt_addr = ire->ire_addr; 22641 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 22642 ifrt->ifrt_src_addr = ire->ire_src_addr; 22643 ifrt->ifrt_mask = ire->ire_mask; 22644 ifrt->ifrt_flags = ire->ire_flags; 22645 ifrt->ifrt_max_frag = ire->ire_max_frag; 22646 mutex_enter(&ipif->ipif_saved_ire_lock); 22647 save_mp->b_cont = ipif->ipif_saved_ire_mp; 22648 ipif->ipif_saved_ire_mp = save_mp; 22649 ipif->ipif_saved_ire_cnt++; 22650 mutex_exit(&ipif->ipif_saved_ire_lock); 22651 } 22652 } 22653 22654 22655 static void 22656 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 22657 { 22658 mblk_t **mpp; 22659 mblk_t *mp; 22660 ifrt_t *ifrt; 22661 22662 /* Remove from ipif_saved_ire_mp list if it is there */ 22663 mutex_enter(&ipif->ipif_saved_ire_lock); 22664 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 22665 mpp = &(*mpp)->b_cont) { 22666 /* 22667 * On a given ipif, the triple of address, gateway and 22668 * mask is unique for each saved IRE (in the case of 22669 * ordinary interface routes, the gateway address is 22670 * all-zeroes). 22671 */ 22672 mp = *mpp; 22673 ifrt = (ifrt_t *)mp->b_rptr; 22674 if (ifrt->ifrt_addr == ire->ire_addr && 22675 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 22676 ifrt->ifrt_mask == ire->ire_mask) { 22677 *mpp = mp->b_cont; 22678 ipif->ipif_saved_ire_cnt--; 22679 freeb(mp); 22680 break; 22681 } 22682 } 22683 mutex_exit(&ipif->ipif_saved_ire_lock); 22684 } 22685 22686 22687 /* 22688 * IP multirouting broadcast routes handling 22689 * Append CGTP broadcast IREs to regular ones created 22690 * at ifconfig time. 22691 */ 22692 static void 22693 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst) 22694 { 22695 ire_t *ire_prim; 22696 22697 ASSERT(ire != NULL); 22698 ASSERT(ire_dst != NULL); 22699 22700 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 22701 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 22702 if (ire_prim != NULL) { 22703 /* 22704 * We are in the special case of broadcasts for 22705 * CGTP. We add an IRE_BROADCAST that holds 22706 * the RTF_MULTIRT flag, the destination 22707 * address of ire_dst and the low level 22708 * info of ire_prim. In other words, CGTP 22709 * broadcast is added to the redundant ipif. 
22710 */ 22711 ipif_t *ipif_prim; 22712 ire_t *bcast_ire; 22713 22714 ipif_prim = ire_prim->ire_ipif; 22715 22716 ip2dbg(("ip_cgtp_filter_bcast_add: " 22717 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 22718 (void *)ire_dst, (void *)ire_prim, 22719 (void *)ipif_prim)); 22720 22721 bcast_ire = ire_create( 22722 (uchar_t *)&ire->ire_addr, 22723 (uchar_t *)&ip_g_all_ones, 22724 (uchar_t *)&ire_dst->ire_src_addr, 22725 (uchar_t *)&ire->ire_gateway_addr, 22726 NULL, 22727 &ipif_prim->ipif_mtu, 22728 NULL, 22729 ipif_prim->ipif_rq, 22730 ipif_prim->ipif_wq, 22731 IRE_BROADCAST, 22732 ipif_prim->ipif_bcast_mp, 22733 ipif_prim, 22734 NULL, 22735 0, 22736 0, 22737 0, 22738 ire->ire_flags, 22739 &ire_uinfo_null, 22740 NULL, 22741 NULL); 22742 22743 if (bcast_ire != NULL) { 22744 22745 if (ire_add(&bcast_ire, NULL, NULL, NULL, 22746 B_FALSE) == 0) { 22747 ip2dbg(("ip_cgtp_filter_bcast_add: " 22748 "added bcast_ire %p\n", 22749 (void *)bcast_ire)); 22750 22751 ipif_save_ire(bcast_ire->ire_ipif, 22752 bcast_ire); 22753 ire_refrele(bcast_ire); 22754 } 22755 } 22756 ire_refrele(ire_prim); 22757 } 22758 } 22759 22760 22761 /* 22762 * IP multirouting broadcast routes handling 22763 * Remove the broadcast ire 22764 */ 22765 static void 22766 ip_cgtp_bcast_delete(ire_t *ire) 22767 { 22768 ire_t *ire_dst; 22769 22770 ASSERT(ire != NULL); 22771 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 22772 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 22773 if (ire_dst != NULL) { 22774 ire_t *ire_prim; 22775 22776 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 22777 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 22778 if (ire_prim != NULL) { 22779 ipif_t *ipif_prim; 22780 ire_t *bcast_ire; 22781 22782 ipif_prim = ire_prim->ire_ipif; 22783 22784 ip2dbg(("ip_cgtp_filter_bcast_delete: " 22785 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 22786 (void *)ire_dst, (void *)ire_prim, 22787 (void *)ipif_prim)); 22788 22789 bcast_ire = ire_ctable_lookup(ire->ire_addr, 22790 ire->ire_gateway_addr, 22791 IRE_BROADCAST, 22792 ipif_prim, ALL_ZONES, 22793 NULL, 22794 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 22795 MATCH_IRE_MASK); 22796 22797 if (bcast_ire != NULL) { 22798 ip2dbg(("ip_cgtp_filter_bcast_delete: " 22799 "looked up bcast_ire %p\n", 22800 (void *)bcast_ire)); 22801 ipif_remove_ire(bcast_ire->ire_ipif, 22802 bcast_ire); 22803 ire_delete(bcast_ire); 22804 } 22805 ire_refrele(ire_prim); 22806 } 22807 ire_refrele(ire_dst); 22808 } 22809 } 22810 22811 /* 22812 * IPsec hardware acceleration capabilities related functions. 22813 */ 22814 22815 /* 22816 * Free a per-ill IPsec capabilities structure. 22817 */ 22818 static void 22819 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 22820 { 22821 if (capab->auth_hw_algs != NULL) 22822 kmem_free(capab->auth_hw_algs, capab->algs_size); 22823 if (capab->encr_hw_algs != NULL) 22824 kmem_free(capab->encr_hw_algs, capab->algs_size); 22825 if (capab->encr_algparm != NULL) 22826 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 22827 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 22828 } 22829 22830 /* 22831 * Allocate a new per-ill IPsec capabilities structure. This structure 22832 * is specific to an IPsec protocol (AH or ESP). It is implemented as 22833 * an array which specifies, for each algorithm, whether this algorithm 22834 * is supported by the ill or not. 
22835 */ 22836 static ill_ipsec_capab_t * 22837 ill_ipsec_capab_alloc(void) 22838 { 22839 ill_ipsec_capab_t *capab; 22840 uint_t nelems; 22841 22842 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 22843 if (capab == NULL) 22844 return (NULL); 22845 22846 /* we need one bit per algorithm */ 22847 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 22848 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 22849 22850 /* allocate memory to store algorithm flags */ 22851 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 22852 if (capab->encr_hw_algs == NULL) 22853 goto nomem; 22854 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 22855 if (capab->auth_hw_algs == NULL) 22856 goto nomem; 22857 /* 22858 * Leave encr_algparm NULL for now since we won't need it half 22859 * the time 22860 */ 22861 return (capab); 22862 22863 nomem: 22864 ill_ipsec_capab_free(capab); 22865 return (NULL); 22866 } 22867 22868 /* 22869 * Resize capability array. Since we're exclusive, this is OK. 22870 */ 22871 static boolean_t 22872 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 22873 { 22874 ipsec_capab_algparm_t *nalp, *oalp; 22875 uint32_t olen, nlen; 22876 22877 oalp = capab->encr_algparm; 22878 olen = capab->encr_algparm_size; 22879 22880 if (oalp != NULL) { 22881 if (algid < capab->encr_algparm_end) 22882 return (B_TRUE); 22883 } 22884 22885 nlen = (algid + 1) * sizeof (*nalp); 22886 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 22887 if (nalp == NULL) 22888 return (B_FALSE); 22889 22890 if (oalp != NULL) { 22891 bcopy(oalp, nalp, olen); 22892 kmem_free(oalp, olen); 22893 } 22894 capab->encr_algparm = nalp; 22895 capab->encr_algparm_size = nlen; 22896 capab->encr_algparm_end = algid + 1; 22897 22898 return (B_TRUE); 22899 } 22900 22901 /* 22902 * Compare the capabilities of the specified ill with the protocol 22903 * and algorithms specified by the SA passed as argument. 22904 * If they match, returns B_TRUE, B_FALSE if they do not match. 22905 * 22906 * The ill can be passed as a pointer to it, or by specifying its index 22907 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 22908 * 22909 * Called by ipsec_out_is_accelerated() do decide whether an outbound 22910 * packet is eligible for hardware acceleration, and by 22911 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down 22912 * to a particular ill. 22913 */ 22914 boolean_t 22915 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 22916 ipsa_t *sa) 22917 { 22918 boolean_t sa_isv6; 22919 uint_t algid; 22920 struct ill_ipsec_capab_s *cpp; 22921 boolean_t need_refrele = B_FALSE; 22922 22923 if (ill == NULL) { 22924 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 22925 NULL, NULL, NULL); 22926 if (ill == NULL) { 22927 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 22928 return (B_FALSE); 22929 } 22930 need_refrele = B_TRUE; 22931 } 22932 22933 /* 22934 * Use the address length specified by the SA to determine 22935 * if it corresponds to a IPv6 address, and fail the matching 22936 * if the isv6 flag passed as argument does not match. 22937 * Note: this check is used for SADB capability checking before 22938 * sending SA information to an ill. 
22939 */ 22940 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 22941 if (sa_isv6 != ill_isv6) 22942 /* protocol mismatch */ 22943 goto done; 22944 22945 /* 22946 * Check if the ill supports the protocol, algorithm(s) and 22947 * key size(s) specified by the SA, and get the pointers to 22948 * the algorithms supported by the ill. 22949 */ 22950 switch (sa->ipsa_type) { 22951 22952 case SADB_SATYPE_ESP: 22953 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 22954 /* ill does not support ESP acceleration */ 22955 goto done; 22956 cpp = ill->ill_ipsec_capab_esp; 22957 algid = sa->ipsa_auth_alg; 22958 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 22959 goto done; 22960 algid = sa->ipsa_encr_alg; 22961 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 22962 goto done; 22963 if (algid < cpp->encr_algparm_end) { 22964 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 22965 if (sa->ipsa_encrkeybits < alp->minkeylen) 22966 goto done; 22967 if (sa->ipsa_encrkeybits > alp->maxkeylen) 22968 goto done; 22969 } 22970 break; 22971 22972 case SADB_SATYPE_AH: 22973 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 22974 /* ill does not support AH acceleration */ 22975 goto done; 22976 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 22977 ill->ill_ipsec_capab_ah->auth_hw_algs)) 22978 goto done; 22979 break; 22980 } 22981 22982 if (need_refrele) 22983 ill_refrele(ill); 22984 return (B_TRUE); 22985 done: 22986 if (need_refrele) 22987 ill_refrele(ill); 22988 return (B_FALSE); 22989 } 22990 22991 22992 /* 22993 * Add a new ill to the list of IPsec capable ills. 22994 * Called from ill_capability_ipsec_ack() when an ACK was received 22995 * indicating that IPsec hardware processing was enabled for an ill. 22996 * 22997 * ill must point to the ill for which acceleration was enabled. 22998 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 22999 */ 23000 static void 23001 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 23002 { 23003 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 23004 uint_t sa_type; 23005 uint_t ipproto; 23006 23007 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 23008 (dl_cap == DL_CAPAB_IPSEC_ESP)); 23009 23010 switch (dl_cap) { 23011 case DL_CAPAB_IPSEC_AH: 23012 sa_type = SADB_SATYPE_AH; 23013 ills = &ipsec_capab_ills_ah; 23014 ipproto = IPPROTO_AH; 23015 break; 23016 case DL_CAPAB_IPSEC_ESP: 23017 sa_type = SADB_SATYPE_ESP; 23018 ills = &ipsec_capab_ills_esp; 23019 ipproto = IPPROTO_ESP; 23020 break; 23021 } 23022 23023 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 23024 23025 /* 23026 * Add ill index to list of hardware accelerators. If 23027 * already in list, do nothing. 
23028 */ 23029 for (cur_ill = *ills; cur_ill != NULL && 23030 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 23031 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 23032 ; 23033 23034 if (cur_ill == NULL) { 23035 /* if this is a new entry for this ill */ 23036 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 23037 if (new_ill == NULL) { 23038 rw_exit(&ipsec_capab_ills_lock); 23039 return; 23040 } 23041 23042 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 23043 new_ill->ill_isv6 = ill->ill_isv6; 23044 new_ill->next = *ills; 23045 *ills = new_ill; 23046 } else if (!sadb_resync) { 23047 /* not resync'ing SADB and an entry exists for this ill */ 23048 rw_exit(&ipsec_capab_ills_lock); 23049 return; 23050 } 23051 23052 rw_exit(&ipsec_capab_ills_lock); 23053 23054 if (ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 23055 /* 23056 * IPsec module for protocol loaded, initiate dump 23057 * of the SADB to this ill. 23058 */ 23059 sadb_ill_download(ill, sa_type); 23060 } 23061 23062 /* 23063 * Remove an ill from the list of IPsec capable ills. 23064 */ 23065 static void 23066 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 23067 { 23068 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 23069 23070 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 23071 dl_cap == DL_CAPAB_IPSEC_ESP); 23072 23073 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipsec_capab_ills_ah : 23074 &ipsec_capab_ills_esp; 23075 23076 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 23077 23078 prev_ill = NULL; 23079 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 23080 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 23081 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 23082 ; 23083 if (cur_ill == NULL) { 23084 /* entry not found */ 23085 rw_exit(&ipsec_capab_ills_lock); 23086 return; 23087 } 23088 if (prev_ill == NULL) { 23089 /* entry at front of list */ 23090 *ills = NULL; 23091 } else { 23092 prev_ill->next = cur_ill->next; 23093 } 23094 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 23095 rw_exit(&ipsec_capab_ills_lock); 23096 } 23097 23098 23099 /* 23100 * Handling of DL_CONTROL_REQ messages that must be sent down to 23101 * an ill while having exclusive access. 23102 */ 23103 /* ARGSUSED */ 23104 static void 23105 ill_ipsec_capab_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 23106 { 23107 ill_t *ill = (ill_t *)q->q_ptr; 23108 23109 ill_dlpi_send(ill, mp); 23110 } 23111 23112 23113 /* 23114 * Called by SADB to send a DL_CONTROL_REQ message to every ill 23115 * supporting the specified IPsec protocol acceleration. 23116 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 23117 * We free the mblk and, if sa is non-null, release the held referece. 23118 */ 23119 void 23120 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa) 23121 { 23122 ipsec_capab_ill_t *ici, *cur_ici; 23123 ill_t *ill; 23124 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 23125 23126 ici = (sa_type == SADB_SATYPE_AH) ? ipsec_capab_ills_ah : 23127 ipsec_capab_ills_esp; 23128 23129 rw_enter(&ipsec_capab_ills_lock, RW_READER); 23130 23131 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 23132 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 23133 cur_ici->ill_isv6, NULL, NULL, NULL, NULL); 23134 23135 /* 23136 * Handle the case where the ill goes away while the SADB is 23137 * attempting to send messages. If it's going away, it's 23138 * nuking its shadow SADB, so we don't care.. 
23139 */ 23140 23141 if (ill == NULL) 23142 continue; 23143 23144 if (sa != NULL) { 23145 /* 23146 * Make sure capabilities match before 23147 * sending SA to ill. 23148 */ 23149 if (!ipsec_capab_match(ill, cur_ici->ill_index, 23150 cur_ici->ill_isv6, sa)) { 23151 ill_refrele(ill); 23152 continue; 23153 } 23154 23155 mutex_enter(&sa->ipsa_lock); 23156 sa->ipsa_flags |= IPSA_F_HW; 23157 mutex_exit(&sa->ipsa_lock); 23158 } 23159 23160 /* 23161 * Copy template message, and add it to the front 23162 * of the mblk ship list. We want to avoid holding 23163 * the ipsec_capab_ills_lock while sending the 23164 * message to the ills. 23165 * 23166 * The b_next and b_prev are temporarily used 23167 * to build a list of mblks to be sent down, and to 23168 * save the ill to which they must be sent. 23169 */ 23170 nmp = copymsg(mp); 23171 if (nmp == NULL) { 23172 ill_refrele(ill); 23173 continue; 23174 } 23175 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 23176 nmp->b_next = mp_ship_list; 23177 mp_ship_list = nmp; 23178 nmp->b_prev = (mblk_t *)ill; 23179 } 23180 23181 rw_exit(&ipsec_capab_ills_lock); 23182 23183 nmp = mp_ship_list; 23184 while (nmp != NULL) { 23185 /* restore the mblk to a sane state */ 23186 next_mp = nmp->b_next; 23187 nmp->b_next = NULL; 23188 ill = (ill_t *)nmp->b_prev; 23189 nmp->b_prev = NULL; 23190 23191 /* 23192 * Ship the mblk to the ill, must be exclusive. Keep the 23193 * reference to the ill as qwriter_ip() does a ill_referele(). 23194 */ 23195 (void) qwriter_ip(NULL, ill, ill->ill_wq, nmp, 23196 ill_ipsec_capab_send_writer, NEW_OP, B_TRUE); 23197 23198 nmp = next_mp; 23199 } 23200 23201 if (sa != NULL) 23202 IPSA_REFRELE(sa); 23203 freemsg(mp); 23204 } 23205 23206 23207 /* 23208 * Derive an interface id from the link layer address. 23209 * Knows about IEEE 802 and IEEE EUI-64 mappings. 23210 */ 23211 static boolean_t 23212 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23213 { 23214 char *addr; 23215 23216 if (phys_length != ETHERADDRL) 23217 return (B_FALSE); 23218 23219 /* Form EUI-64 like address */ 23220 addr = (char *)&v6addr->s6_addr32[2]; 23221 bcopy((char *)phys_addr, addr, 3); 23222 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 23223 addr[3] = (char)0xff; 23224 addr[4] = (char)0xfe; 23225 bcopy((char *)phys_addr + 3, addr + 5, 3); 23226 return (B_TRUE); 23227 } 23228 23229 /* ARGSUSED */ 23230 static boolean_t 23231 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23232 { 23233 return (B_FALSE); 23234 } 23235 23236 /* ARGSUSED */ 23237 static boolean_t 23238 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 23239 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 23240 { 23241 /* 23242 * Multicast address mappings used over Ethernet/802.X. 23243 * This address is used as a base for mappings. 23244 */ 23245 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 23246 0x00, 0x00, 0x00}; 23247 23248 /* 23249 * Extract low order 32 bits from IPv6 multicast address. 23250 * Or that into the link layer address, starting from the 23251 * second byte. 23252 */ 23253 *hw_start = 2; 23254 v6_extract_mask->s6_addr32[0] = 0; 23255 v6_extract_mask->s6_addr32[1] = 0; 23256 v6_extract_mask->s6_addr32[2] = 0; 23257 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 23258 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 23259 return (B_TRUE); 23260 } 23261 23262 /* 23263 * Indicate by return value whether multicast is supported. 

/* ARGSUSED */
static boolean_t
ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	return (B_FALSE);
}

/* ARGSUSED */
static boolean_t
ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
	    0x00, 0x00, 0x00};

	/*
	 * Extract low order 32 bits from IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * second byte.
	 */
	*hw_start = 2;
	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = 0;
	v6_extract_mask->s6_addr32[2] = 0;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
	return (B_TRUE);
}

/*
 * Indicate by return value whether multicast is supported. If not,
 * this code should not touch/change any parameters.
 */
/* ARGSUSED */
static boolean_t
ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
	    0x00, 0x00, 0x00 };

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	*extract_mask = htonl(0x007fffff);
	*hw_start = 2;
	bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
	return (B_TRUE);
}
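
/*
 * For example: the IPv4 group 224.1.2.3 maps to 01:00:5e:01:02:03 (the
 * low 23 bits of the group address OR'd into the base starting at byte
 * 2), and the IPv6 group ff02::1 maps to 33:33:00:00:00:01 (the low 32
 * bits OR'd into the 33:33 base).
 */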

/*
 * Derive IPoIB interface id from the link layer address.
 */
static boolean_t
ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char	*addr;

	if (phys_length != 20)
		return (B_FALSE);
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy(phys_addr + 12, addr, 8);
	/*
	 * In the IBA 1.1 timeframe, some vendors erroneously set the u/l bit
	 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
	 * rules. In these cases, the IBA considers these GUIDs to be in
	 * "Modified EUI-64" format, and thus toggling the u/l bit is not
	 * required; vendors are required not to assign global EUI-64's
	 * that differ only in u/l bit values, thus guaranteeing uniqueness
	 * of the interface identifier. Whether the GUID is in modified
	 * or proper EUI-64 format, the ipv6 identifier must have the u/l
	 * bit set to 1.
	 */
	addr[0] |= 2;		/* Set Universal/Local bit to 1 */
	return (B_TRUE);
}

/*
 * Note on mapping from multicast IP addresses to IPoIB multicast link
 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
 * The format of an IPoIB multicast address is:
 *
 *	 4 byte QPN        Scope Sign.  Pkey
 * +--------------------------------------------+
 * | 00FFFFFF | FF | 1X | X01B |  Pkey | GroupID |
 * +--------------------------------------------+
 *
 * The Scope and Pkey components are properties of the IBA port and
 * network interface. They can be ascertained from the broadcast address.
 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
 */
static boolean_t
ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Base IPoIB IPv6 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	/*
	 * Extract low order 80 bits from IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixth byte.
	 */
	*hw_start = 6;
	bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);

	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
	v6_extract_mask->s6_addr32[2] = 0xffffffffU;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	return (B_TRUE);
}

static boolean_t
ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Base IPoIB IPv4 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
		return (B_FALSE);

	/*
	 * Extract low order 28 bits from IPv4 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixteenth byte.
	 */
	*extract_mask = htonl(0x0fffffff);
	*hw_start = 16;
	bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);
	return (B_TRUE);
}
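
/*
 * For example: under this scheme the IPv4 group 224.1.2.3 (0xe0010203)
 * contributes its low 28 bits, 0x00010203, to bytes 16-19 of the
 * 20-byte IPoIB address, while bytes 5, 8 and 9 (scope and Pkey) are
 * cloned from the interface's broadcast address.
 */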
23458 */ 23459 if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { 23460 return (B_TRUE); 23461 } 23462 rw_enter(&ill_g_lock, RW_READER); 23463 if (ill->ill_group == NULL) { 23464 /* ill not in a group */ 23465 rw_exit(&ill_g_lock); 23466 return (B_FALSE); 23467 } 23468 23469 /* 23470 * There's no ipif in the zone on ill, however ill is part of an IPMP 23471 * group. We need to look for an ipif in the zone on all the ills in the 23472 * group. 23473 */ 23474 illg = ill->ill_group->illgrp_ill; 23475 do { 23476 /* 23477 * We don't call ipif_lookup_zoneid() on ill as we already know 23478 * that it's not there. 23479 */ 23480 if (illg != ill && 23481 ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { 23482 break; 23483 } 23484 } while ((illg = illg->ill_group_next) != NULL); 23485 rw_exit(&ill_g_lock); 23486 return (illg != NULL); 23487 } 23488 23489 /* 23490 * Check if this ill is only being used to send ICMP probes for IPMP 23491 */ 23492 boolean_t 23493 ill_is_probeonly(ill_t *ill) 23494 { 23495 /* 23496 * Check if the interface is FAILED, or INACTIVE 23497 */ 23498 if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) 23499 return (B_TRUE); 23500 23501 return (B_FALSE); 23502 } 23503