/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>

#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/strsun.h>
#include <sys/policy.h>
#include <sys/ethernet.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;
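/*
 * Note: the union above sizes one allocation to the largest of the three
 * ARP command headers (area_t, ared_t, areq_t), so a single buffer can be
 * reinterpreted as whichever command a socket ioctl ultimately issues.
 * The ARP templates further down deliberately leave room for this (see
 * the comment above ip_area_template).
 */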
static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp,
    sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);
static void	ipsq_clean_all(ill_t *ill);
static void	ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring);
static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
		    boolean_t initialize);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_multicast_down(ipif_t *ipif);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error);
static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg);
static void	ill_down_tail(ill_t *ill);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static boolean_t ill_has_usable_ipif(ill_t *);
static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
static void	ill_phyint_free(ill_t *ill);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
static void	ill_stq_cache_delete(ire_t *, char *);

static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);
static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);

static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *);
static void	ip_cgtp_bcast_delete(ire_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_proto(ill_t *, int, mblk_t *);
static void	ill_capability_dispatch(ill_t *, mblk_t *,
    dl_capability_sub_t *, boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_reset(ill_t *, mblk_t **);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_ipsec_reset(ill_t *, mblk_t **);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_hcksum_reset(ill_t *, mblk_t **);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset(ill_t *, mblk_t **);

static void	ill_capability_dls_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
static void	ill_capability_dls_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_disable(ill_t *);

static void	illgrp_cache_delete(ire_t *, char *);
static void	illgrp_delete(ill_t *ill);
static void	illgrp_reset_schednext(ill_t *ill);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);

static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * Max # of IPsec algorithms supported.  Limited to 1 byte by PF_KEY
 * and the IPsec DOI.
 */
#define	MAX_IPSEC_ALGS	256

#define	BITSPERBYTE	8
#define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))

#define	IPSEC_ALG_ENABLE(algs, algid) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

#define	IPSEC_ALG_IS_ENABLED(algid, algs) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

typedef uint8_t ipsec_capab_elem_t;
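/*
 * Worked example (illustrative): with ipsec_capab_elem_t being uint8_t,
 * BITS(ipsec_capab_elem_t) is 8, so IPSEC_ALG_ENABLE(algs, 10) sets bit
 * 10 % 8 == 2 in element 10 / 8 == 1 of the array; an array of
 * MAX_IPSEC_ALGS / 8 == 32 elements covers every possible algid.
 */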
/*
 * Per-algorithm parameters.  Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t	minkeylen;
	uint16_t	maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;
	ipsec_capab_elem_t *auth_hw_algs;
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};

/*
 * List of AH and ESP IPsec acceleration capable ills
 */
typedef struct ipsec_capab_ill_s {
	uint_t ill_index;
	boolean_t ill_isv6;
	struct ipsec_capab_ill_s *next;
} ipsec_capab_ill_t;

static ipsec_capab_ill_t *ipsec_capab_ills_ah;
static ipsec_capab_ill_t *ipsec_capab_ills_esp;
krwlock_t ipsec_capab_ills_lock;

/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 */
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};
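/*
 * Illustrative note: each template below is consumed by ill_arp_alloc()
 * (defined later in this file), which copies the template, appends the ill
 * name at *_name_offset, overwrites *_name_length with the real name
 * length, and then fills in the protocol address (plus, for AR_ENTRY_ADD,
 * an all-ones mask) after the fixed header.
 */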
static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IP_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IP_ADDR_LEN
};

static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IPV6_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IPV6_ADDR_LEN
};

/*
 * An IPv6 AR_ENTRY_QUERY template has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an
 * areq is used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,		/* protocol, from ARP's perspective */
	sizeof (areq_t),		/* target addr offset */
	IP_ADDR_LEN,			/* target addr_length */
	0,				/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,			/* sender addr length */
	6,				/* xmit_count */
	1000,				/* (re)xmit_interval in milliseconds */
	4				/* max # of requests to buffer */
	/* anything else filled in by the code */
};

static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* Flag descriptors for ip_ipif_report */
static nv_t	ipif_nv_tbl[] = {
	{ IPIF_UP,		"UP" },
	{ IPIF_BROADCAST,	"BROADCAST" },
	{ ILLF_DEBUG,		"DEBUG" },
	{ PHYI_LOOPBACK,	"LOOPBACK" },
	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
	{ PHYI_RUNNING,		"RUNNING" },
	{ ILLF_NOARP,		"NOARP" },
	{ PHYI_PROMISC,		"PROMISC" },
	{ PHYI_ALLMULTI,	"ALLMULTI" },
	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
	{ ILLF_MULTICAST,	"MULTICAST" },
	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
	{ IPIF_DHCPRUNNING,	"DHCP" },
	{ IPIF_PRIVATE,		"PRIVATE" },
	{ IPIF_NOXMIT,		"NOXMIT" },
	{ IPIF_NOLOCAL,		"NOLOCAL" },
	{ IPIF_DEPRECATED,	"DEPRECATED" },
	{ IPIF_PREFERRED,	"PREFERRED" },
	{ IPIF_TEMPORARY,	"TEMPORARY" },
	{ IPIF_ADDRCONF,	"ADDRCONF" },
	{ PHYI_VIRTUAL,		"VIRTUAL" },
	{ ILLF_ROUTER,		"ROUTER" },
	{ ILLF_NONUD,		"NONUD" },
	{ IPIF_ANYCAST,		"ANYCAST" },
	{ ILLF_NORTEXCH,	"NORTEXCH" },
	{ ILLF_IPV4,		"IPV4" },
	{ ILLF_IPV6,		"IPV6" },
	{ IPIF_MIPRUNNING,	"MIP" },
	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
	{ PHYI_FAILED,		"FAILED" },
	{ PHYI_STANDBY,		"STANDBY" },
	{ PHYI_INACTIVE,	"INACTIVE" },
	{ PHYI_OFFLINE,		"OFFLINE" },
};

static uchar_t	ip_six_byte_all_ones[] =
	{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
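/*
 * Media information table, selected by ip_m_lookup() (declared above) on
 * the DLPI mac type; the DL_OTHER entry presumably serves as the fallback
 * when no specific media type matches.
 */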
IPIF_TEMPORARY, "TEMPORARY" }, 479 { IPIF_ADDRCONF, "ADDRCONF" }, 480 { PHYI_VIRTUAL, "VIRTUAL" }, 481 { ILLF_ROUTER, "ROUTER" }, 482 { ILLF_NONUD, "NONUD" }, 483 { IPIF_ANYCAST, "ANYCAST" }, 484 { ILLF_NORTEXCH, "NORTEXCH" }, 485 { ILLF_IPV4, "IPV4" }, 486 { ILLF_IPV6, "IPV6" }, 487 { IPIF_MIPRUNNING, "MIP" }, 488 { IPIF_NOFAILOVER, "NOFAILOVER" }, 489 { PHYI_FAILED, "FAILED" }, 490 { PHYI_STANDBY, "STANDBY" }, 491 { PHYI_INACTIVE, "INACTIVE" }, 492 { PHYI_OFFLINE, "OFFLINE" }, 493 }; 494 495 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 496 497 static ip_m_t ip_m_tbl[] = { 498 { DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 499 ip_ether_v6intfid }, 500 { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 501 ip_nodef_v6intfid }, 502 { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 503 ip_nodef_v6intfid }, 504 { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 505 ip_nodef_v6intfid }, 506 { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 507 ip_ether_v6intfid }, 508 { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, 509 ip_ib_v6intfid }, 510 { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, 511 { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 512 ip_nodef_v6intfid } 513 }; 514 515 static ill_t ill_null; /* Empty ILL for init. */ 516 char ipif_loopback_name[] = "lo0"; 517 static char *ipv4_forward_suffix = ":ip_forwarding"; 518 static char *ipv6_forward_suffix = ":ip6_forwarding"; 519 static kstat_t *loopback_ksp = NULL; 520 static sin6_t sin6_null; /* Zero address for quick clears */ 521 static sin_t sin_null; /* Zero address for quick clears */ 522 static uint_t ill_index = 1; /* Used to assign interface indicies */ 523 /* When set search for unused index */ 524 static boolean_t ill_index_wrap = B_FALSE; 525 /* When set search for unused ipif_seqid */ 526 static ipif_t ipif_zero; 527 uint_t ipif_src_random; 528 529 /* 530 * For details on the protection offered by these locks please refer 531 * to the notes under the Synchronization section at the start of ip.c 532 */ 533 krwlock_t ill_g_lock; /* The global ill_g_lock */ 534 kmutex_t ip_addr_avail_lock; /* Address availability check lock */ 535 ipsq_t *ipsq_g_head; /* List of all ipsq's on the system */ 536 537 krwlock_t ill_g_usesrc_lock; /* Protects usesrc related fields */ 538 539 /* 540 * illgrp_head/ifgrp_head is protected by IP's perimeter. 541 */ 542 static ill_group_t *illgrp_head_v4; /* Head of IPv4 ill groups */ 543 ill_group_t *illgrp_head_v6; /* Head of IPv6 ill groups */ 544 545 ill_g_head_t ill_g_heads[MAX_G_HEADS]; /* ILL List Head */ 546 547 /* 548 * ppa arena is created after these many 549 * interfaces have been plumbed. 550 */ 551 uint_t ill_no_arena = 12; 552 553 #pragma align CACHE_ALIGN_SIZE(phyint_g_list) 554 static phyint_list_t phyint_g_list; /* start of phyint list */ 555 556 /* 557 * Reflects value of FAILBACK variable in IPMP config file 558 * /etc/default/mpathd. Default value is B_TRUE. 559 * Set to B_FALSE if user disabled failback by configuring "FAILBACK=no" 560 * in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this information to kernel. 561 */ 562 static boolean_t ipmp_enable_failback = B_TRUE; 563 564 /* 565 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout 566 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is 567 * set through platform specific code (Niagara/Ontario). 568 */ 569 #define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? 
static uint_t
ipif_rand(void)
{
	ipif_src_random = ipif_src_random * 1103515245 + 12345;
	return ((ipif_src_random >> 16) & 0x7fff);
}

/*
 * Allocate per-interface mibs.  Only used for ipv6.
 * Returns true if ok.  False otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	ASSERT(ill->ill_isv6);

	/* Already allocated? */
	if (ill->ill_ip6_mib != NULL) {
		ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip6_mib == NULL) {
		return (B_FALSE);
	}
	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
		ill->ill_ip6_mib = NULL;
		return (B_FALSE);
	}
	/*
	 * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Common code for preparation of ARP commands.  Two points to remember:
 * 1) The ill_name is tacked on at the end of the allocated space so
 *    the template's name_offset field must contain the total space
 *    to allocate less the name length.
 *
 * 2) The template's name_length field should contain the *template*
 *    length.  We use it as a parameter to bcopy() and then write
 *    the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 */
mblk_t *
ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
{
	arc_t	*arc = (arc_t *)template;
	char	*cp;
	int	len;
	mblk_t	*mp;
	uint_t	name_length = ill->ill_name_length;
	uint_t	template_len = arc->arc_name_length;

	len = arc->arc_name_offset + name_length;
	mp = allocb(len, BPRI_HI);
	if (mp == NULL)
		return (NULL);
	cp = (char *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&cp[len];
	if (template_len)
		bcopy(template, cp, template_len);
	if (len > template_len)
		bzero(&cp[template_len], len - template_len);
	mp->b_datap->db_type = M_PROTO;

	arc = (arc_t *)cp;
	arc->arc_name_length = name_length;
	cp = (char *)arc + arc->arc_name_offset;
	bcopy(ill->ill_name, cp, name_length);

	if (addr) {
		area_t	*area = (area_t *)mp->b_rptr;

		cp = (char *)area + area->area_proto_addr_offset;
		bcopy(addr, cp, area->area_proto_addr_length);
		if (area->area_cmd == AR_ENTRY_ADD) {
			cp = (char *)area;
			len = area->area_proto_addr_length;
			if (area->area_proto_mask_offset)
				cp += area->area_proto_mask_offset;
			else
				cp += area->area_proto_addr_offset + len;
			while (len-- > 0)
				*cp++ = (char)~0;
		}
	}
	return (mp);
}
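/*
 * Worked example (illustrative): for ip_ared_template, arc_name_offset is
 * sizeof (ared_t) + IP_ADDR_LEN, so ill_arp_alloc() allocates that much
 * plus ill_name_length, bcopy()s sizeof (ared_t) bytes of template (the
 * initial arc_name_length), rewrites arc_name_length to the real name
 * length, copies the ill name to the end of the buffer, and drops the
 * protocol address at offset sizeof (ared_t).
 */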
mblk_t *
ipif_area_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ill_ared_alloc(ill_t *ill, ipaddr_t addr)
{
	return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
	    (char *)&addr));
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;

	/*
	 * ill_delete may be forcibly entering the ipsq.  The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it.  If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces.  ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif.  We can't unplumb
	 * zeroth interface first in the case of IPv6 as reset_conn_ill
	 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e. low on memory),
	 * then there are no interfaces to clean up.  In this case just
	 * clean up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Used only by ill_arp_on and ill_arp_off, which are writers.
	 * So nobody can be using this mp now.  Free the mp allocated for
	 * honoring ILLF_NOARP
	 */
	freemsg(ill->ill_arp_on_mp);
	ill->ill_arp_on_mp = NULL;

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/*
	 * ipif_free -> reset_conn_ipif will remove all multicast
	 * references for IPv4.  For IPv6, we need to do it here as
	 * it points only at ills.
	 */
	reset_conn_ill(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}
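/*
 * Note (assumption): IPIF_DUPLICATE marks an ipif whose address failed
 * duplicate address detection; ill_ipif_dup_count tracks how many ipifs
 * on the ill currently carry that flag, which is why the teardown path
 * clears it per-ipif before asserting the count has dropped to zero in
 * ill_delete_tail() below.
 */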
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone.  The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0 &&
	    ill->ill_arp_down_mp == NULL &&
	    ill->ill_arp_del_mapping_mp == NULL);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);

	/*
	 * Clean up polling and soft ring capabilities
	 */
	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
		ill_capability_dls_disable(ill);

	/*
	 * Send the detach if there's one to send (i.e., if we're above a
	 * style 2 DLPI driver).
	 */
	if (ill->ill_detach_mp != NULL) {
		ill_dlpi_send(ill, ill->ill_detach_mp);
		ill->ill_detach_mp = NULL;
	}

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now.  New messages could have
	 * landed up from below (M_ERROR or M_HANGUP).  Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_ipsec_capab_ah != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
		ill->ill_ipsec_capab_ah = NULL;
	}

	if (ill->ill_ipsec_capab_esp != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
		ill->ill_ipsec_capab_esp = NULL;
	}

	if (ill->ill_mdt_capab != NULL) {
		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
		ill->ill_mdt_capab = NULL;
	}

	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_dls_capab != NULL) {
		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
		ill->ill_dls_capab->ill_unbind_conn = NULL;
		kmem_free(ill->ill_dls_capab,
		    sizeof (ill_dls_capab_t) +
		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
		ill->ill_dls_capab = NULL;
	}

	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	ill_down_tail(ill);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */
	ASSERT(ilm_walk_ill(ill) == 0);
	/*
	 * Take us out of the list of ILLs.  ill_glist_delete ->
	 * ill_phyint_free could free the phyint.  No more reference to
	 * the phyint after this point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}
	if (ill->ill_nd_lla_mp != NULL)
		freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);
	ILL_TRACE_CLEANUP(ill);
}

static void
ill_free_mib(ill_t *ill)
{
	if (ill->ill_ip6_mib != NULL) {
		kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib));
		ill->ill_ip6_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate a physical address and a SAP.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}
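/*
 * Example (illustrative; assumes a media type with a negative sap_length,
 * such as Ethernet): with a 6-byte MAC address and sap_length == -2, the
 * DLPI address laid down is [6-byte phys][2-byte sap], whereas a positive
 * sap_length of 2 would give [2-byte sap][6-byte phys].
 */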
/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp.
 * Return an error if we already have 1 or more ioctls in progress.
 * This is used only for non-exclusive ioctls.  Currently this is used
 * for SIOC*ARP and SIOCGTUNPARAM ioctls.  Most set ioctls are exclusive
 * and thus need to use ipsq_pending_mp_add.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
	 */
	ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
	    (add_mp->b_datap->db_type == M_IOCTL));

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * Return error if the conn has started closing.  The conn
	 * could have finished cleaning up the pending mp list; if so
	 * we should not add another mp to the list, negating the cleanup.
	 */
	if (connp->conn_state_flags & CONN_CLOSING)
		return (B_FALSE);
	/*
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the queue on which the ioctl request came, in b_queue.
	 * This will be used to later get the conn, when we get a response
	 * on the ill queue, from some other module (typically arp).
	 */
	add_mp->b_next = (void *)ill->ill_pending_mp;
	add_mp->b_queue = CONNP_TO_WQ(connp);
	ill->ill_pending_mp = add_mp;
	if (connp != NULL)
		connp->conn_oper_pending_ill = ill;
	return (B_TRUE);
}
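/*
 * Illustrative call pattern (not a verbatim caller): the ioctl path holds
 * both locks, links the request, and drops them before sending downstream:
 *
 *	mutex_enter(&connp->conn_lock);
 *	mutex_enter(&ill->ill_lock);
 *	if (!ill_pending_mp_add(ill, connp, mp))
 *		... conn is closing, abort the ioctl ...
 *	mutex_exit(&ill->ill_lock);
 *	mutex_exit(&connp->conn_lock);
 *
 * The reply side then calls ill_pending_mp_get() with the iocblk's ioc_id
 * to recover both the mblk and the conn.
 */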
/*
 * Retrieve the ill_pending_mp and return it.  We have to walk the list
 * of mblks starting at ill_pending_mp, and match based on the ioc_id.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
	mblk_t	*prev = NULL;
	mblk_t	*curr = NULL;
	uint_t	id;
	conn_t	*connp;

	/*
	 * When the conn closes, conn_ioctl_cleanup needs to clean
	 * up the pending mp, but it does not know the ioc_id and
	 * passes in a zero for it.
	 */
	mutex_enter(&ill->ill_lock);
	if (ioc_id != 0)
		*connpp = NULL;

	/* Search the list for the appropriate ioctl based on ioc_id */
	for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
	    prev = curr, curr = curr->b_next) {
		id = ((struct iocblk *)curr->b_rptr)->ioc_id;
		connp = Q_TO_CONN(curr->b_queue);
		/* Match based on the ioc_id or based on the conn */
		if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
			break;
	}

	if (curr != NULL) {
		/* Unlink the mblk from the pending mp list */
		if (prev != NULL) {
			prev->b_next = curr->b_next;
		} else {
			ASSERT(ill->ill_pending_mp == curr);
			ill->ill_pending_mp = curr->b_next;
		}

		/*
		 * conn refcnt must have been bumped up at the start of
		 * the ioctl.  So we can safely access the conn.
		 */
		ASSERT(CONN_Q(curr->b_queue));
		*connpp = Q_TO_CONN(curr->b_queue);
		curr->b_next = NULL;
		curr->b_queue = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (curr);
}

/*
 * Add the pending mp to the list.  There can be only 1 pending mp
 * in the list.  Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipsq_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver.  This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipsq_t	*ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
	 * M_ERROR/M_HANGUP from driver
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) ||
	    (DB_TYPE(add_mp) == M_IOCTL) ||
	    (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP));

	ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing.  The conn
		 * could have finished cleaning up the pending mp list; if so
		 * we should not add another mp to the list, negating the
		 * cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipsq->ipsq_lock);
	ipsq->ipsq_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue.  This will be returned by
	 * ipsq_pending_mp_get.  Caller will then use these values to restart
	 * the processing
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipsq->ipsq_pending_mp = add_mp;
	ipsq->ipsq_waitfor = waitfor;
	/*
	 * ipsq_current_ipif is needed to restart the operation from
	 * ipif_ill_refrele_tail when the last reference to the ipif/ill
	 * is gone.  Since this is not an ioctl, ipsq_current_ipif has not
	 * been set until now.
	 */
	if (DB_TYPE(add_mp) == M_ERROR || DB_TYPE(add_mp) == M_HANGUP) {
		ASSERT(ipsq->ipsq_current_ipif == NULL);
		ipsq->ipsq_current_ipif = ipif;
		ipsq->ipsq_last_cmd = DB_TYPE(add_mp);
	}
	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;
	mutex_exit(&ipsq->ipsq_lock);
	return (B_TRUE);
}
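/*
 * Unlike ill_pending_mp above, which can hold several outstanding
 * non-exclusive ioctls at once keyed by ioc_id, ipsq_pending_mp holds at
 * most one mblk: the one for the exclusive operation currently owning
 * the ipsq.
 */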
/*
 * Retrieve the ipsq_pending_mp and return it.  There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;

	mutex_enter(&ipsq->ipsq_lock);
	*connpp = NULL;
	if (ipsq->ipsq_pending_mp == NULL) {
		mutex_exit(&ipsq->ipsq_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipsq->ipsq_pending_mp;
	ASSERT(curr != NULL && curr->b_next == NULL);
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_pending_mp = NULL;
	ipsq->ipsq_waitfor = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the
		 * ioctl.  So we can safely return a pointer to the conn
		 * to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipsq_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipsq_t	*ipsq;
	queue_t	*q;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	/*
	 * If connp is null, unconditionally clean up the ipsq_pending_mp.
	 * This happens in M_ERROR/M_HANGUP.  We need to abort the current
	 * ioctl even if it is meant for another ill, since we have to
	 * enqueue a new mp now in ipsq_pending_mp to complete the ipif_down.
	 * If connp is non-null we are called from the conn close path.
	 */
	mp = ipsq->ipsq_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipsq->ipsq_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipsq_pending_mp */
	ipsq->ipsq_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	/* If MOVE was in progress, clear the move_in_progress fields also. */
	ill = ipsq->ipsq_pending_ipif->ipif_ill;
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	} else if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	ipif = ipsq->ipsq_pending_ipif;
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_waitfor = 0;
	ipsq->ipsq_current_ipif = NULL;
	mutex_exit(&ipsq->ipsq_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE :
		    NO_COPYOUT, connp != NULL ? ipif : NULL, NULL);
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg.  we have to restart it
		 * otherwise the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}
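/*
 * Note (assumption about the ip_ioctl_finish flags used above): CONN_CLOSE
 * indicates the originating conn is being torn down, while NO_COPYOUT acks
 * the ioctl with the error but without copying data back upstream; either
 * way the aborted ioctl is completed with ENXIO rather than left dangling.
 */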
/*
 * The ill is closing.  Cleanup all the pending mps.  Called exclusively
 * towards the end of ill_delete.  The refcount has gone to 0.  So nobody
 * knows this ill, and hence nobody can add an mp to this list.
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
	mblk_t	*mp;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);
	/*
	 * Every mp on the pending mp list originating from an ioctl
	 * added 1 to the conn refcnt, at the start of the ioctl.
	 * So bump it down now.  See comments in ip_wput_nondata()
	 */
	while (ill->ill_pending_mp != NULL) {
		mp = ill->ill_pending_mp;
		ill->ill_pending_mp = mp->b_next;
		mutex_exit(&ill->ill_lock);

		q = mp->b_queue;
		ASSERT(CONN_Q(q));
		mp->b_next = NULL;
		mp->b_prev = NULL;
		mp->b_queue = NULL;
		ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL, NULL);
		mutex_enter(&ill->ill_lock);
	}
	ill->ill_pending_ipif = NULL;

	mutex_exit(&ill->ill_lock);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*q;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		q = CONNP_TO_WQ(connp);
	else
		q = ill->ill_wq;

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq.  If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
	 * ioctls meant for this ill from conns are not flushed; they will
	 * be processed during ipsq_exit, will not find the ill, and will
	 * return an error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (curr->b_queue == q || curr->b_queue == RD(q)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg.  we have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
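/*
 * Naming note (assumption): "xopq" refers to the queue of exclusive
 * operations waiting to enter the ipsq (ipsq_xopq_mphead/ipsq_xopq_mptail),
 * as opposed to ipsq_pending_mp, which belongs to an operation that has
 * already entered the ipsq and is waiting on an external response.
 */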
/*
 * This conn has started closing.  Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	mblk_t *curr;
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Is any exclusive ioctl pending?  If so clean it up.  If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_head.  If the ioctl has started the mp could be present
	 * in ipsq_pending_mp.  If the ioctl timed out in the streamhead but
	 * is currently executing now, the mp is not queued anywhere but
	 * conn_oper_pending_ill is null.  The conn close will wait
	 * till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	curr = ill_pending_mp_get(ill, &connp, 0);
	if (curr != NULL) {
		mutex_exit(&connp->conn_lock);
		CONN_DEC_REF(connp);
		inet_freemsg(curr);
		return;
	}
	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing.  But we need to make sure that the ill will
	 * not vanish.  So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending now in ipsq_pending_mp.  If it is not
			 * found there then check whether this ioctl has
			 * not even started and is in the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq.  Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}
/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
		connp->conn_orig_multicast_ifindex = 0;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_outgoing_pill == ill)
		connp->conn_outgoing_pill = NULL;
	if (connp->conn_nofailover_ill == ill)
		connp->conn_nofailover_ill = NULL;
	if (connp->conn_xmit_if_ill == ill)
		connp->conn_xmit_if_ill = NULL;
	if (connp->conn_ire_cache != NULL) {
		ire = connp->conn_ire_cache;
		/*
		 * ip_newroute creates IRE_CACHE with ire_stq coming from
		 * interface X and ipif coming from interface Y, if interface
		 * X and Y are part of the same IPMP group.  Thus whenever
		 * interface X goes down, remove all references to it by
		 * checking both on ire_ipif and ire_stq.
		 */
		if ((ire->ire_ipif != NULL &&
		    ire->ire_ipif->ipif_ill == ill) ||
		    (ire->ire_type == IRE_CACHE &&
		    ire->ire_stq == ill->ill_wq)) {
			connp->conn_ire_cache = NULL;
			mutex_exit(&connp->conn_lock);
			ire_refrele_notr(ire);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}
	ill_down_tail(ill);
	freemsg(mp);
	ipsq->ipsq_current_ipif = NULL;
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again.  It is called when we receive an M_ERROR / M_HANGUP.  In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill;
	ipif_t	*ipif;

	ill = q->q_ptr;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	(void) ipsq_pending_mp_cleanup(ill, NULL);
	mutex_enter(&ill->ill_lock);
	/*
	 * Atomically test and add the pending mp if references are
	 * still active.
	 */
	if (!ill_is_quiescent(ill)) {
		/*
		 * Get rid of any pending mps and cleanup.  Call will
		 * not fail since we are passing a null connp.
		 */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
static void
ill_down(ill_t *ill)
{
	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, (char *)ill);

	mutex_enter(&ire_mrtun_lock);
	if (ire_mrtun_count != 0) {
		mutex_exit(&ire_mrtun_lock);
		ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif,
		    (char *)ill, NULL);
	} else {
		mutex_exit(&ire_mrtun_lock);
	}

	/*
	 * If any interface-based forwarding table exists, blow off the
	 * IREs there that depend on this ill.
	 */
	mutex_enter(&ire_srcif_table_lock);
	if (ire_srcif_table_count > 0) {
		mutex_exit(&ire_srcif_table_lock);
		ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill);
	} else {
		mutex_exit(&ire_srcif_table_lock);
	}

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill);

	if (ill->ill_group != NULL) {
		illgrp_delete(ill);
	}
}

static void
ill_down_tail(ill_t *ill)
{
	int i;

	/* Destroy ill_srcif_table if it exists */
	/* Lock not really required; nobody should be able to access */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_srcif_table != NULL) {
		ill->ill_srcif_refcnt = 0;
		for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
			rw_destroy(&ill->ill_srcif_table[i].irb_lock);
		}
		kmem_free(ill->ill_srcif_table,
		    IP_SRCIF_TABLE_SIZE * sizeof (irb_t));
		ill->ill_srcif_table = NULL;
		ill->ill_srcif_refcnt = 0;
		ill->ill_mrtun_refcnt = 0;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'.  (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/*
	 * ip_newroute creates IRE_CACHE with ire_stq coming from
	 * interface X and ipif coming from interface Y, if interface
	 * X and Y are part of the same IPMP group.  Thus whenever interface
	 * X goes down, remove all references to it by checking both
	 * on ire_ipif and ire_stq.
	 */
	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
		ire_delete(ire);
	}
}

/*
 * A separate routine for deleting revtun and srcif based routes is needed
 * because these IREs are only deleted when the interface is unplumbed.
 * Also, these IREs have ire_in_ill non-null as well.  We want to keep
 * mobile IP specific code separate.
 */
static void
ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_in_ill != NULL);

	if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) ||
	    (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) {
		ire_delete(ire);
	}
}

/*
 * Remove ire/nce from the fastpath list.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	if (ill->ill_isv6) {
		nce_fastpath_list_dispatch(ill, NULL, NULL);
	} else {
		ire_fastpath_list_dispatch(ill, NULL, NULL);
	}
}
/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL) {
		/*
		 * Update all IRE's or NCE's that are waiting for
		 * fastpath update.
		 */
		if (ill->ill_isv6) {
			/*
			 * update nce's in the fastpath list.
			 */
			nce_fastpath_list_dispatch(ill,
			    ndp_fastpath_update, mp);
		} else {
			/*
			 * update ire's in the fastpath list.
			 */
			ire_fastpath_list_dispatch(ill,
			    ire_fastpath_update, mp);
			/*
			 * Check if we need to traverse reverse tunnel table.
			 * Since there is only single ire_type (IRE_MIPRTUN)
			 * in the table, we don't need to match on ire_type.
			 * We have to check ire_mrtun_count and not the
			 * ill_mrtun_refcnt since ill_mrtun_refcnt is set
			 * on the incoming ill and here we are dealing with
			 * outgoing ill.
			 */
			mutex_enter(&ire_mrtun_lock);
			if (ire_mrtun_count != 0) {
				mutex_exit(&ire_mrtun_lock);
				ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN,
				    (void (*)(ire_t *, void *))
				    ire_fastpath_update, mp, ill);
			} else {
				mutex_exit(&ire_mrtun_lock);
			}
		}
		mp1 = mp->b_cont;
		freeb(mp);
		mp = mp1;
	} else {
		ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
	}

	freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	putnext(ill->ill_wq, mp);
	return (0);
}
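/*
 * Sketch of the exchange (illustrative): the probe goes down as
 *
 *	M_IOCTL(DL_IOC_HDR_INFO) -> b_cont: dl_unitdata_req_t template
 *
 * and a driver that supports fastpath replies with
 *
 *	M_IOCACK -> b_cont: dl_unitdata_req_t -> b_cont: prebuilt header
 *
 * which is why ill_fastpath_ack() above frees the M_IOCACK block first and
 * dispatches the remaining chain to the waiting IREs/NCEs only when
 * mp->b_cont is non-NULL.
 */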
We do this in order to distinguish the
1833  * DL_CAPABILITY_ACK that comes back in response to a "reset"
1834  * from the one that answers a "probe" DL_CAPABILITY_REQ. This
1835  * also handles the case where the driver doesn't send back
1836  * a DL_CAPABILITY_ACK at all, since the "probe" routine
1837  * requires the state to be UNKNOWN anyway. In any case, all
1838  * features are turned off until the state reaches IDS_OK.
1839  */
1840     ill->ill_dlpi_capab_state = IDS_UNKNOWN;
1841 
1842     /*
1843      * Disable sub-capabilities and request a list of sub-capability
1844      * messages which will be sent down to the driver. Each handler
1845      * allocates the corresponding dl_capability_sub_t inside an
1846      * mblk, and links it to the existing sc_mp mblk, or returns it
1847      * as sc_mp if it's the first sub-capability (the passed-in
1848      * sc_mp is NULL). Upon returning from all capability handlers,
1849      * sc_mp is pulled up before being passed downstream.
1850      */
1851     ill_capability_mdt_reset(ill, &sc_mp);
1852     ill_capability_hcksum_reset(ill, &sc_mp);
1853     ill_capability_zerocopy_reset(ill, &sc_mp);
1854     ill_capability_ipsec_reset(ill, &sc_mp);
1855     ill_capability_dls_reset(ill, &sc_mp);
1856 
1857     /* Nothing to send down in order to disable the capabilities? */
1858     if (sc_mp == NULL)
1859         return;
1860 
1861     tmp = msgpullup(sc_mp, -1);
1862     freemsg(sc_mp);
1863     if ((sc_mp = tmp) == NULL) {
1864         cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
1865             "DL_CAPABILITY_REQ (ENOMEM)\n");
1866         return;
1867     }
1868 
1869     ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
1870     ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp);
1871 }
1872 
1873 /*
1874  * Request or set new-style hardware capabilities supported by the
1875  * DLS provider.
1876  */
1877 static void
1878 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp)
1879 {
1880     mblk_t *mp;
1881     dl_capability_req_t *capb;
1882     size_t size = 0;
1883     uint8_t *ptr;
1884 
1885     if (reqp != NULL)
1886         size = MBLKL(reqp);
1887 
1888     mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type);
1889     if (mp == NULL) {
1890         freemsg(reqp);
1891         return;
1892     }
1893     ptr = mp->b_rptr;
1894 
1895     capb = (dl_capability_req_t *)ptr;
1896     ptr += sizeof (dl_capability_req_t);
1897 
1898     if (reqp != NULL) {
1899         capb->dl_sub_offset = sizeof (dl_capability_req_t);
1900         capb->dl_sub_length = size;
1901         bcopy(reqp->b_rptr, ptr, size);
1902         ptr += size;
1903         mp->b_cont = reqp->b_cont;
1904         freeb(reqp);
1905     }
1906     ASSERT(ptr == mp->b_wptr);
1907 
1908     ill_dlpi_send(ill, mp);
1909 }
1910 
1911 static void
1912 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
1913 {
1914     dl_capab_id_t *id_ic;
1915     uint_t sub_dl_cap = outers->dl_cap;
1916     dl_capability_sub_t *inners;
1917     uint8_t *capend;
1918 
1919     ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
1920 
1921     /*
1922      * Note: range checks here are not absolutely sufficient to
1923      * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
1924 * (Remember, it's coming from something else in the kernel 1925 * address space) 1926 */ 1927 1928 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1929 if (capend > mp->b_wptr) { 1930 cmn_err(CE_WARN, "ill_capability_id_ack: " 1931 "malformed sub-capability too long for mblk"); 1932 return; 1933 } 1934 1935 id_ic = (dl_capab_id_t *)(outers + 1); 1936 1937 if (outers->dl_length < sizeof (*id_ic) || 1938 (inners = &id_ic->id_subcap, 1939 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1940 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1941 "encapsulated capab type %d too long for mblk", 1942 inners->dl_cap); 1943 return; 1944 } 1945 1946 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1947 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1948 "isn't as expected; pass-thru module(s) detected, " 1949 "discarding capability\n", inners->dl_cap)); 1950 return; 1951 } 1952 1953 /* Process the encapsulated sub-capability */ 1954 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1955 } 1956 1957 /* 1958 * Process Multidata Transmit capability negotiation ack received from a 1959 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1960 * DL_CAPABILITY_ACK message. 1961 */ 1962 static void 1963 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1964 { 1965 mblk_t *nmp = NULL; 1966 dl_capability_req_t *oc; 1967 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1968 ill_mdt_capab_t **ill_mdt_capab; 1969 uint_t sub_dl_cap = isub->dl_cap; 1970 uint8_t *capend; 1971 1972 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1973 1974 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1975 1976 /* 1977 * Note: range checks here are not absolutely sufficient to 1978 * make us robust against malformed messages sent by drivers; 1979 * this is in keeping with the rest of IP's dlpi handling. 
1980      * (Remember, it's coming from something else in the kernel
1981      * address space)
1982      */
1983 
1984     capend = (uint8_t *)(isub + 1) + isub->dl_length;
1985     if (capend > mp->b_wptr) {
1986         cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1987             "malformed sub-capability too long for mblk");
1988         return;
1989     }
1990 
1991     mdt_ic = (dl_capab_mdt_t *)(isub + 1);
1992 
1993     if (mdt_ic->mdt_version != MDT_VERSION_2) {
1994         cmn_err(CE_CONT, "ill_capability_mdt_ack: "
1995             "unsupported MDT sub-capability (version %d, expected %d)",
1996             mdt_ic->mdt_version, MDT_VERSION_2);
1997         return;
1998     }
1999 
2000     if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) {
2001         ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
2002             "capability isn't as expected; pass-thru module(s) "
2003             "detected, discarding capability\n"));
2004         return;
2005     }
2006 
2007     if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
2008 
2009         if (*ill_mdt_capab == NULL) {
2010             *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
2011                 KM_NOSLEEP);
2012 
2013             if (*ill_mdt_capab == NULL) {
2014                 cmn_err(CE_WARN, "ill_capability_mdt_ack: "
2015                     "could not enable MDT version %d "
2016                     "for %s (ENOMEM)\n", MDT_VERSION_2,
2017                     ill->ill_name);
2018                 return;
2019             }
2020         }
2021 
2022         ip1dbg(("ill_capability_mdt_ack: interface %s supports "
2023             "MDT version %d (%d bytes leading, %d bytes trailing "
2024             "header spaces, %d max pld bufs, %d span limit)\n",
2025             ill->ill_name, MDT_VERSION_2,
2026             mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail,
2027             mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit));
2028 
2029         (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2;
2030         (*ill_mdt_capab)->ill_mdt_on = 1;
2031         /*
2032          * Round the following values up to the nearest 32-bit
2033          * boundary; the ULP may further adjust them to accommodate
2034          * additional protocol headers. We pass these values to the
2035          * ULP at bind time.
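         *
         * For example (illustrative numbers, not from any particular
         * driver): a provider advertising mdt_hdr_head = 18 and
         * mdt_hdr_tail = 6 is recorded below as 20 and 8 bytes
         * respectively, since roundup(18, 4) == 20 and
         * roundup(6, 4) == 8.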
2036 */ 2037 (*ill_mdt_capab)->ill_mdt_hdr_head = 2038 roundup(mdt_ic->mdt_hdr_head, 4); 2039 (*ill_mdt_capab)->ill_mdt_hdr_tail = 2040 roundup(mdt_ic->mdt_hdr_tail, 4); 2041 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 2042 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 2043 2044 ill->ill_capabilities |= ILL_CAPAB_MDT; 2045 } else { 2046 uint_t size; 2047 uchar_t *rptr; 2048 2049 size = sizeof (dl_capability_req_t) + 2050 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2051 2052 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2053 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2054 "could not enable MDT for %s (ENOMEM)\n", 2055 ill->ill_name); 2056 return; 2057 } 2058 2059 rptr = nmp->b_rptr; 2060 /* initialize dl_capability_req_t */ 2061 oc = (dl_capability_req_t *)nmp->b_rptr; 2062 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2063 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2064 sizeof (dl_capab_mdt_t); 2065 nmp->b_rptr += sizeof (dl_capability_req_t); 2066 2067 /* initialize dl_capability_sub_t */ 2068 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2069 nmp->b_rptr += sizeof (*isub); 2070 2071 /* initialize dl_capab_mdt_t */ 2072 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2073 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2074 2075 nmp->b_rptr = rptr; 2076 2077 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2078 "to enable MDT version %d\n", ill->ill_name, 2079 MDT_VERSION_2)); 2080 2081 /* set ENABLE flag */ 2082 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2083 2084 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2085 ill_dlpi_send(ill, nmp); 2086 } 2087 } 2088 2089 static void 2090 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2091 { 2092 mblk_t *mp; 2093 dl_capab_mdt_t *mdt_subcap; 2094 dl_capability_sub_t *dl_subcap; 2095 int size; 2096 2097 if (!ILL_MDT_CAPABLE(ill)) 2098 return; 2099 2100 ASSERT(ill->ill_mdt_capab != NULL); 2101 /* 2102 * Clear the capability flag for MDT but retain the ill_mdt_capab 2103 * structure since it's possible that another thread is still 2104 * referring to it. The structure only gets deallocated when 2105 * we destroy the ill. 2106 */ 2107 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2108 2109 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2110 2111 mp = allocb(size, BPRI_HI); 2112 if (mp == NULL) { 2113 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2114 "request to disable MDT\n")); 2115 return; 2116 } 2117 2118 mp->b_wptr = mp->b_rptr + size; 2119 2120 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2121 dl_subcap->dl_cap = DL_CAPAB_MDT; 2122 dl_subcap->dl_length = sizeof (*mdt_subcap); 2123 2124 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2125 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2126 mdt_subcap->mdt_flags = 0; 2127 mdt_subcap->mdt_hdr_head = 0; 2128 mdt_subcap->mdt_hdr_tail = 0; 2129 2130 if (*sc_mp != NULL) 2131 linkb(*sc_mp, mp); 2132 else 2133 *sc_mp = mp; 2134 } 2135 2136 /* 2137 * Send a DL_NOTIFY_REQ to the specified ill to enable 2138 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2139 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2140 * acceleration. 2141 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 
2142 */ 2143 static boolean_t 2144 ill_enable_promisc_notify(ill_t *ill) 2145 { 2146 mblk_t *mp; 2147 dl_notify_req_t *req; 2148 2149 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2150 2151 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2152 if (mp == NULL) 2153 return (B_FALSE); 2154 2155 req = (dl_notify_req_t *)mp->b_rptr; 2156 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2157 DL_NOTE_PROMISC_OFF_PHYS; 2158 2159 ill_dlpi_send(ill, mp); 2160 2161 return (B_TRUE); 2162 } 2163 2164 2165 /* 2166 * Allocate an IPsec capability request which will be filled by our 2167 * caller to turn on support for one or more algorithms. 2168 */ 2169 static mblk_t * 2170 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2171 { 2172 mblk_t *nmp; 2173 dl_capability_req_t *ocap; 2174 dl_capab_ipsec_t *ocip; 2175 dl_capab_ipsec_t *icip; 2176 uint8_t *ptr; 2177 icip = (dl_capab_ipsec_t *)(isub + 1); 2178 2179 /* 2180 * The first time around, we send a DL_NOTIFY_REQ to enable 2181 * PROMISC_ON/OFF notification from the provider. We need to 2182 * do this before enabling the algorithms to avoid leakage of 2183 * cleartext packets. 2184 */ 2185 2186 if (!ill_enable_promisc_notify(ill)) 2187 return (NULL); 2188 2189 /* 2190 * Allocate new mblk which will contain a new capability 2191 * request to enable the capabilities. 2192 */ 2193 2194 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2195 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2196 if (nmp == NULL) 2197 return (NULL); 2198 2199 ptr = nmp->b_rptr; 2200 2201 /* initialize dl_capability_req_t */ 2202 ocap = (dl_capability_req_t *)ptr; 2203 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2204 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2205 ptr += sizeof (dl_capability_req_t); 2206 2207 /* initialize dl_capability_sub_t */ 2208 bcopy(isub, ptr, sizeof (*isub)); 2209 ptr += sizeof (*isub); 2210 2211 /* initialize dl_capab_ipsec_t */ 2212 ocip = (dl_capab_ipsec_t *)ptr; 2213 bcopy(icip, ocip, sizeof (*icip)); 2214 2215 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2216 return (nmp); 2217 } 2218 2219 /* 2220 * Process an IPsec capability negotiation ack received from a DLS Provider. 2221 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2222 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2223 */ 2224 static void 2225 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2226 { 2227 dl_capab_ipsec_t *icip; 2228 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2229 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ 2230 uint_t cipher, nciphers; 2231 mblk_t *nmp; 2232 uint_t alg_len; 2233 boolean_t need_sadb_dump; 2234 uint_t sub_dl_cap = isub->dl_cap; 2235 ill_ipsec_capab_t **ill_capab; 2236 uint64_t ill_capab_flag; 2237 uint8_t *capend, *ciphend; 2238 boolean_t sadb_resync; 2239 2240 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2241 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2242 2243 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2244 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2245 ill_capab_flag = ILL_CAPAB_AH; 2246 } else { 2247 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2248 ill_capab_flag = ILL_CAPAB_ESP; 2249 } 2250 2251 /* 2252 * If the ill capability structure exists, then this incoming 2253 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2254 * If this is so, then we'd need to resynchronize the SADB 2255 * after re-enabling the offloaded ciphers. 
2256      */
2257     sadb_resync = (*ill_capab != NULL);
2258 
2259     /*
2260      * Note: range checks here are not absolutely sufficient to
2261      * make us robust against malformed messages sent by drivers;
2262      * this is in keeping with the rest of IP's dlpi handling.
2263      * (Remember, it's coming from something else in the kernel
2264      * address space)
2265      */
2266 
2267     capend = (uint8_t *)(isub + 1) + isub->dl_length;
2268     if (capend > mp->b_wptr) {
2269         cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2270             "malformed sub-capability too long for mblk");
2271         return;
2272     }
2273 
2274     /*
2275      * There are two types of acks we process here:
2276      * 1. acks in reply to a (first form) generic capability req
2277      *    (no ENABLE flag set)
2278      * 2. acks in reply to an ENABLE capability req.
2279      *    (ENABLE flag set)
2280      *
2281      * We process the subcapability passed as argument as follows:
2282      * 1 do initializations
2283      *   1.1 initialize nmp = NULL
2284      *   1.2 set need_sadb_dump to B_FALSE
2285      * 2 for each cipher in subcapability:
2286      *   2.1 if ENABLE flag is set:
2287      *     2.1.1 update per-ill ipsec capabilities info
2288      *     2.1.2 set need_sadb_dump to B_TRUE
2289      *   2.2 if ENABLE flag is not set:
2290      *     2.2.1 if nmp is NULL:
2291      *       2.2.1.1 allocate and initialize nmp
2292      *       2.2.1.2 init current pos in nmp
2293      *     2.2.2 copy current cipher to current pos in nmp
2294      *     2.2.3 set ENABLE flag in nmp
2295      *     2.2.4 update current pos
2296      * 3 if nmp is not equal to NULL, send enable request
2297      *   3.1 send capability request
2298      * 4 if need_sadb_dump is B_TRUE
2299      *   4.1 enable promiscuous on/off notifications
2300      *   4.2 call ill_dlpi_send(isub->dlcap) to send all
2301      *       AH or ESP SA's to interface.
2302      */
2303 
2304     nmp = NULL;
2305     oalg = NULL;
2306     need_sadb_dump = B_FALSE;
2307     icip = (dl_capab_ipsec_t *)(isub + 1);
2308     ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]);
2309 
2310     nciphers = icip->cip_nciphers;
2311     ciphend = (uint8_t *)(ialg + icip->cip_nciphers);
2312 
2313     if (ciphend > capend) {
2314         cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
2315             "too many ciphers for sub-capability len");
2316         return;
2317     }
2318 
2319     for (cipher = 0; cipher < nciphers; cipher++) {
2320         alg_len = sizeof (dl_capab_ipsec_alg_t);
2321 
2322         if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) {
2323             /*
2324              * TBD: when we provide a way to disable capabilities
2325              * from above, need to manage the request-pending state
2326              * and fail if we were not expecting this ACK.
2327              */
2328             IPSECHW_DEBUG(IPSECHW_CAPAB,
2329                 ("ill_capability_ipsec_ack: got ENABLE ACK\n"));
2330 
2331             /*
2332              * Update IPsec capabilities for this ill
2333              */
2334 
2335             if (*ill_capab == NULL) {
2336                 IPSECHW_DEBUG(IPSECHW_CAPAB,
2337                     ("ill_capability_ipsec_ack: "
2338                     "allocating ipsec_capab for ill\n"));
2339                 *ill_capab = ill_ipsec_capab_alloc();
2340 
2341                 if (*ill_capab == NULL) {
2342                     cmn_err(CE_WARN,
2343                         "ill_capability_ipsec_ack: "
2344                         "could not enable IPsec Hardware "
2345                         "acceleration for %s (ENOMEM)\n",
2346                         ill->ill_name);
2347                     return;
2348                 }
2349             }
2350 
2351             ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH ||
2352                 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR);
2353 
2354             if (ialg->alg_prim >= MAX_IPSEC_ALGS) {
2355                 cmn_err(CE_WARN,
2356                     "ill_capability_ipsec_ack: "
2357                     "malformed IPsec algorithm id %d",
2358                     ialg->alg_prim);
2359                 continue;
2360             }
2361 
2362             if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) {
2363                 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs,
2364                     ialg->alg_prim);
2365             } else {
2366                 ipsec_capab_algparm_t *alp;
2367 
2368                 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs,
2369                     ialg->alg_prim);
2370                 if (!ill_ipsec_capab_resize_algparm(*ill_capab,
2371                     ialg->alg_prim)) {
2372                     cmn_err(CE_WARN,
2373                         "ill_capability_ipsec_ack: "
2374                         "no space for IPsec alg id %d",
2375                         ialg->alg_prim);
2376                     continue;
2377                 }
2378                 alp = &((*ill_capab)->encr_algparm[
2379                     ialg->alg_prim]);
2380                 alp->minkeylen = ialg->alg_minbits;
2381                 alp->maxkeylen = ialg->alg_maxbits;
2382             }
2383             ill->ill_capabilities |= ill_capab_flag;
2384             /*
2385              * Indicate that a capability was enabled, which
2386              * will be used below to kick off a SADB dump
2387              * to the ill.
2388              */
2389             need_sadb_dump = B_TRUE;
2390         } else {
2391             IPSECHW_DEBUG(IPSECHW_CAPAB,
2392                 ("ill_capability_ipsec_ack: enabling alg 0x%x\n",
2393                 ialg->alg_prim));
2394 
2395             if (nmp == NULL) {
2396                 nmp = ill_alloc_ipsec_cap_req(ill, isub);
2397                 if (nmp == NULL) {
2398                     /*
2399                      * Sending the PROMISC_ON/OFF
2400                      * notification request failed.
2401                      * We cannot enable the algorithms
2402                      * since the Provider will not
2403                      * notify IP of promiscuous mode
2404                      * changes, which could lead
2405                      * to leakage of packets.
2406                      */
2407                     cmn_err(CE_WARN,
2408                         "ill_capability_ipsec_ack: "
2409                         "could not enable IPsec Hardware "
2410                         "acceleration for %s (ENOMEM)\n",
2411                         ill->ill_name);
2412                     return;
2413                 }
2414                 /* ptr to current output alg specifier */
2415                 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2416             }
2417 
2418             /*
2419              * Copy current alg specifier, set ENABLE
2420              * flag, and advance to next output alg.
2421              * For now we enable all IPsec capabilities.
2422              */
2423             ASSERT(oalg != NULL);
2424             bcopy(ialg, oalg, alg_len);
2425             oalg->alg_flag |= DL_CAPAB_ALG_ENABLE;
2426             nmp->b_wptr += alg_len;
2427             oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
2428         }
2429 
2430         /* move to next input algorithm specifier */
2431         ialg = (dl_capab_ipsec_alg_t *)
2432             ((char *)ialg + alg_len);
2433     }
2434 
2435     if (nmp != NULL)
2436         /*
2437          * nmp points to a DL_CAPABILITY_REQ message to enable
2438          * IPsec hardware acceleration.
2439          */
2440         ill_dlpi_send(ill, nmp);
2441 
2442     if (need_sadb_dump)
2443         /*
2444          * An acknowledgement corresponding to a request to
2445          * enable acceleration was received, notify SADB.
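         * (The intent, as described in the processing outline
         * above, is to push the relevant AH or ESP SAs down to
         * the provider so that the accelerator starts out with
         * current keying material.)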
2446          */
2447         ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync);
2448 }
2449 
2450 /*
2451  * Given an mblk with enough space in it, create sub-capability entries for
2452  * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised
2453  * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared,
2454  * in preparation for the reset DL_CAPABILITY_REQ message.
2455  */
2456 static void
2457 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
2458     ill_ipsec_capab_t *ill_cap, mblk_t *mp)
2459 {
2460     dl_capab_ipsec_t *oipsec;
2461     dl_capab_ipsec_alg_t *oalg;
2462     dl_capability_sub_t *dl_subcap;
2463     int i, k;
2464 
2465     ASSERT(nciphers > 0);
2466     ASSERT(ill_cap != NULL);
2467     ASSERT(mp != NULL);
2468     ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen);
2469 
2470     /* dl_capability_sub_t for "stype" */
2471     dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
2472     dl_subcap->dl_cap = stype;
2473     dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen;
2474     mp->b_wptr += sizeof (dl_capability_sub_t);
2475 
2476     /* dl_capab_ipsec_t for "stype" */
2477     oipsec = (dl_capab_ipsec_t *)mp->b_wptr;
2478     oipsec->cip_version = 1;
2479     oipsec->cip_nciphers = nciphers;
2480     mp->b_wptr = (uchar_t *)&oipsec->cip_data[0];
2481 
2482     /* create entries for "stype" AUTH ciphers */
2483     for (i = 0; i < ill_cap->algs_size; i++) {
2484         for (k = 0; k < BITSPERBYTE; k++) {
2485             if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0)
2486                 continue;
2487 
2488             oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2489             bzero((void *)oalg, sizeof (*oalg));
2490             oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH;
2491             oalg->alg_prim = k + (BITSPERBYTE * i);
2492             mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2493         }
2494     }
2495     /* create entries for "stype" ENCR ciphers */
2496     for (i = 0; i < ill_cap->algs_size; i++) {
2497         for (k = 0; k < BITSPERBYTE; k++) {
2498             if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0)
2499                 continue;
2500 
2501             oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
2502             bzero((void *)oalg, sizeof (*oalg));
2503             oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR;
2504             oalg->alg_prim = k + (BITSPERBYTE * i);
2505             mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
2506         }
2507     }
2508 }
2509 
2510 /*
2511  * Macro to count the number of 1s in a byte (8-bit word). The total count
2512  * is accumulated into the passed-in argument (sum). We could use SPARCv9's
2513  * POPC instruction, but our macro is more flexible for an arbitrary length
2514  * of bytes, such as {auth,encr}_hw_algs. These variables are currently
2515  * 256 bits long (MAX_IPSEC_ALGS), so if we know for sure that the length
2516  * stays that way, we can reduce the number of iterations required.
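 *
 * As a quick worked example of the bit-twiddling below (the input
 * value is chosen only for illustration): COUNT_1S(0xB6, sum) adds
 * 5 to sum, since 0xB6 is 10110110 in binary. The first line adds
 * adjacent bits pairwise (giving 01 10 01 01), the second adds
 * adjacent two-bit sums (giving nibble counts 3 and 2), and the
 * last line adds those two nibble counts into sum.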
2517 */ 2518 #define COUNT_1S(val, sum) { \ 2519 uint8_t x = val & 0xff; \ 2520 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2521 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2522 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2523 } 2524 2525 /* ARGSUSED */ 2526 static void 2527 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2528 { 2529 mblk_t *mp; 2530 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2531 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2532 uint64_t ill_capabilities = ill->ill_capabilities; 2533 int ah_cnt = 0, esp_cnt = 0; 2534 int ah_len = 0, esp_len = 0; 2535 int i, size = 0; 2536 2537 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2538 return; 2539 2540 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2541 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2542 2543 /* Find out the number of ciphers for AH */ 2544 if (cap_ah != NULL) { 2545 for (i = 0; i < cap_ah->algs_size; i++) { 2546 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2547 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2548 } 2549 if (ah_cnt > 0) { 2550 size += sizeof (dl_capability_sub_t) + 2551 sizeof (dl_capab_ipsec_t); 2552 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2553 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2554 size += ah_len; 2555 } 2556 } 2557 2558 /* Find out the number of ciphers for ESP */ 2559 if (cap_esp != NULL) { 2560 for (i = 0; i < cap_esp->algs_size; i++) { 2561 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2562 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2563 } 2564 if (esp_cnt > 0) { 2565 size += sizeof (dl_capability_sub_t) + 2566 sizeof (dl_capab_ipsec_t); 2567 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2568 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2569 size += esp_len; 2570 } 2571 } 2572 2573 if (size == 0) { 2574 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2575 "there's nothing to reset\n")); 2576 return; 2577 } 2578 2579 mp = allocb(size, BPRI_HI); 2580 if (mp == NULL) { 2581 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2582 "request to disable IPSEC Hardware Acceleration\n")); 2583 return; 2584 } 2585 2586 /* 2587 * Clear the capability flags for IPSec HA but retain the ill 2588 * capability structures since it's possible that another thread 2589 * is still referring to them. The structures only get deallocated 2590 * when we destroy the ill. 2591 * 2592 * Various places check the flags to see if the ill is capable of 2593 * hardware acceleration, and by clearing them we ensure that new 2594 * outbound IPSec packets are sent down encrypted. 2595 */ 2596 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2597 2598 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2599 if (ah_cnt > 0) { 2600 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2601 cap_ah, mp); 2602 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2603 } 2604 2605 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2606 if (esp_cnt > 0) { 2607 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2608 cap_esp, mp); 2609 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2610 } 2611 2612 /* 2613 * At this point we've composed a bunch of sub-capabilities to be 2614 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2615 * by the caller. Upon receiving this reset message, the driver 2616 * must stop inbound decryption (by destroying all inbound SAs) 2617 * and let the corresponding packets come in encrypted. 
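 *
 * (Note the pattern shared by all of the *_reset routines in this
 * file: each one appends its sub-capability block to *sc_mp with
 * linkb(), and ill_capability_reset() later flattens the chain with
 * msgpullup() before wrapping it in a single DL_CAPABILITY_REQ.)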
2618  */
2619 
2620     if (*sc_mp != NULL)
2621         linkb(*sc_mp, mp);
2622     else
2623         *sc_mp = mp;
2624 }
2625 
2626 static void
2627 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
2628     boolean_t encapsulated)
2629 {
2630     boolean_t legacy = B_FALSE;
2631 
2632     /*
2633      * If this DL_CAPABILITY_ACK came in as a response to our "reset"
2634      * DL_CAPABILITY_REQ, ignore it during this cycle. We've just
2635      * instructed the driver to disable its advertised capabilities,
2636      * so there's no point in accepting any response at this moment.
2637      */
2638     if (ill->ill_dlpi_capab_state == IDS_UNKNOWN)
2639         return;
2640 
2641     /*
2642      * Note that only the following two sub-capabilities may be
2643      * considered as "legacy", since their original definitions
2644      * do not incorporate the dl_mid_t module ID token, and hence
2645      * may require the use of the wrapper sub-capability.
2646      */
2647     switch (subp->dl_cap) {
2648     case DL_CAPAB_IPSEC_AH:
2649     case DL_CAPAB_IPSEC_ESP:
2650         legacy = B_TRUE;
2651         break;
2652     }
2653 
2654     /*
2655      * For legacy sub-capabilities which don't incorporate a queue_t
2656      * pointer in their structures, discard them if we detect that
2657      * there are intermediate modules in between IP and the driver.
2658      */
2659     if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) {
2660         ip1dbg(("ill_capability_dispatch: unencapsulated capab type "
2661             "%d discarded; %d module(s) present below IP\n",
2662             subp->dl_cap, ill->ill_lmod_cnt));
2663         return;
2664     }
2665 
2666     switch (subp->dl_cap) {
2667     case DL_CAPAB_IPSEC_AH:
2668     case DL_CAPAB_IPSEC_ESP:
2669         ill_capability_ipsec_ack(ill, mp, subp);
2670         break;
2671     case DL_CAPAB_MDT:
2672         ill_capability_mdt_ack(ill, mp, subp);
2673         break;
2674     case DL_CAPAB_HCKSUM:
2675         ill_capability_hcksum_ack(ill, mp, subp);
2676         break;
2677     case DL_CAPAB_ZEROCOPY:
2678         ill_capability_zerocopy_ack(ill, mp, subp);
2679         break;
2680     case DL_CAPAB_POLL:
2681         if (!SOFT_RINGS_ENABLED())
2682             ill_capability_dls_ack(ill, mp, subp);
2683         break;
2684     case DL_CAPAB_SOFT_RING:
2685         if (SOFT_RINGS_ENABLED())
2686             ill_capability_dls_ack(ill, mp, subp);
2687         break;
2688     default:
2689         ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
2690             subp->dl_cap));
2691     }
2692 }
2693 
2694 /*
2695  * As part of negotiating polling capability, the driver tells us
2696  * the default (or normal) blanking interval and packet threshold
2697  * (the receive timer fires if the blanking interval is reached or
2698  * the packet threshold is reached).
2699  *
2700  * As part of manipulating the polling interval, we always use our
2701  * estimated interval (avg service time * number of packets queued
2702  * on the squeue) but we try to blank for a minimum of
2703  * rr_normal_blank_time * rr_max_blank_ratio. We disable the
2704  * packet threshold during this time. When we are not in polling mode
2705  * we set the blank interval typically lower, rr_normal_blank_time *
2706  * rr_min_blank_ratio, but raise the packet count by a ratio of
2707  * rr_min_pkt_cnt_ratio so that we are still getting chains if
2708  * possible, although for a shorter interval.
2709  */
2710 #define RR_MAX_BLANK_RATIO  20
2711 #define RR_MIN_BLANK_RATIO  10
2712 #define RR_MAX_PKT_CNT_RATIO    3
2713 #define RR_MIN_PKT_CNT_RATIO    3
2714 
2715 /*
2716  * These can be tuned via /etc/system.
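 *
 * For example, an administrator could place lines such as the
 * following in /etc/system (the "ip" module name here is an
 * assumption for illustration; a reboot is needed for the settings
 * to take effect):
 *
 *	set ip:rr_max_blank_ratio = 40
 *	set ip:rr_min_pkt_cnt_ratio = 2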
2717  */
2718 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO;
2719 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO;
2720 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO;
2721 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO;
2722 
2723 static mac_resource_handle_t
2724 ill_ring_add(void *arg, mac_resource_t *mrp)
2725 {
2726     ill_t *ill = (ill_t *)arg;
2727     mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp;
2728     ill_rx_ring_t *rx_ring;
2729     int ip_rx_index;
2730 
2731     ASSERT(mrp != NULL);
2732     if (mrp->mr_type != MAC_RX_FIFO) {
2733         return (NULL);
2734     }
2735     ASSERT(ill != NULL);
2736     ASSERT(ill->ill_dls_capab != NULL);
2737 
2738     mutex_enter(&ill->ill_lock);
2739     for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
2740         rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index];
2741         ASSERT(rx_ring != NULL);
2742 
2743         if (rx_ring->rr_ring_state == ILL_RING_FREE) {
2744             time_t normal_blank_time =
2745                 mrfp->mrf_normal_blank_time;
2746             uint_t normal_pkt_cnt =
2747                 mrfp->mrf_normal_pkt_count;
2748 
2749             bzero(rx_ring, sizeof (ill_rx_ring_t));
2750 
2751             rx_ring->rr_blank = mrfp->mrf_blank;
2752             rx_ring->rr_handle = mrfp->mrf_arg;
2753             rx_ring->rr_ill = ill;
2754             rx_ring->rr_normal_blank_time = normal_blank_time;
2755             rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt;
2756 
2757             rx_ring->rr_max_blank_time =
2758                 normal_blank_time * rr_max_blank_ratio;
2759             rx_ring->rr_min_blank_time =
2760                 normal_blank_time * rr_min_blank_ratio;
2761             rx_ring->rr_max_pkt_cnt =
2762                 normal_pkt_cnt * rr_max_pkt_cnt_ratio;
2763             rx_ring->rr_min_pkt_cnt =
2764                 normal_pkt_cnt * rr_min_pkt_cnt_ratio;
2765 
2766             rx_ring->rr_ring_state = ILL_RING_INUSE;
2767             mutex_exit(&ill->ill_lock);
2768 
2769             DTRACE_PROBE2(ill__ring__add, (void *), ill,
2770                 (int), ip_rx_index);
2771             return ((mac_resource_handle_t)rx_ring);
2772         }
2773     }
2774 
2775     /*
2776      * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If
2777      * we have devices which can overwhelm this limit, ILL_MAX_RINGS
2778      * should be made configurable. Meanwhile this causes no panic,
2779      * because the driver will pass ip_input a NULL handle, which
2780      * makes IP allocate the default squeue; polling mode will then
2781      * not be used for this ring.
2782 */ 2783 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2784 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2785 2786 mutex_exit(&ill->ill_lock); 2787 return (NULL); 2788 } 2789 2790 static boolean_t 2791 ill_capability_dls_init(ill_t *ill) 2792 { 2793 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2794 conn_t *connp; 2795 size_t sz; 2796 2797 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2798 if (ill_dls == NULL) { 2799 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2800 "soft_ring enabled for ill=%s (%p) but data " 2801 "structs uninitialized\n", ill->ill_name, 2802 (void *)ill); 2803 } 2804 return (B_TRUE); 2805 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2806 if (ill_dls == NULL) { 2807 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2808 "polling enabled for ill=%s (%p) but data " 2809 "structs uninitialized\n", ill->ill_name, 2810 (void *)ill); 2811 } 2812 return (B_TRUE); 2813 } 2814 2815 if (ill_dls != NULL) { 2816 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2817 /* Soft_Ring or polling is being re-enabled */ 2818 2819 connp = ill_dls->ill_unbind_conn; 2820 ASSERT(rx_ring != NULL); 2821 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2822 bzero((void *)rx_ring, 2823 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2824 ill_dls->ill_ring_tbl = rx_ring; 2825 ill_dls->ill_unbind_conn = connp; 2826 return (B_TRUE); 2827 } 2828 2829 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 2830 return (B_FALSE); 2831 2832 sz = sizeof (ill_dls_capab_t); 2833 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2834 2835 ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2836 if (ill_dls == NULL) { 2837 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2838 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2839 (void *)ill); 2840 CONN_DEC_REF(connp); 2841 return (B_FALSE); 2842 } 2843 2844 /* Allocate space to hold ring table */ 2845 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2846 ill->ill_dls_capab = ill_dls; 2847 ill_dls->ill_unbind_conn = connp; 2848 return (B_TRUE); 2849 } 2850 2851 /* 2852 * ill_capability_dls_disable: disable soft_ring and/or polling 2853 * capability. Since any of the rings might already be in use, need 2854 * to call ipsq_clean_all() which gets behind the squeue to disable 2855 * direct calls if necessary. 
2856 */ 2857 static void 2858 ill_capability_dls_disable(ill_t *ill) 2859 { 2860 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2861 2862 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2863 ipsq_clean_all(ill); 2864 ill_dls->ill_tx = NULL; 2865 ill_dls->ill_tx_handle = NULL; 2866 ill_dls->ill_dls_change_status = NULL; 2867 ill_dls->ill_dls_bind = NULL; 2868 ill_dls->ill_dls_unbind = NULL; 2869 } 2870 2871 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2872 } 2873 2874 static void 2875 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2876 dl_capability_sub_t *isub) 2877 { 2878 uint_t size; 2879 uchar_t *rptr; 2880 dl_capab_dls_t dls, *odls; 2881 ill_dls_capab_t *ill_dls; 2882 mblk_t *nmp = NULL; 2883 dl_capability_req_t *ocap; 2884 uint_t sub_dl_cap = isub->dl_cap; 2885 2886 if (!ill_capability_dls_init(ill)) 2887 return; 2888 ill_dls = ill->ill_dls_capab; 2889 2890 /* Copy locally to get the members aligned */ 2891 bcopy((void *)idls, (void *)&dls, 2892 sizeof (dl_capab_dls_t)); 2893 2894 /* Get the tx function and handle from dld */ 2895 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2896 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2897 2898 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2899 ill_dls->ill_dls_change_status = 2900 (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2901 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2902 ill_dls->ill_dls_unbind = 2903 (ip_dls_unbind_t)dls.dls_ring_unbind; 2904 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2905 } 2906 2907 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2908 isub->dl_length; 2909 2910 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2911 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2912 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2913 ill->ill_name, (void *)ill); 2914 return; 2915 } 2916 2917 /* initialize dl_capability_req_t */ 2918 rptr = nmp->b_rptr; 2919 ocap = (dl_capability_req_t *)rptr; 2920 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2921 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2922 rptr += sizeof (dl_capability_req_t); 2923 2924 /* initialize dl_capability_sub_t */ 2925 bcopy(isub, rptr, sizeof (*isub)); 2926 rptr += sizeof (*isub); 2927 2928 odls = (dl_capab_dls_t *)rptr; 2929 rptr += sizeof (dl_capab_dls_t); 2930 2931 /* initialize dl_capab_dls_t to be sent down */ 2932 dls.dls_rx_handle = (uintptr_t)ill; 2933 dls.dls_rx = (uintptr_t)ip_input; 2934 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2935 2936 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2937 dls.dls_ring_cnt = ip_soft_rings_cnt; 2938 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2939 dls.dls_flags = SOFT_RING_ENABLE; 2940 } else { 2941 dls.dls_flags = POLL_ENABLE; 2942 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2943 "to enable polling\n", ill->ill_name)); 2944 } 2945 bcopy((void *)&dls, (void *)odls, 2946 sizeof (dl_capab_dls_t)); 2947 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2948 /* 2949 * nmp points to a DL_CAPABILITY_REQ message to 2950 * enable either soft_ring or polling 2951 */ 2952 ill_dlpi_send(ill, nmp); 2953 } 2954 2955 static void 2956 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2957 { 2958 mblk_t *mp; 2959 dl_capab_dls_t *idls; 2960 dl_capability_sub_t *dl_subcap; 2961 int size; 2962 2963 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2964 return; 2965 2966 ASSERT(ill->ill_dls_capab != NULL); 2967 2968 size = sizeof (*dl_subcap) + sizeof (*idls); 2969 2970 mp = allocb(size, BPRI_HI); 2971 if (mp 
== NULL) {
2972         ip1dbg(("ill_capability_dls_reset: unable to allocate "
2973             "request to disable soft_ring\n"));
2974         return;
2975     }
2976 
2977     mp->b_wptr = mp->b_rptr + size;
2978 
2979     dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
2980     dl_subcap->dl_length = sizeof (*idls);
2981     if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2982         dl_subcap->dl_cap = DL_CAPAB_SOFT_RING;
2983     else
2984         dl_subcap->dl_cap = DL_CAPAB_POLL;
2985 
2986     idls = (dl_capab_dls_t *)(dl_subcap + 1);
2987     if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2988         idls->dls_flags = SOFT_RING_DISABLE;
2989     else
2990         idls->dls_flags = POLL_DISABLE;
2991 
2992     if (*sc_mp != NULL)
2993         linkb(*sc_mp, mp);
2994     else
2995         *sc_mp = mp;
2996 }
2997 
2998 /*
2999  * Process a soft_ring/poll capability negotiation ack received
3000  * from a DLS Provider. isub must point to the sub-capability
3001  * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message.
3002  */
3003 static void
3004 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3005 {
3006     dl_capab_dls_t *idls;
3007     uint_t sub_dl_cap = isub->dl_cap;
3008     uint8_t *capend;
3009 
3010     ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING ||
3011         sub_dl_cap == DL_CAPAB_POLL);
3012 
3013     if (ill->ill_isv6)
3014         return;
3015 
3016     /*
3017      * Note: range checks here are not absolutely sufficient to
3018      * make us robust against malformed messages sent by drivers;
3019      * this is in keeping with the rest of IP's dlpi handling.
3020      * (Remember, it's coming from something else in the kernel
3021      * address space)
3022      */
3023     capend = (uint8_t *)(isub + 1) + isub->dl_length;
3024     if (capend > mp->b_wptr) {
3025         cmn_err(CE_WARN, "ill_capability_dls_ack: "
3026             "malformed sub-capability too long for mblk");
3027         return;
3028     }
3029 
3030     /*
3031      * There are two types of acks we process here:
3032      * 1. acks in reply to a (first form) generic capability req
3033      *    (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE)
3034      * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE
3035      *    capability req.
3036      */
3037     idls = (dl_capab_dls_t *)(isub + 1);
3038 
3039     if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) {
3040         ip1dbg(("ill_capability_dls_ack: mid token for dls "
3041             "capability isn't as expected; pass-thru "
3042             "module(s) detected, discarding capability\n"));
3043         if (ill->ill_capabilities & ILL_CAPAB_DLS) {
3044             /*
3045              * This is a capability renegotiation case.
3046              * The interface had better be unusable at this
3047              * point; otherwise bad things will happen
3048              * if we disable direct calls on a running
3049              * and up interface.
3050              */
3051             ill_capability_dls_disable(ill);
3052         }
3053         return;
3054     }
3055 
3056     switch (idls->dls_flags) {
3057     default:
3058         /* Disable if unknown flag */
3059     case SOFT_RING_DISABLE:
3060     case POLL_DISABLE:
3061         ill_capability_dls_disable(ill);
3062         break;
3063     case SOFT_RING_CAPABLE:
3064     case POLL_CAPABLE:
3065         /*
3066          * If the capability was already enabled, it's safe
3067          * to disable it first to get rid of stale information
3068          * and then start enabling it again.
3069          */
3070         ill_capability_dls_disable(ill);
3071         ill_capability_dls_capable(ill, idls, isub);
3072         break;
3073     case SOFT_RING_ENABLE:
3074     case POLL_ENABLE:
3075         mutex_enter(&ill->ill_lock);
3076         if (sub_dl_cap == DL_CAPAB_SOFT_RING &&
3077             !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) {
3078             ASSERT(ill->ill_dls_capab != NULL);
3079             ill->ill_capabilities |= ILL_CAPAB_SOFT_RING;
3080         }
3081         if (sub_dl_cap == DL_CAPAB_POLL &&
3082             !(ill->ill_capabilities & ILL_CAPAB_POLL)) {
3083             ASSERT(ill->ill_dls_capab != NULL);
3084             ill->ill_capabilities |= ILL_CAPAB_POLL;
3085             ip1dbg(("ill_capability_dls_ack: interface %s "
3086                 "has enabled polling\n", ill->ill_name));
3087         }
3088         mutex_exit(&ill->ill_lock);
3089         break;
3090     }
3091 }
3092 
3093 /*
3094  * Process a hardware checksum offload capability negotiation ack received
3095  * from a DLS Provider. isub must point to the sub-capability
3096  * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
3097  */
3098 static void
3099 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
3100 {
3101     dl_capability_req_t *ocap;
3102     dl_capab_hcksum_t *ihck, *ohck;
3103     ill_hcksum_capab_t **ill_hcksum;
3104     mblk_t *nmp = NULL;
3105     uint_t sub_dl_cap = isub->dl_cap;
3106     uint8_t *capend;
3107 
3108     ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);
3109 
3110     ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;
3111 
3112     /*
3113      * Note: range checks here are not absolutely sufficient to
3114      * make us robust against malformed messages sent by drivers;
3115      * this is in keeping with the rest of IP's dlpi handling.
3116      * (Remember, it's coming from something else in the kernel
3117      * address space)
3118      */
3119     capend = (uint8_t *)(isub + 1) + isub->dl_length;
3120     if (capend > mp->b_wptr) {
3121         cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
3122             "malformed sub-capability too long for mblk");
3123         return;
3124     }
3125 
3126     /*
3127      * There are two types of acks we process here:
3128      * 1. acks in reply to a (first form) generic capability req
3129      *    (no ENABLE flag set)
3130      * 2. acks in reply to an ENABLE capability req.
3131 * (ENABLE flag set) 3132 */ 3133 ihck = (dl_capab_hcksum_t *)(isub + 1); 3134 3135 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3136 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3137 "unsupported hardware checksum " 3138 "sub-capability (version %d, expected %d)", 3139 ihck->hcksum_version, HCKSUM_VERSION_1); 3140 return; 3141 } 3142 3143 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3144 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3145 "checksum capability isn't as expected; pass-thru " 3146 "module(s) detected, discarding capability\n")); 3147 return; 3148 } 3149 3150 #define CURR_HCKSUM_CAPAB \ 3151 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3152 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3153 3154 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3155 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3156 /* do ENABLE processing */ 3157 if (*ill_hcksum == NULL) { 3158 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3159 KM_NOSLEEP); 3160 3161 if (*ill_hcksum == NULL) { 3162 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3163 "could not enable hcksum version %d " 3164 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3165 ill->ill_name); 3166 return; 3167 } 3168 } 3169 3170 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3171 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3172 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3173 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3174 "has enabled hardware checksumming\n ", 3175 ill->ill_name)); 3176 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3177 /* 3178 * Enabling hardware checksum offload 3179 * Currently IP supports {TCP,UDP}/IPv4 3180 * partial and full cksum offload and 3181 * IPv4 header checksum offload. 3182 * Allocate new mblk which will 3183 * contain a new capability request 3184 * to enable hardware checksum offload. 3185 */ 3186 uint_t size; 3187 uchar_t *rptr; 3188 3189 size = sizeof (dl_capability_req_t) + 3190 sizeof (dl_capability_sub_t) + isub->dl_length; 3191 3192 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3193 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3194 "could not enable hardware cksum for %s (ENOMEM)\n", 3195 ill->ill_name); 3196 return; 3197 } 3198 3199 rptr = nmp->b_rptr; 3200 /* initialize dl_capability_req_t */ 3201 ocap = (dl_capability_req_t *)nmp->b_rptr; 3202 ocap->dl_sub_offset = 3203 sizeof (dl_capability_req_t); 3204 ocap->dl_sub_length = 3205 sizeof (dl_capability_sub_t) + 3206 isub->dl_length; 3207 nmp->b_rptr += sizeof (dl_capability_req_t); 3208 3209 /* initialize dl_capability_sub_t */ 3210 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3211 nmp->b_rptr += sizeof (*isub); 3212 3213 /* initialize dl_capab_hcksum_t */ 3214 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3215 bcopy(ihck, ohck, sizeof (*ihck)); 3216 3217 nmp->b_rptr = rptr; 3218 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3219 3220 /* Set ENABLE flag */ 3221 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3222 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3223 3224 /* 3225 * nmp points to a DL_CAPABILITY_REQ message to enable 3226 * hardware checksum acceleration. 
3227 */ 3228 ill_dlpi_send(ill, nmp); 3229 } else { 3230 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3231 "advertised %x hardware checksum capability flags\n", 3232 ill->ill_name, ihck->hcksum_txflags)); 3233 } 3234 } 3235 3236 static void 3237 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3238 { 3239 mblk_t *mp; 3240 dl_capab_hcksum_t *hck_subcap; 3241 dl_capability_sub_t *dl_subcap; 3242 int size; 3243 3244 if (!ILL_HCKSUM_CAPABLE(ill)) 3245 return; 3246 3247 ASSERT(ill->ill_hcksum_capab != NULL); 3248 /* 3249 * Clear the capability flag for hardware checksum offload but 3250 * retain the ill_hcksum_capab structure since it's possible that 3251 * another thread is still referring to it. The structure only 3252 * gets deallocated when we destroy the ill. 3253 */ 3254 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3255 3256 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3257 3258 mp = allocb(size, BPRI_HI); 3259 if (mp == NULL) { 3260 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3261 "request to disable hardware checksum offload\n")); 3262 return; 3263 } 3264 3265 mp->b_wptr = mp->b_rptr + size; 3266 3267 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3268 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3269 dl_subcap->dl_length = sizeof (*hck_subcap); 3270 3271 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3272 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3273 hck_subcap->hcksum_txflags = 0; 3274 3275 if (*sc_mp != NULL) 3276 linkb(*sc_mp, mp); 3277 else 3278 *sc_mp = mp; 3279 } 3280 3281 static void 3282 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3283 { 3284 mblk_t *nmp = NULL; 3285 dl_capability_req_t *oc; 3286 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3287 ill_zerocopy_capab_t **ill_zerocopy_capab; 3288 uint_t sub_dl_cap = isub->dl_cap; 3289 uint8_t *capend; 3290 3291 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3292 3293 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 3294 3295 /* 3296 * Note: range checks here are not absolutely sufficient to 3297 * make us robust against malformed messages sent by drivers; 3298 * this is in keeping with the rest of IP's dlpi handling. 
3299 * (Remember, it's coming from something else in the kernel 3300 * address space) 3301 */ 3302 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3303 if (capend > mp->b_wptr) { 3304 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3305 "malformed sub-capability too long for mblk"); 3306 return; 3307 } 3308 3309 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3310 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3311 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3312 "unsupported ZEROCOPY sub-capability (version %d, " 3313 "expected %d)", zc_ic->zerocopy_version, 3314 ZEROCOPY_VERSION_1); 3315 return; 3316 } 3317 3318 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3319 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3320 "capability isn't as expected; pass-thru module(s) " 3321 "detected, discarding capability\n")); 3322 return; 3323 } 3324 3325 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3326 if (*ill_zerocopy_capab == NULL) { 3327 *ill_zerocopy_capab = 3328 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3329 KM_NOSLEEP); 3330 3331 if (*ill_zerocopy_capab == NULL) { 3332 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3333 "could not enable Zero-copy version %d " 3334 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3335 ill->ill_name); 3336 return; 3337 } 3338 } 3339 3340 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3341 "supports Zero-copy version %d\n", ill->ill_name, 3342 ZEROCOPY_VERSION_1)); 3343 3344 (*ill_zerocopy_capab)->ill_zerocopy_version = 3345 zc_ic->zerocopy_version; 3346 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3347 zc_ic->zerocopy_flags; 3348 3349 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3350 } else { 3351 uint_t size; 3352 uchar_t *rptr; 3353 3354 size = sizeof (dl_capability_req_t) + 3355 sizeof (dl_capability_sub_t) + 3356 sizeof (dl_capab_zerocopy_t); 3357 3358 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3359 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3360 "could not enable zerocopy for %s (ENOMEM)\n", 3361 ill->ill_name); 3362 return; 3363 } 3364 3365 rptr = nmp->b_rptr; 3366 /* initialize dl_capability_req_t */ 3367 oc = (dl_capability_req_t *)rptr; 3368 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3369 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3370 sizeof (dl_capab_zerocopy_t); 3371 rptr += sizeof (dl_capability_req_t); 3372 3373 /* initialize dl_capability_sub_t */ 3374 bcopy(isub, rptr, sizeof (*isub)); 3375 rptr += sizeof (*isub); 3376 3377 /* initialize dl_capab_zerocopy_t */ 3378 zc_oc = (dl_capab_zerocopy_t *)rptr; 3379 *zc_oc = *zc_ic; 3380 3381 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3382 "to enable zero-copy version %d\n", ill->ill_name, 3383 ZEROCOPY_VERSION_1)); 3384 3385 /* set VMSAFE_MEM flag */ 3386 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3387 3388 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3389 ill_dlpi_send(ill, nmp); 3390 } 3391 } 3392 3393 static void 3394 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3395 { 3396 mblk_t *mp; 3397 dl_capab_zerocopy_t *zerocopy_subcap; 3398 dl_capability_sub_t *dl_subcap; 3399 int size; 3400 3401 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3402 return; 3403 3404 ASSERT(ill->ill_zerocopy_capab != NULL); 3405 /* 3406 * Clear the capability flag for Zero-copy but retain the 3407 * ill_zerocopy_capab structure since it's possible that another 3408 * thread is still referring to it. The structure only gets 3409 * deallocated when we destroy the ill. 
3410      */
3411     ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY;
3412 
3413     size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
3414 
3415     mp = allocb(size, BPRI_HI);
3416     if (mp == NULL) {
3417         ip1dbg(("ill_capability_zerocopy_reset: unable to allocate "
3418             "request to disable Zero-copy\n"));
3419         return;
3420     }
3421 
3422     mp->b_wptr = mp->b_rptr + size;
3423 
3424     dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
3425     dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
3426     dl_subcap->dl_length = sizeof (*zerocopy_subcap);
3427 
3428     zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
3429     zerocopy_subcap->zerocopy_version =
3430         ill->ill_zerocopy_capab->ill_zerocopy_version;
3431     zerocopy_subcap->zerocopy_flags = 0;
3432 
3433     if (*sc_mp != NULL)
3434         linkb(*sc_mp, mp);
3435     else
3436         *sc_mp = mp;
3437 }
3438 
3439 /*
3440  * Consume a new-style hardware capabilities negotiation ack.
3441  * Called from ip_rput_dlpi_writer().
3442  */
3443 void
3444 ill_capability_ack(ill_t *ill, mblk_t *mp)
3445 {
3446     dl_capability_ack_t *capp;
3447     dl_capability_sub_t *subp, *endp;
3448 
3449     if (ill->ill_dlpi_capab_state == IDS_INPROGRESS)
3450         ill->ill_dlpi_capab_state = IDS_OK;
3451 
3452     capp = (dl_capability_ack_t *)mp->b_rptr;
3453 
3454     if (capp->dl_sub_length == 0)
3455         /* no new-style capabilities */
3456         return;
3457 
3458     /* make sure the driver supplied correct dl_sub_length */
3459     if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
3460         ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
3461             "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
3462         return;
3463     }
3464 
3465 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
3466     /*
3467      * There are sub-capabilities. Process the ones we know about.
3468      * Loop until we don't have room for another sub-cap header.
3469      */
3470     for (subp = SC(capp, capp->dl_sub_offset),
3471         endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
3472         subp <= endp;
3473         subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
3474 
3475         switch (subp->dl_cap) {
3476         case DL_CAPAB_ID_WRAPPER:
3477             ill_capability_id_ack(ill, mp, subp);
3478             break;
3479         default:
3480             ill_capability_dispatch(ill, mp, subp, B_FALSE);
3481             break;
3482         }
3483     }
3484 #undef SC
3485 }
3486 
3487 /*
3488  * This routine is called to scan the fragmentation reassembly table for
3489  * the specified ILL for any packets that are starting to smell.
3490  * dead_interval is the maximum time in seconds that will be tolerated. It
3491  * will either be the value specified in ip_g_frag_timeout, or zero if the
3492  * ILL is shutting down and it is time to blow everything off.
3493  *
3494  * It returns the number of seconds (as a time_t) that the next frag timer
3495  * should be scheduled for, 0 meaning that the timer doesn't need to be
3496  * restarted. Note that the method of calculating next_timeout isn't
3497  * entirely accurate since time will flow between the time we grab
3498  * current_time and the time we schedule the next timeout. This isn't a
3499  * big problem since this is the timer for sending ICMP reassembly time
3500  * exceeded messages, and it doesn't have to be exactly accurate.
3501  *
3502  * This function is
3503  * sometimes called as writer, although this is not required.
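 *
 * As an illustration (with invented numbers): given a dead_interval
 * of 60 seconds, a bucket whose oldest fragment chain was created 45
 * seconds ago contributes a candidate timeout of 15 seconds, and the
 * smallest such candidate across all buckets is what gets returned.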
3504  */
3505 time_t
3506 ill_frag_timeout(ill_t *ill, time_t dead_interval)
3507 {
3508     ipfb_t *ipfb;
3509     ipfb_t *endp;
3510     ipf_t *ipf;
3511     ipf_t *ipfnext;
3512     mblk_t *mp;
3513     time_t current_time = gethrestime_sec();
3514     time_t next_timeout = 0;
3515     uint32_t hdr_length;
3516     mblk_t *send_icmp_head;
3517     mblk_t *send_icmp_head_v6;
3518     zoneid_t zoneid;
3519 
3520     ipfb = ill->ill_frag_hash_tbl;
3521     if (ipfb == NULL)
3522         return (0);
3523     endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
3524     /* Walk the frag hash table. */
3525     for (; ipfb < endp; ipfb++) {
3526         send_icmp_head = NULL;
3527         send_icmp_head_v6 = NULL;
3528         mutex_enter(&ipfb->ipfb_lock);
3529         while ((ipf = ipfb->ipfb_ipf) != NULL) {
3530             time_t frag_time = current_time - ipf->ipf_timestamp;
3531             time_t frag_timeout;
3532 
3533             if (frag_time < dead_interval) {
3534                 /*
3535                  * There are some outstanding fragments
3536                  * that will timeout later. Make note of
3537                  * the time so that we can reschedule the
3538                  * next timeout appropriately.
3539                  */
3540                 frag_timeout = dead_interval - frag_time;
3541                 if (next_timeout == 0 ||
3542                     frag_timeout < next_timeout) {
3543                     next_timeout = frag_timeout;
3544                 }
3545                 break;
3546             }
3547             /* Time's up. Get it out of here. */
3548             hdr_length = ipf->ipf_nf_hdr_len;
3549             ipfnext = ipf->ipf_hash_next;
3550             if (ipfnext)
3551                 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
3552             *ipf->ipf_ptphn = ipfnext;
3553             mp = ipf->ipf_mp->b_cont;
3554             for (; mp; mp = mp->b_cont) {
3555                 /* Extra points for neatness. */
3556                 IP_REASS_SET_START(mp, 0);
3557                 IP_REASS_SET_END(mp, 0);
3558             }
3559             mp = ipf->ipf_mp->b_cont;
3560             ill->ill_frag_count -= ipf->ipf_count;
3561             ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
3562             ipfb->ipfb_count -= ipf->ipf_count;
3563             ASSERT(ipfb->ipfb_frag_pkts > 0);
3564             ipfb->ipfb_frag_pkts--;
3565             /*
3566              * We do not send any icmp message from here because
3567              * we currently are holding the ipfb_lock for this
3568              * hash chain. If we try to send any icmp messages
3569              * from here we may end up via a put back into ip
3570              * trying to get the same lock, causing a recursive
3571              * mutex panic. Instead we build a list and send all
3572              * the icmp messages after we have dropped the lock.
3573              */
3574             if (ill->ill_isv6) {
3575                 BUMP_MIB(ill->ill_ip6_mib, ipv6ReasmFails);
3576                 if (hdr_length != 0) {
3577                     mp->b_next = send_icmp_head_v6;
3578                     send_icmp_head_v6 = mp;
3579                 } else {
3580                     freemsg(mp);
3581                 }
3582             } else {
3583                 BUMP_MIB(&ip_mib, ipReasmFails);
3584                 if (hdr_length != 0) {
3585                     mp->b_next = send_icmp_head;
3586                     send_icmp_head = mp;
3587                 } else {
3588                     freemsg(mp);
3589                 }
3590             }
3591             freeb(ipf->ipf_mp);
3592         }
3593         mutex_exit(&ipfb->ipfb_lock);
3594         /*
3595          * Now we need to send any icmp messages that we delayed from
3596          * above.
3597 */ 3598 while (send_icmp_head_v6 != NULL) { 3599 ip6_t *ip6h; 3600 3601 mp = send_icmp_head_v6; 3602 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3603 mp->b_next = NULL; 3604 if (mp->b_datap->db_type == M_CTL) 3605 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3606 else 3607 ip6h = (ip6_t *)mp->b_rptr; 3608 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3609 ill); 3610 if (zoneid == ALL_ZONES) { 3611 freemsg(mp); 3612 } else { 3613 icmp_time_exceeded_v6(ill->ill_wq, mp, 3614 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3615 B_FALSE, zoneid); 3616 } 3617 } 3618 while (send_icmp_head != NULL) { 3619 ipaddr_t dst; 3620 3621 mp = send_icmp_head; 3622 send_icmp_head = send_icmp_head->b_next; 3623 mp->b_next = NULL; 3624 3625 if (mp->b_datap->db_type == M_CTL) 3626 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3627 else 3628 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3629 3630 zoneid = ipif_lookup_addr_zoneid(dst, ill); 3631 if (zoneid == ALL_ZONES) { 3632 freemsg(mp); 3633 } else { 3634 icmp_time_exceeded(ill->ill_wq, mp, 3635 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid); 3636 } 3637 } 3638 } 3639 /* 3640 * A non-dying ILL will use the return value to decide whether to 3641 * restart the frag timer, and for how long. 3642 */ 3643 return (next_timeout); 3644 } 3645 3646 /* 3647 * This routine is called when the approximate count of mblk memory used 3648 * for the specified ILL has exceeded max_count. 3649 */ 3650 void 3651 ill_frag_prune(ill_t *ill, uint_t max_count) 3652 { 3653 ipfb_t *ipfb; 3654 ipf_t *ipf; 3655 size_t count; 3656 3657 /* 3658 * If we are here within ip_min_frag_prune_time msecs remove 3659 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3660 * ill_frag_free_num_pkts. 3661 */ 3662 mutex_enter(&ill->ill_lock); 3663 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3664 (ip_min_frag_prune_time != 0 ? 3665 ip_min_frag_prune_time : msec_per_tick)) { 3666 3667 ill->ill_frag_free_num_pkts++; 3668 3669 } else { 3670 ill->ill_frag_free_num_pkts = 0; 3671 } 3672 ill->ill_last_frag_clean_time = lbolt; 3673 mutex_exit(&ill->ill_lock); 3674 3675 /* 3676 * free ill_frag_free_num_pkts oldest packets from each bucket. 3677 */ 3678 if (ill->ill_frag_free_num_pkts != 0) { 3679 int ix; 3680 3681 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3682 ipfb = &ill->ill_frag_hash_tbl[ix]; 3683 mutex_enter(&ipfb->ipfb_lock); 3684 if (ipfb->ipfb_ipf != NULL) { 3685 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3686 ill->ill_frag_free_num_pkts); 3687 } 3688 mutex_exit(&ipfb->ipfb_lock); 3689 } 3690 } 3691 /* 3692 * While the reassembly list for this ILL is too big, prune a fragment 3693 * queue by age, oldest first. Note that the per ILL count is 3694 * approximate, while the per frag hash bucket counts are accurate. 
3695 */ 3696 while (ill->ill_frag_count > max_count) { 3697 int ix; 3698 ipfb_t *oipfb = NULL; 3699 uint_t oldest = UINT_MAX; 3700 3701 count = 0; 3702 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3703 ipfb = &ill->ill_frag_hash_tbl[ix]; 3704 mutex_enter(&ipfb->ipfb_lock); 3705 ipf = ipfb->ipfb_ipf; 3706 if (ipf != NULL && ipf->ipf_gen < oldest) { 3707 oldest = ipf->ipf_gen; 3708 oipfb = ipfb; 3709 } 3710 count += ipfb->ipfb_count; 3711 mutex_exit(&ipfb->ipfb_lock); 3712 } 3713 /* Refresh the per ILL count */ 3714 ill->ill_frag_count = count; 3715 if (oipfb == NULL) { 3716 ill->ill_frag_count = 0; 3717 break; 3718 } 3719 if (count <= max_count) 3720 return; /* Somebody beat us to it, nothing to do */ 3721 mutex_enter(&oipfb->ipfb_lock); 3722 ipf = oipfb->ipfb_ipf; 3723 if (ipf != NULL) { 3724 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3725 } 3726 mutex_exit(&oipfb->ipfb_lock); 3727 } 3728 } 3729 3730 /* 3731 * free 'free_cnt' fragmented packets starting at ipf. 3732 */ 3733 void 3734 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3735 { 3736 size_t count; 3737 mblk_t *mp; 3738 mblk_t *tmp; 3739 ipf_t **ipfp = ipf->ipf_ptphn; 3740 3741 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3742 ASSERT(ipfp != NULL); 3743 ASSERT(ipf != NULL); 3744 3745 while (ipf != NULL && free_cnt-- > 0) { 3746 count = ipf->ipf_count; 3747 mp = ipf->ipf_mp; 3748 ipf = ipf->ipf_hash_next; 3749 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3750 IP_REASS_SET_START(tmp, 0); 3751 IP_REASS_SET_END(tmp, 0); 3752 } 3753 ill->ill_frag_count -= count; 3754 ASSERT(ipfb->ipfb_count >= count); 3755 ipfb->ipfb_count -= count; 3756 ASSERT(ipfb->ipfb_frag_pkts > 0); 3757 ipfb->ipfb_frag_pkts--; 3758 freemsg(mp); 3759 BUMP_MIB(&ip_mib, ipReasmFails); 3760 } 3761 3762 if (ipf) 3763 ipf->ipf_ptphn = ipfp; 3764 ipfp[0] = ipf; 3765 } 3766 3767 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3768 "obsolete and may be removed in a future release of Solaris. Use " \ 3769 "ifconfig(1M) to manipulate the forwarding status of an interface." 3770 3771 /* 3772 * For obsolete per-interface forwarding configuration; 3773 * called in response to ND_GET. 3774 */ 3775 /* ARGSUSED */ 3776 static int 3777 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3778 { 3779 ill_t *ill = (ill_t *)cp; 3780 3781 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3782 3783 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3784 return (0); 3785 } 3786 3787 /* 3788 * For obsolete per-interface forwarding configuration; 3789 * called in response to ND_SET. 3790 */ 3791 /* ARGSUSED */ 3792 static int 3793 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3794 cred_t *ioc_cr) 3795 { 3796 long value; 3797 int retval; 3798 3799 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3800 3801 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3802 value < 0 || value > 1) { 3803 return (EINVAL); 3804 } 3805 3806 rw_enter(&ill_g_lock, RW_READER); 3807 retval = ill_forward_set(q, mp, (value != 0), cp); 3808 rw_exit(&ill_g_lock); 3809 return (retval); 3810 } 3811 3812 /* 3813 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3814 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3815 * up RTS_IFINFO routing socket messages for each interface whose flags we 3816 * change. 
3817 */ 3818 /* ARGSUSED */ 3819 int 3820 ill_forward_set(queue_t *q, mblk_t *mp, boolean_t enable, caddr_t cp) 3821 { 3822 ill_t *ill = (ill_t *)cp; 3823 ill_group_t *illgrp; 3824 3825 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ill_g_lock)); 3826 3827 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3828 (!enable && !(ill->ill_flags & ILLF_ROUTER)) || 3829 (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) 3830 return (EINVAL); 3831 3832 /* 3833 * If the ill is in an IPMP group, set the forwarding policy on all 3834 * members of the group to the same value. 3835 */ 3836 illgrp = ill->ill_group; 3837 if (illgrp != NULL) { 3838 ill_t *tmp_ill; 3839 3840 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 3841 tmp_ill = tmp_ill->ill_group_next) { 3842 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3843 (enable ? "Enabling" : "Disabling"), 3844 (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), 3845 tmp_ill->ill_name)); 3846 mutex_enter(&tmp_ill->ill_lock); 3847 if (enable) 3848 tmp_ill->ill_flags |= ILLF_ROUTER; 3849 else 3850 tmp_ill->ill_flags &= ~ILLF_ROUTER; 3851 mutex_exit(&tmp_ill->ill_lock); 3852 if (tmp_ill->ill_isv6) 3853 ill_set_nce_router_flags(tmp_ill, enable); 3854 /* Notify routing socket listeners of this change. */ 3855 ip_rts_ifmsg(tmp_ill->ill_ipif); 3856 } 3857 } else { 3858 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3859 (enable ? "Enabling" : "Disabling"), 3860 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3861 mutex_enter(&ill->ill_lock); 3862 if (enable) 3863 ill->ill_flags |= ILLF_ROUTER; 3864 else 3865 ill->ill_flags &= ~ILLF_ROUTER; 3866 mutex_exit(&ill->ill_lock); 3867 if (ill->ill_isv6) 3868 ill_set_nce_router_flags(ill, enable); 3869 /* Notify routing socket listeners of this change. */ 3870 ip_rts_ifmsg(ill->ill_ipif); 3871 } 3872 3873 return (0); 3874 } 3875 3876 /* 3877 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3878 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3879 * set or clear. 3880 */ 3881 static void 3882 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3883 { 3884 ipif_t *ipif; 3885 nce_t *nce; 3886 3887 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3888 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 3889 if (nce != NULL) { 3890 mutex_enter(&nce->nce_lock); 3891 if (enable) 3892 nce->nce_flags |= NCE_F_ISROUTER; 3893 else 3894 nce->nce_flags &= ~NCE_F_ISROUTER; 3895 mutex_exit(&nce->nce_lock); 3896 NCE_REFRELE(nce); 3897 } 3898 } 3899 } 3900 3901 /* 3902 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3903 * for this ill. Make sure the v6/v4 question has been answered about this 3904 * ill. The creation of this ndd variable is only for backwards compatibility. 3905 * The preferred way to control per-interface IP forwarding is through the 3906 * ILLF_ROUTER interface flag. 3907 */ 3908 static int 3909 ill_set_ndd_name(ill_t *ill) 3910 { 3911 char *suffix; 3912 3913 ASSERT(IAM_WRITER_ILL(ill)); 3914 3915 if (ill->ill_isv6) 3916 suffix = ipv6_forward_suffix; 3917 else 3918 suffix = ipv4_forward_suffix; 3919 3920 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 3921 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 3922 /* 3923 * Copies over the '\0'. 3924 * Note that strlen(suffix) is always bounded. 3925 */ 3926 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 3927 strlen(suffix) + 1); 3928 3929 /* 3930 * Use of the nd table requires holding the reader lock. 
3931 * Modifying the nd table thru nd_load/nd_unload requires 3932 * the writer lock. 3933 */ 3934 rw_enter(&ip_g_nd_lock, RW_WRITER); 3935 if (!nd_load(&ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 3936 nd_ill_forward_set, (caddr_t)ill)) { 3937 /* 3938 * If the nd_load failed, it only meant that it could not 3939 * allocate a new bunch of room for further NDD expansion. 3940 * Because of that, the ill_ndd_name will be set to 0, and 3941 * this interface is at the mercy of the global ip_forwarding 3942 * variable. 3943 */ 3944 rw_exit(&ip_g_nd_lock); 3945 ill->ill_ndd_name = NULL; 3946 return (ENOMEM); 3947 } 3948 rw_exit(&ip_g_nd_lock); 3949 return (0); 3950 } 3951 3952 /* 3953 * Initializes the context structure and returns the first ill in the list. 3954 * Currently start_list and end_list can have the values: 3955 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 3956 * IP_V4_G_HEAD Traverse IPV4 list only. 3957 * IP_V6_G_HEAD Traverse IPV6 list only. 3958 */ 3959 3960 /* 3961 * We don't check for CONDEMNED ills here. Caller must do that if 3962 * necessary under the ill lock. 3963 */ 3964 ill_t * 3965 ill_first(int start_list, int end_list, ill_walk_context_t *ctx) 3966 { 3967 ill_if_t *ifp; 3968 ill_t *ill; 3969 avl_tree_t *avl_tree; 3970 3971 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3972 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3973 3974 /* 3975 * setup the lists to search 3976 */ 3977 if (end_list != MAX_G_HEADS) { 3978 ctx->ctx_current_list = start_list; 3979 ctx->ctx_last_list = end_list; 3980 } else { 3981 ctx->ctx_last_list = MAX_G_HEADS - 1; 3982 ctx->ctx_current_list = 0; 3983 } 3984 3985 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3986 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 3987 if (ifp != (ill_if_t *) 3988 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 3989 avl_tree = &ifp->illif_avl_by_ppa; 3990 ill = avl_first(avl_tree); 3991 /* 3992 * ill is guaranteed to be non-NULL, or ifp would 3993 * not exist. 3994 */ 3995 ASSERT(ill != NULL); 3996 return (ill); 3997 } 3998 ctx->ctx_current_list++; 3999 } 4000 4001 return (NULL); 4002 } 4003 4004 /* 4005 * Returns the next ill in the list. ill_first() must have been called 4006 * before calling ill_next() or bad things will happen. 4007 */ 4008 4009 /* 4010 * We don't check for CONDEMNED ills here. Caller must do that if 4011 * necessary under the ill lock. 4012 */ 4013 ill_t * 4014 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 4015 { 4016 ill_if_t *ifp; 4017 ill_t *ill; 4018 4019 4020 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4021 ASSERT(lastill->ill_ifptr != (ill_if_t *) 4022 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)); 4023 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 4024 AVL_AFTER)) != NULL) { 4025 return (ill); 4026 } 4027 4028 /* go to the next ill_if_t in the list. */ 4029 ifp = lastill->ill_ifptr->illif_next; 4030 4031 /* make sure we are not at the end of the circular list */ 4032 while (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 4033 if (++ctx->ctx_current_list > ctx->ctx_last_list) 4034 return (NULL); 4035 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 4036 } 4037 4038 return (avl_first(&ifp->illif_avl_by_ppa)); 4039 } 4040
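/*
 * Illustrative sketch (not part of the original source): the canonical
 * walk pattern for ill_first()/ill_next(), holding ill_g_lock across
 * the whole traversal as both functions assert.  Compiled out; the
 * function name is hypothetical.
 */
#ifdef ILL_WALK_EXAMPLE
static uint_t
ill_count_example(void)
{
	ill_t *ill;
	ill_walk_context_t ctx;
	uint_t count = 0;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_ALL(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill))
		count++;
	rw_exit(&ill_g_lock);
	return (count);
}
#endif	/* ILL_WALK_EXAMPLE */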
4041 /* 4042 * Check the interface name for the correct format, which is name+ppa. 4043 * The name can contain characters and digits; the rightmost digits 4044 * make up the ppa number. Use of octal is not allowed, and the name 4045 * must contain a ppa. Return a pointer to the start of the ppa. 4046 * In case of error return NULL. 4047 */ 4048 static char * 4049 ill_get_ppa_ptr(char *name) 4050 { 4051 int namelen = mi_strlen(name); 4052 4053 int len = namelen; 4054 4055 name += len; 4056 while (len > 0) { 4057 name--; 4058 if (*name < '0' || *name > '9') 4059 break; 4060 len--; 4061 } 4062 4063 /* empty string, all digits, or no trailing digits */ 4064 if (len == 0 || len == (int)namelen) 4065 return (NULL); 4066 4067 name++; 4068 /* check for attempted use of octal */ 4069 if (*name == '0' && len != (int)namelen - 1) 4070 return (NULL); 4071 return (name); 4072 } 4073 4074 /* 4075 * use avl tree to locate the ill. 4076 */ 4077 static ill_t * 4078 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4079 ipsq_func_t func, int *error) 4080 { 4081 char *ppa_ptr = NULL; 4082 int len; 4083 uint_t ppa; 4084 ill_t *ill = NULL; 4085 ill_if_t *ifp; 4086 int list; 4087 ipsq_t *ipsq; 4088 4089 if (error != NULL) 4090 *error = 0; 4091 4092 /* 4093 * get ppa ptr 4094 */ 4095 if (isv6) 4096 list = IP_V6_G_HEAD; 4097 else 4098 list = IP_V4_G_HEAD; 4099 4100 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4101 if (error != NULL) 4102 *error = ENXIO; 4103 return (NULL); 4104 } 4105 4106 len = ppa_ptr - name + 1; 4107 4108 ppa = stoi(&ppa_ptr); 4109 4110 ifp = IP_VX_ILL_G_LIST(list); 4111 4112 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4113 /* 4114 * match is done on len - 1 as the name is not null-terminated; 4115 * it contains the ppa in addition to the interface 4116 * name. 4117 */ 4118 if ((ifp->illif_name_len == len) && 4119 bcmp(ifp->illif_name, name, len - 1) == 0) { 4120 break; 4121 } else { 4122 ifp = ifp->illif_next; 4123 } 4124 } 4125 4126 4127 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4128 /* 4129 * Even the interface type does not exist. 4130 */ 4131 if (error != NULL) 4132 *error = ENXIO; 4133 return (NULL); 4134 } 4135 4136 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4137 if (ill != NULL) { 4138 /* 4139 * The block comment at the start of ipif_down 4140 * explains the use of the macros used below 4141 */ 4142 GRAB_CONN_LOCK(q); 4143 mutex_enter(&ill->ill_lock); 4144 if (ILL_CAN_LOOKUP(ill)) { 4145 ill_refhold_locked(ill); 4146 mutex_exit(&ill->ill_lock); 4147 RELEASE_CONN_LOCK(q); 4148 return (ill); 4149 } else if (ILL_CAN_WAIT(ill, q)) { 4150 ipsq = ill->ill_phyint->phyint_ipsq; 4151 mutex_enter(&ipsq->ipsq_lock); 4152 mutex_exit(&ill->ill_lock); 4153 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4154 mutex_exit(&ipsq->ipsq_lock); 4155 RELEASE_CONN_LOCK(q); 4156 if (error != NULL) *error = EINPROGRESS; 4157 return (NULL); 4158 } 4159 mutex_exit(&ill->ill_lock); 4160 RELEASE_CONN_LOCK(q); 4161 } 4162 if (error != NULL) 4163 *error = ENXIO; 4164 return (NULL); 4165 } 4166 4167 /* 4168 * comparison function for use with avl. 4169 */ 4170 static int 4171 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4172 { 4173 uint_t ppa; 4174 uint_t ill_ppa; 4175 4176 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4177 4178 ppa = *((uint_t *)ppa_ptr); 4179 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4180 /* 4181 * We want the ill with the lowest ppa to be on the 4182 * top. 4183 */ 4184 if (ill_ppa < ppa) 4185 return (1); 4186 if (ill_ppa > ppa) 4187 return (-1); 4188 return (0); 4189 } 4190
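/*
 * Illustrative sketch (not part of the original source): looking up an
 * ill by ppa in an interface type's AVL tree keyed by ill_compare_ppa()
 * above.  The caller must hold ill_g_lock; the function name is
 * hypothetical.  Compiled out.
 */
#ifdef ILL_PPA_FIND_EXAMPLE
static ill_t *
ill_find_ppa_example(ill_if_t *ifp, uint_t ppa)
{
	ASSERT(RW_LOCK_HELD(&ill_g_lock));
	/* the comparator keys on the ppa alone */
	return (avl_find(&ifp->illif_avl_by_ppa, (void *)&ppa, NULL));
}
#endif	/* ILL_PPA_FIND_EXAMPLE */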
4191 /* 4192 * remove an interface type from the global list. 4193 */ 4194 static void 4195 ill_delete_interface_type(ill_if_t *interface) 4196 { 4197 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4198 4199 ASSERT(interface != NULL); 4200 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4201 4202 avl_destroy(&interface->illif_avl_by_ppa); 4203 if (interface->illif_ppa_arena != NULL) 4204 vmem_destroy(interface->illif_ppa_arena); 4205 4206 remque(interface); 4207 4208 mi_free(interface); 4209 } 4210 4211 /* Defined in ip_netinfo.c */ 4212 extern ddi_taskq_t *eventq_queue_nic; 4213 4214 /* 4215 * remove ill from the global list. 4216 */ 4217 static void 4218 ill_glist_delete(ill_t *ill) 4219 { 4220 char *nicname; 4221 size_t nicnamelen = 0; 4222 hook_nic_event_t *info; 4223 4224 if (ill == NULL) 4225 return; 4226 4227 rw_enter(&ill_g_lock, RW_WRITER); 4228 4229 if (ill->ill_name != NULL) { 4230 nicname = kmem_alloc(ill->ill_name_length, KM_NOSLEEP); 4231 if (nicname != NULL) { 4232 bcopy(ill->ill_name, nicname, ill->ill_name_length); 4233 nicnamelen = ill->ill_name_length; 4234 } 4235 } else { 4236 nicname = NULL; 4237 nicnamelen = 0; 4238 } 4239 4240 /* 4241 * If the ill was never inserted into the AVL tree 4242 * we skip the if branch. 4243 */ 4244 if (ill->ill_ifptr != NULL) { 4245 /* 4246 * remove from AVL tree and free ppa number 4247 */ 4248 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4249 4250 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4251 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4252 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4253 } 4254 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4255 ill_delete_interface_type(ill->ill_ifptr); 4256 } 4257 4258 /* 4259 * Indicate ill is no longer in the list. 4260 */ 4261 ill->ill_ifptr = NULL; 4262 ill->ill_name_length = 0; 4263 ill->ill_name[0] = '\0'; 4264 ill->ill_ppa = UINT_MAX; 4265 } 4266 4267 /* 4268 * Run the unplumb hook after the NIC has disappeared from being 4269 * visible so that attempts to revalidate its existence will fail. 4270 * 4271 * This needs to be run inside the ill_g_lock perimeter to ensure 4272 * that the ordering of delivered events to listeners matches the 4273 * order of them in the kernel. 4274 */ 4275 if ((info = ill->ill_nic_event_info) != NULL) { 4276 if (info->hne_event != NE_DOWN) { 4277 ip2dbg(("ill_glist_delete: unexpected nic event %d " 4278 "attached for %s\n", info->hne_event, 4279 ill->ill_name)); 4280 if (info->hne_data != NULL) 4281 kmem_free(info->hne_data, info->hne_datalen); 4282 kmem_free(info, sizeof (hook_nic_event_t)); 4283 } else { 4284 if (ddi_taskq_dispatch(eventq_queue_nic, 4285 ip_ne_queue_func, (void *)info, DDI_SLEEP) 4286 == DDI_FAILURE) { 4287 ip2dbg(("ill_glist_delete: ddi_taskq_dispatch " 4288 "failed\n")); 4289 if (info->hne_data != NULL) 4290 kmem_free(info->hne_data, 4291 info->hne_datalen); 4292 kmem_free(info, sizeof (hook_nic_event_t)); 4293 } 4294 } 4295 } 4296 4297 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 4298 if (info != NULL) { 4299 info->hne_nic = ill->ill_phyint->phyint_ifindex; 4300 info->hne_lif = 0; 4301 info->hne_event = NE_UNPLUMB; 4302 info->hne_data = nicname; 4303 info->hne_datalen = nicnamelen; 4304 info->hne_family = ill->ill_isv6 ?
ipv6 : ipv4; 4305 } else { 4306 ip2dbg(("ill_glist_delete: could not attach UNPLUMB nic event " 4307 "information for %s (ENOMEM)\n", ill->ill_name)); 4308 if (nicname != NULL) 4309 kmem_free(nicname, nicnamelen); 4310 } 4311 4312 ill->ill_nic_event_info = info; 4313 4314 ill_phyint_free(ill); 4315 4316 rw_exit(&ill_g_lock); 4317 } 4318 4319 /* 4320 * Allocate a ppa. If the number of plumbed interfaces of this type is 4321 * less than ill_no_arena, do a linear search to find an unused ppa. 4322 * When the number goes beyond ill_no_arena, switch to using an arena. 4323 * Note: a ppa value of zero cannot be allocated from the vmem arena as zero 4324 * is the return value for an error condition, so ppa values are stored 4325 * biased by one: allocation starts at one and the result is decremented. 4326 */ 4327 static int 4328 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4329 { 4330 ill_t *tmp_ill; 4331 uint_t start, end; 4332 int ppa; 4333 4334 if (ifp->illif_ppa_arena == NULL && 4335 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4336 /* 4337 * Create an arena. 4338 */ 4339 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4340 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4341 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4342 /* allocate what has already been assigned */ 4343 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4344 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4345 tmp_ill, AVL_AFTER)) { 4346 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4347 1, /* size */ 4348 1, /* align/quantum */ 4349 0, /* phase */ 4350 0, /* nocross */ 4351 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), /* minaddr */ 4352 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), /* maxaddr */ 4353 VM_NOSLEEP|VM_FIRSTFIT); 4354 if (ppa == 0) { 4355 ip1dbg(("ill_alloc_ppa: ppa allocation" 4356 " failed while switching")); 4357 vmem_destroy(ifp->illif_ppa_arena); 4358 ifp->illif_ppa_arena = NULL; 4359 break; 4360 } 4361 } 4362 } 4363 4364 if (ifp->illif_ppa_arena != NULL) { 4365 if (ill->ill_ppa == UINT_MAX) { 4366 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4367 1, VM_NOSLEEP|VM_FIRSTFIT); 4368 if (ppa == 0) 4369 return (EAGAIN); 4370 ill->ill_ppa = --ppa; 4371 } else { 4372 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4373 1, /* size */ 4374 1, /* align/quantum */ 4375 0, /* phase */ 4376 0, /* nocross */ 4377 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4378 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4379 VM_NOSLEEP|VM_FIRSTFIT); 4380 /* 4381 * Most likely the allocation failed because 4382 * the requested ppa was in use. 4383 */ 4384 if (ppa == 0) 4385 return (EEXIST); 4386 } 4387 return (0); 4388 } 4389 4390 /* 4391 * No arena is in use and not enough (>ill_no_arena) interfaces have 4392 * been plumbed to create one. Do a linear search to get an unused ppa. 4393 */ 4394 if (ill->ill_ppa == UINT_MAX) { 4395 end = UINT_MAX - 1; 4396 start = 0; 4397 } else { 4398 end = start = ill->ill_ppa; 4399 } 4400 4401 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4402 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4403 if (start++ >= end) { 4404 if (ill->ill_ppa == UINT_MAX) 4405 return (EAGAIN); 4406 else 4407 return (EEXIST); 4408 } 4409 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4410 } 4411 ill->ill_ppa = start; 4412 return (0); 4413 } 4414
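/*
 * Illustrative sketch (not part of the original source): the +1/-1 bias
 * ill_alloc_ppa() above applies when using the vmem arena.  vmem_alloc()
 * returns 0 only on failure, so ppa N is stored as arena address N + 1.
 * Compiled out; the function name is hypothetical.
 */
#ifdef ILL_PPA_ARENA_EXAMPLE
static int
ill_ppa_arena_alloc_example(ill_if_t *ifp, uint_t *ppap)
{
	uintptr_t addr = (uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 1,
	    VM_NOSLEEP | VM_FIRSTFIT);

	if (addr == 0)
		return (EAGAIN);
	*ppap = (uint_t)(addr - 1);	/* undo the bias */
	return (0);
}
#endif	/* ILL_PPA_ARENA_EXAMPLE */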
4415 /* 4416 * Insert ill into the list of configured ills. Once this function completes, 4417 * the ill is globally visible and is available through lookups. More precisely 4418 * this happens after the caller drops the ill_g_lock. 4419 */ 4420 static int 4421 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4422 { 4423 ill_if_t *ill_interface; 4424 avl_index_t where = 0; 4425 int error; 4426 int name_length; 4427 int index; 4428 boolean_t check_length = B_FALSE; 4429 4430 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4431 4432 name_length = mi_strlen(name) + 1; 4433 4434 if (isv6) 4435 index = IP_V6_G_HEAD; 4436 else 4437 index = IP_V4_G_HEAD; 4438 4439 ill_interface = IP_VX_ILL_G_LIST(index); 4440 /* 4441 * Search for interface type based on name 4442 */ 4443 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4444 if ((ill_interface->illif_name_len == name_length) && 4445 (strcmp(ill_interface->illif_name, name) == 0) { 4446 break; 4447 } 4448 ill_interface = ill_interface->illif_next; 4449 } 4450 4451 /* 4452 * Interface type not found, create one. 4453 */ 4454 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4455 4456 ill_g_head_t ghead; 4457 4458 /* 4459 * allocate ill_if_t structure 4460 */ 4461 4462 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4463 if (ill_interface == NULL) { 4464 return (ENOMEM); 4465 } 4466 4467 4468 4469 (void) strcpy(ill_interface->illif_name, name); 4470 ill_interface->illif_name_len = name_length; 4471 4472 avl_create(&ill_interface->illif_avl_by_ppa, 4473 ill_compare_ppa, sizeof (ill_t), 4474 offsetof(struct ill_s, ill_avl_byppa)); 4475 4476 /* 4477 * link the structure in at the back to maintain the order 4478 * of configuration for ifconfig output. 4479 */ 4480 ghead = ill_g_heads[index]; 4481 insque(ill_interface, ghead.ill_g_list_tail); 4482 4483 } 4484 4485 if (ill->ill_ppa == UINT_MAX) 4486 check_length = B_TRUE; 4487 4488 error = ill_alloc_ppa(ill_interface, ill); 4489 if (error != 0) { 4490 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4491 ill_delete_interface_type(ill->ill_ifptr); 4492 return (error); 4493 } 4494 4495 /* 4496 * When the ppa is chosen by the system, check that there is 4497 * enough space to insert the ppa. If a specific ppa was passed in, this 4498 * check is not required as the interface name passed in will have 4499 * the right ppa in it. 4500 */ 4501 if (check_length) { 4502 /* 4503 * UINT_MAX - 1 should fit in 10 chars; alloc 12 chars. 4504 */ 4505 char buf[sizeof (uint_t) * 3]; 4506 4507 /* 4508 * convert the ppa to a string to calculate the amount of space 4509 * required for it in the name. 4510 */ 4511 numtos(ill->ill_ppa, buf); 4512 4513 /* Do we have enough space to insert the ppa?
*/ 4514 4515 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4516 /* Free ppa and interface type struct */ 4517 if (ill_interface->illif_ppa_arena != NULL) { 4518 vmem_free(ill_interface->illif_ppa_arena, 4519 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4520 } 4521 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4522 0) { 4523 ill_delete_interface_type(ill->ill_ifptr); 4524 } 4525 4526 return (EINVAL); 4527 } 4528 } 4529 4530 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4531 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4532 4533 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4534 &where); 4535 ill->ill_ifptr = ill_interface; 4536 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4537 4538 ill_phyint_reinit(ill); 4539 return (0); 4540 } 4541 4542 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4543 static boolean_t 4544 ipsq_init(ill_t *ill) 4545 { 4546 ipsq_t *ipsq; 4547 4548 /* Init the ipsq and implicitly enter as writer */ 4549 ill->ill_phyint->phyint_ipsq = 4550 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4551 if (ill->ill_phyint->phyint_ipsq == NULL) 4552 return (B_FALSE); 4553 ipsq = ill->ill_phyint->phyint_ipsq; 4554 ipsq->ipsq_phyint_list = ill->ill_phyint; 4555 ill->ill_phyint->phyint_ipsq_next = NULL; 4556 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4557 ipsq->ipsq_refs = 1; 4558 ipsq->ipsq_writer = curthread; 4559 ipsq->ipsq_reentry_cnt = 1; 4560 #ifdef ILL_DEBUG 4561 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH); 4562 #endif 4563 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4564 return (B_TRUE); 4565 } 4566 4567 /* 4568 * ill_init is called by ip_open when a device control stream is opened. 4569 * It does a few initializations, and shoots a DL_INFO_REQ message down 4570 * to the driver. The response is later picked up in ip_rput_dlpi and 4571 * used to set up default mechanisms for talking to the driver. (Always 4572 * called as writer.) 4573 * 4574 * If this function returns an error, ip_open will call ip_close which in 4575 * turn will call ill_delete to clean up any memory allocated here that 4576 * is not yet freed. 4577 */ 4578 int 4579 ill_init(queue_t *q, ill_t *ill) 4580 { 4581 int count; 4582 dl_info_req_t *dlir; 4583 mblk_t *info_mp; 4584 uchar_t *frag_ptr; 4585 4586 /* 4587 * The ill is initialized to zero by mi_alloc*(). In addition 4588 * some fields already contain valid values, initialized in 4589 * ip_open(), before we reach here. 4590 */ 4591 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4592 4593 ill->ill_rq = q; 4594 ill->ill_wq = WR(q); 4595 4596 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4597 BPRI_HI); 4598 if (info_mp == NULL) 4599 return (ENOMEM); 4600 4601 /* 4602 * Allocate sufficient space to contain our fragment hash table and 4603 * the device name.
4604 */ 4605 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4606 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4607 if (frag_ptr == NULL) { 4608 freemsg(info_mp); 4609 return (ENOMEM); 4610 } 4611 ill->ill_frag_ptr = frag_ptr; 4612 ill->ill_frag_free_num_pkts = 0; 4613 ill->ill_last_frag_clean_time = 0; 4614 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4615 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4616 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4617 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4618 NULL, MUTEX_DEFAULT, NULL); 4619 } 4620 4621 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4622 if (ill->ill_phyint == NULL) { 4623 freemsg(info_mp); 4624 mi_free(frag_ptr); 4625 return (ENOMEM); 4626 } 4627 4628 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4629 /* 4630 * For now pretend this is a v4 ill. We need to set phyint_ill* 4631 * at this point for the following reason: if we can't 4632 * enter the ipsq at some point and cv_wait, the writer that 4633 * wakes us up tries to locate us using the list of all phyints 4634 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4635 * If we don't set it now, we risk a missed wakeup. 4636 */ 4637 ill->ill_phyint->phyint_illv4 = ill; 4638 ill->ill_ppa = UINT_MAX; 4639 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4640 4641 if (!ipsq_init(ill)) { 4642 freemsg(info_mp); 4643 mi_free(frag_ptr); 4644 mi_free(ill->ill_phyint); 4645 return (ENOMEM); 4646 } 4647 4648 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4649 4650 4651 /* Frag queue limit stuff */ 4652 ill->ill_frag_count = 0; 4653 ill->ill_ipf_gen = 0; 4654 4655 ill->ill_global_timer = INFINITY; 4656 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4657 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4658 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4659 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4660 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4661 4662 /* 4663 * Initialize IPv6 configuration variables. The IP module is always 4664 * opened as an IPv4 module. Instead of tracking down the cases where 4665 * it switches to do IPv6, we'll just initialize the IPv6 configuration 4666 * here for convenience; this has no effect until the ill is set to do 4667 * IPv6. 4668 */ 4669 ill->ill_reachable_time = ND_REACHABLE_TIME; 4670 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4671 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4672 ill->ill_max_buf = ND_MAX_Q; 4673 ill->ill_refcnt = 0; 4674 4675 /* Send down the Info Request to the driver. */ 4676 info_mp->b_datap->db_type = M_PCPROTO; 4677 dlir = (dl_info_req_t *)info_mp->b_rptr; 4678 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4679 dlir->dl_primitive = DL_INFO_REQ; 4680 4681 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4682 4683 qprocson(q); 4684 ill_dlpi_send(ill, info_mp); 4685 4686 return (0); 4687 } 4688
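/*
 * Illustrative sketch (not part of the original source): pulling the
 * link-layer address back out of a sockaddr_dl laid out by
 * ill_dls_info() below, where sdl_data holds sdl_nlen name bytes
 * followed by sdl_alen address bytes.  Compiled out; the function name
 * is hypothetical.
 */
#ifdef ILL_SDL_EXAMPLE
static uchar_t *
ill_sdl_lladdr_example(struct sockaddr_dl *sdl)
{
	if (sdl->sdl_alen == 0)
		return (NULL);
	return ((uchar_t *)&sdl->sdl_data[sdl->sdl_nlen]);
}
#endif	/* ILL_SDL_EXAMPLE */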
4689 /* 4690 * ill_dls_info 4691 * creates datalink socket info from the device. 4692 */ 4693 int 4694 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4695 { 4696 size_t length; 4697 ill_t *ill = ipif->ipif_ill; 4698 4699 sdl->sdl_family = AF_LINK; 4700 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4701 sdl->sdl_type = ipif->ipif_type; 4702 (void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4703 length = mi_strlen(sdl->sdl_data); 4704 ASSERT(length < 256); 4705 sdl->sdl_nlen = (uchar_t)length; 4706 sdl->sdl_alen = ill->ill_phys_addr_length; 4707 mutex_enter(&ill->ill_lock); 4708 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) { 4709 bcopy(ill->ill_phys_addr, &sdl->sdl_data[length], 4710 ill->ill_phys_addr_length); 4711 } 4712 mutex_exit(&ill->ill_lock); 4713 sdl->sdl_slen = 0; 4714 return (sizeof (struct sockaddr_dl)); 4715 } 4716 4717 /* 4718 * ill_xarp_info 4719 * creates xarp info from the device. 4720 */ 4721 static int 4722 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4723 { 4724 sdl->sdl_family = AF_LINK; 4725 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4726 sdl->sdl_type = ill->ill_type; 4727 (void) ipif_get_name(ill->ill_ipif, sdl->sdl_data, 4728 sizeof (sdl->sdl_data)); 4729 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4730 sdl->sdl_alen = ill->ill_phys_addr_length; 4731 sdl->sdl_slen = 0; 4732 return (sdl->sdl_nlen); 4733 } 4734 4735 static int 4736 loopback_kstat_update(kstat_t *ksp, int rw) 4737 { 4738 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 4739 4740 if (rw == KSTAT_WRITE) 4741 return (EACCES); 4742 kn[0].value.ui32 = loopback_packets; 4743 kn[1].value.ui32 = loopback_packets; 4744 return (0); 4745 } 4746 4747 4748 /* 4749 * Has this ifindex been plumbed already? 4750 */ 4751 static boolean_t 4752 phyint_exists(uint_t index) 4753 { 4754 phyint_t *phyi; 4755 4756 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4757 /* 4758 * Indexes are stored in the phyint - a common structure 4759 * to both IPv4 and IPv6. 4760 */ 4761 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4762 (void *) &index, NULL); 4763 return (phyi != NULL); 4764 } 4765 4766 /* 4767 * Assign a unique interface index for the phyint. 4768 */ 4769 static boolean_t 4770 phyint_assign_ifindex(phyint_t *phyi) 4771 { 4772 uint_t starting_index; 4773 4774 ASSERT(phyi->phyint_ifindex == 0); 4775 if (!ill_index_wrap) { 4776 phyi->phyint_ifindex = ill_index++; 4777 if (ill_index == 0) { 4778 /* Reached the uint_t limit. Next time, wrap. */ 4779 ill_index_wrap = B_TRUE; 4780 } 4781 return (B_TRUE); 4782 } 4783 4784 /* 4785 * Start reusing unused indexes. Note that we hold the ill_g_lock 4786 * at this point and don't want to call any function that attempts 4787 * to get the lock again. 4788 */ 4789 starting_index = ill_index++; 4790 for (; ill_index != starting_index; ill_index++) { 4791 if (ill_index != 0 && !phyint_exists(ill_index)) { 4792 /* found unused index - use it */ 4793 phyi->phyint_ifindex = ill_index; 4794 return (B_TRUE); 4795 } 4796 } 4797 4798 /* 4799 * All interface indices are in use. 4800 */ 4801 return (B_FALSE); 4802 } 4803
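/*
 * Illustrative sketch (not part of the original source): the hold and
 * release discipline for the lookup functions that follow.  Any ill
 * returned is held and must be released with ill_refrele().  Uses
 * ill_lookup_on_ifindex(), defined later in this file; the example
 * function name is hypothetical.  Compiled out.
 */
#ifdef ILL_LOOKUP_EXAMPLE
static boolean_t
ill_is_v6_example(uint_t ifindex)
{
	boolean_t isv6 = B_FALSE;
	ill_t *ill;

	/* NULL q/mp/func/err: fail rather than wait on a busy ill */
	ill = ill_lookup_on_ifindex(ifindex, B_TRUE, NULL, NULL, NULL, NULL);
	if (ill != NULL) {
		isv6 = ill->ill_isv6;
		ill_refrele(ill);	/* drop the hold from the lookup */
	}
	return (isv6);
}
#endif	/* ILL_LOOKUP_EXAMPLE */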
4804 /* 4805 * Return a pointer to the ill which matches the supplied name. Note that 4806 * the ill name length includes the null termination character. (May be 4807 * called as writer.) 4808 * If do_alloc and the interface is "lo0" it will be automatically created. 4809 * Cannot bump up a reference on condemned ills, so dup detect can't be done 4810 * using this func. 4811 */ 4812 ill_t * 4813 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4814 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc) 4815 { 4816 ill_t *ill; 4817 ipif_t *ipif; 4818 kstat_named_t *kn; 4819 boolean_t isloopback; 4820 ipsq_t *old_ipsq; 4821 4822 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4823 4824 rw_enter(&ill_g_lock, RW_READER); 4825 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4826 rw_exit(&ill_g_lock); 4827 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4828 return (ill); 4829 4830 /* 4831 * Couldn't find it. Does this happen to be a lookup for the 4832 * loopback device and are we allowed to allocate it? 4833 */ 4834 if (!isloopback || !do_alloc) 4835 return (NULL); 4836 4837 rw_enter(&ill_g_lock, RW_WRITER); 4838 4839 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4840 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4841 rw_exit(&ill_g_lock); 4842 return (ill); 4843 } 4844 4845 /* Create the loopback device on demand */ 4846 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4847 sizeof (ipif_loopback_name), BPRI_MED)); 4848 if (ill == NULL) 4849 goto done; 4850 4851 *ill = ill_null; 4852 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4853 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4854 if (ill->ill_phyint == NULL) 4855 goto done; 4856 4857 if (isv6) 4858 ill->ill_phyint->phyint_illv6 = ill; 4859 else 4860 ill->ill_phyint->phyint_illv4 = ill; 4861 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4862 ill->ill_max_frag = IP_LOOPBACK_MTU; 4863 /* Add room for tcp+ip headers */ 4864 if (isv6) { 4865 ill->ill_isv6 = B_TRUE; 4866 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4867 if (!ill_allocate_mibs(ill)) 4868 goto done; 4869 } else { 4870 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4871 } 4872 ill->ill_max_mtu = ill->ill_max_frag; 4873 /* 4874 * ipif_loopback_name can't be pointed at directly because it's used 4875 * by both the ipv4 and ipv6 interfaces. When the ill is removed 4876 * from the glist, ill_glist_delete() sets the first character of 4877 * ill_name to '\0'. 4878 */ 4879 ill->ill_name = (char *)ill + sizeof (*ill); 4880 (void) strcpy(ill->ill_name, ipif_loopback_name); 4881 ill->ill_name_length = sizeof (ipif_loopback_name); 4882 /* Set ill_name_set for ill_phyint_reinit to work properly */ 4883 4884 ill->ill_global_timer = INFINITY; 4885 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4886 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4887 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4888 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4889 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4890 4891 /* No resolver here. */ 4892 ill->ill_net_type = IRE_LOOPBACK; 4893 4894 /* Initialize the ipsq */ 4895 if (!ipsq_init(ill)) 4896 goto done; 4897 4898 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 4899 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 4900 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 4901 #ifdef ILL_DEBUG 4902 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 4903 #endif 4904 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 4905 if (ipif == NULL) 4906 goto done; 4907 4908 ill->ill_flags = ILLF_MULTICAST; 4909 4910 /* Set up default loopback address and mask.
*/ 4911 if (!isv6) { 4912 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4913 4914 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4915 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4916 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4917 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4918 ipif->ipif_v6subnet); 4919 ill->ill_flags |= ILLF_IPV4; 4920 } else { 4921 ipif->ipif_v6lcl_addr = ipv6_loopback; 4922 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4923 ipif->ipif_v6net_mask = ipv6_all_ones; 4924 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4925 ipif->ipif_v6subnet); 4926 ill->ill_flags |= ILLF_IPV6; 4927 } 4928 4929 /* 4930 * Chain us in at the end of the ill list. hold the ill 4931 * before we make it globally visible. 1 for the lookup. 4932 */ 4933 ill->ill_refcnt = 0; 4934 ill_refhold(ill); 4935 4936 ill->ill_frag_count = 0; 4937 ill->ill_frag_free_num_pkts = 0; 4938 ill->ill_last_frag_clean_time = 0; 4939 4940 old_ipsq = ill->ill_phyint->phyint_ipsq; 4941 4942 if (ill_glist_insert(ill, "lo", isv6) != 0) 4943 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4944 4945 /* Let SCTP know so that it can add this to its list */ 4946 sctp_update_ill(ill, SCTP_ILL_INSERT); 4947 4948 /* Let SCTP know about this IPIF, so that it can add it to its list */ 4949 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 4950 4951 /* 4952 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 4953 */ 4954 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 4955 /* Loopback ills aren't in any IPMP group */ 4956 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 4957 ipsq_delete(old_ipsq); 4958 } 4959 4960 /* 4961 * Delay this till the ipif is allocated as ipif_allocate 4962 * de-references ill_phyint for getting the ifindex. We 4963 * can't do this before ipif_allocate because ill_phyint_reinit 4964 * -> phyint_assign_ifindex expects ipif to be present. 4965 */ 4966 mutex_enter(&ill->ill_phyint->phyint_lock); 4967 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 4968 mutex_exit(&ill->ill_phyint->phyint_lock); 4969 4970 if (loopback_ksp == NULL) { 4971 /* Export loopback interface statistics */ 4972 loopback_ksp = kstat_create("lo", 0, ipif_loopback_name, "net", 4973 KSTAT_TYPE_NAMED, 2, 0); 4974 if (loopback_ksp != NULL) { 4975 loopback_ksp->ks_update = loopback_kstat_update; 4976 kn = KSTAT_NAMED_PTR(loopback_ksp); 4977 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4978 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4979 kstat_install(loopback_ksp); 4980 } 4981 } 4982 4983 if (error != NULL) 4984 *error = 0; 4985 *did_alloc = B_TRUE; 4986 rw_exit(&ill_g_lock); 4987 return (ill); 4988 done: 4989 if (ill != NULL) { 4990 if (ill->ill_phyint != NULL) { 4991 ipsq_t *ipsq; 4992 4993 ipsq = ill->ill_phyint->phyint_ipsq; 4994 if (ipsq != NULL) 4995 kmem_free(ipsq, sizeof (ipsq_t)); 4996 mi_free(ill->ill_phyint); 4997 } 4998 ill_free_mib(ill); 4999 mi_free(ill); 5000 } 5001 rw_exit(&ill_g_lock); 5002 if (error != NULL) 5003 *error = ENOMEM; 5004 return (NULL); 5005 } 5006 5007 /* 5008 * Return a pointer to the ill which matches the index and IP version type. 
5009 */ 5010 ill_t * 5011 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 5012 ipsq_func_t func, int *err) 5013 { 5014 ill_t *ill; 5015 ipsq_t *ipsq; 5016 phyint_t *phyi; 5017 5018 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 5019 (q != NULL && mp != NULL && func != NULL && err != NULL)); 5020 5021 if (err != NULL) 5022 *err = 0; 5023 5024 /* 5025 * Indexes are stored in the phyint - a common structure 5026 * to both IPv4 and IPv6. 5027 */ 5028 rw_enter(&ill_g_lock, RW_READER); 5029 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 5030 (void *) &index, NULL); 5031 if (phyi != NULL) { 5032 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4; 5033 if (ill != NULL) { 5034 /* 5035 * The block comment at the start of ipif_down 5036 * explains the use of the macros used below 5037 */ 5038 GRAB_CONN_LOCK(q); 5039 mutex_enter(&ill->ill_lock); 5040 if (ILL_CAN_LOOKUP(ill)) { 5041 ill_refhold_locked(ill); 5042 mutex_exit(&ill->ill_lock); 5043 RELEASE_CONN_LOCK(q); 5044 rw_exit(&ill_g_lock); 5045 return (ill); 5046 } else if (ILL_CAN_WAIT(ill, q)) { 5047 ipsq = ill->ill_phyint->phyint_ipsq; 5048 mutex_enter(&ipsq->ipsq_lock); 5049 rw_exit(&ill_g_lock); 5050 mutex_exit(&ill->ill_lock); 5051 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5052 mutex_exit(&ipsq->ipsq_lock); 5053 RELEASE_CONN_LOCK(q); 5054 *err = EINPROGRESS; 5055 return (NULL); 5056 } 5057 RELEASE_CONN_LOCK(q); 5058 mutex_exit(&ill->ill_lock); 5059 } 5060 } 5061 rw_exit(&ill_g_lock); 5062 if (err != NULL) 5063 *err = ENXIO; 5064 return (NULL); 5065 } 5066 5067 /* 5068 * Return the ifindex next in sequence after the passed in ifindex. 5069 * If there is no next ifindex for the given protocol, return 0. 5070 */ 5071 uint_t 5072 ill_get_next_ifindex(uint_t index, boolean_t isv6) 5073 { 5074 phyint_t *phyi; 5075 phyint_t *phyi_initial; 5076 uint_t ifindex; 5077 5078 rw_enter(&ill_g_lock, RW_READER); 5079 5080 if (index == 0) { 5081 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 5082 } else { 5083 phyi = phyi_initial = avl_find( 5084 &phyint_g_list.phyint_list_avl_by_index, 5085 (void *) &index, NULL); 5086 } 5087 5088 for (; phyi != NULL; 5089 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 5090 phyi, AVL_AFTER)) { 5091 /* 5092 * If we're not returning the first interface in the tree 5093 * and we still haven't moved past the phyint_t that 5094 * corresponds to index, avl_walk needs to be called again 5095 */ 5096 if (!((index != 0) && (phyi == phyi_initial))) { 5097 if (isv6) { 5098 if ((phyi->phyint_illv6) && 5099 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5100 (phyi->phyint_illv6->ill_isv6 == 1)) 5101 break; 5102 } else { 5103 if ((phyi->phyint_illv4) && 5104 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5105 (phyi->phyint_illv4->ill_isv6 == 0)) 5106 break; 5107 } 5108 } 5109 } 5110 5111 rw_exit(&ill_g_lock); 5112 5113 if (phyi != NULL) 5114 ifindex = phyi->phyint_ifindex; 5115 else 5116 ifindex = 0; 5117 5118 return (ifindex); 5119 } 5120 5121 5122 /* 5123 * Return the ifindex for the named interface. 5124 * If there is no next ifindex for the interface, return 0. 
5125 */ 5126 uint_t 5127 ill_get_ifindex_by_name(char *name) 5128 { 5129 phyint_t *phyi; 5130 avl_index_t where = 0; 5131 uint_t ifindex; 5132 5133 rw_enter(&ill_g_lock, RW_READER); 5134 5135 if ((phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name, 5136 name, &where)) == NULL) { 5137 rw_exit(&ill_g_lock); 5138 return (0); 5139 } 5140 5141 ifindex = phyi->phyint_ifindex; 5142 5143 rw_exit(&ill_g_lock); 5144 5145 return (ifindex); 5146 } 5147 5148 5149 /* 5150 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5151 * that gives a running thread a reference to the ill. This reference must be 5152 * released by the thread when it is done accessing the ill and related 5153 * objects. ill_refcnt can not be used to account for static references 5154 * such as other structures pointing to an ill. Callers must generally 5155 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5156 * or be sure that the ill is not being deleted or changing state before 5157 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5158 * ill won't change any of its critical state such as address, netmask etc. 5159 */ 5160 void 5161 ill_refhold(ill_t *ill) 5162 { 5163 mutex_enter(&ill->ill_lock); 5164 ill->ill_refcnt++; 5165 ILL_TRACE_REF(ill); 5166 mutex_exit(&ill->ill_lock); 5167 } 5168 5169 void 5170 ill_refhold_locked(ill_t *ill) 5171 { 5172 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5173 ill->ill_refcnt++; 5174 ILL_TRACE_REF(ill); 5175 } 5176 5177 int 5178 ill_check_and_refhold(ill_t *ill) 5179 { 5180 mutex_enter(&ill->ill_lock); 5181 if (ILL_CAN_LOOKUP(ill)) { 5182 ill_refhold_locked(ill); 5183 mutex_exit(&ill->ill_lock); 5184 return (0); 5185 } 5186 mutex_exit(&ill->ill_lock); 5187 return (ILL_LOOKUP_FAILED); 5188 } 5189 5190 /* 5191 * Must not be called while holding any locks. Otherwise if this is 5192 * the last reference to be released, there is a chance of recursive mutex 5193 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5194 * to restart an ioctl. 5195 */ 5196 void 5197 ill_refrele(ill_t *ill) 5198 { 5199 mutex_enter(&ill->ill_lock); 5200 ASSERT(ill->ill_refcnt != 0); 5201 ill->ill_refcnt--; 5202 ILL_UNTRACE_REF(ill); 5203 if (ill->ill_refcnt != 0) { 5204 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5205 mutex_exit(&ill->ill_lock); 5206 return; 5207 } 5208 5209 /* Drops the ill_lock */ 5210 ipif_ill_refrele_tail(ill); 5211 } 5212 5213 /* 5214 * Obtain a weak reference count on the ill. This reference ensures the 5215 * ill won't be freed, but the ill may change any of its critical state 5216 * such as netmask, address etc. Returns an error if the ill has started 5217 * closing. 5218 */ 5219 boolean_t 5220 ill_waiter_inc(ill_t *ill) 5221 { 5222 mutex_enter(&ill->ill_lock); 5223 if (ill->ill_state_flags & ILL_CONDEMNED) { 5224 mutex_exit(&ill->ill_lock); 5225 return (B_FALSE); 5226 } 5227 ill->ill_waiters++; 5228 mutex_exit(&ill->ill_lock); 5229 return (B_TRUE); 5230 } 5231 5232 void 5233 ill_waiter_dcr(ill_t *ill) 5234 { 5235 mutex_enter(&ill->ill_lock); 5236 ill->ill_waiters--; 5237 if (ill->ill_waiters == 0) 5238 cv_broadcast(&ill->ill_cv); 5239 mutex_exit(&ill->ill_lock); 5240 } 5241 5242 /* 5243 * Named Dispatch routine to produce a formatted report on all ILLs. 5244 * This report is accessed by using the ndd utility to "get" ND variable 5245 * "ip_ill_status". 
5246 */ 5247 /* ARGSUSED */ 5248 int 5249 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5250 { 5251 ill_t *ill; 5252 ill_walk_context_t ctx; 5253 5254 (void) mi_mpprintf(mp, 5255 "ILL " MI_COL_HDRPAD_STR 5256 /* 01234567[89ABCDEF] */ 5257 "rq " MI_COL_HDRPAD_STR 5258 /* 01234567[89ABCDEF] */ 5259 "wq " MI_COL_HDRPAD_STR 5260 /* 01234567[89ABCDEF] */ 5261 "upcnt mxfrg err name"); 5262 /* 12345 12345 123 xxxxxxxx */ 5263 5264 rw_enter(&ill_g_lock, RW_READER); 5265 ill = ILL_START_WALK_ALL(&ctx); 5266 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5267 (void) mi_mpprintf(mp, 5268 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5269 "%05u %05u %03d %s", 5270 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5271 ill->ill_ipif_up_count, 5272 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5273 } 5274 rw_exit(&ill_g_lock); 5275 5276 return (0); 5277 } 5278 5279 /* 5280 * Named Dispatch routine to produce a formatted report on all IPIFs. 5281 * This report is accessed by using the ndd utility to "get" ND variable 5282 * "ip_ipif_status". 5283 */ 5284 /* ARGSUSED */ 5285 int 5286 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5287 { 5288 char buf1[INET6_ADDRSTRLEN]; 5289 char buf2[INET6_ADDRSTRLEN]; 5290 char buf3[INET6_ADDRSTRLEN]; 5291 char buf4[INET6_ADDRSTRLEN]; 5292 char buf5[INET6_ADDRSTRLEN]; 5293 char buf6[INET6_ADDRSTRLEN]; 5294 char buf[LIFNAMSIZ]; 5295 ill_t *ill; 5296 ipif_t *ipif; 5297 nv_t *nvp; 5298 uint64_t flags; 5299 zoneid_t zoneid; 5300 ill_walk_context_t ctx; 5301 5302 (void) mi_mpprintf(mp, 5303 "IPIF metric mtu in/out/forward name zone flags...\n" 5304 "\tlocal address\n" 5305 "\tsrc address\n" 5306 "\tsubnet\n" 5307 "\tmask\n" 5308 "\tbroadcast\n" 5309 "\tp-p-dst"); 5310 5311 ASSERT(q->q_next == NULL); 5312 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5313 5314 rw_enter(&ill_g_lock, RW_READER); 5315 ill = ILL_START_WALK_ALL(&ctx); 5316 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5317 for (ipif = ill->ill_ipif; ipif != NULL; 5318 ipif = ipif->ipif_next) { 5319 if (zoneid != GLOBAL_ZONEID && 5320 zoneid != ipif->ipif_zoneid && 5321 ipif->ipif_zoneid != ALL_ZONES) 5322 continue; 5323 (void) mi_mpprintf(mp, 5324 MI_COL_PTRFMT_STR 5325 "%04u %05u %u/%u/%u %s %d", 5326 (void *)ipif, 5327 ipif->ipif_metric, ipif->ipif_mtu, 5328 ipif->ipif_ib_pkt_count, 5329 ipif->ipif_ob_pkt_count, 5330 ipif->ipif_fo_pkt_count, 5331 ipif_get_name(ipif, buf, sizeof (buf)), 5332 ipif->ipif_zoneid); 5333 5334 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5335 ipif->ipif_ill->ill_phyint->phyint_flags; 5336 5337 /* Tack on text strings for any flags. */ 5338 nvp = ipif_nv_tbl; 5339 for (; nvp < A_END(ipif_nv_tbl); nvp++) { 5340 if (nvp->nv_value & flags) 5341 (void) mi_mpprintf_nr(mp, " %s", 5342 nvp->nv_name); 5343 } 5344 (void) mi_mpprintf(mp, 5345 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s", 5346 inet_ntop(AF_INET6, 5347 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)), 5348 inet_ntop(AF_INET6, 5349 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)), 5350 inet_ntop(AF_INET6, 5351 &ipif->ipif_v6subnet, buf3, sizeof (buf3)), 5352 inet_ntop(AF_INET6, 5353 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)), 5354 inet_ntop(AF_INET6, 5355 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)), 5356 inet_ntop(AF_INET6, 5357 &ipif->ipif_v6pp_dst_addr, 5358 buf6, sizeof (buf6))); 5359 } 5360 } 5361 rw_exit(&ill_g_lock); 5362 return (0); 5363 } 5364 5365 /* 5366 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5367 * driver. 
We construct best-guess defaults for lower level information that 5368 * we need. If an interface is brought up without injection of any overriding 5369 * information from outside, we have to be ready to go with these defaults. 5370 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5371 * we primarily want the dl_provider_style. 5372 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5373 * at which point we assume the other part of the information is valid. 5374 */ 5375 void 5376 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5377 { 5378 uchar_t *brdcst_addr; 5379 uint_t brdcst_addr_length, phys_addr_length; 5380 t_scalar_t sap_length; 5381 dl_info_ack_t *dlia; 5382 ip_m_t *ipm; 5383 dl_qos_cl_sel1_t *sel1; 5384 5385 ASSERT(IAM_WRITER_ILL(ill)); 5386 5387 /* 5388 * Until the ill is fully up, ILL_CHANGING will be set and 5389 * the ill is not globally visible. So no need for a lock. 5390 */ 5391 dlia = (dl_info_ack_t *)mp->b_rptr; 5392 ill->ill_mactype = dlia->dl_mac_type; 5393 5394 ipm = ip_m_lookup(dlia->dl_mac_type); 5395 if (ipm == NULL) { 5396 ipm = ip_m_lookup(DL_OTHER); 5397 ASSERT(ipm != NULL); 5398 } 5399 ill->ill_media = ipm; 5400 5401 /* 5402 * When the new DLPI stuff is ready we'll pull lengths 5403 * from dlia. 5404 */ 5405 if (dlia->dl_version == DL_VERSION_2) { 5406 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5407 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5408 brdcst_addr_length); 5409 if (brdcst_addr == NULL) { 5410 brdcst_addr_length = 0; 5411 } 5412 sap_length = dlia->dl_sap_length; 5413 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5414 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5415 brdcst_addr_length, sap_length, phys_addr_length)); 5416 } else { 5417 brdcst_addr_length = 6; 5418 brdcst_addr = ip_six_byte_all_ones; 5419 sap_length = -2; 5420 phys_addr_length = brdcst_addr_length; 5421 } 5422 5423 ill->ill_bcast_addr_length = brdcst_addr_length; 5424 ill->ill_phys_addr_length = phys_addr_length; 5425 ill->ill_sap_length = sap_length; 5426 ill->ill_max_frag = dlia->dl_max_sdu; 5427 ill->ill_max_mtu = ill->ill_max_frag; 5428 5429 ill->ill_type = ipm->ip_m_type; 5430 5431 if (!ill->ill_dlpi_style_set) { 5432 if (dlia->dl_provider_style == DL_STYLE2) 5433 ill->ill_needs_attach = 1; 5434 5435 /* 5436 * Allocate the first ipif on this ill. We don't delay it 5437 * further as ioctl handling assumes at least one ipif to 5438 * be present. 5439 * 5440 * At this point we don't know whether the ill is v4 or v6. 5441 * We will know this when the SIOCSLIFNAME happens and 5442 * the correct value for ill_isv6 will be assigned in 5443 * ipif_set_values(). We need to hold the ill lock and 5444 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5445 * the wakeup. 5446 */ 5447 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5448 dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); 5449 mutex_enter(&ill->ill_lock); 5450 ASSERT(ill->ill_dlpi_style_set == 0); 5451 ill->ill_dlpi_style_set = 1; 5452 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5453 cv_broadcast(&ill->ill_cv); 5454 mutex_exit(&ill->ill_lock); 5455 freemsg(mp); 5456 return; 5457 } 5458 ASSERT(ill->ill_ipif != NULL); 5459 /* 5460 * We know whether it is IPv4 or IPv6 now, as this is the 5461 * second DL_INFO_ACK we are receiving in response to the 5462 * DL_INFO_REQ sent in ipif_set_values.
5463 */ 5464 if (ill->ill_isv6) 5465 ill->ill_sap = IP6_DL_SAP; 5466 else 5467 ill->ill_sap = IP_DL_SAP; 5468 /* 5469 * Set ipif_mtu which is used to set the IRE's 5470 * ire_max_frag value. The driver could have sent 5471 * a different mtu from what it sent last time. No 5472 * need to call ipif_mtu_change because IREs have 5473 * not yet been created. 5474 */ 5475 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5476 /* 5477 * Clear all the flags that were set based on ill_bcast_addr_length 5478 * and ill_phys_addr_length (in ipif_set_values) as these could have 5479 * changed now and we need to re-evaluate. 5480 */ 5481 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5482 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5483 5484 /* 5485 * Free ill_resolver_mp and ill_bcast_mp as things could have 5486 * changed now. 5487 */ 5488 if (ill->ill_bcast_addr_length == 0) { 5489 if (ill->ill_resolver_mp != NULL) 5490 freemsg(ill->ill_resolver_mp); 5491 if (ill->ill_bcast_mp != NULL) 5492 freemsg(ill->ill_bcast_mp); 5493 if (ill->ill_flags & ILLF_XRESOLV) 5494 ill->ill_net_type = IRE_IF_RESOLVER; 5495 else 5496 ill->ill_net_type = IRE_IF_NORESOLVER; 5497 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5498 ill->ill_phys_addr_length, 5499 ill->ill_sap, 5500 ill->ill_sap_length); 5501 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5502 5503 if (ill->ill_isv6) 5504 /* 5505 * Note: xresolv interfaces will eventually need NOARP 5506 * set here as well, but that will require those 5507 * external resolvers to have some knowledge of 5508 * that flag and act appropriately. Not to be changed 5509 * at present. 5510 */ 5511 ill->ill_flags |= ILLF_NONUD; 5512 else 5513 ill->ill_flags |= ILLF_NOARP; 5514 5515 if (ill->ill_phys_addr_length == 0) { 5516 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 5517 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5518 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL; 5519 } else { 5520 /* pt-pt supports multicast. */ 5521 ill->ill_flags |= ILLF_MULTICAST; 5522 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5523 } 5524 } 5525 } else { 5526 ill->ill_net_type = IRE_IF_RESOLVER; 5527 if (ill->ill_bcast_mp != NULL) 5528 freemsg(ill->ill_bcast_mp); 5529 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5530 ill->ill_bcast_addr_length, ill->ill_sap, 5531 ill->ill_sap_length); 5532 /* 5533 * Later detect lack of DLPI driver multicast 5534 * capability by catching DL_ENABMULTI errors in 5535 * ip_rput_dlpi. 5536 */ 5537 ill->ill_flags |= ILLF_MULTICAST; 5538 if (!ill->ill_isv6) 5539 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5540 } 5541 /* By default an interface does not support any CoS marking */ 5542 ill->ill_flags &= ~ILLF_COS_ENABLED; 5543 5544 /* 5545 * If we get QoS information in DL_INFO_ACK, the device supports 5546 * some form of CoS marking, set ILLF_COS_ENABLED. 5547 */ 5548 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5549 dlia->dl_qos_length); 5550 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5551 ill->ill_flags |= ILLF_COS_ENABLED; 5552 } 5553 5554 /* Clear any previous error indication. */ 5555 ill->ill_error = 0; 5556 freemsg(mp); 5557 } 5558 5559 /* 5560 * Perform various checks to verify that an address would make sense as a 5561 * local, remote, or subnet interface address. 
5562 */ 5563 static boolean_t 5564 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5565 { 5566 ipaddr_t net_mask; 5567 5568 /* 5569 * Don't allow all zeroes, all ones, or an experimental address, but 5570 * allow an all-ones netmask. 5571 */ 5572 if ((net_mask = ip_net_mask(addr)) == 0) 5573 return (B_FALSE); 5574 /* A given netmask overrides the "guess" netmask */ 5575 if (subnet_mask != 0) 5576 net_mask = subnet_mask; 5577 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5578 (addr == (addr | ~net_mask)))) { 5579 return (B_FALSE); 5580 } 5581 if (CLASSD(addr)) 5582 return (B_FALSE); 5583 5584 return (B_TRUE); 5585 } 5586 5587 /* 5588 * ipif_lookup_group 5589 * Returns held ipif 5590 */ 5591 ipif_t * 5592 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid) 5593 { 5594 ire_t *ire; 5595 ipif_t *ipif; 5596 5597 ire = ire_lookup_multi(group, zoneid); 5598 if (ire == NULL) 5599 return (NULL); 5600 ipif = ire->ire_ipif; 5601 ipif_refhold(ipif); 5602 ire_refrele(ire); 5603 return (ipif); 5604 } 5605 5606 /* 5607 * Look for an ipif with the specified interface address and destination. 5608 * The destination address is used only for matching point-to-point interfaces. 5609 */ 5610 ipif_t * 5611 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5612 ipsq_func_t func, int *error) 5613 { 5614 ipif_t *ipif; 5615 ill_t *ill; 5616 ill_walk_context_t ctx; 5617 ipsq_t *ipsq; 5618 5619 if (error != NULL) 5620 *error = 0; 5621 5622 /* 5623 * First match all the point-to-point interfaces 5624 * before looking at non-point-to-point interfaces. 5625 * This is done to avoid returning non-point-to-point 5626 * ipif instead of unnumbered point-to-point ipif. 5627 */ 5628 rw_enter(&ill_g_lock, RW_READER); 5629 ill = ILL_START_WALK_V4(&ctx); 5630 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5631 GRAB_CONN_LOCK(q); 5632 mutex_enter(&ill->ill_lock); 5633 for (ipif = ill->ill_ipif; ipif != NULL; 5634 ipif = ipif->ipif_next) { 5635 /* Allow the ipif to be down */ 5636 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5637 (ipif->ipif_lcl_addr == if_addr) && 5638 (ipif->ipif_pp_dst_addr == dst)) { 5639 /* 5640 * The block comment at the start of ipif_down 5641 * explains the use of the macros used below 5642 */ 5643 if (IPIF_CAN_LOOKUP(ipif)) { 5644 ipif_refhold_locked(ipif); 5645 mutex_exit(&ill->ill_lock); 5646 RELEASE_CONN_LOCK(q); 5647 rw_exit(&ill_g_lock); 5648 return (ipif); 5649 } else if (IPIF_CAN_WAIT(ipif, q)) { 5650 ipsq = ill->ill_phyint->phyint_ipsq; 5651 mutex_enter(&ipsq->ipsq_lock); 5652 mutex_exit(&ill->ill_lock); 5653 rw_exit(&ill_g_lock); 5654 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5655 ill); 5656 mutex_exit(&ipsq->ipsq_lock); 5657 RELEASE_CONN_LOCK(q); 5658 *error = EINPROGRESS; 5659 return (NULL); 5660 } 5661 } 5662 } 5663 mutex_exit(&ill->ill_lock); 5664 RELEASE_CONN_LOCK(q); 5665 } 5666 rw_exit(&ill_g_lock); 5667 5668 /* lookup the ipif based on interface address */ 5669 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error); 5670 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5671 return (ipif); 5672 } 5673 5674 /* 5675 * Look for an ipif with the specified address. For point-to-point links 5676 * we look for matches on either the destination address or the local 5677 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5678 * is set. 5679 * Matches on a specific ill if match_ill is set.
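 * For example (illustrative only, documentation addresses): a point-to-point ipif with ipif_lcl_addr 192.0.2.1 and ipif_pp_dst_addr 192.0.2.2 is found by a lookup for 192.0.2.1 on the first (local) pass and by a lookup for 192.0.2.2 on the second (pointopoint) pass; with IPIF_UNNUMBERED set, only the second match can succeed.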
5680 */ 5681 ipif_t * 5682 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 5683 mblk_t *mp, ipsq_func_t func, int *error) 5684 { 5685 ipif_t *ipif; 5686 ill_t *ill; 5687 boolean_t ptp = B_FALSE; 5688 ipsq_t *ipsq; 5689 ill_walk_context_t ctx; 5690 5691 if (error != NULL) 5692 *error = 0; 5693 5694 rw_enter(&ill_g_lock, RW_READER); 5695 /* 5696 * Repeat twice, first based on local addresses and 5697 * next time for pointopoint. 5698 */ 5699 repeat: 5700 ill = ILL_START_WALK_V4(&ctx); 5701 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5702 if (match_ill != NULL && ill != match_ill) { 5703 continue; 5704 } 5705 GRAB_CONN_LOCK(q); 5706 mutex_enter(&ill->ill_lock); 5707 for (ipif = ill->ill_ipif; ipif != NULL; 5708 ipif = ipif->ipif_next) { 5709 if (zoneid != ALL_ZONES && 5710 zoneid != ipif->ipif_zoneid && 5711 ipif->ipif_zoneid != ALL_ZONES) 5712 continue; 5713 /* Allow the ipif to be down */ 5714 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5715 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5716 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5717 (ipif->ipif_pp_dst_addr == addr))) { 5718 /* 5719 * The block comment at the start of ipif_down 5720 * explains the use of the macros used below 5721 */ 5722 if (IPIF_CAN_LOOKUP(ipif)) { 5723 ipif_refhold_locked(ipif); 5724 mutex_exit(&ill->ill_lock); 5725 RELEASE_CONN_LOCK(q); 5726 rw_exit(&ill_g_lock); 5727 return (ipif); 5728 } else if (IPIF_CAN_WAIT(ipif, q)) { 5729 ipsq = ill->ill_phyint->phyint_ipsq; 5730 mutex_enter(&ipsq->ipsq_lock); 5731 mutex_exit(&ill->ill_lock); 5732 rw_exit(&ill_g_lock); 5733 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5734 ill); 5735 mutex_exit(&ipsq->ipsq_lock); 5736 RELEASE_CONN_LOCK(q); 5737 *error = EINPROGRESS; 5738 return (NULL); 5739 } 5740 } 5741 } 5742 mutex_exit(&ill->ill_lock); 5743 RELEASE_CONN_LOCK(q); 5744 } 5745 5746 /* If we already did the ptp case, then we are done */ 5747 if (ptp) { 5748 rw_exit(&ill_g_lock); 5749 if (error != NULL) 5750 *error = ENXIO; 5751 return (NULL); 5752 } 5753 ptp = B_TRUE; 5754 goto repeat; 5755 } 5756 5757 /* 5758 * Look for an ipif with the specified address. For point-to-point links 5759 * we look for matches on either the destination address or the local 5760 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5761 * is set. 5762 * Matches on a specific ill if match_ill is set. 5763 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 5764 */ 5765 zoneid_t 5766 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill) 5767 { 5768 zoneid_t zoneid; 5769 ipif_t *ipif; 5770 ill_t *ill; 5771 boolean_t ptp = B_FALSE; 5772 ill_walk_context_t ctx; 5773 5774 rw_enter(&ill_g_lock, RW_READER); 5775 /* 5776 * Repeat twice, first based on local addresses and 5777 * next time for pointopoint.
5778 */ 5779 repeat: 5780 ill = ILL_START_WALK_V4(&ctx); 5781 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5782 if (match_ill != NULL && ill != match_ill) { 5783 continue; 5784 } 5785 mutex_enter(&ill->ill_lock); 5786 for (ipif = ill->ill_ipif; ipif != NULL; 5787 ipif = ipif->ipif_next) { 5788 /* Allow the ipif to be down */ 5789 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5790 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5791 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5792 (ipif->ipif_pp_dst_addr == addr)) && 5793 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 5794 zoneid = ipif->ipif_zoneid; 5795 mutex_exit(&ill->ill_lock); 5796 rw_exit(&ill_g_lock); 5797 /* 5798 * If ipif_zoneid was ALL_ZONES then we have 5799 * a trusted extensions shared IP address. 5800 * In that case GLOBAL_ZONEID works to send. 5801 */ 5802 if (zoneid == ALL_ZONES) 5803 zoneid = GLOBAL_ZONEID; 5804 return (zoneid); 5805 } 5806 } 5807 mutex_exit(&ill->ill_lock); 5808 } 5809 5810 /* If we already did the ptp case, then we are done */ 5811 if (ptp) { 5812 rw_exit(&ill_g_lock); 5813 return (ALL_ZONES); 5814 } 5815 ptp = B_TRUE; 5816 goto repeat; 5817 } 5818 5819 /* 5820 * Look for an ipif that matches the specified remote address i.e. the 5821 * ipif that would receive the specified packet. 5822 * First look for directly connected interfaces and then do a recursive 5823 * IRE lookup and pick the first ipif corresponding to the source address in the 5824 * ire. 5825 * Returns: held ipif 5826 */ 5827 ipif_t * 5828 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 5829 { 5830 ipif_t *ipif; 5831 ire_t *ire; 5832 5833 ASSERT(!ill->ill_isv6); 5834 5835 /* 5836 * Someone could be changing this ipif currently or change it 5837 * after we return this. Thus a few packets could use the old 5838 * values. However, structure updates/creates (ire, ilg, ilm etc) 5839 * will atomically be updated or cleaned up with the new value. 5840 * Thus we don't need a lock to check the flags or other attrs below. 5841 */ 5842 mutex_enter(&ill->ill_lock); 5843 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5844 if (!IPIF_CAN_LOOKUP(ipif)) 5845 continue; 5846 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 5847 ipif->ipif_zoneid != ALL_ZONES) 5848 continue; 5849 /* Allow the ipif to be down */ 5850 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 5851 if ((ipif->ipif_pp_dst_addr == addr) || 5852 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 5853 ipif->ipif_lcl_addr == addr)) { 5854 ipif_refhold_locked(ipif); 5855 mutex_exit(&ill->ill_lock); 5856 return (ipif); 5857 } 5858 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 5859 ipif_refhold_locked(ipif); 5860 mutex_exit(&ill->ill_lock); 5861 return (ipif); 5862 } 5863 } 5864 mutex_exit(&ill->ill_lock); 5865 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 5866 NULL, MATCH_IRE_RECURSIVE); 5867 if (ire != NULL) { 5868 /* 5869 * The callers of this function want to know the 5870 * interface on which they have to send the replies 5871 * back. For IRE_CACHES that have ire_stq and ire_ipif 5872 * derived from different ills, we really don't care 5873 * what we return here. 5874 */ 5875 ipif = ire->ire_ipif; 5876 if (ipif != NULL) { 5877 ipif_refhold(ipif); 5878 ire_refrele(ire); 5879 return (ipif); 5880 } 5881 ire_refrele(ire); 5882 } 5883 /* Pick the first interface */ 5884 ipif = ipif_get_next_ipif(NULL, ill); 5885 return (ipif); 5886 } 5887 5888 /* 5889 * This func does not prevent refcnt from increasing.
But if 5890 * the caller has taken steps to that effect, then this func 5891 * can be used to determine whether the ill has become quiescent 5892 */ 5893 boolean_t 5894 ill_is_quiescent(ill_t *ill) 5895 { 5896 ipif_t *ipif; 5897 5898 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5899 5900 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5901 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5902 return (B_FALSE); 5903 } 5904 } 5905 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 5906 ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 || 5907 ill->ill_mrtun_refcnt != 0) { 5908 return (B_FALSE); 5909 } 5910 return (B_TRUE); 5911 } 5912 5913 /* 5914 * This func does not prevent refcnt from increasing. But if 5915 * the caller has taken steps to that effect, then this func 5916 * can be used to determine whether the ipif has become quiescent 5917 */ 5918 static boolean_t 5919 ipif_is_quiescent(ipif_t *ipif) 5920 { 5921 ill_t *ill; 5922 5923 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5924 5925 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5926 return (B_FALSE); 5927 } 5928 5929 ill = ipif->ipif_ill; 5930 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 5931 ill->ill_logical_down) { 5932 return (B_TRUE); 5933 } 5934 5935 /* This is the last ipif going down or being deleted on this ill */ 5936 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 5937 return (B_FALSE); 5938 } 5939 5940 return (B_TRUE); 5941 } 5942 5943 /* 5944 * This func does not prevent refcnt from increasing. But if 5945 * the caller has taken steps to that effect, then this func 5946 * can be used to determine whether the ipifs marked with IPIF_MOVING 5947 * have become quiescent and can be moved in a failover/failback. 5948 */ 5949 static ipif_t * 5950 ill_quiescent_to_move(ill_t *ill) 5951 { 5952 ipif_t *ipif; 5953 5954 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5955 5956 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5957 if (ipif->ipif_state_flags & IPIF_MOVING) { 5958 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5959 return (ipif); 5960 } 5961 } 5962 } 5963 return (NULL); 5964 } 5965 5966 /* 5967 * The ipif/ill/ire has been refreled. Do the tail processing. 5968 * Determine if the ipif or ill in question has become quiescent and if so 5969 * wakeup close and/or restart any queued pending ioctl that is waiting 5970 * for the ipif_down (or ill_down) 5971 */ 5972 void 5973 ipif_ill_refrele_tail(ill_t *ill) 5974 { 5975 mblk_t *mp; 5976 conn_t *connp; 5977 ipsq_t *ipsq; 5978 ipif_t *ipif; 5979 5980 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5981 5982 if ((ill->ill_state_flags & ILL_CONDEMNED) && 5983 ill_is_quiescent(ill)) { 5984 /* ill_close may be waiting */ 5985 cv_broadcast(&ill->ill_cv); 5986 } 5987 5988 /* ipsq can't change because ill_lock is held */ 5989 ipsq = ill->ill_phyint->phyint_ipsq; 5990 if (ipsq->ipsq_waitfor == 0) { 5991 /* Not waiting for anything, just return. */ 5992 mutex_exit(&ill->ill_lock); 5993 return; 5994 } 5995 ASSERT(ipsq->ipsq_pending_mp != NULL && 5996 ipsq->ipsq_pending_ipif != NULL); 5997 /* 5998 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 5999 * Last ipif going down needs to down the ill, so ill_ire_cnt must 6000 * be zero for restarting an ioctl that ends up downing the ill. 6001 */ 6002 ipif = ipsq->ipsq_pending_ipif; 6003 if (ipif->ipif_ill != ill) { 6004 /* The ioctl is pending on some other ill. 
*/ 6005 mutex_exit(&ill->ill_lock); 6006 return; 6007 } 6008 6009 switch (ipsq->ipsq_waitfor) { 6010 case IPIF_DOWN: 6011 case IPIF_FREE: 6012 if (!ipif_is_quiescent(ipif)) { 6013 mutex_exit(&ill->ill_lock); 6014 return; 6015 } 6016 break; 6017 6018 case ILL_DOWN: 6019 case ILL_FREE: 6020 /* 6021 * case ILL_FREE arises only for loopback. otherwise ill_delete 6022 * waits synchronously in ip_close, and no message is queued in 6023 * ipsq_pending_mp at all in this case 6024 */ 6025 if (!ill_is_quiescent(ill)) { 6026 mutex_exit(&ill->ill_lock); 6027 return; 6028 } 6029 6030 break; 6031 6032 case ILL_MOVE_OK: 6033 if (ill_quiescent_to_move(ill) != NULL) { 6034 mutex_exit(&ill->ill_lock); 6035 return; 6036 } 6037 6038 break; 6039 default: 6040 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", 6041 (void *)ipsq, ipsq->ipsq_waitfor); 6042 } 6043 6044 /* 6045 * Incr refcnt for the qwriter_ip call below which 6046 * does a refrele 6047 */ 6048 ill_refhold_locked(ill); 6049 mutex_exit(&ill->ill_lock); 6050 6051 mp = ipsq_pending_mp_get(ipsq, &connp); 6052 ASSERT(mp != NULL); 6053 6054 switch (mp->b_datap->db_type) { 6055 case M_ERROR: 6056 case M_HANGUP: 6057 (void) qwriter_ip(NULL, ill, ill->ill_rq, mp, 6058 ipif_all_down_tail, CUR_OP, B_TRUE); 6059 return; 6060 6061 case M_IOCTL: 6062 case M_IOCDATA: 6063 (void) qwriter_ip(NULL, ill, 6064 (connp != NULL ? CONNP_TO_WQ(connp) : ill->ill_wq), mp, 6065 ip_reprocess_ioctl, CUR_OP, B_TRUE); 6066 return; 6067 6068 default: 6069 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 6070 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 6071 } 6072 } 6073 6074 #ifdef ILL_DEBUG 6075 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 6076 void 6077 th_trace_rrecord(th_trace_t *th_trace) 6078 { 6079 tr_buf_t *tr_buf; 6080 uint_t lastref; 6081 6082 lastref = th_trace->th_trace_lastref; 6083 lastref++; 6084 if (lastref == TR_BUF_MAX) 6085 lastref = 0; 6086 th_trace->th_trace_lastref = lastref; 6087 tr_buf = &th_trace->th_trbuf[lastref]; 6088 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH); 6089 } 6090 6091 th_trace_t * 6092 th_trace_ipif_lookup(ipif_t *ipif) 6093 { 6094 int bucket_id; 6095 th_trace_t *th_trace; 6096 6097 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6098 6099 bucket_id = IP_TR_HASH(curthread); 6100 ASSERT(bucket_id < IP_TR_HASH_MAX); 6101 6102 for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL; 6103 th_trace = th_trace->th_next) { 6104 if (th_trace->th_id == curthread) 6105 return (th_trace); 6106 } 6107 return (NULL); 6108 } 6109 6110 void 6111 ipif_trace_ref(ipif_t *ipif) 6112 { 6113 int bucket_id; 6114 th_trace_t *th_trace; 6115 6116 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6117 6118 if (ipif->ipif_trace_disable) 6119 return; 6120 6121 /* 6122 * Attempt to locate the trace buffer for the curthread. 
6123 * If it does not exist, then allocate a new trace buffer 6124 * and link it in list of trace bufs for this ipif, at the head 6125 */ 6126 th_trace = th_trace_ipif_lookup(ipif); 6127 if (th_trace == NULL) { 6128 bucket_id = IP_TR_HASH(curthread); 6129 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6130 KM_NOSLEEP); 6131 if (th_trace == NULL) { 6132 ipif->ipif_trace_disable = B_TRUE; 6133 ipif_trace_cleanup(ipif); 6134 return; 6135 } 6136 th_trace->th_id = curthread; 6137 th_trace->th_next = ipif->ipif_trace[bucket_id]; 6138 th_trace->th_prev = &ipif->ipif_trace[bucket_id]; 6139 if (th_trace->th_next != NULL) 6140 th_trace->th_next->th_prev = &th_trace->th_next; 6141 ipif->ipif_trace[bucket_id] = th_trace; 6142 } 6143 ASSERT(th_trace->th_refcnt >= 0 && 6144 th_trace->th_refcnt < TR_BUF_MAX -1); 6145 th_trace->th_refcnt++; 6146 th_trace_rrecord(th_trace); 6147 } 6148 6149 void 6150 ipif_untrace_ref(ipif_t *ipif) 6151 { 6152 th_trace_t *th_trace; 6153 6154 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6155 6156 if (ipif->ipif_trace_disable) 6157 return; 6158 th_trace = th_trace_ipif_lookup(ipif); 6159 ASSERT(th_trace != NULL); 6160 ASSERT(th_trace->th_refcnt > 0); 6161 6162 th_trace->th_refcnt--; 6163 th_trace_rrecord(th_trace); 6164 } 6165 6166 th_trace_t * 6167 th_trace_ill_lookup(ill_t *ill) 6168 { 6169 th_trace_t *th_trace; 6170 int bucket_id; 6171 6172 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6173 6174 bucket_id = IP_TR_HASH(curthread); 6175 ASSERT(bucket_id < IP_TR_HASH_MAX); 6176 6177 for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL; 6178 th_trace = th_trace->th_next) { 6179 if (th_trace->th_id == curthread) 6180 return (th_trace); 6181 } 6182 return (NULL); 6183 } 6184 6185 void 6186 ill_trace_ref(ill_t *ill) 6187 { 6188 int bucket_id; 6189 th_trace_t *th_trace; 6190 6191 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6192 if (ill->ill_trace_disable) 6193 return; 6194 /* 6195 * Attempt to locate the trace buffer for the curthread. 
6196 * If it does not exist, then allocate a new trace buffer 6197 * and link it in list of trace bufs for this ill, at the head 6198 */ 6199 th_trace = th_trace_ill_lookup(ill); 6200 if (th_trace == NULL) { 6201 bucket_id = IP_TR_HASH(curthread); 6202 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 6203 KM_NOSLEEP); 6204 if (th_trace == NULL) { 6205 ill->ill_trace_disable = B_TRUE; 6206 ill_trace_cleanup(ill); 6207 return; 6208 } 6209 th_trace->th_id = curthread; 6210 th_trace->th_next = ill->ill_trace[bucket_id]; 6211 th_trace->th_prev = &ill->ill_trace[bucket_id]; 6212 if (th_trace->th_next != NULL) 6213 th_trace->th_next->th_prev = &th_trace->th_next; 6214 ill->ill_trace[bucket_id] = th_trace; 6215 } 6216 ASSERT(th_trace->th_refcnt >= 0 && 6217 th_trace->th_refcnt < TR_BUF_MAX - 1); 6218 6219 th_trace->th_refcnt++; 6220 th_trace_rrecord(th_trace); 6221 } 6222 6223 void 6224 ill_untrace_ref(ill_t *ill) 6225 { 6226 th_trace_t *th_trace; 6227 6228 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6229 6230 if (ill->ill_trace_disable) 6231 return; 6232 th_trace = th_trace_ill_lookup(ill); 6233 ASSERT(th_trace != NULL); 6234 ASSERT(th_trace->th_refcnt > 0); 6235 6236 th_trace->th_refcnt--; 6237 th_trace_rrecord(th_trace); 6238 } 6239 6240 /* 6241 * Verify that this thread has no refs to the ipif and free 6242 * the trace buffers 6243 */ 6244 /* ARGSUSED */ 6245 void 6246 ipif_thread_exit(ipif_t *ipif, void *dummy) 6247 { 6248 th_trace_t *th_trace; 6249 6250 mutex_enter(&ipif->ipif_ill->ill_lock); 6251 6252 th_trace = th_trace_ipif_lookup(ipif); 6253 if (th_trace == NULL) { 6254 mutex_exit(&ipif->ipif_ill->ill_lock); 6255 return; 6256 } 6257 ASSERT(th_trace->th_refcnt == 0); 6258 /* unlink th_trace and free it */ 6259 *th_trace->th_prev = th_trace->th_next; 6260 if (th_trace->th_next != NULL) 6261 th_trace->th_next->th_prev = th_trace->th_prev; 6262 th_trace->th_next = NULL; 6263 th_trace->th_prev = NULL; 6264 kmem_free(th_trace, sizeof (th_trace_t)); 6265 6266 mutex_exit(&ipif->ipif_ill->ill_lock); 6267 } 6268 6269 /* 6270 * Verify that this thread has no refs to the ill and free 6271 * the trace buffers 6272 */ 6273 /* ARGSUSED */ 6274 void 6275 ill_thread_exit(ill_t *ill, void *dummy) 6276 { 6277 th_trace_t *th_trace; 6278 6279 mutex_enter(&ill->ill_lock); 6280 6281 th_trace = th_trace_ill_lookup(ill); 6282 if (th_trace == NULL) { 6283 mutex_exit(&ill->ill_lock); 6284 return; 6285 } 6286 ASSERT(th_trace->th_refcnt == 0); 6287 /* unlink th_trace and free it */ 6288 *th_trace->th_prev = th_trace->th_next; 6289 if (th_trace->th_next != NULL) 6290 th_trace->th_next->th_prev = th_trace->th_prev; 6291 th_trace->th_next = NULL; 6292 th_trace->th_prev = NULL; 6293 kmem_free(th_trace, sizeof (th_trace_t)); 6294 6295 mutex_exit(&ill->ill_lock); 6296 } 6297 #endif 6298 6299 #ifdef ILL_DEBUG 6300 void 6301 ip_thread_exit(void) 6302 { 6303 ill_t *ill; 6304 ipif_t *ipif; 6305 ill_walk_context_t ctx; 6306 6307 rw_enter(&ill_g_lock, RW_READER); 6308 ill = ILL_START_WALK_ALL(&ctx); 6309 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6310 for (ipif = ill->ill_ipif; ipif != NULL; 6311 ipif = ipif->ipif_next) { 6312 ipif_thread_exit(ipif, NULL); 6313 } 6314 ill_thread_exit(ill, NULL); 6315 } 6316 rw_exit(&ill_g_lock); 6317 6318 ire_walk(ire_thread_exit, NULL); 6319 ndp_walk_common(&ndp4, NULL, nce_thread_exit, NULL, B_FALSE); 6320 ndp_walk_common(&ndp6, NULL, nce_thread_exit, NULL, B_FALSE); 6321 } 6322 6323 /* 6324 * Called when ipif is unplumbed or when memory alloc fails 6325 */ 6326 void 6327 
ipif_trace_cleanup(ipif_t *ipif) 6328 { 6329 int i; 6330 th_trace_t *th_trace; 6331 th_trace_t *th_trace_next; 6332 6333 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6334 for (th_trace = ipif->ipif_trace[i]; th_trace != NULL; 6335 th_trace = th_trace_next) { 6336 th_trace_next = th_trace->th_next; 6337 kmem_free(th_trace, sizeof (th_trace_t)); 6338 } 6339 ipif->ipif_trace[i] = NULL; 6340 } 6341 } 6342 6343 /* 6344 * Called when ill is unplumbed or when memory alloc fails 6345 */ 6346 void 6347 ill_trace_cleanup(ill_t *ill) 6348 { 6349 int i; 6350 th_trace_t *th_trace; 6351 th_trace_t *th_trace_next; 6352 6353 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6354 for (th_trace = ill->ill_trace[i]; th_trace != NULL; 6355 th_trace = th_trace_next) { 6356 th_trace_next = th_trace->th_next; 6357 kmem_free(th_trace, sizeof (th_trace_t)); 6358 } 6359 ill->ill_trace[i] = NULL; 6360 } 6361 } 6362 6363 #else 6364 void ip_thread_exit(void) {} 6365 #endif 6366 6367 void 6368 ipif_refhold_locked(ipif_t *ipif) 6369 { 6370 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6371 ipif->ipif_refcnt++; 6372 IPIF_TRACE_REF(ipif); 6373 } 6374 6375 void 6376 ipif_refhold(ipif_t *ipif) 6377 { 6378 ill_t *ill; 6379 6380 ill = ipif->ipif_ill; 6381 mutex_enter(&ill->ill_lock); 6382 ipif->ipif_refcnt++; 6383 IPIF_TRACE_REF(ipif); 6384 mutex_exit(&ill->ill_lock); 6385 } 6386 6387 /* 6388 * Must not be called while holding any locks. Otherwise if this is 6389 * the last reference to be released there is a chance of recursive mutex 6390 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6391 * to restart an ioctl. 6392 */ 6393 void 6394 ipif_refrele(ipif_t *ipif) 6395 { 6396 ill_t *ill; 6397 6398 ill = ipif->ipif_ill; 6399 6400 mutex_enter(&ill->ill_lock); 6401 ASSERT(ipif->ipif_refcnt != 0); 6402 ipif->ipif_refcnt--; 6403 IPIF_UNTRACE_REF(ipif); 6404 if (ipif->ipif_refcnt != 0) { 6405 mutex_exit(&ill->ill_lock); 6406 return; 6407 } 6408 6409 /* Drops the ill_lock */ 6410 ipif_ill_refrele_tail(ill); 6411 } 6412 6413 ipif_t * 6414 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6415 { 6416 ipif_t *ipif; 6417 6418 mutex_enter(&ill->ill_lock); 6419 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 6420 ipif != NULL; ipif = ipif->ipif_next) { 6421 if (!IPIF_CAN_LOOKUP(ipif)) 6422 continue; 6423 ipif_refhold_locked(ipif); 6424 mutex_exit(&ill->ill_lock); 6425 return (ipif); 6426 } 6427 mutex_exit(&ill->ill_lock); 6428 return (NULL); 6429 } 6430 6431 /* 6432 * TODO: make this table extendible at run time 6433 * Return a pointer to the mac type info for 'mac_type' 6434 */ 6435 static ip_m_t * 6436 ip_m_lookup(t_uscalar_t mac_type) 6437 { 6438 ip_m_t *ipm; 6439 6440 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6441 if (ipm->ip_m_mac_type == mac_type) 6442 return (ipm); 6443 return (NULL); 6444 } 6445 6446 /* 6447 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6448 * ipif_arg is passed in to associate it with the correct interface. 6449 * We may need to restart this operation if the ipif cannot be looked up 6450 * due to an exclusive operation that is currently in progress. 
The restart 6451 * entry point is specified by 'func' 6452 */ 6453 int 6454 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6455 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6456 ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp, 6457 ipsq_func_t func, struct rtsa_s *sp) 6458 { 6459 ire_t *ire; 6460 ire_t *gw_ire = NULL; 6461 ipif_t *ipif = NULL; 6462 boolean_t ipif_refheld = B_FALSE; 6463 uint_t type; 6464 int match_flags = MATCH_IRE_TYPE; 6465 int error; 6466 tsol_gc_t *gc = NULL; 6467 tsol_gcgrp_t *gcgrp = NULL; 6468 boolean_t gcgrp_xtraref = B_FALSE; 6469 6470 ip1dbg(("ip_rt_add:")); 6471 6472 if (ire_arg != NULL) 6473 *ire_arg = NULL; 6474 6475 /* 6476 * If this is the case of RTF_HOST being set, then we set the netmask 6477 * to all ones (regardless of whether one was supplied). 6478 */ 6479 if (flags & RTF_HOST) 6480 mask = IP_HOST_MASK; 6481 6482 /* 6483 * Prevent routes with a zero gateway from being created (since 6484 * interfaces can currently be plumbed and brought up with no assigned 6485 * address). 6486 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0. 6487 */ 6488 if (gw_addr == 0 && src_ipif == NULL) 6489 return (ENETUNREACH); 6490 /* 6491 * Get the ipif, if any, corresponding to the gw_addr 6492 */ 6493 if (gw_addr != 0) { 6494 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, 6495 &error); 6496 if (ipif != NULL) { 6497 if (IS_VNI(ipif->ipif_ill)) { 6498 ipif_refrele(ipif); 6499 return (EINVAL); 6500 } 6501 ipif_refheld = B_TRUE; 6502 } else if (error == EINPROGRESS) { 6503 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6504 return (EINPROGRESS); 6505 } else { 6506 error = 0; 6507 } 6508 } 6509 6510 if (ipif != NULL) { 6511 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6512 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6513 } else { 6514 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6515 } 6516 6517 /* 6518 * GateD will attempt to create routes with a loopback interface 6519 * address as the gateway and with RTF_GATEWAY set. We allow 6520 * these routes to be added, but create them as interface routes 6521 * since the gateway is an interface address. 6522 */ 6523 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6524 flags &= ~RTF_GATEWAY; 6525 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6526 mask == IP_HOST_MASK) { 6527 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6528 ALL_ZONES, NULL, match_flags); 6529 if (ire != NULL) { 6530 ire_refrele(ire); 6531 if (ipif_refheld) 6532 ipif_refrele(ipif); 6533 return (EEXIST); 6534 } 6535 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x" 6536 "for 0x%x\n", (void *)ipif, 6537 ipif->ipif_ire_type, 6538 ntohl(ipif->ipif_lcl_addr))); 6539 ire = ire_create( 6540 (uchar_t *)&dst_addr, /* dest address */ 6541 (uchar_t *)&mask, /* mask */ 6542 (uchar_t *)&ipif->ipif_src_addr, 6543 NULL, /* no gateway */ 6544 NULL, 6545 &ipif->ipif_mtu, 6546 NULL, 6547 ipif->ipif_rq, /* recv-from queue */ 6548 NULL, /* no send-to queue */ 6549 ipif->ipif_ire_type, /* LOOPBACK */ 6550 NULL, 6551 ipif, 6552 NULL, 6553 0, 6554 0, 6555 0, 6556 (ipif->ipif_flags & IPIF_PRIVATE) ?
6557 RTF_PRIVATE : 0, 6558 &ire_uinfo_null, 6559 NULL, 6560 NULL); 6561 6562 if (ire == NULL) { 6563 if (ipif_refheld) 6564 ipif_refrele(ipif); 6565 return (ENOMEM); 6566 } 6567 error = ire_add(&ire, q, mp, func, B_FALSE); 6568 if (error == 0) 6569 goto save_ire; 6570 if (ipif_refheld) 6571 ipif_refrele(ipif); 6572 return (error); 6573 6574 } 6575 } 6576 6577 /* 6578 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6579 * and the gateway address provided is one of the system's interface 6580 * addresses. By using the routing socket interface and supplying an 6581 * RTA_IFP sockaddr with an interface index, an alternate method of 6582 * specifying an interface route to be created is available which uses 6583 * the interface index that specifies the outgoing interface rather than 6584 * the address of an outgoing interface (which may not be able to 6585 * uniquely identify an interface). When coupled with the RTF_GATEWAY 6586 * flag, routes can be specified which not only specify the next-hop to 6587 * be used when routing to a certain prefix, but also which outgoing 6588 * interface should be used. 6589 * 6590 * Previously, interfaces would have unique addresses assigned to them 6591 * and so the address assigned to a particular interface could be used 6592 * to identify a particular interface. One exception to this was the 6593 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6594 * 6595 * With the advent of IPv6 and its link-local addresses, this 6596 * restriction was relaxed and interfaces could share addresses between 6597 * themselves. In fact, typically all of the link-local interfaces on 6598 * an IPv6 node or router will have the same link-local address. In 6599 * order to differentiate between these interfaces, the use of an 6600 * interface index is necessary and this index can be carried inside a 6601 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6602 * of using the interface index, however, is that all of the ipif's that 6603 * are part of an ill have the same index and so the RTA_IFP sockaddr 6604 * cannot be used to differentiate between ipif's (or logical 6605 * interfaces) that belong to the same ill (physical interface). 6606 * 6607 * For example, in the following case involving IPv4 interfaces and 6608 * logical interfaces 6609 * 6610 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6611 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6612 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6613 * 6614 * the ipif's corresponding to each of these interface routes can be 6615 * uniquely identified by the "gateway" (actually interface address). 6616 * 6617 * In this case involving multiple IPv6 default routes to a particular 6618 * link-local gateway, the use of RTA_IFP is necessary to specify which 6619 * default route is of interest: 6620 * 6621 * default fe80::123:4567:89ab:cdef U if0 6622 * default fe80::123:4567:89ab:cdef U if1 6623 */ 6624 6625 /* RTF_GATEWAY not set */ 6626 if (!(flags & RTF_GATEWAY)) { 6627 queue_t *stq; 6628 queue_t *rfq = NULL; 6629 ill_t *in_ill = NULL; 6630 6631 if (sp != NULL) { 6632 ip2dbg(("ip_rt_add: gateway security attributes " 6633 "cannot be set with interface route\n")); 6634 if (ipif_refheld) 6635 ipif_refrele(ipif); 6636 return (EINVAL); 6637 } 6638 6639 /* 6640 * As the interface index specified with the RTA_IFP sockaddr is 6641 * the same for all ipif's off of an ill, the matching logic 6642 * below uses MATCH_IRE_ILL if such an index was specified. 
6643 * This means that routes sharing the same prefix when added 6644 * using a RTA_IFP sockaddr must have distinct interface 6645 * indices (namely, they must be on distinct ill's). 6646 * 6647 * On the other hand, since the gateway address will usually be 6648 * different for each ipif on the system, the matching logic 6649 * uses MATCH_IRE_IPIF in the case of a traditional interface 6650 * route. This means that interface routes for the same prefix 6651 * can be created if they belong to distinct ipif's and if a 6652 * RTA_IFP sockaddr is not present. 6653 */ 6654 if (ipif_arg != NULL) { 6655 if (ipif_refheld) { 6656 ipif_refrele(ipif); 6657 ipif_refheld = B_FALSE; 6658 } 6659 ipif = ipif_arg; 6660 match_flags |= MATCH_IRE_ILL; 6661 } else { 6662 /* 6663 * Check the ipif corresponding to the gw_addr 6664 */ 6665 if (ipif == NULL) 6666 return (ENETUNREACH); 6667 match_flags |= MATCH_IRE_IPIF; 6668 } 6669 ASSERT(ipif != NULL); 6670 /* 6671 * If src_ipif is not NULL, we have to create 6672 * an ire with non-null ire_in_ill value 6673 */ 6674 if (src_ipif != NULL) { 6675 in_ill = src_ipif->ipif_ill; 6676 } 6677 6678 /* 6679 * We check for an existing entry at this point. 6680 * 6681 * Since a netmask isn't passed in via the ioctl interface 6682 * (SIOCADDRT), we don't check for a matching netmask in that 6683 * case. 6684 */ 6685 if (!ioctl_msg) 6686 match_flags |= MATCH_IRE_MASK; 6687 if (src_ipif != NULL) { 6688 /* Look up in the special table */ 6689 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6690 ipif, src_ipif->ipif_ill, match_flags); 6691 } else { 6692 ire = ire_ftable_lookup(dst_addr, mask, 0, 6693 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6694 NULL, match_flags); 6695 } 6696 if (ire != NULL) { 6697 ire_refrele(ire); 6698 if (ipif_refheld) 6699 ipif_refrele(ipif); 6700 return (EEXIST); 6701 } 6702 6703 if (src_ipif != NULL) { 6704 /* 6705 * Create the special ire for the IRE table 6706 * which hangs out of ire_in_ill. This ire 6707 * is in-between IRE_CACHE and IRE_INTERFACE. 6708 * Thus rfq is non-NULL. 6709 */ 6710 rfq = ipif->ipif_rq; 6711 } 6712 /* Create the usual interface ires */ 6713 6714 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6715 ? ipif->ipif_rq : ipif->ipif_wq; 6716 6717 /* 6718 * Create a copy of the IRE_LOOPBACK, 6719 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6720 * the modified address and netmask. 6721 */ 6722 ire = ire_create( 6723 (uchar_t *)&dst_addr, 6724 (uint8_t *)&mask, 6725 (uint8_t *)&ipif->ipif_src_addr, 6726 NULL, 6727 NULL, 6728 &ipif->ipif_mtu, 6729 NULL, 6730 rfq, 6731 stq, 6732 ipif->ipif_net_type, 6733 ipif->ipif_resolver_mp, 6734 ipif, 6735 in_ill, 6736 0, 6737 0, 6738 0, 6739 flags, 6740 &ire_uinfo_null, 6741 NULL, 6742 NULL); 6743 if (ire == NULL) { 6744 if (ipif_refheld) 6745 ipif_refrele(ipif); 6746 return (ENOMEM); 6747 } 6748 6749 /* 6750 * Some software (for example, GateD and Sun Cluster) attempts 6751 * to create (what amount to) IRE_PREFIX routes with the 6752 * loopback address as the gateway. This is primarily done to 6753 * set up prefixes with the RTF_REJECT flag set (for example, 6754 * when generating aggregate routes.) 6755 * 6756 * If the IRE type (as defined by ipif->ipif_net_type) is 6757 * IRE_LOOPBACK, then we map the request into a 6758 * IRE_IF_NORESOLVER. 6759 * 6760 * Needless to say, the real IRE_LOOPBACK is NOT created by this 6761 * routine, but rather using ire_create() directly. 
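 * For example (illustrative only): an aggregate reject route such as 198.51.100.0/24 via 127.0.0.1 with RTF_REJECT set arrives here with the loopback address as the gateway; because the ipif's ipif_net_type is IRE_LOOPBACK, the request is mapped to an IRE_IF_NORESOLVER just below.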
6762 * 6763 */ 6764 if (ipif->ipif_net_type == IRE_LOOPBACK) 6765 ire->ire_type = IRE_IF_NORESOLVER; 6766 6767 error = ire_add(&ire, q, mp, func, B_FALSE); 6768 if (error == 0) 6769 goto save_ire; 6770 6771 /* 6772 * In the event of failure, ire_add() will have already 6773 * deleted the ire in question, so there is no need to 6774 * do that here. 6775 */ 6776 if (ipif_refheld) 6777 ipif_refrele(ipif); 6778 return (error); 6779 } 6780 if (ipif_refheld) { 6781 ipif_refrele(ipif); 6782 ipif_refheld = B_FALSE; 6783 } 6784 6785 if (src_ipif != NULL) { 6786 /* RTA_SRCIFP is not supported on RTF_GATEWAY */ 6787 ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n")); 6788 return (EINVAL); 6789 } 6790 /* 6791 * Get an interface IRE for the specified gateway. 6792 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 6793 * gateway, it is currently unreachable and we fail the request 6794 * accordingly. 6795 */ 6796 ipif = ipif_arg; 6797 if (ipif_arg != NULL) 6798 match_flags |= MATCH_IRE_ILL; 6799 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 6800 ALL_ZONES, 0, NULL, match_flags); 6801 if (gw_ire == NULL) 6802 return (ENETUNREACH); 6803 6804 /* 6805 * We create one of three types of IREs as a result of this request 6806 * based on the netmask. A netmask of all ones (which is automatically 6807 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 6808 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 6809 * created. Otherwise, an IRE_PREFIX route is created for the 6810 * destination prefix. 6811 */ 6812 if (mask == IP_HOST_MASK) 6813 type = IRE_HOST; 6814 else if (mask == 0) 6815 type = IRE_DEFAULT; 6816 else 6817 type = IRE_PREFIX; 6818 6819 /* check for a duplicate entry */ 6820 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 6821 NULL, ALL_ZONES, 0, NULL, 6822 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW); 6823 if (ire != NULL) { 6824 ire_refrele(gw_ire); 6825 ire_refrele(ire); 6826 return (EEXIST); 6827 } 6828 6829 /* Security attribute exists */ 6830 if (sp != NULL) { 6831 tsol_gcgrp_addr_t ga; 6832 6833 /* find or create the gateway credentials group */ 6834 ga.ga_af = AF_INET; 6835 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 6836 6837 /* we hold reference to it upon success */ 6838 gcgrp = gcgrp_lookup(&ga, B_TRUE); 6839 if (gcgrp == NULL) { 6840 ire_refrele(gw_ire); 6841 return (ENOMEM); 6842 } 6843 6844 /* 6845 * Create and add the security attribute to the group; a 6846 * reference to the group is made upon allocating a new 6847 * entry successfully. If it finds an already-existing 6848 * entry for the security attribute in the group, it simply 6849 * returns it and no new reference is made to the group. 6850 */ 6851 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 6852 if (gc == NULL) { 6853 /* release reference held by gcgrp_lookup */ 6854 GCGRP_REFRELE(gcgrp); 6855 ire_refrele(gw_ire); 6856 return (ENOMEM); 6857 } 6858 } 6859 6860 /* Create the IRE. */ 6861 ire = ire_create( 6862 (uchar_t *)&dst_addr, /* dest address */ 6863 (uchar_t *)&mask, /* mask */ 6864 /* src address assigned by the caller? */ 6865 (uchar_t *)(((src_addr != INADDR_ANY) && 6866 (flags & RTF_SETSRC)) ?
&src_addr : NULL), 6867 (uchar_t *)&gw_addr, /* gateway address */ 6868 NULL, /* no in-srcaddress */ 6869 &gw_ire->ire_max_frag, 6870 NULL, /* no Fast Path header */ 6871 NULL, /* no recv-from queue */ 6872 NULL, /* no send-to queue */ 6873 (ushort_t)type, /* IRE type */ 6874 NULL, 6875 ipif_arg, 6876 NULL, 6877 0, 6878 0, 6879 0, 6880 flags, 6881 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 6882 gc, /* security attribute */ 6883 NULL); 6884 /* 6885 * The ire holds a reference to the 'gc' and the 'gc' holds a 6886 * reference to the 'gcgrp'. We can now release the extra reference 6887 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 6888 */ 6889 if (gcgrp_xtraref) 6890 GCGRP_REFRELE(gcgrp); 6891 if (ire == NULL) { 6892 if (gc != NULL) 6893 GC_REFRELE(gc); 6894 ire_refrele(gw_ire); 6895 return (ENOMEM); 6896 } 6897 6898 /* 6899 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 6900 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 6901 */ 6902 6903 /* Add the new IRE. */ 6904 error = ire_add(&ire, q, mp, func, B_FALSE); 6905 if (error != 0) { 6906 /* 6907 * In the event of failure, ire_add() will have already 6908 * deleted the ire in question, so there is no need to 6909 * do that here. 6910 */ 6911 ire_refrele(gw_ire); 6912 return (error); 6913 } 6914 6915 if (flags & RTF_MULTIRT) { 6916 /* 6917 * Invoke the CGTP (multirouting) filtering module 6918 * to add the dst address in the filtering database. 6919 * Replicated inbound packets coming from that address 6920 * will be filtered to discard the duplicates. 6921 * It is not necessary to call the CGTP filter hook 6922 * when the dst address is a broadcast or multicast, 6923 * because an IP source address cannot be a broadcast 6924 * or a multicast. 6925 */ 6926 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 6927 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 6928 if (ire_dst != NULL) { 6929 ip_cgtp_bcast_add(ire, ire_dst); 6930 ire_refrele(ire_dst); 6931 goto save_ire; 6932 } 6933 if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr)) { 6934 int res = ip_cgtp_filter_ops->cfo_add_dest_v4( 6935 ire->ire_addr, 6936 ire->ire_gateway_addr, 6937 ire->ire_src_addr, 6938 gw_ire->ire_src_addr); 6939 if (res != 0) { 6940 ire_refrele(gw_ire); 6941 ire_delete(ire); 6942 return (res); 6943 } 6944 } 6945 } 6946 6947 /* 6948 * Now that the prefix IRE entry has been created, delete any 6949 * existing gateway IRE cache entries as well as any IRE caches 6950 * using the gateway, and force them to be created through 6951 * ip_newroute. 6952 */ 6953 if (gc != NULL) { 6954 ASSERT(gcgrp != NULL); 6955 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES); 6956 } 6957 6958 save_ire: 6959 if (gw_ire != NULL) { 6960 ire_refrele(gw_ire); 6961 } 6962 /* 6963 * We do not do save_ire for the routes added with the RTA_SRCIFP 6964 * flag. This route is only added and deleted by mipagent. 6965 * So, for simplicity of design, we refrain from saving 6966 * ires that are created with srcif value. This may change 6967 * in the future if we find more usage of the srcifp feature. 6968 */ 6969 if (ipif != NULL && src_ipif == NULL) { 6970 /* 6971 * Save enough information so that we can recreate the IRE if 6972 * the interface goes down and then up. The metrics associated 6973 * with the route will be saved as well when rts_setmetrics() is 6974 * called after the IRE has been created. In the case where 6975 * memory cannot be allocated, none of this information will be 6976 * saved.
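 * (Illustration, not from the original comment: a route added over a logical interface such as hme0:1 is remembered on that ipif so that, after the address is brought down and back up, the saved entry can be replayed to re-create the IRE without another routing ioctl.)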
6977 */ 6978 ipif_save_ire(ipif, ire); 6979 } 6980 if (ioctl_msg) 6981 ip_rts_rtmsg(RTM_OLDADD, ire, 0); 6982 if (ire_arg != NULL) { 6983 /* 6984 * Store the ire that was successfully added into where ire_arg 6985 * points to so that callers don't have to look it up 6986 * themselves (but they are responsible for ire_refrele()ing 6987 * the ire when they are finished with it). 6988 */ 6989 *ire_arg = ire; 6990 } else { 6991 ire_refrele(ire); /* Held in ire_add */ 6992 } 6993 if (ipif_refheld) 6994 ipif_refrele(ipif); 6995 return (0); 6996 } 6997 6998 /* 6999 * ip_rt_delete is called to delete an IPv4 route. 7000 * ipif_arg is passed in to associate it with the correct interface. 7001 * src_ipif is passed to associate the incoming interface of the packet. 7002 * We may need to restart this operation if the ipif cannot be looked up 7003 * due to an exclusive operation that is currently in progress. The restart 7004 * entry point is specified by 'func' 7005 */ 7006 /* ARGSUSED4 */ 7007 int 7008 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 7009 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 7010 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func) 7011 { 7012 ire_t *ire = NULL; 7013 ipif_t *ipif; 7014 boolean_t ipif_refheld = B_FALSE; 7015 uint_t type; 7016 uint_t match_flags = MATCH_IRE_TYPE; 7017 int err = 0; 7018 7019 ip1dbg(("ip_rt_delete:")); 7020 /* 7021 * If this is the case of RTF_HOST being set, then we set the netmask 7022 * to all ones. Otherwise, we use the netmask if one was supplied. 7023 */ 7024 if (flags & RTF_HOST) { 7025 mask = IP_HOST_MASK; 7026 match_flags |= MATCH_IRE_MASK; 7027 } else if (rtm_addrs & RTA_NETMASK) { 7028 match_flags |= MATCH_IRE_MASK; 7029 } 7030 7031 /* 7032 * Note that RTF_GATEWAY is never set on a delete, therefore 7033 * we check if the gateway address is one of our interfaces first, 7034 * and fall back on RTF_GATEWAY routes. 7035 * 7036 * This makes it possible to delete an original 7037 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 7038 * 7039 * As the interface index specified with the RTA_IFP sockaddr is the 7040 * same for all ipif's off of an ill, the matching logic below uses 7041 * MATCH_IRE_ILL if such an index was specified. This means a route 7042 * sharing the same prefix and interface index as the route 7043 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 7044 * is specified in the request. 7045 * 7046 * On the other hand, since the gateway address will usually be 7047 * different for each ipif on the system, the matching logic 7048 * uses MATCH_IRE_IPIF in the case of a traditional interface 7049 * route. This means that interface routes for the same prefix can be 7050 * uniquely identified if they belong to distinct ipif's and if a 7051 * RTA_IFP sockaddr is not present. 7052 * 7053 * For more detail on specifying routes by gateway address and by 7054 * interface index, see the comments in ip_rt_add(). 7055 * gw_addr could be zero in some cases when both RTA_SRCIFP and 7056 * RTA_IFP are specified. If RTA_SRCIFP is specified and both 7057 * RTA_IFP and gateway_addr are NULL/zero, then delete will not 7058 * succeed. 7059 */ 7060 if (src_ipif != NULL) { 7061 if (ipif_arg == NULL && gw_addr != 0) { 7062 ipif_arg = ipif_lookup_interface(gw_addr, dst_addr, 7063 q, mp, func, &err); 7064 if (ipif_arg != NULL) 7065 ipif_refheld = B_TRUE; 7066 } 7067 if (ipif_arg == NULL) { 7068 err = (err == EINPROGRESS) ?
err : ESRCH; 7069 return (err); 7070 } 7071 ipif = ipif_arg; 7072 } else { 7073 ipif = ipif_lookup_interface(gw_addr, dst_addr, 7074 q, mp, func, &err); 7075 if (ipif != NULL) 7076 ipif_refheld = B_TRUE; 7077 else if (err == EINPROGRESS) 7078 return (err); 7079 else 7080 err = 0; 7081 } 7082 if (ipif != NULL) { 7083 if (ipif_arg != NULL) { 7084 if (ipif_refheld) { 7085 ipif_refrele(ipif); 7086 ipif_refheld = B_FALSE; 7087 } 7088 ipif = ipif_arg; 7089 match_flags |= MATCH_IRE_ILL; 7090 } else { 7091 match_flags |= MATCH_IRE_IPIF; 7092 } 7093 if (src_ipif != NULL) { 7094 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 7095 ipif, src_ipif->ipif_ill, match_flags); 7096 } else { 7097 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 7098 ire = ire_ctable_lookup(dst_addr, 0, 7099 IRE_LOOPBACK, ipif, ALL_ZONES, NULL, 7100 match_flags); 7101 } 7102 if (ire == NULL) { 7103 ire = ire_ftable_lookup(dst_addr, mask, 0, 7104 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 7105 NULL, match_flags); 7106 } 7107 } 7108 } 7109 7110 if (ire == NULL) { 7111 /* 7112 * At this point, the gateway address is not one of our own 7113 * addresses or a matching interface route was not found. We 7114 * set the IRE type to lookup based on whether 7115 * this is a host route, a default route or just a prefix. 7116 * 7117 * If an ipif_arg was passed in, then the lookup is based on an 7118 * interface index so MATCH_IRE_ILL is added to match_flags. 7119 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 7120 * set as the route being looked up is not a traditional 7121 * interface route. 7122 * Since we do not add gateway route with srcipif, we don't 7123 * expect to find it either. 7124 */ 7125 if (src_ipif != NULL) { 7126 if (ipif_refheld) 7127 ipif_refrele(ipif); 7128 return (ESRCH); 7129 } else { 7130 match_flags &= ~MATCH_IRE_IPIF; 7131 match_flags |= MATCH_IRE_GW; 7132 if (ipif_arg != NULL) 7133 match_flags |= MATCH_IRE_ILL; 7134 if (mask == IP_HOST_MASK) 7135 type = IRE_HOST; 7136 else if (mask == 0) 7137 type = IRE_DEFAULT; 7138 else 7139 type = IRE_PREFIX; 7140 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, 7141 ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags); 7142 } 7143 } 7144 7145 if (ipif_refheld) 7146 ipif_refrele(ipif); 7147 7148 /* ipif is not refheld anymore */ 7149 if (ire == NULL) 7150 return (ESRCH); 7151 7152 if (ire->ire_flags & RTF_MULTIRT) { 7153 /* 7154 * Invoke the CGTP (multirouting) filtering module 7155 * to remove the dst address from the filtering database. 7156 * Packets coming from that address will no longer be 7157 * filtered to remove duplicates. 7158 */ 7159 if (ip_cgtp_filter_ops != NULL) { 7160 err = ip_cgtp_filter_ops->cfo_del_dest_v4(ire->ire_addr, 7161 ire->ire_gateway_addr); 7162 } 7163 ip_cgtp_bcast_delete(ire); 7164 } 7165 7166 ipif = ire->ire_ipif; 7167 /* 7168 * Removing from ipif_saved_ire_mp is not necessary 7169 * when src_ipif is non-NULL: ip_rt_add does not 7170 * save ires when src_ipif is non-NULL. 7171 */ 7172 if (ipif != NULL && src_ipif == NULL) { 7173 ipif_remove_ire(ipif, ire); 7174 } 7175 if (ioctl_msg) 7176 ip_rts_rtmsg(RTM_OLDDEL, ire, 0); 7177 ire_delete(ire); 7178 ire_refrele(ire); 7179 return (err); 7180 } 7181 7182 /* 7183 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
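 *
 * A hedged userland sketch (illustrative only; the names below are
 * local to the example and not part of this file) of the request that
 * ends up here:
 *
 *	struct rtentry rt;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	(void) memset(&rt, 0, sizeof (rt));
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_addr.s_addr =
 *	    inet_addr("192.0.2.9");
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_addr.s_addr =
 *	    inet_addr("198.51.100.1");
 *	rt.rt_flags = RTF_UP | RTF_HOST | RTF_GATEWAY;
 *	if (ioctl(s, SIOCADDRT, &rt) < 0)
 *		perror("SIOCADDRT");
 *
 * With RTF_HOST set, the netmask computed below is forced to
 * IP_HOST_MASK.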
7184 */ 7185 /* ARGSUSED */ 7186 int 7187 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7188 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7189 { 7190 ipaddr_t dst_addr; 7191 ipaddr_t gw_addr; 7192 ipaddr_t mask; 7193 int error = 0; 7194 mblk_t *mp1; 7195 struct rtentry *rt; 7196 ipif_t *ipif = NULL; 7197 7198 ip1dbg(("ip_siocaddrt:")); 7199 /* Existence of mp1 verified in ip_wput_nondata */ 7200 mp1 = mp->b_cont->b_cont; 7201 rt = (struct rtentry *)mp1->b_rptr; 7202 7203 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7204 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7205 7206 /* 7207 * If the RTF_HOST flag is on, this is a request to assign a gateway 7208 * to a particular host address. In this case, we set the netmask to 7209 * all ones for the particular destination address. Otherwise, 7210 * determine the netmask to be used based on dst_addr and the interfaces 7211 * in use. 7212 */ 7213 if (rt->rt_flags & RTF_HOST) { 7214 mask = IP_HOST_MASK; 7215 } else { 7216 /* 7217 * Note that ip_subnet_mask returns a zero mask in the case of 7218 * default (an all-zeroes address). 7219 */ 7220 mask = ip_subnet_mask(dst_addr, &ipif); 7221 } 7222 7223 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7224 NULL, B_TRUE, q, mp, ip_process_ioctl, NULL); 7225 if (ipif != NULL) 7226 ipif_refrele(ipif); 7227 return (error); 7228 } 7229 7230 /* 7231 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 7232 */ 7233 /* ARGSUSED */ 7234 int 7235 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7236 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7237 { 7238 ipaddr_t dst_addr; 7239 ipaddr_t gw_addr; 7240 ipaddr_t mask; 7241 int error; 7242 mblk_t *mp1; 7243 struct rtentry *rt; 7244 ipif_t *ipif = NULL; 7245 7246 ip1dbg(("ip_siocdelrt:")); 7247 /* Existence of mp1 verified in ip_wput_nondata */ 7248 mp1 = mp->b_cont->b_cont; 7249 rt = (struct rtentry *)mp1->b_rptr; 7250 7251 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7252 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7253 7254 /* 7255 * If the RTF_HOST flag is on, this is a request to delete a gateway 7256 * to a particular host address. In this case, we set the netmask to 7257 * all ones for the particular destination address. Otherwise, 7258 * determine the netmask to be used based on dst_addr and the interfaces 7259 * in use. 7260 */ 7261 if (rt->rt_flags & RTF_HOST) { 7262 mask = IP_HOST_MASK; 7263 } else { 7264 /* 7265 * Note that ip_subnet_mask returns a zero mask in the case of 7266 * default (an all-zeroes address). 7267 */ 7268 mask = ip_subnet_mask(dst_addr, &ipif); 7269 } 7270 7271 error = ip_rt_delete(dst_addr, mask, gw_addr, 7272 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL, 7273 B_TRUE, q, mp, ip_process_ioctl); 7274 if (ipif != NULL) 7275 ipif_refrele(ipif); 7276 return (error); 7277 } 7278 7279 /* 7280 * Enqueue the mp onto the ipsq, chained by b_next. 7281 * b_prev stores the function to be executed later, and b_queue the queue 7282 * where this mp originated. 
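 *
 * Schematically (illustrative):
 *
 *	ipsq_mphead -> mp1 -> mp2 -> ... -> NULL    (chained via b_next)
 *	                |
 *	                +- b_prev:  func to run when the mp is restarted
 *	                +- b_queue: queue the mp originated on
 *
 * CUR_OP appends to ipsq_mphead/ipsq_mptail, NEW_OP to
 * ipsq_xopq_mphead/ipsq_xopq_mptail.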
7283 */ 7284 void 7285 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7286 ill_t *pending_ill) 7287 { 7288 conn_t *connp = NULL; 7289 7290 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7291 ASSERT(func != NULL); 7292 7293 mp->b_queue = q; 7294 mp->b_prev = (void *)func; 7295 mp->b_next = NULL; 7296 7297 switch (type) { 7298 case CUR_OP: 7299 if (ipsq->ipsq_mptail != NULL) { 7300 ASSERT(ipsq->ipsq_mphead != NULL); 7301 ipsq->ipsq_mptail->b_next = mp; 7302 } else { 7303 ASSERT(ipsq->ipsq_mphead == NULL); 7304 ipsq->ipsq_mphead = mp; 7305 } 7306 ipsq->ipsq_mptail = mp; 7307 break; 7308 7309 case NEW_OP: 7310 if (ipsq->ipsq_xopq_mptail != NULL) { 7311 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7312 ipsq->ipsq_xopq_mptail->b_next = mp; 7313 } else { 7314 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7315 ipsq->ipsq_xopq_mphead = mp; 7316 } 7317 ipsq->ipsq_xopq_mptail = mp; 7318 break; 7319 default: 7320 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7321 } 7322 7323 if (CONN_Q(q) && pending_ill != NULL) { 7324 connp = Q_TO_CONN(q); 7325 7326 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7327 connp->conn_oper_pending_ill = pending_ill; 7328 } 7329 } 7330 7331 /* 7332 * Return the mp at the head of the ipsq. After emptying the ipsq, 7333 * look at the next ioctl, if the current ioctl is complete. Otherwise 7334 * return; we will resume when we complete the current ioctl. 7335 * The current ioctl will wait till it gets a response from the 7336 * driver below. 7337 */ 7338 static mblk_t * 7339 ipsq_dq(ipsq_t *ipsq) 7340 { 7341 mblk_t *mp; 7342 7343 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7344 7345 mp = ipsq->ipsq_mphead; 7346 if (mp != NULL) { 7347 ipsq->ipsq_mphead = mp->b_next; 7348 if (ipsq->ipsq_mphead == NULL) 7349 ipsq->ipsq_mptail = NULL; 7350 mp->b_next = NULL; 7351 return (mp); 7352 } 7353 if (ipsq->ipsq_current_ipif != NULL) 7354 return (NULL); 7355 mp = ipsq->ipsq_xopq_mphead; 7356 if (mp != NULL) { 7357 ipsq->ipsq_xopq_mphead = mp->b_next; 7358 if (ipsq->ipsq_xopq_mphead == NULL) 7359 ipsq->ipsq_xopq_mptail = NULL; 7360 mp->b_next = NULL; 7361 return (mp); 7362 } 7363 return (NULL); 7364 } 7365 7366 /* 7367 * Enter the ipsq corresponding to ill, by waiting synchronously till 7368 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7369 * will have to drain completely before ipsq_enter returns success. 7370 * ipsq_current_ipif will be set if some exclusive ioctl is in progress, 7371 * and the ipsq_exit logic will start the next enqueued ioctl after 7372 * completion of the current ioctl. If 'force' is used, we don't wait 7373 * for the enqueued ioctls. This is needed when a conn_close wants to 7374 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7375 * of an ill can also use this option. But we don't use it currently. 7376 */ 7377 #define ENTER_SQ_WAIT_TICKS 100 7378 boolean_t 7379 ipsq_enter(ill_t *ill, boolean_t force) 7380 { 7381 ipsq_t *ipsq; 7382 boolean_t waited_enough = B_FALSE; 7383 7384 /* 7385 * Holding the ill_lock prevents <ill-ipsq> assocs from changing. 7386 * Since the <ill-ipsq> assocs could change while we wait for the 7387 * writer, it is easier to wait on a fixed global rather than try to 7388 * cv_wait on a changing ipsq.
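 * (Usage note, illustrative only: a typical synchronous caller does
 *	if (!ipsq_enter(ill, B_FALSE))
 *		return;		... ill was condemned ...
 *	... perform the exclusive operation ...
 *	ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE);)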
7389 */ 7390 mutex_enter(&ill->ill_lock); 7391 for (;;) { 7392 if (ill->ill_state_flags & ILL_CONDEMNED) { 7393 mutex_exit(&ill->ill_lock); 7394 return (B_FALSE); 7395 } 7396 7397 ipsq = ill->ill_phyint->phyint_ipsq; 7398 mutex_enter(&ipsq->ipsq_lock); 7399 if (ipsq->ipsq_writer == NULL && 7400 (ipsq->ipsq_current_ipif == NULL || waited_enough)) { 7401 break; 7402 } else if (ipsq->ipsq_writer != NULL) { 7403 mutex_exit(&ipsq->ipsq_lock); 7404 cv_wait(&ill->ill_cv, &ill->ill_lock); 7405 } else { 7406 mutex_exit(&ipsq->ipsq_lock); 7407 if (force) { 7408 (void) cv_timedwait(&ill->ill_cv, 7409 &ill->ill_lock, 7410 lbolt + ENTER_SQ_WAIT_TICKS); 7411 waited_enough = B_TRUE; 7412 continue; 7413 } else { 7414 cv_wait(&ill->ill_cv, &ill->ill_lock); 7415 } 7416 } 7417 } 7418 7419 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); 7420 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7421 ipsq->ipsq_writer = curthread; 7422 ipsq->ipsq_reentry_cnt++; 7423 #ifdef ILL_DEBUG 7424 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7425 #endif 7426 mutex_exit(&ipsq->ipsq_lock); 7427 mutex_exit(&ill->ill_lock); 7428 return (B_TRUE); 7429 } 7430 7431 /* 7432 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7433 * certain critical operations like plumbing (i.e. most set ioctls), 7434 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP 7435 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per 7436 * IPMP group. The ipsq serializes exclusive ioctls issued by applications 7437 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple 7438 * threads executing in the ipsq. Responses from the driver pertain to the 7439 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated 7440 * as part of bringing up the interface) and are enqueued in ipsq_mphead. 7441 * 7442 * If a thread does not want to reenter the ipsq when it is already writer, 7443 * it must arrange for the specified reentry point to be called later, 7444 * when the ipsq is empty, and no code path starting from the specified 7445 * reentry point may ever try to enter the ipsq again. Otherwise it can lead 7446 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 7447 * When the thread that is currently exclusive finishes, it (ipsq_exit) 7448 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls 7449 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit 7450 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 7451 * ioctl if the current ioctl has completed. If the current ioctl is still 7452 * in progress it simply returns. The current ioctl could be waiting for 7453 * a response from another module (arp or the driver) or could be waiting for 7454 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp 7455 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the 7456 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 7457 * ipsq_current_ipif is clear, which happens only on ioctl completion. 7458 */ 7459 7460 /* 7461 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of 7462 * ipif or ill can be specified). The caller ensures ipif or ill is valid by 7463 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7464 * for completion.
 */
ipsq_t *
ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
    ipsq_func_t func, int type, boolean_t reentry_ok)
{
	ipsq_t	*ipsq;

	/* Only 1 of ipif or ill can be specified */
	ASSERT((ipif != NULL) ^ (ill != NULL));
	if (ipif != NULL)
		ill = ipif->ipif_ill;

	/*
	 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock.
	 * The ipsq of an ill can't change when ill_lock is held.
	 */
	GRAB_CONN_LOCK(q);
	mutex_enter(&ill->ill_lock);
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);

	/*
	 * 1. Enter the ipsq if we are already writer and reentry is ok.
	 *    (Note: If the caller does not specify reentry_ok then neither
	 *    'func' nor any of its callees must ever attempt to enter the
	 *    ipsq again. Otherwise it can lead to an infinite loop.)
	 * 2. Enter the ipsq if there is no current writer and this attempted
	 *    entry is part of the current ioctl or operation.
	 * 3. Enter the ipsq if there is no current writer and this is a new
	 *    ioctl (or operation) and the ioctl (or operation) queue is
	 *    empty and there is no ioctl (or operation) currently in
	 *    progress.
	 */
	if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) ||
	    (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL &&
	    ipsq->ipsq_current_ipif == NULL))) ||
	    (ipsq->ipsq_writer == curthread && reentry_ok)) {
		/* Success. */
		ipsq->ipsq_reentry_cnt++;
		ipsq->ipsq_writer = curthread;
		mutex_exit(&ipsq->ipsq_lock);
		mutex_exit(&ill->ill_lock);
		RELEASE_CONN_LOCK(q);
#ifdef ILL_DEBUG
		ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack,
		    IP_STACK_DEPTH);
#endif
		return (ipsq);
	}

	ipsq_enq(ipsq, q, mp, func, type, ill);

	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ill->ill_lock);
	RELEASE_CONN_LOCK(q);
	return (NULL);
}

/*
 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
 * ipif or ill can be specified). The caller ensures ipif or ill is valid by
 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is
 * queued for later completion.
 *
 * This function does a refrele on the ipif/ill.
 */
void
qwriter_ip(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
    ipsq_func_t func, int type, boolean_t reentry_ok)
{
	ipsq_t	*ipsq;

	ipsq = ipsq_try_enter(ipif, ill, q, mp, func, type, reentry_ok);
	/*
	 * The caller must have done a refhold on the ipif; the ipif_refrele
	 * happens here, on the passed ipif. We can do this since we are
	 * either already exclusive, or we won't access the ipif henceforth:
	 * both this function and the caller just return if ipsq_try_enter
	 * fails above. This is needed because func needs to see the correct
	 * refcount; e.g. removeif can work only then.
	 */
	if (ipif != NULL)
		ipif_refrele(ipif);
	else
		ill_refrele(ill);
	if (ipsq != NULL) {
		(*func)(ipsq, q, mp, NULL);
		ipsq_exit(ipsq, B_TRUE, B_TRUE);
	}
}

/*
 * If there are more than ILL_GRP_CNT ills in a group,
 * we use kmem alloc'd buffers, else use the stack.
 */
#define	ILL_GRP_CNT	14
/*
 * Drain the ipsq, if there are messages on it, and then leave the ipsq.
 * Called by a thread that is currently exclusive on this ipsq.
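 *
 * A sketch of how re-entry balances out (illustrative): each successful
 * ipsq_try_enter/ipsq_enter bumps ipsq_reentry_cnt and each ipsq_exit
 * drops it; only the final exit (count 1 -> 0) drains the queues and
 * relinquishes writership:
 *
 *	ipsq = ipsq_try_enter(NULL, ill, q, mp, func, CUR_OP, B_TRUE);
 *	... already writer, so this re-enters; ipsq_reentry_cnt == 2 ...
 *	ipsq_exit(ipsq, B_TRUE, B_TRUE);
 *	... still writer; ipsq_reentry_cnt is back to 1 ...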
 */
void
ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer)
{
	queue_t	*q;
	mblk_t	*mp;
	ipsq_func_t	func;
	int	next;
	ill_t	**ill_list = NULL;
	size_t	ill_list_size = 0;
	int	cnt = 0;
	boolean_t need_ipsq_free = B_FALSE;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	mutex_enter(&ipsq->ipsq_lock);
	ASSERT(ipsq->ipsq_reentry_cnt >= 1);
	if (ipsq->ipsq_reentry_cnt != 1) {
		ipsq->ipsq_reentry_cnt--;
		mutex_exit(&ipsq->ipsq_lock);
		return;
	}

	mp = ipsq_dq(ipsq);
	while (mp != NULL) {
again:
		mutex_exit(&ipsq->ipsq_lock);
		func = (ipsq_func_t)mp->b_prev;
		q = (queue_t *)mp->b_queue;
		mp->b_prev = NULL;
		mp->b_queue = NULL;

		/*
		 * If 'q' is a conn queue, it is valid, since we did a
		 * refhold on the connp at the start of the ioctl.
		 * If 'q' is an ill queue, it is valid, since close of an
		 * ill will clean up its 'ipsq'.
		 */
		(*func)(ipsq, q, mp, NULL);

		mutex_enter(&ipsq->ipsq_lock);
		mp = ipsq_dq(ipsq);
	}

	mutex_exit(&ipsq->ipsq_lock);

	/*
	 * Need to grab the locks in the right order. Need to
	 * atomically check (under ipsq_lock) that there are no
	 * messages before relinquishing the ipsq. Also need to
	 * atomically wakeup waiters on ill_cv while holding ill_lock.
	 * Holding ill_g_lock ensures that the ipsq's list of ills is
	 * stable. If we need to call ill_split_ipsq and change <ill-ipsq>
	 * we need to grab ill_g_lock as writer.
	 */
	rw_enter(&ill_g_lock, ipsq->ipsq_split ? RW_WRITER : RW_READER);

	/* ipsq_refs can't change while ill_g_lock is held as reader */
	if (ipsq->ipsq_refs != 0) {
		/* At most 2 ills v4/v6 per phyint */
		cnt = ipsq->ipsq_refs << 1;
		ill_list_size = cnt * sizeof (ill_t *);
		/*
		 * If memory allocation fails, we will do the split
		 * the next time ipsq_exit is called for whatever reason.
		 * As long as the ipsq_split flag is set the need to
		 * split is remembered.
		 */
		ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
		if (ill_list != NULL)
			cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt);
	}
	mutex_enter(&ipsq->ipsq_lock);
	mp = ipsq_dq(ipsq);
	if (mp != NULL) {
		/* oops, some message has landed up, we can't get out */
		if (ill_list != NULL)
			ill_unlock_ills(ill_list, cnt);
		rw_exit(&ill_g_lock);
		if (ill_list != NULL)
			kmem_free(ill_list, ill_list_size);
		ill_list = NULL;
		ill_list_size = 0;
		cnt = 0;
		goto again;
	}

	/*
	 * Split only if no ioctl is pending and if the memory allocation
	 * succeeded above.
	 */
	if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL &&
	    ill_list != NULL) {
		/*
		 * No new ill can join this ipsq since we are holding the
		 * ill_g_lock. Hence ill_split_ipsq can safely traverse the
		 * ipsq. ill_split_ipsq may fail due to memory shortage;
		 * if so we will retry on the next ipsq_exit.
		 */
		ipsq->ipsq_split = ill_split_ipsq(ipsq);
	}

	/*
	 * We are holding the ipsq lock, hence no new messages can
	 * land up on the ipsq, and there are no messages currently.
	 * Now safe to get out. Wake up waiters and relinquish the ipsq
	 * atomically while holding ill locks.
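	 *
	 * The acquisition order used here (a sketch of the code above and
	 * below, not a new rule) matches the documented hierarchy:
	 *
	 *	rw_enter(&ill_g_lock, ...);
	 *	ill_lock_ipsq_ills(ipsq, ill_list, cnt);
	 *	mutex_enter(&ipsq->ipsq_lock);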
	 */
	ipsq->ipsq_writer = NULL;
	ipsq->ipsq_reentry_cnt--;
	ASSERT(ipsq->ipsq_reentry_cnt == 0);
#ifdef ILL_DEBUG
	ipsq->ipsq_depth = 0;
#endif
	mutex_exit(&ipsq->ipsq_lock);
	/*
	 * For IPMP this should wake up all ills in this ipsq.
	 * We need to hold the ill_lock while waking up waiters to
	 * avoid missed wakeups. But there is no need to acquire all
	 * the ill locks and then wakeup. If we have not acquired all
	 * the locks (due to memory failure above) ill_signal_ipsq_ills
	 * wakes up ills one at a time after getting the right ill_lock.
	 */
	ill_signal_ipsq_ills(ipsq, ill_list != NULL);
	if (ill_list != NULL)
		ill_unlock_ills(ill_list, cnt);
	if (ipsq->ipsq_refs == 0)
		need_ipsq_free = B_TRUE;
	rw_exit(&ill_g_lock);
	if (ill_list != NULL)
		kmem_free(ill_list, ill_list_size);

	if (need_ipsq_free) {
		/*
		 * Free the ipsq. ipsq_refs can't increase because the ipsq
		 * can't be looked up: an ipsq can be looked up only through
		 * an ill or phyint, and there are no ills/phyints left on
		 * this ipsq.
		 */
		ipsq_delete(ipsq);
	}
	/*
	 * Now start any igmp or mld timers that could not be started
	 * while inside the ipsq. The timers can't be started while inside
	 * the ipsq, since igmp_start_timers may need to call untimeout()
	 * which can't be done while holding a lock i.e. the ipsq. Otherwise
	 * there could be a deadlock since the timeout handlers
	 * mld_timeout_handler / igmp_timeout_handler also synchronously
	 * wait in ipsq_enter() trying to get the ipsq.
	 *
	 * However there is one exception to the above. If this thread is
	 * itself the igmp/mld timeout handler thread, then we don't want
	 * to start any new timer until the current handler is done. The
	 * handler thread passes in B_FALSE for start_igmp/mld_timers, while
	 * all others pass B_TRUE.
	 */
	if (start_igmp_timer) {
		mutex_enter(&igmp_timer_lock);
		next = igmp_deferred_next;
		igmp_deferred_next = INFINITY;
		mutex_exit(&igmp_timer_lock);

		if (next != INFINITY)
			igmp_start_timers(next);
	}

	if (start_mld_timer) {
		mutex_enter(&mld_timer_lock);
		next = mld_deferred_next;
		mld_deferred_next = INFINITY;
		mutex_exit(&mld_timer_lock);

		if (next != INFINITY)
			mld_start_timers(next);
	}
}

/*
 * The ill is closing. Flush all messages on the ipsq that originated
 * from this ill. Usually there won't be any messages on the
 * ipsq_xopq_mphead for this ill, since ipsq_enter could not have entered
 * until then. New messages can't be queued since the CONDEMNED flag is set.
 */
static void
ipsq_flush(ill_t *ill)
{
	queue_t	*q;
	mblk_t	*prev;
	mblk_t	*mp;
	mblk_t	*mp_next;
	ipsq_t	*ipsq;

	ASSERT(IAM_WRITER_ILL(ill));
	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Flush any messages sent up by the driver.
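	 *
	 * A message enqueued by ipsq_enq() records its originating queue
	 * in b_queue, which is what identifies it as belonging to this
	 * ill; schematically, the test done in the loop below is
	 *
	 *	if (mp->b_queue == ill->ill_rq || mp->b_queue == ill->ill_wq)
	 *		... unlink mp and inet_freemsg(mp) ...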
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) {
		mp_next = mp->b_next;
		q = mp->b_queue;
		if (q == ill->ill_rq || q == ill->ill_wq) {
			/* Remove the mp from the ipsq */
			if (prev == NULL)
				ipsq->ipsq_mphead = mp->b_next;
			else
				prev->b_next = mp->b_next;
			if (ipsq->ipsq_mptail == mp) {
				ASSERT(mp_next == NULL);
				ipsq->ipsq_mptail = prev;
			}
			inet_freemsg(mp);
		} else {
			prev = mp;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ipsq_xopq_mp_cleanup(ill, NULL);
	ill_pending_mp_cleanup(ill);
}

/*
 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
 * The real cleanup happens behind the squeue via the ip_squeue_clean
 * function, but we need to protect ourselves from two threads trying to
 * clean up at the same time (possible with one port going down for aggr
 * and someone tearing down the entire aggr simultaneously). So we use
 * ill_inuse_ref protected by ill_lock to indicate when the cleanup has
 * started (1 ref) and when the cleanup is done (0 refs). When a new ring
 * gets assigned to a squeue, we start by putting 2 refs on ill_inuse_ref.
 */
static void
ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	conn_t	*connp;
	squeue_t *sqp;
	mblk_t	*mp;

	ASSERT(rx_ring != NULL);

	/* Just clean one squeue */
	mutex_enter(&ill->ill_lock);
	/*
	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
	 * ip_squeue_soft_ring_affinity() will not go
	 * ahead with assigning rings.
	 */
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
		/* Some operations pending on the ring. Wait */
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
		/*
		 * Someone is already trying to clean
		 * this squeue, or it has already been cleaned.
		 */
		mutex_exit(&ill->ill_lock);
		return;
	}
	sqp = rx_ring->rr_sqp;

	if (sqp == NULL) {
		/*
		 * The rx_ring never had a squeue assigned to it.
		 * We are under ill_lock so we can clean it up
		 * here itself since no one can get to it.
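		 *
		 * For orientation, the ring-state transitions this function
		 * walks through are, schematically (illustrative summary):
		 *
		 *	ILL_RING_INPROC      -> wait on ill_cv
		 *	ILL_RING_INUSE       -> ILL_RING_BEING_FREED
		 *	ILL_RING_BEING_FREED -> ILL_RING_FREE (cleanup done)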
		 */
		rx_ring->rr_blank = NULL;
		rx_ring->rr_handle = NULL;
		rx_ring->rr_sqp = NULL;
		rx_ring->rr_ring_state = ILL_RING_FREE;
		mutex_exit(&ill->ill_lock);
		return;
	}

	/* Set the state to indicate that it's being cleaned */
	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
	ASSERT(sqp != NULL);
	mutex_exit(&ill->ill_lock);

	/*
	 * Use the preallocated ill_unbind_conn for this purpose
	 */
	connp = ill->ill_dls_capab->ill_unbind_conn;

	ASSERT(!connp->conn_tcp->tcp_closemp.b_prev);
	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
	if (connp->conn_tcp->tcp_closemp.b_prev == NULL)
		connp->conn_tcp->tcp_closemp_used = 1;
	else
		connp->conn_tcp->tcp_closemp_used++;
	mp = &connp->conn_tcp->tcp_closemp;
	CONN_INC_REF(connp);
	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);

	mutex_enter(&ill->ill_lock);
	while (rx_ring->rr_ring_state != ILL_RING_FREE)
		cv_wait(&ill->ill_cv, &ill->ill_lock);

	mutex_exit(&ill->ill_lock);
}

static void
ipsq_clean_all(ill_t *ill)
{
	int idx;

	/*
	 * No need to clean if poll_capab isn't set for this ill
	 */
	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
		return;

	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];

		ipsq_clean_ring(ill, ipr);
	}

	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
}

/* ARGSUSED */
int
ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t	*ill;
	struct lifreq *lifr = (struct lifreq *)ifreq;
	boolean_t isv6;
	conn_t	*connp;

	connp = Q_TO_CONN(q);
	isv6 = connp->conn_af_isv6;
	/*
	 * Set the original index.
	 * Failover and failback move logical interfaces
	 * from one physical interface to another. The
	 * original index indicates the parent of a logical
	 * interface, in other words, the physical interface
	 * the logical interface will be moved back to on
	 * failback.
	 */

	/*
	 * Don't allow the original index to be changed
	 * for non-failover addresses, autoconfigured
	 * addresses, or IPv6 link local addresses.
	 */
	if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != 0) ||
	    (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) {
		return (EINVAL);
	}
	/*
	 * The new original index must be in use by some
	 * physical interface.
	 */
	ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL,
	    NULL, NULL);
	if (ill == NULL)
		return (ENXIO);
	ill_refrele(ill);

	ipif->ipif_orig_ifindex = lifr->lifr_index;
	/*
	 * When this ipif gets failed back, don't
	 * preserve the original id, as it is no
	 * longer applicable.
	 */
	ipif->ipif_orig_ipifid = 0;
	/*
	 * For IPv4, change the original index of any
	 * multicast addresses associated with the
	 * ipif to the new value.
7935 */ 7936 if (!isv6) { 7937 ilm_t *ilm; 7938 7939 mutex_enter(&ipif->ipif_ill->ill_lock); 7940 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 7941 ilm = ilm->ilm_next) { 7942 if (ilm->ilm_ipif == ipif) { 7943 ilm->ilm_orig_ifindex = lifr->lifr_index; 7944 } 7945 } 7946 mutex_exit(&ipif->ipif_ill->ill_lock); 7947 } 7948 return (0); 7949 } 7950 7951 /* ARGSUSED */ 7952 int 7953 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7954 ip_ioctl_cmd_t *ipip, void *ifreq) 7955 { 7956 struct lifreq *lifr = (struct lifreq *)ifreq; 7957 7958 /* 7959 * Get the original interface index i.e the one 7960 * before FAILOVER if it ever happened. 7961 */ 7962 lifr->lifr_index = ipif->ipif_orig_ifindex; 7963 return (0); 7964 } 7965 7966 /* 7967 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, 7968 * refhold and return the associated ipif 7969 */ 7970 int 7971 ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func) 7972 { 7973 boolean_t exists; 7974 struct iftun_req *ta; 7975 ipif_t *ipif; 7976 ill_t *ill; 7977 boolean_t isv6; 7978 mblk_t *mp1; 7979 int error; 7980 conn_t *connp; 7981 7982 /* Existence verified in ip_wput_nondata */ 7983 mp1 = mp->b_cont->b_cont; 7984 ta = (struct iftun_req *)mp1->b_rptr; 7985 /* 7986 * Null terminate the string to protect against buffer 7987 * overrun. String was generated by user code and may not 7988 * be trusted. 7989 */ 7990 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 7991 7992 connp = Q_TO_CONN(q); 7993 isv6 = connp->conn_af_isv6; 7994 7995 /* Disallows implicit create */ 7996 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 7997 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 7998 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error); 7999 if (ipif == NULL) 8000 return (error); 8001 8002 if (ipif->ipif_id != 0) { 8003 /* 8004 * We really don't want to set/get tunnel parameters 8005 * on virtual tunnel interfaces. Only allow the 8006 * base tunnel to do these. 8007 */ 8008 ipif_refrele(ipif); 8009 return (EINVAL); 8010 } 8011 8012 /* 8013 * Send down to tunnel mod for ioctl processing. 8014 * Will finish ioctl in ip_rput_other(). 8015 */ 8016 ill = ipif->ipif_ill; 8017 if (ill->ill_net_type == IRE_LOOPBACK) { 8018 ipif_refrele(ipif); 8019 return (EOPNOTSUPP); 8020 } 8021 8022 if (ill->ill_wq == NULL) { 8023 ipif_refrele(ipif); 8024 return (ENXIO); 8025 } 8026 /* 8027 * Mark the ioctl as coming from an IPv6 interface for 8028 * tun's convenience. 8029 */ 8030 if (ill->ill_isv6) 8031 ta->ifta_flags |= 0x80000000; 8032 *ipifp = ipif; 8033 return (0); 8034 } 8035 8036 /* 8037 * Parse an ifreq or lifreq struct coming down ioctls and refhold 8038 * and return the associated ipif. 8039 * Return value: 8040 * Non zero: An error has occurred. ci may not be filled out. 8041 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 8042 * a held ipif in ci.ci_ipif. 
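 *
 * A sketch of the calling convention (illustrative; the refrele on
 * success is the caller's responsibility):
 *
 *	err = ip_extract_lifreq_cmn(q, mp, LIF_CMD, ipip->ipi_flags,
 *	    &ci, ip_process_ioctl);
 *	if (err != 0)
 *		return (err);
 *	... use ci.ci_ipif, ci.ci_sin, ci.ci_lifr ...
 *	ipif_refrele(ci.ci_ipif);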
 */
int
ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags,
    cmd_info_t *ci, ipsq_func_t func)
{
	sin_t	*sin;
	sin6_t	*sin6;
	char	*name;
	struct ifreq    *ifr;
	struct lifreq    *lifr;
	ipif_t	*ipif = NULL;
	ill_t	*ill;
	conn_t	*connp;
	boolean_t isv6;
	struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
	boolean_t exists;
	int	err;
	mblk_t	*mp1;
	zoneid_t zoneid;

	if (q->q_next != NULL) {
		ill = (ill_t *)q->q_ptr;
		isv6 = ill->ill_isv6;
		connp = NULL;
		zoneid = ALL_ZONES;
	} else {
		ill = NULL;
		connp = Q_TO_CONN(q);
		isv6 = connp->conn_af_isv6;
		zoneid = connp->conn_zoneid;
		if (zoneid == GLOBAL_ZONEID) {
			/* global zone can access ipifs in all zones */
			zoneid = ALL_ZONES;
		}
	}

	/* Has been checked in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	if (cmd_type == IF_CMD) {
		/* This is an old style SIOC[GS]IF* command */
		ifr = (struct ifreq *)mp1->b_rptr;
		/*
		 * Null terminate the string to protect against buffer
		 * overrun. String was generated by user code and may not
		 * be trusted.
		 */
		ifr->ifr_name[IFNAMSIZ - 1] = '\0';
		sin = (sin_t *)&ifr->ifr_addr;
		name = ifr->ifr_name;
		ci->ci_sin = sin;
		ci->ci_sin6 = NULL;
		ci->ci_lifr = (struct lifreq *)ifr;
	} else {
		/* This is a new style SIOC[GS]LIF* command */
		ASSERT(cmd_type == LIF_CMD);
		lifr = (struct lifreq *)mp1->b_rptr;
		/*
		 * Null terminate the string to protect against buffer
		 * overrun. String was generated by user code and may not
		 * be trusted.
		 */
		lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
		name = lifr->lifr_name;
		sin = (sin_t *)&lifr->lifr_addr;
		sin6 = (sin6_t *)&lifr->lifr_addr;
		if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) {
			(void) strncpy(ci->ci_groupname, lifr->lifr_groupname,
			    LIFNAMSIZ);
		}
		ci->ci_sin = sin;
		ci->ci_sin6 = sin6;
		ci->ci_lifr = lifr;
	}

	if (iocp->ioc_cmd == SIOCSLIFNAME) {
		/*
		 * The ioctl will fail if it comes down a conn stream.
		 */
		if (ill == NULL) {
			/* Not an ill queue; fail the ioctl with ENXIO. */
			return (ENXIO);
		}
		ipif = ill->ill_ipif;
		ipif_refhold(ipif);
	} else {
		ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
		    &exists, isv6, zoneid,
		    (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err);
		if (ipif == NULL) {
			if (err == EINPROGRESS)
				return (err);
			if (iocp->ioc_cmd == SIOCLIFFAILOVER ||
			    iocp->ioc_cmd == SIOCLIFFAILBACK) {
				/*
				 * Need to try both v4 and v6 since this
				 * ioctl can come down either v4 or v6
				 * socket. The lifreq.lifr_family passed
				 * down by this ioctl is AF_UNSPEC.
				 */
				ipif = ipif_lookup_on_name(name,
				    mi_strlen(name), B_FALSE, &exists, !isv6,
				    zoneid, (connp == NULL) ?
				    q : CONNP_TO_WQ(connp), mp, func, &err);
				if (err == EINPROGRESS)
					return (err);
			}
			err = 0;	/* Ensure we don't use it below */
		}
	}

	/*
	 * Old style [GS]IFCMD does not admit an IPv6 ipif
	 */
	if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) {
		ipif_refrele(ipif);
		return (ENXIO);
	}

	if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
	    name[0] == '\0') {
		/*
		 * Handle a SIOC?IF* ioctl with a null name
		 * during plumb (on the ill queue before the I_PLINK).
		 */
		ipif = ill->ill_ipif;
		ipif_refhold(ipif);
	}

	if (ipif == NULL)
		return (ENXIO);

	/*
	 * Allow only GET operations if this ipif has been created
	 * temporarily due to a MOVE operation.
	 */
	if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) {
		ipif_refrele(ipif);
		return (EINVAL);
	}

	ci->ci_ipif = ipif;
	return (0);
}

/*
 * Return the total number of ipifs.
 */
static uint_t
ip_get_numifs(zoneid_t zoneid)
{
	uint_t numifs = 0;
	ill_t	*ill;
	ill_walk_context_t	ctx;
	ipif_t	*ipif;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);

	while (ill != NULL) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_zoneid == zoneid ||
			    ipif->ipif_zoneid == ALL_ZONES)
				numifs++;
		}
		ill = ill_next(&ctx, ill);
	}
	rw_exit(&ill_g_lock);
	return (numifs);
}

/*
 * Return the total number of logical interfaces that match the given
 * family and flag criteria.
 */
static uint_t
ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid)
{
	uint_t numifs = 0;
	ill_t	*ill;
	ipif_t	*ipif;
	ill_walk_context_t	ctx;

	ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags,
	    (int)zoneid));

	rw_enter(&ill_g_lock, RW_READER);
	if (family == AF_INET)
		ill = ILL_START_WALK_V4(&ctx);
	else if (family == AF_INET6)
		ill = ILL_START_WALK_V6(&ctx);
	else
		ill = ILL_START_WALK_ALL(&ctx);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
			    !(lifn_flags & LIFC_NOXMIT))
				continue;
			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
			    !(lifn_flags & LIFC_TEMPORARY))
				continue;
			if (((ipif->ipif_flags &
			    (IPIF_NOXMIT|IPIF_NOLOCAL|
			    IPIF_DEPRECATED)) ||
			    (ill->ill_phyint->phyint_flags &
			    PHYI_LOOPBACK) ||
			    !(ipif->ipif_flags & IPIF_UP)) &&
			    (lifn_flags & LIFC_EXTERNAL_SOURCE))
				continue;

			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES &&
			    (zoneid != GLOBAL_ZONEID ||
			    !(lifn_flags & LIFC_ALLZONES)))
				continue;

			numifs++;
		}
	}
	rw_exit(&ill_g_lock);
	return (numifs);
}

uint_t
ip_get_lifsrcofnum(ill_t *ill)
{
	uint_t numifs = 0;
	ill_t	*ill_head = ill;

	/*
	 * ill_g_usesrc_lock protects ill_usesrc_grp_next; for example, some
	 * other thread may be trying to relink the ILLs in this usesrc group
	 * and adjusting the ill_usesrc_grp_next pointers.
	 */
	rw_enter(&ill_g_usesrc_lock, RW_READER);
	if ((ill->ill_usesrc_ifindex == 0) &&
	    (ill->ill_usesrc_grp_next != NULL)) {
		for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
		    ill = ill->ill_usesrc_grp_next)
			numifs++;
	}
rw_exit(&ill_g_usesrc_lock); 8292 8293 return (numifs); 8294 } 8295 8296 /* Null values are passed in for ipif, sin, and ifreq */ 8297 /* ARGSUSED */ 8298 int 8299 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8300 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8301 { 8302 int *nump; 8303 8304 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8305 8306 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8307 nump = (int *)mp->b_cont->b_cont->b_rptr; 8308 8309 *nump = ip_get_numifs(Q_TO_CONN(q)->conn_zoneid); 8310 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8311 return (0); 8312 } 8313 8314 /* Null values are passed in for ipif, sin, and ifreq */ 8315 /* ARGSUSED */ 8316 int 8317 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8318 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8319 { 8320 struct lifnum *lifn; 8321 mblk_t *mp1; 8322 8323 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8324 8325 /* Existence checked in ip_wput_nondata */ 8326 mp1 = mp->b_cont->b_cont; 8327 8328 lifn = (struct lifnum *)mp1->b_rptr; 8329 switch (lifn->lifn_family) { 8330 case AF_UNSPEC: 8331 case AF_INET: 8332 case AF_INET6: 8333 break; 8334 default: 8335 return (EAFNOSUPPORT); 8336 } 8337 8338 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8339 Q_TO_CONN(q)->conn_zoneid); 8340 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8341 return (0); 8342 } 8343 8344 /* ARGSUSED */ 8345 int 8346 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8347 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8348 { 8349 STRUCT_HANDLE(ifconf, ifc); 8350 mblk_t *mp1; 8351 struct iocblk *iocp; 8352 struct ifreq *ifr; 8353 ill_walk_context_t ctx; 8354 ill_t *ill; 8355 ipif_t *ipif; 8356 struct sockaddr_in *sin; 8357 int32_t ifclen; 8358 zoneid_t zoneid; 8359 8360 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8361 8362 ip1dbg(("ip_sioctl_get_ifconf")); 8363 /* Existence verified in ip_wput_nondata */ 8364 mp1 = mp->b_cont->b_cont; 8365 iocp = (struct iocblk *)mp->b_rptr; 8366 zoneid = Q_TO_CONN(q)->conn_zoneid; 8367 8368 /* 8369 * The original SIOCGIFCONF passed in a struct ifconf which specified 8370 * the user buffer address and length into which the list of struct 8371 * ifreqs was to be copied. Since AT&T Streams does not seem to 8372 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8373 * the SIOCGIFCONF operation was redefined to simply provide 8374 * a large output buffer into which we are supposed to jam the ifreq 8375 * array. The same ioctl command code was used, despite the fact that 8376 * both the applications and the kernel code had to change, thus making 8377 * it impossible to support both interfaces. 8378 * 8379 * For reasons not good enough to try to explain, the following 8380 * algorithm is used for deciding what to do with one of these: 8381 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8382 * form with the output buffer coming down as the continuation message. 8383 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8384 * and we have to copy in the ifconf structure to find out how big the 8385 * output buffer is and where to copy out to. Sure no problem... 8386 * 8387 */ 8388 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8389 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8390 int numifs = 0; 8391 size_t ifc_bufsize; 8392 8393 /* 8394 * Must be (better be!) 
		 * a continuation of a TRANSPARENT
		 * IOCTL. We just copied in the ifconf structure.
		 */
		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
		    (struct ifconf *)mp1->b_rptr);

		/*
		 * Allocate a buffer to hold requested information.
		 *
		 * If ifc_len is larger than what is needed, we only
		 * allocate what we will use.
		 *
		 * If ifc_len is smaller than what is needed, return
		 * EINVAL.
		 *
		 * XXX: the ill_t structure can have 2 counters, for
		 * v4 and v6 (not just ill_ipif_up_count) to store the
		 * number of interfaces for a device, so we don't need
		 * to count them here...
		 */
		numifs = ip_get_numifs(zoneid);

		ifclen = STRUCT_FGET(ifc, ifc_len);
		ifc_bufsize = numifs * sizeof (struct ifreq);
		if (ifc_bufsize > ifclen) {
			if (iocp->ioc_cmd == O_SIOCGIFCONF) {
				/* old behaviour */
				return (EINVAL);
			} else {
				ifc_bufsize = ifclen;
			}
		}

		mp1 = mi_copyout_alloc(q, mp,
		    STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
		if (mp1 == NULL)
			return (ENOMEM);

		mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
	}
	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
	/*
	 * the SIOCGIFCONF ioctl only knows about
	 * IPv4 addresses, so don't try to tell
	 * it about interfaces with IPv6-only
	 * addresses. (Last parm 'isv6' is B_FALSE)
	 */

	ifr = (struct ifreq *)mp1->b_rptr;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
				if (iocp->ioc_cmd == O_SIOCGIFCONF) {
					/* old behaviour */
					rw_exit(&ill_g_lock);
					return (EINVAL);
				} else {
					goto if_copydone;
				}
			}
			(void) ipif_get_name(ipif,
			    ifr->ifr_name,
			    sizeof (ifr->ifr_name));
			sin = (sin_t *)&ifr->ifr_addr;
			*sin = sin_null;
			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
			ifr++;
		}
	}
if_copydone:
	rw_exit(&ill_g_lock);
	mp1->b_wptr = (uchar_t *)ifr;

	if (STRUCT_BUF(ifc) != NULL) {
		STRUCT_FSET(ifc, ifc_len,
		    (int)((uchar_t *)ifr - mp1->b_rptr));
	}
	return (0);
}

/*
 * Get the interfaces using the address hosted on the interface passed in,
 * as a source address.
 */
/* ARGSUSED */
int
ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
	mblk_t *mp1;
	ill_t	*ill, *ill_head;
	ipif_t	*ipif, *orig_ipif;
	int	numlifs = 0;
	size_t	lifs_bufsize, lifsmaxlen;
	struct	lifreq *lifr;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	uint_t	ifindex;
	zoneid_t zoneid;
	int	err = 0;
	boolean_t isv6 = B_FALSE;
	struct	sockaddr_in	*sin;
	struct	sockaddr_in6	*sin6;

	STRUCT_HANDLE(lifsrcof, lifs);

	ASSERT(q->q_next == NULL);

	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	/*
	 * Must be (better be!) a continuation of a TRANSPARENT
	 * IOCTL. We just copied in the lifsrcof structure.
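	 *
	 * From userland the exchange looks roughly like this (illustrative
	 * sketch; s, buf and bufsize are the application's):
	 *
	 *	struct lifsrcof lifs;
	 *
	 *	lifs.lifs_ifindex = ifindex;
	 *	lifs.lifs_maxlen = bufsize;
	 *	lifs.lifs_buf = buf;
	 *	ioctl(s, SIOCGLIFSRCOF, &lifs);
	 *
	 * On return lifs_len holds the size actually needed, and the buffer
	 * is filled in only if lifs_maxlen was large enough.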
8517 */ 8518 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8519 (struct lifsrcof *)mp1->b_rptr); 8520 8521 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8522 return (EINVAL); 8523 8524 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8525 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8526 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8527 ip_process_ioctl, &err); 8528 if (ipif == NULL) { 8529 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8530 ifindex)); 8531 return (err); 8532 } 8533 8534 8535 /* Allocate a buffer to hold requested information */ 8536 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8537 lifs_bufsize = numlifs * sizeof (struct lifreq); 8538 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8539 /* The actual size needed is always returned in lifs_len */ 8540 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8541 8542 /* If the amount we need is more than what is passed in, abort */ 8543 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8544 ipif_refrele(ipif); 8545 return (0); 8546 } 8547 8548 mp1 = mi_copyout_alloc(q, mp, 8549 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8550 if (mp1 == NULL) { 8551 ipif_refrele(ipif); 8552 return (ENOMEM); 8553 } 8554 8555 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8556 bzero(mp1->b_rptr, lifs_bufsize); 8557 8558 lifr = (struct lifreq *)mp1->b_rptr; 8559 8560 ill = ill_head = ipif->ipif_ill; 8561 orig_ipif = ipif; 8562 8563 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8564 rw_enter(&ill_g_usesrc_lock, RW_READER); 8565 rw_enter(&ill_g_lock, RW_READER); 8566 8567 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8568 for (; (ill != NULL) && (ill != ill_head); 8569 ill = ill->ill_usesrc_grp_next) { 8570 8571 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8572 break; 8573 8574 ipif = ill->ill_ipif; 8575 (void) ipif_get_name(ipif, 8576 lifr->lifr_name, sizeof (lifr->lifr_name)); 8577 if (ipif->ipif_isv6) { 8578 sin6 = (sin6_t *)&lifr->lifr_addr; 8579 *sin6 = sin6_null; 8580 sin6->sin6_family = AF_INET6; 8581 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8582 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8583 &ipif->ipif_v6net_mask); 8584 } else { 8585 sin = (sin_t *)&lifr->lifr_addr; 8586 *sin = sin_null; 8587 sin->sin_family = AF_INET; 8588 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8589 lifr->lifr_addrlen = ip_mask_to_plen( 8590 ipif->ipif_net_mask); 8591 } 8592 lifr++; 8593 } 8594 rw_exit(&ill_g_usesrc_lock); 8595 rw_exit(&ill_g_lock); 8596 ipif_refrele(orig_ipif); 8597 mp1->b_wptr = (uchar_t *)lifr; 8598 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8599 8600 return (0); 8601 } 8602 8603 /* ARGSUSED */ 8604 int 8605 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8606 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8607 { 8608 mblk_t *mp1; 8609 int list; 8610 ill_t *ill; 8611 ipif_t *ipif; 8612 int flags; 8613 int numlifs = 0; 8614 size_t lifc_bufsize; 8615 struct lifreq *lifr; 8616 sa_family_t family; 8617 struct sockaddr_in *sin; 8618 struct sockaddr_in6 *sin6; 8619 ill_walk_context_t ctx; 8620 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8621 int32_t lifclen; 8622 zoneid_t zoneid; 8623 STRUCT_HANDLE(lifconf, lifc); 8624 8625 ip1dbg(("ip_sioctl_get_lifconf")); 8626 8627 ASSERT(q->q_next == NULL); 8628 8629 zoneid = Q_TO_CONN(q)->conn_zoneid; 8630 8631 /* Existence verified in ip_wput_nondata */ 8632 mp1 = mp->b_cont->b_cont; 8633 8634 /* 8635 * An extended version of SIOCGIFCONF that takes an 8636 * additional address family and flags field. 8637 * AF_UNSPEC retrieve both IPv4 and IPv6. 
8638 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8639 * interfaces are omitted. 8640 * Similarly, IPIF_TEMPORARY interfaces are omitted 8641 * unless LIFC_TEMPORARY is specified. 8642 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8643 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8644 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8645 * has priority over LIFC_NOXMIT. 8646 */ 8647 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8648 8649 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8650 return (EINVAL); 8651 8652 /* 8653 * Must be (better be!) continuation of a TRANSPARENT 8654 * IOCTL. We just copied in the lifconf structure. 8655 */ 8656 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8657 8658 family = STRUCT_FGET(lifc, lifc_family); 8659 flags = STRUCT_FGET(lifc, lifc_flags); 8660 8661 switch (family) { 8662 case AF_UNSPEC: 8663 /* 8664 * walk all ILL's. 8665 */ 8666 list = MAX_G_HEADS; 8667 break; 8668 case AF_INET: 8669 /* 8670 * walk only IPV4 ILL's. 8671 */ 8672 list = IP_V4_G_HEAD; 8673 break; 8674 case AF_INET6: 8675 /* 8676 * walk only IPV6 ILL's. 8677 */ 8678 list = IP_V6_G_HEAD; 8679 break; 8680 default: 8681 return (EAFNOSUPPORT); 8682 } 8683 8684 /* 8685 * Allocate a buffer to hold requested information. 8686 * 8687 * If lifc_len is larger than what is needed, we only 8688 * allocate what we will use. 8689 * 8690 * If lifc_len is smaller than what is needed, return 8691 * EINVAL. 8692 */ 8693 numlifs = ip_get_numlifs(family, flags, zoneid); 8694 lifc_bufsize = numlifs * sizeof (struct lifreq); 8695 lifclen = STRUCT_FGET(lifc, lifc_len); 8696 if (lifc_bufsize > lifclen) { 8697 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8698 return (EINVAL); 8699 else 8700 lifc_bufsize = lifclen; 8701 } 8702 8703 mp1 = mi_copyout_alloc(q, mp, 8704 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8705 if (mp1 == NULL) 8706 return (ENOMEM); 8707 8708 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8709 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8710 8711 lifr = (struct lifreq *)mp1->b_rptr; 8712 8713 rw_enter(&ill_g_lock, RW_READER); 8714 ill = ill_first(list, list, &ctx); 8715 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8716 for (ipif = ill->ill_ipif; ipif != NULL; 8717 ipif = ipif->ipif_next) { 8718 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8719 !(flags & LIFC_NOXMIT)) 8720 continue; 8721 8722 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8723 !(flags & LIFC_TEMPORARY)) 8724 continue; 8725 8726 if (((ipif->ipif_flags & 8727 (IPIF_NOXMIT|IPIF_NOLOCAL| 8728 IPIF_DEPRECATED)) || 8729 (ill->ill_phyint->phyint_flags & 8730 PHYI_LOOPBACK) || 8731 !(ipif->ipif_flags & IPIF_UP)) && 8732 (flags & LIFC_EXTERNAL_SOURCE)) 8733 continue; 8734 8735 if (zoneid != ipif->ipif_zoneid && 8736 ipif->ipif_zoneid != ALL_ZONES && 8737 (zoneid != GLOBAL_ZONEID || 8738 !(flags & LIFC_ALLZONES))) 8739 continue; 8740 8741 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8742 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8743 rw_exit(&ill_g_lock); 8744 return (EINVAL); 8745 } else { 8746 goto lif_copydone; 8747 } 8748 } 8749 8750 (void) ipif_get_name(ipif, 8751 lifr->lifr_name, 8752 sizeof (lifr->lifr_name)); 8753 if (ipif->ipif_isv6) { 8754 sin6 = (sin6_t *)&lifr->lifr_addr; 8755 *sin6 = sin6_null; 8756 sin6->sin6_family = AF_INET6; 8757 sin6->sin6_addr = 8758 ipif->ipif_v6lcl_addr; 8759 lifr->lifr_addrlen = 8760 ip_mask_to_plen_v6( 8761 &ipif->ipif_v6net_mask); 8762 } else { 8763 sin = (sin_t *)&lifr->lifr_addr; 8764 *sin = sin_null; 8765 sin->sin_family = AF_INET; 8766 
sin->sin_addr.s_addr = 8767 ipif->ipif_lcl_addr; 8768 lifr->lifr_addrlen = 8769 ip_mask_to_plen( 8770 ipif->ipif_net_mask); 8771 } 8772 lifr++; 8773 } 8774 } 8775 lif_copydone: 8776 rw_exit(&ill_g_lock); 8777 8778 mp1->b_wptr = (uchar_t *)lifr; 8779 if (STRUCT_BUF(lifc) != NULL) { 8780 STRUCT_FSET(lifc, lifc_len, 8781 (int)((uchar_t *)lifr - mp1->b_rptr)); 8782 } 8783 return (0); 8784 } 8785 8786 /* ARGSUSED */ 8787 int 8788 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 8789 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8790 { 8791 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8792 ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 8793 return (0); 8794 } 8795 8796 static void 8797 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8798 { 8799 ip6_asp_t *table; 8800 size_t table_size; 8801 mblk_t *data_mp; 8802 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8803 8804 /* These two ioctls are I_STR only */ 8805 if (iocp->ioc_count == TRANSPARENT) { 8806 miocnak(q, mp, 0, EINVAL); 8807 return; 8808 } 8809 8810 data_mp = mp->b_cont; 8811 if (data_mp == NULL) { 8812 /* The user passed us a NULL argument */ 8813 table = NULL; 8814 table_size = iocp->ioc_count; 8815 } else { 8816 /* 8817 * The user provided a table. The stream head 8818 * may have copied in the user data in chunks, 8819 * so make sure everything is pulled up 8820 * properly. 8821 */ 8822 if (MBLKL(data_mp) < iocp->ioc_count) { 8823 mblk_t *new_data_mp; 8824 if ((new_data_mp = msgpullup(data_mp, -1)) == 8825 NULL) { 8826 miocnak(q, mp, 0, ENOMEM); 8827 return; 8828 } 8829 freemsg(data_mp); 8830 data_mp = new_data_mp; 8831 mp->b_cont = data_mp; 8832 } 8833 table = (ip6_asp_t *)data_mp->b_rptr; 8834 table_size = iocp->ioc_count; 8835 } 8836 8837 switch (iocp->ioc_cmd) { 8838 case SIOCGIP6ADDRPOLICY: 8839 iocp->ioc_rval = ip6_asp_get(table, table_size); 8840 if (iocp->ioc_rval == -1) 8841 iocp->ioc_error = EINVAL; 8842 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8843 else if (table != NULL && 8844 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8845 ip6_asp_t *src = table; 8846 ip6_asp32_t *dst = (void *)table; 8847 int count = table_size / sizeof (ip6_asp_t); 8848 int i; 8849 8850 /* 8851 * We need to do an in-place shrink of the array 8852 * to match the alignment attributes of the 8853 * 32-bit ABI looking at it. 8854 */ 8855 /* LINTED: logical expression always true: op "||" */ 8856 ASSERT(sizeof (*src) > sizeof (*dst)); 8857 for (i = 1; i < count; i++) 8858 bcopy(src + i, dst + i, sizeof (*dst)); 8859 } 8860 #endif 8861 break; 8862 8863 case SIOCSIP6ADDRPOLICY: 8864 ASSERT(mp->b_prev == NULL); 8865 mp->b_prev = (void *)q; 8866 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8867 /* 8868 * We pass in the datamodel here so that the ip6_asp_replace() 8869 * routine can handle converting from 32-bit to native formats 8870 * where necessary. 8871 * 8872 * A better way to handle this might be to convert the inbound 8873 * data structure here, and hang it off a new 'mp'; thus the 8874 * ip6_asp_replace() logic would always be dealing with native 8875 * format data structures.. 8876 * 8877 * (An even simpler way to handle these ioctls is to just 8878 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8879 * and just recompile everything that depends on it.) 8880 */ 8881 #endif 8882 ip6_asp_replace(mp, table, table_size, B_FALSE, 8883 iocp->ioc_flag & IOC_MODELS); 8884 return; 8885 } 8886 8887 DB_TYPE(mp) = (iocp->ioc_error == 0) ? 
M_IOCACK : M_IOCNAK; 8888 qreply(q, mp); 8889 } 8890 8891 static void 8892 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8893 { 8894 mblk_t *data_mp; 8895 struct dstinforeq *dir; 8896 uint8_t *end, *cur; 8897 in6_addr_t *daddr, *saddr; 8898 ipaddr_t v4daddr; 8899 ire_t *ire; 8900 char *slabel, *dlabel; 8901 boolean_t isipv4; 8902 int match_ire; 8903 ill_t *dst_ill; 8904 ipif_t *src_ipif, *ire_ipif; 8905 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8906 zoneid_t zoneid; 8907 8908 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8909 zoneid = Q_TO_CONN(q)->conn_zoneid; 8910 8911 /* 8912 * This ioctl is I_STR only, and must have a 8913 * data mblk following the M_IOCTL mblk. 8914 */ 8915 data_mp = mp->b_cont; 8916 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8917 miocnak(q, mp, 0, EINVAL); 8918 return; 8919 } 8920 8921 if (MBLKL(data_mp) < iocp->ioc_count) { 8922 mblk_t *new_data_mp; 8923 8924 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8925 miocnak(q, mp, 0, ENOMEM); 8926 return; 8927 } 8928 freemsg(data_mp); 8929 data_mp = new_data_mp; 8930 mp->b_cont = data_mp; 8931 } 8932 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8933 8934 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8935 end - cur >= sizeof (struct dstinforeq); 8936 cur += sizeof (struct dstinforeq)) { 8937 dir = (struct dstinforeq *)cur; 8938 daddr = &dir->dir_daddr; 8939 saddr = &dir->dir_saddr; 8940 8941 /* 8942 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8943 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8944 * and ipif_select_source[_v6]() do not. 8945 */ 8946 dir->dir_dscope = ip_addr_scope_v6(daddr); 8947 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence); 8948 8949 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 8950 if (isipv4) { 8951 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 8952 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 8953 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8954 } else { 8955 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 8956 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8957 } 8958 if (ire == NULL) { 8959 dir->dir_dreachable = 0; 8960 8961 /* move on to next dst addr */ 8962 continue; 8963 } 8964 dir->dir_dreachable = 1; 8965 8966 ire_ipif = ire->ire_ipif; 8967 if (ire_ipif == NULL) 8968 goto next_dst; 8969 8970 /* 8971 * We expect to get back an interface ire or a 8972 * gateway ire cache entry. For both types, the 8973 * output interface is ire_ipif->ipif_ill. 8974 */ 8975 dst_ill = ire_ipif->ipif_ill; 8976 dir->dir_dmactype = dst_ill->ill_mactype; 8977 8978 if (isipv4) { 8979 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 8980 } else { 8981 src_ipif = ipif_select_source_v6(dst_ill, 8982 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 8983 zoneid); 8984 } 8985 if (src_ipif == NULL) 8986 goto next_dst; 8987 8988 *saddr = src_ipif->ipif_v6lcl_addr; 8989 dir->dir_sscope = ip_addr_scope_v6(saddr); 8990 slabel = ip6_asp_lookup(saddr, NULL); 8991 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 8992 dir->dir_sdeprecated = 8993 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 8994 ipif_refrele(src_ipif); 8995 next_dst: 8996 ire_refrele(ire); 8997 } 8998 miocack(q, mp, iocp->ioc_count, 0); 8999 } 9000 9001 9002 /* 9003 * Check if this is an address assigned to this machine. 9004 * Skips interfaces that are down by using ire checks. 9005 * Translates mapped addresses to v4 addresses and then 9006 * treats them as such, returning true if the v4 address 9007 * associated with this mapped address is configured. 
 * Note: Applications will have to be careful what they do
 * with the response; use of mapped addresses limits
 * what can be done with the socket, especially with
 * respect to socket options and ioctls - neither IPv4
 * options nor IPv6 sticky options/ancillary data options
 * may be used.
 */
/* ARGSUSED */
int
ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	struct sioc_addrreq *sia;
	sin_t *sin;
	ire_t *ire;
	mblk_t *mp1;
	zoneid_t zoneid;

	ip1dbg(("ip_sioctl_tmyaddr"));

	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	sia = (struct sioc_addrreq *)mp1->b_rptr;
	sin = (sin_t *)&sia->sa_addr;
	switch (sin->sin_family) {
	case AF_INET6: {
		sin6_t *sin6 = (sin6_t *)sin;

		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			ipaddr_t v4_addr;

			IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
			    v4_addr);
			ire = ire_ctable_lookup(v4_addr, 0,
			    IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
			    NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY);
		} else {
			in6_addr_t v6addr;

			v6addr = sin6->sin6_addr;
			ire = ire_ctable_lookup_v6(&v6addr, 0,
			    IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
			    NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY);
		}
		break;
	}
	case AF_INET: {
		ipaddr_t v4addr;

		v4addr = sin->sin_addr.s_addr;
		ire = ire_ctable_lookup(v4addr, 0,
		    IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
		    NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY);
		break;
	}
	default:
		return (EAFNOSUPPORT);
	}
	if (ire != NULL) {
		sia->sa_res = 1;
		ire_refrele(ire);
	} else {
		sia->sa_res = 0;
	}
	return (0);
}

/*
 * Check if this is an address assigned on-link, i.e., a neighbor,
 * and make sure it's reachable from the current zone.
 * Returns true for my addresses as well.
 * Translates mapped addresses to v4 addresses and then
 * treats them as such, returning true if the v4 address
 * associated with this mapped address is configured.
 * Note: Applications will have to be careful what they do
 * with the response; use of mapped addresses limits
 * what can be done with the socket, especially with
 * respect to socket options and ioctls - neither IPv4
 * options nor IPv6 sticky options/ancillary data options
 * may be used.
 */
/* ARGSUSED */
int
ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	struct sioc_addrreq *sia;
	sin_t *sin;
	mblk_t	*mp1;
	ire_t *ire = NULL;
	zoneid_t zoneid;

	ip1dbg(("ip_sioctl_tonlink"));

	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	sia = (struct sioc_addrreq *)mp1->b_rptr;
	sin = (sin_t *)&sia->sa_addr;

	/*
	 * Match addresses with a zero gateway field to avoid
	 * routes going through a router.
	 * Exclude broadcast and multicast addresses.
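	 *
	 * An illustrative userland check that 192.0.2.1 is on-link
	 * (address and socket setup are hypothetical):
	 *
	 *	struct sioc_addrreq sar;
	 *	sin_t *sin = (sin_t *)&sar.sa_addr;
	 *
	 *	sin->sin_family = AF_INET;
	 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
	 *	ioctl(s, SIOCTONLINK, &sar);
	 *
	 * sar.sa_res comes back 1 if the address is on-link (or local),
	 * else 0.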
	 */
	switch (sin->sin_family) {
	case AF_INET6: {
		sin6_t *sin6 = (sin6_t *)sin;

		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			ipaddr_t v4_addr;

			IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
			    v4_addr);
			if (!CLASSD(v4_addr)) {
				ire = ire_route_lookup(v4_addr, 0, 0, 0,
				    NULL, NULL, zoneid, NULL,
				    MATCH_IRE_GW);
			}
		} else {
			in6_addr_t v6addr;
			in6_addr_t v6gw;

			v6addr = sin6->sin6_addr;
			v6gw = ipv6_all_zeros;
			if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
				ire = ire_route_lookup_v6(&v6addr, 0,
				    &v6gw, 0, NULL, NULL, zoneid,
				    NULL, MATCH_IRE_GW);
			}
		}
		break;
	}
	case AF_INET: {
		ipaddr_t v4addr;

		v4addr = sin->sin_addr.s_addr;
		if (!CLASSD(v4addr)) {
			ire = ire_route_lookup(v4addr, 0, 0, 0,
			    NULL, NULL, zoneid, NULL,
			    MATCH_IRE_GW);
		}
		break;
	}
	default:
		return (EAFNOSUPPORT);
	}
	sia->sa_res = 0;
	if (ire != NULL) {
		if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE|
		    IRE_LOCAL|IRE_LOOPBACK)) {
			sia->sa_res = 1;
		}
		ire_refrele(ire);
	}
	return (0);
}

/*
 * TBD: implement when the kernel maintains a list of site prefixes.
 */
/* ARGSUSED */
int
ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	return (ENXIO);
}

/* ARGSUSED */
int
ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	ill_t	*ill;
	mblk_t	*mp1;
	conn_t	*connp;
	boolean_t success;

	ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/* ioctl comes down on a conn */
	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);

	mp->b_datap->db_type = M_IOCTL;

	/*
	 * Send down a copy. (copymsg does not copy b_next/b_prev).
	 * The original mp contains contaminated b_next values due to 'mi',
	 * which is needed to do the mi_copy_done. Unfortunately if we
	 * send down the original mblk itself and if we are popped due to
	 * an unplumb before the response comes back from the tunnel,
	 * the streamhead (which does a freemsg) will see this contaminated
	 * message and the assertion in freemsg about non-null b_next/b_prev
	 * will panic a DEBUG kernel.
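	 *
	 * Schematically (a sketch of the code below, not an interface):
	 *
	 *	mp1 = copymsg(mp);
	 *	ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 0);
	 *	putnext(ill->ill_wq, mp1);
	 *
	 * i.e. the clean copy goes down to the tunnel module while the
	 * original mp, b_next/b_prev intact for mi, is parked until the
	 * ack comes back.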
9209 */ 9210 mp1 = copymsg(mp); 9211 if (mp1 == NULL) 9212 return (ENOMEM); 9213 9214 ill = ipif->ipif_ill; 9215 mutex_enter(&connp->conn_lock); 9216 mutex_enter(&ill->ill_lock); 9217 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 9218 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9219 mp, 0); 9220 } else { 9221 success = ill_pending_mp_add(ill, connp, mp); 9222 } 9223 mutex_exit(&ill->ill_lock); 9224 mutex_exit(&connp->conn_lock); 9225 9226 if (success) { 9227 ip1dbg(("sending down tunparam request ")); 9228 putnext(ill->ill_wq, mp1); 9229 return (EINPROGRESS); 9230 } else { 9231 /* The conn has started closing */ 9232 freemsg(mp1); 9233 return (EINTR); 9234 } 9235 } 9236 9237 static int 9238 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, 9239 boolean_t x_arp_ioctl, boolean_t if_arp_ioctl) 9240 { 9241 mblk_t *mp1; 9242 mblk_t *mp2; 9243 mblk_t *pending_mp; 9244 ipaddr_t ipaddr; 9245 area_t *area; 9246 struct iocblk *iocp; 9247 conn_t *connp; 9248 struct arpreq *ar; 9249 struct xarpreq *xar; 9250 boolean_t success; 9251 int flags, alength; 9252 char *lladdr; 9253 9254 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9255 connp = Q_TO_CONN(q); 9256 9257 iocp = (struct iocblk *)mp->b_rptr; 9258 /* 9259 * ill has already been set depending on whether 9260 * bsd style or interface style ioctl. 9261 */ 9262 ASSERT(ill != NULL); 9263 9264 /* 9265 * Is this one of the new SIOC*XARP ioctls? 9266 */ 9267 if (x_arp_ioctl) { 9268 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9269 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9270 ar = NULL; 9271 9272 flags = xar->xarp_flags; 9273 lladdr = LLADDR(&xar->xarp_ha); 9274 /* 9275 * Validate against user's link layer address length 9276 * input and name and addr length limits. 9277 */ 9278 alength = ill->ill_phys_addr_length; 9279 if (iocp->ioc_cmd == SIOCSXARP) { 9280 if (alength != xar->xarp_ha.sdl_alen || 9281 (alength + xar->xarp_ha.sdl_nlen > 9282 sizeof (xar->xarp_ha.sdl_data))) 9283 return (EINVAL); 9284 } 9285 } else { 9286 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9287 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9288 xar = NULL; 9289 9290 flags = ar->arp_flags; 9291 lladdr = ar->arp_ha.sa_data; 9292 /* 9293 * Theoretically, the sa_family could tell us what link 9294 * layer type this operation is trying to deal with. By 9295 * common usage AF_UNSPEC means ethernet. We'll assume 9296 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9297 * for now. Our new SIOC*XARP ioctls can be used more 9298 * generally. 9299 * 9300 * If the underlying media happens to have a non 6 byte 9301 * address, arp module will fail set/get, but the del 9302 * operation will succeed. 9303 */ 9304 alength = 6; 9305 if ((iocp->ioc_cmd != SIOCDARP) && 9306 (alength != ill->ill_phys_addr_length)) { 9307 return (EINVAL); 9308 } 9309 } 9310 9311 /* 9312 * We are going to pass up to ARP a packet chain that looks 9313 * like: 9314 * 9315 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9316 * 9317 * Get a copy of the original IOCTL mblk to head the chain, 9318 * to be sent up (in mp1). Also get another copy to store 9319 * in the ill_pending_mp list, for matching the response 9320 * when it comes back from ARP. 
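	 *
	 * A sketch of the assembly performed below (illustrative recap of
	 * the diagram above):
	 *
	 *	mp1 = copyb(mp);		heads the chain sent to ARP
	 *	pending_mp = copymsg(mp);	saved to match the M_IOCACK
	 *	mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, ...);
	 *	mp1->b_cont = mp2;
	 *	mp2->b_cont = mp;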
	 */
	mp1 = copyb(mp);
	pending_mp = copymsg(mp);
	if (mp1 == NULL || pending_mp == NULL) {
		if (mp1 != NULL)
			freeb(mp1);
		if (pending_mp != NULL)
			inet_freemsg(pending_mp);
		return (ENOMEM);
	}

	ipaddr = sin->sin_addr.s_addr;

	mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
	    (caddr_t)&ipaddr);
	if (mp2 == NULL) {
		freeb(mp1);
		inet_freemsg(pending_mp);
		return (ENOMEM);
	}
	/* Put together the chain. */
	mp1->b_cont = mp2;
	mp1->b_datap->db_type = M_IOCTL;
	mp2->b_cont = mp;
	mp2->b_datap->db_type = M_DATA;

	iocp = (struct iocblk *)mp1->b_rptr;

	/*
	 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an
	 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a
	 * cp_private field (or cp_rval on 32-bit systems) in place of the
	 * ioc_count field; set ioc_count to be correct.
	 */
	iocp->ioc_count = MBLKL(mp1->b_cont);

	/*
	 * Set the proper command in the ARP message.
	 * Convert the SIOC{G|S|D}ARP calls into our
	 * AR_ENTRY_xxx calls.
	 */
	area = (area_t *)mp2->b_rptr;
	switch (iocp->ioc_cmd) {
	case SIOCDARP:
	case SIOCDXARP:
		/*
		 * We defer deleting the corresponding IRE until
		 * we return from arp.
		 */
		area->area_cmd = AR_ENTRY_DELETE;
		area->area_proto_mask_offset = 0;
		break;
	case SIOCGARP:
	case SIOCGXARP:
		area->area_cmd = AR_ENTRY_SQUERY;
		area->area_proto_mask_offset = 0;
		break;
	case SIOCSARP:
	case SIOCSXARP: {
		/*
		 * Delete the corresponding ire to make sure IP will
		 * pick up any change from arp.
		 */
		if (!if_arp_ioctl) {
			(void) ip_ire_clookup_and_delete(ipaddr, NULL);
			break;
		} else {
			ipif_t *ipif = ipif_get_next_ipif(NULL, ill);

			if (ipif != NULL) {
				(void) ip_ire_clookup_and_delete(ipaddr,
				    ipif);
				ipif_refrele(ipif);
			}
			break;
		}
	}
	}
	iocp->ioc_cmd = area->area_cmd;

	/*
	 * Before sending 'mp' to ARP, we have to clear the b_next
	 * and b_prev. Otherwise if STREAMS encounters such a message
	 * in freemsg() (because ARP can close any time) it can cause
	 * a panic. But the mi code needs the b_next and b_prev values
	 * of mp->b_cont to complete the ioctl, so we store them here
	 * in pending_mp->b_cont, and restore them in ip_sioctl_iocack()
	 * when the response comes down from ARP.
	 */
	pending_mp->b_cont->b_next = mp->b_cont->b_next;
	pending_mp->b_cont->b_prev = mp->b_cont->b_prev;
	mp->b_cont->b_next = NULL;
	mp->b_cont->b_prev = NULL;

	mutex_enter(&connp->conn_lock);
	mutex_enter(&ill->ill_lock);
	/* conn has not yet started closing, hence this can't fail */
	success = ill_pending_mp_add(ill, connp, pending_mp);
	ASSERT(success);
	mutex_exit(&ill->ill_lock);
	mutex_exit(&connp->conn_lock);

	/*
	 * Fill in the rest of the ARP operation fields.
	 */
	area->area_hw_addr_length = alength;
	bcopy(lladdr,
	    (char *)area + area->area_hw_addr_offset,
	    area->area_hw_addr_length);
	/* Translate the flags. */
	if (flags & ATF_PERM)
		area->area_flags |= ACE_F_PERMANENT;
	if (flags & ATF_PUBL)
		area->area_flags |= ACE_F_PUBLISH;
	if (flags & ATF_AUTHORITY)
		area->area_flags |= ACE_F_AUTHORITY;

	/*
	 * Up to ARP it goes.
The response will come 9438 * back in ip_wput as an M_IOCACK message, and 9439 * will be handed to ip_sioctl_iocack for 9440 * completion. 9441 */ 9442 putnext(ill->ill_rq, mp1); 9443 return (EINPROGRESS); 9444 } 9445 9446 /* ARGSUSED */ 9447 int 9448 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9449 ip_ioctl_cmd_t *ipip, void *ifreq) 9450 { 9451 struct xarpreq *xar; 9452 boolean_t isv6; 9453 mblk_t *mp1; 9454 int err; 9455 conn_t *connp; 9456 int ifnamelen; 9457 ire_t *ire = NULL; 9458 ill_t *ill = NULL; 9459 struct sockaddr_in *sin; 9460 boolean_t if_arp_ioctl = B_FALSE; 9461 9462 /* ioctl comes down on a conn */ 9463 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9464 connp = Q_TO_CONN(q); 9465 isv6 = connp->conn_af_isv6; 9466 9467 /* Existence verified in ip_wput_nondata */ 9468 mp1 = mp->b_cont->b_cont; 9469 9470 ASSERT(MBLKL(mp1) >= sizeof (*xar)); 9471 xar = (struct xarpreq *)mp1->b_rptr; 9472 sin = (sin_t *)&xar->xarp_pa; 9473 9474 if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) || 9475 (xar->xarp_pa.ss_family != AF_INET)) 9476 return (ENXIO); 9477 9478 ifnamelen = xar->xarp_ha.sdl_nlen; 9479 if (ifnamelen != 0) { 9480 char *cptr, cval; 9481 9482 if (ifnamelen >= LIFNAMSIZ) 9483 return (EINVAL); 9484 9485 /* 9486 * Instead of bcopying a bunch of bytes, 9487 * null-terminate the string in-situ. 9488 */ 9489 cptr = xar->xarp_ha.sdl_data + ifnamelen; 9490 cval = *cptr; 9491 *cptr = '\0'; 9492 ill = ill_lookup_on_name(xar->xarp_ha.sdl_data, 9493 B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl, 9494 &err, NULL); 9495 *cptr = cval; 9496 if (ill == NULL) 9497 return (err); 9498 if (ill->ill_net_type != IRE_IF_RESOLVER) { 9499 ill_refrele(ill); 9500 return (ENXIO); 9501 } 9502 9503 if_arp_ioctl = B_TRUE; 9504 } else { 9505 /* 9506 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves 9507 * as an extended BSD ioctl. The kernel uses the IP address 9508 * to figure out the network interface. 9509 */ 9510 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9511 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9512 ((ill = ire_to_ill(ire)) == NULL) || 9513 (ill->ill_net_type != IRE_IF_RESOLVER)) { 9514 if (ire != NULL) 9515 ire_refrele(ire); 9516 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9517 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9518 NULL, MATCH_IRE_TYPE); 9519 if ((ire == NULL) || 9520 ((ill = ire_to_ill(ire)) == NULL)) { 9521 if (ire != NULL) 9522 ire_refrele(ire); 9523 return (ENXIO); 9524 } 9525 } 9526 ASSERT(ire != NULL && ill != NULL); 9527 } 9528 9529 err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl); 9530 if (if_arp_ioctl) 9531 ill_refrele(ill); 9532 if (ire != NULL) 9533 ire_refrele(ire); 9534 9535 return (err); 9536 } 9537 9538 /* 9539 * ARP IOCTLs. 9540 * How does IP get in the business of fronting ARP configuration/queries? 9541 * Well, it's like this: the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9542 * are by tradition passed in through a datagram socket. That lands in IP. 9543 * As it happens, this is just as well since the interface is quite crude in 9544 * that it passes in no information about protocol or hardware types, or 9545 * interface association. After making the protocol assumption, IP is in 9546 * the position to look up the name of the ILL, which ARP will need, and 9547 * format a request that can be handled by ARP. The request is passed up 9548 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9549 * back a response.
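 *
 * For example, the user-level side of a query is simply this (sketch;
 * error handling omitted, and "target" stands for some IPv4 address):
 *
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *	struct arpreq ar;
 *
 *	bzero(&ar, sizeof (ar));
 *	((struct sockaddr_in *)&ar.arp_pa)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&ar.arp_pa)->sin_addr.s_addr = target;
 *	(void) ioctl(s, SIOCGARP, (caddr_t)&ar);
 *
 * which arrives here as an M_IOCTL on the datagram socket's stream.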
ARP supports its own set of more general IOCTLs, in 9550 * case anyone is interested. 9551 */ 9552 /* ARGSUSED */ 9553 int 9554 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9555 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9556 { 9557 struct arpreq *ar; 9558 struct sockaddr_in *sin; 9559 ire_t *ire; 9560 boolean_t isv6; 9561 mblk_t *mp1; 9562 int err; 9563 conn_t *connp; 9564 ill_t *ill; 9565 9566 /* ioctl comes down on a conn */ 9567 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9568 connp = Q_TO_CONN(q); 9569 isv6 = connp->conn_af_isv6; 9570 if (isv6) 9571 return (ENXIO); 9572 9573 /* Existence verified in ip_wput_nondata */ 9574 mp1 = mp->b_cont->b_cont; 9575 9576 ar = (struct arpreq *)mp1->b_rptr; 9577 sin = (sin_t *)&ar->arp_pa; 9578 9579 /* 9580 * We need to let ARP know on which interface the IP 9581 * address has an ARP mapping. In the IPMP case, a 9582 * simple forwarding table lookup will return the 9583 * IRE_IF_RESOLVER for the first interface in the group, 9584 * which might not be the interface on which the 9585 * requested IP address was resolved due to the ill 9586 * selection algorithm (see ip_newroute_get_dst_ill()). 9587 * So we do a cache table lookup first: if the IRE cache 9588 * entry for the IP address is still there, it will 9589 * contain the ill pointer for the right interface, so 9590 * we use that. If the cache entry has been flushed, we 9591 * fall back to the forwarding table lookup. This should 9592 * be rare enough since IRE cache entries have a longer 9593 * life expectancy than ARP cache entries. 9594 */ 9595 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9596 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9597 ((ill = ire_to_ill(ire)) == NULL)) { 9598 if (ire != NULL) 9599 ire_refrele(ire); 9600 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9601 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9602 NULL, MATCH_IRE_TYPE); 9603 if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) { 9604 if (ire != NULL) 9605 ire_refrele(ire); 9606 return (ENXIO); 9607 } 9608 } 9609 ASSERT(ire != NULL && ill != NULL); 9610 9611 err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE); 9612 ire_refrele(ire); 9613 return (err); 9614 } 9615 9616 /* 9617 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9618 * atomically set/clear the muxids. Also complete the ioctl by acking or 9619 * naking it. Note that the code is structured such that the link type, 9620 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9621 * its clones use the persistent link, while pppd(1M) and perhaps many 9622 * other daemons may use non-persistent links. When combined with some 9623 * ill_t states, linking and unlinking lower streams may be used as 9624 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9625 */ 9626 /* ARGSUSED */ 9627 void 9628 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9629 { 9630 mblk_t *mp1; 9631 mblk_t *mp2; 9632 struct linkblk *li; 9633 queue_t *ipwq; 9634 char *name; 9635 struct qinit *qinfo; 9636 struct ipmx_s *ipmxp; 9637 ill_t *ill = NULL; 9638 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9639 int err = 0; 9640 boolean_t entered_ipsq = B_FALSE; 9641 boolean_t islink; 9642 queue_t *dwq = NULL; 9643 9644 ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK || 9645 iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK); 9646 9647 islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ?
9648 B_TRUE : B_FALSE; 9649 9650 mp1 = mp->b_cont; /* This is the linkblk info */ 9651 li = (struct linkblk *)mp1->b_rptr; 9652 9653 /* 9654 * ARP has added this special mblk, and the utility is asking us 9655 * to perform consistency checks, and also atomically set the 9656 * muxid. Ifconfig is an example. It achieves this by using 9657 * /dev/arp as the mux to plink the arp stream, and pushes arp on 9658 * to /dev/udp[6] stream for use as the mux when plinking the IP 9659 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9660 * and other comments in this routine for more details. 9661 */ 9662 mp2 = mp1->b_cont; /* This is added by ARP */ 9663 9664 /* 9665 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9666 * ifconfig which didn't push ARP on top of the dummy mux, we won't 9667 * get the special mblk above. For backward compatibility, we just 9668 * return success. The utility will use SIOCSLIFMUXID to store 9669 * the muxids. This is not atomic, and can leave the streams 9670 * unplumbable if the utility is interrupted before it does the 9671 * SIOCSLIFMUXID. 9672 */ 9673 if (mp2 == NULL) { 9674 /* 9675 * At this point we don't know whether or not this is the 9676 * IP module stream or the ARP device stream. We need to 9677 * walk the lower stream in order to find this out, since 9678 * the capability negotiation is done only on the IP module 9679 * stream. IP module instance is identified by the module 9680 * name IP, non-null q_next, and its wput not being ip_lwput. 9681 * STREAMS ensures that the lower stream (l_qbot) will not 9682 * vanish until this ioctl completes. So we can safely walk 9683 * the stream or refer to the q_ptr. 9684 */ 9685 ipwq = li->l_qbot; 9686 while (ipwq != NULL) { 9687 qinfo = ipwq->q_qinfo; 9688 name = qinfo->qi_minfo->mi_idname; 9689 if (name != NULL && name[0] != '\0' && 9690 (strcmp(name, ip_mod_info.mi_idname) == 0) && 9691 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 9692 (ipwq->q_next != NULL)) { 9693 break; 9694 } 9695 ipwq = ipwq->q_next; 9696 } 9697 /* 9698 * This looks like an IP module stream, so trigger 9699 * the capability reset or re-negotiation if necessary. 9700 */ 9701 if (ipwq != NULL) { 9702 ill = ipwq->q_ptr; 9703 ASSERT(ill != NULL); 9704 9705 if (ipsq == NULL) { 9706 ipsq = ipsq_try_enter(NULL, ill, q, mp, 9707 ip_sioctl_plink, NEW_OP, B_TRUE); 9708 if (ipsq == NULL) 9709 return; 9710 entered_ipsq = B_TRUE; 9711 } 9712 ASSERT(IAM_WRITER_ILL(ill)); 9713 /* 9714 * Store the upper read queue of the module 9715 * immediately below IP, and count the total 9716 * number of lower modules. Do this only 9717 * for I_PLINK or I_LINK event. 9718 */ 9719 ill->ill_lmod_rq = NULL; 9720 ill->ill_lmod_cnt = 0; 9721 if (islink && (dwq = ipwq->q_next) != NULL) { 9722 ill->ill_lmod_rq = RD(dwq); 9723 9724 while (dwq != NULL) { 9725 ill->ill_lmod_cnt++; 9726 dwq = dwq->q_next; 9727 } 9728 } 9729 /* 9730 * There's no point in resetting or re-negotiating if 9731 * we are not bound to the driver, so only do this if 9732 * the DLPI state is idle (up); we assume such state 9733 * since ill_ipif_up_count gets incremented in 9734 * ipif_up_done(), which is after we are bound to the 9735 * driver. Note that in the case of logical 9736 * interfaces, IP won't rebind to the driver unless 9737 * the ill_ipif_up_count is 0, meaning that all other 9738 * IP interfaces (including the main ipif) are in the 9739 * down state.
Because of this, we use this counter 9740 * as an indicator, instead of relying on the IPIF_UP 9741 * flag, which is per ipif instance. 9742 */ 9743 if (ill->ill_ipif_up_count > 0) { 9744 if (islink) 9745 ill_capability_probe(ill); 9746 else 9747 ill_capability_reset(ill); 9748 } 9749 } 9750 goto done; 9751 } 9752 9753 /* 9754 * This is an I_{P}LINK sent down by ifconfig on 9755 * /dev/arp. ARP has appended this last (3rd) mblk, 9756 * giving more info. STREAMS ensures that the lower 9757 * stream (l_qbot) will not vanish until this ioctl 9758 * completes. So we can safely walk the stream or refer 9759 * to the q_ptr. 9760 */ 9761 ipmxp = (struct ipmx_s *)mp2->b_rptr; 9762 if (ipmxp->ipmx_arpdev_stream) { 9763 /* 9764 * The operation is occurring on the arp-device 9765 * stream. 9766 */ 9767 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 9768 q, mp, ip_sioctl_plink, &err, NULL); 9769 if (ill == NULL) { 9770 if (err == EINPROGRESS) { 9771 return; 9772 } else { 9773 err = EINVAL; 9774 goto done; 9775 } 9776 } 9777 9778 if (ipsq == NULL) { 9779 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9780 NEW_OP, B_TRUE); 9781 if (ipsq == NULL) { 9782 ill_refrele(ill); 9783 return; 9784 } 9785 entered_ipsq = B_TRUE; 9786 } 9787 ASSERT(IAM_WRITER_ILL(ill)); 9788 ill_refrele(ill); 9789 /* 9790 * To ensure consistency between IP and ARP, 9791 * the following LIFO scheme is used in 9792 * plink/punlink. (IP first, ARP last). 9793 * This is because the muxids are stored 9794 * in the IP stream on the ill. 9795 * 9796 * I_{P}LINK: ifconfig plinks the IP stream before 9797 * plinking the ARP stream. On an arp-dev 9798 * stream, IP checks that it is not yet 9799 * plinked, and it also checks that the 9800 * corresponding IP stream is already plinked. 9801 * 9802 * I_{P}UNLINK: ifconfig punlinks the ARP stream 9803 * before punlinking the IP stream. IP does 9804 * not allow punlink of the IP stream unless 9805 * the arp stream has been punlinked. 9806 * 9807 */ 9808 if ((islink && 9809 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 9810 (!islink && 9811 ill->ill_arp_muxid != li->l_index)) { 9812 err = EINVAL; 9813 goto done; 9814 } 9815 if (islink) { 9816 ill->ill_arp_muxid = li->l_index; 9817 } else { 9818 ill->ill_arp_muxid = 0; 9819 } 9820 } else { 9821 /* 9822 * This must be the IP module stream with or 9823 * without arp. Walk the stream and locate the 9824 * IP module. An IP module instance is 9825 * identified by the module name IP, non-null 9826 * q_next, and its wput not being ip_lwput. 9827 */ 9828 ipwq = li->l_qbot; 9829 while (ipwq != NULL) { 9830 qinfo = ipwq->q_qinfo; 9831 name = qinfo->qi_minfo->mi_idname; 9832 if (name != NULL && name[0] != '\0' && 9833 (strcmp(name, ip_mod_info.mi_idname) == 0) && 9834 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 9835 (ipwq->q_next != NULL)) { 9836 break; 9837 } 9838 ipwq = ipwq->q_next; 9839 } 9840 if (ipwq != NULL) { 9841 ill = ipwq->q_ptr; 9842 ASSERT(ill != NULL); 9843 9844 if (ipsq == NULL) { 9845 ipsq = ipsq_try_enter(NULL, ill, q, mp, 9846 ip_sioctl_plink, NEW_OP, B_TRUE); 9847 if (ipsq == NULL) 9848 return; 9849 entered_ipsq = B_TRUE; 9850 } 9851 ASSERT(IAM_WRITER_ILL(ill)); 9852 /* 9853 * Return error if the ip_mux_id is 9854 * non-zero and command is I_{P}LINK. 9855 * If command is I_{P}UNLINK, return 9856 * error if the arp-devstr is not 9857 * yet punlinked.
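 *
 * The invariant enforced below can be sketched as:
 *
 *	I_{P}LINK:   require ill_ip_muxid == 0 (not yet plinked);
 *	             on success, ill_ip_muxid = li->l_index.
 *	I_{P}UNLINK: require ill_arp_muxid == 0 (ARP punlinked first);
 *	             on success, ill_ip_muxid = 0.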
9858 */ 9859 if ((islink && ill->ill_ip_muxid != 0) || 9860 (!islink && ill->ill_arp_muxid != 0)) { 9861 err = EINVAL; 9862 goto done; 9863 } 9864 ill->ill_lmod_rq = NULL; 9865 ill->ill_lmod_cnt = 0; 9866 if (islink) { 9867 /* 9868 * Store the upper read queue of the module 9869 * immediately below IP, and count the total 9870 * number of lower modules. 9871 */ 9872 if ((dwq = ipwq->q_next) != NULL) { 9873 ill->ill_lmod_rq = RD(dwq); 9874 9875 while (dwq != NULL) { 9876 ill->ill_lmod_cnt++; 9877 dwq = dwq->q_next; 9878 } 9879 } 9880 ill->ill_ip_muxid = li->l_index; 9881 } else { 9882 ill->ill_ip_muxid = 0; 9883 } 9884 9885 /* 9886 * See comments above about resetting/re- 9887 * negotiating driver sub-capabilities. 9888 */ 9889 if (ill->ill_ipif_up_count > 0) { 9890 if (islink) 9891 ill_capability_probe(ill); 9892 else 9893 ill_capability_reset(ill); 9894 } 9895 } 9896 } 9897 done: 9898 iocp->ioc_count = 0; 9899 iocp->ioc_error = err; 9900 if (err == 0) 9901 mp->b_datap->db_type = M_IOCACK; 9902 else 9903 mp->b_datap->db_type = M_IOCNAK; 9904 qreply(q, mp); 9905 9906 /* Conn was refheld in ip_sioctl_copyin_setup */ 9907 if (CONN_Q(q)) 9908 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 9909 if (entered_ipsq) 9910 ipsq_exit(ipsq, B_TRUE, B_TRUE); 9911 } 9912 9913 /* 9914 * Look up the ioctl command in the ioctl tables and return a pointer 9915 * to the ioctl command information. The ioctl command tables are 9916 * static and fully populated at compile time. 9917 */ 9918 ip_ioctl_cmd_t * 9919 ip_sioctl_lookup(int ioc_cmd) 9920 { 9921 int index; 9922 ip_ioctl_cmd_t *ipip; 9923 ip_ioctl_cmd_t *ipip_end; 9924 9925 if (ioc_cmd == IPI_DONTCARE) 9926 return (NULL); 9927 9928 /* 9929 * Do a 2-step search. First search the indexed table 9930 * based on the least significant byte of the ioctl cmd. 9931 * If we don't find a match, then search the misc table 9932 * serially. 9933 */ 9934 index = ioc_cmd & 0xFF; 9935 if (index < ip_ndx_ioctl_count) { 9936 ipip = &ip_ndx_ioctl_table[index]; 9937 if (ipip->ipi_cmd == ioc_cmd) { 9938 /* Found a match in the ndx table */ 9939 return (ipip); 9940 } 9941 } 9942 9943 /* Search the misc table */ 9944 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 9945 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 9946 if (ipip->ipi_cmd == ioc_cmd) 9947 /* Found a match in the misc table */ 9948 return (ipip); 9949 } 9950 9951 return (NULL); 9952 } 9953 9954 /* 9955 * Wrapper function for resuming deferred ioctl processing 9956 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 9957 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 9958 */ 9959 /* ARGSUSED */ 9960 void 9961 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 9962 void *dummy_arg) 9963 { 9964 ip_sioctl_copyin_setup(q, mp); 9965 } 9966 9967 /* 9968 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 9969 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 9970 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 9971 * We establish here the size of the block to be copied in. mi_copyin 9972 * arranges for this to happen, and processing continues in ip_wput with 9973 * an M_IOCDATA message. 9974 */ 9975 void 9976 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 9977 { 9978 int copyin_size; 9979 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9980 ip_ioctl_cmd_t *ipip; 9981 cred_t *cr; 9982 9983 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 9984 if (ipip == NULL) { 9985 /* 9986 * The ioctl is not one we understand or own.
9987 * Pass it along to be processed down stream, 9988 * if this is a module instance of IP, else nak 9989 * the ioctl. 9990 */ 9991 if (q->q_next == NULL) { 9992 goto nak; 9993 } else { 9994 putnext(q, mp); 9995 return; 9996 } 9997 } 9998 9999 /* 10000 * If this is deferred, then we will do all the checks when we 10001 * come back. 10002 */ 10003 if ((iocp->ioc_cmd == SIOCGDSTINFO || 10004 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup()) { 10005 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 10006 return; 10007 } 10008 10009 /* 10010 * Only allow a very small subset of IP ioctls on this stream if 10011 * IP is a module and not a driver. Allowing ioctls to be processed 10012 * in this case may cause assert failures or data corruption. 10013 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 10014 * ioctls allowed on an IP module stream, after which this stream 10015 * normally becomes a multiplexor (at which time the stream head 10016 * will fail all ioctls). 10017 */ 10018 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 10019 if (ipip->ipi_flags & IPI_PASS_DOWN) { 10020 /* 10021 * Pass common Streams ioctls which the IP 10022 * module does not own or consume along to 10023 * be processed down stream. 10024 */ 10025 putnext(q, mp); 10026 return; 10027 } else { 10028 goto nak; 10029 } 10030 } 10031 10032 /* Make sure we have ioctl data to process. */ 10033 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 10034 goto nak; 10035 10036 /* 10037 * Prefer dblk credential over ioctl credential; some synthesized 10038 * ioctls have kcred set because there's no way to crhold() 10039 * a credential in some contexts. (ioc_cr is not crfree() by 10040 * the framework; the caller of ioctl needs to hold the reference 10041 * for the duration of the call). 10042 */ 10043 cr = DB_CREDDEF(mp, iocp->ioc_cr); 10044 10045 /* Make sure normal users don't send down privileged ioctls */ 10046 if ((ipip->ipi_flags & IPI_PRIV) && 10047 (cr != NULL) && secpolicy_net_config(cr, B_TRUE) != 0) { 10048 /* We checked the privilege earlier but log it here */ 10049 miocnak(q, mp, 0, secpolicy_net_config(cr, B_FALSE)); 10050 return; 10051 } 10052 10053 /* 10054 * The ioctl command tables can only encode fixed length 10055 * ioctl data. If the length is variable, the table will 10056 * encode the length as zero. Such special cases are handled 10057 * below in the switch. 10058 */ 10059 if (ipip->ipi_copyin_size != 0) { 10060 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 10061 return; 10062 } 10063 10064 switch (iocp->ioc_cmd) { 10065 case O_SIOCGIFCONF: 10066 case SIOCGIFCONF: 10067 /* 10068 * This IOCTL is hilarious. See comments in 10069 * ip_sioctl_get_ifconf for the story. 
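 *
 * (The short version: a TRANSPARENT ioctl passes only a struct ifconf,
 * whose size depends on the caller's data model, so the copyin size is
 * derived with SIZEOF_STRUCT(); an I_STR ioctl instead carries
 * ioc_count bytes of payload inline, which are copied in as-is.)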
10070 */ 10071 if (iocp->ioc_count == TRANSPARENT) 10072 copyin_size = SIZEOF_STRUCT(ifconf, 10073 iocp->ioc_flag); 10074 else 10075 copyin_size = iocp->ioc_count; 10076 mi_copyin(q, mp, NULL, copyin_size); 10077 return; 10078 10079 case O_SIOCGLIFCONF: 10080 case SIOCGLIFCONF: 10081 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 10082 mi_copyin(q, mp, NULL, copyin_size); 10083 return; 10084 10085 case SIOCGLIFSRCOF: 10086 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 10087 mi_copyin(q, mp, NULL, copyin_size); 10088 return; 10089 case SIOCGIP6ADDRPOLICY: 10090 ip_sioctl_ip6addrpolicy(q, mp); 10091 ip6_asp_table_refrele(); 10092 return; 10093 10094 case SIOCSIP6ADDRPOLICY: 10095 ip_sioctl_ip6addrpolicy(q, mp); 10096 return; 10097 10098 case SIOCGDSTINFO: 10099 ip_sioctl_dstinfo(q, mp); 10100 ip6_asp_table_refrele(); 10101 return; 10102 10103 case I_PLINK: 10104 case I_PUNLINK: 10105 case I_LINK: 10106 case I_UNLINK: 10107 /* 10108 * We treat non-persistent link similarly as the persistent 10109 * link case, in terms of plumbing/unplumbing, as well as 10110 * dynamic re-plumbing events indicator. See comments 10111 * in ip_sioctl_plink() for more. 10112 * 10113 * Request can be enqueued in the 'ipsq' while waiting 10114 * to become exclusive. So bump up the conn ref. 10115 */ 10116 if (CONN_Q(q)) 10117 CONN_INC_REF(Q_TO_CONN(q)); 10118 ip_sioctl_plink(NULL, q, mp, NULL); 10119 return; 10120 10121 case ND_GET: 10122 case ND_SET: 10123 /* 10124 * Use of the nd table requires holding the reader lock. 10125 * Modifying the nd table thru nd_load/nd_unload requires 10126 * the writer lock. 10127 */ 10128 rw_enter(&ip_g_nd_lock, RW_READER); 10129 if (nd_getset(q, ip_g_nd, mp)) { 10130 rw_exit(&ip_g_nd_lock); 10131 10132 if (iocp->ioc_error) 10133 iocp->ioc_count = 0; 10134 mp->b_datap->db_type = M_IOCACK; 10135 qreply(q, mp); 10136 return; 10137 } 10138 rw_exit(&ip_g_nd_lock); 10139 /* 10140 * We don't understand this subioctl of ND_GET / ND_SET. 
10141 * Maybe intended for some driver / module below us 10142 */ 10143 if (q->q_next) { 10144 putnext(q, mp); 10145 } else { 10146 iocp->ioc_error = ENOENT; 10147 mp->b_datap->db_type = M_IOCNAK; 10148 iocp->ioc_count = 0; 10149 qreply(q, mp); 10150 } 10151 return; 10152 10153 case IP_IOCTL: 10154 ip_wput_ioctl(q, mp); 10155 return; 10156 default: 10157 cmn_err(CE_PANIC, "should not happen "); 10158 } 10159 nak: 10160 if (mp->b_cont != NULL) { 10161 freemsg(mp->b_cont); 10162 mp->b_cont = NULL; 10163 } 10164 iocp->ioc_error = EINVAL; 10165 mp->b_datap->db_type = M_IOCNAK; 10166 iocp->ioc_count = 0; 10167 qreply(q, mp); 10168 } 10169 10170 /* ip_wput hands off ARP IOCTL responses to us */ 10171 void 10172 ip_sioctl_iocack(queue_t *q, mblk_t *mp) 10173 { 10174 struct arpreq *ar; 10175 struct xarpreq *xar; 10176 area_t *area; 10177 mblk_t *area_mp; 10178 struct iocblk *iocp; 10179 mblk_t *orig_ioc_mp, *tmp; 10180 struct iocblk *orig_iocp; 10181 ill_t *ill; 10182 conn_t *connp = NULL; 10183 uint_t ioc_id; 10184 mblk_t *pending_mp; 10185 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 10186 int *flagsp; 10187 char *storage = NULL; 10188 sin_t *sin; 10189 ipaddr_t addr; 10190 int err; 10191 10192 ill = q->q_ptr; 10193 ASSERT(ill != NULL); 10194 10195 /* 10196 * We should get back from ARP a packet chain that looks like: 10197 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 10198 */ 10199 if (!(area_mp = mp->b_cont) || 10200 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 10201 !(orig_ioc_mp = area_mp->b_cont) || 10202 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 10203 freemsg(mp); 10204 return; 10205 } 10206 10207 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 10208 10209 tmp = (orig_ioc_mp->b_cont)->b_cont; 10210 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 10211 (orig_iocp->ioc_cmd == SIOCSXARP) || 10212 (orig_iocp->ioc_cmd == SIOCDXARP)) { 10213 x_arp_ioctl = B_TRUE; 10214 xar = (struct xarpreq *)tmp->b_rptr; 10215 sin = (sin_t *)&xar->xarp_pa; 10216 flagsp = &xar->xarp_flags; 10217 storage = xar->xarp_ha.sdl_data; 10218 if (xar->xarp_ha.sdl_nlen != 0) 10219 ifx_arp_ioctl = B_TRUE; 10220 } else { 10221 ar = (struct arpreq *)tmp->b_rptr; 10222 sin = (sin_t *)&ar->arp_pa; 10223 flagsp = &ar->arp_flags; 10224 storage = ar->arp_ha.sa_data; 10225 } 10226 10227 iocp = (struct iocblk *)mp->b_rptr; 10228 10229 /* 10230 * Pick out the originating queue based on the ioc_id. 10231 */ 10232 ioc_id = iocp->ioc_id; 10233 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); 10234 if (pending_mp == NULL) { 10235 ASSERT(connp == NULL); 10236 inet_freemsg(mp); 10237 return; 10238 } 10239 ASSERT(connp != NULL); 10240 q = CONNP_TO_WQ(connp); 10241 10242 /* Uncouple the internally generated IOCTL from the original one */ 10243 area = (area_t *)area_mp->b_rptr; 10244 area_mp->b_cont = NULL; 10245 10246 /* 10247 * Restore the b_next and b_prev used by mi code. This is needed 10248 * to complete the ioctl using mi* functions. We stored them in 10249 * the pending mp prior to sending the request to ARP. 10250 */ 10251 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 10252 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 10253 inet_freemsg(pending_mp); 10254 10255 /* 10256 * We're done if there was an error or if this is not an SIOCG{X}ARP. 10257 * Catch the case where there is an IRE_CACHE but no entry in the 10258 * arp table.
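 *
 * In that case the hardware address is recovered from the IRE's cached
 * nce_res_mp, which holds a prebuilt DL_UNITDATA_REQ. Note the
 * ill_sap_length convention relied on below: a negative value means
 * the SAP follows the DLPI address, so the MAC address starts right at
 * dl_dest_addr_offset; a positive value means the SAP comes first and
 * must be skipped.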
10259 */ 10260 addr = sin->sin_addr.s_addr; 10261 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 10262 ire_t *ire; 10263 dl_unitdata_req_t *dlup; 10264 mblk_t *llmp; 10265 int addr_len; 10266 ill_t *ipsqill = NULL; 10267 10268 if (ifx_arp_ioctl) { 10269 /* 10270 * There's no need to lookup the ill, since 10271 * we've already done that when we started 10272 * processing the ioctl and sent the message 10273 * to ARP on that ill. So use the ill that 10274 * is stored in q->q_ptr. 10275 */ 10276 ipsqill = ill; 10277 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10278 ipsqill->ill_ipif, ALL_ZONES, 10279 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL); 10280 } else { 10281 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10282 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 10283 if (ire != NULL) 10284 ipsqill = ire_to_ill(ire); 10285 } 10286 10287 if ((x_arp_ioctl) && (ipsqill != NULL)) 10288 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10289 10290 if (ire != NULL) { 10291 /* 10292 * Since the ire obtained from cachetable is used for 10293 * mac addr copying below, treat an incomplete ire as 10294 * if we never found it. 10295 */ 10296 if (ire->ire_nce != NULL && 10297 ire->ire_nce->nce_state != ND_REACHABLE) { 10298 ire_refrele(ire); 10299 ire = NULL; 10300 ipsqill = NULL; 10301 goto errack; 10302 } 10303 *flagsp = ATF_INUSE; 10304 llmp = (ire->ire_nce != NULL ? 10305 ire->ire_nce->nce_res_mp : NULL); 10306 if (llmp != NULL && ipsqill != NULL) { 10307 uchar_t *macaddr; 10308 10309 addr_len = ipsqill->ill_phys_addr_length; 10310 if (x_arp_ioctl && ((addr_len + 10311 ipsqill->ill_name_length) > 10312 sizeof (xar->xarp_ha.sdl_data))) { 10313 ire_refrele(ire); 10314 freemsg(mp); 10315 ip_ioctl_finish(q, orig_ioc_mp, 10316 EINVAL, NO_COPYOUT, NULL, NULL); 10317 return; 10318 } 10319 *flagsp |= ATF_COM; 10320 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10321 if (ipsqill->ill_sap_length < 0) 10322 macaddr = llmp->b_rptr + 10323 dlup->dl_dest_addr_offset; 10324 else 10325 macaddr = llmp->b_rptr + 10326 dlup->dl_dest_addr_offset + 10327 ipsqill->ill_sap_length; 10328 /* 10329 * For SIOCGARP, MAC address length 10330 * validation has already been done 10331 * before the ioctl was issued to ARP to 10332 * allow it to progress only on 6 byte 10333 * addressable (ethernet like) media. Thus 10334 * the mac address copying cannot overwrite 10335 * the sa_data area below. 10336 */ 10337 bcopy(macaddr, storage, addr_len); 10338 } 10339 /* Ditch the internal IOCTL. */ 10340 freemsg(mp); 10341 ire_refrele(ire); 10342 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL); 10343 return; 10344 } 10345 } 10346 10347 /* 10348 * Delete the corresponding IRE_CACHE if any. 10349 * Reset the error if there was one (in case there was no entry 10350 * in arp.) 10351 */ 10352 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10353 ipif_t *ipintf = NULL; 10354 10355 if (ifx_arp_ioctl) { 10356 /* 10357 * There's no need to lookup the ill, since 10358 * we've already done that when we started 10359 * processing the ioctl and sent the message 10360 * to ARP on that ill. So use the ill that 10361 * is stored in q->q_ptr. 10362 */ 10363 ipintf = ill->ill_ipif; 10364 } 10365 if (ip_ire_clookup_and_delete(addr, ipintf)) { 10366 /* 10367 * The address in "addr" may be an entry for a 10368 * router. If that's true, then any off-net 10369 * IRE_CACHE entries that go through the router 10370 * with address "addr" must be clobbered. Use 10371 * ire_walk to achieve this goal.
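 *
 * The walker takes a per-IRE callback; ire_delete_cache_gw() (defined
 * with the IRE code) is roughly of this shape (sketch, not a verbatim
 * copy):
 *
 *	void
 *	ire_delete_cache_gw(ire_t *ire, char *addr_arg)
 *	{
 *		ipaddr_t gw = *(ipaddr_t *)addr_arg;
 *
 *		if ((ire->ire_type & IRE_CACHE) &&
 *		    ire->ire_gateway_addr == gw)
 *			ire_delete(ire);
 *	}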
10372 */ 10373 if (ifx_arp_ioctl) 10374 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10375 ire_delete_cache_gw, (char *)&addr, ill); 10376 else 10377 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10378 ALL_ZONES); 10379 iocp->ioc_error = 0; 10380 } 10381 } 10382 errack: 10383 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10384 err = iocp->ioc_error; 10385 freemsg(mp); 10386 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL, NULL); 10387 return; 10388 } 10389 10390 /* 10391 * Completion of an SIOCG{X}ARP. Translate the information from 10392 * the area_t into the struct {x}arpreq. 10393 */ 10394 if (x_arp_ioctl) { 10395 storage += ill_xarp_info(&xar->xarp_ha, ill); 10396 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10397 sizeof (xar->xarp_ha.sdl_data)) { 10398 freemsg(mp); 10399 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, 10400 NO_COPYOUT, NULL, NULL); 10401 return; 10402 } 10403 } 10404 *flagsp = ATF_INUSE; 10405 if (area->area_flags & ACE_F_PERMANENT) 10406 *flagsp |= ATF_PERM; 10407 if (area->area_flags & ACE_F_PUBLISH) 10408 *flagsp |= ATF_PUBL; 10409 if (area->area_flags & ACE_F_AUTHORITY) 10410 *flagsp |= ATF_AUTHORITY; 10411 if (area->area_hw_addr_length != 0) { 10412 *flagsp |= ATF_COM; 10413 /* 10414 * For SIOCGARP, MAC address length validation has 10415 * already been done before the ioctl was issued to ARP 10416 * to allow it to progress only on 6 byte addressable 10417 * (ethernet like) media. Thus the mac address copying 10418 * cannot overwrite the sa_data area below. 10419 */ 10420 bcopy((char *)area + area->area_hw_addr_offset, 10421 storage, area->area_hw_addr_length); 10422 } 10423 10424 /* Ditch the internal IOCTL. */ 10425 freemsg(mp); 10426 /* Complete the original. */ 10427 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL); 10428 } 10429 10430 /* 10431 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 10432 * interface) create the next available logical interface for this 10433 * physical interface. 10434 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10435 * ipif with the specified name. 10436 * 10437 * If the address family is not AF_UNSPEC then set the address as well. 10438 * 10439 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10440 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer. 10441 * 10442 * Executed as a writer on the ill or ill group. 10443 * So no lock is needed to traverse the ipif chain, or examine the 10444 * phyint flags. 10445 */ 10446 /* ARGSUSED */ 10447 int 10448 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10449 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10450 { 10451 mblk_t *mp1; 10452 struct lifreq *lifr; 10453 boolean_t isv6; 10454 boolean_t exists; 10455 char *name; 10456 char *endp; 10457 char *cp; 10458 int namelen; 10459 ipif_t *ipif; 10460 long id; 10461 ipsq_t *ipsq; 10462 ill_t *ill; 10463 sin_t *sin; 10464 int err = 0; 10465 boolean_t found_sep = B_FALSE; 10466 conn_t *connp; 10467 zoneid_t zoneid; 10468 int orig_ifindex = 0; 10469 10470 ip1dbg(("ip_sioctl_addif\n")); 10471 /* Existence of mp1 has been checked in ip_wput_nondata */ 10472 mp1 = mp->b_cont->b_cont; 10473 /* 10474 * Null terminate the string to protect against buffer 10475 * overrun. String was generated by user code and may not 10476 * be trusted.
10477 */ 10478 lifr = (struct lifreq *)mp1->b_rptr; 10479 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10480 name = lifr->lifr_name; 10481 ASSERT(CONN_Q(q)); 10482 connp = Q_TO_CONN(q); 10483 isv6 = connp->conn_af_isv6; 10484 zoneid = connp->conn_zoneid; 10485 namelen = mi_strlen(name); 10486 if (namelen == 0) 10487 return (EINVAL); 10488 10489 exists = B_FALSE; 10490 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10491 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10492 /* 10493 * Allow creating lo0 using SIOCLIFADDIF. 10494 * can't be any other writer thread. So can pass null below 10495 * for the last 4 args to ipif_lookup_name. 10496 */ 10497 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, 10498 B_TRUE, &exists, isv6, zoneid, NULL, NULL, NULL, NULL); 10499 /* Prevent any further action */ 10500 if (ipif == NULL) { 10501 return (ENOBUFS); 10502 } else if (!exists) { 10503 /* We created the ipif now and as writer */ 10504 ipif_refrele(ipif); 10505 return (0); 10506 } else { 10507 ill = ipif->ipif_ill; 10508 ill_refhold(ill); 10509 ipif_refrele(ipif); 10510 } 10511 } else { 10512 /* Look for a colon in the name. */ 10513 endp = &name[namelen]; 10514 for (cp = endp; --cp > name; ) { 10515 if (*cp == IPIF_SEPARATOR_CHAR) { 10516 found_sep = B_TRUE; 10517 /* 10518 * Reject any non-decimal aliases for plumbing 10519 * of logical interfaces. Aliases with leading 10520 * zeroes are also rejected as they introduce 10521 * ambiguity in the naming of the interfaces. 10522 * Comparing with "0" takes care of all such 10523 * cases. 10524 */ 10525 if ((strncmp("0", cp+1, 1)) == 0) 10526 return (EINVAL); 10527 10528 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10529 id <= 0 || *endp != '\0') { 10530 return (EINVAL); 10531 } 10532 *cp = '\0'; 10533 break; 10534 } 10535 } 10536 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10537 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL); 10538 if (found_sep) 10539 *cp = IPIF_SEPARATOR_CHAR; 10540 if (ill == NULL) 10541 return (err); 10542 } 10543 10544 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10545 B_TRUE); 10546 10547 /* 10548 * Release the refhold due to the lookup, now that we are excl 10549 * or we are just returning 10550 */ 10551 ill_refrele(ill); 10552 10553 if (ipsq == NULL) 10554 return (EINPROGRESS); 10555 10556 /* 10557 * If the interface is failed, inactive or offlined, look for a working 10558 * interface in the ill group and create the ipif there. If we can't 10559 * find a good interface, create the ipif anyway so that in.mpathd can 10560 * move it to the first repaired interface. 10561 */ 10562 if ((ill->ill_phyint->phyint_flags & 10563 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10564 ill->ill_phyint->phyint_groupname_len != 0) { 10565 phyint_t *phyi; 10566 char *groupname = ill->ill_phyint->phyint_groupname; 10567 10568 /* 10569 * We're looking for a working interface, but it doesn't matter 10570 * if it's up or down; so instead of following the group lists, 10571 * we look at each physical interface and compare the groupname. 10572 * We're only interested in interfaces with IPv4 (resp. IPv6) 10573 * plumbed when we're adding an IPv4 (resp. IPv6) ipif. 10574 * Otherwise we create the ipif on the failed interface. 
10575 */ 10576 rw_enter(&ill_g_lock, RW_READER); 10577 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 10578 for (; phyi != NULL; 10579 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 10580 phyi, AVL_AFTER)) { 10581 if (phyi->phyint_groupname_len == 0) 10582 continue; 10583 ASSERT(phyi->phyint_groupname != NULL); 10584 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10585 !(phyi->phyint_flags & 10586 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10587 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10588 (phyi->phyint_illv4 != NULL))) { 10589 break; 10590 } 10591 } 10592 rw_exit(&ill_g_lock); 10593 10594 if (phyi != NULL) { 10595 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10596 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10597 phyi->phyint_illv4); 10598 } 10599 } 10600 10601 /* 10602 * We are now exclusive on the ipsq, so an ill move will be serialized 10603 * before or after us. 10604 */ 10605 ASSERT(IAM_WRITER_ILL(ill)); 10606 ASSERT(ill->ill_move_in_progress == B_FALSE); 10607 10608 if (found_sep && orig_ifindex == 0) { 10609 /* Now see if there is an IPIF with this unit number. */ 10610 for (ipif = ill->ill_ipif; ipif != NULL; 10611 ipif = ipif->ipif_next) { 10612 if (ipif->ipif_id == id) { 10613 err = EEXIST; 10614 goto done; 10615 } 10616 } 10617 } 10618 10619 /* 10620 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10621 * of lo0. We never come here when we plumb lo0:0. It 10622 * happens in ipif_lookup_on_name. 10623 * The specified unit number is ignored when we create the ipif on a 10624 * different interface. However, we save it in ipif_orig_ipifid below so 10625 * that the ipif fails back to the right position. 10626 */ 10627 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 10628 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10629 err = ENOBUFS; 10630 goto done; 10631 } 10632 10633 /* Return created name with ioctl */ 10634 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10635 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10636 ip1dbg(("created %s\n", lifr->lifr_name)); 10637 10638 /* Set address */ 10639 sin = (sin_t *)&lifr->lifr_addr; 10640 if (sin->sin_family != AF_UNSPEC) { 10641 err = ip_sioctl_addr(ipif, sin, q, mp, 10642 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10643 } 10644 10645 /* Set ifindex and unit number for failback */ 10646 if (err == 0 && orig_ifindex != 0) { 10647 ipif->ipif_orig_ifindex = orig_ifindex; 10648 if (found_sep) { 10649 ipif->ipif_orig_ipifid = id; 10650 } 10651 } 10652 10653 done: 10654 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10655 return (err); 10656 } 10657 10658 /* 10659 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10660 * interface) delete it based on the IP address (on this physical interface). 10661 * Otherwise delete it based on the ipif_id. 10662 * Also, special handling to allow a removeif of lo0. 10663 */ 10664 /* ARGSUSED */ 10665 int 10666 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10667 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10668 { 10669 conn_t *connp; 10670 ill_t *ill = ipif->ipif_ill; 10671 boolean_t success; 10672 10673 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10674 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10675 ASSERT(IAM_WRITER_IPIF(ipif)); 10676 10677 connp = Q_TO_CONN(q); 10678 /* 10679 * Special case for unplumbing lo0 (the loopback physical interface). 10680 * If unplumbing lo0, the incoming address structure has been 10681 * initialized to all zeros. 
When unplumbing lo0, all its logical 10682 * interfaces must be removed too. 10683 * 10684 * Note that this interface may be called to remove a specific 10685 * loopback logical interface (e.g., lo0:1). But in that case 10686 * ipif->ipif_id != 0 so that the code path for that case is the 10687 * same as any other interface (meaning it skips the code directly 10688 * below). 10689 */ 10690 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10691 if (sin->sin_family == AF_UNSPEC && 10692 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10693 /* 10694 * Mark it condemned. No new ref. will be made to ill. 10695 */ 10696 mutex_enter(&ill->ill_lock); 10697 ill->ill_state_flags |= ILL_CONDEMNED; 10698 for (ipif = ill->ill_ipif; ipif != NULL; 10699 ipif = ipif->ipif_next) { 10700 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10701 } 10702 mutex_exit(&ill->ill_lock); 10703 10704 ipif = ill->ill_ipif; 10705 /* unplumb the loopback interface */ 10706 ill_delete(ill); 10707 mutex_enter(&connp->conn_lock); 10708 mutex_enter(&ill->ill_lock); 10709 ASSERT(ill->ill_group == NULL); 10710 10711 /* Are any references to this ill active */ 10712 if (ill_is_quiescent(ill)) { 10713 mutex_exit(&ill->ill_lock); 10714 mutex_exit(&connp->conn_lock); 10715 ill_delete_tail(ill); 10716 mi_free(ill); 10717 return (0); 10718 } 10719 success = ipsq_pending_mp_add(connp, ipif, 10720 CONNP_TO_WQ(connp), mp, ILL_FREE); 10721 mutex_exit(&connp->conn_lock); 10722 mutex_exit(&ill->ill_lock); 10723 if (success) 10724 return (EINPROGRESS); 10725 else 10726 return (EINTR); 10727 } 10728 } 10729 10730 /* 10731 * We are exclusive on the ipsq, so an ill move will be serialized 10732 * before or after us. 10733 */ 10734 ASSERT(ill->ill_move_in_progress == B_FALSE); 10735 10736 if (ipif->ipif_id == 0) { 10737 /* Find based on address */ 10738 if (ipif->ipif_isv6) { 10739 sin6_t *sin6; 10740 10741 if (sin->sin_family != AF_INET6) 10742 return (EAFNOSUPPORT); 10743 10744 sin6 = (sin6_t *)sin; 10745 /* We are a writer, so we should be able to lookup */ 10746 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10747 ill, ALL_ZONES, NULL, NULL, NULL, NULL); 10748 if (ipif == NULL) { 10749 /* 10750 * Maybe the address is on another interface in 10751 * the same IPMP group? We check this below. 10752 */ 10753 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10754 NULL, ALL_ZONES, NULL, NULL, NULL, NULL); 10755 } 10756 } else { 10757 ipaddr_t addr; 10758 10759 if (sin->sin_family != AF_INET) 10760 return (EAFNOSUPPORT); 10761 10762 addr = sin->sin_addr.s_addr; 10763 /* We are a writer, so we should be able to lookup */ 10764 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, 10765 NULL, NULL, NULL); 10766 if (ipif == NULL) { 10767 /* 10768 * Maybe the address is on another interface in 10769 * the same IPMP group? We check this below. 10770 */ 10771 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, 10772 NULL, NULL, NULL, NULL); 10773 } 10774 } 10775 if (ipif == NULL) { 10776 return (EADDRNOTAVAIL); 10777 } 10778 /* 10779 * When the address to be removed is hosted on a different 10780 * interface, we check if the interface is in the same IPMP 10781 * group as the specified one; if so we proceed with the 10782 * removal. 10783 * ill->ill_group is NULL when the ill is down, so we have to 10784 * compare the group names instead.
10785 */ 10786 if (ipif->ipif_ill != ill && 10787 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || 10788 ill->ill_phyint->phyint_groupname_len == 0 || 10789 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, 10790 ill->ill_phyint->phyint_groupname) != 0)) { 10791 ipif_refrele(ipif); 10792 return (EADDRNOTAVAIL); 10793 } 10794 10795 /* This is a writer */ 10796 ipif_refrele(ipif); 10797 } 10798 10799 /* 10800 * Cannot delete instance zero since it is tied to the ill. 10801 */ 10802 if (ipif->ipif_id == 0) 10803 return (EBUSY); 10804 10805 mutex_enter(&ill->ill_lock); 10806 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10807 mutex_exit(&ill->ill_lock); 10808 10809 ipif_free(ipif); 10810 10811 mutex_enter(&connp->conn_lock); 10812 mutex_enter(&ill->ill_lock); 10813 10814 /* Are any references to this ipif active */ 10815 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { 10816 mutex_exit(&ill->ill_lock); 10817 mutex_exit(&connp->conn_lock); 10818 ipif_non_duplicate(ipif); 10819 ipif_down_tail(ipif); 10820 ipif_free_tail(ipif); 10821 return (0); 10822 } 10823 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10824 IPIF_FREE); 10825 mutex_exit(&ill->ill_lock); 10826 mutex_exit(&connp->conn_lock); 10827 if (success) 10828 return (EINPROGRESS); 10829 else 10830 return (EINTR); 10831 } 10832 10833 /* 10834 * Restart the removeif ioctl. The refcnt has gone down to 0. 10835 * The ipif is already condemned. So can't find it thru lookups. 10836 */ 10837 /* ARGSUSED */ 10838 int 10839 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10840 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10841 { 10842 ill_t *ill; 10843 10844 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10845 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10846 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10847 ill = ipif->ipif_ill; 10848 ASSERT(IAM_WRITER_ILL(ill)); 10849 ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) && 10850 (ill->ill_state_flags & ILL_CONDEMNED)); 10851 ill_delete_tail(ill); 10852 mi_free(ill); 10853 return (0); 10854 } 10855 10856 ill = ipif->ipif_ill; 10857 ASSERT(IAM_WRITER_IPIF(ipif)); 10858 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10859 10860 ipif_non_duplicate(ipif); 10861 ipif_down_tail(ipif); 10862 ipif_free_tail(ipif); 10863 10864 ILL_UNMARK_CHANGING(ill); 10865 return (0); 10866 } 10867 10868 /* 10869 * Set the local interface address. 10870 * Allow an address of all zero when the interface is down. 10871 */ 10872 /* ARGSUSED */ 10873 int 10874 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10875 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10876 { 10877 int err = 0; 10878 in6_addr_t v6addr; 10879 boolean_t need_up = B_FALSE; 10880 10881 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 10882 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10883 10884 ASSERT(IAM_WRITER_IPIF(ipif)); 10885 10886 if (ipif->ipif_isv6) { 10887 sin6_t *sin6; 10888 ill_t *ill; 10889 phyint_t *phyi; 10890 10891 if (sin->sin_family != AF_INET6) 10892 return (EAFNOSUPPORT); 10893 10894 sin6 = (sin6_t *)sin; 10895 v6addr = sin6->sin6_addr; 10896 ill = ipif->ipif_ill; 10897 phyi = ill->ill_phyint; 10898 10899 /* 10900 * Enforce that true multicast interfaces have a link-local 10901 * address for logical unit 0.
10902 */ 10903 if (ipif->ipif_id == 0 && 10904 (ill->ill_flags & ILLF_MULTICAST) && 10905 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 10906 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 10907 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 10908 return (EADDRNOTAVAIL); 10909 } 10910 10911 /* 10912 * up interfaces shouldn't have the unspecified address 10913 * unless they also have the IPIF_NOLOCAL flags set and 10914 * have a subnet assigned. 10915 */ 10916 if ((ipif->ipif_flags & IPIF_UP) && 10917 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 10918 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 10919 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 10920 return (EADDRNOTAVAIL); 10921 } 10922 10923 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10924 return (EADDRNOTAVAIL); 10925 } else { 10926 ipaddr_t addr; 10927 10928 if (sin->sin_family != AF_INET) 10929 return (EAFNOSUPPORT); 10930 10931 addr = sin->sin_addr.s_addr; 10932 10933 /* Allow 0 as the local address. */ 10934 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10935 return (EADDRNOTAVAIL); 10936 10937 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10938 } 10939 10940 10941 /* 10942 * Even if there is no change we redo things just to rerun 10943 * ipif_set_default. 10944 */ 10945 if (ipif->ipif_flags & IPIF_UP) { 10946 /* 10947 * Setting a new local address, make sure 10948 * we have net and subnet bcast ire's for 10949 * the old address if we need them. 10950 */ 10951 if (!ipif->ipif_isv6) 10952 ipif_check_bcast_ires(ipif); 10953 /* 10954 * If the interface is already marked up, 10955 * we call ipif_down which will take care 10956 * of ditching any IREs that have been set 10957 * up based on the old interface address. 10958 */ 10959 err = ipif_logical_down(ipif, q, mp); 10960 if (err == EINPROGRESS) 10961 return (err); 10962 ipif_down_tail(ipif); 10963 need_up = 1; 10964 } 10965 10966 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 10967 return (err); 10968 } 10969 10970 int 10971 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10972 boolean_t need_up) 10973 { 10974 in6_addr_t v6addr; 10975 ipaddr_t addr; 10976 sin6_t *sin6; 10977 int sinlen; 10978 int err = 0; 10979 ill_t *ill = ipif->ipif_ill; 10980 boolean_t need_dl_down; 10981 boolean_t need_arp_down; 10982 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10983 10984 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 10985 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10986 ASSERT(IAM_WRITER_IPIF(ipif)); 10987 10988 /* Must cancel any pending timer before taking the ill_lock */ 10989 if (ipif->ipif_recovery_id != 0) 10990 (void) untimeout(ipif->ipif_recovery_id); 10991 ipif->ipif_recovery_id = 0; 10992 10993 if (ipif->ipif_isv6) { 10994 sin6 = (sin6_t *)sin; 10995 v6addr = sin6->sin6_addr; 10996 sinlen = sizeof (struct sockaddr_in6); 10997 } else { 10998 addr = sin->sin_addr.s_addr; 10999 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11000 sinlen = sizeof (struct sockaddr_in); 11001 } 11002 mutex_enter(&ill->ill_lock); 11003 ipif->ipif_v6lcl_addr = v6addr; 11004 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 11005 ipif->ipif_v6src_addr = ipv6_all_zeros; 11006 } else { 11007 ipif->ipif_v6src_addr = v6addr; 11008 } 11009 ipif->ipif_addr_ready = 0; 11010 11011 /* 11012 * If the interface was previously marked as a duplicate, then since 11013 * we've now got a "new" address, it should no longer be considered a 11014 * duplicate -- even if the "new" address is the same as the old one. 
11015 * Note that if all ipifs are down, we may have a pending ARP down 11016 * event to handle. This is because we want to recover from duplicates 11017 * and thus delay tearing down ARP until the duplicates have been 11018 * removed or disabled. 11019 */ 11020 need_dl_down = need_arp_down = B_FALSE; 11021 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11022 need_arp_down = !need_up; 11023 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11024 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11025 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11026 need_dl_down = B_TRUE; 11027 } 11028 } 11029 11030 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 11031 !ill->ill_is_6to4tun) { 11032 queue_t *wqp = ill->ill_wq; 11033 11034 /* 11035 * The local address of this interface is a 6to4 address, 11036 * check if this interface is in fact a 6to4 tunnel or just 11037 * an interface configured with a 6to4 address. We are only 11038 * interested in the former. 11039 */ 11040 if (wqp != NULL) { 11041 while ((wqp->q_next != NULL) && 11042 (wqp->q_next->q_qinfo != NULL) && 11043 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 11044 11045 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 11046 == TUN6TO4_MODID) { 11047 /* set for use in IP */ 11048 ill->ill_is_6to4tun = 1; 11049 break; 11050 } 11051 wqp = wqp->q_next; 11052 } 11053 } 11054 } 11055 11056 ipif_set_default(ipif); 11057 11058 /* 11059 * When publishing an interface address change event, we only notify 11060 * the event listeners of the new address. It is assumed that if they 11061 * actively care about the addresses assigned that they will have 11062 * already discovered the previous address assigned (if there was one.) 11063 * 11064 * Don't attach nic event message for SIOCLIFADDIF ioctl. 11065 */ 11066 if (iocp->ioc_cmd != SIOCLIFADDIF) { 11067 hook_nic_event_t *info; 11068 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) { 11069 ip2dbg(("ip_sioctl_addr_tail: unexpected nic event %d " 11070 "attached for %s\n", info->hne_event, 11071 ill->ill_name)); 11072 if (info->hne_data != NULL) 11073 kmem_free(info->hne_data, info->hne_datalen); 11074 kmem_free(info, sizeof (hook_nic_event_t)); 11075 } 11076 11077 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 11078 if (info != NULL) { 11079 info->hne_nic = 11080 ipif->ipif_ill->ill_phyint->phyint_ifindex; 11081 info->hne_lif = MAP_IPIF_ID(ipif->ipif_id); 11082 info->hne_event = NE_ADDRESS_CHANGE; 11083 info->hne_family = ipif->ipif_isv6 ? ipv6 : ipv4; 11084 info->hne_data = kmem_alloc(sinlen, KM_NOSLEEP); 11085 if (info->hne_data != NULL) { 11086 info->hne_datalen = sinlen; 11087 bcopy(sin, info->hne_data, sinlen); 11088 } else { 11089 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11090 "address information for ADDRESS_CHANGE nic" 11091 " event of %s (ENOMEM)\n", 11092 ipif->ipif_ill->ill_name)); 11093 kmem_free(info, sizeof (hook_nic_event_t)); 11094 } 11095 } else 11096 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11097 "ADDRESS_CHANGE nic event information for %s " 11098 "(ENOMEM)\n", ipif->ipif_ill->ill_name)); 11099 11100 ipif->ipif_ill->ill_nic_event_info = info; 11101 } 11102 11103 mutex_exit(&ipif->ipif_ill->ill_lock); 11104 11105 if (need_up) { 11106 /* 11107 * Now bring the interface back up. If this 11108 * is the only IPIF for the ILL, ipif_up 11109 * will have to re-bind to the device, so 11110 * we may get back EINPROGRESS, in which 11111 * case, this IOCTL will get completed in 11112 * ip_rput_dlpi when we see the DL_BIND_ACK. 
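 *
 * (Sketch of the deferred completion: the ioctl mblk is parked on the
 * ipsq as the pending operation, so when the DL_BIND_ACK arrives the
 * writer in ip_rput_dlpi calls ip_ioctl_finish() on it; EINPROGRESS
 * from ipif_up() therefore just means "reply later".)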
11113 */ 11114 err = ipif_up(ipif, q, mp); 11115 } else { 11116 /* 11117 * Update the IPIF list in SCTP, ipif_up_done() will do it 11118 * if need_up is true. 11119 */ 11120 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11121 } 11122 11123 if (need_dl_down) 11124 ill_dl_down(ill); 11125 if (need_arp_down) 11126 ipif_arp_down(ipif); 11127 11128 return (err); 11129 } 11130 11131 11132 /* 11133 * Restart entry point to restart the address set operation after the 11134 * refcounts have dropped to zero. 11135 */ 11136 /* ARGSUSED */ 11137 int 11138 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11139 ip_ioctl_cmd_t *ipip, void *ifreq) 11140 { 11141 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11142 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11143 ASSERT(IAM_WRITER_IPIF(ipif)); 11144 ipif_down_tail(ipif); 11145 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11146 } 11147 11148 /* ARGSUSED */ 11149 int 11150 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11151 ip_ioctl_cmd_t *ipip, void *if_req) 11152 { 11153 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11154 struct lifreq *lifr = (struct lifreq *)if_req; 11155 11156 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11157 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11158 /* 11159 * The net mask and address can't change since we have a 11160 * reference to the ipif. So no lock is necessary. 11161 */ 11162 if (ipif->ipif_isv6) { 11163 *sin6 = sin6_null; 11164 sin6->sin6_family = AF_INET6; 11165 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11166 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11167 lifr->lifr_addrlen = 11168 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11169 } else { 11170 *sin = sin_null; 11171 sin->sin_family = AF_INET; 11172 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11173 if (ipip->ipi_cmd_type == LIF_CMD) { 11174 lifr->lifr_addrlen = 11175 ip_mask_to_plen(ipif->ipif_net_mask); 11176 } 11177 } 11178 return (0); 11179 } 11180 11181 /* 11182 * Set the destination address for a pt-pt interface. 11183 */ 11184 /* ARGSUSED */ 11185 int 11186 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11187 ip_ioctl_cmd_t *ipip, void *if_req) 11188 { 11189 int err = 0; 11190 in6_addr_t v6addr; 11191 boolean_t need_up = B_FALSE; 11192 11193 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11194 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11195 ASSERT(IAM_WRITER_IPIF(ipif)); 11196 11197 if (ipif->ipif_isv6) { 11198 sin6_t *sin6; 11199 11200 if (sin->sin_family != AF_INET6) 11201 return (EAFNOSUPPORT); 11202 11203 sin6 = (sin6_t *)sin; 11204 v6addr = sin6->sin6_addr; 11205 11206 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11207 return (EADDRNOTAVAIL); 11208 } else { 11209 ipaddr_t addr; 11210 11211 if (sin->sin_family != AF_INET) 11212 return (EAFNOSUPPORT); 11213 11214 addr = sin->sin_addr.s_addr; 11215 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11216 return (EADDRNOTAVAIL); 11217 11218 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11219 } 11220 11221 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11222 return (0); /* No change */ 11223 11224 if (ipif->ipif_flags & IPIF_UP) { 11225 /* 11226 * If the interface is already marked up, 11227 * we call ipif_down which will take care 11228 * of ditching any IREs that have been set 11229 * up based on the old pp dst address. 
11230 */ 11231 err = ipif_logical_down(ipif, q, mp); 11232 if (err == EINPROGRESS) 11233 return (err); 11234 ipif_down_tail(ipif); 11235 need_up = B_TRUE; 11236 } 11237 /* 11238 * could return EINPROGRESS. If so ioctl will complete in 11239 * ip_rput_dlpi_writer 11240 */ 11241 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 11242 return (err); 11243 } 11244 11245 static int 11246 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11247 boolean_t need_up) 11248 { 11249 in6_addr_t v6addr; 11250 ill_t *ill = ipif->ipif_ill; 11251 int err = 0; 11252 boolean_t need_dl_down; 11253 boolean_t need_arp_down; 11254 11255 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 11256 ipif->ipif_id, (void *)ipif)); 11257 11258 /* Must cancel any pending timer before taking the ill_lock */ 11259 if (ipif->ipif_recovery_id != 0) 11260 (void) untimeout(ipif->ipif_recovery_id); 11261 ipif->ipif_recovery_id = 0; 11262 11263 if (ipif->ipif_isv6) { 11264 sin6_t *sin6; 11265 11266 sin6 = (sin6_t *)sin; 11267 v6addr = sin6->sin6_addr; 11268 } else { 11269 ipaddr_t addr; 11270 11271 addr = sin->sin_addr.s_addr; 11272 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11273 } 11274 mutex_enter(&ill->ill_lock); 11275 /* Set point to point destination address. */ 11276 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11277 /* 11278 * Allow this as a means of creating logical 11279 * pt-pt interfaces on top of e.g. an Ethernet. 11280 * XXX Undocumented HACK for testing. 11281 * pt-pt interfaces are created with NUD disabled. 11282 */ 11283 ipif->ipif_flags |= IPIF_POINTOPOINT; 11284 ipif->ipif_flags &= ~IPIF_BROADCAST; 11285 if (ipif->ipif_isv6) 11286 ill->ill_flags |= ILLF_NONUD; 11287 } 11288 11289 /* 11290 * If the interface was previously marked as a duplicate, then since 11291 * we've now got a "new" address, it should no longer be considered a 11292 * duplicate -- even if the "new" address is the same as the old one. 11293 * Note that if all ipifs are down, we may have a pending ARP down 11294 * event to handle. 11295 */ 11296 need_dl_down = need_arp_down = B_FALSE; 11297 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11298 need_arp_down = !need_up; 11299 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11300 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11301 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11302 need_dl_down = B_TRUE; 11303 } 11304 } 11305 11306 /* Set the new address. */ 11307 ipif->ipif_v6pp_dst_addr = v6addr; 11308 /* Make sure subnet tracks pp_dst */ 11309 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 11310 mutex_exit(&ill->ill_lock); 11311 11312 if (need_up) { 11313 /* 11314 * Now bring the interface back up. If this 11315 * is the only IPIF for the ILL, ipif_up 11316 * will have to re-bind to the device, so 11317 * we may get back EINPROGRESS, in which 11318 * case, this IOCTL will get completed in 11319 * ip_rput_dlpi when we see the DL_BIND_ACK. 11320 */ 11321 err = ipif_up(ipif, q, mp); 11322 } 11323 11324 if (need_dl_down) 11325 ill_dl_down(ill); 11326 11327 if (need_arp_down) 11328 ipif_arp_down(ipif); 11329 return (err); 11330 } 11331 11332 /* 11333 * Restart entry point to restart the dstaddress set operation after the 11334 * refcounts have dropped to zero. 
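 *
 * (need_up is hardwired to B_TRUE in the tail call below because a
 * restart only happens when the original ioctl found the ipif up and
 * had to take it down first; the restart must bring it back up.)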
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ipif_down_tail(ipif);
	return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
}

/* ARGSUSED */
int
ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	sin6_t	*sin6 = (struct sockaddr_in6 *)sin;

	ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Get the point-to-point destination address. The addresses can't
	 * change since we hold a reference to the ipif.
	 */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
		return (EADDRNOTAVAIL);

	if (ipif->ipif_isv6) {
		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
		*sin6 = sin6_null;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
	} else {
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
	}
	return (0);
}
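/*
 * For reference, userland reaches the handlers above through the socket
 * ioctl path; an illustrative (not compiled) sketch, with error handling
 * omitted and the interface name hypothetical:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	(void) strlcpy(lifr.lifr_name, "ip.tun0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFDSTADDR, (caddr_t)&lifr) >= 0) {
 *		... lifr.lifr_dstaddr now holds the pt-pt destination
 *		    that ip_sioctl_get_dstaddr() filled in ...
 *	}
 */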
/*
 * TODO (IPMP): make this function return the computed active/inactive
 * state so that the caller can set the flag once, atomically, instead
 * of doing multiple mutex_enter/mutex_exit pairs.
 */
/*
 * This function either sets or clears the IFF_INACTIVE flag.
 *
 * As long as there are some addresses or multicast memberships on the
 * IPv4 or IPv6 interface of the "phyi" that do not belong here, we
 * will consider it to be ACTIVE (clear IFF_INACTIVE); i.e., the
 * interface will be used for outbound packets.
 *
 * The caller needs to verify the validity of setting IFF_INACTIVE.
 */
static void
phyint_inactive(phyint_t *phyi)
{
	ill_t	*ill_v4;
	ill_t	*ill_v6;
	ipif_t	*ipif;
	ilm_t	*ilm;

	ill_v4 = phyi->phyint_illv4;
	ill_v6 = phyi->phyint_illv6;

	/*
	 * No need for a lock while traversing the list, since we are
	 * a writer.
	 */
	if (ill_v4 != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_v4));
		for (ipif = ill_v4->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
				mutex_enter(&phyi->phyint_lock);
				phyi->phyint_flags &= ~PHYI_INACTIVE;
				mutex_exit(&phyi->phyint_lock);
				return;
			}
		}
		for (ilm = ill_v4->ill_ilm; ilm != NULL;
		    ilm = ilm->ilm_next) {
			if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
				mutex_enter(&phyi->phyint_lock);
				phyi->phyint_flags &= ~PHYI_INACTIVE;
				mutex_exit(&phyi->phyint_lock);
				return;
			}
		}
	}
	if (ill_v6 != NULL) {
		for (ipif = ill_v6->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) {
				mutex_enter(&phyi->phyint_lock);
				phyi->phyint_flags &= ~PHYI_INACTIVE;
				mutex_exit(&phyi->phyint_lock);
				return;
			}
		}
		for (ilm = ill_v6->ill_ilm; ilm != NULL;
		    ilm = ilm->ilm_next) {
			if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) {
				mutex_enter(&phyi->phyint_lock);
				phyi->phyint_flags &= ~PHYI_INACTIVE;
				mutex_exit(&phyi->phyint_lock);
				return;
			}
		}
	}
	mutex_enter(&phyi->phyint_lock);
	phyi->phyint_flags |= PHYI_INACTIVE;
	mutex_exit(&phyi->phyint_lock);
}
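/*
 * A possible shape for the refactor suggested in the TODO above
 * phyint_inactive(): compute the state first, then take the lock once
 * to apply it. This is an illustrative, non-compiled sketch only (the
 * helper name phyint_is_inactive is hypothetical); the live code above
 * still uses the early-return form.
 */
#if 0
static boolean_t
phyint_is_inactive(phyint_t *phyi)
{
	ill_t	*ills[2];
	ipif_t	*ipif;
	ilm_t	*ilm;
	int	i;

	ills[0] = phyi->phyint_illv4;
	ills[1] = phyi->phyint_illv6;

	for (i = 0; i < 2; i++) {
		if (ills[i] == NULL)
			continue;
		/* Any address moved here from another ill => ACTIVE. */
		for (ipif = ills[i]->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex)
				return (B_FALSE);
		}
		/* Any membership moved here from another ill => ACTIVE. */
		for (ilm = ills[i]->ill_ilm; ilm != NULL;
		    ilm = ilm->ilm_next) {
			if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex)
				return (B_FALSE);
		}
	}
	return (B_TRUE);
}
#endif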
/*
 * This function is called only when the phyint flags change. Currently
 * called from ip_sioctl_flags. We re-do the broadcast nomination so
 * that we can select a good ill.
 */
static void
ip_redo_nomination(phyint_t *phyi)
{
	ill_t *ill_v4;

	ill_v4 = phyi->phyint_illv4;

	if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
		ASSERT(IAM_WRITER_ILL(ill_v4));
		if (ill_v4->ill_group->illgrp_ill_count > 1)
			ill_nominate_bcast_rcv(ill_v4->ill_group);
	}
}

/*
 * Heuristic to check if an ill is INACTIVE.
 * Checks whether the ill has an ipif with a usable IP address.
 *
 * Return values:
 *	B_TRUE - ill is INACTIVE; has no usable ipif
 *	B_FALSE - ill is not INACTIVE; ill has at least one usable ipif
 */
static boolean_t
ill_is_inactive(ill_t *ill)
{
	ipif_t *ipif;

	/* Check whether it is in an IPMP group */
	if (ill->ill_phyint->phyint_groupname == NULL)
		return (B_FALSE);

	if (ill->ill_ipif_up_count == 0)
		return (B_TRUE);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		uint64_t flags = ipif->ipif_flags;

		/*
		 * This ipif is usable if it is IPIF_UP and not a
		 * dedicated test address. A dedicated test address
		 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED
		 * (note in particular that V6 test addresses are
		 * link-local data addresses and thus are marked
		 * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
		 */
		if ((flags & IPIF_UP) &&
		    ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) !=
		    (IPIF_DEPRECATED|IPIF_NOFAILOVER)))
			return (B_FALSE);
	}
	return (B_TRUE);
}

/*
 * Set interface flags.
 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT,
 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST,
 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE.
 *
 * NOTE : We really don't enforce that ipif_id zero should be used
 *	  for setting any flags other than IFF_LOGINT_FLAGS. This
 *	  is because applications generally do a SIOCGLIFFLAGS, OR
 *	  in the new flags (those that affect the logical interface)
 *	  and then do a SIOCSLIFFLAGS. Thus, "flags" below could
 *	  contain bits other than IFF_LOGINT_FLAGS. One could check
 *	  whether "turn_on" - the flags that will be turned on - is
 *	  correct with respect to ipif_id 0. For backward
 *	  compatibility reasons, it is not done.
 */
/* ARGSUSED */
int
ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	uint64_t turn_on;
	uint64_t turn_off;
	int	err;
	boolean_t need_up = B_FALSE;
	phyint_t *phyi;
	ill_t *ill;
	uint64_t intf_flags;
	boolean_t phyint_flags_modified = B_FALSE;
	uint64_t flags;
	struct ifreq *ifr;
	struct lifreq *lifr;
	boolean_t set_linklocal = B_FALSE;
	boolean_t zero_source = B_FALSE;

	ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;

	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
	} else {
		lifr = (struct lifreq *)if_req;
		flags = lifr->lifr_flags;
	}

	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;

	/*
	 * Have the flags been set correctly up to now?
	 */
	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
	/*
	 * Compare the new flags to the old, and partition
	 * into those coming on and those going off.
	 * For the 16 bit command keep the bits above bit 16 unchanged.
	 */
	if (ipip->ipi_cmd == SIOCSIFFLAGS)
		flags |= intf_flags & ~0xFFFF;

	/*
	 * First check which bits will change and then which will
	 * go on and off.  For example, if intf_flags currently has
	 * IPIF_UP|IPIF_DEPRECATED set and "flags" requests only IPIF_UP,
	 * the XOR leaves IPIF_DEPRECATED as the changing bit and the
	 * steps below file it under turn_off.
	 */
	turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE;
	if (!turn_on)
		return (0);	/* No change */

	turn_off = intf_flags & turn_on;
	turn_on ^= turn_off;
	err = 0;

	/*
	 * Don't allow any bits belonging to the logical interface
	 * to be set or cleared on the replacement ipif that was
	 * created temporarily during a MOVE.
	 */
	if (ipif->ipif_replace_zero &&
	    ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) {
		return (EINVAL);
	}

	/*
	 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on
	 * IPv6 interfaces.
	 */
	if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6))
		return (EINVAL);

	/*
	 * Don't allow the IFF_ROUTER flag to be turned on on loopback
	 * interfaces.
It makes no sense in that context. 11608 */ 11609 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11610 return (EINVAL); 11611 11612 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11613 zero_source = B_TRUE; 11614 11615 /* 11616 * For IPv6 ipif_id 0, don't allow the interface to be up without 11617 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11618 * If the link local address isn't set, and can be set, it will get 11619 * set later on in this function. 11620 */ 11621 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11622 (flags & IFF_UP) && !zero_source && 11623 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11624 if (ipif_cant_setlinklocal(ipif)) 11625 return (EINVAL); 11626 set_linklocal = B_TRUE; 11627 } 11628 11629 /* 11630 * ILL cannot be part of a usesrc group and and IPMP group at the 11631 * same time. No need to grab ill_g_usesrc_lock here, see 11632 * synchronization notes in ip.c 11633 */ 11634 if (turn_on & PHYI_STANDBY && 11635 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 11636 return (EINVAL); 11637 } 11638 11639 /* 11640 * If we modify physical interface flags, we'll potentially need to 11641 * send up two routing socket messages for the changes (one for the 11642 * IPv4 ill, and another for the IPv6 ill). Note that here. 11643 */ 11644 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11645 phyint_flags_modified = B_TRUE; 11646 11647 /* 11648 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 11649 * we need to flush the IRE_CACHES belonging to this ill. 11650 * We handle this case here without doing the DOWN/UP dance 11651 * like it is done for other flags. If some other flags are 11652 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 11653 * below will handle it by bringing it down and then 11654 * bringing it UP. 11655 */ 11656 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 11657 ill_t *ill_v4, *ill_v6; 11658 11659 ill_v4 = phyi->phyint_illv4; 11660 ill_v6 = phyi->phyint_illv6; 11661 11662 /* 11663 * First set the INACTIVE flag if needed. Then delete the ires. 11664 * ire_add will atomically prevent creating new IRE_CACHEs 11665 * unless hidden flag is set. 11666 * PHYI_FAILED and PHYI_INACTIVE are exclusive 11667 */ 11668 if ((turn_on & PHYI_FAILED) && 11669 ((intf_flags & PHYI_STANDBY) || !ipmp_enable_failback)) { 11670 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 11671 phyi->phyint_flags &= ~PHYI_INACTIVE; 11672 } 11673 if ((turn_off & PHYI_FAILED) && 11674 ((intf_flags & PHYI_STANDBY) || 11675 (!ipmp_enable_failback && ill_is_inactive(ill)))) { 11676 phyint_inactive(phyi); 11677 } 11678 11679 if (turn_on & PHYI_STANDBY) { 11680 /* 11681 * We implicitly set INACTIVE only when STANDBY is set. 11682 * INACTIVE is also set on non-STANDBY phyint when user 11683 * disables FAILBACK using configuration file. 11684 * Do not allow STANDBY to be set on such INACTIVE 11685 * phyint 11686 */ 11687 if (phyi->phyint_flags & PHYI_INACTIVE) 11688 return (EINVAL); 11689 if (!(phyi->phyint_flags & PHYI_FAILED)) 11690 phyint_inactive(phyi); 11691 } 11692 if (turn_off & PHYI_STANDBY) { 11693 if (ipmp_enable_failback) { 11694 /* 11695 * Reset PHYI_INACTIVE. 
11696 */ 11697 phyi->phyint_flags &= ~PHYI_INACTIVE; 11698 } else if (ill_is_inactive(ill) && 11699 !(phyi->phyint_flags & PHYI_FAILED)) { 11700 /* 11701 * Need to set INACTIVE, when user sets 11702 * STANDBY on a non-STANDBY phyint and 11703 * later resets STANDBY 11704 */ 11705 phyint_inactive(phyi); 11706 } 11707 } 11708 /* 11709 * We should always send up a message so that the 11710 * daemons come to know of it. Note that the zeroth 11711 * interface can be down and the check below for IPIF_UP 11712 * will not make sense as we are actually setting 11713 * a phyint flag here. We assume that the ipif used 11714 * is always the zeroth ipif. (ip_rts_ifmsg does not 11715 * send up any message for non-zero ipifs). 11716 */ 11717 phyint_flags_modified = B_TRUE; 11718 11719 if (ill_v4 != NULL) { 11720 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11721 IRE_CACHE, ill_stq_cache_delete, 11722 (char *)ill_v4, ill_v4); 11723 illgrp_reset_schednext(ill_v4); 11724 } 11725 if (ill_v6 != NULL) { 11726 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11727 IRE_CACHE, ill_stq_cache_delete, 11728 (char *)ill_v6, ill_v6); 11729 illgrp_reset_schednext(ill_v6); 11730 } 11731 } 11732 11733 /* 11734 * If ILLF_ROUTER changes, we need to change the ip forwarding 11735 * status of the interface and, if the interface is part of an IPMP 11736 * group, all other interfaces that are part of the same IPMP 11737 * group. 11738 */ 11739 if ((turn_on | turn_off) & ILLF_ROUTER) { 11740 (void) ill_forward_set(q, mp, ((turn_on & ILLF_ROUTER) != 0), 11741 (caddr_t)ill); 11742 } 11743 11744 /* 11745 * If the interface is not UP and we are not going to 11746 * bring it UP, record the flags and return. When the 11747 * interface comes UP later, the right actions will be 11748 * taken. 11749 */ 11750 if (!(ipif->ipif_flags & IPIF_UP) && 11751 !(turn_on & IPIF_UP)) { 11752 /* Record new flags in their respective places. */ 11753 mutex_enter(&ill->ill_lock); 11754 mutex_enter(&ill->ill_phyint->phyint_lock); 11755 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11756 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11757 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11758 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11759 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11760 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11761 mutex_exit(&ill->ill_lock); 11762 mutex_exit(&ill->ill_phyint->phyint_lock); 11763 11764 /* 11765 * We do the broadcast and nomination here rather 11766 * than waiting for a FAILOVER/FAILBACK to happen. In 11767 * the case of FAILBACK from INACTIVE standby to the 11768 * interface that has been repaired, PHYI_FAILED has not 11769 * been cleared yet. If there are only two interfaces in 11770 * that group, all we have is a FAILED and INACTIVE 11771 * interface. If we do the nomination soon after a failback, 11772 * the broadcast nomination code would select the 11773 * INACTIVE interface for receiving broadcasts as FAILED is 11774 * not yet cleared. As we don't want STANDBY/INACTIVE to 11775 * receive broadcast packets, we need to redo nomination 11776 * when the FAILED is cleared here. Thus, in general we 11777 * always do the nomination here for FAILED, STANDBY 11778 * and OFFLINE. 
 */
		if (((turn_on | turn_off) &
		    (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) {
			ip_redo_nomination(phyi);
		}
		if (phyint_flags_modified) {
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->ill_ipif);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->ill_ipif);
			}
		}
		return (0);
	} else if (set_linklocal || zero_source) {
		mutex_enter(&ill->ill_lock);
		if (set_linklocal)
			ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
		if (zero_source)
			ipif->ipif_state_flags |= IPIF_ZERO_SOURCE;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * Disallow IPv6 interfaces coming up that have the unspecified
	 * address, or point-to-point interfaces with an unspecified
	 * destination. We do allow the address to be unspecified for
	 * IPIF_NOLOCAL interfaces that have a subnet assigned, which is
	 * how in.ndpd currently manages its onlink prefix list when no
	 * addresses are configured with those prefixes.
	 */
	if (ipif->ipif_isv6 &&
	    ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
		return (EINVAL);
	}

	/*
	 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
	 * from being brought up.
	 */
	if (!ipif->ipif_isv6 &&
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    ipif->ipif_pp_dst_addr == INADDR_ANY)) {
		return (EINVAL);
	}

	/*
	 * The only flag changes that we currently take specific action on
	 * are IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL,
	 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and
	 * IPIF_PREFERRED. This is done by bringing the ipif down, changing
	 * the flags and bringing it back up again.
	 */
	if ((turn_on|turn_off) &
	    (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
	    ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) {
		/*
		 * Taking this ipif down, make sure we have
		 * valid net and subnet bcast ire's for other
		 * logical interfaces, if we need them.
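		 * (For example, if this ipif currently owns the subnet's
		 * IRE_BROADCAST entries and another up logical interface
		 * shares that subnet, ipif_check_bcast_ires() re-creates
		 * the broadcast ires on one of the surviving ipifs.)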
11845 */ 11846 if (!ipif->ipif_isv6) 11847 ipif_check_bcast_ires(ipif); 11848 11849 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11850 !(turn_off & IPIF_UP)) { 11851 need_up = B_TRUE; 11852 if (ipif->ipif_flags & IPIF_UP) 11853 ill->ill_logical_down = 1; 11854 turn_on &= ~IPIF_UP; 11855 } 11856 err = ipif_down(ipif, q, mp); 11857 ip1dbg(("ipif_down returns %d err ", err)); 11858 if (err == EINPROGRESS) 11859 return (err); 11860 ipif_down_tail(ipif); 11861 } 11862 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 11863 } 11864 11865 static int 11866 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 11867 boolean_t need_up) 11868 { 11869 ill_t *ill; 11870 phyint_t *phyi; 11871 uint64_t turn_on; 11872 uint64_t turn_off; 11873 uint64_t intf_flags; 11874 boolean_t phyint_flags_modified = B_FALSE; 11875 int err = 0; 11876 boolean_t set_linklocal = B_FALSE; 11877 boolean_t zero_source = B_FALSE; 11878 11879 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 11880 ipif->ipif_ill->ill_name, ipif->ipif_id)); 11881 11882 ASSERT(IAM_WRITER_IPIF(ipif)); 11883 11884 ill = ipif->ipif_ill; 11885 phyi = ill->ill_phyint; 11886 11887 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11888 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 11889 11890 turn_off = intf_flags & turn_on; 11891 turn_on ^= turn_off; 11892 11893 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 11894 phyint_flags_modified = B_TRUE; 11895 11896 /* 11897 * Now we change the flags. Track current value of 11898 * other flags in their respective places. 11899 */ 11900 mutex_enter(&ill->ill_lock); 11901 mutex_enter(&phyi->phyint_lock); 11902 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11903 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11904 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11905 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11906 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11907 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11908 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 11909 set_linklocal = B_TRUE; 11910 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 11911 } 11912 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 11913 zero_source = B_TRUE; 11914 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 11915 } 11916 mutex_exit(&ill->ill_lock); 11917 mutex_exit(&phyi->phyint_lock); 11918 11919 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 11920 ip_redo_nomination(phyi); 11921 11922 if (set_linklocal) 11923 (void) ipif_setlinklocal(ipif); 11924 11925 if (zero_source) 11926 ipif->ipif_v6src_addr = ipv6_all_zeros; 11927 else 11928 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 11929 11930 if (need_up) { 11931 /* 11932 * XXX ipif_up really does not know whether a phyint flags 11933 * was modified or not. So, it sends up information on 11934 * only one routing sockets message. As we don't bring up 11935 * the interface and also set STANDBY/FAILED simultaneously 11936 * it should be okay. 11937 */ 11938 err = ipif_up(ipif, q, mp); 11939 } else { 11940 /* 11941 * Make sure routing socket sees all changes to the flags. 11942 * ipif_up_done* handles this when we use ipif_up. 
11943 */ 11944 if (phyint_flags_modified) { 11945 if (phyi->phyint_illv4 != NULL) { 11946 ip_rts_ifmsg(phyi->phyint_illv4-> 11947 ill_ipif); 11948 } 11949 if (phyi->phyint_illv6 != NULL) { 11950 ip_rts_ifmsg(phyi->phyint_illv6-> 11951 ill_ipif); 11952 } 11953 } else { 11954 ip_rts_ifmsg(ipif); 11955 } 11956 } 11957 return (err); 11958 } 11959 11960 /* 11961 * Restart entry point to restart the flags restart operation after the 11962 * refcounts have dropped to zero. 11963 */ 11964 /* ARGSUSED */ 11965 int 11966 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11967 ip_ioctl_cmd_t *ipip, void *if_req) 11968 { 11969 int err; 11970 struct ifreq *ifr = (struct ifreq *)if_req; 11971 struct lifreq *lifr = (struct lifreq *)if_req; 11972 11973 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 11974 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11975 11976 ipif_down_tail(ipif); 11977 if (ipip->ipi_cmd_type == IF_CMD) { 11978 /* 11979 * Since ip_sioctl_flags expects an int and ifr_flags 11980 * is a short we need to cast ifr_flags into an int 11981 * to avoid having sign extension cause bits to get 11982 * set that should not be. 11983 */ 11984 err = ip_sioctl_flags_tail(ipif, 11985 (uint64_t)(ifr->ifr_flags & 0x0000ffff), 11986 q, mp, B_TRUE); 11987 } else { 11988 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags, 11989 q, mp, B_TRUE); 11990 } 11991 return (err); 11992 } 11993 11994 /* ARGSUSED */ 11995 int 11996 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11997 ip_ioctl_cmd_t *ipip, void *if_req) 11998 { 11999 /* 12000 * Has the flags been set correctly till now ? 12001 */ 12002 ill_t *ill = ipif->ipif_ill; 12003 phyint_t *phyi = ill->ill_phyint; 12004 12005 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 12006 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12007 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 12008 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 12009 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 12010 12011 /* 12012 * Need a lock since some flags can be set even when there are 12013 * references to the ipif. 12014 */ 12015 mutex_enter(&ill->ill_lock); 12016 if (ipip->ipi_cmd_type == IF_CMD) { 12017 struct ifreq *ifr = (struct ifreq *)if_req; 12018 12019 /* Get interface flags (low 16 only). */ 12020 ifr->ifr_flags = ((ipif->ipif_flags | 12021 ill->ill_flags | phyi->phyint_flags) & 0xffff); 12022 } else { 12023 struct lifreq *lifr = (struct lifreq *)if_req; 12024 12025 /* Get interface flags. */ 12026 lifr->lifr_flags = ipif->ipif_flags | 12027 ill->ill_flags | phyi->phyint_flags; 12028 } 12029 mutex_exit(&ill->ill_lock); 12030 return (0); 12031 } 12032 12033 /* ARGSUSED */ 12034 int 12035 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12036 ip_ioctl_cmd_t *ipip, void *if_req) 12037 { 12038 int mtu; 12039 int ip_min_mtu; 12040 struct ifreq *ifr; 12041 struct lifreq *lifr; 12042 ire_t *ire; 12043 12044 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 12045 ipif->ipif_id, (void *)ipif)); 12046 if (ipip->ipi_cmd_type == IF_CMD) { 12047 ifr = (struct ifreq *)if_req; 12048 mtu = ifr->ifr_metric; 12049 } else { 12050 lifr = (struct lifreq *)if_req; 12051 mtu = lifr->lifr_mtu; 12052 } 12053 12054 if (ipif->ipif_isv6) 12055 ip_min_mtu = IPV6_MIN_MTU; 12056 else 12057 ip_min_mtu = IP_MIN_MTU; 12058 12059 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 12060 return (EINVAL); 12061 12062 /* 12063 * Change the MTU size in all relevant ire's. 
12064 * Mtu change Vs. new ire creation - protocol below. 12065 * First change ipif_mtu and the ire_max_frag of the 12066 * interface ire. Then do an ire walk and change the 12067 * ire_max_frag of all affected ires. During ire_add 12068 * under the bucket lock, set the ire_max_frag of the 12069 * new ire being created from the ipif/ire from which 12070 * it is being derived. If an mtu change happens after 12071 * the ire is added, the new ire will be cleaned up. 12072 * Conversely if the mtu change happens before the ire 12073 * is added, ire_add will see the new value of the mtu. 12074 */ 12075 ipif->ipif_mtu = mtu; 12076 ipif->ipif_flags |= IPIF_FIXEDMTU; 12077 12078 if (ipif->ipif_isv6) 12079 ire = ipif_to_ire_v6(ipif); 12080 else 12081 ire = ipif_to_ire(ipif); 12082 if (ire != NULL) { 12083 ire->ire_max_frag = ipif->ipif_mtu; 12084 ire_refrele(ire); 12085 } 12086 if (ipif->ipif_flags & IPIF_UP) { 12087 if (ipif->ipif_isv6) 12088 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES); 12089 else 12090 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES); 12091 } 12092 /* Update the MTU in SCTP's list */ 12093 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12094 return (0); 12095 } 12096 12097 /* Get interface MTU. */ 12098 /* ARGSUSED */ 12099 int 12100 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12101 ip_ioctl_cmd_t *ipip, void *if_req) 12102 { 12103 struct ifreq *ifr; 12104 struct lifreq *lifr; 12105 12106 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 12107 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12108 if (ipip->ipi_cmd_type == IF_CMD) { 12109 ifr = (struct ifreq *)if_req; 12110 ifr->ifr_metric = ipif->ipif_mtu; 12111 } else { 12112 lifr = (struct lifreq *)if_req; 12113 lifr->lifr_mtu = ipif->ipif_mtu; 12114 } 12115 return (0); 12116 } 12117 12118 /* Set interface broadcast address. */ 12119 /* ARGSUSED2 */ 12120 int 12121 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12122 ip_ioctl_cmd_t *ipip, void *if_req) 12123 { 12124 ipaddr_t addr; 12125 ire_t *ire; 12126 12127 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, 12128 ipif->ipif_id)); 12129 12130 ASSERT(IAM_WRITER_IPIF(ipif)); 12131 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12132 return (EADDRNOTAVAIL); 12133 12134 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 12135 12136 if (sin->sin_family != AF_INET) 12137 return (EAFNOSUPPORT); 12138 12139 addr = sin->sin_addr.s_addr; 12140 if (ipif->ipif_flags & IPIF_UP) { 12141 /* 12142 * If we are already up, make sure the new 12143 * broadcast address makes sense. If it does, 12144 * there should be an IRE for it already. 12145 * Don't match on ipif, only on the ill 12146 * since we are sharing these now. Don't use 12147 * MATCH_IRE_ILL_GROUP as we are looking for 12148 * the broadcast ire on this ill and each ill 12149 * in the group has its own broadcast ire. 12150 */ 12151 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, 12152 ipif, ALL_ZONES, NULL, 12153 (MATCH_IRE_ILL | MATCH_IRE_TYPE)); 12154 if (ire == NULL) { 12155 return (EINVAL); 12156 } else { 12157 ire_refrele(ire); 12158 } 12159 } 12160 /* 12161 * Changing the broadcast addr for this ipif. 12162 * Make sure we have valid net and subnet bcast 12163 * ire's for other logical interfaces, if needed. 12164 */ 12165 if (addr != ipif->ipif_brd_addr) 12166 ipif_check_bcast_ires(ipif); 12167 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 12168 return (0); 12169 } 12170 12171 /* Get interface broadcast address. 
*/ 12172 /* ARGSUSED */ 12173 int 12174 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12175 ip_ioctl_cmd_t *ipip, void *if_req) 12176 { 12177 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 12178 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12179 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12180 return (EADDRNOTAVAIL); 12181 12182 /* IPIF_BROADCAST not possible with IPv6 */ 12183 ASSERT(!ipif->ipif_isv6); 12184 *sin = sin_null; 12185 sin->sin_family = AF_INET; 12186 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 12187 return (0); 12188 } 12189 12190 /* 12191 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 12192 */ 12193 /* ARGSUSED */ 12194 int 12195 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12196 ip_ioctl_cmd_t *ipip, void *if_req) 12197 { 12198 int err = 0; 12199 in6_addr_t v6mask; 12200 12201 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 12202 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12203 12204 ASSERT(IAM_WRITER_IPIF(ipif)); 12205 12206 if (ipif->ipif_isv6) { 12207 sin6_t *sin6; 12208 12209 if (sin->sin_family != AF_INET6) 12210 return (EAFNOSUPPORT); 12211 12212 sin6 = (sin6_t *)sin; 12213 v6mask = sin6->sin6_addr; 12214 } else { 12215 ipaddr_t mask; 12216 12217 if (sin->sin_family != AF_INET) 12218 return (EAFNOSUPPORT); 12219 12220 mask = sin->sin_addr.s_addr; 12221 V4MASK_TO_V6(mask, v6mask); 12222 } 12223 12224 /* 12225 * No big deal if the interface isn't already up, or the mask 12226 * isn't really changing, or this is pt-pt. 12227 */ 12228 if (!(ipif->ipif_flags & IPIF_UP) || 12229 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12230 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12231 ipif->ipif_v6net_mask = v6mask; 12232 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12233 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12234 ipif->ipif_v6net_mask, 12235 ipif->ipif_v6subnet); 12236 } 12237 return (0); 12238 } 12239 /* 12240 * Make sure we have valid net and subnet broadcast ire's 12241 * for the old netmask, if needed by other logical interfaces. 12242 */ 12243 if (!ipif->ipif_isv6) 12244 ipif_check_bcast_ires(ipif); 12245 12246 err = ipif_logical_down(ipif, q, mp); 12247 if (err == EINPROGRESS) 12248 return (err); 12249 ipif_down_tail(ipif); 12250 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12251 return (err); 12252 } 12253 12254 static int 12255 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12256 { 12257 in6_addr_t v6mask; 12258 int err = 0; 12259 12260 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12261 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12262 12263 if (ipif->ipif_isv6) { 12264 sin6_t *sin6; 12265 12266 sin6 = (sin6_t *)sin; 12267 v6mask = sin6->sin6_addr; 12268 } else { 12269 ipaddr_t mask; 12270 12271 mask = sin->sin_addr.s_addr; 12272 V4MASK_TO_V6(mask, v6mask); 12273 } 12274 12275 ipif->ipif_v6net_mask = v6mask; 12276 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12277 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12278 ipif->ipif_v6subnet); 12279 } 12280 err = ipif_up(ipif, q, mp); 12281 12282 if (err == 0 || err == EINPROGRESS) { 12283 /* 12284 * The interface must be DL_BOUND if this packet has to 12285 * go out on the wire. Since we only go through a logical 12286 * down and are bound with the driver during an internal 12287 * down/up that is satisfied. 12288 */ 12289 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12290 /* Potentially broadcast an address mask reply. 
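			 * (An ICMP Address Mask Reply, per RFC 950, so
			 * that hosts which learn their netmask via
			 * Address Mask Request/Reply pick up the new
			 * mask promptly.)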
*/ 12291 ipif_mask_reply(ipif); 12292 } 12293 } 12294 return (err); 12295 } 12296 12297 /* ARGSUSED */ 12298 int 12299 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12300 ip_ioctl_cmd_t *ipip, void *if_req) 12301 { 12302 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12303 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12304 ipif_down_tail(ipif); 12305 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12306 } 12307 12308 /* Get interface net mask. */ 12309 /* ARGSUSED */ 12310 int 12311 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12312 ip_ioctl_cmd_t *ipip, void *if_req) 12313 { 12314 struct lifreq *lifr = (struct lifreq *)if_req; 12315 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12316 12317 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12318 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12319 12320 /* 12321 * net mask can't change since we have a reference to the ipif. 12322 */ 12323 if (ipif->ipif_isv6) { 12324 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12325 *sin6 = sin6_null; 12326 sin6->sin6_family = AF_INET6; 12327 sin6->sin6_addr = ipif->ipif_v6net_mask; 12328 lifr->lifr_addrlen = 12329 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12330 } else { 12331 *sin = sin_null; 12332 sin->sin_family = AF_INET; 12333 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12334 if (ipip->ipi_cmd_type == LIF_CMD) { 12335 lifr->lifr_addrlen = 12336 ip_mask_to_plen(ipif->ipif_net_mask); 12337 } 12338 } 12339 return (0); 12340 } 12341 12342 /* ARGSUSED */ 12343 int 12344 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12345 ip_ioctl_cmd_t *ipip, void *if_req) 12346 { 12347 12348 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12349 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12350 /* 12351 * Set interface metric. We don't use this for 12352 * anything but we keep track of it in case it is 12353 * important to routing applications or such. 12354 */ 12355 if (ipip->ipi_cmd_type == IF_CMD) { 12356 struct ifreq *ifr; 12357 12358 ifr = (struct ifreq *)if_req; 12359 ipif->ipif_metric = ifr->ifr_metric; 12360 } else { 12361 struct lifreq *lifr; 12362 12363 lifr = (struct lifreq *)if_req; 12364 ipif->ipif_metric = lifr->lifr_metric; 12365 } 12366 return (0); 12367 } 12368 12369 12370 /* ARGSUSED */ 12371 int 12372 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12373 ip_ioctl_cmd_t *ipip, void *if_req) 12374 { 12375 12376 /* Get interface metric. */ 12377 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12378 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12379 if (ipip->ipi_cmd_type == IF_CMD) { 12380 struct ifreq *ifr; 12381 12382 ifr = (struct ifreq *)if_req; 12383 ifr->ifr_metric = ipif->ipif_metric; 12384 } else { 12385 struct lifreq *lifr; 12386 12387 lifr = (struct lifreq *)if_req; 12388 lifr->lifr_metric = ipif->ipif_metric; 12389 } 12390 12391 return (0); 12392 } 12393 12394 /* ARGSUSED */ 12395 int 12396 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12397 ip_ioctl_cmd_t *ipip, void *if_req) 12398 { 12399 12400 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12401 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12402 /* 12403 * Set the muxid returned from I_PLINK. 
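	 * (These are the STREAMS multiplexor ids that ifconfig obtained
	 * when it I_PLINKed the IP and ARP streams beneath IP at plumb
	 * time; they are stashed on the ill so that they can be fetched
	 * again later and passed to I_PUNLINK at unplumb time.)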
12404 */ 12405 if (ipip->ipi_cmd_type == IF_CMD) { 12406 struct ifreq *ifr = (struct ifreq *)if_req; 12407 12408 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12409 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12410 } else { 12411 struct lifreq *lifr = (struct lifreq *)if_req; 12412 12413 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12414 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12415 } 12416 return (0); 12417 } 12418 12419 /* ARGSUSED */ 12420 int 12421 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12422 ip_ioctl_cmd_t *ipip, void *if_req) 12423 { 12424 12425 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12426 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12427 /* 12428 * Get the muxid saved in ill for I_PUNLINK. 12429 */ 12430 if (ipip->ipi_cmd_type == IF_CMD) { 12431 struct ifreq *ifr = (struct ifreq *)if_req; 12432 12433 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12434 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12435 } else { 12436 struct lifreq *lifr = (struct lifreq *)if_req; 12437 12438 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12439 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12440 } 12441 return (0); 12442 } 12443 12444 /* 12445 * Set the subnet prefix. Does not modify the broadcast address. 12446 */ 12447 /* ARGSUSED */ 12448 int 12449 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12450 ip_ioctl_cmd_t *ipip, void *if_req) 12451 { 12452 int err = 0; 12453 in6_addr_t v6addr; 12454 in6_addr_t v6mask; 12455 boolean_t need_up = B_FALSE; 12456 int addrlen; 12457 12458 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12459 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12460 12461 ASSERT(IAM_WRITER_IPIF(ipif)); 12462 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12463 12464 if (ipif->ipif_isv6) { 12465 sin6_t *sin6; 12466 12467 if (sin->sin_family != AF_INET6) 12468 return (EAFNOSUPPORT); 12469 12470 sin6 = (sin6_t *)sin; 12471 v6addr = sin6->sin6_addr; 12472 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12473 return (EADDRNOTAVAIL); 12474 } else { 12475 ipaddr_t addr; 12476 12477 if (sin->sin_family != AF_INET) 12478 return (EAFNOSUPPORT); 12479 12480 addr = sin->sin_addr.s_addr; 12481 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12482 return (EADDRNOTAVAIL); 12483 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12484 /* Add 96 bits */ 12485 addrlen += IPV6_ABITS - IP_ABITS; 12486 } 12487 12488 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12489 return (EINVAL); 12490 12491 /* Check if bits in the address is set past the mask */ 12492 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12493 return (EINVAL); 12494 12495 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12496 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12497 return (0); /* No change */ 12498 12499 if (ipif->ipif_flags & IPIF_UP) { 12500 /* 12501 * If the interface is already marked up, 12502 * we call ipif_down which will take care 12503 * of ditching any IREs that have been set 12504 * up based on the old interface address. 
12505 */ 12506 err = ipif_logical_down(ipif, q, mp); 12507 if (err == EINPROGRESS) 12508 return (err); 12509 ipif_down_tail(ipif); 12510 need_up = B_TRUE; 12511 } 12512 12513 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12514 return (err); 12515 } 12516 12517 static int 12518 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12519 queue_t *q, mblk_t *mp, boolean_t need_up) 12520 { 12521 ill_t *ill = ipif->ipif_ill; 12522 int err = 0; 12523 12524 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12525 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12526 12527 /* Set the new address. */ 12528 mutex_enter(&ill->ill_lock); 12529 ipif->ipif_v6net_mask = v6mask; 12530 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12531 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12532 ipif->ipif_v6subnet); 12533 } 12534 mutex_exit(&ill->ill_lock); 12535 12536 if (need_up) { 12537 /* 12538 * Now bring the interface back up. If this 12539 * is the only IPIF for the ILL, ipif_up 12540 * will have to re-bind to the device, so 12541 * we may get back EINPROGRESS, in which 12542 * case, this IOCTL will get completed in 12543 * ip_rput_dlpi when we see the DL_BIND_ACK. 12544 */ 12545 err = ipif_up(ipif, q, mp); 12546 if (err == EINPROGRESS) 12547 return (err); 12548 } 12549 return (err); 12550 } 12551 12552 /* ARGSUSED */ 12553 int 12554 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12555 ip_ioctl_cmd_t *ipip, void *if_req) 12556 { 12557 int addrlen; 12558 in6_addr_t v6addr; 12559 in6_addr_t v6mask; 12560 struct lifreq *lifr = (struct lifreq *)if_req; 12561 12562 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12563 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12564 ipif_down_tail(ipif); 12565 12566 addrlen = lifr->lifr_addrlen; 12567 if (ipif->ipif_isv6) { 12568 sin6_t *sin6; 12569 12570 sin6 = (sin6_t *)sin; 12571 v6addr = sin6->sin6_addr; 12572 } else { 12573 ipaddr_t addr; 12574 12575 addr = sin->sin_addr.s_addr; 12576 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12577 addrlen += IPV6_ABITS - IP_ABITS; 12578 } 12579 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12580 12581 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12582 } 12583 12584 /* ARGSUSED */ 12585 int 12586 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12587 ip_ioctl_cmd_t *ipip, void *if_req) 12588 { 12589 struct lifreq *lifr = (struct lifreq *)if_req; 12590 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12591 12592 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12593 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12594 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12595 12596 if (ipif->ipif_isv6) { 12597 *sin6 = sin6_null; 12598 sin6->sin6_family = AF_INET6; 12599 sin6->sin6_addr = ipif->ipif_v6subnet; 12600 lifr->lifr_addrlen = 12601 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12602 } else { 12603 *sin = sin_null; 12604 sin->sin_family = AF_INET; 12605 sin->sin_addr.s_addr = ipif->ipif_subnet; 12606 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12607 } 12608 return (0); 12609 } 12610 12611 /* 12612 * Set the IPv6 address token. 
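 *
 * The token is the interface-id portion that in.ndpd combines with an
 * advertised prefix when autoconfiguring addresses; e.g. (hypothetical
 * values), a 64-bit token of ::a00:20ff:fe12:3456 under the prefix
 * 2001:db8::/64 yields 2001:db8::a00:20ff:fe12:3456. The mask logic
 * below builds the complement of the prefix mask: for addrlen == 64,
 * ip_plen_to_mask_v6(128 - 64) yields ffff:ffff:ffff:ffff::, and the
 * XOR with all-ones flips it to ::ffff:ffff:ffff:ffff, selecting
 * exactly the token bits.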
12613 */ 12614 /* ARGSUSED */ 12615 int 12616 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12617 ip_ioctl_cmd_t *ipi, void *if_req) 12618 { 12619 ill_t *ill = ipif->ipif_ill; 12620 int err; 12621 in6_addr_t v6addr; 12622 in6_addr_t v6mask; 12623 boolean_t need_up = B_FALSE; 12624 int i; 12625 sin6_t *sin6 = (sin6_t *)sin; 12626 struct lifreq *lifr = (struct lifreq *)if_req; 12627 int addrlen; 12628 12629 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12630 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12631 ASSERT(IAM_WRITER_IPIF(ipif)); 12632 12633 addrlen = lifr->lifr_addrlen; 12634 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12635 if (ipif->ipif_id != 0) 12636 return (EINVAL); 12637 12638 if (!ipif->ipif_isv6) 12639 return (EINVAL); 12640 12641 if (addrlen > IPV6_ABITS) 12642 return (EINVAL); 12643 12644 v6addr = sin6->sin6_addr; 12645 12646 /* 12647 * The length of the token is the length from the end. To get 12648 * the proper mask for this, compute the mask of the bits not 12649 * in the token; ie. the prefix, and then xor to get the mask. 12650 */ 12651 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12652 return (EINVAL); 12653 for (i = 0; i < 4; i++) { 12654 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12655 } 12656 12657 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12658 ill->ill_token_length == addrlen) 12659 return (0); /* No change */ 12660 12661 if (ipif->ipif_flags & IPIF_UP) { 12662 err = ipif_logical_down(ipif, q, mp); 12663 if (err == EINPROGRESS) 12664 return (err); 12665 ipif_down_tail(ipif); 12666 need_up = B_TRUE; 12667 } 12668 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12669 return (err); 12670 } 12671 12672 static int 12673 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12674 mblk_t *mp, boolean_t need_up) 12675 { 12676 in6_addr_t v6addr; 12677 in6_addr_t v6mask; 12678 ill_t *ill = ipif->ipif_ill; 12679 int i; 12680 int err = 0; 12681 12682 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12683 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12684 v6addr = sin6->sin6_addr; 12685 /* 12686 * The length of the token is the length from the end. To get 12687 * the proper mask for this, compute the mask of the bits not 12688 * in the token; ie. the prefix, and then xor to get the mask. 12689 */ 12690 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12691 for (i = 0; i < 4; i++) 12692 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12693 12694 mutex_enter(&ill->ill_lock); 12695 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12696 ill->ill_token_length = addrlen; 12697 mutex_exit(&ill->ill_lock); 12698 12699 if (need_up) { 12700 /* 12701 * Now bring the interface back up. If this 12702 * is the only IPIF for the ILL, ipif_up 12703 * will have to re-bind to the device, so 12704 * we may get back EINPROGRESS, in which 12705 * case, this IOCTL will get completed in 12706 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12707 */ 12708 err = ipif_up(ipif, q, mp); 12709 if (err == EINPROGRESS) 12710 return (err); 12711 } 12712 return (err); 12713 } 12714 12715 /* ARGSUSED */ 12716 int 12717 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12718 ip_ioctl_cmd_t *ipi, void *if_req) 12719 { 12720 ill_t *ill; 12721 sin6_t *sin6 = (sin6_t *)sin; 12722 struct lifreq *lifr = (struct lifreq *)if_req; 12723 12724 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 12725 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12726 if (ipif->ipif_id != 0) 12727 return (EINVAL); 12728 12729 ill = ipif->ipif_ill; 12730 if (!ill->ill_isv6) 12731 return (ENXIO); 12732 12733 *sin6 = sin6_null; 12734 sin6->sin6_family = AF_INET6; 12735 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 12736 sin6->sin6_addr = ill->ill_token; 12737 lifr->lifr_addrlen = ill->ill_token_length; 12738 return (0); 12739 } 12740 12741 /* 12742 * Set (hardware) link specific information that might override 12743 * what was acquired through the DL_INFO_ACK. 12744 * The logic is as follows. 12745 * 12746 * become exclusive 12747 * set CHANGING flag 12748 * change mtu on affected IREs 12749 * clear CHANGING flag 12750 * 12751 * An ire add that occurs before the CHANGING flag is set will have its mtu 12752 * changed by the ip_sioctl_lnkinfo. 12753 * 12754 * During the time the CHANGING flag is set, no new ires will be added to the 12755 * bucket, and ire add will fail (due the CHANGING flag). 12756 * 12757 * An ire add that occurs after the CHANGING flag is set will have the right mtu 12758 * before it is added to the bucket. 12759 * 12760 * Obviously only 1 thread can set the CHANGING flag and we need to become 12761 * exclusive to set the flag. 12762 */ 12763 /* ARGSUSED */ 12764 int 12765 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12766 ip_ioctl_cmd_t *ipi, void *if_req) 12767 { 12768 ill_t *ill = ipif->ipif_ill; 12769 ipif_t *nipif; 12770 int ip_min_mtu; 12771 boolean_t mtu_walk = B_FALSE; 12772 struct lifreq *lifr = (struct lifreq *)if_req; 12773 lif_ifinfo_req_t *lir; 12774 ire_t *ire; 12775 12776 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 12777 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12778 lir = &lifr->lifr_ifinfo; 12779 ASSERT(IAM_WRITER_IPIF(ipif)); 12780 12781 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12782 if (ipif->ipif_id != 0) 12783 return (EINVAL); 12784 12785 /* Set interface MTU. */ 12786 if (ipif->ipif_isv6) 12787 ip_min_mtu = IPV6_MIN_MTU; 12788 else 12789 ip_min_mtu = IP_MIN_MTU; 12790 12791 /* 12792 * Verify values before we set anything. Allow zero to 12793 * mean unspecified. 
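	 * (E.g., a request with lir_maxmtu == 0 leaves the DL_INFO_ACK
	 * derived maximum in force; only the non-zero fields below are
	 * applied.)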
12794 */ 12795 if (lir->lir_maxmtu != 0 && 12796 (lir->lir_maxmtu > ill->ill_max_frag || 12797 lir->lir_maxmtu < ip_min_mtu)) 12798 return (EINVAL); 12799 if (lir->lir_reachtime != 0 && 12800 lir->lir_reachtime > ND_MAX_REACHTIME) 12801 return (EINVAL); 12802 if (lir->lir_reachretrans != 0 && 12803 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12804 return (EINVAL); 12805 12806 mutex_enter(&ill->ill_lock); 12807 ill->ill_state_flags |= ILL_CHANGING; 12808 for (nipif = ill->ill_ipif; nipif != NULL; 12809 nipif = nipif->ipif_next) { 12810 nipif->ipif_state_flags |= IPIF_CHANGING; 12811 } 12812 12813 mutex_exit(&ill->ill_lock); 12814 12815 if (lir->lir_maxmtu != 0) { 12816 ill->ill_max_mtu = lir->lir_maxmtu; 12817 ill->ill_mtu_userspecified = 1; 12818 mtu_walk = B_TRUE; 12819 } 12820 12821 if (lir->lir_reachtime != 0) 12822 ill->ill_reachable_time = lir->lir_reachtime; 12823 12824 if (lir->lir_reachretrans != 0) 12825 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12826 12827 ill->ill_max_hops = lir->lir_maxhops; 12828 12829 ill->ill_max_buf = ND_MAX_Q; 12830 12831 if (mtu_walk) { 12832 /* 12833 * Set the MTU on all ipifs associated with this ill except 12834 * for those whose MTU was fixed via SIOCSLIFMTU. 12835 */ 12836 for (nipif = ill->ill_ipif; nipif != NULL; 12837 nipif = nipif->ipif_next) { 12838 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12839 continue; 12840 12841 nipif->ipif_mtu = ill->ill_max_mtu; 12842 12843 if (!(nipif->ipif_flags & IPIF_UP)) 12844 continue; 12845 12846 if (nipif->ipif_isv6) 12847 ire = ipif_to_ire_v6(nipif); 12848 else 12849 ire = ipif_to_ire(nipif); 12850 if (ire != NULL) { 12851 ire->ire_max_frag = ipif->ipif_mtu; 12852 ire_refrele(ire); 12853 } 12854 if (ill->ill_isv6) { 12855 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 12856 ipif_mtu_change, (char *)nipif, 12857 ill); 12858 } else { 12859 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 12860 ipif_mtu_change, (char *)nipif, 12861 ill); 12862 } 12863 } 12864 } 12865 12866 mutex_enter(&ill->ill_lock); 12867 for (nipif = ill->ill_ipif; nipif != NULL; 12868 nipif = nipif->ipif_next) { 12869 nipif->ipif_state_flags &= ~IPIF_CHANGING; 12870 } 12871 ILL_UNMARK_CHANGING(ill); 12872 mutex_exit(&ill->ill_lock); 12873 12874 return (0); 12875 } 12876 12877 /* ARGSUSED */ 12878 int 12879 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12880 ip_ioctl_cmd_t *ipi, void *if_req) 12881 { 12882 struct lif_ifinfo_req *lir; 12883 ill_t *ill = ipif->ipif_ill; 12884 12885 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 12886 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12887 if (ipif->ipif_id != 0) 12888 return (EINVAL); 12889 12890 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 12891 lir->lir_maxhops = ill->ill_max_hops; 12892 lir->lir_reachtime = ill->ill_reachable_time; 12893 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 12894 lir->lir_maxmtu = ill->ill_max_mtu; 12895 12896 return (0); 12897 } 12898 12899 /* 12900 * Return best guess as to the subnet mask for the specified address. 12901 * Based on the subnet masks for all the configured interfaces. 12902 * 12903 * We end up returning a zero mask in the case of default, multicast or 12904 * experimental. 
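 *
 * For example (hypothetical addresses): for 10.1.2.3 the classful guess
 * from ip_net_mask() is 255.0.0.0, but if some up interface is plumbed
 * as 10.1.2.1 with netmask 255.255.255.0, the walk below matches that
 * subnet and returns 255.255.255.0 instead.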
12905 */ 12906 static ipaddr_t 12907 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp) 12908 { 12909 ipaddr_t net_mask; 12910 ill_t *ill; 12911 ipif_t *ipif; 12912 ill_walk_context_t ctx; 12913 ipif_t *fallback_ipif = NULL; 12914 12915 net_mask = ip_net_mask(addr); 12916 if (net_mask == 0) { 12917 *ipifp = NULL; 12918 return (0); 12919 } 12920 12921 /* Let's check to see if this is maybe a local subnet route. */ 12922 /* this function only applies to IPv4 interfaces */ 12923 rw_enter(&ill_g_lock, RW_READER); 12924 ill = ILL_START_WALK_V4(&ctx); 12925 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 12926 mutex_enter(&ill->ill_lock); 12927 for (ipif = ill->ill_ipif; ipif != NULL; 12928 ipif = ipif->ipif_next) { 12929 if (!IPIF_CAN_LOOKUP(ipif)) 12930 continue; 12931 if (!(ipif->ipif_flags & IPIF_UP)) 12932 continue; 12933 if ((ipif->ipif_subnet & net_mask) == 12934 (addr & net_mask)) { 12935 /* 12936 * Don't trust pt-pt interfaces if there are 12937 * other interfaces. 12938 */ 12939 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 12940 if (fallback_ipif == NULL) { 12941 ipif_refhold_locked(ipif); 12942 fallback_ipif = ipif; 12943 } 12944 continue; 12945 } 12946 12947 /* 12948 * Fine. Just assume the same net mask as the 12949 * directly attached subnet interface is using. 12950 */ 12951 ipif_refhold_locked(ipif); 12952 mutex_exit(&ill->ill_lock); 12953 rw_exit(&ill_g_lock); 12954 if (fallback_ipif != NULL) 12955 ipif_refrele(fallback_ipif); 12956 *ipifp = ipif; 12957 return (ipif->ipif_net_mask); 12958 } 12959 } 12960 mutex_exit(&ill->ill_lock); 12961 } 12962 rw_exit(&ill_g_lock); 12963 12964 *ipifp = fallback_ipif; 12965 return ((fallback_ipif != NULL) ? 12966 fallback_ipif->ipif_net_mask : net_mask); 12967 } 12968 12969 /* 12970 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 12971 */ 12972 static void 12973 ip_wput_ioctl(queue_t *q, mblk_t *mp) 12974 { 12975 IOCP iocp; 12976 ipft_t *ipft; 12977 ipllc_t *ipllc; 12978 mblk_t *mp1; 12979 cred_t *cr; 12980 int error = 0; 12981 conn_t *connp; 12982 12983 ip1dbg(("ip_wput_ioctl")); 12984 iocp = (IOCP)mp->b_rptr; 12985 mp1 = mp->b_cont; 12986 if (mp1 == NULL) { 12987 iocp->ioc_error = EINVAL; 12988 mp->b_datap->db_type = M_IOCNAK; 12989 iocp->ioc_count = 0; 12990 qreply(q, mp); 12991 return; 12992 } 12993 12994 /* 12995 * These IOCTLs provide various control capabilities to 12996 * upstream agents such as ULPs and processes. There 12997 * are currently two such IOCTLs implemented. They 12998 * are used by TCP to provide update information for 12999 * existing IREs and to forcibly delete an IRE for a 13000 * host that is not responding, thereby forcing an 13001 * attempt at a new route. 13002 */ 13003 iocp->ioc_error = EINVAL; 13004 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 13005 goto done; 13006 13007 ipllc = (ipllc_t *)mp1->b_rptr; 13008 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 13009 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 13010 break; 13011 } 13012 /* 13013 * prefer credential from mblk over ioctl; 13014 * see ip_sioctl_copyin_setup 13015 */ 13016 cr = DB_CREDDEF(mp, iocp->ioc_cr); 13017 13018 /* 13019 * Refhold the conn in case the request gets queued up in some lookup 13020 */ 13021 ASSERT(CONN_Q(q)); 13022 connp = Q_TO_CONN(q); 13023 CONN_INC_REF(connp); 13024 if (ipft->ipft_pfi && 13025 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 13026 pullupmsg(mp1, ipft->ipft_min_size))) { 13027 error = (*ipft->ipft_pfi)(q, 13028 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? 
mp : mp1, cr); 13029 } 13030 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 13031 /* 13032 * CONN_OPER_PENDING_DONE happens in the function called 13033 * through ipft_pfi above. 13034 */ 13035 return; 13036 } 13037 13038 CONN_OPER_PENDING_DONE(connp); 13039 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 13040 freemsg(mp); 13041 return; 13042 } 13043 iocp->ioc_error = error; 13044 13045 done: 13046 mp->b_datap->db_type = M_IOCACK; 13047 if (iocp->ioc_error) 13048 iocp->ioc_count = 0; 13049 qreply(q, mp); 13050 } 13051 13052 /* 13053 * Lookup an ipif using the sequence id (ipif_seqid) 13054 */ 13055 ipif_t * 13056 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 13057 { 13058 ipif_t *ipif; 13059 13060 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13061 13062 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13063 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 13064 return (ipif); 13065 } 13066 return (NULL); 13067 } 13068 13069 uint64_t ipif_g_seqid; 13070 13071 /* 13072 * Assign a unique id for the ipif. This is used later when we send 13073 * IRES to ARP for resolution where we initialize ire_ipif_seqid 13074 * to the value pointed by ire_ipif->ipif_seqid. Later when the 13075 * IRE is added, we verify that ipif has not disappeared. 13076 */ 13077 13078 static void 13079 ipif_assign_seqid(ipif_t *ipif) 13080 { 13081 ipif->ipif_seqid = atomic_add_64_nv(&ipif_g_seqid, 1); 13082 } 13083 13084 /* 13085 * Insert the ipif, so that the list of ipifs on the ill will be sorted 13086 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 13087 * be inserted into the first space available in the list. The value of 13088 * ipif_id will then be set to the appropriate value for its position. 13089 */ 13090 static int 13091 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 13092 { 13093 ill_t *ill; 13094 ipif_t *tipif; 13095 ipif_t **tipifp; 13096 int id; 13097 13098 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13099 IAM_WRITER_IPIF(ipif)); 13100 13101 ill = ipif->ipif_ill; 13102 ASSERT(ill != NULL); 13103 13104 /* 13105 * In the case of lo0:0 we already hold the ill_g_lock. 13106 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13107 * ipif_insert. Another such caller is ipif_move. 
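 *
 * A worked example of the id assignment below (hypothetical state): if
 * the ill already has ipifs with ids 0, 1 and 3, an ipif passed in with
 * an id of -1 walks the sorted list, stops at the first gap and is
 * assigned id 2; passing an explicit id that is already in use would
 * trip the ASSERT in the explicit-id arm, since ids must be unique.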
 */
	if (acquire_g_lock)
		rw_enter(&ill_g_lock, RW_WRITER);
	if (acquire_ill_lock)
		mutex_enter(&ill->ill_lock);
	id = ipif->ipif_id;
	tipifp = &(ill->ill_ipif);
	if (id == -1) {	/* need to find a real id */
		id = 0;
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id >= id);
			if (tipif->ipif_id != id)
				break;	/* non-consecutive id */
			id++;
			tipifp = &(tipif->ipif_next);
		}
		/* limit number of logical interfaces */
		if (id >= ip_addrs_per_if) {
			if (acquire_ill_lock)
				mutex_exit(&ill->ill_lock);
			if (acquire_g_lock)
				rw_exit(&ill_g_lock);
			return (-1);
		}
		ipif->ipif_id = id;	/* assign new id */
	} else if (id < ip_addrs_per_if) {
		/* we have a real id; insert ipif in the right place */
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id != id);
			if (tipif->ipif_id > id)
				break;	/* found correct location */
			tipifp = &(tipif->ipif_next);
		}
	} else {
		if (acquire_ill_lock)
			mutex_exit(&ill->ill_lock);
		if (acquire_g_lock)
			rw_exit(&ill_g_lock);
		return (-1);
	}

	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);

	ipif->ipif_next = tipif;
	*tipifp = ipif;
	if (acquire_ill_lock)
		mutex_exit(&ill->ill_lock);
	if (acquire_g_lock)
		rw_exit(&ill_g_lock);
	return (0);
}

/*
 * Allocate and initialize a new interface control structure. (Always
 * called as writer.)
 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
 * is not part of the global linked list of ills. ipif_seqid is unique
 * in the system and to preserve the uniqueness, it is assigned only
 * when the ill becomes part of the global list. At that point the ill
 * will have a name. If it doesn't get assigned here, it will get
 * assigned in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
 * the interface flags or any other information from the DL_INFO_ACK for
 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
 * second DL_INFO_ACK comes in from the driver.
13174 */ 13175 static ipif_t * 13176 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) 13177 { 13178 ipif_t *ipif; 13179 phyint_t *phyi; 13180 13181 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 13182 ill->ill_name, id, (void *)ill)); 13183 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 13184 13185 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 13186 return (NULL); 13187 *ipif = ipif_zero; /* start clean */ 13188 13189 ipif->ipif_ill = ill; 13190 ipif->ipif_id = id; /* could be -1 */ 13191 ipif->ipif_zoneid = GLOBAL_ZONEID; 13192 13193 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 13194 13195 ipif->ipif_refcnt = 0; 13196 ipif->ipif_saved_ire_cnt = 0; 13197 13198 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { 13199 mi_free(ipif); 13200 return (NULL); 13201 } 13202 /* -1 id should have been replaced by real id */ 13203 id = ipif->ipif_id; 13204 ASSERT(id >= 0); 13205 13206 if (ill->ill_name[0] != '\0') { 13207 ipif_assign_seqid(ipif); 13208 if (ill->ill_phyint->phyint_ifindex != 0) 13209 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 13210 } 13211 /* 13212 * Keep a copy of original id in ipif_orig_ipifid. Failback 13213 * will attempt to restore the original id. The SIOCSLIFOINDEX 13214 * ioctl sets ipif_orig_ipifid to zero. 13215 */ 13216 ipif->ipif_orig_ipifid = id; 13217 13218 /* 13219 * We grab the ill_lock and phyint_lock to protect the flag changes. 13220 * The ipif is still not up and can't be looked up until the 13221 * ioctl completes and the IPIF_CHANGING flag is cleared. 13222 */ 13223 mutex_enter(&ill->ill_lock); 13224 mutex_enter(&ill->ill_phyint->phyint_lock); 13225 /* 13226 * Set the running flag when logical interface zero is created. 13227 * For subsequent logical interfaces, a DLPI link down 13228 * notification message may have cleared the running flag to 13229 * indicate the link is down, so we shouldn't just blindly set it. 13230 */ 13231 if (id == 0) 13232 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 13233 ipif->ipif_ire_type = ire_type; 13234 phyi = ill->ill_phyint; 13235 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 13236 13237 if (ipif->ipif_isv6) { 13238 ill->ill_flags |= ILLF_IPV6; 13239 } else { 13240 ipaddr_t inaddr_any = INADDR_ANY; 13241 13242 ill->ill_flags |= ILLF_IPV4; 13243 13244 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 13245 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13246 &ipif->ipif_v6lcl_addr); 13247 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13248 &ipif->ipif_v6src_addr); 13249 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13250 &ipif->ipif_v6subnet); 13251 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13252 &ipif->ipif_v6net_mask); 13253 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13254 &ipif->ipif_v6brd_addr); 13255 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13256 &ipif->ipif_v6pp_dst_addr); 13257 } 13258 13259 /* 13260 * Don't set the interface flags etc. now, will do it in 13261 * ip_ll_subnet_defaults. 13262 */ 13263 if (!initialize) { 13264 mutex_exit(&ill->ill_lock); 13265 mutex_exit(&ill->ill_phyint->phyint_lock); 13266 return (ipif); 13267 } 13268 ipif->ipif_mtu = ill->ill_max_mtu; 13269 13270 if (ill->ill_bcast_addr_length != 0) { 13271 /* 13272 * Later detect lack of DLPI driver multicast 13273 * capability by catching DL_ENABMULTI errors in 13274 * ip_rput_dlpi. 
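		 *
		 * Summary of the flag defaulting done below (descriptive
		 * note derived from the code that follows):
		 *
		 *	bcast addr len != 0	-> ILLF_MULTICAST
		 *				   (+ IPIF_BROADCAST for v4)
		 *	no bcast, not loopback	-> ILLF_NONUD (v6) or
		 *				   ILLF_NOARP (v4)
		 *	phys addr len == 0:
		 *	    SUNW_DL_VNI		-> IPIF_NOXMIT + PHYI_VIRTUAL
		 *	    loopback		-> PHYI_LOOPBACK + PHYI_VIRTUAL
		 *	    otherwise (pt-pt)	-> ILLF_MULTICAST +
		 *				   IPIF_POINTOPOINT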
13275 		 */
13276 		ill->ill_flags |= ILLF_MULTICAST;
13277 		if (!ipif->ipif_isv6)
13278 			ipif->ipif_flags |= IPIF_BROADCAST;
13279 	} else {
13280 		if (ill->ill_net_type != IRE_LOOPBACK) {
13281 			if (ipif->ipif_isv6)
13282 				/*
13283 				 * Note: xresolv interfaces will eventually need
13284 				 * NOARP set here as well, but that will require
13285 				 * those external resolvers to have some
13286 				 * knowledge of that flag and act appropriately.
13287 				 * Not to be changed at present.
13288 				 */
13289 				ill->ill_flags |= ILLF_NONUD;
13290 			else
13291 				ill->ill_flags |= ILLF_NOARP;
13292 		}
13293 		if (ill->ill_phys_addr_length == 0) {
13294 			if (ill->ill_media &&
13295 			    ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
13296 				ipif->ipif_flags |= IPIF_NOXMIT;
13297 				phyi->phyint_flags |= PHYI_VIRTUAL;
13298 			} else {
13299 				/* pt-pt supports multicast. */
13300 				ill->ill_flags |= ILLF_MULTICAST;
13301 				if (ill->ill_net_type == IRE_LOOPBACK) {
13302 					phyi->phyint_flags |=
13303 					    (PHYI_LOOPBACK | PHYI_VIRTUAL);
13304 				} else {
13305 					ipif->ipif_flags |= IPIF_POINTOPOINT;
13306 				}
13307 			}
13308 		}
13309 	}
13310 	mutex_exit(&ill->ill_lock);
13311 	mutex_exit(&ill->ill_phyint->phyint_lock);
13312 	return (ipif);
13313 }
13314 
13315 /*
13316  * If appropriate, send a message up to the resolver to delete the entry
13317  * for the address of this interface which is going out of business.
13318  * (Always called as writer).
13319  *
13320  * NOTE : We need to check for NULL mps as some of the fields are
13321  * initialized only for some interface types. See ipif_resolver_up()
13322  * for details.
13323  */
13324 void
13325 ipif_arp_down(ipif_t *ipif)
13326 {
13327 	mblk_t	*mp;
13328 	ill_t	*ill = ipif->ipif_ill;
13329 
13330 	ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13331 	ASSERT(IAM_WRITER_IPIF(ipif));
13332 
13333 	/* Delete the mapping for the local address */
13334 	mp = ipif->ipif_arp_del_mp;
13335 	if (mp != NULL) {
13336 		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13337 		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13338 		putnext(ill->ill_rq, mp);
13339 		ipif->ipif_arp_del_mp = NULL;
13340 	}
13341 
13342 	/*
13343 	 * If this is the last ipif that is going down and there are no
13344 	 * duplicate addresses we may yet attempt to re-probe, then we need to
13345 	 * clean up ARP completely.
13346 	 */
13347 	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
13348 
13349 		/* Send up AR_INTERFACE_DOWN message */
13350 		mp = ill->ill_arp_down_mp;
13351 		if (mp != NULL) {
13352 			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13353 			    *(unsigned *)mp->b_rptr, ill->ill_name,
13354 			    ipif->ipif_id));
13355 			putnext(ill->ill_rq, mp);
13356 			ill->ill_arp_down_mp = NULL;
13357 		}
13358 
13359 		/* Tell ARP to delete the multicast mappings */
13360 		mp = ill->ill_arp_del_mapping_mp;
13361 		if (mp != NULL) {
13362 			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13363 			    *(unsigned *)mp->b_rptr, ill->ill_name,
13364 			    ipif->ipif_id));
13365 			putnext(ill->ill_rq, mp);
13366 			ill->ill_arp_del_mapping_mp = NULL;
13367 		}
13368 	}
13369 }
13370 
13371 /*
13372  * This function sets up the multicast mappings in ARP. When ipif_resolver_up
13373  * calls this function, it passes a non-NULL arp_add_mapping_mp indicating
13374  * that it wants the add_mp allocated in this function to be returned
13375  * without sending it to ARP. When ip_rput_dlpi_writer calls this to
13376  * just re-do the multicast, it wants us to send the add_mp to ARP also.
13377  * ipif_resolver_up does not want us to do the "add", i.e. sending to ARP,
13378  * as it does an ipif_arp_down after calling this function - which will
13379  * remove what we add here.
13380  *
13381  * Returns -1 on failures and 0 on success.
13382  */
13383 int
13384 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
13385 {
13386 	mblk_t	*del_mp = NULL;
13387 	mblk_t	*add_mp = NULL;
13388 	mblk_t	*mp;
13389 	ill_t	*ill = ipif->ipif_ill;
13390 	phyint_t *phyi = ill->ill_phyint;
13391 	ipaddr_t addr, mask, extract_mask = 0;
13392 	arma_t	*arma;
13393 	uint8_t *maddr, *bphys_addr;
13394 	uint32_t hw_start;
13395 	dl_unitdata_req_t *dlur;
13396 
13397 	ASSERT(IAM_WRITER_IPIF(ipif));
13398 	if (ipif->ipif_flags & IPIF_POINTOPOINT)
13399 		return (0);
13400 
13401 	/*
13402 	 * Delete the existing mapping from ARP. Normally ipif_down
13403 	 * -> ipif_arp_down should send this up to ARP. The only
13404 	 * reason we would find this is when we are switching from
13405 	 * Multicast to Broadcast where we did not do a down.
13406 	 */
13407 	mp = ill->ill_arp_del_mapping_mp;
13408 	if (mp != NULL) {
13409 		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
13410 		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13411 		putnext(ill->ill_rq, mp);
13412 		ill->ill_arp_del_mapping_mp = NULL;
13413 	}
13414 
13415 	if (arp_add_mapping_mp != NULL)
13416 		*arp_add_mapping_mp = NULL;
13417 
13418 	/*
13419 	 * Check that the address is not too long for the constant
13420 	 * length reserved in the template arma_t.
13421 	 */
13422 	if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
13423 		return (-1);
13424 
13425 	/* Add mapping mblk */
13426 	addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
13427 	mask = (ipaddr_t)htonl(IN_CLASSD_NET);
13428 	add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
13429 	    (caddr_t)&addr);
13430 	if (add_mp == NULL)
13431 		return (-1);
13432 	arma = (arma_t *)add_mp->b_rptr;
13433 	maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
13434 	bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
13435 	arma->arma_hw_addr_length = ill->ill_phys_addr_length;
13436 
13437 	/*
13438 	 * Determine the broadcast address.
13439 	 */
13440 	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
13441 	if (ill->ill_sap_length < 0)
13442 		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
13443 	else
13444 		bphys_addr = (uchar_t *)dlur +
13445 		    dlur->dl_dest_addr_offset + ill->ill_sap_length;
13446 	/*
13447 	 * Check PHYI_MULTI_BCAST and length of physical
13448 	 * address to determine if we use the mapping or the
13449 	 * broadcast address.
13450 	 */
13451 	if (!(phyi->phyint_flags & PHYI_MULTI_BCAST))
13452 		if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length,
13453 		    bphys_addr, maddr, &hw_start, &extract_mask))
13454 			phyi->phyint_flags |= PHYI_MULTI_BCAST;
13455 
13456 	if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
13457 	    (ill->ill_flags & ILLF_MULTICAST)) {
13458 		/* Make sure this will not match the "exact" entry.
*/ 13459 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13460 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13461 (caddr_t)&addr); 13462 if (del_mp == NULL) { 13463 freemsg(add_mp); 13464 return (-1); 13465 } 13466 bcopy(&extract_mask, (char *)arma + 13467 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13468 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13469 /* Use link-layer broadcast address for MULTI_BCAST */ 13470 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13471 ip2dbg(("ipif_arp_setup_multicast: adding" 13472 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13473 } else { 13474 arma->arma_hw_mapping_start = hw_start; 13475 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13476 " ARP setup for %s\n", ill->ill_name)); 13477 } 13478 } else { 13479 freemsg(add_mp); 13480 ASSERT(del_mp == NULL); 13481 /* It is neither MULTICAST nor MULTI_BCAST */ 13482 return (0); 13483 } 13484 ASSERT(add_mp != NULL && del_mp != NULL); 13485 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13486 ill->ill_arp_del_mapping_mp = del_mp; 13487 if (arp_add_mapping_mp != NULL) { 13488 /* The caller just wants the mblks allocated */ 13489 *arp_add_mapping_mp = add_mp; 13490 } else { 13491 /* The caller wants us to send it to arp */ 13492 putnext(ill->ill_rq, add_mp); 13493 } 13494 return (0); 13495 } 13496 13497 /* 13498 * Get the resolver set up for a new interface address. 13499 * (Always called as writer.) 13500 * Called both for IPv4 and IPv6 interfaces, 13501 * though it only sets up the resolver for v6 13502 * if it's an xresolv interface (one using an external resolver). 13503 * Honors ILLF_NOARP. 13504 * The enumerated value res_act is used to tune the behavior. 13505 * If set to Res_act_initial, then we set up all the resolver 13506 * structures for a new interface. If set to Res_act_move, then 13507 * we just send an AR_ENTRY_ADD message up to ARP for IPv4 13508 * interfaces; this is called by ip_rput_dlpi_writer() to handle 13509 * asynchronous hardware address change notification. If set to 13510 * Res_act_defend, then we tell ARP that it needs to send a single 13511 * gratuitous message in defense of the address. 13512 * Returns error on failure. 13513 */ 13514 int 13515 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 13516 { 13517 caddr_t addr; 13518 mblk_t *arp_up_mp = NULL; 13519 mblk_t *arp_down_mp = NULL; 13520 mblk_t *arp_add_mp = NULL; 13521 mblk_t *arp_del_mp = NULL; 13522 mblk_t *arp_add_mapping_mp = NULL; 13523 mblk_t *arp_del_mapping_mp = NULL; 13524 ill_t *ill = ipif->ipif_ill; 13525 uchar_t *area_p = NULL; 13526 uchar_t *ared_p = NULL; 13527 int err = ENOMEM; 13528 boolean_t was_dup; 13529 13530 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 13531 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 13532 ASSERT(IAM_WRITER_IPIF(ipif)); 13533 13534 was_dup = B_FALSE; 13535 if (res_act == Res_act_initial) { 13536 ipif->ipif_addr_ready = 0; 13537 /* 13538 * We're bringing an interface up here. There's no way that we 13539 * should need to shut down ARP now. 
13540 */ 13541 mutex_enter(&ill->ill_lock); 13542 if (ipif->ipif_flags & IPIF_DUPLICATE) { 13543 ipif->ipif_flags &= ~IPIF_DUPLICATE; 13544 ill->ill_ipif_dup_count--; 13545 was_dup = B_TRUE; 13546 } 13547 mutex_exit(&ill->ill_lock); 13548 } 13549 if (ipif->ipif_recovery_id != 0) 13550 (void) untimeout(ipif->ipif_recovery_id); 13551 ipif->ipif_recovery_id = 0; 13552 if (ill->ill_net_type != IRE_IF_RESOLVER) { 13553 ipif->ipif_addr_ready = 1; 13554 return (0); 13555 } 13556 /* NDP will set the ipif_addr_ready flag when it's ready */ 13557 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 13558 return (0); 13559 13560 if (ill->ill_isv6) { 13561 /* 13562 * External resolver for IPv6 13563 */ 13564 ASSERT(res_act == Res_act_initial); 13565 if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 13566 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 13567 area_p = (uchar_t *)&ip6_area_template; 13568 ared_p = (uchar_t *)&ip6_ared_template; 13569 } 13570 } else { 13571 /* 13572 * IPv4 arp case. If the ARP stream has already started 13573 * closing, fail this request for ARP bringup. Else 13574 * record the fact that an ARP bringup is pending. 13575 */ 13576 mutex_enter(&ill->ill_lock); 13577 if (ill->ill_arp_closing) { 13578 mutex_exit(&ill->ill_lock); 13579 err = EINVAL; 13580 goto failed; 13581 } else { 13582 if (ill->ill_ipif_up_count == 0 && 13583 ill->ill_ipif_dup_count == 0 && !was_dup) 13584 ill->ill_arp_bringup_pending = 1; 13585 mutex_exit(&ill->ill_lock); 13586 } 13587 if (ipif->ipif_lcl_addr != INADDR_ANY) { 13588 addr = (caddr_t)&ipif->ipif_lcl_addr; 13589 area_p = (uchar_t *)&ip_area_template; 13590 ared_p = (uchar_t *)&ip_ared_template; 13591 } 13592 } 13593 13594 /* 13595 * Add an entry for the local address in ARP only if it 13596 * is not UNNUMBERED and the address is not INADDR_ANY. 13597 */ 13598 if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { 13599 area_t *area; 13600 13601 /* Now ask ARP to publish our address. */ 13602 arp_add_mp = ill_arp_alloc(ill, area_p, addr); 13603 if (arp_add_mp == NULL) 13604 goto failed; 13605 area = (area_t *)arp_add_mp->b_rptr; 13606 if (res_act != Res_act_initial) { 13607 /* 13608 * Copy the new hardware address and length into 13609 * arp_add_mp to be sent to ARP. 13610 */ 13611 area->area_hw_addr_length = 13612 ill->ill_phys_addr_length; 13613 bcopy((char *)ill->ill_phys_addr, 13614 ((char *)area + area->area_hw_addr_offset), 13615 area->area_hw_addr_length); 13616 } 13617 13618 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | 13619 ACE_F_MYADDR; 13620 13621 if (res_act == Res_act_defend) { 13622 area->area_flags |= ACE_F_DEFEND; 13623 /* 13624 * If we're just defending our address now, then 13625 * there's no need to set up ARP multicast mappings. 13626 * The publish command is enough. 13627 */ 13628 goto done; 13629 } 13630 13631 if (res_act != Res_act_initial) 13632 goto arp_setup_multicast; 13633 13634 /* 13635 * Allocate an ARP deletion message so we know we can tell ARP 13636 * when the interface goes down. 13637 */ 13638 arp_del_mp = ill_arp_alloc(ill, ared_p, addr); 13639 if (arp_del_mp == NULL) 13640 goto failed; 13641 13642 } else { 13643 if (res_act != Res_act_initial) 13644 goto done; 13645 } 13646 /* 13647 * Need to bring up ARP or setup multicast mapping only 13648 * when the first interface is coming UP. 
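	 *
	 * From the caller's side this is also why the function returns
	 * EINPROGRESS only for that first interface (hedged sketch of a
	 * hypothetical caller, matching the return statement below):
	 *
	 *	err = ipif_resolver_up(ipif, Res_act_initial);
	 *	if (err == EINPROGRESS)
	 *		...ARP bringup outstanding; completion arrives later...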
13649 */ 13650 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 13651 was_dup) { 13652 goto done; 13653 } 13654 13655 /* 13656 * Allocate an ARP down message (to be saved) and an ARP up 13657 * message. 13658 */ 13659 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 13660 if (arp_down_mp == NULL) 13661 goto failed; 13662 13663 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 13664 if (arp_up_mp == NULL) 13665 goto failed; 13666 13667 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13668 goto done; 13669 13670 arp_setup_multicast: 13671 /* 13672 * Setup the multicast mappings. This function initializes 13673 * ill_arp_del_mapping_mp also. This does not need to be done for 13674 * IPv6. 13675 */ 13676 if (!ill->ill_isv6) { 13677 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 13678 if (err != 0) 13679 goto failed; 13680 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 13681 ASSERT(arp_add_mapping_mp != NULL); 13682 } 13683 13684 done: 13685 if (arp_del_mp != NULL) { 13686 ASSERT(ipif->ipif_arp_del_mp == NULL); 13687 ipif->ipif_arp_del_mp = arp_del_mp; 13688 } 13689 if (arp_down_mp != NULL) { 13690 ASSERT(ill->ill_arp_down_mp == NULL); 13691 ill->ill_arp_down_mp = arp_down_mp; 13692 } 13693 if (arp_del_mapping_mp != NULL) { 13694 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13695 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 13696 } 13697 if (arp_up_mp != NULL) { 13698 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 13699 ill->ill_name, ipif->ipif_id)); 13700 putnext(ill->ill_rq, arp_up_mp); 13701 } 13702 if (arp_add_mp != NULL) { 13703 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 13704 ill->ill_name, ipif->ipif_id)); 13705 /* 13706 * If it's an extended ARP implementation, then we'll wait to 13707 * hear that DAD has finished before using the interface. 13708 */ 13709 if (!ill->ill_arp_extend) 13710 ipif->ipif_addr_ready = 1; 13711 putnext(ill->ill_rq, arp_add_mp); 13712 } else { 13713 ipif->ipif_addr_ready = 1; 13714 } 13715 if (arp_add_mapping_mp != NULL) { 13716 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 13717 ill->ill_name, ipif->ipif_id)); 13718 putnext(ill->ill_rq, arp_add_mapping_mp); 13719 } 13720 if (res_act != Res_act_initial) 13721 return (0); 13722 13723 if (ill->ill_flags & ILLF_NOARP) 13724 err = ill_arp_off(ill); 13725 else 13726 err = ill_arp_on(ill); 13727 if (err != 0) { 13728 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); 13729 freemsg(ipif->ipif_arp_del_mp); 13730 freemsg(ill->ill_arp_down_mp); 13731 freemsg(ill->ill_arp_del_mapping_mp); 13732 ipif->ipif_arp_del_mp = NULL; 13733 ill->ill_arp_down_mp = NULL; 13734 ill->ill_arp_del_mapping_mp = NULL; 13735 return (err); 13736 } 13737 return ((ill->ill_ipif_up_count != 0 || was_dup || 13738 ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); 13739 13740 failed: 13741 ip1dbg(("ipif_resolver_up: FAILED\n")); 13742 freemsg(arp_add_mp); 13743 freemsg(arp_del_mp); 13744 freemsg(arp_add_mapping_mp); 13745 freemsg(arp_up_mp); 13746 freemsg(arp_down_mp); 13747 ill->ill_arp_bringup_pending = 0; 13748 return (err); 13749 } 13750 13751 /* 13752 * This routine restarts IPv4 duplicate address detection (DAD) when a link has 13753 * just gone back up. 
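 *
 * The restart itself is just a re-publish of the address with
 * ACE_F_UNVERIFIED set (sketch of the message built below):
 *
 *	area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH |
 *	    ACE_F_MYADDR | ACE_F_UNVERIFIED;	// "unverified" redoes DAD
 *	putnext(ill->ill_rq, arp_add_mp);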
13754 */ 13755 static void 13756 ipif_arp_start_dad(ipif_t *ipif) 13757 { 13758 ill_t *ill = ipif->ipif_ill; 13759 mblk_t *arp_add_mp; 13760 area_t *area; 13761 13762 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 13763 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13764 ipif->ipif_lcl_addr == INADDR_ANY || 13765 (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 13766 (char *)&ipif->ipif_lcl_addr)) == NULL) { 13767 /* 13768 * If we can't contact ARP for some reason, that's not really a 13769 * problem. Just send out the routing socket notification that 13770 * DAD completion would have done, and continue. 13771 */ 13772 ipif_mask_reply(ipif); 13773 ip_rts_ifmsg(ipif); 13774 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 13775 sctp_update_ipif(ipif, SCTP_IPIF_UP); 13776 ipif->ipif_addr_ready = 1; 13777 return; 13778 } 13779 13780 /* Setting the 'unverified' flag restarts DAD */ 13781 area = (area_t *)arp_add_mp->b_rptr; 13782 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 13783 ACE_F_UNVERIFIED; 13784 putnext(ill->ill_rq, arp_add_mp); 13785 } 13786 13787 static void 13788 ipif_ndp_start_dad(ipif_t *ipif) 13789 { 13790 nce_t *nce; 13791 13792 nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); 13793 if (nce == NULL) 13794 return; 13795 13796 if (!ndp_restart_dad(nce)) { 13797 /* 13798 * If we can't restart DAD for some reason, that's not really a 13799 * problem. Just send out the routing socket notification that 13800 * DAD completion would have done, and continue. 13801 */ 13802 ip_rts_ifmsg(ipif); 13803 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 13804 sctp_update_ipif(ipif, SCTP_IPIF_UP); 13805 ipif->ipif_addr_ready = 1; 13806 } 13807 NCE_REFRELE(nce); 13808 } 13809 13810 /* 13811 * Restart duplicate address detection on all interfaces on the given ill. 13812 * 13813 * This is called when an interface transitions from down to up 13814 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 13815 * 13816 * Note that since the underlying physical link has transitioned, we must cause 13817 * at least one routing socket message to be sent here, either via DAD 13818 * completion or just by default on the first ipif. (If we don't do this, then 13819 * in.mpathd will see long delays when doing link-based failure recovery.) 13820 */ 13821 void 13822 ill_restart_dad(ill_t *ill, boolean_t went_up) 13823 { 13824 ipif_t *ipif; 13825 13826 if (ill == NULL) 13827 return; 13828 13829 /* 13830 * If layer two doesn't support duplicate address detection, then just 13831 * send the routing socket message now and be done with it. 13832 */ 13833 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 13834 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 13835 ip_rts_ifmsg(ill->ill_ipif); 13836 return; 13837 } 13838 13839 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13840 if (went_up) { 13841 if (ipif->ipif_flags & IPIF_UP) { 13842 if (ill->ill_isv6) 13843 ipif_ndp_start_dad(ipif); 13844 else 13845 ipif_arp_start_dad(ipif); 13846 } else if (ill->ill_isv6 && 13847 (ipif->ipif_flags & IPIF_DUPLICATE)) { 13848 /* 13849 * For IPv4, the ARP module itself will 13850 * automatically start the DAD process when it 13851 * sees DL_NOTE_LINK_UP. We respond to the 13852 * AR_CN_READY at the completion of that task. 13853 * For IPv6, we must kick off the bring-up 13854 * process now. 
13855 				 */
13856 				ndp_do_recovery(ipif);
13857 			} else {
13858 				/*
13859 				 * Unfortunately, the first ipif is "special"
13860 				 * and represents the underlying ill in the
13861 				 * routing socket messages. Thus, when this
13862 				 * one ipif is down, we must still notify so
13863 				 * that the user knows the IFF_RUNNING status
13864 				 * change. (If the first ipif is up, then
13865 				 * we'll handle eventual routing socket
13866 				 * notification via DAD completion.)
13867 				 */
13868 				if (ipif == ill->ill_ipif)
13869 					ip_rts_ifmsg(ill->ill_ipif);
13870 			}
13871 		} else {
13872 			/*
13873 			 * After link down, we'll need to send a new routing
13874 			 * message when the link comes back, so clear
13875 			 * ipif_addr_ready.
13876 			 */
13877 			ipif->ipif_addr_ready = 0;
13878 		}
13879 	}
13880 
13881 	/*
13882 	 * If we've torn down links, then notify the user right away.
13883 	 */
13884 	if (!went_up)
13885 		ip_rts_ifmsg(ill->ill_ipif);
13886 }
13887 
13888 /*
13889  * Wake up all threads waiting to enter the ipsq, and sleeping
13890  * on any of the ills in this ipsq. The ill_lock of the ill
13891  * must be held so that waiters don't miss wakeups.
13892  */
13893 static void
13894 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
13895 {
13896 	phyint_t *phyint;
13897 
13898 	phyint = ipsq->ipsq_phyint_list;
13899 	while (phyint != NULL) {
13900 		if (phyint->phyint_illv4) {
13901 			if (!caller_holds_lock)
13902 				mutex_enter(&phyint->phyint_illv4->ill_lock);
13903 			ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
13904 			cv_broadcast(&phyint->phyint_illv4->ill_cv);
13905 			if (!caller_holds_lock)
13906 				mutex_exit(&phyint->phyint_illv4->ill_lock);
13907 		}
13908 		if (phyint->phyint_illv6) {
13909 			if (!caller_holds_lock)
13910 				mutex_enter(&phyint->phyint_illv6->ill_lock);
13911 			ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
13912 			cv_broadcast(&phyint->phyint_illv6->ill_cv);
13913 			if (!caller_holds_lock)
13914 				mutex_exit(&phyint->phyint_illv6->ill_lock);
13915 		}
13916 		phyint = phyint->phyint_ipsq_next;
13917 	}
13918 }
13919 
13920 static ipsq_t *
13921 ipsq_create(char *groupname)
13922 {
13923 	ipsq_t	*ipsq;
13924 
13925 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
13926 	ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
13927 	if (ipsq == NULL) {
13928 		return (NULL);
13929 	}
13930 
13931 	if (groupname != NULL)
13932 		(void) strcpy(ipsq->ipsq_name, groupname);
13933 	else
13934 		ipsq->ipsq_name[0] = '\0';
13935 
13936 	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
13937 	ipsq->ipsq_flags |= IPSQ_GROUP;
13938 	ipsq->ipsq_next = ipsq_g_head;
13939 	ipsq_g_head = ipsq;
13940 	return (ipsq);
13941 }
13942 
13943 /*
13944  * Return an ipsq corresponding to the groupname. If 'create' is true,
13945  * allocate a new ipsq if one does not exist. Usually an ipsq is associated
13946  * uniquely with an IPMP group. However, during IPMP groupname operations,
13947  * multiple IPMP groups may be associated with a single ipsq. But no
13948  * IPMP group can be associated with more than 1 ipsq at any time.
13949  * For example:
13950  *	Interfaces	IPMP grpname	ipsq	ipsq_name	ipsq_refs
13951  *	hme1, hme2	mpk17-84	ipsq1	mpk17-84	2
13952  *	hme3, hme4	mpk17-85	ipsq2	mpk17-85	2
13953  *
13954  * Now the command ifconfig hme3 group mpk17-84 results in the temporary
13955  * status shown below during the execution of the above command.
13956  *	hme1, hme2, hme3, hme4	mpk17-84, mpk17-85	ipsq1	mpk17-84  4
13957  *
13958  * After the completion of the above groupname command we return to the stable
13959  * state shown below.
13960 * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 13961 * hme4 mpk17-85 ipsq2 mpk17-85 1 13962 * 13963 * Because of the above, we don't search based on the ipsq_name since that 13964 * would miss the correct ipsq during certain windows as shown above. 13965 * The ipsq_name is only used during split of an ipsq to return the ipsq to its 13966 * natural state. 13967 */ 13968 static ipsq_t * 13969 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq) 13970 { 13971 ipsq_t *ipsq; 13972 int group_len; 13973 phyint_t *phyint; 13974 13975 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 13976 13977 group_len = strlen(groupname); 13978 ASSERT(group_len != 0); 13979 group_len++; 13980 13981 for (ipsq = ipsq_g_head; ipsq != NULL; ipsq = ipsq->ipsq_next) { 13982 /* 13983 * When an ipsq is being split, and ill_split_ipsq 13984 * calls this function, we exclude it from being considered. 13985 */ 13986 if (ipsq == exclude_ipsq) 13987 continue; 13988 13989 /* 13990 * Compare against the ipsq_name. The groupname change happens 13991 * in 2 phases. The 1st phase merges the from group into 13992 * the to group's ipsq, by calling ill_merge_groups and restarts 13993 * the ioctl. The 2nd phase then locates the ipsq again thru 13994 * ipsq_name. At this point the phyint_groupname has not been 13995 * updated. 13996 */ 13997 if ((group_len == strlen(ipsq->ipsq_name) + 1) && 13998 (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { 13999 /* 14000 * Verify that an ipmp groupname is exactly 14001 * part of 1 ipsq and is not found in any other 14002 * ipsq. 14003 */ 14004 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) == 14005 NULL); 14006 return (ipsq); 14007 } 14008 14009 /* 14010 * Comparison against ipsq_name alone is not sufficient. 14011 * In the case when groups are currently being 14012 * merged, the ipsq could hold other IPMP groups temporarily. 14013 * so we walk the phyint list and compare against the 14014 * phyint_groupname as well. 14015 */ 14016 phyint = ipsq->ipsq_phyint_list; 14017 while (phyint != NULL) { 14018 if ((group_len == phyint->phyint_groupname_len) && 14019 (bcmp(phyint->phyint_groupname, groupname, 14020 group_len) == 0)) { 14021 /* 14022 * Verify that an ipmp groupname is exactly 14023 * part of 1 ipsq and is not found in any other 14024 * ipsq. 14025 */ 14026 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) 14027 == NULL); 14028 return (ipsq); 14029 } 14030 phyint = phyint->phyint_ipsq_next; 14031 } 14032 } 14033 if (create) 14034 ipsq = ipsq_create(groupname); 14035 return (ipsq); 14036 } 14037 14038 static void 14039 ipsq_delete(ipsq_t *ipsq) 14040 { 14041 ipsq_t *nipsq; 14042 ipsq_t *pipsq = NULL; 14043 14044 /* 14045 * We don't hold the ipsq lock, but we are sure no new 14046 * messages can land up, since the ipsq_refs is zero. 14047 * i.e. this ipsq is unnamed and no phyint or phyint group 14048 * is associated with this ipsq. (Lookups are based on ill_name 14049 * or phyint_group_name) 14050 */ 14051 ASSERT(ipsq->ipsq_refs == 0); 14052 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); 14053 ASSERT(ipsq->ipsq_pending_mp == NULL); 14054 if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { 14055 /* 14056 * This is not the ipsq of an IPMP group. 14057 */ 14058 kmem_free(ipsq, sizeof (ipsq_t)); 14059 return; 14060 } 14061 14062 rw_enter(&ill_g_lock, RW_WRITER); 14063 14064 /* 14065 * Locate the ipsq before we can remove it from 14066 * the singly linked list of ipsq's. 
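	 *
	 * Sketch of the unlink performed below (trailing-pointer walk of
	 * the singly linked list):
	 *
	 *	pipsq->ipsq_next = ipsq->ipsq_next;	// ipsq in the middle
	 *	ipsq_g_head = ipsq->ipsq_next;		// ipsq was the head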
14067 */ 14068 for (nipsq = ipsq_g_head; nipsq != NULL; nipsq = nipsq->ipsq_next) { 14069 if (nipsq == ipsq) { 14070 break; 14071 } 14072 pipsq = nipsq; 14073 } 14074 14075 ASSERT(nipsq == ipsq); 14076 14077 /* unlink ipsq from the list */ 14078 if (pipsq != NULL) 14079 pipsq->ipsq_next = ipsq->ipsq_next; 14080 else 14081 ipsq_g_head = ipsq->ipsq_next; 14082 kmem_free(ipsq, sizeof (ipsq_t)); 14083 rw_exit(&ill_g_lock); 14084 } 14085 14086 static void 14087 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, 14088 queue_t *q) 14089 14090 { 14091 14092 ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); 14093 ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); 14094 ASSERT(old_ipsq->ipsq_pending_ipif == NULL); 14095 ASSERT(old_ipsq->ipsq_pending_mp == NULL); 14096 ASSERT(current_mp != NULL); 14097 14098 ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, 14099 NEW_OP, NULL); 14100 14101 ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && 14102 new_ipsq->ipsq_xopq_mphead != NULL); 14103 14104 /* 14105 * move from old ipsq to the new ipsq. 14106 */ 14107 new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; 14108 if (old_ipsq->ipsq_xopq_mphead != NULL) 14109 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; 14110 14111 old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; 14112 } 14113 14114 void 14115 ill_group_cleanup(ill_t *ill) 14116 { 14117 ill_t *ill_v4; 14118 ill_t *ill_v6; 14119 ipif_t *ipif; 14120 14121 ill_v4 = ill->ill_phyint->phyint_illv4; 14122 ill_v6 = ill->ill_phyint->phyint_illv6; 14123 14124 if (ill_v4 != NULL) { 14125 mutex_enter(&ill_v4->ill_lock); 14126 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14127 ipif = ipif->ipif_next) { 14128 IPIF_UNMARK_MOVING(ipif); 14129 } 14130 ill_v4->ill_up_ipifs = B_FALSE; 14131 mutex_exit(&ill_v4->ill_lock); 14132 } 14133 14134 if (ill_v6 != NULL) { 14135 mutex_enter(&ill_v6->ill_lock); 14136 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14137 ipif = ipif->ipif_next) { 14138 IPIF_UNMARK_MOVING(ipif); 14139 } 14140 ill_v6->ill_up_ipifs = B_FALSE; 14141 mutex_exit(&ill_v6->ill_lock); 14142 } 14143 } 14144 /* 14145 * This function is called when an ill has had a change in its group status 14146 * to bring up all the ipifs that were up before the change. 14147 */ 14148 int 14149 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 14150 { 14151 ipif_t *ipif; 14152 ill_t *ill_v4; 14153 ill_t *ill_v6; 14154 ill_t *from_ill; 14155 int err = 0; 14156 14157 14158 ASSERT(IAM_WRITER_ILL(ill)); 14159 14160 /* 14161 * Except for ipif_state_flags and ill_state_flags the other 14162 * fields of the ipif/ill that are modified below are protected 14163 * implicitly since we are a writer. We would have tried to down 14164 * even an ipif that was already down, in ill_down_ipifs. So we 14165 * just blindly clear the IPIF_CHANGING flag here on all ipifs. 14166 */ 14167 ill_v4 = ill->ill_phyint->phyint_illv4; 14168 ill_v6 = ill->ill_phyint->phyint_illv6; 14169 if (ill_v4 != NULL) { 14170 ill_v4->ill_up_ipifs = B_TRUE; 14171 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14172 ipif = ipif->ipif_next) { 14173 mutex_enter(&ill_v4->ill_lock); 14174 ipif->ipif_state_flags &= ~IPIF_CHANGING; 14175 IPIF_UNMARK_MOVING(ipif); 14176 mutex_exit(&ill_v4->ill_lock); 14177 if (ipif->ipif_was_up) { 14178 if (!(ipif->ipif_flags & IPIF_UP)) 14179 err = ipif_up(ipif, q, mp); 14180 ipif->ipif_was_up = B_FALSE; 14181 if (err != 0) { 14182 /* 14183 * Can there be any other error ? 
14184 					 */
14185 					ASSERT(err == EINPROGRESS);
14186 					return (err);
14187 				}
14188 			}
14189 		}
14190 		mutex_enter(&ill_v4->ill_lock);
14191 		ill_v4->ill_state_flags &= ~ILL_CHANGING;
14192 		mutex_exit(&ill_v4->ill_lock);
14193 		ill_v4->ill_up_ipifs = B_FALSE;
14194 		if (ill_v4->ill_move_in_progress) {
14195 			ASSERT(ill_v4->ill_move_peer != NULL);
14196 			ill_v4->ill_move_in_progress = B_FALSE;
14197 			from_ill = ill_v4->ill_move_peer;
14198 			from_ill->ill_move_in_progress = B_FALSE;
14199 			from_ill->ill_move_peer = NULL;
14200 			mutex_enter(&from_ill->ill_lock);
14201 			from_ill->ill_state_flags &= ~ILL_CHANGING;
14202 			mutex_exit(&from_ill->ill_lock);
14203 			if (ill_v6 == NULL) {
14204 				if (from_ill->ill_phyint->phyint_flags &
14205 				    PHYI_STANDBY) {
14206 					phyint_inactive(from_ill->ill_phyint);
14207 				}
14208 				if (ill_v4->ill_phyint->phyint_flags &
14209 				    PHYI_STANDBY) {
14210 					phyint_inactive(ill_v4->ill_phyint);
14211 				}
14212 			}
14213 			ill_v4->ill_move_peer = NULL;
14214 		}
14215 	}
14216 
14217 	if (ill_v6 != NULL) {
14218 		ill_v6->ill_up_ipifs = B_TRUE;
14219 		for (ipif = ill_v6->ill_ipif; ipif != NULL;
14220 		    ipif = ipif->ipif_next) {
14221 			mutex_enter(&ill_v6->ill_lock);
14222 			ipif->ipif_state_flags &= ~IPIF_CHANGING;
14223 			IPIF_UNMARK_MOVING(ipif);
14224 			mutex_exit(&ill_v6->ill_lock);
14225 			if (ipif->ipif_was_up) {
14226 				if (!(ipif->ipif_flags & IPIF_UP))
14227 					err = ipif_up(ipif, q, mp);
14228 				ipif->ipif_was_up = B_FALSE;
14229 				if (err != 0) {
14230 					/*
14231 					 * Can there be any other error?
14232 					 */
14233 					ASSERT(err == EINPROGRESS);
14234 					return (err);
14235 				}
14236 			}
14237 		}
14238 		mutex_enter(&ill_v6->ill_lock);
14239 		ill_v6->ill_state_flags &= ~ILL_CHANGING;
14240 		mutex_exit(&ill_v6->ill_lock);
14241 		ill_v6->ill_up_ipifs = B_FALSE;
14242 		if (ill_v6->ill_move_in_progress) {
14243 			ASSERT(ill_v6->ill_move_peer != NULL);
14244 			ill_v6->ill_move_in_progress = B_FALSE;
14245 			from_ill = ill_v6->ill_move_peer;
14246 			from_ill->ill_move_in_progress = B_FALSE;
14247 			from_ill->ill_move_peer = NULL;
14248 			mutex_enter(&from_ill->ill_lock);
14249 			from_ill->ill_state_flags &= ~ILL_CHANGING;
14250 			mutex_exit(&from_ill->ill_lock);
14251 			if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
14252 				phyint_inactive(from_ill->ill_phyint);
14253 			}
14254 			if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
14255 				phyint_inactive(ill_v6->ill_phyint);
14256 			}
14257 			ill_v6->ill_move_peer = NULL;
14258 		}
14259 	}
14260 	return (0);
14261 }
14262 
14263 /*
14264  * Bring down all the appropriate ipifs.
14265  */
14266 /* ARGSUSED */
14267 static void
14268 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover)
14269 {
14270 	ipif_t *ipif;
14271 
14272 	ASSERT(IAM_WRITER_ILL(ill));
14273 
14274 	/*
14275 	 * Except for ipif_state_flags, the other fields of the ipif/ill that
14276 	 * are modified below are protected implicitly since we are a writer.
14277 	 */
14278 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
14279 		if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER))
14280 			continue;
14281 		if (index == 0 || index == ipif->ipif_orig_ifindex) {
14282 			/*
14283 			 * We go through the ipif_down logic even if the ipif
14284 			 * is already down, since routes can be added based
14285 			 * on down ipifs. Going through ipif_down once again
14286 			 * will delete any IREs created based on these routes.
14287 			 */
14288 			if (ipif->ipif_flags & IPIF_UP)
14289 				ipif->ipif_was_up = B_TRUE;
14290 			/*
14291 			 * If called with chk_nofailover true, the ipif is moving.
14292 			 */
14293 			mutex_enter(&ill->ill_lock);
14294 			if (chk_nofailover) {
14295 				ipif->ipif_state_flags |=
14296 				    IPIF_MOVING | IPIF_CHANGING;
14297 			} else {
14298 				ipif->ipif_state_flags |= IPIF_CHANGING;
14299 			}
14300 			mutex_exit(&ill->ill_lock);
14301 			/*
14302 			 * Need to re-create net/subnet bcast ires if
14303 			 * they are dependent on ipif.
14304 			 */
14305 			if (!ipif->ipif_isv6)
14306 				ipif_check_bcast_ires(ipif);
14307 			(void) ipif_logical_down(ipif, NULL, NULL);
14308 			ipif_non_duplicate(ipif);
14309 			ipif_down_tail(ipif);
14310 			/*
14311 			 * We don't do ipif_multicast_down for IPv4 in
14312 			 * ipif_down. We need to set this so that
14313 			 * ipif_multicast_up will join the
14314 			 * ALLHOSTS_GROUP on to_ill.
14315 			 */
14316 			ipif->ipif_multicast_up = B_FALSE;
14317 		}
14318 	}
14319 }
14320 
14321 #define	IPSQ_INC_REF(ipsq)	{			\
14322 	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
14323 	(ipsq)->ipsq_refs++;				\
14324 }
14325 
14326 #define	IPSQ_DEC_REF(ipsq)	{			\
14327 	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
14328 	(ipsq)->ipsq_refs--;				\
14329 	if ((ipsq)->ipsq_refs == 0)			\
14330 		(ipsq)->ipsq_name[0] = '\0';		\
14331 }
14332 
14333 /*
14334  * Change the ipsq of all the ills whose current ipsq is 'cur_ipsq' to
14335  * new_ipsq.
14336  */
14337 static void
14338 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq)
14339 {
14340 	phyint_t *phyint;
14341 	phyint_t *next_phyint;
14342 
14343 	/*
14344 	 * To change the ipsq of an ill, we need to hold the ill_g_lock as
14345 	 * writer and the ill_lock of the ill in question. Also the dest
14346 	 * ipsq can't vanish while we hold the ill_g_lock as writer.
14347 	 */
14348 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
14349 
14350 	phyint = cur_ipsq->ipsq_phyint_list;
14351 	cur_ipsq->ipsq_phyint_list = NULL;
14352 	while (phyint != NULL) {
14353 		next_phyint = phyint->phyint_ipsq_next;
14354 		IPSQ_DEC_REF(cur_ipsq);
14355 		phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list;
14356 		new_ipsq->ipsq_phyint_list = phyint;
14357 		IPSQ_INC_REF(new_ipsq);
14358 		phyint->phyint_ipsq = new_ipsq;
14359 		phyint = next_phyint;
14360 	}
14361 }
14362 
14363 #define	SPLIT_SUCCESS		0
14364 #define	SPLIT_NOT_NEEDED	1
14365 #define	SPLIT_FAILED		2
14366 
14367 int
14368 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry)
14369 {
14370 	ipsq_t *newipsq = NULL;
14371 
14372 	/*
14373 	 * Assertions denote pre-requisites for changing the ipsq of
14374 	 * a phyint
14375 	 */
14376 	ASSERT(RW_WRITE_HELD(&ill_g_lock));
14377 	/*
14378 	 * <ill-phyint> assocs can't change while ill_g_lock
14379 	 * is held as writer. See ill_phyint_reinit()
14380 	 */
14381 	ASSERT(phyint->phyint_illv4 == NULL ||
14382 	    MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
14383 	ASSERT(phyint->phyint_illv6 == NULL ||
14384 	    MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
14385 
14386 	if ((phyint->phyint_groupname_len !=
14387 	    (strlen(cur_ipsq->ipsq_name) + 1) ||
14388 	    bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name,
14389 	    phyint->phyint_groupname_len) != 0)) {
14390 		/*
14391 		 * Once we fail in creating a new ipsq due to memory shortage,
14392 		 * don't attempt to create a new ipsq again, based on another
14393 		 * phyint, since we want all phyints belonging to an IPMP group
14394 		 * to be in the same ipsq even if memory allocation fails.
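		 *
		 * Hence 'create' below is passed as !need_retry (sketch of
		 * the call that follows):
		 *
		 *	// need_retry == B_TRUE => lookup only, never allocate
		 *	newipsq = ip_ipsq_lookup(grpname, !need_retry, cur_ipsq);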
14395 */ 14396 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, 14397 cur_ipsq); 14398 if (newipsq == NULL) { 14399 /* Memory allocation failure */ 14400 return (SPLIT_FAILED); 14401 } else { 14402 /* ipsq_refs protected by ill_g_lock (writer) */ 14403 IPSQ_DEC_REF(cur_ipsq); 14404 phyint->phyint_ipsq = newipsq; 14405 phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; 14406 newipsq->ipsq_phyint_list = phyint; 14407 IPSQ_INC_REF(newipsq); 14408 return (SPLIT_SUCCESS); 14409 } 14410 } 14411 return (SPLIT_NOT_NEEDED); 14412 } 14413 14414 /* 14415 * The ill locks of the phyint and the ill_g_lock (writer) must be held 14416 * to do this split 14417 */ 14418 static int 14419 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq) 14420 { 14421 ipsq_t *newipsq; 14422 14423 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14424 /* 14425 * <ill-phyint> assocs can't change while ill_g_lock 14426 * is held as writer. See ill_phyint_reinit() 14427 */ 14428 14429 ASSERT(phyint->phyint_illv4 == NULL || 14430 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14431 ASSERT(phyint->phyint_illv6 == NULL || 14432 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14433 14434 if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 14435 phyint->phyint_illv4: phyint->phyint_illv6)) { 14436 /* 14437 * ipsq_init failed due to no memory 14438 * caller will use the same ipsq 14439 */ 14440 return (SPLIT_FAILED); 14441 } 14442 14443 /* ipsq_ref is protected by ill_g_lock (writer) */ 14444 IPSQ_DEC_REF(cur_ipsq); 14445 14446 /* 14447 * This is a new ipsq that is unknown to the world. 14448 * So we don't need to hold ipsq_lock, 14449 */ 14450 newipsq = phyint->phyint_ipsq; 14451 newipsq->ipsq_writer = NULL; 14452 newipsq->ipsq_reentry_cnt--; 14453 ASSERT(newipsq->ipsq_reentry_cnt == 0); 14454 #ifdef ILL_DEBUG 14455 newipsq->ipsq_depth = 0; 14456 #endif 14457 14458 return (SPLIT_SUCCESS); 14459 } 14460 14461 /* 14462 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14463 * ipsq's representing their individual groups or themselves. Return 14464 * whether split needs to be retried again later. 14465 */ 14466 static boolean_t 14467 ill_split_ipsq(ipsq_t *cur_ipsq) 14468 { 14469 phyint_t *phyint; 14470 phyint_t *next_phyint; 14471 int error; 14472 boolean_t need_retry = B_FALSE; 14473 14474 phyint = cur_ipsq->ipsq_phyint_list; 14475 cur_ipsq->ipsq_phyint_list = NULL; 14476 while (phyint != NULL) { 14477 next_phyint = phyint->phyint_ipsq_next; 14478 /* 14479 * 'created' will tell us whether the callee actually 14480 * created an ipsq. Lack of memory may force the callee 14481 * to return without creating an ipsq. 14482 */ 14483 if (phyint->phyint_groupname == NULL) { 14484 error = ill_split_to_own_ipsq(phyint, cur_ipsq); 14485 } else { 14486 error = ill_split_to_grp_ipsq(phyint, cur_ipsq, 14487 need_retry); 14488 } 14489 14490 switch (error) { 14491 case SPLIT_FAILED: 14492 need_retry = B_TRUE; 14493 /* FALLTHRU */ 14494 case SPLIT_NOT_NEEDED: 14495 /* 14496 * Keep it on the list. 14497 */ 14498 phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; 14499 cur_ipsq->ipsq_phyint_list = phyint; 14500 break; 14501 case SPLIT_SUCCESS: 14502 break; 14503 default: 14504 ASSERT(0); 14505 } 14506 14507 phyint = next_phyint; 14508 } 14509 return (need_retry); 14510 } 14511 14512 /* 14513 * given an ipsq 'ipsq' lock all ills associated with this ipsq. 14514 * and return the ills in the list. This list will be 14515 * needed to unlock all the ills later on by the caller. 
14516 * The <ill-ipsq> associations could change between the 14517 * lock and unlock. Hence the unlock can't traverse the 14518 * ipsq to get the list of ills. 14519 */ 14520 static int 14521 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) 14522 { 14523 int cnt = 0; 14524 phyint_t *phyint; 14525 14526 /* 14527 * The caller holds ill_g_lock to ensure that the ill memberships 14528 * of the ipsq don't change 14529 */ 14530 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 14531 14532 phyint = ipsq->ipsq_phyint_list; 14533 while (phyint != NULL) { 14534 if (phyint->phyint_illv4 != NULL) { 14535 ASSERT(cnt < list_max); 14536 list[cnt++] = phyint->phyint_illv4; 14537 } 14538 if (phyint->phyint_illv6 != NULL) { 14539 ASSERT(cnt < list_max); 14540 list[cnt++] = phyint->phyint_illv6; 14541 } 14542 phyint = phyint->phyint_ipsq_next; 14543 } 14544 ill_lock_ills(list, cnt); 14545 return (cnt); 14546 } 14547 14548 void 14549 ill_lock_ills(ill_t **list, int cnt) 14550 { 14551 int i; 14552 14553 if (cnt > 1) { 14554 boolean_t try_again; 14555 do { 14556 try_again = B_FALSE; 14557 for (i = 0; i < cnt - 1; i++) { 14558 if (list[i] < list[i + 1]) { 14559 ill_t *tmp; 14560 14561 /* swap the elements */ 14562 tmp = list[i]; 14563 list[i] = list[i + 1]; 14564 list[i + 1] = tmp; 14565 try_again = B_TRUE; 14566 } 14567 } 14568 } while (try_again); 14569 } 14570 14571 for (i = 0; i < cnt; i++) { 14572 if (i == 0) { 14573 if (list[i] != NULL) 14574 mutex_enter(&list[i]->ill_lock); 14575 else 14576 return; 14577 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14578 mutex_enter(&list[i]->ill_lock); 14579 } 14580 } 14581 } 14582 14583 void 14584 ill_unlock_ills(ill_t **list, int cnt) 14585 { 14586 int i; 14587 14588 for (i = 0; i < cnt; i++) { 14589 if ((i == 0) && (list[i] != NULL)) { 14590 mutex_exit(&list[i]->ill_lock); 14591 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14592 mutex_exit(&list[i]->ill_lock); 14593 } 14594 } 14595 } 14596 14597 /* 14598 * Merge all the ills from 1 ipsq group into another ipsq group. 14599 * The source ipsq group is specified by the ipsq associated with 14600 * 'from_ill'. The destination ipsq group is specified by the ipsq 14601 * associated with 'to_ill' or 'groupname' respectively. 14602 * Note that ipsq itself does not have a reference count mechanism 14603 * and functions don't look up an ipsq and pass it around. Instead 14604 * functions pass around an ill or groupname, and the ipsq is looked 14605 * up from the ill or groupname and the required operation performed 14606 * atomically with the lookup on the ipsq. 14607 */ 14608 static int 14609 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, 14610 queue_t *q) 14611 { 14612 ipsq_t *old_ipsq; 14613 ipsq_t *new_ipsq; 14614 ill_t **ill_list; 14615 int cnt; 14616 size_t ill_list_size; 14617 boolean_t became_writer_on_new_sq = B_FALSE; 14618 14619 /* Exactly 1 of 'to_ill' and groupname can be specified. */ 14620 ASSERT((to_ill != NULL) ^ (groupname != NULL)); 14621 14622 /* 14623 * Need to hold ill_g_lock as writer and also the ill_lock to 14624 * change the <ill-ipsq> assoc of an ill. Need to hold the 14625 * ipsq_lock to prevent new messages from landing on an ipsq. 
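	 *
	 * The resulting lock/unlock pairing in this function is (sketch;
	 * the unlock must use the saved list because the <ill-ipsq>
	 * assocs may have changed by then):
	 *
	 *	cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);
	 *	...move messages, merge the ipsq...
	 *	ill_unlock_ills(ill_list, cnt);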
14626 	 */
14627 	rw_enter(&ill_g_lock, RW_WRITER);
14628 
14629 	old_ipsq = from_ill->ill_phyint->phyint_ipsq;
14630 	if (groupname != NULL)
14631 		new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL);
14632 	else {
14633 		new_ipsq = to_ill->ill_phyint->phyint_ipsq;
14634 	}
14635 
14636 	ASSERT(old_ipsq != NULL && new_ipsq != NULL);
14637 
14638 	/*
14639 	 * Both groups are on the same ipsq.
14640 	 */
14641 	if (old_ipsq == new_ipsq) {
14642 		rw_exit(&ill_g_lock);
14643 		return (0);
14644 	}
14645 
14646 	cnt = old_ipsq->ipsq_refs << 1;
14647 	ill_list_size = cnt * sizeof (ill_t *);
14648 	ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
14649 	if (ill_list == NULL) {
14650 		rw_exit(&ill_g_lock);
14651 		return (ENOMEM);
14652 	}
14653 	cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);
14654 
14655 	/* Need ipsq lock to enqueue messages on new ipsq or to become writer */
14656 	mutex_enter(&new_ipsq->ipsq_lock);
14657 	if ((new_ipsq->ipsq_writer == NULL &&
14658 	    new_ipsq->ipsq_current_ipif == NULL) ||
14659 	    (new_ipsq->ipsq_writer == curthread)) {
14660 		new_ipsq->ipsq_writer = curthread;
14661 		new_ipsq->ipsq_reentry_cnt++;
14662 		became_writer_on_new_sq = B_TRUE;
14663 	}
14664 
14665 	/*
14666 	 * We are holding ill_g_lock as writer and all the ill locks of
14667 	 * the old ipsq. So the old_ipsq can't be looked up, and hence no new
14668 	 * message can land up on the old ipsq even though we don't hold the
14669 	 * ipsq_lock of the old_ipsq. Now move all messages to the new ipsq.
14670 	 */
14671 	ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);
14672 
14673 	/*
14674 	 * Now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
14675 	 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
14676 	 * assocs till we release the ill_g_lock, and hence it can't vanish.
14677 	 */
14678 	ill_merge_ipsq(old_ipsq, new_ipsq);
14679 
14680 	/*
14681 	 * Mark the new ipsq as needing a split since it is currently
14682 	 * being shared by more than 1 IPMP group. The split will
14683 	 * occur at the end of ipsq_exit.
14684 	 */
14685 	new_ipsq->ipsq_split = B_TRUE;
14686 
14687 	/* Now release all the locks */
14688 	mutex_exit(&new_ipsq->ipsq_lock);
14689 	ill_unlock_ills(ill_list, cnt);
14690 	rw_exit(&ill_g_lock);
14691 
14692 	kmem_free(ill_list, ill_list_size);
14693 
14694 	/*
14695 	 * If we succeeded in becoming writer on the new ipsq, then
14696 	 * drain the new ipsq and start processing all enqueued messages,
14697 	 * including the current ioctl we are processing, which is either
14698 	 * a set groupname or failover/failback.
14699 	 */
14700 	if (became_writer_on_new_sq)
14701 		ipsq_exit(new_ipsq, B_TRUE, B_TRUE);
14702 
14703 	/*
14704 	 * The syncq has been changed and all the messages have been moved.
14705 	 */
14706 	mutex_enter(&old_ipsq->ipsq_lock);
14707 	old_ipsq->ipsq_current_ipif = NULL;
14708 	mutex_exit(&old_ipsq->ipsq_lock);
14709 	return (EINPROGRESS);
14710 }
14711 
14712 /*
14713  * Delete and add the loopback copy and non-loopback copy of
14714  * the BROADCAST ire corresponding to ill and addr. Used to
14715  * group broadcast ires together when ill becomes part of
14716  * a group.
14717  *
14718  * This function is also called when ill is leaving the group
14719  * so that the ires belonging to the group get re-grouped.
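 *
 * In sketch form, each matching BROADCAST ire is cloned, deleted and
 * re-added so that ire_add_v4() re-sorts it into its group:
 *
 *	nire = ...;			// ire_init() from ire's fields
 *	ire_delete(ire);
 *	(void) ire_add(&nire, NULL, NULL, NULL, B_FALSE);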
14720  */
14721 static void
14722 ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
14723 {
14724 	ire_t *ire, *nire, *nire_next, *ire_head = NULL;
14725 	ire_t **ire_ptpn = &ire_head;
14726 
14727 	/*
14728 	 * The loopback and non-loopback IREs are inserted in the order in which
14729 	 * they're found, on the basis that they are correctly ordered (loopback
14730 	 * first).
14731 	 */
14732 	for (;;) {
14733 		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
14734 		    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);
14735 		if (ire == NULL)
14736 			break;
14737 
14738 		/*
14739 		 * We are passing in KM_SLEEP because it is not easy to
14740 		 * go back to a sane state in case of memory failure.
14741 		 */
14742 		nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
14743 		ASSERT(nire != NULL);
14744 		bzero(nire, sizeof (ire_t));
14745 		/*
14746 		 * Don't use ire_max_frag directly since we don't
14747 		 * hold on to 'ire' until we add the new ire 'nire' and
14748 		 * we don't want the new ire to have a dangling reference
14749 		 * to 'ire'. The ire_max_frag of a broadcast ire must
14750 		 * be in sync with the ipif_mtu of the associated ipif.
14751 		 * For example, this happens as a result of SIOCSLIFNAME,
14752 		 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by
14753 		 * the driver. A change in ire_max_frag triggered
14754 		 * as a result of path MTU discovery, or due to an
14755 		 * IP_IOC_IRE_ADVISE_NOREPLY from the transport, or due to a
14756 		 * 'route change -mtu' command does not apply to broadcast ires.
14757 		 *
14758 		 * XXX We need a recovery strategy here if ire_init fails
14759 		 */
14760 		if (ire_init(nire,
14761 		    (uchar_t *)&ire->ire_addr,
14762 		    (uchar_t *)&ire->ire_mask,
14763 		    (uchar_t *)&ire->ire_src_addr,
14764 		    (uchar_t *)&ire->ire_gateway_addr,
14765 		    (uchar_t *)&ire->ire_in_src_addr,
14766 		    ire->ire_stq == NULL ? &ip_loopback_mtu :
14767 		    &ire->ire_ipif->ipif_mtu,
14768 		    (ire->ire_nce != NULL ? ire->ire_nce->nce_fp_mp : NULL),
14769 		    ire->ire_rfq,
14770 		    ire->ire_stq,
14771 		    ire->ire_type,
14772 		    (ire->ire_nce != NULL ? ire->ire_nce->nce_res_mp : NULL),
14773 		    ire->ire_ipif,
14774 		    ire->ire_in_ill,
14775 		    ire->ire_cmask,
14776 		    ire->ire_phandle,
14777 		    ire->ire_ihandle,
14778 		    ire->ire_flags,
14779 		    &ire->ire_uinfo,
14780 		    NULL,
14781 		    NULL) == NULL) {
14782 			cmn_err(CE_PANIC, "ire_init() failed");
14783 		}
14784 		ire_delete(ire);
14785 		ire_refrele(ire);
14786 
14787 		/*
14788 		 * The newly created IREs are inserted at the tail of the list
14789 		 * starting with ire_head. As we've just allocated them no one
14790 		 * knows about them so it's safe.
14791 		 */
14792 		*ire_ptpn = nire;
14793 		ire_ptpn = &nire->ire_next;
14794 	}
14795 
14796 	for (nire = ire_head; nire != NULL; nire = nire_next) {
14797 		int error;
14798 		ire_t *oire;
14799 		/* unlink the IRE from our list before calling ire_add() */
14800 		nire_next = nire->ire_next;
14801 		nire->ire_next = NULL;
14802 
14803 		/* ire_add adds the ire at the right place in the list */
14804 		oire = nire;
14805 		error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
14806 		ASSERT(error == 0);
14807 		ASSERT(oire == nire);
14808 		ire_refrele(nire);	/* Held in ire_add */
14809 	}
14810 }
14811 
14812 /*
14813  * This function is usually called when an ill is inserted in
14814  * a group and all the ipifs are already UP. As all the ipifs
14815  * are already UP, the broadcast ires have already been created
14816  * and been inserted. But, ire_add_v4 would not have grouped properly.
14817 * We need to re-group for the benefit of ip_wput_ire which 14818 * expects BROADCAST ires to be grouped properly to avoid sending 14819 * more than one copy of the broadcast packet per group. 14820 * 14821 * NOTE : We don't check for ill_ipif_up_count to be non-zero here 14822 * because when ipif_up_done ends up calling this, ires have 14823 * already been added before illgrp_insert i.e before ill_group 14824 * has been initialized. 14825 */ 14826 static void 14827 ill_group_bcast_for_xmit(ill_t *ill) 14828 { 14829 ill_group_t *illgrp; 14830 ipif_t *ipif; 14831 ipaddr_t addr; 14832 ipaddr_t net_mask; 14833 ipaddr_t subnet_netmask; 14834 14835 illgrp = ill->ill_group; 14836 14837 /* 14838 * This function is called even when an ill is deleted from 14839 * the group. Hence, illgrp could be null. 14840 */ 14841 if (illgrp != NULL && illgrp->illgrp_ill_count == 1) 14842 return; 14843 14844 /* 14845 * Delete all the BROADCAST ires matching this ill and add 14846 * them back. This time, ire_add_v4 should take care of 14847 * grouping them with others because ill is part of the 14848 * group. 14849 */ 14850 ill_bcast_delete_and_add(ill, 0); 14851 ill_bcast_delete_and_add(ill, INADDR_BROADCAST); 14852 14853 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14854 14855 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14856 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14857 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14858 } else { 14859 net_mask = htonl(IN_CLASSA_NET); 14860 } 14861 addr = net_mask & ipif->ipif_subnet; 14862 ill_bcast_delete_and_add(ill, addr); 14863 ill_bcast_delete_and_add(ill, ~net_mask | addr); 14864 14865 subnet_netmask = ipif->ipif_net_mask; 14866 addr = ipif->ipif_subnet; 14867 ill_bcast_delete_and_add(ill, addr); 14868 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); 14869 } 14870 } 14871 14872 /* 14873 * This function is called from illgrp_delete when ill is being deleted 14874 * from the group. 14875 * 14876 * As ill is not there in the group anymore, any address belonging 14877 * to this ill should be cleared of IRE_MARK_NORECV. 14878 */ 14879 static void 14880 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) 14881 { 14882 ire_t *ire; 14883 irb_t *irb; 14884 14885 ASSERT(ill->ill_group == NULL); 14886 14887 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 14888 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL); 14889 14890 if (ire != NULL) { 14891 /* 14892 * IPMP and plumbing operations are serialized on the ipsq, so 14893 * no one will insert or delete a broadcast ire under our feet. 14894 */ 14895 irb = ire->ire_bucket; 14896 rw_enter(&irb->irb_lock, RW_READER); 14897 ire_refrele(ire); 14898 14899 for (; ire != NULL; ire = ire->ire_next) { 14900 if (ire->ire_addr != addr) 14901 break; 14902 if (ire_to_ill(ire) != ill) 14903 continue; 14904 14905 ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); 14906 ire->ire_marks &= ~IRE_MARK_NORECV; 14907 } 14908 rw_exit(&irb->irb_lock); 14909 } 14910 } 14911 14912 /* 14913 * This function must be called only after the broadcast ires 14914 * have been grouped together. For a given address addr, nominate 14915 * only one of the ires whose interface is not FAILED or OFFLINE. 14916 * 14917 * This is also called when an ipif goes down, so that we can nominate 14918 * a different ire with the same address for receiving. 
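 *
 * The invariant maintained below, in sketch form: among the group's
 * BROADCAST ires for a given address, exactly one usable ire ends up
 * with IRE_MARK_NORECV clear:
 *
 *	for each ire in the group with this address:
 *		if (FAILED/INACTIVE/OFFLINE interface)
 *			ire->ire_marks |= IRE_MARK_NORECV;
 *		else if (first usable ire)
 *			ire->ire_marks &= ~IRE_MARK_NORECV;	// nominee
 *		else
 *			ire->ire_marks |= IRE_MARK_NORECV;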
14919  */
14920 static void
14921 ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr)
14922 {
14923 	irb_t *irb;
14924 	ire_t *ire;
14925 	ire_t *ire1;
14926 	ire_t *save_ire;
14927 	ire_t **irep = NULL;
14928 	boolean_t first = B_TRUE;
14929 	ire_t *clear_ire = NULL;
14930 	ire_t *start_ire = NULL;
14931 	ire_t	*new_lb_ire;
14932 	ire_t	*new_nlb_ire;
14933 	boolean_t new_lb_ire_used = B_FALSE;
14934 	boolean_t new_nlb_ire_used = B_FALSE;
14935 	uint64_t match_flags;
14936 	uint64_t phyi_flags;
14937 	boolean_t fallback = B_FALSE;
14938 
14939 	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
14940 	    NULL, MATCH_IRE_TYPE);
14941 	/*
14942 	 * We may not be able to find some ires if a previous
14943 	 * ire_create failed. This happens when an ipif goes
14944 	 * down and we are unable to create BROADCAST ires due
14945 	 * to memory failure. Thus, we have to check for NULL
14946 	 * below. This should handle the case for LOOPBACK,
14947 	 * POINTOPOINT and interfaces with some POINTOPOINT
14948 	 * logicals for which there are no BROADCAST ires.
14949 	 */
14950 	if (ire == NULL)
14951 		return;
14952 	/*
14953 	 * Currently IRE_BROADCASTs are deleted when an ipif
14954 	 * goes down, which runs exclusively. Thus, setting
14955 	 * IRE_MARK_RCVD should not race with ire_delete marking
14956 	 * IRE_MARK_CONDEMNED. We grab the lock below just to
14957 	 * be consistent with other parts of the code that walk
14958 	 * a given bucket.
14959 	 */
14960 	save_ire = ire;
14961 	irb = ire->ire_bucket;
14962 	new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
14963 	if (new_lb_ire == NULL) {
14964 		ire_refrele(ire);
14965 		return;
14966 	}
14967 	new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
14968 	if (new_nlb_ire == NULL) {
14969 		ire_refrele(ire);
14970 		kmem_cache_free(ire_cache, new_lb_ire);
14971 		return;
14972 	}
14973 	IRB_REFHOLD(irb);
14974 	rw_enter(&irb->irb_lock, RW_WRITER);
14975 	/*
14976 	 * Get to the first ire matching the address and the
14977 	 * group. If the address does not match, we are done
14978 	 * as we could not find the IRE. If the address matches,
14979 	 * we should get to the first one matching the group.
14980 	 */
14981 	while (ire != NULL) {
14982 		if (ire->ire_addr != addr ||
14983 		    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
14984 			break;
14985 		}
14986 		ire = ire->ire_next;
14987 	}
14988 	match_flags = PHYI_FAILED | PHYI_INACTIVE;
14989 	start_ire = ire;
14990 redo:
14991 	while (ire != NULL && ire->ire_addr == addr &&
14992 	    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
14993 		/*
14994 		 * The first ire for any address within a group
14995 		 * should always be the one with IRE_MARK_NORECV cleared
14996 		 * so that ip_wput_ire can avoid searching for one.
14997 		 * Note down the insertion point, which will be used
14998 		 * later.
14999 		 */
15000 		if (first && (irep == NULL))
15001 			irep = ire->ire_ptpn;
15002 		/*
15003 		 * PHYI_FAILED is set when the interface fails.
15004 		 * This interface might have become good, but the
15005 		 * daemon has not yet detected it. We should still
15006 		 * not receive on this. PHYI_OFFLINE should never
15007 		 * be picked, as this interface has been offlined and
15008 		 * will soon be removed.
15009 */ 15010 phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; 15011 if (phyi_flags & PHYI_OFFLINE) { 15012 ire->ire_marks |= IRE_MARK_NORECV; 15013 ire = ire->ire_next; 15014 continue; 15015 } 15016 if (phyi_flags & match_flags) { 15017 ire->ire_marks |= IRE_MARK_NORECV; 15018 ire = ire->ire_next; 15019 if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == 15020 PHYI_INACTIVE) { 15021 fallback = B_TRUE; 15022 } 15023 continue; 15024 } 15025 if (first) { 15026 /* 15027 * We will move this to the front of the list later 15028 * on. 15029 */ 15030 clear_ire = ire; 15031 ire->ire_marks &= ~IRE_MARK_NORECV; 15032 } else { 15033 ire->ire_marks |= IRE_MARK_NORECV; 15034 } 15035 first = B_FALSE; 15036 ire = ire->ire_next; 15037 } 15038 /* 15039 * If we never nominated anybody, try nominating at least 15040 * an INACTIVE, if we found one. Do it only once though. 15041 */ 15042 if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && 15043 fallback) { 15044 match_flags = PHYI_FAILED; 15045 ire = start_ire; 15046 irep = NULL; 15047 goto redo; 15048 } 15049 ire_refrele(save_ire); 15050 15051 /* 15052 * irep non-NULL indicates that we entered the while loop 15053 * above. If clear_ire is at the insertion point, we don't 15054 * have to do anything. clear_ire will be NULL if all the 15055 * interfaces are failed. 15056 * 15057 * We cannot unlink and reinsert the ire at the right place 15058 * in the list since there can be other walkers of this bucket. 15059 * Instead we delete and recreate the ire 15060 */ 15061 if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { 15062 ire_t *clear_ire_stq = NULL; 15063 mblk_t *fp_mp = NULL, *res_mp = NULL; 15064 15065 bzero(new_lb_ire, sizeof (ire_t)); 15066 if (clear_ire->ire_nce != NULL) { 15067 fp_mp = clear_ire->ire_nce->nce_fp_mp; 15068 res_mp = clear_ire->ire_nce->nce_res_mp; 15069 } 15070 /* XXX We need a recovery strategy here. */ 15071 if (ire_init(new_lb_ire, 15072 (uchar_t *)&clear_ire->ire_addr, 15073 (uchar_t *)&clear_ire->ire_mask, 15074 (uchar_t *)&clear_ire->ire_src_addr, 15075 (uchar_t *)&clear_ire->ire_gateway_addr, 15076 (uchar_t *)&clear_ire->ire_in_src_addr, 15077 &clear_ire->ire_max_frag, 15078 fp_mp, 15079 clear_ire->ire_rfq, 15080 clear_ire->ire_stq, 15081 clear_ire->ire_type, 15082 res_mp, 15083 clear_ire->ire_ipif, 15084 clear_ire->ire_in_ill, 15085 clear_ire->ire_cmask, 15086 clear_ire->ire_phandle, 15087 clear_ire->ire_ihandle, 15088 clear_ire->ire_flags, 15089 &clear_ire->ire_uinfo, 15090 NULL, 15091 NULL) == NULL) 15092 cmn_err(CE_PANIC, "ire_init() failed"); 15093 if (clear_ire->ire_stq == NULL) { 15094 ire_t *ire_next = clear_ire->ire_next; 15095 if (ire_next != NULL && 15096 ire_next->ire_stq != NULL && 15097 ire_next->ire_addr == clear_ire->ire_addr && 15098 ire_next->ire_ipif->ipif_ill == 15099 clear_ire->ire_ipif->ipif_ill) { 15100 clear_ire_stq = ire_next; 15101 15102 bzero(new_nlb_ire, sizeof (ire_t)); 15103 if (clear_ire_stq->ire_nce != NULL) { 15104 fp_mp = 15105 clear_ire_stq->ire_nce->nce_fp_mp; 15106 res_mp = 15107 clear_ire_stq->ire_nce->nce_res_mp; 15108 } else { 15109 fp_mp = res_mp = NULL; 15110 } 15111 /* XXX We need a recovery strategy here. 
*/ 15112 if (ire_init(new_nlb_ire, 15113 (uchar_t *)&clear_ire_stq->ire_addr, 15114 (uchar_t *)&clear_ire_stq->ire_mask, 15115 (uchar_t *)&clear_ire_stq->ire_src_addr, 15116 (uchar_t *)&clear_ire_stq->ire_gateway_addr, 15117 (uchar_t *)&clear_ire_stq->ire_in_src_addr, 15118 &clear_ire_stq->ire_max_frag, 15119 fp_mp, 15120 clear_ire_stq->ire_rfq, 15121 clear_ire_stq->ire_stq, 15122 clear_ire_stq->ire_type, 15123 res_mp, 15124 clear_ire_stq->ire_ipif, 15125 clear_ire_stq->ire_in_ill, 15126 clear_ire_stq->ire_cmask, 15127 clear_ire_stq->ire_phandle, 15128 clear_ire_stq->ire_ihandle, 15129 clear_ire_stq->ire_flags, 15130 &clear_ire_stq->ire_uinfo, 15131 NULL, 15132 NULL) == NULL) 15133 cmn_err(CE_PANIC, "ire_init() failed"); 15134 } 15135 } 15136 15137 /* 15138 * Delete the ire. We can't call ire_delete() since 15139 * we are holding the bucket lock. We can't release the 15140 * bucket lock since we can't allow irep to change. So just 15141 * mark it CONDEMNED. The IRB_REFRELE will delete the 15142 * ire from the list and do the refrele. 15143 */ 15144 clear_ire->ire_marks |= IRE_MARK_CONDEMNED; 15145 irb->irb_marks |= IRB_MARK_CONDEMNED; 15146 15147 if (clear_ire_stq != NULL) { 15148 ire_fastpath_list_delete( 15149 (ill_t *)clear_ire_stq->ire_stq->q_ptr, 15150 clear_ire_stq); 15151 clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; 15152 } 15153 15154 /* 15155 * Also take care of otherfields like ib/ob pkt count 15156 * etc. Need to dup them. ditto in ill_bcast_delete_and_add 15157 */ 15158 15159 /* Add the new ire's. Insert at *irep */ 15160 new_lb_ire->ire_bucket = clear_ire->ire_bucket; 15161 ire1 = *irep; 15162 if (ire1 != NULL) 15163 ire1->ire_ptpn = &new_lb_ire->ire_next; 15164 new_lb_ire->ire_next = ire1; 15165 /* Link the new one in. */ 15166 new_lb_ire->ire_ptpn = irep; 15167 membar_producer(); 15168 *irep = new_lb_ire; 15169 new_lb_ire_used = B_TRUE; 15170 BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted); 15171 new_lb_ire->ire_bucket->irb_ire_cnt++; 15172 new_lb_ire->ire_ipif->ipif_ire_cnt++; 15173 15174 if (clear_ire_stq != NULL) { 15175 new_nlb_ire->ire_bucket = clear_ire->ire_bucket; 15176 irep = &new_lb_ire->ire_next; 15177 /* Add the new ire. Insert at *irep */ 15178 ire1 = *irep; 15179 if (ire1 != NULL) 15180 ire1->ire_ptpn = &new_nlb_ire->ire_next; 15181 new_nlb_ire->ire_next = ire1; 15182 /* Link the new one in. */ 15183 new_nlb_ire->ire_ptpn = irep; 15184 membar_producer(); 15185 *irep = new_nlb_ire; 15186 new_nlb_ire_used = B_TRUE; 15187 BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted); 15188 new_nlb_ire->ire_bucket->irb_ire_cnt++; 15189 new_nlb_ire->ire_ipif->ipif_ire_cnt++; 15190 ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++; 15191 } 15192 } 15193 rw_exit(&irb->irb_lock); 15194 if (!new_lb_ire_used) 15195 kmem_cache_free(ire_cache, new_lb_ire); 15196 if (!new_nlb_ire_used) 15197 kmem_cache_free(ire_cache, new_nlb_ire); 15198 IRB_REFRELE(irb); 15199 } 15200 15201 /* 15202 * Whenever an ipif goes down we have to renominate a different 15203 * broadcast ire to receive. Whenever an ipif comes up, we need 15204 * to make sure that we have only one nominated to receive. 
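 *
 * As a worked illustration of the addresses renominated below: a
 * hypothetical, compiled-out helper computing the four directed
 * broadcast addresses for an ipif, using the same classful
 * ip_net_mask() logic as the code that follows:
 */
#ifdef notdef
static void
bcast_addrs_sketch(ipif_t *ipif, ipaddr_t addrs[4])
{
	ipaddr_t net_mask;

	if (ipif->ipif_lcl_addr != INADDR_ANY &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL))
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	else
		net_mask = htonl(IN_CLASSA_NET);

	addrs[0] = net_mask & ipif->ipif_subnet;	/* net address */
	addrs[1] = ~net_mask | addrs[0];		/* net broadcast */
	addrs[2] = ipif->ipif_subnet;			/* subnet address */
	addrs[3] = ~ipif->ipif_net_mask | addrs[2];	/* subnet broadcast */
}
#endif	/* notdef */
/*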
 */
static void
ipif_renominate_bcast(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	ipaddr_t subnet_addr;
	ipaddr_t net_addr;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_netmask;
	ipaddr_t addr;
	ill_group_t *illgrp;

	illgrp = ill->ill_group;
	/*
	 * If this is the last ipif going down, it might take
	 * the ill out of the group. In that case ipif_down ->
	 * illgrp_delete takes care of doing the nomination.
	 * ipif_down does not call this function for that case.
	 */
	ASSERT(illgrp != NULL);

	/* There could not have been any ires associated with this */
	if (ipif->ipif_subnet == 0)
		return;

	ill_mark_bcast(illgrp, 0);
	ill_mark_bcast(illgrp, INADDR_BROADCAST);

	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET);
	}
	addr = net_mask & ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	net_addr = ~net_mask | addr;
	ill_mark_bcast(illgrp, net_addr);

	subnet_netmask = ipif->ipif_net_mask;
	addr = ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	subnet_addr = ~subnet_netmask | addr;
	ill_mark_bcast(illgrp, subnet_addr);
}

/*
 * Whenever we form or delete ill groups, we need to nominate one set of
 * BROADCAST ires for receiving in the group.
 *
 * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires
 *    have been added, but ill_ipif_up_count is 0. Thus, we don't assert
 *    for ill_ipif_up_count to be non-zero. This is the only case where
 *    ill_ipif_up_count is zero and we would still find the ires.
 *
 * 2) When ip_sioctl_groupname/illgrp_insert calls this function, at least
 *    one ipif is UP and we just have to do the nomination.
 *
 * 3) When ill_handoff_responsibility calls us, some ill has been removed
 *    from the group. So, we have to do the nomination.
 *
 * Because of (3), there could be just one ill in the group. But we still
 * have to nominate, as IRE_MARK_NORECV may have been marked on this ill.
 * Thus, this function does not optimize when there is only one ill, as
 * that would not be correct for (3).
 */
static void
ill_nominate_bcast_rcv(ill_group_t *illgrp)
{
	ill_t *ill;
	ipif_t *ipif;
	ipaddr_t subnet_addr;
	ipaddr_t prev_subnet_addr = 0;
	ipaddr_t net_addr;
	ipaddr_t prev_net_addr = 0;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_netmask;
	ipaddr_t addr;

	/*
	 * When the last member is leaving, there is nothing to
	 * nominate.
	 */
	if (illgrp->illgrp_ill_count == 0) {
		ASSERT(illgrp->illgrp_ill == NULL);
		return;
	}

	ill = illgrp->illgrp_ill;
	ASSERT(!ill->ill_isv6);
	/*
	 * We assume that ires with the same address, belonging to the
	 * same group, have been grouped together. Nominating a *single*
	 * ill in the group for sending and receiving broadcast is done
	 * by making sure that the first BROADCAST ire (which will be
	 * the one returned by ire_ctable_lookup for ip_rput and the
	 * one that will be used in ip_wput_ire) will be the one that
	 * will not have IRE_MARK_NORECV set.
	 *
	 * 1) ip_rput checks and discards packets received on ires marked
	 *    with IRE_MARK_NORECV. Thus, we don't send up duplicate
	 *    broadcast packets. We need to clear IRE_MARK_NORECV on the
	 *    first ire in the group for every broadcast address in the
	 *    group. ip_rput will accept packets only on that first ire,
	 *    i.e. only one copy per ill group is sent up.
	 *
	 * 2) ip_wput_ire needs to send out just one copy of the broadcast
	 *    packet for the whole group. It needs to send out on the ill
	 *    whose ire has not been marked with IRE_MARK_NORECV. If it sends
	 *    on the one marked with IRE_MARK_NORECV, ip_rput will accept
	 *    the copy echoed back on the other port where the ire is not
	 *    marked with IRE_MARK_NORECV.
	 *
	 * Note that we just need to have the first IRE either loopback or
	 * non-loopback (either of them may not exist if ire_create failed
	 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will
	 * always hit the first one and hence will always accept one copy.
	 *
	 * We have a broadcast ire per ill for all the unique prefixes
	 * hosted on that ill. As we don't have a way of knowing the
	 * unique prefixes on a given ill and hence in the whole group,
	 * we just call ill_mark_bcast on all the prefixes that exist
	 * in the group. For the common case of one prefix, the code
	 * below optimizes by remembering the last address used for
	 * marking. In the case of multiple prefixes, this will still
	 * optimize depending on the order of prefixes.
	 *
	 * The only addresses that are the same across the whole group are
	 * 0.0.0.0 and 255.255.255.255, and thus we mark each of them only
	 * once. ill_mark_bcast enables the first ire in the bucket for
	 * receiving and disables the others.
	 */
	ill_mark_bcast(illgrp, 0);
	ill_mark_bcast(illgrp, INADDR_BROADCAST);
	for (; ill != NULL; ill = ill->ill_group_next) {

		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {

			if (!(ipif->ipif_flags & IPIF_UP) ||
			    ipif->ipif_subnet == 0) {
				continue;
			}
			if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
			    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
				net_mask = ip_net_mask(ipif->ipif_lcl_addr);
			} else {
				net_mask = htonl(IN_CLASSA_NET);
			}
			addr = net_mask & ipif->ipif_subnet;
			if (prev_net_addr == 0 || prev_net_addr != addr) {
				ill_mark_bcast(illgrp, addr);
				net_addr = ~net_mask | addr;
				ill_mark_bcast(illgrp, net_addr);
			}
			prev_net_addr = addr;

			subnet_netmask = ipif->ipif_net_mask;
			addr = ipif->ipif_subnet;
			if (prev_subnet_addr == 0 ||
			    prev_subnet_addr != addr) {
				ill_mark_bcast(illgrp, addr);
				subnet_addr = ~subnet_netmask | addr;
				ill_mark_bcast(illgrp, subnet_addr);
			}
			prev_subnet_addr = addr;
		}
	}
}

/*
 * This function is called while forming ill groups.
 *
 * Currently, we handle only allmulti groups. We want to join
 * allmulti on only one of the ills in the groups. In the future,
 * when we have link aggregation, we may have to join normal
 * multicast groups on multiple ills as the switch does inbound load
 * balancing. The following functions call this function:
 *
 * 1) ill_recover_multicast : Interface is coming back UP.
 *    When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6
 *    will call ill_recover_multicast to recover all the multicast
 *    groups. We need to make sure that only one member is joined
 *    in the ill group.
 *
 * 2) ip_addmulti/ip_addmulti_v6 : ill groups have already been formed.
 *    Somebody is joining allmulti. We need to make sure that only one
 *    member is joined in the group.
 *
 * 3) illgrp_insert : If allmulti has already been joined, we need to
 *    make sure that only one member is joined in the group.
 *
 * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving
 *    allmulti whom we have nominated. We need to pick some other ill.
 *
 * 5) illgrp_delete : The ill we nominated is leaving the group,
 *    so we need to pick a new ill to join the group.
 *
 * For (1), (2) and (5) - we just have to check whether there is
 * a good ill joined in the group. If we could not find any ill
 * joined in the group, we should join ourselves.
 *
 * For (4), the one that was nominated to receive, left the group.
 * There could be nobody joined in the group when this function is
 * called.
 *
 * For (3) - we need to explicitly check whether there are multiple
 * ills joined in the group.
 *
 * For simplicity, we don't differentiate any of the above cases. We
 * just leave the group if it is joined on any of them and join on
 * the first good ill.
 */
int
ill_nominate_mcast_rcv(ill_group_t *illgrp)
{
	ilm_t *ilm;
	ill_t *ill;
	ill_t *fallback_inactive_ill = NULL;
	ill_t *fallback_failed_ill = NULL;
	int ret = 0;

	/*
	 * Leave allmulti on all the ills and start fresh.
	 */
	for (ill = illgrp->illgrp_ill; ill != NULL;
	    ill = ill->ill_group_next) {
		if (ill->ill_join_allmulti)
			(void) ip_leave_allmulti(ill->ill_ipif);
	}

	/*
	 * Choose a good ill. Fall back to an inactive or failed ill if
	 * none is available. We need to fall back to FAILED in the
	 * case where we have two interfaces in a group - where
	 * one of them has failed and the other is a good one and
	 * the good one (not marked inactive) is leaving the group.
	 */
	ret = 0;
	for (ill = illgrp->illgrp_ill; ill != NULL;
	    ill = ill->ill_group_next) {
		/* Never pick an offline interface */
		if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE)
			continue;

		if (ill->ill_phyint->phyint_flags & PHYI_FAILED) {
			fallback_failed_ill = ill;
			continue;
		}
		if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) {
			fallback_inactive_ill = ill;
			continue;
		}
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
				ret = ip_join_allmulti(ill->ill_ipif);
				/*
				 * ip_join_allmulti can fail because of memory
				 * failures. So, make sure we join at least
				 * on one ill.
				 */
				if (ill->ill_join_allmulti)
					return (0);
			}
		}
	}
	if (ret != 0) {
		/*
		 * If we tried nominating above and failed to do so,
		 * return the error. We might have tried multiple times,
		 * but return the latest error.
15479 */ 15480 return (ret); 15481 } 15482 if ((ill = fallback_inactive_ill) != NULL) { 15483 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15484 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15485 ret = ip_join_allmulti(ill->ill_ipif); 15486 return (ret); 15487 } 15488 } 15489 } else if ((ill = fallback_failed_ill) != NULL) { 15490 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15491 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15492 ret = ip_join_allmulti(ill->ill_ipif); 15493 return (ret); 15494 } 15495 } 15496 } 15497 return (0); 15498 } 15499 15500 /* 15501 * This function is called from illgrp_delete after it is 15502 * deleted from the group to reschedule responsibilities 15503 * to a different ill. 15504 */ 15505 static void 15506 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 15507 { 15508 ilm_t *ilm; 15509 ipif_t *ipif; 15510 ipaddr_t subnet_addr; 15511 ipaddr_t net_addr; 15512 ipaddr_t net_mask = 0; 15513 ipaddr_t subnet_netmask; 15514 ipaddr_t addr; 15515 15516 ASSERT(ill->ill_group == NULL); 15517 /* 15518 * Broadcast Responsibility: 15519 * 15520 * 1. If this ill has been nominated for receiving broadcast 15521 * packets, we need to find a new one. Before we find a new 15522 * one, we need to re-group the ires that are part of this new 15523 * group (assumed by ill_nominate_bcast_rcv). We do this by 15524 * calling ill_group_bcast_for_xmit(ill) which will do the right 15525 * thing for us. 15526 * 15527 * 2. If this ill was not nominated for receiving broadcast 15528 * packets, we need to clear the IRE_MARK_NORECV flag 15529 * so that we continue to send up broadcast packets. 15530 */ 15531 if (!ill->ill_isv6) { 15532 /* 15533 * Case 1 above : No optimization here. Just redo the 15534 * nomination. 15535 */ 15536 ill_group_bcast_for_xmit(ill); 15537 ill_nominate_bcast_rcv(illgrp); 15538 15539 /* 15540 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 15541 */ 15542 ill_clear_bcast_mark(ill, 0); 15543 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 15544 15545 for (ipif = ill->ill_ipif; ipif != NULL; 15546 ipif = ipif->ipif_next) { 15547 15548 if (!(ipif->ipif_flags & IPIF_UP) || 15549 ipif->ipif_subnet == 0) { 15550 continue; 15551 } 15552 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15553 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15554 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15555 } else { 15556 net_mask = htonl(IN_CLASSA_NET); 15557 } 15558 addr = net_mask & ipif->ipif_subnet; 15559 ill_clear_bcast_mark(ill, addr); 15560 15561 net_addr = ~net_mask | addr; 15562 ill_clear_bcast_mark(ill, net_addr); 15563 15564 subnet_netmask = ipif->ipif_net_mask; 15565 addr = ipif->ipif_subnet; 15566 ill_clear_bcast_mark(ill, addr); 15567 15568 subnet_addr = ~subnet_netmask | addr; 15569 ill_clear_bcast_mark(ill, subnet_addr); 15570 } 15571 } 15572 15573 /* 15574 * Multicast Responsibility. 15575 * 15576 * If we have joined allmulti on this one, find a new member 15577 * in the group to join allmulti. As this ill is already part 15578 * of allmulti, we don't have to join on this one. 15579 * 15580 * If we have not joined allmulti on this one, there is no 15581 * responsibility to handoff. But we need to take new 15582 * responsibility i.e, join allmulti on this one if we need 15583 * to. 
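	 *
	 * For example (illustrative): if this ill was the one joined on
	 * allmulti, ill_nominate_mcast_rcv() below re-nominates some other
	 * member of illgrp; otherwise, the code below joins allmulti on
	 * this ill itself if one of its ilms is the unspecified (allmulti)
	 * address.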
15584 */ 15585 if (ill->ill_join_allmulti) { 15586 (void) ill_nominate_mcast_rcv(illgrp); 15587 } else { 15588 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15589 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15590 (void) ip_join_allmulti(ill->ill_ipif); 15591 break; 15592 } 15593 } 15594 } 15595 15596 /* 15597 * We intentionally do the flushing of IRE_CACHES only matching 15598 * on the ill and not on groups. Note that we are already deleted 15599 * from the group. 15600 * 15601 * This will make sure that all IRE_CACHES whose stq is pointing 15602 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 15603 * deleted and IRE_CACHES that are not pointing at this ill will 15604 * be left alone. 15605 */ 15606 if (ill->ill_isv6) { 15607 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15608 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15609 } else { 15610 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15611 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15612 } 15613 15614 /* 15615 * Some conn may have cached one of the IREs deleted above. By removing 15616 * the ire reference, we clean up the extra reference to the ill held in 15617 * ire->ire_stq. 15618 */ 15619 ipcl_walk(conn_cleanup_stale_ire, NULL); 15620 15621 /* 15622 * Re-do source address selection for all the members in the 15623 * group, if they borrowed source address from one of the ipifs 15624 * in this ill. 15625 */ 15626 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15627 if (ill->ill_isv6) { 15628 ipif_update_other_ipifs_v6(ipif, illgrp); 15629 } else { 15630 ipif_update_other_ipifs(ipif, illgrp); 15631 } 15632 } 15633 } 15634 15635 /* 15636 * Delete the ill from the group. The caller makes sure that it is 15637 * in a group and it okay to delete from the group. So, we always 15638 * delete here. 15639 */ 15640 static void 15641 illgrp_delete(ill_t *ill) 15642 { 15643 ill_group_t *illgrp; 15644 ill_group_t *tmpg; 15645 ill_t *tmp_ill; 15646 15647 /* 15648 * Reset illgrp_ill_schednext if it was pointing at us. 15649 * We need to do this before we set ill_group to NULL. 15650 */ 15651 rw_enter(&ill_g_lock, RW_WRITER); 15652 mutex_enter(&ill->ill_lock); 15653 15654 illgrp_reset_schednext(ill); 15655 15656 illgrp = ill->ill_group; 15657 15658 /* Delete the ill from illgrp. */ 15659 if (illgrp->illgrp_ill == ill) { 15660 illgrp->illgrp_ill = ill->ill_group_next; 15661 } else { 15662 tmp_ill = illgrp->illgrp_ill; 15663 while (tmp_ill->ill_group_next != ill) { 15664 tmp_ill = tmp_ill->ill_group_next; 15665 ASSERT(tmp_ill != NULL); 15666 } 15667 tmp_ill->ill_group_next = ill->ill_group_next; 15668 } 15669 ill->ill_group = NULL; 15670 ill->ill_group_next = NULL; 15671 15672 illgrp->illgrp_ill_count--; 15673 mutex_exit(&ill->ill_lock); 15674 rw_exit(&ill_g_lock); 15675 15676 /* 15677 * As this ill is leaving the group, we need to hand off 15678 * the responsibilities to the other ills in the group, if 15679 * this ill had some responsibilities. 
15680 */ 15681 15682 ill_handoff_responsibility(ill, illgrp); 15683 15684 rw_enter(&ill_g_lock, RW_WRITER); 15685 15686 if (illgrp->illgrp_ill_count == 0) { 15687 15688 ASSERT(illgrp->illgrp_ill == NULL); 15689 if (ill->ill_isv6) { 15690 if (illgrp == illgrp_head_v6) { 15691 illgrp_head_v6 = illgrp->illgrp_next; 15692 } else { 15693 tmpg = illgrp_head_v6; 15694 while (tmpg->illgrp_next != illgrp) { 15695 tmpg = tmpg->illgrp_next; 15696 ASSERT(tmpg != NULL); 15697 } 15698 tmpg->illgrp_next = illgrp->illgrp_next; 15699 } 15700 } else { 15701 if (illgrp == illgrp_head_v4) { 15702 illgrp_head_v4 = illgrp->illgrp_next; 15703 } else { 15704 tmpg = illgrp_head_v4; 15705 while (tmpg->illgrp_next != illgrp) { 15706 tmpg = tmpg->illgrp_next; 15707 ASSERT(tmpg != NULL); 15708 } 15709 tmpg->illgrp_next = illgrp->illgrp_next; 15710 } 15711 } 15712 mutex_destroy(&illgrp->illgrp_lock); 15713 mi_free(illgrp); 15714 } 15715 rw_exit(&ill_g_lock); 15716 15717 /* 15718 * Even though the ill is out of the group its not necessary 15719 * to set ipsq_split as TRUE as the ipifs could be down temporarily 15720 * We will split the ipsq when phyint_groupname is set to NULL. 15721 */ 15722 15723 /* 15724 * Send a routing sockets message if we are deleting from 15725 * groups with names. 15726 */ 15727 if (ill->ill_phyint->phyint_groupname_len != 0) 15728 ip_rts_ifmsg(ill->ill_ipif); 15729 } 15730 15731 /* 15732 * Re-do source address selection. This is normally called when 15733 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 15734 * ipif comes up. 15735 */ 15736 void 15737 ill_update_source_selection(ill_t *ill) 15738 { 15739 ipif_t *ipif; 15740 15741 ASSERT(IAM_WRITER_ILL(ill)); 15742 15743 if (ill->ill_group != NULL) 15744 ill = ill->ill_group->illgrp_ill; 15745 15746 for (; ill != NULL; ill = ill->ill_group_next) { 15747 for (ipif = ill->ill_ipif; ipif != NULL; 15748 ipif = ipif->ipif_next) { 15749 if (ill->ill_isv6) 15750 ipif_recreate_interface_routes_v6(NULL, ipif); 15751 else 15752 ipif_recreate_interface_routes(NULL, ipif); 15753 } 15754 } 15755 } 15756 15757 /* 15758 * Insert ill in a group headed by illgrp_head. The caller can either 15759 * pass a groupname in which case we search for a group with the 15760 * same name to insert in or pass a group to insert in. This function 15761 * would only search groups with names. 15762 * 15763 * NOTE : The caller should make sure that there is at least one ipif 15764 * UP on this ill so that illgrp_scheduler can pick this ill 15765 * for outbound packets. If ill_ipif_up_count is zero, we have 15766 * already sent a DL_UNBIND to the driver and we don't want to 15767 * send anymore packets. We don't assert for ipif_up_count 15768 * to be greater than zero, because ipif_up_done wants to call 15769 * this function before bumping up the ipif_up_count. See 15770 * ipif_up_done() for details. 15771 */ 15772 int 15773 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 15774 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 15775 { 15776 ill_group_t *illgrp; 15777 ill_t *prev_ill; 15778 phyint_t *phyi; 15779 15780 ASSERT(ill->ill_group == NULL); 15781 15782 rw_enter(&ill_g_lock, RW_WRITER); 15783 mutex_enter(&ill->ill_lock); 15784 15785 if (groupname != NULL) { 15786 /* 15787 * Look for a group with a matching groupname to insert. 
15788 */ 15789 for (illgrp = *illgrp_head; illgrp != NULL; 15790 illgrp = illgrp->illgrp_next) { 15791 15792 ill_t *tmp_ill; 15793 15794 /* 15795 * If we have an ill_group_t in the list which has 15796 * no ill_t assigned then we must be in the process of 15797 * removing this group. We skip this as illgrp_delete() 15798 * will remove it from the list. 15799 */ 15800 if ((tmp_ill = illgrp->illgrp_ill) == NULL) { 15801 ASSERT(illgrp->illgrp_ill_count == 0); 15802 continue; 15803 } 15804 15805 ASSERT(tmp_ill->ill_phyint != NULL); 15806 phyi = tmp_ill->ill_phyint; 15807 /* 15808 * Look at groups which has names only. 15809 */ 15810 if (phyi->phyint_groupname_len == 0) 15811 continue; 15812 /* 15813 * Names are stored in the phyint common to both 15814 * IPv4 and IPv6. 15815 */ 15816 if (mi_strcmp(phyi->phyint_groupname, 15817 groupname) == 0) { 15818 break; 15819 } 15820 } 15821 } else { 15822 /* 15823 * If the caller passes in a NULL "grp_to_insert", we 15824 * allocate one below and insert this singleton. 15825 */ 15826 illgrp = grp_to_insert; 15827 } 15828 15829 ill->ill_group_next = NULL; 15830 15831 if (illgrp == NULL) { 15832 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); 15833 if (illgrp == NULL) { 15834 return (ENOMEM); 15835 } 15836 illgrp->illgrp_next = *illgrp_head; 15837 *illgrp_head = illgrp; 15838 illgrp->illgrp_ill = ill; 15839 illgrp->illgrp_ill_count = 1; 15840 ill->ill_group = illgrp; 15841 /* 15842 * Used in illgrp_scheduler to protect multiple threads 15843 * from traversing the list. 15844 */ 15845 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); 15846 } else { 15847 ASSERT(ill->ill_net_type == 15848 illgrp->illgrp_ill->ill_net_type); 15849 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); 15850 15851 /* Insert ill at tail of this group */ 15852 prev_ill = illgrp->illgrp_ill; 15853 while (prev_ill->ill_group_next != NULL) 15854 prev_ill = prev_ill->ill_group_next; 15855 prev_ill->ill_group_next = ill; 15856 ill->ill_group = illgrp; 15857 illgrp->illgrp_ill_count++; 15858 /* 15859 * Inherit group properties. Currently only forwarding 15860 * is the property we try to keep the same with all the 15861 * ills. When there are more, we will abstract this into 15862 * a function. 15863 */ 15864 ill->ill_flags &= ~ILLF_ROUTER; 15865 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); 15866 } 15867 mutex_exit(&ill->ill_lock); 15868 rw_exit(&ill_g_lock); 15869 15870 /* 15871 * 1) When ipif_up_done() calls this function, ipif_up_count 15872 * may be zero as it has not yet been bumped. But the ires 15873 * have already been added. So, we do the nomination here 15874 * itself. But, when ip_sioctl_groupname calls this, it checks 15875 * for ill_ipif_up_count != 0. Thus we don't check for 15876 * ill_ipif_up_count here while nominating broadcast ires for 15877 * receive. 15878 * 15879 * 2) Similarly, we need to call ill_group_bcast_for_xmit here 15880 * to group them properly as ire_add() has already happened 15881 * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert 15882 * case, we need to do it here anyway. 15883 */ 15884 if (!ill->ill_isv6) { 15885 ill_group_bcast_for_xmit(ill); 15886 ill_nominate_bcast_rcv(illgrp); 15887 } 15888 15889 if (!ipif_is_coming_up) { 15890 /* 15891 * When ipif_up_done() calls this function, the multicast 15892 * groups have not been joined yet. So, there is no point in 15893 * nomination. ip_join_allmulti will handle groups when 15894 * ill_recover_multicast is called from ipif_up_done() later. 
		 */
		(void) ill_nominate_mcast_rcv(illgrp);
		/*
		 * ipif_up_done calls ill_update_source_selection
		 * anyway. Moreover, we don't want to re-create
		 * interface routes while ipif_up_done() still has a
		 * reference to them. Refer to ipif_up_done() for more
		 * details.
		 */
		ill_update_source_selection(ill);
	}

	/*
	 * Send a routing sockets message if we are inserting into
	 * groups with names.
	 */
	if (groupname != NULL)
		ip_rts_ifmsg(ill->ill_ipif);
	return (0);
}

/*
 * Return the first phyint matching the groupname. There could
 * be more than one when there are ill groups.
 *
 * Needs work: called only from ip_sioctl_groupname
 */
static phyint_t *
phyint_lookup_group(char *groupname)
{
	phyint_t *phyi;

	ASSERT(RW_LOCK_HELD(&ill_g_lock));
	/*
	 * Group names are stored in the phyint - a common structure
	 * to both IPv4 and IPv6.
	 */
	phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index);
	for (; phyi != NULL;
	    phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index,
	    phyi, AVL_AFTER)) {
		if (phyi->phyint_groupname_len == 0)
			continue;
		ASSERT(phyi->phyint_groupname != NULL);
		if (mi_strcmp(groupname, phyi->phyint_groupname) == 0)
			return (phyi);
	}
	return (NULL);
}

/*
 * MT notes on creation and deletion of IPMP groups
 *
 * Creation and deletion of IPMP groups introduce the need to merge or
 * split the associated serialization objects, i.e. the ipsqs. Normally all
 * the ills in an IPMP group map to a single ipsq. If IPMP is not enabled,
 * an ill pair (v4, v6), i.e. a phyint, maps to a single ipsq. However,
 * during the execution of the SIOCSLIFGROUPNAME command the picture changes.
 * There is a need to change the <ill-ipsq> association, and we have to
 * operate on both the source and destination IPMP groups. For example,
 * attempting to set the groupname of hme0 to mpk17-85 when it already
 * belongs to mpk17-84 has to handle 2 IPMP groups and 2 ipsqs. All the ills
 * belonging to either the source or the destination IPMP group are mapped
 * to a single ipsq for executing the SIOCSLIFGROUPNAME command. This is
 * termed a merge of the ipsqs. The <ill-ipsq> mapping is restored back to
 * normal at a later point; this is termed a split of the ipsq. The converse
 * of the merge, i.e. the split of the ipsq, happens while unwinding from
 * ipsq_exit. If at least one set-groupname operation occurred on the ipsq,
 * then the ipsq_split flag is set. This indicates that the ipsq has to be
 * examined for redoing the <ill-ipsq> associations.
 *
 * In the above example the ioctl handling code locates the current ipsq
 * of hme0, which is ipsq(mpk17-84). It then enters the above ipsq
 * immediately or eventually (after queueing the ioctl in ipsq(mpk17-84)).
 * Then it locates the destination ipsq, which is ipsq(mpk17-85), and merges
 * the source ipsq into the destination ipsq. If the destination ipsq is not
 * busy, it also enters the destination ipsq exclusively. Now the actual
 * groupname setting operation can proceed. If the destination ipsq is busy,
 * the operation is enqueued on the destination (merged) ipsq and will be
 * handled in the unwind from ipsq_exit.
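 *
 * In outline, using the calls made by ip_sioctl_groupname() below
 * (a sketch of the control flow, not additional code):
 *
 *	ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL);
 *	if (phyi->phyint_ipsq != ipsq)
 *		err = ill_merge_groups(ill, NULL, groupname, mp, q);
 *	...
 *	phyi->phyint_ipsq->ipsq_split = B_TRUE;	(examined in ipsq_exit)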
 *
 * To prevent other threads from accessing the ill while the group name
 * change is in progress, we bring down the ipifs, which also removes the
 * ill from the group. The group is changed in the phyint, and when the
 * first ipif on the ill is brought up, the ill is inserted into the right
 * IPMP group by illgrp_insert.
 */
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	int i;
	char *tmp;
	int namelen;
	ill_t *ill = ipif->ipif_ill;
	ill_t *ill_v4, *ill_v6;
	int err = 0;
	phyint_t *phyi;
	phyint_t *phyi_tmp;
	struct lifreq *lifr;
	mblk_t *mp1;
	char *groupname;
	ipsq_t *ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	lifr = (struct lifreq *)mp1->b_rptr;
	groupname = lifr->lifr_groupname;

	if (ipif->ipif_id != 0)
		return (EINVAL);

	phyi = ill->ill_phyint;
	ASSERT(phyi != NULL);

	if (phyi->phyint_flags & PHYI_VIRTUAL)
		return (EINVAL);

	tmp = groupname;
	for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
		;

	if (i == LIFNAMSIZ) {
		/* no null termination */
		return (EINVAL);
	}

	/*
	 * Calculate the namelen exclusive of the null
	 * termination character.
	 */
	namelen = tmp - groupname;

	ill_v4 = phyi->phyint_illv4;
	ill_v6 = phyi->phyint_illv6;

	/*
	 * An ill cannot be part of a usesrc group and an IPMP group at the
	 * same time. No need to grab the ill_g_usesrc_lock here; see the
	 * synchronization notes in ip.c
	 */
	if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
		return (EINVAL);
	}

	/*
	 * Mark the ill as changing.
	 * This should queue all new requests on the syncq.
	 */
	GRAB_ILL_LOCKS(ill_v4, ill_v6);

	if (ill_v4 != NULL)
		ill_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_v6 != NULL)
		ill_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_v4, ill_v6);

	if (namelen == 0) {
		/*
		 * A null string means remove this interface from the
		 * existing group.
		 */
		if (phyi->phyint_groupname_len == 0) {
			/*
			 * Never was in a group.
			 */
			err = 0;
			goto done;
		}

		/*
		 * IPv4 or IPv6 may be temporarily out of the group when all
		 * the ipifs are down. Thus, we need to check for ill_group to
		 * be non-NULL.
		 */
		if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
			ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
			mutex_enter(&ill_v4->ill_lock);
			if (!ill_is_quiescent(ill_v4)) {
				/*
				 * ipsq_pending_mp_add will not fail since
				 * connp is NULL.
				 */
				(void) ipsq_pending_mp_add(NULL,
				    ill_v4->ill_ipif, q, mp, ILL_DOWN);
				mutex_exit(&ill_v4->ill_lock);
				err = EINPROGRESS;
				goto done;
			}
			mutex_exit(&ill_v4->ill_lock);
		}

		if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
			ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
			mutex_enter(&ill_v6->ill_lock);
			if (!ill_is_quiescent(ill_v6)) {
				(void) ipsq_pending_mp_add(NULL,
				    ill_v6->ill_ipif, q, mp, ILL_DOWN);
				mutex_exit(&ill_v6->ill_lock);
				err = EINPROGRESS;
				goto done;
			}
			mutex_exit(&ill_v6->ill_lock);
		}

		rw_enter(&ill_g_lock, RW_WRITER);
		GRAB_ILL_LOCKS(ill_v4, ill_v6);
		mutex_enter(&phyi->phyint_lock);
		ASSERT(phyi->phyint_groupname != NULL);
		mi_free(phyi->phyint_groupname);
		phyi->phyint_groupname = NULL;
		phyi->phyint_groupname_len = 0;
		mutex_exit(&phyi->phyint_lock);
		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
		rw_exit(&ill_g_lock);
		err = ill_up_ipifs(ill, q, mp);

		/*
		 * Set the split flag so that the ipsq can be split.
		 */
		mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
		phyi->phyint_ipsq->ipsq_split = B_TRUE;
		mutex_exit(&phyi->phyint_ipsq->ipsq_lock);

	} else {
		if (phyi->phyint_groupname_len != 0) {
			ASSERT(phyi->phyint_groupname != NULL);
			/* Are we inserting into the same group? */
			if (mi_strcmp(groupname,
			    phyi->phyint_groupname) == 0) {
				err = 0;
				goto done;
			}
		}

		rw_enter(&ill_g_lock, RW_READER);
		/*
		 * Merge the ipsqs for the groups.
		 * This check is here as multiple groups/ills might be
		 * sharing the same ipsq.
		 * If we have to merge, then the operation is restarted
		 * on the new ipsq.
		 */
		ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL);
		if (phyi->phyint_ipsq != ipsq) {
			rw_exit(&ill_g_lock);
			err = ill_merge_groups(ill, NULL, groupname, mp, q);
			goto done;
		}
		/*
		 * Running exclusive on the new ipsq.
		 */

		ASSERT(ipsq != NULL);
		ASSERT(ipsq->ipsq_writer == curthread);

		/*
		 * Check whether the ill_type and ill_net_type match before
		 * we allocate any memory so that the cleanup is easier.
		 *
		 * We can't group dissimilar ones as we can't load-spread
		 * packets across the group because of potential link-level
		 * header differences.
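		 *
		 * For example (illustrative only), an Ethernet ill and a
		 * tunnel ill could not share a group: their
		 * ill_net_type/ill_type, and hence their link-level
		 * headers, differ.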
16161 */ 16162 phyi_tmp = phyint_lookup_group(groupname); 16163 if (phyi_tmp != NULL) { 16164 if ((ill_v4 != NULL && 16165 phyi_tmp->phyint_illv4 != NULL) && 16166 ((ill_v4->ill_net_type != 16167 phyi_tmp->phyint_illv4->ill_net_type) || 16168 (ill_v4->ill_type != 16169 phyi_tmp->phyint_illv4->ill_type))) { 16170 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16171 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16172 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16173 rw_exit(&ill_g_lock); 16174 return (EINVAL); 16175 } 16176 if ((ill_v6 != NULL && 16177 phyi_tmp->phyint_illv6 != NULL) && 16178 ((ill_v6->ill_net_type != 16179 phyi_tmp->phyint_illv6->ill_net_type) || 16180 (ill_v6->ill_type != 16181 phyi_tmp->phyint_illv6->ill_type))) { 16182 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16183 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16184 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16185 rw_exit(&ill_g_lock); 16186 return (EINVAL); 16187 } 16188 } 16189 16190 rw_exit(&ill_g_lock); 16191 16192 /* 16193 * bring down all v4 ipifs. 16194 */ 16195 if (ill_v4 != NULL) { 16196 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16197 } 16198 16199 /* 16200 * bring down all v6 ipifs. 16201 */ 16202 if (ill_v6 != NULL) { 16203 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16204 } 16205 16206 /* 16207 * make sure all ipifs are down and there are no active 16208 * references. Call to ipsq_pending_mp_add will not fail 16209 * since connp is NULL. 16210 */ 16211 if (ill_v4 != NULL) { 16212 mutex_enter(&ill_v4->ill_lock); 16213 if (!ill_is_quiescent(ill_v4)) { 16214 (void) ipsq_pending_mp_add(NULL, 16215 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16216 mutex_exit(&ill_v4->ill_lock); 16217 err = EINPROGRESS; 16218 goto done; 16219 } 16220 mutex_exit(&ill_v4->ill_lock); 16221 } 16222 16223 if (ill_v6 != NULL) { 16224 mutex_enter(&ill_v6->ill_lock); 16225 if (!ill_is_quiescent(ill_v6)) { 16226 (void) ipsq_pending_mp_add(NULL, 16227 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16228 mutex_exit(&ill_v6->ill_lock); 16229 err = EINPROGRESS; 16230 goto done; 16231 } 16232 mutex_exit(&ill_v6->ill_lock); 16233 } 16234 16235 /* 16236 * allocate including space for null terminator 16237 * before we insert. 16238 */ 16239 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 16240 if (tmp == NULL) 16241 return (ENOMEM); 16242 16243 rw_enter(&ill_g_lock, RW_WRITER); 16244 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16245 mutex_enter(&phyi->phyint_lock); 16246 if (phyi->phyint_groupname_len != 0) { 16247 ASSERT(phyi->phyint_groupname != NULL); 16248 mi_free(phyi->phyint_groupname); 16249 } 16250 16251 /* 16252 * setup the new group name. 16253 */ 16254 phyi->phyint_groupname = tmp; 16255 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 16256 phyi->phyint_groupname_len = namelen + 1; 16257 mutex_exit(&phyi->phyint_lock); 16258 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16259 rw_exit(&ill_g_lock); 16260 16261 err = ill_up_ipifs(ill, q, mp); 16262 } 16263 16264 done: 16265 /* 16266 * normally ILL_CHANGING is cleared in ill_up_ipifs. 
16267 */ 16268 if (err != EINPROGRESS) { 16269 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16270 if (ill_v4 != NULL) 16271 ill_v4->ill_state_flags &= ~ILL_CHANGING; 16272 if (ill_v6 != NULL) 16273 ill_v6->ill_state_flags &= ~ILL_CHANGING; 16274 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16275 } 16276 return (err); 16277 } 16278 16279 /* ARGSUSED */ 16280 int 16281 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 16282 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 16283 { 16284 ill_t *ill; 16285 phyint_t *phyi; 16286 struct lifreq *lifr; 16287 mblk_t *mp1; 16288 16289 /* Existence verified in ip_wput_nondata */ 16290 mp1 = mp->b_cont->b_cont; 16291 lifr = (struct lifreq *)mp1->b_rptr; 16292 ill = ipif->ipif_ill; 16293 phyi = ill->ill_phyint; 16294 16295 lifr->lifr_groupname[0] = '\0'; 16296 /* 16297 * ill_group may be null if all the interfaces 16298 * are down. But still, the phyint should always 16299 * hold the name. 16300 */ 16301 if (phyi->phyint_groupname_len != 0) { 16302 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 16303 phyi->phyint_groupname_len); 16304 } 16305 16306 return (0); 16307 } 16308 16309 16310 typedef struct conn_move_s { 16311 ill_t *cm_from_ill; 16312 ill_t *cm_to_ill; 16313 int cm_ifindex; 16314 } conn_move_t; 16315 16316 /* 16317 * ipcl_walk function for moving conn_multicast_ill for a given ill. 16318 */ 16319 static void 16320 conn_move(conn_t *connp, caddr_t arg) 16321 { 16322 conn_move_t *connm; 16323 int ifindex; 16324 int i; 16325 ill_t *from_ill; 16326 ill_t *to_ill; 16327 ilg_t *ilg; 16328 ilm_t *ret_ilm; 16329 16330 connm = (conn_move_t *)arg; 16331 ifindex = connm->cm_ifindex; 16332 from_ill = connm->cm_from_ill; 16333 to_ill = connm->cm_to_ill; 16334 16335 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */ 16336 16337 /* All multicast fields protected by conn_lock */ 16338 mutex_enter(&connp->conn_lock); 16339 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 16340 if ((connp->conn_outgoing_ill == from_ill) && 16341 (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { 16342 connp->conn_outgoing_ill = to_ill; 16343 connp->conn_incoming_ill = to_ill; 16344 } 16345 16346 /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ 16347 16348 if ((connp->conn_multicast_ill == from_ill) && 16349 (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { 16350 connp->conn_multicast_ill = connm->cm_to_ill; 16351 } 16352 16353 /* Change IP_XMIT_IF associations */ 16354 if ((connp->conn_xmit_if_ill == from_ill) && 16355 (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) { 16356 connp->conn_xmit_if_ill = to_ill; 16357 } 16358 /* 16359 * Change the ilg_ill to point to the new one. This assumes 16360 * ilm_move_v6 has moved the ilms to new_ill and the driver 16361 * has been told to receive packets on this interface. 16362 * ilm_move_v6 FAILBACKS all the ilms successfully always. 16363 * But when doing a FAILOVER, it might fail with ENOMEM and so 16364 * some ilms may not have moved. We check to see whether 16365 * the ilms have moved to to_ill. We can't check on from_ill 16366 * as in the process of moving, we could have split an ilm 16367 * in to two - which has the same orig_ifindex and v6group. 16368 * 16369 * For IPv4, ilg_ipif moves implicitly. The code below really 16370 * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
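	 *
	 * For example (illustrative summary of the loop below): on a
	 * failback (ifindex != 0) every matching ilg is simply repointed
	 * at to_ill; on a failover (ifindex == 0) an ilg is repointed
	 * only if the corresponding ilm is found on to_ill.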
	 */
	for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
		ilg = &connp->conn_ilg[i];
		if ((ilg->ilg_ill == from_ill) &&
		    (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
			/* ifindex != 0 indicates failback */
			if (ifindex != 0) {
				connp->conn_ilg[i].ilg_ill = to_ill;
				continue;
			}

			ret_ilm = ilm_lookup_ill_index_v6(to_ill,
			    &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
			    connp->conn_zoneid);

			if (ret_ilm != NULL)
				connp->conn_ilg[i].ilg_ill = to_ill;
		}
	}
	mutex_exit(&connp->conn_lock);
}

static void
conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
{
	conn_move_t connm;

	connm.cm_from_ill = from_ill;
	connm.cm_to_ill = to_ill;
	connm.cm_ifindex = ifindex;

	ipcl_walk(conn_move, (caddr_t)&connm);
}

/*
 * An ilm has been moved from from_ill to to_ill.
 * Send DL_DISABMULTI_REQ on from_ill and DL_ENABMULTI_REQ on to_ill,
 * as appropriate.
 *
 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
 *	  the code there de-references ipif_ill to get the ill to
 *	  send multicast requests. That does not work here, as the ipif
 *	  is on the move and has already moved when this function is
 *	  called. Thus, we need to use from_ill and to_ill to send down
 *	  the multicast requests.
 */
static void
ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
{
	ipif_t *ipif;
	ilm_t *ilm;

	/*
	 * See whether we need to send down DL_ENABMULTI_REQ on
	 * to_ill as the ilm has just been added.
	 */
	ASSERT(IAM_WRITER_ILL(to_ill));
	ASSERT(IAM_WRITER_ILL(from_ill));

	ILM_WALKER_HOLD(to_ill);
	for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {

		if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
			continue;
		/*
		 * No locks held; the ill/ipif cannot disappear as long
		 * as we are writer.
		 */
		ipif = to_ill->ill_ipif;
		/*
		 * No need to hold any lock as we are the writer and this
		 * can only be changed by a writer.
		 */
		ilm->ilm_is_new = B_FALSE;

		if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
		    ipif->ipif_flags & IPIF_POINTOPOINT) {
			ip1dbg(("ilm_send_multicast_reqs: to_ill not "
			    "resolver\n"));
			continue;	/* Must be IRE_IF_NORESOLVER */
		}

		if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "to_ill MULTI_BCAST\n"));
			goto from;
		}

		if (to_ill->ill_isv6)
			mld_joingroup(ilm);
		else
			igmp_joingroup(ilm);

		if (to_ill->ill_ipif_up_count == 0) {
			/*
			 * Nobody is there. All multicast addresses will be
			 * re-joined when we get the DL_BIND_ACK bringing the
			 * interface up.
			 */
			ilm->ilm_notify_driver = B_FALSE;
			ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
			goto from;
		}

		/*
		 * For the allmulti address, we want to join on only one
		 * interface. Checking for ilm_numentries_v6 is not correct
		 * as you may find an ilm with a zero address on to_ill,
		 * but we may not have nominated to_ill for receiving.
		 * Thus, if we have nominated from_ill (ill_join_allmulti
		 * is set), we nominate to_ill only if it is not already
		 * nominated. (to_ill normally should not have been
		 * nominated if from_ill has already been nominated; but as
		 * we don't prevent failovers from happening across groups,
		 * we don't assert.)
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
			/*
			 * There is no need to hold ill locks as we are
			 * writer on both ills and when ill_join_allmulti
			 * is changed the thread is always a writer.
			 */
			if (from_ill->ill_join_allmulti &&
			    !to_ill->ill_join_allmulti) {
				(void) ip_join_allmulti(to_ill->ill_ipif);
			}
		} else if (ilm->ilm_notify_driver) {
			/*
			 * This is a newly moved ilm so we need to tell the
			 * driver about the new group. There can be more than
			 * one ilm for the same group in the list, each with
			 * a different orig_ifindex. We have to inform the
			 * driver only once. In ilm_move_v[4,6] we only set
			 * the flag ilm_notify_driver for the first ilm.
			 */
			(void) ip_ll_send_enabmulti_req(to_ill,
			    &ilm->ilm_v6addr);
		}

		ilm->ilm_notify_driver = B_FALSE;

		/*
		 * See whether we need to send down DL_DISABMULTI_REQ on
		 * from_ill as the ilm has just been removed.
		 */
from:
		ipif = from_ill->ill_ipif;
		if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
		    ipif->ipif_flags & IPIF_POINTOPOINT) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "from_ill not resolver\n"));
			continue;	/* Must be IRE_IF_NORESOLVER */
		}

		if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "from_ill MULTI_BCAST\n"));
			continue;
		}

		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
			if (from_ill->ill_join_allmulti)
				(void) ip_leave_allmulti(from_ill->ill_ipif);
		} else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
			(void) ip_ll_send_disabmulti_req(from_ill,
			    &ilm->ilm_v6addr);
		}
	}
	ILM_WALKER_RELE(to_ill);
}

/*
 * This function is called when all multicast memberships need
 * to be moved from "from_ill" to "to_ill" for IPv6. This function is
 * called only once, unlike the IPv4 counterpart where it is called after
 * every logical interface is moved. The reason is that multicast
 * memberships are joined using an interface address in IPv4, while in
 * IPv6 an interface index is used.
 */
static void
ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex)
{
	ilm_t *ilm;
	ilm_t *ilm_next;
	ilm_t *new_ilm;
	ilm_t **ilmp;
	int count;
	char buf[INET6_ADDRSTRLEN];
	in6_addr_t ipv6_snm = ipv6_solicited_node_mcast;

	ASSERT(MUTEX_HELD(&to_ill->ill_lock));
	ASSERT(MUTEX_HELD(&from_ill->ill_lock));
	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	if (ifindex == 0) {
		/*
		 * Form the solicited node mcast address which is used later.
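		 * For example, for an ipif address of fe80::1 the
		 * solicited node multicast address formed here is
		 * ff02::1:ff00:1.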
		 */
		ipif_t *ipif;

		ipif = from_ill->ill_ipif;
		ASSERT(ipif->ipif_id == 0);

		ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
	}

	ilmp = &from_ill->ill_ilm;
	for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
		ilm_next = ilm->ilm_next;

		if (ilm->ilm_flags & ILM_DELETED) {
			ilmp = &ilm->ilm_next;
			continue;
		}

		new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr,
		    ilm->ilm_orig_ifindex, ilm->ilm_zoneid);
		ASSERT(ilm->ilm_orig_ifindex != 0);
		if (ilm->ilm_orig_ifindex == ifindex) {
			/*
			 * We are failing back multicast memberships.
			 * If the same ilm exists in to_ill, it means somebody
			 * has joined the same group there, e.g. ff02::1
			 * is joined within the kernel when the interfaces
			 * came UP.
			 */
			ASSERT(ilm->ilm_ipif == NULL);
			if (new_ilm != NULL) {
				new_ilm->ilm_refcnt += ilm->ilm_refcnt;
				if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
				    !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
					new_ilm->ilm_is_new = B_TRUE;
				}
			} else {
				/*
				 * Check whether we can just move the ilm.
				 */
				if (from_ill->ill_ilm_walker_cnt != 0) {
					/*
					 * We have walkers, so we cannot
					 * move the ilm. Allocate a new ilm
					 * instead; this (old) ilm will be
					 * marked ILM_DELETED at the end of
					 * the loop and will be freed when
					 * the last walker exits.
					 */
					new_ilm = (ilm_t *)mi_zalloc
					    (sizeof (ilm_t));
					if (new_ilm == NULL) {
						ip0dbg(("ilm_move_v6: "
						    "FAILBACK of IPv6"
						    " multicast address %s : "
						    "from %s to"
						    " %s failed : ENOMEM \n",
						    inet_ntop(AF_INET6,
						    &ilm->ilm_v6addr, buf,
						    sizeof (buf)),
						    from_ill->ill_name,
						    to_ill->ill_name));

						ilmp = &ilm->ilm_next;
						continue;
					}
					*new_ilm = *ilm;
					/*
					 * We don't want new_ilm linked to
					 * ilm's filter list.
					 */
					new_ilm->ilm_filter = NULL;
				} else {
					/*
					 * No walkers, so we can move the
					 * ilm; let's take it out of the
					 * list.
					 */
					*ilmp = ilm->ilm_next;
					ilm->ilm_next = NULL;
					new_ilm = ilm;
				}

				/*
				 * If this is the first ilm for the group,
				 * set ilm_notify_driver so that we notify
				 * the driver in ilm_send_multicast_reqs.
				 */
				if (ilm_lookup_ill_v6(to_ill,
				    &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
					new_ilm->ilm_notify_driver = B_TRUE;

				new_ilm->ilm_ill = to_ill;
				/* Add to the to_ill's list */
				new_ilm->ilm_next = to_ill->ill_ilm;
				to_ill->ill_ilm = new_ilm;
				/*
				 * Set the flag so that mld_joingroup is
				 * called in ilm_send_multicast_reqs().
				 */
				new_ilm->ilm_is_new = B_TRUE;
			}
			goto bottom;
		} else if (ifindex != 0) {
			/*
			 * If this is a FAILBACK (ifindex != 0) and the
			 * ifindex has not matched above, look at the
			 * next ilm.
			 */
			ilmp = &ilm->ilm_next;
			continue;
		}
		/*
		 * If we are here, it means ifindex is 0. Failover
		 * everything.
		 *
		 * We need to handle the solicited node mcast address
		 * and the all_nodes mcast address differently as they
		 * are joined within the kernel (ipif_multicast_up)
		 * and potentially from userland. We are called
		 * after the ipifs of from_ill have been moved.
16689 * If we still find ilms on ill with solicited node 16690 * mcast address or all_nodes mcast address, it must 16691 * belong to the UP interface that has not moved e.g. 16692 * ipif_id 0 with the link local prefix does not move. 16693 * We join this on the new ill accounting for all the 16694 * userland memberships so that applications don't 16695 * see any failure. 16696 * 16697 * We need to make sure that we account only for the 16698 * solicited node and all node multicast addresses 16699 * that was brought UP on these. In the case of 16700 * a failover from A to B, we might have ilms belonging 16701 * to A (ilm_orig_ifindex pointing at A) on B accounting 16702 * for the membership from the userland. If we are failing 16703 * over from B to C now, we will find the ones belonging 16704 * to A on B. These don't account for the ill_ipif_up_count. 16705 * They just move from B to C. The check below on 16706 * ilm_orig_ifindex ensures that. 16707 */ 16708 if ((ilm->ilm_orig_ifindex == 16709 from_ill->ill_phyint->phyint_ifindex) && 16710 (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || 16711 IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, 16712 &ilm->ilm_v6addr))) { 16713 ASSERT(ilm->ilm_refcnt > 0); 16714 count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; 16715 /* 16716 * For indentation reasons, we are not using a 16717 * "else" here. 16718 */ 16719 if (count == 0) { 16720 ilmp = &ilm->ilm_next; 16721 continue; 16722 } 16723 ilm->ilm_refcnt -= count; 16724 if (new_ilm != NULL) { 16725 /* 16726 * Can find one with the same 16727 * ilm_orig_ifindex, if we are failing 16728 * over to a STANDBY. This happens 16729 * when somebody wants to join a group 16730 * on a STANDBY interface and we 16731 * internally join on a different one. 16732 * If we had joined on from_ill then, a 16733 * failover now will find a new ilm 16734 * with this index. 16735 */ 16736 ip1dbg(("ilm_move_v6: FAILOVER, found" 16737 " new ilm on %s, group address %s\n", 16738 to_ill->ill_name, 16739 inet_ntop(AF_INET6, 16740 &ilm->ilm_v6addr, buf, 16741 sizeof (buf)))); 16742 new_ilm->ilm_refcnt += count; 16743 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16744 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16745 new_ilm->ilm_is_new = B_TRUE; 16746 } 16747 } else { 16748 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 16749 if (new_ilm == NULL) { 16750 ip0dbg(("ilm_move_v6: FAILOVER of IPv6" 16751 " multicast address %s : from %s to" 16752 " %s failed : ENOMEM \n", 16753 inet_ntop(AF_INET6, 16754 &ilm->ilm_v6addr, buf, 16755 sizeof (buf)), from_ill->ill_name, 16756 to_ill->ill_name)); 16757 ilmp = &ilm->ilm_next; 16758 continue; 16759 } 16760 *new_ilm = *ilm; 16761 new_ilm->ilm_filter = NULL; 16762 new_ilm->ilm_refcnt = count; 16763 new_ilm->ilm_timer = INFINITY; 16764 new_ilm->ilm_rtx.rtx_timer = INFINITY; 16765 new_ilm->ilm_is_new = B_TRUE; 16766 /* 16767 * If the to_ill has not joined this 16768 * group we need to tell the driver in 16769 * ill_send_multicast_reqs. 
16770 */
16771 if (ilm_lookup_ill_v6(to_ill,
16772 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16773 new_ilm->ilm_notify_driver = B_TRUE;
16774
16775 new_ilm->ilm_ill = to_ill;
16776 /* Add to the to_ill's list */
16777 new_ilm->ilm_next = to_ill->ill_ilm;
16778 to_ill->ill_ilm = new_ilm;
16779 ASSERT(new_ilm->ilm_ipif == NULL);
16780 }
16781 if (ilm->ilm_refcnt == 0) {
16782 goto bottom;
16783 } else {
16784 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16785 CLEAR_SLIST(new_ilm->ilm_filter);
16786 ilmp = &ilm->ilm_next;
16787 }
16788 continue;
16789 } else {
16790 /*
16791 * ifindex == 0 means move everything pointing at
16792 * from_ill. We are doing this because the ill has
16793 * either FAILED or become INACTIVE.
16794 *
16795 * As we would like to move things later back to
16796 * from_ill, we want to retain the identity of this
16797 * ilm. Thus, we don't blindly increment the reference
16798 * count on the ilms matching the address alone. We
16799 * need to match on the ilm_orig_index also. new_ilm
16800 * was obtained by matching ilm_orig_index also.
16801 */
16802 if (new_ilm != NULL) {
16803 /*
16804 * This is possible only if a previous restore
16805 * was incomplete, i.e. a restore to
16806 * ilm_orig_ifindex left some ilms because
16807 * of some failures. Thus when we are failing
16808 * again, we might find our old friends there.
16809 */
16810 ip1dbg(("ilm_move_v6: FAILOVER, found new ilm"
16811 " on %s, group address %s\n",
16812 to_ill->ill_name,
16813 inet_ntop(AF_INET6,
16814 &ilm->ilm_v6addr, buf,
16815 sizeof (buf))));
16816 new_ilm->ilm_refcnt += ilm->ilm_refcnt;
16817 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE ||
16818 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) {
16819 new_ilm->ilm_is_new = B_TRUE;
16820 }
16821 } else {
16822 if (from_ill->ill_ilm_walker_cnt != 0) {
16823 new_ilm = (ilm_t *)
16824 mi_zalloc(sizeof (ilm_t));
16825 if (new_ilm == NULL) {
16826 ip0dbg(("ilm_move_v6: "
16827 "FAILOVER of IPv6"
16828 " multicast address %s : "
16829 "from %s to"
16830 " %s failed : ENOMEM \n",
16831 inet_ntop(AF_INET6,
16832 &ilm->ilm_v6addr, buf,
16833 sizeof (buf)),
16834 from_ill->ill_name,
16835 to_ill->ill_name));
16836
16837 ilmp = &ilm->ilm_next;
16838 continue;
16839 }
16840 *new_ilm = *ilm;
16841 new_ilm->ilm_filter = NULL;
16842 } else {
16843 *ilmp = ilm->ilm_next;
16844 new_ilm = ilm;
16845 }
16846 /*
16847 * If the to_ill has not joined this
16848 * group, we need to tell the driver in
16849 * ilm_send_multicast_reqs.
16850 */
16851 if (ilm_lookup_ill_v6(to_ill,
16852 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL)
16853 new_ilm->ilm_notify_driver = B_TRUE;
16854
16855 /* Add to the to_ill's list */
16856 new_ilm->ilm_next = to_ill->ill_ilm;
16857 to_ill->ill_ilm = new_ilm;
16858 ASSERT(ilm->ilm_ipif == NULL);
16859 new_ilm->ilm_ill = to_ill;
16860 new_ilm->ilm_is_new = B_TRUE;
16861 }
16862
16863 }
16864
16865 bottom:
16866 /*
16867 * Revert multicast filter state to (EXCLUDE, NULL).
16868 * new_ilm->ilm_is_new should already be set if needed.
16869 */
16870 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16871 CLEAR_SLIST(new_ilm->ilm_filter);
16872 /*
16873 * We allocated/got a new ilm; free the old one.
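* A sketch of how walkers tolerate this deferred deletion: any
* walker of the ill_ilm list is expected to skip entries that are
* logically gone but not yet freed (illustrative only):
*
*	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
*		if (ilm->ilm_flags & ILM_DELETED)
*			continue;	- logically deleted, not yet freed
*		... process the ilm ...
*	}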
16874 */
16875 if (new_ilm != ilm) {
16876 if (from_ill->ill_ilm_walker_cnt == 0) {
16877 *ilmp = ilm->ilm_next;
16878 ilm->ilm_next = NULL;
16879 FREE_SLIST(ilm->ilm_filter);
16880 FREE_SLIST(ilm->ilm_pendsrcs);
16881 FREE_SLIST(ilm->ilm_rtx.rtx_allow);
16882 FREE_SLIST(ilm->ilm_rtx.rtx_block);
16883 mi_free((char *)ilm);
16884 } else {
16885 ilm->ilm_flags |= ILM_DELETED;
16886 from_ill->ill_ilm_cleanup_reqd = 1;
16887 ilmp = &ilm->ilm_next;
16888 }
16889 }
16890 }
16891 }
16892
16893 /*
16894 * Move all the multicast memberships to to_ill. Called when
16895 * an ipif moves from "from_ill" to "to_ill". This function is slightly
16896 * different from its IPv6 counterpart, as multicast memberships are
16897 * associated with ills in IPv6. This function is called after every ipif
16898 * is moved, unlike IPv6, where it is called only once.
16899 */
16900 static void
16901 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif)
16902 {
16903 ilm_t *ilm;
16904 ilm_t *ilm_next;
16905 ilm_t *new_ilm;
16906 ilm_t **ilmp;
16907
16908 ASSERT(MUTEX_HELD(&to_ill->ill_lock));
16909 ASSERT(MUTEX_HELD(&from_ill->ill_lock));
16910 ASSERT(RW_WRITE_HELD(&ill_g_lock));
16911
16912 ilmp = &from_ill->ill_ilm;
16913 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) {
16914 ilm_next = ilm->ilm_next;
16915
16916 if (ilm->ilm_flags & ILM_DELETED) {
16917 ilmp = &ilm->ilm_next;
16918 continue;
16919 }
16920
16921 ASSERT(ilm->ilm_ipif != NULL);
16922
16923 if (ilm->ilm_ipif != ipif) {
16924 ilmp = &ilm->ilm_next;
16925 continue;
16926 }
16927
16928 if (V4_PART_OF_V6(ilm->ilm_v6addr) ==
16929 htonl(INADDR_ALLHOSTS_GROUP)) {
16930 /*
16931 * We joined this in ipif_multicast_up
16932 * and we never did an ipif_multicast_down
16933 * for IPv4. If nobody else from the userland
16934 * has a reference, we free the ilm, and later,
16935 * when this ipif comes up on the new ill,
16936 * we will join this again.
16937 */
16938 if (--ilm->ilm_refcnt == 0)
16939 goto delete_ilm;
16940
16941 new_ilm = ilm_lookup_ipif(ipif,
16942 V4_PART_OF_V6(ilm->ilm_v6addr));
16943 if (new_ilm != NULL) {
16944 new_ilm->ilm_refcnt += ilm->ilm_refcnt;
16945 /*
16946 * We still need to deal with the from_ill.
16947 */
16948 new_ilm->ilm_is_new = B_TRUE;
16949 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
16950 CLEAR_SLIST(new_ilm->ilm_filter);
16951 goto delete_ilm;
16952 }
16953 /*
16954 * If we could not find one, e.g. the ipif is
16955 * still down on the to_ill, we add this ilm
16956 * on the to_ill to preserve the reference
16957 * count.
16958 */
16959 }
16960 /*
16961 * When ipifs move, ilms always move with them
16962 * to the NEW ill. Thus we should never be
16963 * able to find the ilm until we really move it here.
16964 */
16965 ASSERT(ilm_lookup_ipif(ipif,
16966 V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL);
16967
16968 if (from_ill->ill_ilm_walker_cnt != 0) {
16969 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t));
16970 if (new_ilm == NULL) {
16971 char buf[INET6_ADDRSTRLEN];
16972 ip0dbg(("ilm_move_v4: FAILBACK of IPv4"
16973 " multicast address %s : "
16974 "from %s to"
16975 " %s failed : ENOMEM \n",
16976 inet_ntop(AF_INET,
16977 &V4_PART_OF_V6(ilm->ilm_v6addr), buf,
16978 sizeof (buf)),
16979 from_ill->ill_name,
16980 to_ill->ill_name));
16981
16982 ilmp = &ilm->ilm_next;
16983 continue;
16984 }
16985 *new_ilm = *ilm;
16986 /* We don't want new_ilm linked to ilm's filter list */
16987 new_ilm->ilm_filter = NULL;
16988 } else {
16989 /* Remove from the list */
16990 *ilmp = ilm->ilm_next;
16991 new_ilm = ilm;
16992 }
16993
16994 /*
16995 * If we have never joined this group on the to_ill,
16996 * make sure we tell the driver.
16997 */
16998 if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr,
16999 ALL_ZONES) == NULL)
17000 new_ilm->ilm_notify_driver = B_TRUE;
17001
17002 /* Add to the to_ill's list */
17003 new_ilm->ilm_next = to_ill->ill_ilm;
17004 to_ill->ill_ilm = new_ilm;
17005 new_ilm->ilm_is_new = B_TRUE;
17006
17007 /*
17008 * Revert multicast filter state to (EXCLUDE, NULL)
17009 */
17010 new_ilm->ilm_fmode = MODE_IS_EXCLUDE;
17011 CLEAR_SLIST(new_ilm->ilm_filter);
17012
17013 /*
17014 * Delete only if we have allocated a new ilm.
17015 */
17016 if (new_ilm != ilm) {
17017 delete_ilm:
17018 if (from_ill->ill_ilm_walker_cnt == 0) {
17019 /* Remove from the list */
17020 *ilmp = ilm->ilm_next;
17021 ilm->ilm_next = NULL;
17022 FREE_SLIST(ilm->ilm_filter);
17023 FREE_SLIST(ilm->ilm_pendsrcs);
17024 FREE_SLIST(ilm->ilm_rtx.rtx_allow);
17025 FREE_SLIST(ilm->ilm_rtx.rtx_block);
17026 mi_free((char *)ilm);
17027 } else {
17028 ilm->ilm_flags |= ILM_DELETED;
17029 from_ill->ill_ilm_cleanup_reqd = 1;
17030 ilmp = &ilm->ilm_next;
17031 }
17032 }
17033 }
17034 }
17035
17036 static uint_t
17037 ipif_get_id(ill_t *ill, uint_t id)
17038 {
17039 uint_t unit;
17040 ipif_t *tipif;
17041 boolean_t found = B_FALSE;
17042
17043 /*
17044 * During failback, we want to go back to the same id
17045 * instead of the smallest id so that the original
17046 * configuration is maintained. id is non-zero in that
17047 * case.
17048 */
17049 if (id != 0) {
17050 /*
17051 * While failing back, if we still have an ipif with
17052 * MAX_ADDRS_PER_IF, it means this will be replaced
17053 * as soon as we return from this function. It was
17054 * set to MAX_ADDRS_PER_IF by the caller so that
17055 * we can choose the smallest id. Thus we return zero
17056 * in that case, ignoring the hint.
17057 */
17058 if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF)
17059 return (0);
17060 for (tipif = ill->ill_ipif; tipif != NULL;
17061 tipif = tipif->ipif_next) {
17062 if (tipif->ipif_id == id) {
17063 found = B_TRUE;
17064 break;
17065 }
17066 }
17067 /*
17068 * If nobody has already plumbed another logical
17069 * interface with the same id, we won't find it above
* and can simply return the requested id.
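* For example (a purely illustrative walk-through of the code
* below), with ipifs 0, 1 and 3 already plumbed: a failback hint of
* id == 2 is not found above and is returned unchanged, preserving
* the original configuration, while a hint of id == 3 is found, so
* we fall through to the scan below, which returns the smallest
* unused id, namely 2.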
17070 */
17071 if (!found)
17072 return (id);
17073 }
17074 for (unit = 0; unit <= ip_addrs_per_if; unit++) {
17075 found = B_FALSE;
17076 for (tipif = ill->ill_ipif; tipif != NULL;
17077 tipif = tipif->ipif_next) {
17078 if (tipif->ipif_id == unit) {
17079 found = B_TRUE;
17080 break;
17081 }
17082 }
17083 if (!found)
17084 break;
17085 }
17086 return (unit);
17087 }
17088
17089 /* ARGSUSED */
17090 static int
17091 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp,
17092 ipif_t **rep_ipif_ptr)
17093 {
17094 ill_t *from_ill;
17095 ipif_t *rep_ipif;
17096 ipif_t **ipifp;
17097 uint_t unit;
17098 int err = 0;
17099 ipif_t *to_ipif;
17100 struct iocblk *iocp;
17101 boolean_t failback_cmd;
17102 boolean_t remove_ipif;
17103 int rc;
17104
17105 ASSERT(IAM_WRITER_ILL(to_ill));
17106 ASSERT(IAM_WRITER_IPIF(ipif));
17107
17108 iocp = (struct iocblk *)mp->b_rptr;
17109 failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK);
17110 remove_ipif = B_FALSE;
17111
17112 from_ill = ipif->ipif_ill;
17113
17114 ASSERT(MUTEX_HELD(&to_ill->ill_lock));
17115 ASSERT(MUTEX_HELD(&from_ill->ill_lock));
17116 ASSERT(RW_WRITE_HELD(&ill_g_lock));
17117
17118 /*
17119 * Don't move LINK LOCAL addresses, as they are tied to
17120 * the physical interface.
17121 */
17122 if (from_ill->ill_isv6 &&
17123 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) {
17124 ipif->ipif_was_up = B_FALSE;
17125 IPIF_UNMARK_MOVING(ipif);
17126 return (0);
17127 }
17128
17129 /*
17130 * We set the ipif_id to the maximum so that the search for
17131 * ipif_id will pick the lowest number, i.e. 0, in the
17132 * following 2 cases:
17133 *
17134 * 1) We have a replacement ipif at the head of to_ill.
17135 * We can't remove it yet as we can exceed ip_addrs_per_if
17136 * on to_ill and hence the MOVE might fail. We want to
17137 * remove it only if we could move the ipif. Thus, by
17138 * setting it to the MAX value, we make the search in
17139 * ipif_get_id return the zeroth id.
17140 *
17141 * 2) When DR pulls out the NIC and re-plumbs the interface,
17142 * we might just have a zero address plumbed on the ipif
17143 * with zero id in the case of IPv4. We remove that while
17144 * doing the failback. We want to remove it only if we
17145 * could move the ipif. Thus, by setting it to the MAX
17146 * value, we make the search in ipif_get_id return the
17147 * zeroth id.
17148 *
17149 * Both (1) and (2) are done only when we are moving
17150 * an ipif (either due to failover/failback) which originally
17151 * belonged to this interface, i.e. the ipif_orig_ifindex is
17152 * the same as to_ill's ifindex. This is needed so that
17153 * FAILOVER from A -> B (A failed) followed by FAILOVER
17154 * from B -> A (B is being removed from the group) and
17155 * FAILBACK from A -> B restores the original configuration.
17156 * Without the check for orig_ifindex, the second FAILOVER
17157 * could make the ipif belonging to B replace A's zeroth
17158 * ipif, and the subsequent failback would re-create the
17159 * replacement ipif again.
17160 *
17161 * NOTE : We created the replacement ipif when we did a
17162 * FAILOVER (See below). We could check for FAILBACK and
17163 * then look for the replacement ipif to be removed. But we don't
17164 * want to do that because we want to allow the possibility
17165 * of a FAILOVER from A -> B (which creates the replacement ipif),
17166 * followed by a *FAILOVER* from B -> A instead of a FAILBACK
17167 * from B -> A.
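* An illustrative timeline (ills A and B assumed to be in the same
* IPMP group):
*	FAILOVER A -> B: A's zeroth ipif moves to B and a replacement
*	    zeroth ipif (IPIF_NOFAILOVER, created below) is left on A.
*	FAILBACK B -> A, or FAILOVER B -> A: the ipif moves home and
*	    the replacement ipif found at the head of A is removed.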
17168 */
17169 to_ipif = to_ill->ill_ipif;
17170 if ((to_ill->ill_phyint->phyint_ifindex ==
17171 ipif->ipif_orig_ifindex) &&
17172 IPIF_REPL_CHECK(to_ipif, failback_cmd)) {
17173 ASSERT(to_ipif->ipif_id == 0);
17174 remove_ipif = B_TRUE;
17175 to_ipif->ipif_id = MAX_ADDRS_PER_IF;
17176 }
17177 /*
17178 * Find the lowest logical unit number on the to_ill.
17179 * If we are failing back, try to get the original id
17180 * rather than the lowest one so that the original
17181 * configuration is maintained.
17182 *
17183 * XXX need a better scheme for this.
17184 */
17185 if (failback_cmd) {
17186 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid);
17187 } else {
17188 unit = ipif_get_id(to_ill, 0);
17189 }
17190
17191 /* Reset back to zero in case we fail below */
17192 if (to_ipif->ipif_id == MAX_ADDRS_PER_IF)
17193 to_ipif->ipif_id = 0;
17194
17195 if (unit == ip_addrs_per_if) {
17196 ipif->ipif_was_up = B_FALSE;
17197 IPIF_UNMARK_MOVING(ipif);
17198 return (EINVAL);
17199 }
17200
17201 /*
17202 * The ipif is ready to move from "from_ill" to "to_ill".
17203 *
17204 * 1) If we are moving the ipif with id zero, create a
17205 * replacement ipif for this ipif on from_ill. If this fails,
17206 * fail the MOVE operation.
17207 *
17208 * 2) Remove the replacement ipif on to_ill if any.
17209 * We could remove the replacement ipif when we are moving
17210 * the ipif with id zero. But what if somebody already
17211 * unplumbed it? Thus we always remove it if it is present.
17212 * We want to do it only if we are sure we are going to
17213 * move the ipif to to_ill, which is why there are no
17214 * returns due to error until the ipif is linked to to_ill.
17215 * Note that the first ipif that we fail back will always
17216 * be zero if it is present.
17217 */
17218 if (ipif->ipif_id == 0) {
17219 ipaddr_t inaddr_any = INADDR_ANY;
17220
17221 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED);
17222 if (rep_ipif == NULL) {
17223 ipif->ipif_was_up = B_FALSE;
17224 IPIF_UNMARK_MOVING(ipif);
17225 return (ENOMEM);
17226 }
17227 *rep_ipif = ipif_zero;
17228 /*
17229 * Before we put the ipif on the list, store the addresses
17230 * as mapped addresses, as some of the ioctls, e.g. SIOCGIFADDR,
17231 * assume so. This logic is not any different from what
17232 * ipif_allocate does.
17233 */
17234 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17235 &rep_ipif->ipif_v6lcl_addr);
17236 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17237 &rep_ipif->ipif_v6src_addr);
17238 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17239 &rep_ipif->ipif_v6subnet);
17240 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17241 &rep_ipif->ipif_v6net_mask);
17242 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17243 &rep_ipif->ipif_v6brd_addr);
17244 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
17245 &rep_ipif->ipif_v6pp_dst_addr);
17246 /*
17247 * We mark IPIF_NOFAILOVER so that this can never
17248 * move.
17249 */
17250 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER;
17251 rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE;
17252 rep_ipif->ipif_replace_zero = B_TRUE;
17253 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL,
17254 MUTEX_DEFAULT, NULL);
17255 rep_ipif->ipif_id = 0;
17256 rep_ipif->ipif_ire_type = ipif->ipif_ire_type;
17257 rep_ipif->ipif_ill = from_ill;
17258 rep_ipif->ipif_orig_ifindex =
17259 from_ill->ill_phyint->phyint_ifindex;
17260 /* Insert at head */
17261 rep_ipif->ipif_next = from_ill->ill_ipif;
17262 from_ill->ill_ipif = rep_ipif;
17263 /*
17264 * We don't really care to let apps know about
17265 * this interface.
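* No routing socket message is generated for it here; contrast the
* remove_ipif case below, where the replaced ipif is handed back
* via rep_ipif_ptr so that ill_move can announce its deletion with
* RTM_DELETE.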
17266 */
17267 }
17268
17269 if (remove_ipif) {
17270 /*
17271 * We set it to a max value above for this case to get
17272 * id zero. ASSERT that we did get one.
17273 */
17274 ASSERT((to_ipif->ipif_id == 0) && (unit == 0));
17275 rep_ipif = to_ipif;
17276 to_ill->ill_ipif = rep_ipif->ipif_next;
17277 rep_ipif->ipif_next = NULL;
17278 /*
17279 * If some apps have scanned and found this interface,
17280 * it is time to let them know, so that they can
17281 * delete it.
17282 */
17283
17284 *rep_ipif_ptr = rep_ipif;
17285 }
17286
17287 /* Get it out of the ILL interface list. */
17288 ipifp = &ipif->ipif_ill->ill_ipif;
17289 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
17290 if (*ipifp == ipif) {
17291 *ipifp = ipif->ipif_next;
17292 break;
17293 }
17294 }
17295
17296 /* Assign the new ill */
17297 ipif->ipif_ill = to_ill;
17298 ipif->ipif_id = unit;
17299 /* id has already been checked */
17300 rc = ipif_insert(ipif, B_FALSE, B_FALSE);
17301 ASSERT(rc == 0);
17302 /* Let SCTP update its list */
17303 sctp_move_ipif(ipif, from_ill, to_ill);
17304 /*
17305 * Handle the failover and failback of ipif_t between
17306 * ill_t's that have differing maximum mtu values.
17307 */
17308 if (ipif->ipif_mtu > to_ill->ill_max_mtu) {
17309 if (ipif->ipif_saved_mtu == 0) {
17310 /*
17311 * As this ipif_t is moving to an ill_t
17312 * that has a lower ill_max_mtu, its
17313 * ipif_mtu needs to be saved so it can
17314 * be restored during failback or during
17315 * failover to an ill_t which has a
17316 * higher ill_max_mtu.
17317 */
17318 ipif->ipif_saved_mtu = ipif->ipif_mtu;
17319 ipif->ipif_mtu = to_ill->ill_max_mtu;
17320 } else {
17321 /*
17322 * The ipif_t is, once again, moving to
17323 * an ill_t that has a lower maximum mtu
17324 * value.
17325 */
17326 ipif->ipif_mtu = to_ill->ill_max_mtu;
17327 }
17328 } else if (ipif->ipif_mtu < to_ill->ill_max_mtu &&
17329 ipif->ipif_saved_mtu != 0) {
17330 /*
17331 * The mtu of this ipif_t had to be reduced
17332 * during an earlier failover; this is an
17333 * opportunity for it to be increased (either as
17334 * part of another failover or a failback).
17335 */
17336 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) {
17337 ipif->ipif_mtu = ipif->ipif_saved_mtu;
17338 ipif->ipif_saved_mtu = 0;
17339 } else {
17340 ipif->ipif_mtu = to_ill->ill_max_mtu;
17341 }
17342 }
17343
17344 /*
17345 * We preserve all the other fields of the ipif, including
17346 * ipif_saved_ire_mp. The routes that are saved here will
17347 * be recreated on the new interface and back on the old
17348 * interface when we move back.
17349 */
17350 ASSERT(ipif->ipif_arp_del_mp == NULL);
17351
17352 return (err);
17353 }
17354
17355 static int
17356 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp,
17357 int ifindex, ipif_t **rep_ipif_ptr)
17358 {
17359 ipif_t *mipif;
17360 ipif_t *ipif_next;
17361 int err;
17362
17363 /*
17364 * We don't really try to MOVE back things if some of the
17365 * operations fail. The daemon will take care of moving again
17366 * later on.
17367 */
17368 for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) {
17369 ipif_next = mipif->ipif_next;
17370 if (!(mipif->ipif_flags & IPIF_NOFAILOVER) &&
17371 (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) {
17372
17373 err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr);
17374
17375 /*
17376 * When the MOVE fails, it is the job of the
17377 * application to take care of this properly,
17378 * i.e. try again if it is ENOMEM.
17379 */
17380 if (mipif->ipif_ill != from_ill) {
17381 /*
17382 * The ipif has moved.
17383 *
17384 * Move the multicast memberships associated
17385 * with this ipif to the new ill. For IPv6, we
17386 * do it once after all the ipifs are moved
17387 * (in ill_move) as they are not associated
17388 * with ipifs.
17389 *
17390 * We need to move the ilms as the ipif has
17391 * already been moved to a new ill even
17392 * in the case of errors. If we don't move
17393 * them now, neither ilm_free(ipif), when
17394 * somebody unplumbs this ipif, nor
17395 * ilm_delete(ilm) will be able to find the
17396 * ilm.
17397 */
17398 if (!from_ill->ill_isv6)
17399 ilm_move_v4(from_ill, to_ill, mipif);
17400 }
17401
17402 if (err != 0)
17403 return (err);
17404 }
17405 }
17406 return (0);
17407 }
17408
17409 static int
17410 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp)
17411 {
17412 int ifindex;
17413 int err;
17414 struct iocblk *iocp;
17415 ipif_t *ipif;
17416 ipif_t *rep_ipif_ptr = NULL;
17417 ipif_t *from_ipif = NULL;
17418 boolean_t check_rep_if = B_FALSE;
17419
17420 iocp = (struct iocblk *)mp->b_rptr;
17421 if (iocp->ioc_cmd == SIOCLIFFAILOVER) {
17422 /*
17423 * Move everything pointing at from_ill to to_ill.
17424 * We achieve this by passing in 0 as ifindex.
17425 */
17426 ifindex = 0;
17427 } else {
17428 /*
17429 * Move everything pointing at from_ill whose original
17430 * ifindex (of connp, ipif, ilm) points at to_ill's ifindex.
17431 * We achieve this by passing in ifindex rather than 0.
17432 * Multicast vifs, ilgs move implicitly because ipifs move.
17433 */
17434 ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK);
17435 ifindex = to_ill->ill_phyint->phyint_ifindex;
17436 }
17437
17438 /*
17439 * Determine if there is at least one ipif that would move from
17440 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement
17441 * ipif (if it exists) on the to_ill would be consumed as a result of
17442 * the move, in which case we need to quiesce the replacement ipif also.
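* The wait pattern used below is the standard one (a summary of the
* code that follows, not a new mechanism): if an ipif on either ill
* is not yet quiescent, park the ioctl mblk on the ipsq via
* ipsq_pending_mp_add() and return EINPROGRESS; the operation is
* restarted later, once the last reference drops, e.g.:
*
*	if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
*		(void) ipsq_pending_mp_add(NULL, ipif, q, mp,
*		    ILL_MOVE_OK);
*		return (EINPROGRESS);
*	}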
17443 */
17444 for (from_ipif = from_ill->ill_ipif; from_ipif != NULL;
17445 from_ipif = from_ipif->ipif_next) {
17446 if (((ifindex == 0) ||
17447 (ifindex == from_ipif->ipif_orig_ifindex)) &&
17448 !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) {
17449 check_rep_if = B_TRUE;
17450 break;
17451 }
17452 }
17453
17454
17455 ill_down_ipifs(from_ill, mp, ifindex, B_TRUE);
17456
17457 GRAB_ILL_LOCKS(from_ill, to_ill);
17458 if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) {
17459 (void) ipsq_pending_mp_add(NULL, ipif, q,
17460 mp, ILL_MOVE_OK);
17461 RELEASE_ILL_LOCKS(from_ill, to_ill);
17462 return (EINPROGRESS);
17463 }
17464
17465 /* Check if the replacement ipif is quiescent to delete */
17466 if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif,
17467 (iocp->ioc_cmd == SIOCLIFFAILBACK))) {
17468 to_ill->ill_ipif->ipif_state_flags |=
17469 IPIF_MOVING | IPIF_CHANGING;
17470 if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) {
17471 (void) ipsq_pending_mp_add(NULL, ipif, q,
17472 mp, ILL_MOVE_OK);
17473 RELEASE_ILL_LOCKS(from_ill, to_ill);
17474 return (EINPROGRESS);
17475 }
17476 }
17477 RELEASE_ILL_LOCKS(from_ill, to_ill);
17478
17479 ASSERT(!MUTEX_HELD(&to_ill->ill_lock));
17480 rw_enter(&ill_g_lock, RW_WRITER);
17481 GRAB_ILL_LOCKS(from_ill, to_ill);
17482 err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr);
17483
17484 /* ilm_move is done inside ipif_move for IPv4 */
17485 if (err == 0 && from_ill->ill_isv6)
17486 ilm_move_v6(from_ill, to_ill, ifindex);
17487
17488 RELEASE_ILL_LOCKS(from_ill, to_ill);
17489 rw_exit(&ill_g_lock);
17490
17491 /*
17492 * Send rts messages and multicast messages.
17493 */
17494 if (rep_ipif_ptr != NULL) {
17495 ip_rts_ifmsg(rep_ipif_ptr);
17496 ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr);
17497 IPIF_TRACE_CLEANUP(rep_ipif_ptr);
17498 mi_free(rep_ipif_ptr);
17499 }
17500
17501 conn_move_ill(from_ill, to_ill, ifindex);
17502
17503 return (err);
17504 }
17505
17506 /*
17507 * Used to extract arguments for FAILOVER/FAILBACK ioctls.
17508 * Also checks for the validity of the arguments.
17509 * Note: We are already exclusive inside the from group.
17510 * It is up to the caller to release the refcnt on the to_ills.
17511 */
17512 static int
17513 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4,
17514 ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6)
17515 {
17516 int dst_index;
17517 ipif_t *ipif_v4, *ipif_v6;
17518 struct lifreq *lifr;
17519 mblk_t *mp1;
17520 boolean_t exists;
17521 sin_t *sin;
17522 int err = 0;
17523
17524 if ((mp1 = mp->b_cont) == NULL)
17525 return (EPROTO);
17526
17527 if ((mp1 = mp1->b_cont) == NULL)
17528 return (EPROTO);
17529
17530 lifr = (struct lifreq *)mp1->b_rptr;
17531 sin = (sin_t *)&lifr->lifr_addr;
17532
17533 /*
17534 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4- or
17535 * IPv6-specific operations.
17536 */
17537 if (sin->sin_family != AF_UNSPEC)
17538 return (EINVAL);
17539
17540 /*
17541 * Get the ipif with id 0. We are the writer on the from ill, so we can
17542 * pass NULLs for the last 4 args and we know the lookup won't fail
17543 * with EINPROGRESS.
17544 */
17545 ipif_v4 = ipif_lookup_on_name(lifr->lifr_name,
17546 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE,
17547 ALL_ZONES, NULL, NULL, NULL, NULL);
17548 ipif_v6 = ipif_lookup_on_name(lifr->lifr_name,
17549 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE,
17550 ALL_ZONES, NULL, NULL, NULL, NULL);
17551
17552 if (ipif_v4 == NULL && ipif_v6 == NULL)
17553 return (ENXIO);
17554
17555 if (ipif_v4 != NULL) {
17556 ASSERT(ipif_v4->ipif_refcnt != 0);
17557 if (ipif_v4->ipif_id != 0) {
17558 err = EINVAL;
17559 goto done;
17560 }
17561
17562 ASSERT(IAM_WRITER_IPIF(ipif_v4));
17563 *ill_from_v4 = ipif_v4->ipif_ill;
17564 }
17565
17566 if (ipif_v6 != NULL) {
17567 ASSERT(ipif_v6->ipif_refcnt != 0);
17568 if (ipif_v6->ipif_id != 0) {
17569 err = EINVAL;
17570 goto done;
17571 }
17572
17573 ASSERT(IAM_WRITER_IPIF(ipif_v6));
17574 *ill_from_v6 = ipif_v6->ipif_ill;
17575 }
17576
17577 err = 0;
17578 dst_index = lifr->lifr_movetoindex;
17579 *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE,
17580 q, mp, ip_process_ioctl, &err);
17581 if (err != 0) {
17582 /*
17583 * There could be only a v6 interface.
17584 */
17585 if (err != ENXIO)
17586 goto done;
17587 err = 0;
17588 }
17589
17590 *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE,
17591 q, mp, ip_process_ioctl, &err);
17592 if (err != 0) {
17593 if (err != ENXIO)
17594 goto done;
17595 if (*ill_to_v4 == NULL) {
17596 err = ENXIO;
17597 goto done;
17598 }
17599 err = 0;
17600 }
17601
17602 /*
17603 * If we have something to MOVE, i.e. "from" is not NULL,
17604 * "to" should be non-NULL.
17605 */
17606 if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) ||
17607 (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) {
17608 err = EINVAL;
17609 }
17610
17611 done:
17612 if (ipif_v4 != NULL)
17613 ipif_refrele(ipif_v4);
17614 if (ipif_v6 != NULL)
17615 ipif_refrele(ipif_v6);
17616 return (err);
17617 }
17618
17619 /*
17620 * FAILOVER and FAILBACK are modelled as MOVE operations.
17621 *
17622 * We don't check whether the MOVE is within the same group or
17623 * not, because this ioctl can be used as a generic mechanism
17624 * to fail over from interface A to B, though things will function
17625 * only if they are really part of the same group. Moreover,
17626 * all ipifs may be down and hence temporarily out of the group.
17627 *
17628 * ipif's that need to be moved are first brought down; V4 ipifs are brought
17629 * down first and then V6. For each we wait for the ipifs to become quiescent.
17630 * Bringing down the ipifs ensures that all ires pointing to these ipifs
17631 * have been deleted and there are no active references. Once quiescent, the
17632 * ipifs are moved and brought up on the new ill.
17633 *
17634 * Normally the source ill and destination ill belong to the same IPMP group
17635 * and hence the same ipsq_t. In the event they don't belong to the
17636 * same group, the two ipsq's are first merged into one ipsq - that of the
17637 * to_ill. The multicast memberships on the source and destination ill cannot
17638 * change during the move operation since multicast joins/leaves also have to
17639 * execute on the same ipsq and are hence serialized.
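* A minimal, purely illustrative sketch of how a userland caller
* (such as in.mpathd) might drive this ioctl; the interface names
* "hme0"/"hme1" and the socket s are assumptions, not part of this
* file:
*
*	struct lifreq lifr;
*
*	bzero(&lifr, sizeof (lifr));
*	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
*	lifr.lifr_addr.ss_family = AF_UNSPEC;	- both v4 and v6 move
*	lifr.lifr_movetoindex = if_nametoindex("hme1");
*	if (ioctl(s, SIOCLIFFAILOVER, (caddr_t)&lifr) < 0)
*		perror("SIOCLIFFAILOVER");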
17640 */
17641 /* ARGSUSED */
17642 int
17643 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
17644 ip_ioctl_cmd_t *ipip, void *ifreq)
17645 {
17646 ill_t *ill_to_v4 = NULL;
17647 ill_t *ill_to_v6 = NULL;
17648 ill_t *ill_from_v4 = NULL;
17649 ill_t *ill_from_v6 = NULL;
17650 int err = 0;
17651
17652 /*
17653 * Set up the from and to ills; we can get EINPROGRESS only for
17654 * the to_ills.
17655 */
17656 err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6,
17657 &ill_to_v4, &ill_to_v6);
17658
17659 if (err != 0) {
17660 ip0dbg(("ip_sioctl_move: extract args failed\n"));
17661 goto done;
17662 }
17663
17664 /*
17665 * Nothing to do.
17666 */
17667 if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) {
17668 goto done;
17669 }
17670
17671 /*
17672 * Nothing to do.
17673 */
17674 if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) {
17675 goto done;
17676 }
17677
17678 /*
17679 * Mark the ill as changing.
17680 * The ILL_CHANGING flag is cleared when the ipifs are brought up
17681 * in ill_up_ipifs; in case of error it is cleared below.
17682 */
17683
17684 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
17685 if (ill_from_v4 != NULL)
17686 ill_from_v4->ill_state_flags |= ILL_CHANGING;
17687 if (ill_from_v6 != NULL)
17688 ill_from_v6->ill_state_flags |= ILL_CHANGING;
17689 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
17690
17691 /*
17692 * Make sure that both src and dst are
17693 * in the same syncq group. If not, make it happen.
17694 * We are not holding any locks because we are the writer
17695 * on the from_ipsq and we will hold locks in ill_merge_groups
17696 * to protect to_ipsq against changing.
17697 */
17698 if (ill_from_v4 != NULL) {
17699 if (ill_from_v4->ill_phyint->phyint_ipsq !=
17700 ill_to_v4->ill_phyint->phyint_ipsq) {
17701 err = ill_merge_groups(ill_from_v4, ill_to_v4,
17702 NULL, mp, q);
17703 goto err_ret;
17704
17705 }
17706 ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock));
17707 } else {
17708
17709 if (ill_from_v6->ill_phyint->phyint_ipsq !=
17710 ill_to_v6->ill_phyint->phyint_ipsq) {
17711 err = ill_merge_groups(ill_from_v6, ill_to_v6,
17712 NULL, mp, q);
17713 goto err_ret;
17714
17715 }
17716 ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock));
17717 }
17718
17719 /*
17720 * Now that the ipsq's have been merged and we are the writer,
17721 * let's mark to_ill as changing as well.
17722 */
17723
17724 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
17725 if (ill_to_v4 != NULL)
17726 ill_to_v4->ill_state_flags |= ILL_CHANGING;
17727 if (ill_to_v6 != NULL)
17728 ill_to_v6->ill_state_flags |= ILL_CHANGING;
17729 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
17730
17731 /*
17732 * It's OK for us to proceed with the move even if
17733 * ill_pending_mp is non-null on one of the from ills, as the reply
17734 * should not be looking at the ipif; it should only care about the
17735 * ill itself.
17736 */
17737
17738 /*
17739 * Let's move IPv4 first.
17740 */
17741 if (ill_from_v4 != NULL) {
17742 ASSERT(IAM_WRITER_ILL(ill_to_v4));
17743 ill_from_v4->ill_move_in_progress = B_TRUE;
17744 ill_to_v4->ill_move_in_progress = B_TRUE;
17745 ill_to_v4->ill_move_peer = ill_from_v4;
17746 ill_from_v4->ill_move_peer = ill_to_v4;
17747 err = ill_move(ill_from_v4, ill_to_v4, q, mp);
17748 }
17749
17750 /*
17751 * Now let's move IPv6.
17752 */
17753 if (err == 0 && ill_from_v6 != NULL) {
17754 ASSERT(IAM_WRITER_ILL(ill_to_v6));
17755 ill_from_v6->ill_move_in_progress = B_TRUE;
17756 ill_to_v6->ill_move_in_progress = B_TRUE;
17757 ill_to_v6->ill_move_peer = ill_from_v6;
17758 ill_from_v6->ill_move_peer = ill_to_v6;
17759 err = ill_move(ill_from_v6, ill_to_v6, q, mp);
17760 }
17761
17762 err_ret:
17763 /*
17764 * EINPROGRESS means we are waiting for the ipifs that need to be
17765 * moved to become quiescent.
17766 */
17767 if (err == EINPROGRESS) {
17768 goto done;
17769 }
17770
17771 /*
17772 * If err is set, ill_up_ipifs will not be called;
17773 * let's clear the flags.
17774 */
17775
17776 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6);
17777 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6);
17778 /*
17779 * Some of the clearing may be redundant, but it is simpler
17780 * not to make any extra checks.
17781 */
17782 if (ill_from_v6 != NULL) {
17783 ill_from_v6->ill_move_in_progress = B_FALSE;
17784 ill_from_v6->ill_move_peer = NULL;
17785 ill_from_v6->ill_state_flags &= ~ILL_CHANGING;
17786 }
17787 if (ill_from_v4 != NULL) {
17788 ill_from_v4->ill_move_in_progress = B_FALSE;
17789 ill_from_v4->ill_move_peer = NULL;
17790 ill_from_v4->ill_state_flags &= ~ILL_CHANGING;
17791 }
17792 if (ill_to_v6 != NULL) {
17793 ill_to_v6->ill_move_in_progress = B_FALSE;
17794 ill_to_v6->ill_move_peer = NULL;
17795 ill_to_v6->ill_state_flags &= ~ILL_CHANGING;
17796 }
17797 if (ill_to_v4 != NULL) {
17798 ill_to_v4->ill_move_in_progress = B_FALSE;
17799 ill_to_v4->ill_move_peer = NULL;
17800 ill_to_v4->ill_state_flags &= ~ILL_CHANGING;
17801 }
17802
17803 /*
17804 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set.
17805 * Do this always to maintain proper state, i.e. even in case of errors.
17806 * As phyint_inactive looks at both v4 and v6 interfaces,
17807 * we need not call it on both the v4 and v6 interfaces.
17808 */
17809 if (ill_from_v4 != NULL) {
17810 if ((ill_from_v4->ill_phyint->phyint_flags &
17811 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
17812 phyint_inactive(ill_from_v4->ill_phyint);
17813 }
17814 } else if (ill_from_v6 != NULL) {
17815 if ((ill_from_v6->ill_phyint->phyint_flags &
17816 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) {
17817 phyint_inactive(ill_from_v6->ill_phyint);
17818 }
17819 }
17820
17821 if (ill_to_v4 != NULL) {
17822 if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) {
17823 ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
17824 }
17825 } else if (ill_to_v6 != NULL) {
17826 if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) {
17827 ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE;
17828 }
17829 }
17830
17831 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6);
17832 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6);
17833
17834 no_err:
17835 /*
17836 * Let's bring the interfaces up on the to_ill.
17837 */
17838 if (err == 0) {
17839 err = ill_up_ipifs(ill_to_v4 == NULL ?
ill_to_v6:ill_to_v4,
17840 q, mp);
17841 }
17842
17843 if (err == 0) {
17844 if (ill_from_v4 != NULL && ill_to_v4 != NULL)
17845 ilm_send_multicast_reqs(ill_from_v4, ill_to_v4);
17846
17847 if (ill_from_v6 != NULL && ill_to_v6 != NULL)
17848 ilm_send_multicast_reqs(ill_from_v6, ill_to_v6);
17849 }
17850 done:
17851
17852 if (ill_to_v4 != NULL) {
17853 ill_refrele(ill_to_v4);
17854 }
17855 if (ill_to_v6 != NULL) {
17856 ill_refrele(ill_to_v6);
17857 }
17858
17859 return (err);
17860 }
17861
17862 static void
17863 ill_dl_down(ill_t *ill)
17864 {
17865 /*
17866 * The ill is down; unbind but stay attached since we're still
17867 * associated with a PPA. If we have negotiated DLPI capabilities
17868 * with the data link service provider (IDS_OK) then reset them.
17869 * The interval between unbinding and rebinding is potentially
17870 * unbounded, hence we cannot assume things will be the same.
17871 * The DLPI capabilities will be probed again when the data link
17872 * is brought up.
17873 */
17874 mblk_t *mp = ill->ill_unbind_mp;
17875 hook_nic_event_t *info;
17876
17877 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
17878
17879 ill->ill_unbind_mp = NULL;
17880 if (mp != NULL) {
17881 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
17882 dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
17883 ill->ill_name));
17884 mutex_enter(&ill->ill_lock);
17885 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
17886 mutex_exit(&ill->ill_lock);
17887 if (ill->ill_dlpi_capab_state == IDS_OK)
17888 ill_capability_reset(ill);
17889 ill_dlpi_send(ill, mp);
17890 }
17891
17892 /*
17893 * Toss all of our multicast memberships. We could keep them, but
17894 * then we'd have to do bookkeeping of any joins and leaves performed
17895 * by the application while the interface is down (we can't just
17896 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
17897 * on a downed interface).
17898 */
17899 ill_leave_multicast(ill);
17900
17901 mutex_enter(&ill->ill_lock);
17902
17903 ill->ill_dl_up = 0;
17904
17905 if ((info = ill->ill_nic_event_info) != NULL) {
17906 ip2dbg(("ill_dl_down: unexpected nic event %d attached for %s\n",
17907 info->hne_event, ill->ill_name));
17908 if (info->hne_data != NULL)
17909 kmem_free(info->hne_data, info->hne_datalen);
17910 kmem_free(info, sizeof (hook_nic_event_t));
17911 }
17912
17913 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
17914 if (info != NULL) {
17915 info->hne_nic = ill->ill_phyint->phyint_ifindex;
17916 info->hne_lif = 0;
17917 info->hne_event = NE_DOWN;
17918 info->hne_data = NULL;
17919 info->hne_datalen = 0;
17920 info->hne_family = ill->ill_isv6 ?
ipv6 : ipv4;
17921 } else
17922 ip2dbg(("ill_dl_down: could not attach DOWN nic event "
17923 "information for %s (ENOMEM)\n", ill->ill_name));
17924
17925 ill->ill_nic_event_info = info;
17926
17927 mutex_exit(&ill->ill_lock);
17928 }
17929
17930 void
17931 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
17932 {
17933 union DL_primitives *dlp;
17934 t_uscalar_t prim;
17935
17936 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
17937
17938 dlp = (union DL_primitives *)mp->b_rptr;
17939 prim = dlp->dl_primitive;
17940
17941 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
17942 dlpi_prim_str(prim), prim, ill->ill_name));
17943
17944 switch (prim) {
17945 case DL_PHYS_ADDR_REQ:
17946 {
17947 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
17948 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
17949 break;
17950 }
17951 case DL_BIND_REQ:
17952 mutex_enter(&ill->ill_lock);
17953 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
17954 mutex_exit(&ill->ill_lock);
17955 break;
17956 }
17957
17958 ill->ill_dlpi_pending = prim;
17959
17960 /*
17961 * Some drivers send M_FLUSH up to IP as part of an unbind
17962 * request. When this M_FLUSH is sent back to the driver,
17963 * it can go after we send the detach request if the
17964 * M_FLUSH ends up in IP's syncq. To avoid that, we reply
17965 * to the M_FLUSH in ip_rput and locally generate another
17966 * M_FLUSH for correctness. This will get freed in
17967 * ip_wput_nondata.
17968 */
17969 if (prim == DL_UNBIND_REQ)
17970 (void) putnextctl1(ill->ill_rq, M_FLUSH, FLUSHRW);
17971
17972 putnext(ill->ill_wq, mp);
17973 }
17974
17975 /*
17976 * Send a DLPI control message to the driver but make sure there
17977 * is only one outstanding message. Uses ill_dlpi_pending to tell
17978 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
17979 * when an ACK or a NAK is received to process the next queued message.
17980 *
17981 * We don't protect ill_dlpi_pending with any lock. This is okay because,
17982 * everywhere it is accessed, IP is exclusive while accessing
17983 * ill_dlpi_pending, except when this function is called from ill_init().
17984 */
17985 void
17986 ill_dlpi_send(ill_t *ill, mblk_t *mp)
17987 {
17988 mblk_t **mpp;
17989
17990 ASSERT(IAM_WRITER_ILL(ill));
17991 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
17992
17993 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
17994 /* Must queue message. Tail insertion */
17995 mpp = &ill->ill_dlpi_deferred;
17996 while (*mpp != NULL)
17997 mpp = &((*mpp)->b_next);
17998
17999 ip1dbg(("ill_dlpi_send: deferring request for %s\n",
18000 ill->ill_name));
18001
18002 *mpp = mp;
18003 return;
18004 }
18005
18006 ill_dlpi_dispatch(ill, mp);
18007 }
18008
18009 /*
18010 * Called when a DLPI control message has been acked or nacked to
18011 * send down the next queued message (if any).
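* The resulting single-outstanding-message protocol, sketched
* (writer context assumed throughout; mp1, mp2 and prim1 are
* illustrative):
*
*	ill_dlpi_send(ill, mp1);	- dispatched; ill_dlpi_pending set
*	ill_dlpi_send(ill, mp2);	- queued on ill_dlpi_deferred
*	... DL_OK_ACK or DL_ERROR_ACK for mp1 arrives ...
*	ill_dlpi_done(ill, prim1);	- mp1 done; mp2 now dispatched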
18012 */
18013 void
18014 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
18015 {
18016 mblk_t *mp;
18017
18018 ASSERT(IAM_WRITER_ILL(ill));
18019
18020 ASSERT(prim != DL_PRIM_INVAL);
18021 if (ill->ill_dlpi_pending != prim) {
18022 if (ill->ill_dlpi_pending == DL_PRIM_INVAL) {
18023 (void) mi_strlog(ill->ill_rq, 1,
18024 SL_CONSOLE|SL_ERROR|SL_TRACE,
18025 "ill_dlpi_done: unsolicited ack for %s from %s\n",
18026 dlpi_prim_str(prim), ill->ill_name);
18027 } else {
18028 (void) mi_strlog(ill->ill_rq, 1,
18029 SL_CONSOLE|SL_ERROR|SL_TRACE,
18030 "ill_dlpi_done: unexpected ack for %s from %s "
18031 "(expecting ack for %s)\n",
18032 dlpi_prim_str(prim), ill->ill_name,
18033 dlpi_prim_str(ill->ill_dlpi_pending));
18034 }
18035 return;
18036 }
18037
18038 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
18039 dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
18040
18041 if ((mp = ill->ill_dlpi_deferred) == NULL) {
18042 ill->ill_dlpi_pending = DL_PRIM_INVAL;
18043 return;
18044 }
18045
18046 ill->ill_dlpi_deferred = mp->b_next;
18047 mp->b_next = NULL;
18048
18049 ill_dlpi_dispatch(ill, mp);
18050 }
18051
18052 void
18053 conn_delete_ire(conn_t *connp, caddr_t arg)
18054 {
18055 ipif_t *ipif = (ipif_t *)arg;
18056 ire_t *ire;
18057
18058 /*
18059 * Look at the cached ires on conns which have pointers to ipifs.
18060 * We just call ire_refrele, which clears up the reference
18061 * to the ire. Called when a conn closes. Also called from ipif_free
18062 * to clean up indirect references to the stale ipif via the cached ire.
18063 */
18064 mutex_enter(&connp->conn_lock);
18065 ire = connp->conn_ire_cache;
18066 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
18067 connp->conn_ire_cache = NULL;
18068 mutex_exit(&connp->conn_lock);
18069 IRE_REFRELE_NOTR(ire);
18070 return;
18071 }
18072 mutex_exit(&connp->conn_lock);
18073
18074 }
18075
18076 /*
18077 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number
18078 * of IREs. Those IREs may have been previously cached in the conn structure.
18079 * This ipcl_walk() walker function releases all references to such IREs based
18080 * on the condemned flag.
18081 */
18082 /* ARGSUSED */
18083 void
18084 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
18085 {
18086 ire_t *ire;
18087
18088 mutex_enter(&connp->conn_lock);
18089 ire = connp->conn_ire_cache;
18090 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
18091 connp->conn_ire_cache = NULL;
18092 mutex_exit(&connp->conn_lock);
18093 IRE_REFRELE_NOTR(ire);
18094 return;
18095 }
18096 mutex_exit(&connp->conn_lock);
18097 }
18098
18099 /*
18100 * Take down a specific interface, but don't lose any information about it.
18101 * Also delete the interface from its interface group (ifgrp).
18102 * (Always called as writer.)
18103 * This function goes through the down sequence even if the interface is
18104 * already down. There are 2 reasons.
18105 * a. Currently we permit interface routes that depend on down interfaces
18106 * to be added. This behaviour itself is questionable. However, it appears
18107 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
18108 * time. We go thru the cleanup in order to remove these routes.
18109 * b. The bringup of the interface could fail in ill_dl_up, i.e. we get
18110 * a DL_ERROR_ACK in response to the DL_BIND request. The interface is
18111 * down, but we need to clean up, i.e. do ill_dl_down and
18112 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
18113 *
18114 * IP-MT notes:
18115 *
18116 * Model of reference to interfaces.
18117 *
18118 * The following members in ipif_t track references to the ipif.
18119 * int ipif_refcnt; Active reference count
18120 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif
18121 * The following members in ill_t track references to the ill.
18122 * int ill_refcnt; active refcnt
18123 * uint_t ill_ire_cnt; Number of ires referencing ill
18124 * uint_t ill_nce_cnt; Number of nces referencing ill
18125 *
18126 * Reference to an ipif or ill can be obtained in any of the following ways.
18127 *
18128 * Through the ipif_lookup_* / ill_lookup_* lookup functions
18129 * Pointers to ipif / ill from other data structures viz ire and conn.
18130 * Implicit reference to the ipif / ill by holding a reference to the ire.
18131 *
18132 * The ipif/ill lookup functions return a reference-held ipif / ill.
18133 * ipif_refcnt and ill_refcnt track the reference counts respectively.
18134 * This is a purely dynamic reference count associated with threads holding
18135 * references to the ipif / ill. Pointers from other structures do not
18136 * count towards this reference count.
18137 *
18138 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the
18139 * ipif/ill. This is incremented whenever a new ire is created referencing the
18140 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is
18141 * actually added to the ire hash table. The count is decremented in
18142 * ire_inactive where the ire is destroyed.
18143 *
18144 * nce's reference ill's thru nce_ill and the count of nce's associated with
18145 * an ill is recorded in ill_nce_cnt. This is incremented atomically in
18146 * ndp_add() where the nce is actually added to the table. Similarly it is
18147 * decremented in ndp_inactive where the nce is destroyed.
18148 *
18149 * Flow of ioctls involving interface down/up
18150 *
18151 * The following is the sequence of an attempt to set some critical flags on an
18152 * up interface.
18153 * ip_sioctl_flags
18154 * ipif_down
18155 * wait for ipif to be quiescent
18156 * ipif_down_tail
18157 * ip_sioctl_flags_tail
18158 *
18159 * All set ioctls that involve a down/up sequence have a skeleton similar
18160 * to the above. All the *tail functions are called after the refcounts have
18161 * dropped to the appropriate values.
18162 *
18163 * The mechanism to quiesce an ipif is as follows.
18164 *
18165 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
18166 * on the ipif. Callers either pass a flag requesting wait or the lookup
18167 * functions will return NULL.
18168 *
18169 * Delete all ires referencing this ipif
18170 *
18171 * Any thread attempting to do an ipif_refhold on an ipif that has been
18172 * obtained thru a cached pointer will first make sure that
18173 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
18174 * increment the refcount.
18175 *
18176 * The above guarantees that the ipif refcount will eventually come down to
18177 * zero and the ipif will quiesce, once all threads that currently hold a
18178 * reference to the ipif refrelease the ipif. The ipif is quiescent after the
18179 * ipif_refcount has dropped to zero and all ire's associated with this ipif
18180 * have also been ire_inactive'd, i.e. when ipif_ire_cnt and ipif_refcnt both
18181 * drop to zero.
18182 *
18183 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
18184 *
18185 * Threads trying to look up an ipif or ill can pass a flag requesting
18186 * wait and restart if the ipif / ill cannot be looked up currently.
18187 * For example, bind and route operations (e.g. route add / delete) cannot
18188 * return failure if the ipif is currently undergoing an exclusive operation,
18189 * and hence pass the flag. The mblk is then enqueued in the ipsq and the
18190 * operation is restarted by ipsq_exit() when the currently exclusive ioctl
18191 * completes. The lookup and enqueue is atomic using the ill_lock and
18192 * ipsq_lock. The lookup is done holding the ill_lock. Hence the ill/ipif
18193 * state flags can't change while the ill_lock is held. Before dropping the
18194 * ill_lock we acquire the ipsq_lock and call ipsq_enq. This ensures that
18195 * ipsq_exit can't finish until we release the ipsq_lock, even though the
18196 * ill/ipif state flags can change after we drop the ill_lock.
18197 *
18198 * An attempt to send out a packet using an ipif that is currently
18199 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
18200 * operation and restart it later when the exclusive condition on the ipif ends.
18201 * This is an example of not passing the wait flag to the lookup functions. For
18202 * example, an attempt to refhold and use conn->conn_multicast_ipif and send
18203 * out a multicast packet on that ipif will fail while the ipif is
18204 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
18205 * currently IPIF_CHANGING will also fail.
18206 */
18207 int
18208 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
18209 {
18210 ill_t *ill = ipif->ipif_ill;
18211 phyint_t *phyi;
18212 conn_t *connp;
18213 boolean_t success;
18214 boolean_t ipif_was_up = B_FALSE;
18215
18216 ASSERT(IAM_WRITER_IPIF(ipif));
18217
18218 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
18219
18220 if (ipif->ipif_flags & IPIF_UP) {
18221 mutex_enter(&ill->ill_lock);
18222 ipif->ipif_flags &= ~IPIF_UP;
18223 ASSERT(ill->ill_ipif_up_count > 0);
18224 --ill->ill_ipif_up_count;
18225 mutex_exit(&ill->ill_lock);
18226 ipif_was_up = B_TRUE;
18227 /* Update status in SCTP's list */
18228 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
18229 }
18230
18231 /*
18232 * Blow away v6 memberships we established in ipif_multicast_up(); the
18233 * v4 ones are left alone (as is the ipif_multicast_up flag, so we
18234 * know not to rejoin when the interface is brought back up).
18235 */
18236 if (ipif->ipif_isv6)
18237 ipif_multicast_down(ipif);
18238 /*
18239 * Remove from the mapping for __sin6_src_id. We insert only
18240 * when the address is not INADDR_ANY. As IPv4 addresses are
18241 * stored as mapped addresses, we need to check for mapped
18242 * INADDR_ANY also.
18243 */
18244 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
18245 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
18246 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
18247 int err;
18248
18249 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
18250 ipif->ipif_zoneid);
18251 if (err != 0) {
18252 ip0dbg(("ipif_down: srcid_remove %d\n", err));
18253 }
18254 }
18255
18256 /*
18257 * Before we delete the ill from the group (if any), we need
18258 * to make sure that we delete all the routes dependent on
18259 * this and also any ipifs dependent on this ipif for
18260 * source address. We need to do this before we delete from
18261 * the group because
18262 *
18263 * 1) ipif_down_delete_ire de-references ill->ill_group.
18264 *
18265 * 2) ipif_update_other_ipifs needs to walk the whole group
18266 * for re-doing source address selection. Note that
18267 * ipif_select_source[_v6] called from
18268 * ipif_update_other_ipifs[_v6] will not pick this ipif
18269 * because we have already marked it down here, i.e. cleared
18270 * IPIF_UP.
18271 */
18272 if (ipif->ipif_isv6)
18273 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);
18274 else
18275 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES);
18276
18277 /*
18278 * These also need to be saved and restored when the
18279 * ipif is brought down and up.
18280 */
18281 mutex_enter(&ire_mrtun_lock);
18282 if (ire_mrtun_count != 0) {
18283 mutex_exit(&ire_mrtun_lock);
18284 ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire,
18285 (char *)ipif, NULL);
18286 } else {
18287 mutex_exit(&ire_mrtun_lock);
18288 }
18289
18290 mutex_enter(&ire_srcif_table_lock);
18291 if (ire_srcif_table_count > 0) {
18292 mutex_exit(&ire_srcif_table_lock);
18293 ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif);
18294 } else {
18295 mutex_exit(&ire_srcif_table_lock);
18296 }
18297
18298 /*
18299 * Cleaning up the conn_ire_cache of conns must be done only after the
18300 * ires have been deleted above. Otherwise a thread could end up
18301 * caching an ire in a conn after we have finished the cleanup of the
18302 * conn. The caching is done after making sure that the ire is not yet
18303 * condemned. Also documented in the block comment above ip_output.
18304 */
18305 ipcl_walk(conn_cleanup_stale_ire, NULL);
18306 /* Also, delete the ires cached in SCTP */
18307 sctp_ire_cache_flush(ipif);
18308
18309 /* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */
18310 nattymod_clean_ipif(ipif);
18311
18312 /*
18313 * Update any other ipifs which have used "our" local address as
18314 * a source address. This entails removing and recreating IRE_INTERFACE
18315 * entries for such ipifs.
18316 */
18317 if (ipif->ipif_isv6)
18318 ipif_update_other_ipifs_v6(ipif, ill->ill_group);
18319 else
18320 ipif_update_other_ipifs(ipif, ill->ill_group);
18321
18322 if (ipif_was_up) {
18323 /*
18324 * Check whether it is the last ipif to leave this group.
18325 * If this is the last ipif to leave, we should remove
18326 * this ill from the group as ipif_select_source will not
18327 * be able to find any useful ipifs if this ill is selected
18328 * for load balancing.
18329 *
18330 * For nameless groups, we should call illgrp_delete if this
18331 * belongs to some group. As this ipif is going down, we may
18332 * need to reconstruct groups.
18333 */
18334 phyi = ill->ill_phyint;
18335 /*
18336 * If the phyint_groupname_len is 0, it may or may not
18337 * be in the nameless group. If the phyint_groupname_len is
18338 * not 0, then this ill should be part of some group.
18339 * As we always insert this ill in the group if
18340 * phyint_groupname_len is not zero when the first ipif
18341 * comes up (in ipif_up_done), it should be in a group
18342 * when the namelen is not 0.
18343 *
18344 * NOTE : When we delete the ill from the group, it will
18345 * blow away all the IRE_CACHES pointing either at this ipif or
18346 * ill_wq (illgrp_cache_delete does this). Thus, no IREs
18347 * should be pointing at this ill.
18348 */
18349 ASSERT(phyi->phyint_groupname_len == 0 ||
18350 (phyi->phyint_groupname != NULL && ill->ill_group != NULL));
18351
18352 if (phyi->phyint_groupname_len != 0) {
18353 if (ill->ill_ipif_up_count == 0)
18354 illgrp_delete(ill);
18355 }
18356
18357 /*
18358 * If we have deleted some of the broadcast ires associated
18359 * with this ipif, we need to re-nominate somebody else if
18360 * the ires that we deleted were the nominated ones.
18361 */
18362 if (ill->ill_group != NULL && !ill->ill_isv6)
18363 ipif_renominate_bcast(ipif);
18364 }
18365
18366 /*
18367 * Bring down the neighbor-discovery or arp entries for this interface.
18368 */
18369 ipif_ndp_down(ipif);
18370
18371 /*
18372 * If mp is NULL the caller will wait for the appropriate refcnt.
18373 * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down
18374 * and ill_delete -> ipif_free -> ipif_down
18375 */
18376 if (mp == NULL) {
18377 ASSERT(q == NULL);
18378 return (0);
18379 }
18380
18381 if (CONN_Q(q)) {
18382 connp = Q_TO_CONN(q);
18383 mutex_enter(&connp->conn_lock);
18384 } else {
18385 connp = NULL;
18386 }
18387 mutex_enter(&ill->ill_lock);
18388 /*
18389 * Are there any ire's pointing to this ipif that are still active?
18390 * If this is the last ipif going down, are there any ire's pointing
18391 * to this ill that are still active?
18392 */
18393 if (ipif_is_quiescent(ipif)) {
18394 mutex_exit(&ill->ill_lock);
18395 if (connp != NULL)
18396 mutex_exit(&connp->conn_lock);
18397 return (0);
18398 }
18399
18400 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
18401 ill->ill_name, (void *)ill));
18402 /*
18403 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
18404 * drops down, the operation will be restarted by ipif_ill_refrele_tail
18405 * which in turn is called by the last refrele on the ipif/ill/ire.
18406 */
18407 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
18408 if (!success) {
18409 /* The conn is closing. So just return */
18410 ASSERT(connp != NULL);
18411 mutex_exit(&ill->ill_lock);
18412 mutex_exit(&connp->conn_lock);
18413 return (EINTR);
18414 }
18415
18416 mutex_exit(&ill->ill_lock);
18417 if (connp != NULL)
18418 mutex_exit(&connp->conn_lock);
18419 return (EINPROGRESS);
18420 }
18421
18422 void
18423 ipif_down_tail(ipif_t *ipif)
18424 {
18425 ill_t *ill = ipif->ipif_ill;
18426
18427 /*
18428 * Skip any loopback interface (null wq).
18429 * If this is the last logical interface on the ill,
18430 * have ill_dl_down tell the driver we are gone (unbind).
18431 * Note that lun 0 can ipif_down even though
18432 * there are other logical units that are up.
18433 * This occurs e.g. when we change a "significant" IFF_ flag.
18434 */
18435 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
18436 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
18437 ill->ill_dl_up) {
18438 ill_dl_down(ill);
18439 }
18440 ill->ill_logical_down = 0;
18441
18442 /*
18443 * This has to be after removing the routes in ipif_down_delete_ire.
18444 */
18445 if (ipif->ipif_isv6) {
18446 if (ill->ill_flags & ILLF_XRESOLV)
18447 ipif_arp_down(ipif);
18448 } else {
18449 ipif_arp_down(ipif);
18450 }
18451
18452 ip_rts_ifmsg(ipif);
18453 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif);
18454 }
18455
18456 /*
18457 * Bring the interface logically down without bringing the physical interface
18458 * down, e.g. when the netmask is changed. This avoids long-lasting link
18459 * negotiations between an ethernet interface and certain switches.
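* For example, a netmask change proceeds roughly as follows (an
* illustrative summary; the netmask ioctl is one caller of
* ipif_logical_down):
*	ip_sioctl_netmask -> ipif_logical_down -> ipif_down
*	    (ill_logical_down is set, so ipif_down_tail skips
*	    ill_dl_down and the stream stays bound to the driver)
*	ip_sioctl_netmask_tail -> the ipif is brought back up with
*	    the new netmask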
18460 */
18461 static int
18462 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
18463 {
18464 /*
18465 * The ill_logical_down flag is a transient flag. It is set here
18466 * and is cleared once the down has completed in ipif_down_tail.
18467 * This flag does not indicate whether the ill stream is in the
18468 * DL_BOUND state with the driver. Instead this flag is used by
18469 * ipif_down_tail to determine whether to DL_UNBIND the stream with
18470 * the driver. The state of the ill stream, i.e. whether it is
18471 * DL_BOUND with the driver or not, is indicated by the ill_dl_up flag.
18472 */
18473 ipif->ipif_ill->ill_logical_down = 1;
18474 return (ipif_down(ipif, q, mp));
18475 }
18476
18477 /*
18478 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP.
18479 * Whether or not the usesrc client ILL is already part of a usesrc group,
18480 * an ire_stq with the matching usesrc client ILL will locate the
18481 * IREs that need to be deleted. We want IREs to be created
18482 * with the new source address.
18483 */
18484 static void
18485 ipif_delete_cache_ire(ire_t *ire, char *ill_arg)
18486 {
18487 ill_t *ucill = (ill_t *)ill_arg;
18488
18489 ASSERT(IAM_WRITER_ILL(ucill));
18490
18491 if (ire->ire_stq == NULL)
18492 return;
18493
18494 if ((ire->ire_type == IRE_CACHE) &&
18495 ((ill_t *)ire->ire_stq->q_ptr == ucill))
18496 ire_delete(ire);
18497 }
18498
18499 /*
18500 * ire_walk routine to delete every IRE dependent on the interface
18501 * address that is going down. (Always called as writer.)
18502 * Works for both v4 and v6.
18503 * In addition to checking for ire_ipif matches, it also checks for
18504 * IRE_CACHE entries which have the same source address as the
18505 * disappearing ipif, since ipif_select_source might have picked
18506 * that source. Note that ipif_down/ipif_update_other_ipifs takes
18507 * care of any IRE_INTERFACE with the disappearing source address.
18508 */
18509 static void
18510 ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
18511 {
18512 ipif_t *ipif = (ipif_t *)ipif_arg;
18513 ill_t *ire_ill;
18514 ill_t *ipif_ill;
18515
18516 ASSERT(IAM_WRITER_IPIF(ipif));
18517 if (ire->ire_ipif == NULL)
18518 return;
18519
18520 /*
18521 * For IPv4, we derive source addresses for an IRE from ipif's
18522 * belonging to the same IPMP group as the IRE's outgoing
18523 * interface. If an IRE's outgoing interface isn't in the
18524 * same IPMP group as a particular ipif, then that ipif
18525 * couldn't have been used as a source address for this IRE.
18526 *
18527 * For IPv6, source addresses are only restricted to the IPMP group
18528 * if the IRE is for a link-local address or a multicast address.
18529 * Otherwise, source addresses for an IRE can be chosen from
18530 * interfaces other than the outgoing interface for that IRE.
18531 *
18532 * For source address selection details, see ipif_select_source()
18533 * and ipif_select_source_v6().
18534 */
18535 if (ire->ire_ipversion == IPV4_VERSION ||
18536 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) ||
18537 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
18538 ire_ill = ire->ire_ipif->ipif_ill;
18539 ipif_ill = ipif->ipif_ill;
18540
18541 if (ire_ill->ill_group != ipif_ill->ill_group) {
18542 return;
18543 }
18544 }
18545
18546
18547 if (ire->ire_ipif != ipif) {
18548 /*
18549 * Look for a matching source address.
18550 */ 18551 if (ire->ire_type != IRE_CACHE) 18552 return; 18553 if (ipif->ipif_flags & IPIF_NOLOCAL) 18554 return; 18555 18556 if (ire->ire_ipversion == IPV4_VERSION) { 18557 if (ire->ire_src_addr != ipif->ipif_src_addr) 18558 return; 18559 } else { 18560 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 18561 &ipif->ipif_v6lcl_addr)) 18562 return; 18563 } 18564 ire_delete(ire); 18565 return; 18566 } 18567 /* 18568 * ire_delete() will do an ire_flush_cache which will delete 18569 * all ire_ipif matches 18570 */ 18571 ire_delete(ire); 18572 } 18573 18574 /* 18575 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 18576 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 18577 * 2) when an interface is brought up or down (on that ill). 18578 * This ensures that the IRE_CACHE entries don't retain stale source 18579 * address selection results. 18580 */ 18581 void 18582 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 18583 { 18584 ill_t *ill = (ill_t *)ill_arg; 18585 ill_t *ipif_ill; 18586 18587 ASSERT(IAM_WRITER_ILL(ill)); 18588 /* 18589 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18590 * Hence this should be IRE_CACHE. 18591 */ 18592 ASSERT(ire->ire_type == IRE_CACHE); 18593 18594 /* 18595 * We are called for IRE_CACHEs whose ire_ipif matches ill. 18596 * We are only interested in IRE_CACHEs that have borrowed 18597 * the source address from ill_arg, e.g. via ipif_up_done[_v6], 18598 * for which we need to check the ire_ipif->ipif_ill match 18599 * with ill. 18600 */ 18601 ASSERT(ire->ire_ipif != NULL); 18602 ipif_ill = ire->ire_ipif->ipif_ill; 18603 if (ipif_ill == ill || (ill->ill_group != NULL && 18604 ipif_ill->ill_group == ill->ill_group)) { 18605 ire_delete(ire); 18606 } 18607 } 18608 18609 /* 18610 * Delete all the IREs whose stq references ill_arg. 18611 */ 18612 static void 18613 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 18614 { 18615 ill_t *ill = (ill_t *)ill_arg; 18616 ill_t *ire_ill; 18617 18618 ASSERT(IAM_WRITER_ILL(ill)); 18619 /* 18620 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18621 * Hence this should be IRE_CACHE. 18622 */ 18623 ASSERT(ire->ire_type == IRE_CACHE); 18624 18625 /* 18626 * We are called for IRE_CACHEs whose ire_stq and ire_ipif 18627 * match ill. We are only interested in IRE_CACHEs that 18628 * have ire_stq->q_ptr pointing at ill_arg. Thus we do the 18629 * filtering here. 18630 */ 18631 ire_ill = (ill_t *)ire->ire_stq->q_ptr; 18632 18633 if (ire_ill == ill) 18634 ire_delete(ire); 18635 } 18636 18637 /* 18638 * This is called when an ill leaves the group. We want to delete 18639 * all IRE_CACHEs whose stq is pointing at ill_wq or whose ire_ipif is 18640 * pointing at ill. 18641 */ 18642 static void 18643 illgrp_cache_delete(ire_t *ire, char *ill_arg) 18644 { 18645 ill_t *ill = (ill_t *)ill_arg; 18646 18647 ASSERT(IAM_WRITER_ILL(ill)); 18648 ASSERT(ill->ill_group == NULL); 18649 /* 18650 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18651 * Hence this should be IRE_CACHE. 18652 */ 18653 ASSERT(ire->ire_type == IRE_CACHE); 18654 /* 18655 * We are called for IRE_CACHEs whose ire_stq and ire_ipif 18656 * match ill. We are interested in both. 18657 */ 18658 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || 18659 (ire->ire_ipif->ipif_ill == ill)); 18660 18661 ire_delete(ire); 18662 } 18663 18664 /* 18665 * Initiate deallocation of an IPIF. Always called as writer. Called by 18666 * ill_delete or ip_sioctl_removeif.
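 * * An illustrative teardown sequence (an editor's sketch derived from the refcnt comments in ipif_down above, not an addition to the interface contract): * * ip_sioctl_removeif() or ill_delete() * -> ipif_free() - initiates the down, calls ipif_down() * -> ipif_free_tail() - runs once the ipif is quiescent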
18667 */ 18668 static void 18669 ipif_free(ipif_t *ipif) 18670 { 18671 ASSERT(IAM_WRITER_IPIF(ipif)); 18672 18673 if (ipif->ipif_recovery_id != 0) 18674 (void) untimeout(ipif->ipif_recovery_id); 18675 ipif->ipif_recovery_id = 0; 18676 18677 /* Remove conn references */ 18678 reset_conn_ipif(ipif); 18679 18680 /* 18681 * Make sure we have valid net and subnet broadcast ire's for the 18682 * other ipif's which share them with this ipif. 18683 */ 18684 if (!ipif->ipif_isv6) 18685 ipif_check_bcast_ires(ipif); 18686 18687 /* 18688 * Take down the interface. We can be called either from ill_delete 18689 * or from ip_sioctl_removeif. 18690 */ 18691 (void) ipif_down(ipif, NULL, NULL); 18692 18693 rw_enter(&ill_g_lock, RW_WRITER); 18694 /* Remove pointers to this ill in the multicast routing tables */ 18695 reset_mrt_vif_ipif(ipif); 18696 rw_exit(&ill_g_lock); 18697 } 18698 18699 static void 18700 ipif_free_tail(ipif_t *ipif) 18701 { 18702 mblk_t *mp; 18703 ipif_t **ipifp; 18704 18705 /* 18706 * Free state for additional IRE_IF_[NO]RESOLVER ire's. 18707 */ 18708 mutex_enter(&ipif->ipif_saved_ire_lock); 18709 mp = ipif->ipif_saved_ire_mp; 18710 ipif->ipif_saved_ire_mp = NULL; 18711 mutex_exit(&ipif->ipif_saved_ire_lock); 18712 freemsg(mp); 18713 18714 /* 18715 * Need to hold both ill_g_lock and ill_lock while 18716 * inserting or removing an ipif from the linked list 18717 * of ipifs hanging off the ill. 18718 */ 18719 rw_enter(&ill_g_lock, RW_WRITER); 18720 /* 18721 * Remove all multicast memberships on the interface now. 18722 * This removes IPv4 multicast memberships joined within 18723 * the kernel as ipif_down does not do ipif_multicast_down 18724 * for IPv4. IPv6 is not handled here as the multicast memberships 18725 * are based on ill and not on ipif. 18726 */ 18727 ilm_free(ipif); 18728 18729 /* 18730 * Since we held the ill_g_lock while doing the ilm_free above, 18731 * we can assert the ilms were really deleted and not just marked 18732 * ILM_DELETED. 18733 */ 18734 ASSERT(ilm_walk_ipif(ipif) == 0); 18735 18736 18737 IPIF_TRACE_CLEANUP(ipif); 18738 18739 /* Ask SCTP to take it out of its list */ 18740 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 18741 18742 mutex_enter(&ipif->ipif_ill->ill_lock); 18743 /* Get it out of the ILL interface list. */ 18744 ipifp = &ipif->ipif_ill->ill_ipif; 18745 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 18746 if (*ipifp == ipif) { 18747 *ipifp = ipif->ipif_next; 18748 break; 18749 } 18750 } 18751 18752 mutex_exit(&ipif->ipif_ill->ill_lock); 18753 rw_exit(&ill_g_lock); 18754 18755 mutex_destroy(&ipif->ipif_saved_ire_lock); 18756 18757 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 18758 18759 /* Free the memory. */ 18760 mi_free((char *)ipif); 18761 } 18762 18763 /* 18764 * Returns an ipif name in the form "ill_name:unit" if ipif_id is not zero, 18765 * "ill_name" otherwise. 18766 */ 18767 char * 18768 ipif_get_name(const ipif_t *ipif, char *buf, int len) 18769 { 18770 char lbuf[32]; 18771 char *name; 18772 size_t name_len; 18773 18774 buf[0] = '\0'; 18775 if (!ipif) 18776 return (buf); 18777 name = ipif->ipif_ill->ill_name; 18778 name_len = ipif->ipif_ill->ill_name_length; 18779 if (ipif->ipif_id != 0) { 18780 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 18781 ipif->ipif_id); 18782 name = lbuf; 18783 name_len = mi_strlen(name) + 1; 18784 } 18785 len -= 1; 18786 buf[len] = '\0'; 18787 len = MIN(len, name_len); 18788 bcopy(name, buf, len); 18789 return (buf); 18790 } 18791 18792 /* 18793 * Find an IPIF based on the name passed in.
Names can be of the 18794 * form <phys> (e.g., le0), or <phys>:<#> (e.g., le0:1). 18795 * The <phys> string can have forms like <dev><#> (e.g., le0), 18796 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3). 18797 * When there is no colon, the implied unit id is zero. <phys> must 18798 * correspond to the name of an ILL. (May be called as writer.) 18799 */ 18800 static ipif_t * 18801 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 18802 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 18803 mblk_t *mp, ipsq_func_t func, int *error) 18804 { 18805 char *cp; 18806 char *endp; 18807 long id; 18808 ill_t *ill; 18809 ipif_t *ipif; 18810 uint_t ire_type; 18811 boolean_t did_alloc = B_FALSE; 18812 ipsq_t *ipsq; 18813 18814 if (error != NULL) 18815 *error = 0; 18816 18817 /* 18818 * If the caller wants us to create the ipif, make sure we have a 18819 * valid zoneid 18820 */ 18821 ASSERT(!do_alloc || zoneid != ALL_ZONES); 18822 18823 if (namelen == 0) { 18824 if (error != NULL) 18825 *error = ENXIO; 18826 return (NULL); 18827 } 18828 18829 *exists = B_FALSE; 18830 /* Look for a colon in the name. */ 18831 endp = &name[namelen]; 18832 for (cp = endp; --cp > name; ) { 18833 if (*cp == IPIF_SEPARATOR_CHAR) 18834 break; 18835 } 18836 18837 if (*cp == IPIF_SEPARATOR_CHAR) { 18838 /* 18839 * Reject any non-decimal aliases for logical 18840 * interfaces. Aliases with leading zeroes 18841 * are also rejected as they introduce ambiguity 18842 * in the naming of the interfaces. 18843 * In order to conform to existing semantics, 18844 * and to not break any programs/scripts relying 18845 * on that behaviour, if<0>:0 is considered to be 18846 * a valid interface. 18847 * 18848 * If the alias has two or more digits and the first 18849 * is zero, fail. 18850 */ 18851 if (&cp[2] < endp && cp[1] == '0') 18852 return (NULL); 18853 } 18854 18855 if (cp <= name) { 18856 cp = endp; 18857 } else { 18858 *cp = '\0'; 18859 } 18860 18861 /* 18862 * Look up the ILL, based on the portion of the name 18863 * before the colon. ill_lookup_on_name returns a held ill. 18864 * did_alloc is used to check whether the ill already exists; if so, 18865 * ill_lookup_on_name will clear it. 18866 */ 18867 ill = ill_lookup_on_name(name, do_alloc, isv6, 18868 q, mp, func, error, &did_alloc); 18869 if (cp != endp) 18870 *cp = IPIF_SEPARATOR_CHAR; 18871 if (ill == NULL) 18872 return (NULL); 18873 18874 /* Establish the unit number in the name. */ 18875 id = 0; 18876 if (cp < endp && *endp == '\0') { 18877 /* If there was a colon, the unit number follows. */ 18878 cp++; 18879 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 18880 ill_refrele(ill); 18881 if (error != NULL) 18882 *error = ENXIO; 18883 return (NULL); 18884 } 18885 } 18886 18887 GRAB_CONN_LOCK(q); 18888 mutex_enter(&ill->ill_lock); 18889 /* Now see if there is an IPIF with this unit number.
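 * * For illustration (examples derived from the naming rules described in the block comment of this function): * * "le0" -> ill "le0", unit 0 (no colon, implied id 0) * "le0:1" -> ill "le0", unit 1 * "le0:0" -> ill "le0", unit 0 (explicitly valid) * "le0:01" -> rejected, leading zero in the alias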
*/ 18890 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 18891 if (ipif->ipif_id == id) { 18892 if (zoneid != ALL_ZONES && 18893 zoneid != ipif->ipif_zoneid && 18894 ipif->ipif_zoneid != ALL_ZONES) { 18895 mutex_exit(&ill->ill_lock); 18896 RELEASE_CONN_LOCK(q); 18897 ill_refrele(ill); 18898 if (error != NULL) 18899 *error = ENXIO; 18900 return (NULL); 18901 } 18902 /* 18903 * The block comment at the start of ipif_down 18904 * explains the use of the macros used below 18905 */ 18906 if (IPIF_CAN_LOOKUP(ipif)) { 18907 ipif_refhold_locked(ipif); 18908 mutex_exit(&ill->ill_lock); 18909 if (!did_alloc) 18910 *exists = B_TRUE; 18911 /* 18912 * Drop locks before calling ill_refrele 18913 * since it can potentially call into 18914 * ipif_ill_refrele_tail which can end up 18915 * in trying to acquire any lock. 18916 */ 18917 RELEASE_CONN_LOCK(q); 18918 ill_refrele(ill); 18919 return (ipif); 18920 } else if (IPIF_CAN_WAIT(ipif, q)) { 18921 ipsq = ill->ill_phyint->phyint_ipsq; 18922 mutex_enter(&ipsq->ipsq_lock); 18923 mutex_exit(&ill->ill_lock); 18924 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 18925 mutex_exit(&ipsq->ipsq_lock); 18926 RELEASE_CONN_LOCK(q); 18927 ill_refrele(ill); 18928 *error = EINPROGRESS; 18929 return (NULL); 18930 } 18931 } 18932 } 18933 RELEASE_CONN_LOCK(q); 18934 18935 if (!do_alloc) { 18936 mutex_exit(&ill->ill_lock); 18937 ill_refrele(ill); 18938 if (error != NULL) 18939 *error = ENXIO; 18940 return (NULL); 18941 } 18942 18943 /* 18944 * If none found, atomically allocate and return a new one. 18945 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 18946 * to support "receive only" use of lo0:1 etc. as is still done 18947 * below as an initial guess. 18948 * However, this is now likely to be overriden later in ipif_up_done() 18949 * when we know for sure what address has been configured on the 18950 * interface, since we might have more than one loopback interface 18951 * with a loopback address, e.g. in the case of zones, and all the 18952 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 18953 */ 18954 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 18955 ire_type = IRE_LOOPBACK; 18956 else 18957 ire_type = IRE_LOCAL; 18958 ipif = ipif_allocate(ill, id, ire_type, B_TRUE); 18959 if (ipif != NULL) 18960 ipif_refhold_locked(ipif); 18961 else if (error != NULL) 18962 *error = ENOMEM; 18963 mutex_exit(&ill->ill_lock); 18964 ill_refrele(ill); 18965 return (ipif); 18966 } 18967 18968 /* 18969 * This routine is called whenever a new address comes up on an ipif. If 18970 * we are configured to respond to address mask requests, then we are supposed 18971 * to broadcast an address mask reply at this time. This routine is also 18972 * called if we are already up, but a netmask change is made. This is legal 18973 * but might not make the system manager very popular. (May be called 18974 * as writer.) 
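 * * As a worked illustration (editorial, matching the code below): the reply is a single REPLY_LEN mblk laid out as * * [ IPv4 header (icmp_ipha, TTL ip_broadcast_ttl) ] * [ ICMP header, type ICMP_ADDRESS_MASK_REPLY ] * [ 4-byte netmask copied from ipif_net_mask ] * * and is sent from ipif_src_addr to the interface broadcast address ipif_brd_addr.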
18975 */ 18976 void 18977 ipif_mask_reply(ipif_t *ipif) 18978 { 18979 icmph_t *icmph; 18980 ipha_t *ipha; 18981 mblk_t *mp; 18982 18983 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 18984 18985 if (!ip_respond_to_address_mask_broadcast) 18986 return; 18987 18988 /* ICMP mask reply is IPv4 only */ 18989 ASSERT(!ipif->ipif_isv6); 18990 /* ICMP mask reply is not for a loopback interface */ 18991 ASSERT(ipif->ipif_ill->ill_wq != NULL); 18992 18993 mp = allocb(REPLY_LEN, BPRI_HI); 18994 if (mp == NULL) 18995 return; 18996 mp->b_wptr = mp->b_rptr + REPLY_LEN; 18997 18998 ipha = (ipha_t *)mp->b_rptr; 18999 bzero(ipha, REPLY_LEN); 19000 *ipha = icmp_ipha; 19001 ipha->ipha_ttl = ip_broadcast_ttl; 19002 ipha->ipha_src = ipif->ipif_src_addr; 19003 ipha->ipha_dst = ipif->ipif_brd_addr; 19004 ipha->ipha_length = htons(REPLY_LEN); 19005 ipha->ipha_ident = 0; 19006 19007 icmph = (icmph_t *)&ipha[1]; 19008 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 19009 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 19010 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 19011 if (icmph->icmph_checksum == 0) 19012 icmph->icmph_checksum = 0xffff; 19013 19014 put(ipif->ipif_wq, mp); 19015 19016 #undef REPLY_LEN 19017 } 19018 19019 /* 19020 * When the mtu in the ipif changes, we call this routine through ire_walk 19021 * to update all the relevant IREs. 19022 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19023 */ 19024 static void 19025 ipif_mtu_change(ire_t *ire, char *ipif_arg) 19026 { 19027 ipif_t *ipif = (ipif_t *)ipif_arg; 19028 19029 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 19030 return; 19031 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 19032 } 19033 19034 /* 19035 * When the mtu in the ill changes, we call this routine through ire_walk 19036 * to update all the relevant IREs. 19037 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19038 */ 19039 void 19040 ill_mtu_change(ire_t *ire, char *ill_arg) 19041 { 19042 ill_t *ill = (ill_t *)ill_arg; 19043 19044 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 19045 return; 19046 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 19047 } 19048 19049 /* 19050 * Join the ipif specific multicast groups. 19051 * Must be called after a mapping has been set up in the resolver. (Always 19052 * called as writer.) 19053 */ 19054 void 19055 ipif_multicast_up(ipif_t *ipif) 19056 { 19057 int err, index; 19058 ill_t *ill; 19059 19060 ASSERT(IAM_WRITER_IPIF(ipif)); 19061 19062 ill = ipif->ipif_ill; 19063 index = ill->ill_phyint->phyint_ifindex; 19064 19065 ip1dbg(("ipif_multicast_up\n")); 19066 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 19067 return; 19068 19069 if (ipif->ipif_isv6) { 19070 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 19071 return; 19072 19073 /* Join the all hosts multicast address */ 19074 ip1dbg(("ipif_multicast_up - addmulti\n")); 19075 /* 19076 * Passing B_TRUE means we have to join the multicast 19077 * membership on this interface even though this is 19078 * FAILED. If we join on a different one in the group, 19079 * we will not be able to delete the membership later 19080 * as we currently don't track where we join when we 19081 * join within the kernel unlike applications where 19082 * we have ilg/ilg_orig_index. See ip_addmulti_v6 19083 * for more on this. 
19084 */ 19085 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, 19086 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19087 if (err != 0) { 19088 ip0dbg(("ipif_multicast_up: " 19089 "all_hosts_mcast failed %d\n", 19090 err)); 19091 return; 19092 } 19093 /* 19094 * Enable multicast for the solicited node multicast address 19095 */ 19096 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19097 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19098 19099 ipv6_multi.s6_addr32[3] |= 19100 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19101 19102 err = ip_addmulti_v6(&ipv6_multi, ill, index, 19103 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, 19104 NULL); 19105 if (err != 0) { 19106 ip0dbg(("ipif_multicast_up: solicited MC" 19107 " failed %d\n", err)); 19108 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, 19109 ill, ill->ill_phyint->phyint_ifindex, 19110 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19111 return; 19112 } 19113 } 19114 } else { 19115 if (ipif->ipif_lcl_addr == INADDR_ANY) 19116 return; 19117 19118 /* Join the all hosts multicast address */ 19119 ip1dbg(("ipif_multicast_up - addmulti\n")); 19120 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 19121 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19122 if (err) { 19123 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 19124 return; 19125 } 19126 } 19127 ipif->ipif_multicast_up = 1; 19128 } 19129 19130 /* 19131 * Blow away any IPv6 multicast groups that we joined in ipif_multicast_up(); 19132 * any explicit memberships are blown away in ill_leave_multicast() when the 19133 * ill is brought down. 19134 */ 19135 static void 19136 ipif_multicast_down(ipif_t *ipif) 19137 { 19138 int err; 19139 19140 ASSERT(IAM_WRITER_IPIF(ipif)); 19141 19142 ip1dbg(("ipif_multicast_down\n")); 19143 if (!ipif->ipif_multicast_up) 19144 return; 19145 19146 ASSERT(ipif->ipif_isv6); 19147 19148 ip1dbg(("ipif_multicast_down - delmulti\n")); 19149 19150 /* 19151 * Leave the all hosts multicast address. Similar to ip_addmulti_v6, 19152 * we should look for ilms on this ill rather than the ones that have 19153 * been failed over here. They are here temporarily. As 19154 * ipif_multicast_up has joined on this ill, we should delete only 19155 * from this ill. 19156 */ 19157 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 19158 ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, 19159 B_TRUE, B_TRUE); 19160 if (err != 0) { 19161 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", 19162 err)); 19163 } 19164 /* 19165 * Disable multicast for the solicited node multicast address 19166 */ 19167 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19168 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19169 19170 ipv6_multi.s6_addr32[3] |= 19171 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19172 19173 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 19174 ipif->ipif_ill->ill_phyint->phyint_ifindex, 19175 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19176 19177 if (err != 0) { 19178 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 19179 err)); 19180 } 19181 } 19182 19183 ipif->ipif_multicast_up = 0; 19184 } 19185 19186 /* 19187 * Used when an interface comes up to recreate any extra routes on this 19188 * interface. 
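 * * A caller-side sketch (editorial, mirroring the use in ipif_up_done below): the returned array holds ipif_saved_ire_cnt entries, each refheld by ire_add, and is freed by the caller: * * irep = ipif_recover_ire(ipif); * ... ire_refrele() each non-NULL entry when done ... * kmem_free(irep, ipif->ipif_saved_ire_cnt * sizeof (ire_t *));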
19189 */ 19190 static ire_t ** 19191 ipif_recover_ire(ipif_t *ipif) 19192 { 19193 mblk_t *mp; 19194 ire_t **ipif_saved_irep; 19195 ire_t **irep; 19196 19197 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 19198 ipif->ipif_id)); 19199 19200 mutex_enter(&ipif->ipif_saved_ire_lock); 19201 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 19202 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 19203 if (ipif_saved_irep == NULL) { 19204 mutex_exit(&ipif->ipif_saved_ire_lock); 19205 return (NULL); 19206 } 19207 19208 irep = ipif_saved_irep; 19209 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 19210 ire_t *ire; 19211 queue_t *rfq; 19212 queue_t *stq; 19213 ifrt_t *ifrt; 19214 uchar_t *src_addr; 19215 uchar_t *gateway_addr; 19216 mblk_t *resolver_mp; 19217 ushort_t type; 19218 19219 /* 19220 * When the ire was initially created and then added in 19221 * ip_rt_add(), it was created either using ipif->ipif_net_type 19222 * in the case of a traditional interface route, or as one of 19223 * the IRE_OFFSUBNET types (with the exception of 19224 * IRE_HOST type ires, which are created by icmp_redirect() and 19225 * which we don't need to save or recover). In the case where 19226 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 19227 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 19228 * to satisfy software like GateD and Sun Cluster which create 19229 * routes using the loopback interface's address as a 19230 * gateway. 19231 * 19232 * As ifrt->ifrt_type reflects the already updated ire_type and 19233 * since ire_create() expects that IRE_IF_NORESOLVER will have 19234 * a valid nce_res_mp field (which doesn't make sense for an 19235 * IRE_LOOPBACK), ire_create() will be called in the same way 19236 * here as in ip_rt_add(), namely using ipif->ipif_net_type when 19237 * the route looks like a traditional interface route (where 19238 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 19239 * the saved ifrt->ifrt_type. This means that in the case where 19240 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 19241 * ire_create() will be an IRE_LOOPBACK; it will then be turned 19242 * into an IRE_IF_NORESOLVER and then added by ire_add(). 19243 */ 19244 ifrt = (ifrt_t *)mp->b_rptr; 19245 if (ifrt->ifrt_type & IRE_INTERFACE) { 19246 rfq = NULL; 19247 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 19248 ? ipif->ipif_rq : ipif->ipif_wq; 19249 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19250 ? (uint8_t *)&ifrt->ifrt_src_addr 19251 : (uint8_t *)&ipif->ipif_src_addr; 19252 gateway_addr = NULL; 19253 resolver_mp = ipif->ipif_resolver_mp; 19254 type = ipif->ipif_net_type; 19255 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 19256 /* Recover multiroute broadcast IRE. */ 19257 rfq = ipif->ipif_rq; 19258 stq = ipif->ipif_wq; 19259 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19260 ? (uint8_t *)&ifrt->ifrt_src_addr 19261 : (uint8_t *)&ipif->ipif_src_addr; 19262 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19263 resolver_mp = ipif->ipif_bcast_mp; 19264 type = ifrt->ifrt_type; 19265 } else { 19266 rfq = NULL; 19267 stq = NULL; 19268 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19269 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 19270 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19271 resolver_mp = NULL; 19272 type = ifrt->ifrt_type; 19273 } 19274 19275 /* 19276 * Create a copy of the IRE with the saved address and netmask.
19277 */ 19278 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 19279 "0x%x/0x%x\n", 19280 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 19281 ntohl(ifrt->ifrt_addr), 19282 ntohl(ifrt->ifrt_mask))); 19283 ire = ire_create( 19284 (uint8_t *)&ifrt->ifrt_addr, 19285 (uint8_t *)&ifrt->ifrt_mask, 19286 src_addr, 19287 gateway_addr, 19288 NULL, 19289 &ifrt->ifrt_max_frag, 19290 NULL, 19291 rfq, 19292 stq, 19293 type, 19294 resolver_mp, 19295 ipif, 19296 NULL, 19297 0, 19298 0, 19299 0, 19300 ifrt->ifrt_flags, 19301 &ifrt->ifrt_iulp_info, 19302 NULL, 19303 NULL); 19304 19305 if (ire == NULL) { 19306 mutex_exit(&ipif->ipif_saved_ire_lock); 19307 kmem_free(ipif_saved_irep, 19308 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 19309 return (NULL); 19310 } 19311 19312 /* 19313 * Some software (for example, GateD and Sun Cluster) attempts 19314 * to create (what amount to) IRE_PREFIX routes with the 19315 * loopback address as the gateway. This is primarily done to 19316 * set up prefixes with the RTF_REJECT flag set (for example, 19317 * when generating aggregate routes.) 19318 * 19319 * If the IRE type (as defined by ipif->ipif_net_type) is 19320 * IRE_LOOPBACK, then we map the request into an 19321 * IRE_IF_NORESOLVER. 19322 */ 19323 if (ipif->ipif_net_type == IRE_LOOPBACK) 19324 ire->ire_type = IRE_IF_NORESOLVER; 19325 /* 19326 * The ire is held by ire_add; it will be refrele'd towards 19327 * the end of ipif_up_done. 19328 */ 19329 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 19330 *irep = ire; 19331 irep++; 19332 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 19333 } 19334 mutex_exit(&ipif->ipif_saved_ire_lock); 19335 return (ipif_saved_irep); 19336 } 19337 19338 /* 19339 * Used to set the netmask and broadcast address to default values when the 19340 * interface is brought up. (Always called as writer.) 19341 */ 19342 static void 19343 ipif_set_default(ipif_t *ipif) 19344 { 19345 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19346 19347 if (!ipif->ipif_isv6) { 19348 /* 19349 * Interface holds an IPv4 address. Default 19350 * mask is the natural netmask. 19351 */ 19352 if (!ipif->ipif_net_mask) { 19353 ipaddr_t v4mask; 19354 19355 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 19356 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 19357 } 19358 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19359 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19360 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19361 } else { 19362 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19363 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19364 } 19365 /* 19366 * NOTE: SunOS 4.X does this even if the broadcast address 19367 * has already been set; thus we do the same here. 19368 */ 19369 if (ipif->ipif_flags & IPIF_BROADCAST) { 19370 ipaddr_t v4addr; 19371 19372 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 19373 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 19374 } 19375 } else { 19376 /* 19377 * Interface holds an IPv6-only address. Default 19378 * mask is all-ones. 19379 */ 19380 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 19381 ipif->ipif_v6net_mask = ipv6_all_ones; 19382 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19383 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19384 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19385 } else { 19386 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19387 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19388 } 19389 } 19390 } 19391 19392 /* 19393 * Return 0 if this address can be used as a local address without causing 19394 * duplicate address problems.
Otherwise, return EADDRNOTAVAIL if the address 19395 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 19396 * Special checks are needed to allow the same IPv6 link-local address 19397 * on different ills. 19398 * TODO: allowing the same site-local address on different ill's. 19399 */ 19400 int 19401 ip_addr_availability_check(ipif_t *new_ipif) 19402 { 19403 in6_addr_t our_v6addr; 19404 ill_t *ill; 19405 ipif_t *ipif; 19406 ill_walk_context_t ctx; 19407 19408 ASSERT(IAM_WRITER_IPIF(new_ipif)); 19409 ASSERT(MUTEX_HELD(&ip_addr_avail_lock)); 19410 ASSERT(RW_READ_HELD(&ill_g_lock)); 19411 19412 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 19413 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 19414 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 19415 return (0); 19416 19417 our_v6addr = new_ipif->ipif_v6lcl_addr; 19418 19419 if (new_ipif->ipif_isv6) 19420 ill = ILL_START_WALK_V6(&ctx); 19421 else 19422 ill = ILL_START_WALK_V4(&ctx); 19423 19424 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19425 for (ipif = ill->ill_ipif; ipif != NULL; 19426 ipif = ipif->ipif_next) { 19427 if ((ipif == new_ipif) || 19428 !(ipif->ipif_flags & IPIF_UP) || 19429 (ipif->ipif_flags & IPIF_UNNUMBERED)) 19430 continue; 19431 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 19432 &our_v6addr)) { 19433 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 19434 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 19435 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 19436 ipif->ipif_flags |= IPIF_UNNUMBERED; 19437 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && 19438 new_ipif->ipif_ill != ill) 19439 continue; 19440 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && 19441 new_ipif->ipif_ill != ill) 19442 continue; 19443 else if (new_ipif->ipif_zoneid != 19444 ipif->ipif_zoneid && 19445 ipif->ipif_zoneid != ALL_ZONES && 19446 (ill->ill_phyint->phyint_flags & 19447 PHYI_LOOPBACK)) 19448 continue; 19449 else if (new_ipif->ipif_ill == ill) 19450 return (EADDRINUSE); 19451 else 19452 return (EADDRNOTAVAIL); 19453 } 19454 } 19455 } 19456 19457 return (0); 19458 } 19459 19460 /* 19461 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 19462 * IREs for the ipif. 19463 * When the routine returns EINPROGRESS then mp has been consumed and 19464 * the ioctl will be acked from ip_rput_dlpi. 19465 */ 19466 static int 19467 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 19468 { 19469 ill_t *ill = ipif->ipif_ill; 19470 boolean_t isv6 = ipif->ipif_isv6; 19471 int err = 0; 19472 boolean_t success; 19473 19474 ASSERT(IAM_WRITER_IPIF(ipif)); 19475 19476 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 19477 19478 /* Shouldn't get here if it is already up. */ 19479 if (ipif->ipif_flags & IPIF_UP) 19480 return (EALREADY); 19481 19482 /* Skip arp/ndp for any loopback interface. */ 19483 if (ill->ill_wq != NULL) { 19484 conn_t *connp = Q_TO_CONN(q); 19485 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19486 19487 if (!ill->ill_dl_up) { 19488 /* 19489 * ill_dl_up is not yet set. i.e. we are yet to 19490 * DL_BIND with the driver and this is the first 19491 * logical interface on the ill to become "up". 19492 * Tell the driver to get going (via DL_BIND_REQ). 
19493 * Note that changing "significant" IFF_ flags 19494 * address/netmask etc cause a down/up dance, but 19495 * does not cause an unbind (DL_UNBIND) with the driver 19496 */ 19497 return (ill_dl_up(ill, ipif, mp, q)); 19498 } 19499 19500 /* 19501 * ipif_resolver_up may end up sending an 19502 * AR_INTERFACE_UP message to ARP, which would, in 19503 * turn send a DLPI message to the driver. ioctls are 19504 * serialized and so we cannot send more than one 19505 * interface up message at a time. If ipif_resolver_up 19506 * does send an interface up message to ARP, we get 19507 * EINPROGRESS and we will complete in ip_arp_done. 19508 */ 19509 19510 ASSERT(connp != NULL); 19511 ASSERT(ipsq->ipsq_pending_mp == NULL); 19512 mutex_enter(&connp->conn_lock); 19513 mutex_enter(&ill->ill_lock); 19514 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19515 mutex_exit(&ill->ill_lock); 19516 mutex_exit(&connp->conn_lock); 19517 if (!success) 19518 return (EINTR); 19519 19520 /* 19521 * Crank up IPv6 neighbor discovery 19522 * Unlike ARP, this should complete when 19523 * ipif_ndp_up returns. However, for 19524 * ILLF_XRESOLV interfaces we also send a 19525 * AR_INTERFACE_UP to the external resolver. 19526 * That ioctl will complete in ip_rput. 19527 */ 19528 if (isv6) { 19529 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr, 19530 B_FALSE); 19531 if (err != 0) { 19532 if (err != EINPROGRESS) 19533 mp = ipsq_pending_mp_get(ipsq, &connp); 19534 return (err); 19535 } 19536 } 19537 /* Now, ARP */ 19538 err = ipif_resolver_up(ipif, Res_act_initial); 19539 if (err == EINPROGRESS) { 19540 /* We will complete it in ip_arp_done */ 19541 return (err); 19542 } 19543 mp = ipsq_pending_mp_get(ipsq, &connp); 19544 ASSERT(mp != NULL); 19545 if (err != 0) 19546 return (err); 19547 } else { 19548 /* 19549 * Interfaces without underlying hardware don't do duplicate 19550 * address detection. 19551 */ 19552 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 19553 ipif->ipif_addr_ready = 1; 19554 } 19555 return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 19556 } 19557 19558 /* 19559 * Perform a bind for the physical device. 19560 * When the routine returns EINPROGRESS then mp has been consumed and 19561 * the ioctl will be acked from ip_rput_dlpi. 19562 * Allocate an unbind message and save it until ipif_down. 19563 */ 19564 static int 19565 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 19566 { 19567 mblk_t *areq_mp = NULL; 19568 mblk_t *bind_mp = NULL; 19569 mblk_t *unbind_mp = NULL; 19570 conn_t *connp; 19571 boolean_t success; 19572 19573 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 19574 ASSERT(IAM_WRITER_ILL(ill)); 19575 19576 ASSERT(mp != NULL); 19577 19578 /* Create a resolver cookie for ARP */ 19579 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 19580 areq_t *areq; 19581 uint16_t sap_addr; 19582 19583 areq_mp = ill_arp_alloc(ill, 19584 (uchar_t *)&ip_areq_template, 0); 19585 if (areq_mp == NULL) { 19586 return (ENOMEM); 19587 } 19588 freemsg(ill->ill_resolver_mp); 19589 ill->ill_resolver_mp = areq_mp; 19590 areq = (areq_t *)areq_mp->b_rptr; 19591 sap_addr = ill->ill_sap; 19592 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 19593 /* 19594 * Wait till we call ill_pending_mp_add to determine 19595 * the success before we free the ill_resolver_mp and 19596 * attach areq_mp in it's place. 
19597 */ 19598 } 19599 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 19600 DL_BIND_REQ); 19601 if (bind_mp == NULL) 19602 goto bad; 19603 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 19604 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 19605 19606 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 19607 if (unbind_mp == NULL) 19608 goto bad; 19609 19610 /* 19611 * Record state needed to complete this operation when the 19612 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 19613 */ 19614 if (WR(q)->q_next == NULL) { 19615 connp = Q_TO_CONN(q); 19616 mutex_enter(&connp->conn_lock); 19617 } else { 19618 connp = NULL; 19619 } 19620 mutex_enter(&ipif->ipif_ill->ill_lock); 19621 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19622 mutex_exit(&ipif->ipif_ill->ill_lock); 19623 if (connp != NULL) 19624 mutex_exit(&connp->conn_lock); 19625 if (!success) 19626 goto bad; 19627 19628 /* 19629 * Save the unbind message for ill_dl_down(); it will be consumed when 19630 * the interface goes down. 19631 */ 19632 ASSERT(ill->ill_unbind_mp == NULL); 19633 ill->ill_unbind_mp = unbind_mp; 19634 19635 ill_dlpi_send(ill, bind_mp); 19636 /* Send down link-layer capabilities probe if not already done. */ 19637 ill_capability_probe(ill); 19638 19639 /* 19640 * Sysid used to rely on the fact that netboots set domainname 19641 * and the like. Now that miniroot boots aren't strictly netboots 19642 * and miniroot network configuration is driven from userland 19643 * these things still need to be set. This situation can be detected 19644 * by comparing the interface being configured here to the one 19645 * dhcack was set to reference by the boot loader. Once sysid is 19646 * converted to use dhcp_ipc_getinfo() this call can go away. 19647 */ 19648 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) && 19649 (strcmp(ill->ill_name, dhcack) == 0) && 19650 (strlen(srpc_domain) == 0)) { 19651 if (dhcpinit() != 0) 19652 cmn_err(CE_WARN, "no cached dhcp response"); 19653 } 19654 19655 /* 19656 * This operation will complete in ip_rput_dlpi with either 19657 * a DL_BIND_ACK or DL_ERROR_ACK. 19658 */ 19659 return (EINPROGRESS); 19660 bad: 19661 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 19662 /* 19663 * We don't have to check for possible removal from the illgrp, 19664 * as we have not yet inserted into the illgrp. For groups 19665 * without names, this ipif is still not UP and hence 19666 * could not possibly have had any influence in forming 19667 * groups. 19668 */ 19669 19670 if (bind_mp != NULL) 19671 freemsg(bind_mp); 19672 if (unbind_mp != NULL) 19673 freemsg(unbind_mp); 19674 return (ENOMEM); 19675 } 19676 19677 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 19678 19679 /* 19680 * DLPI and ARP are up. 19681 * Create all the IREs associated with an interface and bring up multicast. 19682 * Set the interface flag and finish other initialization 19683 * that potentially had to be deferred until after the DL_BIND_ACK.
19684 */ 19685 int 19686 ipif_up_done(ipif_t *ipif) 19687 { 19688 ire_t *ire_array[20]; 19689 ire_t **irep = ire_array; 19690 ire_t **irep1; 19691 ipaddr_t net_mask = 0; 19692 ipaddr_t subnet_mask, route_mask; 19693 ill_t *ill = ipif->ipif_ill; 19694 queue_t *stq; 19695 ipif_t *src_ipif; 19696 ipif_t *tmp_ipif; 19697 boolean_t flush_ire_cache = B_TRUE; 19698 int err = 0; 19699 phyint_t *phyi; 19700 ire_t **ipif_saved_irep = NULL; 19701 int ipif_saved_ire_cnt; 19702 int cnt; 19703 boolean_t src_ipif_held = B_FALSE; 19704 boolean_t ire_added = B_FALSE; 19705 boolean_t loopback = B_FALSE; 19706 19707 ip1dbg(("ipif_up_done(%s:%u)\n", 19708 ipif->ipif_ill->ill_name, ipif->ipif_id)); 19709 /* Check if this is a loopback interface */ 19710 if (ipif->ipif_ill->ill_wq == NULL) 19711 loopback = B_TRUE; 19712 19713 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19714 /* 19715 * If all other interfaces for this ill are down or DEPRECATED, 19716 * or otherwise unsuitable for source address selection, remove 19717 * any IRE_CACHE entries for this ill to make sure source 19718 * address selection gets to take this new ipif into account. 19719 * No need to hold ill_lock while traversing the ipif list since 19720 * we are writer 19721 */ 19722 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 19723 tmp_ipif = tmp_ipif->ipif_next) { 19724 if (((tmp_ipif->ipif_flags & 19725 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 19726 !(tmp_ipif->ipif_flags & IPIF_UP)) || 19727 (tmp_ipif == ipif)) 19728 continue; 19729 /* first useable pre-existing interface */ 19730 flush_ire_cache = B_FALSE; 19731 break; 19732 } 19733 if (flush_ire_cache) 19734 ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, 19735 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 19736 19737 /* 19738 * Figure out which way the send-to queue should go. Only 19739 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 19740 * should show up here. 19741 */ 19742 switch (ill->ill_net_type) { 19743 case IRE_IF_RESOLVER: 19744 stq = ill->ill_rq; 19745 break; 19746 case IRE_IF_NORESOLVER: 19747 case IRE_LOOPBACK: 19748 stq = ill->ill_wq; 19749 break; 19750 default: 19751 return (EINVAL); 19752 } 19753 19754 if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) { 19755 /* 19756 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 19757 * ipif_lookup_on_name(), but in the case of zones we can have 19758 * several loopback addresses on lo0. So all the interfaces with 19759 * loopback addresses need to be marked IRE_LOOPBACK. 19760 */ 19761 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 19762 htonl(INADDR_LOOPBACK)) 19763 ipif->ipif_ire_type = IRE_LOOPBACK; 19764 else 19765 ipif->ipif_ire_type = IRE_LOCAL; 19766 } 19767 19768 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { 19769 /* 19770 * Can't use our source address. Select a different 19771 * source address for the IRE_INTERFACE and IRE_LOCAL 19772 */ 19773 src_ipif = ipif_select_source(ipif->ipif_ill, 19774 ipif->ipif_subnet, ipif->ipif_zoneid); 19775 if (src_ipif == NULL) 19776 src_ipif = ipif; /* Last resort */ 19777 else 19778 src_ipif_held = B_TRUE; 19779 } else { 19780 src_ipif = ipif; 19781 } 19782 19783 /* Create all the IREs associated with this interface */ 19784 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 19785 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 19786 19787 /* 19788 * If we're on a labeled system then make sure that zone- 19789 * private addresses have proper remote host database entries. 
19790 */ 19791 if (is_system_labeled() && 19792 ipif->ipif_ire_type != IRE_LOOPBACK && 19793 !tsol_check_interface_address(ipif)) 19794 return (EINVAL); 19795 19796 /* Register the source address for __sin6_src_id */ 19797 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 19798 ipif->ipif_zoneid); 19799 if (err != 0) { 19800 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 19801 return (err); 19802 } 19803 19804 /* If the interface address is set, create the local IRE. */ 19805 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 19806 (void *)ipif, 19807 ipif->ipif_ire_type, 19808 ntohl(ipif->ipif_lcl_addr))); 19809 *irep++ = ire_create( 19810 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 19811 (uchar_t *)&ip_g_all_ones, /* mask */ 19812 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 19813 NULL, /* no gateway */ 19814 NULL, 19815 &ip_loopback_mtuplus, /* max frag size */ 19816 NULL, 19817 ipif->ipif_rq, /* recv-from queue */ 19818 NULL, /* no send-to queue */ 19819 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 19820 NULL, 19821 ipif, 19822 NULL, 19823 0, 19824 0, 19825 0, 19826 (ipif->ipif_flags & IPIF_PRIVATE) ? 19827 RTF_PRIVATE : 0, 19828 &ire_uinfo_null, 19829 NULL, 19830 NULL); 19831 } else { 19832 ip1dbg(( 19833 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 19834 ipif->ipif_ire_type, 19835 ntohl(ipif->ipif_lcl_addr), 19836 (uint_t)ipif->ipif_flags)); 19837 } 19838 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 19839 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 19840 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 19841 } else { 19842 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 19843 } 19844 19845 subnet_mask = ipif->ipif_net_mask; 19846 19847 /* 19848 * If mask was not specified, use natural netmask of 19849 * interface address. Also, store this mask back into the 19850 * ipif struct. 19851 */ 19852 if (subnet_mask == 0) { 19853 subnet_mask = net_mask; 19854 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 19855 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 19856 ipif->ipif_v6subnet); 19857 } 19858 19859 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 19860 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 19861 ipif->ipif_subnet != INADDR_ANY) { 19862 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19863 19864 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19865 route_mask = IP_HOST_MASK; 19866 } else { 19867 route_mask = subnet_mask; 19868 } 19869 19870 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 19871 "creating if IRE ill_net_type 0x%x for 0x%x\n", 19872 (void *)ipif, (void *)ill, 19873 ill->ill_net_type, 19874 ntohl(ipif->ipif_subnet))); 19875 *irep++ = ire_create( 19876 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 19877 (uchar_t *)&route_mask, /* mask */ 19878 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 19879 NULL, /* no gateway */ 19880 NULL, 19881 &ipif->ipif_mtu, /* max frag */ 19882 NULL, 19883 NULL, /* no recv queue */ 19884 stq, /* send-to queue */ 19885 ill->ill_net_type, /* IF_[NO]RESOLVER */ 19886 ill->ill_resolver_mp, /* xmit header */ 19887 ipif, 19888 NULL, 19889 0, 19890 0, 19891 0, 19892 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 19893 &ire_uinfo_null, 19894 NULL, 19895 NULL); 19896 } 19897 19898 /* 19899 * If the interface address is set, create the broadcast IREs. 19900 * 19901 * ire_create_bcast checks if the proposed new IRE matches 19902 * any existing IRE's with the same physical interface (ILL). 19903 * This should get rid of duplicates. 
19904 * ire_create_bcast also checks IPIF_NOXMIT and in that case does not 19905 * create any broadcast ires. 19906 */ 19907 if ((ipif->ipif_subnet != INADDR_ANY) && 19908 (ipif->ipif_flags & IPIF_BROADCAST)) { 19909 ipaddr_t addr; 19910 19911 ip1dbg(("ipif_up_done: creating broadcast IRE\n")); 19912 irep = ire_check_and_create_bcast(ipif, 0, irep, 19913 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19914 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, 19915 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19916 19917 /* 19918 * For backward compatibility, we need to create net 19919 * broadcast ire's based on the old "IP address class 19920 * system." The reason is that some old machines only 19921 * respond to these class-derived net broadcasts. 19922 * 19923 * But we should not create these net broadcast ire's if 19924 * the subnet_mask is shorter than the IP address class based 19925 * derived netmask. Otherwise, we may create a net 19926 * broadcast address which is the same as an IP address 19927 * on the subnet. Then TCP will refuse to talk to that 19928 * address. 19929 * 19930 * Nor do we need IRE_BROADCAST ire's for the interface 19931 * with the netmask as 0xFFFFFFFF, as the IRE_LOCAL for that 19932 * interface is already created. Creating these broadcast 19933 * ire's would only create confusion as the "addr" would be 19934 * the same as the IP address of the interface. 19935 */ 19936 if (net_mask < subnet_mask) { 19937 addr = net_mask & ipif->ipif_subnet; 19938 irep = ire_check_and_create_bcast(ipif, addr, irep, 19939 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19940 irep = ire_check_and_create_bcast(ipif, 19941 ~net_mask | addr, irep, 19942 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19943 } 19944 19945 if (subnet_mask != 0xFFFFFFFF) { 19946 addr = ipif->ipif_subnet; 19947 irep = ire_check_and_create_bcast(ipif, addr, irep, 19948 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19949 irep = ire_check_and_create_bcast(ipif, 19950 ~subnet_mask|addr, irep, 19951 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19952 } 19953 } 19954 19955 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19956 19957 /* If an earlier ire_create failed, get out now */ 19958 for (irep1 = irep; irep1 > ire_array; ) { 19959 irep1--; 19960 if (*irep1 == NULL) { 19961 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 19962 err = ENOMEM; 19963 goto bad; 19964 } 19965 } 19966 19967 /* 19968 * We need to do the ip_addr_availability_check atomically 19969 * under ip_addr_avail_lock; if it fails, go to bad and remove 19970 * the ipif from the group as well. The ill_g_lock is grabbed as reader 19971 * just to make sure no new ills or new ipifs are being added 19972 * to the system while we are checking the uniqueness of addresses. 19973 */ 19974 rw_enter(&ill_g_lock, RW_READER); 19975 mutex_enter(&ip_addr_avail_lock); 19976 /* Mark it up, and increment counters. */ 19977 ipif->ipif_flags |= IPIF_UP; 19978 ill->ill_ipif_up_count++; 19979 err = ip_addr_availability_check(ipif); 19980 mutex_exit(&ip_addr_avail_lock); 19981 rw_exit(&ill_g_lock); 19982 19983 if (err != 0) { 19984 /* 19985 * Our address may already be up on the same ill. In this case, 19986 * the ARP entry for our ipif replaced the one for the other 19987 * ipif. So we don't want to delete it (otherwise the other ipif 19988 * would be unable to send packets). 19989 * ip_addr_availability_check() identifies this case for us and 19990 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 19991 * which is the expected error code.
19992 */ 19993 if (err == EADDRINUSE) { 19994 freemsg(ipif->ipif_arp_del_mp); 19995 ipif->ipif_arp_del_mp = NULL; 19996 err = EADDRNOTAVAIL; 19997 } 19998 ill->ill_ipif_up_count--; 19999 ipif->ipif_flags &= ~IPIF_UP; 20000 goto bad; 20001 } 20002 20003 /* 20004 * Add in all newly created IREs. ire_create_bcast() has 20005 * already checked for duplicates of the IRE_BROADCAST type. 20006 * We want to add them before we call illgrp_insert, which wants 20007 * to know whether an IRE_IF_RESOLVER exists or not. 20008 * 20009 * NOTE: We refrele the ire though we may branch to "bad" 20010 * later on where we do ire_delete. This is okay 20011 * because nobody can delete it as we are running 20012 * exclusively. 20013 */ 20014 for (irep1 = irep; irep1 > ire_array; ) { 20015 irep1--; 20016 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 20017 /* 20018 * refheld by ire_add; refrele'd towards the end of this function 20019 */ 20020 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 20021 } 20022 ire_added = B_TRUE; 20023 /* 20024 * Form groups if possible. 20025 * 20026 * If we are supposed to be in an ill_group with a name, insert it 20027 * now as we know that at least one ipif is UP. Otherwise form 20028 * nameless groups. 20029 * 20030 * If ip_enable_group_ifs is set and the ipif address is not 0, insert 20031 * this ipif into the appropriate interface group, or create a 20032 * new one. If this is already in a nameless group, we try to form 20033 * a bigger group looking at other ills potentially sharing this 20034 * ipif's prefix. 20035 */ 20036 phyi = ill->ill_phyint; 20037 if (phyi->phyint_groupname_len != 0) { 20038 ASSERT(phyi->phyint_groupname != NULL); 20039 if (ill->ill_ipif_up_count == 1) { 20040 ASSERT(ill->ill_group == NULL); 20041 err = illgrp_insert(&illgrp_head_v4, ill, 20042 phyi->phyint_groupname, NULL, B_TRUE); 20043 if (err != 0) { 20044 ip1dbg(("ipif_up_done: illgrp allocation " 20045 "failed, error %d\n", err)); 20046 goto bad; 20047 } 20048 } 20049 ASSERT(ill->ill_group != NULL); 20050 } 20051 20052 /* 20053 * When this ill is part of a group, we need to make sure that 20054 * any broadcast ires created because of this ipif coming 20055 * UP get marked/cleared with IRE_MARK_NORECV appropriately 20056 * so that we don't receive duplicate broadcast packets. 20057 */ 20058 if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) 20059 ipif_renominate_bcast(ipif); 20060 20061 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 20062 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 20063 ipif_saved_irep = ipif_recover_ire(ipif); 20064 20065 if (!loopback) { 20066 /* 20067 * If the broadcast address has been set, make sure it makes 20068 * sense based on the interface address. 20069 * Only match on ill since we are sharing broadcast addresses. 20070 */ 20071 if ((ipif->ipif_brd_addr != INADDR_ANY) && 20072 (ipif->ipif_flags & IPIF_BROADCAST)) { 20073 ire_t *ire; 20074 20075 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 20076 IRE_BROADCAST, ipif, ALL_ZONES, 20077 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20078 20079 if (ire == NULL) { 20080 /* 20081 * If there isn't a matching broadcast IRE, 20082 * revert to the default for this netmask.
20083 */ 20084 ipif->ipif_v6brd_addr = ipv6_all_zeros; 20085 mutex_enter(&ipif->ipif_ill->ill_lock); 20086 ipif_set_default(ipif); 20087 mutex_exit(&ipif->ipif_ill->ill_lock); 20088 } else { 20089 ire_refrele(ire); 20090 } 20091 } 20092 20093 } 20094 20095 /* This is the first interface on this ill */ 20096 if (ill->ill_ipif_up_count == 1 && !loopback) { 20097 /* 20098 * Need to recover all multicast memberships in the driver. 20099 * This had to be deferred until we had attached. 20100 */ 20101 ill_recover_multicast(ill); 20102 } 20103 /* Join the allhosts multicast address */ 20104 ipif_multicast_up(ipif); 20105 20106 if (!loopback) { 20107 /* 20108 * See whether anybody else would benefit from the 20109 * new ipif that we added. We always call this, rather 20110 * than only when adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST 20111 * ipif, for the benefit of illgrp_insert (done above), 20112 * which does not do source address selection itself as it 20113 * does not want to re-create the interface routes that 20114 * we hold references to here. 20115 */ 20116 ill_update_source_selection(ill); 20117 } 20118 20119 for (irep1 = irep; irep1 > ire_array; ) { 20120 irep1--; 20121 if (*irep1 != NULL) { 20122 /* was held in ire_add */ 20123 ire_refrele(*irep1); 20124 } 20125 } 20126 20127 cnt = ipif_saved_ire_cnt; 20128 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 20129 if (*irep1 != NULL) { 20130 /* was held in ire_add */ 20131 ire_refrele(*irep1); 20132 } 20133 } 20134 20135 if (!loopback && ipif->ipif_addr_ready) { 20136 /* Broadcast an address mask reply. */ 20137 ipif_mask_reply(ipif); 20138 } 20139 if (ipif_saved_irep != NULL) { 20140 kmem_free(ipif_saved_irep, 20141 ipif_saved_ire_cnt * sizeof (ire_t *)); 20142 } 20143 if (src_ipif_held) 20144 ipif_refrele(src_ipif); 20145 20146 /* 20147 * This had to be deferred until we had bound. Tell routing sockets and 20148 * others that this interface is up if it looks like the address has 20149 * been validated. Otherwise, if it isn't ready yet, wait for 20150 * duplicate address detection to do its thing. 20151 */ 20152 if (ipif->ipif_addr_ready) { 20153 ip_rts_ifmsg(ipif); 20154 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 20155 /* Let SCTP update the status for this ipif */ 20156 sctp_update_ipif(ipif, SCTP_IPIF_UP); 20157 } 20158 return (0); 20159 20160 bad: 20161 ip1dbg(("ipif_up_done: FAILED \n")); 20162 /* 20163 * We don't have to bother removing from ill groups because 20164 * 20165 * 1) For groups with names, we insert only when the first ipif 20166 * comes up. In that case if it fails, it will not be in any 20167 * group. So, we need not try to remove for that case. 20168 * 20169 * 2) For groups without names, either we tried to insert ipif_ill 20170 * in a group as a singleton or found some other group to become 20171 * a bigger group. For the former, if it fails we don't have 20172 * anything to do as ipif_ill is not in the group and for the 20173 * latter, there are no failures in illgrp_insert/illgrp_delete 20174 * (ENOMEM can't occur for this; see illgrp_insert).
20175 */ 20176 while (irep > ire_array) { 20177 irep--; 20178 if (*irep != NULL) { 20179 ire_delete(*irep); 20180 if (ire_added) 20181 ire_refrele(*irep); 20182 } 20183 } 20184 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid); 20185 20186 if (ipif_saved_irep != NULL) { 20187 kmem_free(ipif_saved_irep, 20188 ipif_saved_ire_cnt * sizeof (ire_t *)); 20189 } 20190 if (src_ipif_held) 20191 ipif_refrele(src_ipif); 20192 20193 ipif_arp_down(ipif); 20194 return (err); 20195 } 20196 20197 /* 20198 * Turn off the ARP with the ILLF_NOARP flag. 20199 */ 20200 static int 20201 ill_arp_off(ill_t *ill) 20202 { 20203 mblk_t *arp_off_mp = NULL; 20204 mblk_t *arp_on_mp = NULL; 20205 20206 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 20207 20208 ASSERT(IAM_WRITER_ILL(ill)); 20209 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20210 20211 /* 20212 * If the on message is still around we've already done 20213 * an arp_off without doing an arp_on thus there is no 20214 * work needed. 20215 */ 20216 if (ill->ill_arp_on_mp != NULL) 20217 return (0); 20218 20219 /* 20220 * Allocate an ARP on message (to be saved) and an ARP off message 20221 */ 20222 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 20223 if (!arp_off_mp) 20224 return (ENOMEM); 20225 20226 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 20227 if (!arp_on_mp) 20228 goto failed; 20229 20230 ASSERT(ill->ill_arp_on_mp == NULL); 20231 ill->ill_arp_on_mp = arp_on_mp; 20232 20233 /* Send an AR_INTERFACE_OFF request */ 20234 putnext(ill->ill_rq, arp_off_mp); 20235 return (0); 20236 failed: 20237 20238 if (arp_off_mp) 20239 freemsg(arp_off_mp); 20240 return (ENOMEM); 20241 } 20242 20243 /* 20244 * Turn on ARP by turning off the ILLF_NOARP flag. 20245 */ 20246 static int 20247 ill_arp_on(ill_t *ill) 20248 { 20249 mblk_t *mp; 20250 20251 ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); 20252 20253 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20254 20255 ASSERT(IAM_WRITER_ILL(ill)); 20256 /* 20257 * Send an AR_INTERFACE_ON request if we have already done 20258 * an arp_off (which allocated the message). 20259 */ 20260 if (ill->ill_arp_on_mp != NULL) { 20261 mp = ill->ill_arp_on_mp; 20262 ill->ill_arp_on_mp = NULL; 20263 putnext(ill->ill_rq, mp); 20264 } 20265 return (0); 20266 } 20267 20268 /* 20269 * Called after either deleting ill from the group or when setting 20270 * FAILED or STANDBY on the interface. 20271 */ 20272 static void 20273 illgrp_reset_schednext(ill_t *ill) 20274 { 20275 ill_group_t *illgrp; 20276 ill_t *save_ill; 20277 20278 ASSERT(IAM_WRITER_ILL(ill)); 20279 /* 20280 * When called from illgrp_delete, ill_group will be non-NULL. 20281 * But when called from ip_sioctl_flags, it could be NULL if 20282 * somebody is setting FAILED/INACTIVE on some interface which 20283 * is not part of a group. 20284 */ 20285 illgrp = ill->ill_group; 20286 if (illgrp == NULL) 20287 return; 20288 if (illgrp->illgrp_ill_schednext != ill) 20289 return; 20290 20291 illgrp->illgrp_ill_schednext = NULL; 20292 save_ill = ill; 20293 /* 20294 * Choose a good ill to be the next one for 20295 * outbound traffic. As the flags FAILED/STANDBY is 20296 * not yet marked when called from ip_sioctl_flags, 20297 * we check for ill separately. 
20298 */ 20299 for (ill = illgrp->illgrp_ill; ill != NULL; 20300 ill = ill->ill_group_next) { 20301 if ((ill != save_ill) && 20302 !(ill->ill_phyint->phyint_flags & 20303 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { 20304 illgrp->illgrp_ill_schednext = ill; 20305 return; 20306 } 20307 } 20308 } 20309 20310 /* 20311 * Given an ill, find the next ill in the group to be scheduled. 20312 * (This should be called by ip_newroute() before ire_create().) 20313 * The passed-in ill may be pulled out of the group after we have picked 20314 * up a different outgoing ill from the same group. However, ire_add will 20315 * atomically check this. 20316 */ 20317 ill_t * 20318 illgrp_scheduler(ill_t *ill) 20319 { 20320 ill_t *retill; 20321 ill_group_t *illgrp; 20322 int illcnt; 20323 int i; 20324 uint64_t flags; 20325 20326 /* 20327 * We don't use a lock to check for the ill_group. If this ill 20328 * is currently being inserted we may end up just returning this 20329 * ill itself. That is ok. 20330 */ 20331 if (ill->ill_group == NULL) { 20332 ill_refhold(ill); 20333 return (ill); 20334 } 20335 20336 /* 20337 * Grab the ill_g_lock as reader to make sure we are dealing with 20338 * a set of stable ills. No ill can be added or deleted or change 20339 * group while we hold the reader lock. 20340 */ 20341 rw_enter(&ill_g_lock, RW_READER); 20342 if ((illgrp = ill->ill_group) == NULL) { 20343 rw_exit(&ill_g_lock); 20344 ill_refhold(ill); 20345 return (ill); 20346 } 20347 20348 illcnt = illgrp->illgrp_ill_count; 20349 mutex_enter(&illgrp->illgrp_lock); 20350 retill = illgrp->illgrp_ill_schednext; 20351 20352 if (retill == NULL) 20353 retill = illgrp->illgrp_ill; 20354 20355 /* 20356 * We do a circular search beginning at illgrp_ill_schednext 20357 * or illgrp_ill. We don't check the flags under the ill_lock 20358 * since they can change at any time. The ire creation will be atomic 20359 * and will fail if the ill is FAILED or OFFLINE. 20360 */ 20361 for (i = 0; i < illcnt; i++) { 20362 flags = retill->ill_phyint->phyint_flags; 20363 20364 if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 20365 ILL_CAN_LOOKUP(retill)) { 20366 illgrp->illgrp_ill_schednext = retill->ill_group_next; 20367 ill_refhold(retill); 20368 break; 20369 } 20370 retill = retill->ill_group_next; 20371 if (retill == NULL) 20372 retill = illgrp->illgrp_ill; 20373 } 20374 mutex_exit(&illgrp->illgrp_lock); 20375 rw_exit(&ill_g_lock); 20376 20377 return (i == illcnt ? NULL : retill); 20378 } 20379 20380 /* 20381 * Checks for the availability of a usable source address (if there is one) 20382 * when the destination ILL has ill_usesrc_ifindex pointing to another ILL. 20383 * Note that this selection is done regardless of the destination.
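 * * A minimal usage sketch (editorial; ipif_usesrc_avail is defined just below with this signature): * * if (!ipif_usesrc_avail(ill, zoneid)) * ... no usable source address on the usesrc ILL ...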
20384  */
20385 boolean_t
20386 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
20387 {
20388 	uint_t	ifindex;
20389 	ipif_t	*ipif = NULL;
20390 	ill_t	*uill;
20391 	boolean_t isv6;
20392 
20393 	ASSERT(ill != NULL);
20394 
20395 	isv6 = ill->ill_isv6;
20396 	ifindex = ill->ill_usesrc_ifindex;
20397 	if (ifindex != 0) {
20398 		uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL,
20399 		    NULL);
20400 		if (uill == NULL)
20401 			return (B_FALSE);
20402 		mutex_enter(&uill->ill_lock);
20403 		for (ipif = uill->ill_ipif; ipif != NULL;
20404 		    ipif = ipif->ipif_next) {
20405 			if (!IPIF_CAN_LOOKUP(ipif))
20406 				continue;
20407 			if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
20408 				continue;
20409 			if (!(ipif->ipif_flags & IPIF_UP))
20410 				continue;
20411 			if (ipif->ipif_zoneid != zoneid)
20412 				continue;
20413 			if ((isv6 &&
20414 			    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) ||
20415 			    (ipif->ipif_lcl_addr == INADDR_ANY))
20416 				continue;
20417 			mutex_exit(&uill->ill_lock);
20418 			ill_refrele(uill);
20419 			return (B_TRUE);
20420 		}
20421 		mutex_exit(&uill->ill_lock);
20422 		ill_refrele(uill);
20423 	}
20424 	return (B_FALSE);
20425 }
20426 
20427 /*
20428  * Determine the best source address given a destination address and an ill.
20429  * Prefers non-deprecated over deprecated but will return a deprecated
20430  * address if there is no other choice. If there is a usable source address
20431  * on the interface pointed to by ill_usesrc_ifindex then that is given
20432  * first preference.
20433  *
20434  * Returns NULL if there is no suitable source address for the ill,
20435  * which happens only when the ill has no valid source address at all.
20436  */
20437 ipif_t *
20438 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
20439 {
20440 	ipif_t *ipif;
20441 	ipif_t *ipif_dep = NULL;	/* Fallback to deprecated */
20442 	ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE];
20443 	int index = 0;
20444 	boolean_t wrapped = B_FALSE;
20445 	boolean_t same_subnet_only = B_FALSE;
20446 	boolean_t ipif_same_found, ipif_other_found;
20447 	boolean_t specific_found;
20448 	ill_t	*till, *usill = NULL;
20449 	tsol_tpc_t *src_rhtp, *dst_rhtp;
20450 
20451 	if (ill->ill_usesrc_ifindex != 0) {
20452 		usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, B_FALSE,
20453 		    NULL, NULL, NULL, NULL);
20454 		if (usill != NULL)
20455 			ill = usill;	/* Select source from usesrc ILL */
20456 		else
20457 			return (NULL);
20458 	}
20459 
20460 	/*
20461 	 * If we're dealing with an unlabeled destination on a labeled system,
20462 	 * make sure that we ignore source addresses that are incompatible with
20463 	 * the destination's default label. That destination's default label
20464 	 * must dominate the minimum label on the source address.
20465 	 */
20466 	dst_rhtp = NULL;
20467 	if (is_system_labeled()) {
20468 		dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
20469 		if (dst_rhtp == NULL)
20470 			return (NULL);
20471 		if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
20472 			TPC_RELE(dst_rhtp);
20473 			dst_rhtp = NULL;
20474 		}
20475 	}
20476 
20477 	/*
20478 	 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
20479 	 * can be deleted. But an ipif/ill can get CONDEMNED any time.
20480 	 * After selecting the right ipif, under ill_lock make sure ipif is
20481 	 * not condemned, and increment refcnt. If ipif is CONDEMNED,
20482 	 * we retry. Inside the loop we still need to check for CONDEMNED,
20483 	 * but not under a lock.
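 *
 * The hold protocol implemented below is thus roughly:
 *
 *	rw_enter(&ill_g_lock, RW_READER);
 * retry:
 *	<select a candidate ipif>
 *	mutex_enter(&ipif->ipif_ill->ill_lock);
 *	if (!IPIF_CAN_LOOKUP(ipif))	<- CONDEMNED; start over
 *		goto retry;
 *	ipif_refhold_locked(ipif);
 *	mutex_exit(&ipif->ipif_ill->ill_lock);
 *	rw_exit(&ill_g_lock);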
20484 	 */
20485 	rw_enter(&ill_g_lock, RW_READER);
20486 
20487 retry:
20488 	till = ill;
20489 	ipif_arr[0] = NULL;
20490 
20491 	if (till->ill_group != NULL)
20492 		till = till->ill_group->illgrp_ill;
20493 
20494 	/*
20495 	 * Choose one good source address from each ill across the group.
20496 	 * If possible choose a source address in the same subnet as
20497 	 * the destination address.
20498 	 *
20499 	 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE.
20500 	 * This is okay because of the following.
20501 	 *
20502 	 * If PHYI_FAILED is set and we still have non-deprecated
20503 	 * addresses, it means the addresses have not yet been
20504 	 * failed over to a different interface. We potentially
20505 	 * select them to create IRE_CACHEs, which will be later
20506 	 * flushed when the addresses move over.
20507 	 *
20508 	 * If PHYI_INACTIVE is set and we still have non-deprecated
20509 	 * addresses, it means either the user has configured them
20510 	 * or PHYI_INACTIVE has not been cleared after the addresses
20511 	 * have been moved over. For the former, in.mpathd does a failover
20512 	 * when the interface becomes INACTIVE and hence we should
20513 	 * not find them. Once INACTIVE is set, we don't allow them
20514 	 * to create logical interfaces anymore. For the latter, a
20515 	 * flush will happen when INACTIVE is cleared which will
20516 	 * flush the IRE_CACHEs.
20517 	 *
20518 	 * If PHYI_OFFLINE is set, all the addresses will be failed
20519 	 * over soon. We potentially select them to create IRE_CACHEs,
20520 	 * which will be later flushed when the addresses move over.
20521 	 *
20522 	 * NOTE : As ipif_select_source is called to borrow source address
20523 	 * for an ipif that is part of a group, source address selection
20524 	 * will be re-done whenever the group changes, i.e. on either an
20525 	 * insertion into or a deletion from the group.
20526 	 *
20527 	 * Fill ipif_arr[] with source addresses, using these rules:
20528 	 *
20529 	 *	1. At most one source address from a given ill ends up
20530 	 *	   in ipif_arr[] -- that is, at most one of the ipif's
20531 	 *	   associated with a given ill ends up in ipif_arr[].
20532 	 *
20533 	 *	2. If there is at least one non-deprecated ipif in the
20534 	 *	   IPMP group with a source address on the same subnet as
20535 	 *	   our destination, then fill ipif_arr[] only with
20536 	 *	   source addresses on the same subnet as our destination.
20537 	 *	   Note that because of (1), only the first
20538 	 *	   non-deprecated ipif found with a source address
20539 	 *	   matching the destination ends up in ipif_arr[].
20540 	 *
20541 	 *	3. Otherwise, fill ipif_arr[] with non-deprecated source
20542 	 *	   addresses not in the same subnet as our destination.
20543 	 *	   Again, because of (1), only the first off-subnet source
20544 	 *	   address will be chosen.
20545 	 *
20546 	 *	4. If there are no non-deprecated ipifs, then just use
20547 	 *	   the source address associated with the last deprecated
20548 	 *	   one we find that happens to be on the same subnet,
20549 	 *	   otherwise the first one not in the same subnet.
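	 *
	 * A small worked example of these rules (addresses are
	 * hypothetical): with dst = 10.1.1.5 and a two-ill group where
	 * hme0 has the non-deprecated address 10.1.1.2/24 and hme1 has
	 * the non-deprecated address 10.2.1.2/24, rule 2 applies and
	 * 10.1.1.2 is the only candidate. If 10.1.1.2 were deprecated,
	 * rule 3 would make 10.2.1.2 the candidate and rule 4 would
	 * keep 10.1.1.2 only as the ipif_dep fallback, used when no
	 * non-deprecated choice exists.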
20550 */ 20551 specific_found = B_FALSE; 20552 for (; till != NULL; till = till->ill_group_next) { 20553 ipif_same_found = B_FALSE; 20554 ipif_other_found = B_FALSE; 20555 for (ipif = till->ill_ipif; ipif != NULL; 20556 ipif = ipif->ipif_next) { 20557 if (!IPIF_CAN_LOOKUP(ipif)) 20558 continue; 20559 /* Always skip NOLOCAL and ANYCAST interfaces */ 20560 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20561 continue; 20562 if (!(ipif->ipif_flags & IPIF_UP) || 20563 !ipif->ipif_addr_ready) 20564 continue; 20565 if (ipif->ipif_zoneid != zoneid && 20566 ipif->ipif_zoneid != ALL_ZONES) 20567 continue; 20568 /* 20569 * Interfaces with 0.0.0.0 address are allowed to be UP, 20570 * but are not valid as source addresses. 20571 */ 20572 if (ipif->ipif_lcl_addr == INADDR_ANY) 20573 continue; 20574 20575 /* 20576 * Check compatibility of local address for 20577 * destination's default label if we're on a labeled 20578 * system. Incompatible addresses can't be used at 20579 * all. 20580 */ 20581 if (dst_rhtp != NULL) { 20582 boolean_t incompat; 20583 20584 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 20585 IPV4_VERSION, B_FALSE); 20586 if (src_rhtp == NULL) 20587 continue; 20588 incompat = 20589 src_rhtp->tpc_tp.host_type != SUN_CIPSO || 20590 src_rhtp->tpc_tp.tp_doi != 20591 dst_rhtp->tpc_tp.tp_doi || 20592 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 20593 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 20594 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 20595 src_rhtp->tpc_tp.tp_sl_set_cipso)); 20596 TPC_RELE(src_rhtp); 20597 if (incompat) 20598 continue; 20599 } 20600 20601 /* 20602 * We prefer not to use all all-zones addresses, if we 20603 * can avoid it, as they pose problems with unlabeled 20604 * destinations. 20605 */ 20606 if (ipif->ipif_zoneid != ALL_ZONES) { 20607 if (!specific_found && 20608 (!same_subnet_only || 20609 (ipif->ipif_net_mask & dst) == 20610 ipif->ipif_subnet)) { 20611 index = 0; 20612 specific_found = B_TRUE; 20613 ipif_other_found = B_FALSE; 20614 } 20615 } else { 20616 if (specific_found) 20617 continue; 20618 } 20619 if (ipif->ipif_flags & IPIF_DEPRECATED) { 20620 if (ipif_dep == NULL || 20621 (ipif->ipif_net_mask & dst) == 20622 ipif->ipif_subnet) 20623 ipif_dep = ipif; 20624 continue; 20625 } 20626 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 20627 /* found a source address in the same subnet */ 20628 if (!same_subnet_only) { 20629 same_subnet_only = B_TRUE; 20630 index = 0; 20631 } 20632 ipif_same_found = B_TRUE; 20633 } else { 20634 if (same_subnet_only || ipif_other_found) 20635 continue; 20636 ipif_other_found = B_TRUE; 20637 } 20638 ipif_arr[index++] = ipif; 20639 if (index == MAX_IPIF_SELECT_SOURCE) { 20640 wrapped = B_TRUE; 20641 index = 0; 20642 } 20643 if (ipif_same_found) 20644 break; 20645 } 20646 } 20647 20648 if (ipif_arr[0] == NULL) { 20649 ipif = ipif_dep; 20650 } else { 20651 if (wrapped) 20652 index = MAX_IPIF_SELECT_SOURCE; 20653 ipif = ipif_arr[ipif_rand() % index]; 20654 ASSERT(ipif != NULL); 20655 } 20656 20657 if (ipif != NULL) { 20658 mutex_enter(&ipif->ipif_ill->ill_lock); 20659 if (!IPIF_CAN_LOOKUP(ipif)) { 20660 mutex_exit(&ipif->ipif_ill->ill_lock); 20661 goto retry; 20662 } 20663 ipif_refhold_locked(ipif); 20664 mutex_exit(&ipif->ipif_ill->ill_lock); 20665 } 20666 20667 rw_exit(&ill_g_lock); 20668 if (usill != NULL) 20669 ill_refrele(usill); 20670 if (dst_rhtp != NULL) 20671 TPC_RELE(dst_rhtp); 20672 20673 #ifdef DEBUG 20674 if (ipif == NULL) { 20675 char buf1[INET6_ADDRSTRLEN]; 20676 20677 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 
20678 ill->ill_name, 20679 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 20680 } else { 20681 char buf1[INET6_ADDRSTRLEN]; 20682 char buf2[INET6_ADDRSTRLEN]; 20683 20684 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 20685 ipif->ipif_ill->ill_name, 20686 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 20687 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 20688 buf2, sizeof (buf2)))); 20689 } 20690 #endif /* DEBUG */ 20691 return (ipif); 20692 } 20693 20694 20695 /* 20696 * If old_ipif is not NULL, see if ipif was derived from old 20697 * ipif and if so, recreate the interface route by re-doing 20698 * source address selection. This happens when ipif_down -> 20699 * ipif_update_other_ipifs calls us. 20700 * 20701 * If old_ipif is NULL, just redo the source address selection 20702 * if needed. This happens when illgrp_insert or ipif_up_done 20703 * calls us. 20704 */ 20705 static void 20706 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 20707 { 20708 ire_t *ire; 20709 ire_t *ipif_ire; 20710 queue_t *stq; 20711 ipif_t *nipif; 20712 ill_t *ill; 20713 boolean_t need_rele = B_FALSE; 20714 20715 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 20716 ASSERT(IAM_WRITER_IPIF(ipif)); 20717 20718 ill = ipif->ipif_ill; 20719 if (!(ipif->ipif_flags & 20720 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 20721 /* 20722 * Can't possibly have borrowed the source 20723 * from old_ipif. 20724 */ 20725 return; 20726 } 20727 20728 /* 20729 * Is there any work to be done? No work if the address 20730 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 20731 * ipif_select_source() does not borrow addresses from 20732 * NOLOCAL and ANYCAST interfaces). 20733 */ 20734 if ((old_ipif != NULL) && 20735 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 20736 (old_ipif->ipif_ill->ill_wq == NULL) || 20737 (old_ipif->ipif_flags & 20738 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 20739 return; 20740 } 20741 20742 /* 20743 * Perform the same checks as when creating the 20744 * IRE_INTERFACE in ipif_up_done. 20745 */ 20746 if (!(ipif->ipif_flags & IPIF_UP)) 20747 return; 20748 20749 if ((ipif->ipif_flags & IPIF_NOXMIT) || 20750 (ipif->ipif_subnet == INADDR_ANY)) 20751 return; 20752 20753 ipif_ire = ipif_to_ire(ipif); 20754 if (ipif_ire == NULL) 20755 return; 20756 20757 /* 20758 * We know that ipif uses some other source for its 20759 * IRE_INTERFACE. Is it using the source of this 20760 * old_ipif? 20761 */ 20762 if (old_ipif != NULL && 20763 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 20764 ire_refrele(ipif_ire); 20765 return; 20766 } 20767 if (ip_debug > 2) { 20768 /* ip1dbg */ 20769 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 20770 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 20771 } 20772 20773 stq = ipif_ire->ire_stq; 20774 20775 /* 20776 * Can't use our source address. Select a different 20777 * source address for the IRE_INTERFACE. 
20778 	 */
20779 	nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid);
20780 	if (nipif == NULL) {
20781 		/* Last resort - all ipif's have IPIF_NOLOCAL */
20782 		nipif = ipif;
20783 	} else {
20784 		need_rele = B_TRUE;
20785 	}
20786 
20787 	ire = ire_create(
20788 	    (uchar_t *)&ipif->ipif_subnet,	/* dest pref */
20789 	    (uchar_t *)&ipif->ipif_net_mask,	/* mask */
20790 	    (uchar_t *)&nipif->ipif_src_addr,	/* src addr */
20791 	    NULL,				/* no gateway */
20792 	    NULL,
20793 	    &ipif->ipif_mtu,			/* max frag */
20794 	    NULL,				/* fast path header */
20795 	    NULL,				/* no recv from queue */
20796 	    stq,				/* send-to queue */
20797 	    ill->ill_net_type,			/* IF_[NO]RESOLVER */
20798 	    ill->ill_resolver_mp,		/* xmit header */
20799 	    ipif,
20800 	    NULL,
20801 	    0,
20802 	    0,
20803 	    0,
20804 	    0,
20805 	    &ire_uinfo_null,
20806 	    NULL,
20807 	    NULL);
20808 
20809 	if (ire != NULL) {
20810 		ire_t *ret_ire;
20811 		int error;
20812 
20813 		/*
20814 		 * We don't need ipif_ire anymore. We need to delete
20815 		 * before we add so that ire_add does not detect
20816 		 * duplicates.
20817 		 */
20818 		ire_delete(ipif_ire);
20819 		ret_ire = ire;
20820 		error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
20821 		ASSERT(error == 0);
20822 		ASSERT(ire == ret_ire);
20823 		/* Held in ire_add */
20824 		ire_refrele(ret_ire);
20825 	}
20826 	/*
20827 	 * Either we are falling through from above or could not
20828 	 * allocate a replacement.
20829 	 */
20830 	ire_refrele(ipif_ire);
20831 	if (need_rele)
20832 		ipif_refrele(nipif);
20833 }
20834 
20835 /*
20836  * This old_ipif is going away.
20837  *
20838  * Determine if any other ipifs are using our address as
20839  * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
20840  * IPIF_DEPRECATED).
20841  * Find the IRE_INTERFACE for such ipifs and recreate them
20842  * to use a different source address following the rules in
20843  * ipif_up_done.
20844  *
20845  * This function takes an illgrp as an argument so that illgrp_delete
20846  * can call this to update source address even after deleting the
20847  * old_ipif->ipif_ill from the ill group.
20848  */
20849 static void
20850 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp)
20851 {
20852 	ipif_t *ipif;
20853 	ill_t *ill;
20854 	char	buf[INET6_ADDRSTRLEN];
20855 
20856 	ASSERT(IAM_WRITER_IPIF(old_ipif));
20857 	ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif));
20858 
20859 	ill = old_ipif->ipif_ill;
20860 
20861 	ip1dbg(("ipif_update_other_ipifs(%s, %s)\n",
20862 	    ill->ill_name,
20863 	    inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr,
20864 	    buf, sizeof (buf))));
20865 	/*
20866 	 * If this is part of a group, look at all ills as
20867 	 * ipif_select_source borrows source addresses across all
20868 	 * the ills in the group.
20869 	 */
20870 	if (illgrp != NULL)
20871 		ill = illgrp->illgrp_ill;
20872 
20873 	for (; ill != NULL; ill = ill->ill_group_next) {
20874 		for (ipif = ill->ill_ipif; ipif != NULL;
20875 		    ipif = ipif->ipif_next) {
20876 
20877 			if (ipif == old_ipif)
20878 				continue;
20879 
20880 			ipif_recreate_interface_routes(old_ipif, ipif);
20881 		}
20882 	}
20883 }
20884 
20885 /* ARGSUSED */
20886 int
20887 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20888     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20889 {
20890 	/*
20891 	 * ill_phyint_reinit merged the v4 and v6 into a single
20892 	 * ipsq. Could also have become part of an ipmp group in the
20893 	 * process, and we might not have been able to complete the
20894 	 * operation in ipif_set_values, if we could not become
	 * exclusive. If so restart it here.
20895 	 */
20896 	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
20897 }
20898 
20899 
20900 /* ARGSUSED */
20901 int
20902 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20903     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20904 {
20905 	queue_t	*q1 = q;
20906 	char	*cp;
20907 	char	interf_name[LIFNAMSIZ];
20908 	uint_t	ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
20909 
20910 	if (!q->q_next) {
20911 		ip1dbg((
20912 		    "if_unitsel: IF_UNITSEL: no q_next\n"));
20913 		return (EINVAL);
20914 	}
20915 
20916 	if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
20917 		return (EALREADY);
20918 
20919 	do {
20920 		q1 = q1->q_next;
20921 	} while (q1->q_next);
20922 	cp = q1->q_qinfo->qi_minfo->mi_idname;
20923 	(void) sprintf(interf_name, "%s%d", cp, ppa);
20924 
20925 	/*
20926 	 * Here we are not going to delay the ioack until after
20927 	 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
20928 	 * original ioctl message before sending the requests.
20929 	 */
20930 	return (ipif_set_values(q, mp, interf_name, &ppa));
20931 }
20932 
20933 /* ARGSUSED */
20934 int
20935 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
20936     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
20937 {
20938 	return (ENXIO);
20939 }
20940 
20941 /*
20942  * Net and subnet broadcast ire's are now specific to the particular
20943  * physical interface (ill) and not to any one logical interface (ipif).
20944  * However, if a particular logical interface is being taken down, its
20945  * associated ire's will be taken down as well. Hence, when we go to
20946  * take down or change the local address, broadcast address or netmask
20947  * of a specific logical interface, we must check to make sure that we
20948  * have valid net and subnet broadcast ire's for the other logical
20949  * interfaces which may have been shared with the logical interface
20950  * being brought down or changed.
20951  *
20952  * There is one set of 0.0.0.0 and 255.255.255.255 per ill. Usually it
20953  * is tied to the first interface coming UP. If that ipif is going down,
20954  * we need to recreate them on the next valid ipif.
20955  *
20956  * Note: assume that the ipif passed in is still up so that its IRE
20957  * entries are still valid.
20958  */
20959 static void
20960 ipif_check_bcast_ires(ipif_t *test_ipif)
20961 {
20962 	ipif_t	*ipif;
20963 	ire_t	*test_subnet_ire, *test_net_ire;
20964 	ire_t	*test_allzero_ire, *test_allone_ire;
20965 	ire_t	*ire_array[12];
20966 	ire_t	**irep = &ire_array[0];
20967 	ire_t	**irep1;
20968 
20969 	ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask;
20970 	ipaddr_t test_net_addr, test_subnet_addr;
20971 	ipaddr_t test_net_mask, test_subnet_mask;
20972 	boolean_t need_net_bcast_ire = B_FALSE;
20973 	boolean_t need_subnet_bcast_ire = B_FALSE;
20974 	boolean_t allzero_bcast_ire_created = B_FALSE;
20975 	boolean_t allone_bcast_ire_created = B_FALSE;
20976 	boolean_t net_bcast_ire_created = B_FALSE;
20977 	boolean_t subnet_bcast_ire_created = B_FALSE;
20978 
20979 	ipif_t	*backup_ipif_net = (ipif_t *)NULL;
20980 	ipif_t	*backup_ipif_subnet = (ipif_t *)NULL;
20981 	ipif_t	*backup_ipif_allzeros = (ipif_t *)NULL;
20982 	ipif_t	*backup_ipif_allones = (ipif_t *)NULL;
20983 	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
20984 
20985 	ASSERT(!test_ipif->ipif_isv6);
20986 	ASSERT(IAM_WRITER_IPIF(test_ipif));
20987 
20988 	/*
20989 	 * No broadcast IREs for the LOOPBACK interface
20990 	 * or others such as point to point and IPIF_NOXMIT.
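	 *
	 * For reference, a sketch with hypothetical addresses: an ipif
	 * at 172.16.5.10 with netmask 255.255.255.0 has the subnet
	 * broadcast pair 172.16.5.0/172.16.5.255, while ip_net_mask()
	 * yields the classful (class B) mask 255.255.0.0 and hence the
	 * net broadcast pair 172.16.0.0/172.16.255.255. Together with
	 * 0.0.0.0 and 255.255.255.255 these are the addresses examined
	 * below.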
20991 	 */
20992 	if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
20993 	    (test_ipif->ipif_flags & IPIF_NOXMIT))
20994 		return;
20995 
20996 	test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST,
20997 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
20998 
20999 	test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST,
21000 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
21001 
21002 	test_net_mask = ip_net_mask(test_ipif->ipif_subnet);
21003 	test_subnet_mask = test_ipif->ipif_net_mask;
21004 
21005 	/*
21006 	 * If no net mask set, assume the default based on net class.
21007 	 */
21008 	if (test_subnet_mask == 0)
21009 		test_subnet_mask = test_net_mask;
21010 
21011 	/*
21012 	 * Check if there is a network broadcast ire associated with this ipif
21013 	 */
21014 	test_net_addr = test_net_mask & test_ipif->ipif_subnet;
21015 	test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST,
21016 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
21017 
21018 	/*
21019 	 * Check if there is a subnet broadcast IRE associated with this ipif
21020 	 */
21021 	test_subnet_addr = test_subnet_mask & test_ipif->ipif_subnet;
21022 	test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST,
21023 	    test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));
21024 
21025 	/*
21026 	 * No broadcast ire's associated with this ipif.
21027 	 */
21028 	if ((test_subnet_ire == NULL) && (test_net_ire == NULL) &&
21029 	    (test_allzero_ire == NULL) && (test_allone_ire == NULL)) {
21030 		return;
21031 	}
21032 
21033 	/*
21034 	 * We have established which bcast ires have to be replaced.
21035 	 * Next we try to locate ipifs that match their ires.
21036 	 * The rules are simple: If we find an ipif that matches on the subnet
21037 	 * address it will also match on the net address, the allzeros and
21038 	 * allones address. Any ipif that matches only on the net address will
21039 	 * also match the allzeros and allones addresses.
21040 	 * The other criterion is the ipif_flags. We look for non-deprecated
21041 	 * (and non-anycast and non-nolocal) ipifs as the best choice.
21042 	 * ipifs with check_flags matching (deprecated, etc) are used only
21043 	 * if good ipifs are not available. While looping, we save existing
21044 	 * deprecated ipifs as backup_ipif.
21045 	 * We loop through all the ipifs for this ill looking for ipifs
21046 	 * whose broadcast addr match the ipif passed in, but do not have
21047 	 * their own broadcast ires. For creating 0.0.0.0 and
21048 	 * 255.255.255.255 we just need any ipif on this ill.
21049 	 */
21050 	for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL;
21051 	    ipif = ipif->ipif_next) {
21052 
21053 		ASSERT(!ipif->ipif_isv6);
21054 		/*
21055 		 * Already checked the ipif passed in.
21056 		 */
21057 		if (ipif == test_ipif) {
21058 			continue;
21059 		}
21060 
21061 		/*
21062 		 * We only need to recreate broadcast ires if another ipif in
21063 		 * the same zone uses them. The new ires must be created in the
21064 		 * same zone.
21065 		 */
21066 		if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) {
21067 			continue;
21068 		}
21069 
21070 		/*
21071 		 * Only interested in logical interfaces with valid local
21072 		 * addresses or with the ability to broadcast.
21073 		 */
21074 		if ((ipif->ipif_subnet == 0) ||
21075 		    !(ipif->ipif_flags & IPIF_BROADCAST) ||
21076 		    (ipif->ipif_flags & IPIF_NOXMIT) ||
21077 		    !(ipif->ipif_flags & IPIF_UP)) {
21078 			continue;
21079 		}
21080 		/*
21081 		 * Check if there is a net broadcast ire for this
21082 		 * net address.
If it turns out that the ipif we are 21083 * about to take down owns this ire, we must make a 21084 * new one because it is potentially going away. 21085 */ 21086 if (test_net_ire && (!net_bcast_ire_created)) { 21087 net_mask = ip_net_mask(ipif->ipif_subnet); 21088 net_addr = net_mask & ipif->ipif_subnet; 21089 if (net_addr == test_net_addr) { 21090 need_net_bcast_ire = B_TRUE; 21091 /* 21092 * Use DEPRECATED ipif only if no good 21093 * ires are available. subnet_addr is 21094 * a better match than net_addr. 21095 */ 21096 if ((ipif->ipif_flags & check_flags) && 21097 (backup_ipif_net == NULL)) { 21098 backup_ipif_net = ipif; 21099 } 21100 } 21101 } 21102 /* 21103 * Check if there is a subnet broadcast ire for this 21104 * net address. If it turns out that the ipif we are 21105 * about to take down owns this ire, we must make a 21106 * new one because it is potentially going away. 21107 */ 21108 if (test_subnet_ire && (!subnet_bcast_ire_created)) { 21109 subnet_mask = ipif->ipif_net_mask; 21110 subnet_addr = ipif->ipif_subnet; 21111 if (subnet_addr == test_subnet_addr) { 21112 need_subnet_bcast_ire = B_TRUE; 21113 if ((ipif->ipif_flags & check_flags) && 21114 (backup_ipif_subnet == NULL)) { 21115 backup_ipif_subnet = ipif; 21116 } 21117 } 21118 } 21119 21120 21121 /* Short circuit here if this ipif is deprecated */ 21122 if (ipif->ipif_flags & check_flags) { 21123 if ((test_allzero_ire != NULL) && 21124 (!allzero_bcast_ire_created) && 21125 (backup_ipif_allzeros == NULL)) { 21126 backup_ipif_allzeros = ipif; 21127 } 21128 if ((test_allone_ire != NULL) && 21129 (!allone_bcast_ire_created) && 21130 (backup_ipif_allones == NULL)) { 21131 backup_ipif_allones = ipif; 21132 } 21133 continue; 21134 } 21135 21136 /* 21137 * Found an ipif which has the same broadcast ire as the 21138 * ipif passed in and the ipif passed in "owns" the ire. 21139 * Create new broadcast ire's for this broadcast addr. 21140 */ 21141 if (need_net_bcast_ire && !net_bcast_ire_created) { 21142 irep = ire_create_bcast(ipif, net_addr, irep); 21143 irep = ire_create_bcast(ipif, 21144 ~net_mask | net_addr, irep); 21145 net_bcast_ire_created = B_TRUE; 21146 } 21147 if (need_subnet_bcast_ire && !subnet_bcast_ire_created) { 21148 irep = ire_create_bcast(ipif, subnet_addr, irep); 21149 irep = ire_create_bcast(ipif, 21150 ~subnet_mask | subnet_addr, irep); 21151 subnet_bcast_ire_created = B_TRUE; 21152 } 21153 if (test_allzero_ire != NULL && !allzero_bcast_ire_created) { 21154 irep = ire_create_bcast(ipif, 0, irep); 21155 allzero_bcast_ire_created = B_TRUE; 21156 } 21157 if (test_allone_ire != NULL && !allone_bcast_ire_created) { 21158 irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep); 21159 allone_bcast_ire_created = B_TRUE; 21160 } 21161 /* 21162 * Once we have created all the appropriate ires, we 21163 * just break out of this loop to add what we have created. 21164 * This has been indented similar to ire_match_args for 21165 * readability. 21166 */ 21167 if (((test_net_ire == NULL) || 21168 (net_bcast_ire_created)) && 21169 ((test_subnet_ire == NULL) || 21170 (subnet_bcast_ire_created)) && 21171 ((test_allzero_ire == NULL) || 21172 (allzero_bcast_ire_created)) && 21173 ((test_allone_ire == NULL) || 21174 (allone_bcast_ire_created))) { 21175 break; 21176 } 21177 } 21178 21179 /* 21180 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs 21181 * exist. 6 pairs of bcast ires are needed. 21182 * Note - the old ires are deleted in ipif_down. 
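	 *
	 * (The six addresses are 0.0.0.0, 255.255.255.255, the net and
	 * subnet addresses and their corresponding broadcast addresses;
	 * each ire_create_bcast() call may append both a non-loopback
	 * and a loopback copy, which is presumably why the ire_array
	 * declared above has 12 slots.)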
21183 */ 21184 if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) { 21185 ipif = backup_ipif_net; 21186 irep = ire_create_bcast(ipif, net_addr, irep); 21187 irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep); 21188 net_bcast_ire_created = B_TRUE; 21189 } 21190 if (need_subnet_bcast_ire && !subnet_bcast_ire_created && 21191 backup_ipif_subnet) { 21192 ipif = backup_ipif_subnet; 21193 irep = ire_create_bcast(ipif, subnet_addr, irep); 21194 irep = ire_create_bcast(ipif, 21195 ~subnet_mask | subnet_addr, irep); 21196 subnet_bcast_ire_created = B_TRUE; 21197 } 21198 if (test_allzero_ire != NULL && !allzero_bcast_ire_created && 21199 backup_ipif_allzeros) { 21200 irep = ire_create_bcast(backup_ipif_allzeros, 0, irep); 21201 allzero_bcast_ire_created = B_TRUE; 21202 } 21203 if (test_allone_ire != NULL && !allone_bcast_ire_created && 21204 backup_ipif_allones) { 21205 irep = ire_create_bcast(backup_ipif_allones, 21206 INADDR_BROADCAST, irep); 21207 allone_bcast_ire_created = B_TRUE; 21208 } 21209 21210 /* 21211 * If we can't create all of them, don't add any of them. 21212 * Code in ip_wput_ire and ire_to_ill assumes that we 21213 * always have a non-loopback copy and loopback copy 21214 * for a given address. 21215 */ 21216 for (irep1 = irep; irep1 > ire_array; ) { 21217 irep1--; 21218 if (*irep1 == NULL) { 21219 ip0dbg(("ipif_check_bcast_ires: can't create " 21220 "IRE_BROADCAST, memory allocation failure\n")); 21221 while (irep > ire_array) { 21222 irep--; 21223 if (*irep != NULL) 21224 ire_delete(*irep); 21225 } 21226 goto bad; 21227 } 21228 } 21229 for (irep1 = irep; irep1 > ire_array; ) { 21230 int error; 21231 21232 irep1--; 21233 error = ire_add(irep1, NULL, NULL, NULL, B_FALSE); 21234 if (error == 0) { 21235 ire_refrele(*irep1); /* Held in ire_add */ 21236 } 21237 } 21238 bad: 21239 if (test_allzero_ire != NULL) 21240 ire_refrele(test_allzero_ire); 21241 if (test_allone_ire != NULL) 21242 ire_refrele(test_allone_ire); 21243 if (test_net_ire != NULL) 21244 ire_refrele(test_net_ire); 21245 if (test_subnet_ire != NULL) 21246 ire_refrele(test_subnet_ire); 21247 } 21248 21249 /* 21250 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 21251 * from lifr_flags and the name from lifr_name. 21252 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 21253 * since ipif_lookup_on_name uses the _isv6 flags when matching. 21254 * Returns EINPROGRESS when mp has been consumed by queueing it on 21255 * ill_pending_mp and the ioctl will complete in ip_rput. 21256 */ 21257 /* ARGSUSED */ 21258 int 21259 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21260 ip_ioctl_cmd_t *ipip, void *if_req) 21261 { 21262 int err; 21263 ill_t *ill; 21264 struct lifreq *lifr = (struct lifreq *)if_req; 21265 21266 ASSERT(ipif != NULL); 21267 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 21268 ASSERT(q->q_next != NULL); 21269 21270 ill = (ill_t *)q->q_ptr; 21271 /* 21272 * If we are not writer on 'q' then this interface exists already 21273 * and previous lookups (ipif_extract_lifreq_cmn) found this ipif. 21274 * So return EALREADY 21275 */ 21276 if (ill != ipif->ipif_ill) 21277 return (EALREADY); 21278 21279 if (ill->ill_name[0] != '\0') 21280 return (EALREADY); 21281 21282 /* 21283 * Set all the flags. Allows all kinds of override. Provide some 21284 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST 21285 * unless there is either multicast/broadcast support in the driver 21286 * or it is a pt-pt link. 
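 *
 * For example (a sketch), a request for IFF_IPV4|IFF_BROADCAST on an
 * already-attached driver whose DL_INFO_ACK carried no broadcast
 * address (ill_bcast_addr_length == 0) is rejected below, while the
 * same request on an Ethernet-style driver succeeds; for a DL_STYLE2
 * driver that check is simply skipped, as noted below.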
21287 */ 21288 if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) { 21289 /* Meaningless to IP thus don't allow them to be set. */ 21290 ip1dbg(("ip_setname: EINVAL 1\n")); 21291 return (EINVAL); 21292 } 21293 /* 21294 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the 21295 * ill_bcast_addr_length info. 21296 */ 21297 if (!ill->ill_needs_attach && 21298 ((lifr->lifr_flags & IFF_MULTICAST) && 21299 !(lifr->lifr_flags & IFF_POINTOPOINT) && 21300 ill->ill_bcast_addr_length == 0)) { 21301 /* Link not broadcast/pt-pt capable i.e. no multicast */ 21302 ip1dbg(("ip_setname: EINVAL 2\n")); 21303 return (EINVAL); 21304 } 21305 if ((lifr->lifr_flags & IFF_BROADCAST) && 21306 ((lifr->lifr_flags & IFF_IPV6) || 21307 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 21308 /* Link not broadcast capable or IPv6 i.e. no broadcast */ 21309 ip1dbg(("ip_setname: EINVAL 3\n")); 21310 return (EINVAL); 21311 } 21312 if (lifr->lifr_flags & IFF_UP) { 21313 /* Can only be set with SIOCSLIFFLAGS */ 21314 ip1dbg(("ip_setname: EINVAL 4\n")); 21315 return (EINVAL); 21316 } 21317 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 && 21318 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) { 21319 ip1dbg(("ip_setname: EINVAL 5\n")); 21320 return (EINVAL); 21321 } 21322 /* 21323 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 21324 */ 21325 if ((lifr->lifr_flags & IFF_XRESOLV) && 21326 !(lifr->lifr_flags & IFF_IPV6) && 21327 !(ipif->ipif_isv6)) { 21328 ip1dbg(("ip_setname: EINVAL 6\n")); 21329 return (EINVAL); 21330 } 21331 21332 /* 21333 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence 21334 * we have all the flags here. So, we assign rather than we OR. 21335 * We can't OR the flags here because we don't want to set 21336 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in 21337 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending 21338 * on lifr_flags value here. 21339 */ 21340 /* 21341 * This ill has not been inserted into the global list. 21342 * So we are still single threaded and don't need any lock 21343 */ 21344 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS & 21345 ~IFF_DUPLICATE; 21346 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; 21347 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; 21348 21349 /* We started off as V4. */ 21350 if (ill->ill_flags & ILLF_IPV6) { 21351 ill->ill_phyint->phyint_illv6 = ill; 21352 ill->ill_phyint->phyint_illv4 = NULL; 21353 } 21354 err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa); 21355 return (err); 21356 } 21357 21358 /* ARGSUSED */ 21359 int 21360 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21361 ip_ioctl_cmd_t *ipip, void *if_req) 21362 { 21363 /* 21364 * ill_phyint_reinit merged the v4 and v6 into a single 21365 * ipsq. Could also have become part of a ipmp group in the 21366 * process, and we might not have been able to complete the 21367 * slifname in ipif_set_values, if we could not become 21368 * exclusive. If so restart it here 21369 */ 21370 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21371 } 21372 21373 /* 21374 * Return a pointer to the ipif which matches the index, IP version type and 21375 * zoneid. 
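 *
 * q, mp, func and err must be all NULL or all non-NULL (see the
 * ASSERT below). When they are supplied and the ill cannot be looked
 * up right now (e.g. an exclusive operation is in progress), the
 * ioctl is enqueued on the ill's ipsq for restart, NULL is returned
 * and *err is set to EINPROGRESS; if no matching ipif exists, NULL
 * is returned with *err set to ENXIO.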
21376  */
21377 ipif_t *
21378 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
21379     queue_t *q, mblk_t *mp, ipsq_func_t func, int *err)
21380 {
21381 	ill_t	*ill;
21382 	ipsq_t	*ipsq;
21383 	phyint_t *phyi;
21384 	ipif_t	*ipif;
21385 
21386 	ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
21387 	    (q != NULL && mp != NULL && func != NULL && err != NULL));
21388 
21389 	if (err != NULL)
21390 		*err = 0;
21391 
21392 	/*
21393 	 * Indexes are stored in the phyint - a common structure
21394 	 * to both IPv4 and IPv6.
21395 	 */
21396 
21397 	rw_enter(&ill_g_lock, RW_READER);
21398 	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
21399 	    (void *) &index, NULL);
21400 	if (phyi != NULL) {
21401 		ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4;
21402 		if (ill == NULL) {
21403 			rw_exit(&ill_g_lock);
21404 			if (err != NULL)
21405 				*err = ENXIO;
21406 			return (NULL);
21407 		}
21408 		GRAB_CONN_LOCK(q);
21409 		mutex_enter(&ill->ill_lock);
21410 		if (ILL_CAN_LOOKUP(ill)) {
21411 			for (ipif = ill->ill_ipif; ipif != NULL;
21412 			    ipif = ipif->ipif_next) {
21413 				if (IPIF_CAN_LOOKUP(ipif) &&
21414 				    (zoneid == ALL_ZONES ||
21415 				    zoneid == ipif->ipif_zoneid ||
21416 				    ipif->ipif_zoneid == ALL_ZONES)) {
21417 					ipif_refhold_locked(ipif);
21418 					mutex_exit(&ill->ill_lock);
21419 					RELEASE_CONN_LOCK(q);
21420 					rw_exit(&ill_g_lock);
21421 					return (ipif);
21422 				}
21423 			}
21424 		} else if (ILL_CAN_WAIT(ill, q)) {
21425 			ipsq = ill->ill_phyint->phyint_ipsq;
21426 			mutex_enter(&ipsq->ipsq_lock);
21427 			rw_exit(&ill_g_lock);
21428 			mutex_exit(&ill->ill_lock);
21429 			ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
21430 			mutex_exit(&ipsq->ipsq_lock);
21431 			RELEASE_CONN_LOCK(q);
21432 			*err = EINPROGRESS;
21433 			return (NULL);
21434 		}
21435 		mutex_exit(&ill->ill_lock);
21436 		RELEASE_CONN_LOCK(q);
21437 	}
21438 	rw_exit(&ill_g_lock);
21439 	if (err != NULL)
21440 		*err = ENXIO;
21441 	return (NULL);
21442 }
21443 
21444 typedef struct conn_change_s {
21445 	uint_t cc_old_ifindex;
21446 	uint_t cc_new_ifindex;
21447 } conn_change_t;
21448 
21449 /*
21450  * ipcl_walk function for changing interface index.
21451  */
21452 static void
21453 conn_change_ifindex(conn_t *connp, caddr_t arg)
21454 {
21455 	conn_change_t *connc;
21456 	uint_t old_ifindex;
21457 	uint_t new_ifindex;
21458 	int i;
21459 	ilg_t *ilg;
21460 
21461 	connc = (conn_change_t *)arg;
21462 	old_ifindex = connc->cc_old_ifindex;
21463 	new_ifindex = connc->cc_new_ifindex;
21464 
21465 	if (connp->conn_orig_bound_ifindex == old_ifindex)
21466 		connp->conn_orig_bound_ifindex = new_ifindex;
21467 
21468 	if (connp->conn_orig_multicast_ifindex == old_ifindex)
21469 		connp->conn_orig_multicast_ifindex = new_ifindex;
21470 
21471 	if (connp->conn_orig_xmit_ifindex == old_ifindex)
21472 		connp->conn_orig_xmit_ifindex = new_ifindex;
21473 
21474 	for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
21475 		ilg = &connp->conn_ilg[i];
21476 		if (ilg->ilg_orig_ifindex == old_ifindex)
21477 			ilg->ilg_orig_ifindex = new_ifindex;
21478 	}
21479 }
21480 
21481 /*
21482  * Walk all the ipifs and ilms on this ill and change the orig_ifindex
21483  * to new_index if it matches the old_index.
21484  *
21485  * Failovers typically happen within a group of ills. But somebody
21486  * can remove an ill from the group after a failover happened. If
21487  * we are setting the ifindex after this, we potentially need to
21488  * look at all the ills rather than just the ones in the group.
21489  * We cut down the work by looking at matching ill_net_types
21490  * and ill_types, as ills of differing types could never have been grouped.
21491 */ 21492 static void 21493 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 21494 { 21495 ill_t *ill; 21496 ipif_t *ipif; 21497 uint_t old_ifindex; 21498 uint_t new_ifindex; 21499 ilm_t *ilm; 21500 ill_walk_context_t ctx; 21501 21502 old_ifindex = connc->cc_old_ifindex; 21503 new_ifindex = connc->cc_new_ifindex; 21504 21505 rw_enter(&ill_g_lock, RW_READER); 21506 ill = ILL_START_WALK_ALL(&ctx); 21507 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 21508 if ((ill_orig->ill_net_type != ill->ill_net_type) || 21509 (ill_orig->ill_type != ill->ill_type)) { 21510 continue; 21511 } 21512 for (ipif = ill->ill_ipif; ipif != NULL; 21513 ipif = ipif->ipif_next) { 21514 if (ipif->ipif_orig_ifindex == old_ifindex) 21515 ipif->ipif_orig_ifindex = new_ifindex; 21516 } 21517 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 21518 if (ilm->ilm_orig_ifindex == old_ifindex) 21519 ilm->ilm_orig_ifindex = new_ifindex; 21520 } 21521 } 21522 rw_exit(&ill_g_lock); 21523 } 21524 21525 /* 21526 * We first need to ensure that the new index is unique, and 21527 * then carry the change across both v4 and v6 ill representation 21528 * of the physical interface. 21529 */ 21530 /* ARGSUSED */ 21531 int 21532 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21533 ip_ioctl_cmd_t *ipip, void *ifreq) 21534 { 21535 ill_t *ill; 21536 ill_t *ill_other; 21537 phyint_t *phyi; 21538 int old_index; 21539 conn_change_t connc; 21540 struct ifreq *ifr = (struct ifreq *)ifreq; 21541 struct lifreq *lifr = (struct lifreq *)ifreq; 21542 uint_t index; 21543 ill_t *ill_v4; 21544 ill_t *ill_v6; 21545 21546 if (ipip->ipi_cmd_type == IF_CMD) 21547 index = ifr->ifr_index; 21548 else 21549 index = lifr->lifr_index; 21550 21551 /* 21552 * Only allow on physical interface. Also, index zero is illegal. 21553 * 21554 * Need to check for PHYI_FAILED and PHYI_INACTIVE 21555 * 21556 * 1) If PHYI_FAILED is set, a failover could have happened which 21557 * implies a possible failback might have to happen. As failback 21558 * depends on the old index, we should fail setting the index. 21559 * 21560 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 21561 * any addresses or multicast memberships are failed over to 21562 * a non-STANDBY interface. As failback depends on the old 21563 * index, we should fail setting the index for this case also. 21564 * 21565 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 21566 * Be consistent with PHYI_FAILED and fail the ioctl. 21567 */ 21568 ill = ipif->ipif_ill; 21569 phyi = ill->ill_phyint; 21570 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 21571 ipif->ipif_id != 0 || index == 0) { 21572 return (EINVAL); 21573 } 21574 old_index = phyi->phyint_ifindex; 21575 21576 /* If the index is not changing, no work to do */ 21577 if (old_index == index) 21578 return (0); 21579 21580 /* 21581 * Use ill_lookup_on_ifindex to determine if the 21582 * new index is unused and if so allow the change. 21583 */ 21584 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL); 21585 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL); 21586 if (ill_v6 != NULL || ill_v4 != NULL) { 21587 if (ill_v4 != NULL) 21588 ill_refrele(ill_v4); 21589 if (ill_v6 != NULL) 21590 ill_refrele(ill_v6); 21591 return (EBUSY); 21592 } 21593 21594 /* 21595 * The new index is unused. Set it in the phyint. 21596 * Locate the other ill so that we can send a routing 21597 * sockets message. 
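	 *
	 * Since the index lives in the shared phyint, the change is
	 * visible to both the v4 and v6 ills at once. As a usage
	 * sketch (hypothetical interface name), a request such as
	 * "ifconfig hme0 index 3" reaches this code as SIOCSIFINDEX
	 * or SIOCSLIFINDEX and triggers the conn/ipif/ilm walks below.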
21598 */ 21599 if (ill->ill_isv6) { 21600 ill_other = phyi->phyint_illv4; 21601 } else { 21602 ill_other = phyi->phyint_illv6; 21603 } 21604 21605 phyi->phyint_ifindex = index; 21606 21607 connc.cc_old_ifindex = old_index; 21608 connc.cc_new_ifindex = index; 21609 ip_change_ifindex(ill, &connc); 21610 ipcl_walk(conn_change_ifindex, (caddr_t)&connc); 21611 21612 /* Send the routing sockets message */ 21613 ip_rts_ifmsg(ipif); 21614 if (ill_other != NULL) 21615 ip_rts_ifmsg(ill_other->ill_ipif); 21616 21617 return (0); 21618 } 21619 21620 /* ARGSUSED */ 21621 int 21622 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21623 ip_ioctl_cmd_t *ipip, void *ifreq) 21624 { 21625 struct ifreq *ifr = (struct ifreq *)ifreq; 21626 struct lifreq *lifr = (struct lifreq *)ifreq; 21627 21628 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 21629 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21630 /* Get the interface index */ 21631 if (ipip->ipi_cmd_type == IF_CMD) { 21632 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21633 } else { 21634 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21635 } 21636 return (0); 21637 } 21638 21639 /* ARGSUSED */ 21640 int 21641 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21642 ip_ioctl_cmd_t *ipip, void *ifreq) 21643 { 21644 struct lifreq *lifr = (struct lifreq *)ifreq; 21645 21646 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 21647 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21648 /* Get the interface zone */ 21649 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21650 lifr->lifr_zoneid = ipif->ipif_zoneid; 21651 return (0); 21652 } 21653 21654 /* 21655 * Set the zoneid of an interface. 21656 */ 21657 /* ARGSUSED */ 21658 int 21659 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21660 ip_ioctl_cmd_t *ipip, void *ifreq) 21661 { 21662 struct lifreq *lifr = (struct lifreq *)ifreq; 21663 int err = 0; 21664 boolean_t need_up = B_FALSE; 21665 zone_t *zptr; 21666 zone_status_t status; 21667 zoneid_t zoneid; 21668 21669 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21670 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 21671 if (!is_system_labeled()) 21672 return (ENOTSUP); 21673 zoneid = GLOBAL_ZONEID; 21674 } 21675 21676 /* cannot assign instance zero to a non-global zone */ 21677 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 21678 return (ENOTSUP); 21679 21680 /* 21681 * Cannot assign to a zone that doesn't exist or is shutting down. In 21682 * the event of a race with the zone shutdown processing, since IP 21683 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 21684 * interface will be cleaned up even if the zone is shut down 21685 * immediately after the status check. If the interface can't be brought 21686 * down right away, and the zone is shut down before the restart 21687 * function is called, we resolve the possible races by rechecking the 21688 * zone status in the restart function. 21689 */ 21690 if ((zptr = zone_find_by_id(zoneid)) == NULL) 21691 return (EINVAL); 21692 status = zone_status_get(zptr); 21693 zone_rele(zptr); 21694 21695 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 21696 return (EINVAL); 21697 21698 if (ipif->ipif_flags & IPIF_UP) { 21699 /* 21700 * If the interface is already marked up, 21701 * we call ipif_down which will take care 21702 * of ditching any IREs that have been set 21703 * up based on the old interface address. 
21704 */ 21705 err = ipif_logical_down(ipif, q, mp); 21706 if (err == EINPROGRESS) 21707 return (err); 21708 ipif_down_tail(ipif); 21709 need_up = B_TRUE; 21710 } 21711 21712 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 21713 return (err); 21714 } 21715 21716 static int 21717 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 21718 queue_t *q, mblk_t *mp, boolean_t need_up) 21719 { 21720 int err = 0; 21721 21722 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 21723 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21724 21725 /* Set the new zone id. */ 21726 ipif->ipif_zoneid = zoneid; 21727 21728 /* Update sctp list */ 21729 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 21730 21731 if (need_up) { 21732 /* 21733 * Now bring the interface back up. If this 21734 * is the only IPIF for the ILL, ipif_up 21735 * will have to re-bind to the device, so 21736 * we may get back EINPROGRESS, in which 21737 * case, this IOCTL will get completed in 21738 * ip_rput_dlpi when we see the DL_BIND_ACK. 21739 */ 21740 err = ipif_up(ipif, q, mp); 21741 } 21742 return (err); 21743 } 21744 21745 /* ARGSUSED */ 21746 int 21747 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21748 ip_ioctl_cmd_t *ipip, void *if_req) 21749 { 21750 struct lifreq *lifr = (struct lifreq *)if_req; 21751 zoneid_t zoneid; 21752 zone_t *zptr; 21753 zone_status_t status; 21754 21755 ASSERT(ipif->ipif_id != 0); 21756 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21757 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 21758 zoneid = GLOBAL_ZONEID; 21759 21760 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 21761 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21762 21763 /* 21764 * We recheck the zone status to resolve the following race condition: 21765 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 21766 * 2) hme0:1 is up and can't be brought down right away; 21767 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 21768 * 3) zone "myzone" is halted; the zone status switches to 21769 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 21770 * the interfaces to remove - hme0:1 is not returned because it's not 21771 * yet in "myzone", so it won't be removed; 21772 * 4) the restart function for SIOCSLIFZONE is called; without the 21773 * status check here, we would have hme0:1 in "myzone" after it's been 21774 * destroyed. 21775 * Note that if the status check fails, we need to bring the interface 21776 * back to its state prior to ip_sioctl_slifzone(), hence the call to 21777 * ipif_up_done[_v6](). 
21778 */ 21779 status = ZONE_IS_UNINITIALIZED; 21780 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 21781 status = zone_status_get(zptr); 21782 zone_rele(zptr); 21783 } 21784 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 21785 if (ipif->ipif_isv6) { 21786 (void) ipif_up_done_v6(ipif); 21787 } else { 21788 (void) ipif_up_done(ipif); 21789 } 21790 return (EINVAL); 21791 } 21792 21793 ipif_down_tail(ipif); 21794 21795 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 21796 B_TRUE)); 21797 } 21798 21799 /* ARGSUSED */ 21800 int 21801 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21802 ip_ioctl_cmd_t *ipip, void *ifreq) 21803 { 21804 struct lifreq *lifr = ifreq; 21805 21806 ASSERT(q->q_next == NULL); 21807 ASSERT(CONN_Q(q)); 21808 21809 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 21810 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21811 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 21812 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 21813 21814 return (0); 21815 } 21816 21817 21818 /* Find the previous ILL in this usesrc group */ 21819 static ill_t * 21820 ill_prev_usesrc(ill_t *uill) 21821 { 21822 ill_t *ill; 21823 21824 for (ill = uill->ill_usesrc_grp_next; 21825 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 21826 ill = ill->ill_usesrc_grp_next) 21827 /* do nothing */; 21828 return (ill); 21829 } 21830 21831 /* 21832 * Release all members of the usesrc group. This routine is called 21833 * from ill_delete when the interface being unplumbed is the 21834 * group head. 21835 */ 21836 static void 21837 ill_disband_usesrc_group(ill_t *uill) 21838 { 21839 ill_t *next_ill, *tmp_ill; 21840 ASSERT(RW_WRITE_HELD(&ill_g_usesrc_lock)); 21841 next_ill = uill->ill_usesrc_grp_next; 21842 21843 do { 21844 ASSERT(next_ill != NULL); 21845 tmp_ill = next_ill->ill_usesrc_grp_next; 21846 ASSERT(tmp_ill != NULL); 21847 next_ill->ill_usesrc_grp_next = NULL; 21848 next_ill->ill_usesrc_ifindex = 0; 21849 next_ill = tmp_ill; 21850 } while (next_ill->ill_usesrc_ifindex != 0); 21851 uill->ill_usesrc_grp_next = NULL; 21852 } 21853 21854 /* 21855 * Remove the client usesrc ILL from the list and relink to a new list 21856 */ 21857 int 21858 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 21859 { 21860 ill_t *ill, *tmp_ill; 21861 21862 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 21863 (uill != NULL) && RW_WRITE_HELD(&ill_g_usesrc_lock)); 21864 21865 /* 21866 * Check if the usesrc client ILL passed in is not already 21867 * in use as a usesrc ILL i.e one whose source address is 21868 * in use OR a usesrc ILL is not already in use as a usesrc 21869 * client ILL 21870 */ 21871 if ((ucill->ill_usesrc_ifindex == 0) || 21872 (uill->ill_usesrc_ifindex != 0)) { 21873 return (-1); 21874 } 21875 21876 ill = ill_prev_usesrc(ucill); 21877 ASSERT(ill->ill_usesrc_grp_next != NULL); 21878 21879 /* Remove from the current list */ 21880 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 21881 /* Only two elements in the list */ 21882 ASSERT(ill->ill_usesrc_ifindex == 0); 21883 ill->ill_usesrc_grp_next = NULL; 21884 } else { 21885 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 21886 } 21887 21888 if (ifindex == 0) { 21889 ucill->ill_usesrc_ifindex = 0; 21890 ucill->ill_usesrc_grp_next = NULL; 21891 return (0); 21892 } 21893 21894 ucill->ill_usesrc_ifindex = ifindex; 21895 tmp_ill = uill->ill_usesrc_grp_next; 21896 uill->ill_usesrc_grp_next = ucill; 21897 
ucill->ill_usesrc_grp_next =
21898 	    (tmp_ill != NULL) ? tmp_ill : uill;
21899 	return (0);
21900 }
21901 
21902 /*
21903  * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
21904  * ip.c for locking details.
21905  */
21906 /* ARGSUSED */
21907 int
21908 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
21909     ip_ioctl_cmd_t *ipip, void *ifreq)
21910 {
21911 	struct lifreq *lifr = (struct lifreq *)ifreq;
21912 	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
21913 	    ill_flag_changed = B_FALSE;
21914 	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
21915 	int err = 0, ret;
21916 	uint_t ifindex;
21917 	phyint_t *us_phyint, *us_cli_phyint;
21918 	ipsq_t *ipsq = NULL;
21919 
21920 	ASSERT(IAM_WRITER_IPIF(ipif));
21921 	ASSERT(q->q_next == NULL);
21922 	ASSERT(CONN_Q(q));
21923 
21924 	isv6 = (Q_TO_CONN(q))->conn_af_isv6;
21925 	us_cli_phyint = usesrc_cli_ill->ill_phyint;
21926 
21927 	ASSERT(us_cli_phyint != NULL);
21928 
21929 	/*
21930 	 * If the client ILL is being used for IPMP, abort.
21931 	 * Note, this can be done before ipsq_try_enter since we are already
21932 	 * exclusive on this ILL.
21933 	 */
21934 	if ((us_cli_phyint->phyint_groupname != NULL) ||
21935 	    (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
21936 		return (EINVAL);
21937 	}
21938 
21939 	ifindex = lifr->lifr_index;
21940 	if (ifindex == 0) {
21941 		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
21942 			/* non usesrc group interface, nothing to reset */
21943 			return (0);
21944 		}
21945 		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
21946 		/* valid reset request */
21947 		reset_flg = B_TRUE;
21948 	}
21949 
21950 	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
21951 	    ip_process_ioctl, &err);
21952 
21953 	if (usesrc_ill == NULL) {
21954 		return (err);
21955 	}
21956 
21957 	/*
21958 	 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
21959 	 * group nor can either of the interfaces be used for standby. So
21960 	 * to guarantee mutual exclusion with ip_sioctl_flags (which sets
21961 	 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
21962 	 * we need to be exclusive on the ipsq belonging to the usesrc_ill.
21963 	 * We are already exclusive on this ipsq, i.e. the ipsq corresponding
21964 	 * to the usesrc_cli_ill.
21965 	 */
21966 	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
21967 	    NEW_OP, B_TRUE);
21968 	if (ipsq == NULL) {
21969 		err = EINPROGRESS;
21970 		/* Operation enqueued on the ipsq of the usesrc ILL */
21971 		goto done;
21972 	}
21973 
21974 	/* Check if the usesrc_ill is used for IPMP */
21975 	us_phyint = usesrc_ill->ill_phyint;
21976 	if ((us_phyint->phyint_groupname != NULL) ||
21977 	    (us_phyint->phyint_flags & PHYI_STANDBY)) {
21978 		err = EINVAL;
21979 		goto done;
21980 	}
21981 
21982 	/*
21983 	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
21984 	 * already a client then return EINVAL
21985 	 */
21986 	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
21987 		err = EINVAL;
21988 		goto done;
21989 	}
21990 
21991 	/*
21992 	 * If the ill_usesrc_ifindex field is already set to what it needs to
21993 	 * be then this is a duplicate operation.
21994 	 */
21995 	if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
21996 		err = 0;
21997 		goto done;
21998 	}
21999 
22000 	ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
22001 	    " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
22002 	    usesrc_ill->ill_isv6));
22003 
22004 	/*
22005 	 * The next step ensures that no new ires will be created referencing
22006 	 * the client ill, until the ILL_CHANGING flag is cleared. Then
22007 	 * we go through an ire walk deleting all ire caches that reference
22008 	 * the client ill. New ires referencing the client ill that are added
22009 	 * to the ire table before the ILL_CHANGING flag is set, will be
22010 	 * cleaned up by the ire walk below. Attempts to add new ires
22011 	 * referencing the client ill while the ILL_CHANGING flag is set
22012 	 * will fail during the ire_add in ire_atomic_start. ire_atomic_start
22013 	 * atomically checks (under the ill_g_usesrc_lock) that the ire
22014 	 * being added is not stale, i.e. that the ire_stq and ire_ipif are
22015 	 * consistent and belong to the same usesrc group.
22016 	 */
22017 	mutex_enter(&usesrc_cli_ill->ill_lock);
22018 	usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
22019 	mutex_exit(&usesrc_cli_ill->ill_lock);
22020 	ill_flag_changed = B_TRUE;
22021 
22022 	if (ipif->ipif_isv6)
22023 		ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22024 		    ALL_ZONES);
22025 	else
22026 		ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22027 		    ALL_ZONES);
22028 
22029 	/*
22030 	 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
22031 	 * and the ill_usesrc_ifindex fields
22032 	 */
22033 	rw_enter(&ill_g_usesrc_lock, RW_WRITER);
22034 
22035 	if (reset_flg) {
22036 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
22037 		if (ret != 0) {
22038 			err = EINVAL;
22039 		}
22040 		rw_exit(&ill_g_usesrc_lock);
22041 		goto done;
22042 	}
22043 
22044 	/*
22045 	 * Four possibilities to consider:
22046 	 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
22047 	 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
22048 	 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
22049 	 * 4. Both are part of their respective usesrc groups
22050 	 */
22051 	if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
22052 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
22053 		ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
22054 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22055 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22056 		usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
22057 	} else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
22058 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
22059 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22060 		/* Insert at head of list */
22061 		usesrc_cli_ill->ill_usesrc_grp_next =
22062 		    usesrc_ill->ill_usesrc_grp_next;
22063 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22064 	} else {
22065 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
22066 		    ifindex);
22067 		if (ret != 0)
22068 			err = EINVAL;
22069 	}
22070 	rw_exit(&ill_g_usesrc_lock);
22071 
22072 done:
22073 	if (ill_flag_changed) {
22074 		mutex_enter(&usesrc_cli_ill->ill_lock);
22075 		usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
22076 		mutex_exit(&usesrc_cli_ill->ill_lock);
22077 	}
22078 	if (ipsq != NULL)
22079 		ipsq_exit(ipsq, B_TRUE, B_TRUE);
22080 	/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
22081 	ill_refrele(usesrc_ill);
22082 	return (err);
22083 }
22084 
22085 /*
22086  * Comparison function used by avl.
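 *
 * avl(9F) comparators must return exactly -1, 0 or +1. For the two
 * comparators that follow, the first argument is the search key (an
 * ifindex pointer or a name) and the second is the phyint stored in
 * the tree, matching how avl_find() is invoked in this file.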
22087 */ 22088 static int 22089 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 22090 { 22091 22092 uint_t index; 22093 22094 ASSERT(phyip != NULL && index_ptr != NULL); 22095 22096 index = *((uint_t *)index_ptr); 22097 /* 22098 * let the phyint with the lowest index be on top. 22099 */ 22100 if (((phyint_t *)phyip)->phyint_ifindex < index) 22101 return (1); 22102 if (((phyint_t *)phyip)->phyint_ifindex > index) 22103 return (-1); 22104 return (0); 22105 } 22106 22107 /* 22108 * comparison function used by avl. 22109 */ 22110 static int 22111 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 22112 { 22113 ill_t *ill; 22114 int res = 0; 22115 22116 ASSERT(phyip != NULL && name_ptr != NULL); 22117 22118 if (((phyint_t *)phyip)->phyint_illv4) 22119 ill = ((phyint_t *)phyip)->phyint_illv4; 22120 else 22121 ill = ((phyint_t *)phyip)->phyint_illv6; 22122 ASSERT(ill != NULL); 22123 22124 res = strcmp(ill->ill_name, (char *)name_ptr); 22125 if (res > 0) 22126 return (1); 22127 else if (res < 0) 22128 return (-1); 22129 return (0); 22130 } 22131 /* 22132 * This function is called from ill_delete when the ill is being 22133 * unplumbed. We remove the reference from the phyint and we also 22134 * free the phyint when there are no more references to it. 22135 */ 22136 static void 22137 ill_phyint_free(ill_t *ill) 22138 { 22139 phyint_t *phyi; 22140 phyint_t *next_phyint; 22141 ipsq_t *cur_ipsq; 22142 22143 ASSERT(ill->ill_phyint != NULL); 22144 22145 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 22146 phyi = ill->ill_phyint; 22147 ill->ill_phyint = NULL; 22148 /* 22149 * ill_init allocates a phyint always to store the copy 22150 * of flags relevant to phyint. At that point in time, we could 22151 * not assign the name and hence phyint_illv4/v6 could not be 22152 * initialized. Later in ipif_set_values, we assign the name to 22153 * the ill, at which point in time we assign phyint_illv4/v6. 22154 * Thus we don't rely on phyint_illv6 to be initialized always. 22155 */ 22156 if (ill->ill_flags & ILLF_IPV6) { 22157 phyi->phyint_illv6 = NULL; 22158 } else { 22159 phyi->phyint_illv4 = NULL; 22160 } 22161 /* 22162 * ipif_down removes it from the group when the last ipif goes 22163 * down. 22164 */ 22165 ASSERT(ill->ill_group == NULL); 22166 22167 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 22168 return; 22169 22170 /* 22171 * Make sure this phyint was put in the list. 22172 */ 22173 if (phyi->phyint_ifindex > 0) { 22174 avl_remove(&phyint_g_list.phyint_list_avl_by_index, 22175 phyi); 22176 avl_remove(&phyint_g_list.phyint_list_avl_by_name, 22177 phyi); 22178 } 22179 /* 22180 * remove phyint from the ipsq list. 22181 */ 22182 cur_ipsq = phyi->phyint_ipsq; 22183 if (phyi == cur_ipsq->ipsq_phyint_list) { 22184 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 22185 } else { 22186 next_phyint = cur_ipsq->ipsq_phyint_list; 22187 while (next_phyint != NULL) { 22188 if (next_phyint->phyint_ipsq_next == phyi) { 22189 next_phyint->phyint_ipsq_next = 22190 phyi->phyint_ipsq_next; 22191 break; 22192 } 22193 next_phyint = next_phyint->phyint_ipsq_next; 22194 } 22195 ASSERT(next_phyint != NULL); 22196 } 22197 IPSQ_DEC_REF(cur_ipsq); 22198 22199 if (phyi->phyint_groupname_len != 0) { 22200 ASSERT(phyi->phyint_groupname != NULL); 22201 mi_free(phyi->phyint_groupname); 22202 } 22203 mi_free(phyi); 22204 } 22205 22206 /* 22207 * Attach the ill to the phyint structure which can be shared by both 22208 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. 

/*
 * Attach the ill to the phyint structure, which can be shared by both
 * the IPv4 and IPv6 ills. ill_init allocates a phyint to just hold flags.
 * This function is called from ipif_set_values and ill_lookup_on_name (for
 * loopback) where we know the name of the ill. We lookup the ill and if
 * there is one present already with the name, use that phyint. Otherwise
 * reuse the one allocated by ill_init.
 */
static void
ill_phyint_reinit(ill_t *ill)
{
	boolean_t isv6 = ill->ill_isv6;
	phyint_t *phyi_old;
	phyint_t *phyi;
	avl_index_t where = 0;
	ill_t	*ill_other = NULL;
	ipsq_t	*ipsq;

	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	phyi_old = ill->ill_phyint;
	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
	    phyi_old->phyint_illv6 == NULL));
	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
	    phyi_old->phyint_illv4 == NULL));
	ASSERT(phyi_old->phyint_ifindex == 0);

	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name,
	    ill->ill_name, &where);

	/*
	 * 1. We grabbed the ill_g_lock before inserting this ill into
	 *    the global list of ills. So no other thread could have located
	 *    this ill and hence the ipsq of this ill is guaranteed to be
	 *    empty.
	 * 2. Now locate the other protocol instance of this ill.
	 * 3. Now grab both ill locks in the right order, and the phyint lock
	 *    of the new ipsq. Holding ill locks + ill_g_lock ensures that
	 *    the ipsq of neither ill can change.
	 * 4. Merge the phyint (and thus the ipsq as well) of this ill into
	 *    that of the other ill.
	 * 5. Release all locks.
	 */

	/*
	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
	 * we are initializing IPv4.
	 */
	if (phyi != NULL) {
		ill_other = (isv6) ? phyi->phyint_illv4 :
		    phyi->phyint_illv6;
		ASSERT(ill_other->ill_phyint != NULL);
		ASSERT((isv6 && !ill_other->ill_isv6) ||
		    (!isv6 && ill_other->ill_isv6));
		GRAB_ILL_LOCKS(ill, ill_other);
		/*
		 * We are potentially throwing away phyint_flags which
		 * could be different from the one that we obtain from
		 * ill_other->ill_phyint. But it is okay as we are assuming
		 * that the state maintained within IP is correct.
		 */
		mutex_enter(&phyi->phyint_lock);
		if (isv6) {
			ASSERT(phyi->phyint_illv6 == NULL);
			phyi->phyint_illv6 = ill;
		} else {
			ASSERT(phyi->phyint_illv4 == NULL);
			phyi->phyint_illv4 = ill;
		}
		/*
		 * This is a new ill, currently undergoing SLIFNAME,
		 * so it could not have joined an IPMP group until now.
		 */
		ASSERT(phyi_old->phyint_ipsq_next == NULL &&
		    phyi_old->phyint_groupname == NULL);

		/*
		 * This phyi_old is going away. Decref ipsq_refs and
		 * assert it is zero. The ipsq itself will be freed in
		 * ipsq_exit.
		 */
		ipsq = phyi_old->phyint_ipsq;
		IPSQ_DEC_REF(ipsq);
		ASSERT(ipsq->ipsq_refs == 0);
		/* Get the singleton phyint out of the ipsq list */
		ASSERT(phyi_old->phyint_ipsq_next == NULL);
		ipsq->ipsq_phyint_list = NULL;
		phyi_old->phyint_illv4 = NULL;
		phyi_old->phyint_illv6 = NULL;
		mi_free(phyi_old);
	} else {
		mutex_enter(&ill->ill_lock);
		/*
		 * We don't need to acquire any lock, since
		 * the ill is not yet visible globally and we
		 * have not yet released the ill_g_lock.
		 */
		phyi = phyi_old;
		mutex_enter(&phyi->phyint_lock);
		/* XXX We need a recovery strategy here. */
		if (!phyint_assign_ifindex(phyi))
			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");

		avl_insert(&phyint_g_list.phyint_list_avl_by_name,
		    (void *)phyi, where);

		(void) avl_find(&phyint_g_list.phyint_list_avl_by_index,
		    &phyi->phyint_ifindex, &where);
		avl_insert(&phyint_g_list.phyint_list_avl_by_index,
		    (void *)phyi, where);
	}

	/*
	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * The pending mp is not affected because that is kept on a
	 * per-ill basis.
	 */
	ill->ill_phyint = phyi;

	/*
	 * Keep the index in ipif_orig_ifindex to be used by FAILOVER.
	 * We do this here as when the first ipif was allocated,
	 * ipif_allocate does not know the right interface index.
	 */
	ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;

	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining per-index initialization.
	 */
	if (ill->ill_isv6) {
		ill->ill_ip6_mib->ipv6IfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
	}

	/*
	 * Generate an event within the hooks framework to indicate that
	 * a new interface has just been added to IP. For this event to
	 * be generated, the network interface must, at least, have an
	 * ifindex assigned to it.
	 *
	 * This needs to be run inside the ill_g_lock perimeter to ensure
	 * that the ordering of delivered events to listeners matches the
	 * order of them in the kernel.
	 *
	 * This function could be called from ill_lookup_on_name. In that case
	 * the interface is loopback "lo", which will not generate a NIC event.
	 */
	if (ill->ill_name_length <= 2 ||
	    ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') {
		hook_nic_event_t *info;

		if ((info = ill->ill_nic_event_info) != NULL) {
			ip2dbg(("ill_phyint_reinit: unexpected nic event %d "
			    "attached for %s\n", info->hne_event,
			    ill->ill_name));
			if (info->hne_data != NULL)
				kmem_free(info->hne_data, info->hne_datalen);
			kmem_free(info, sizeof (hook_nic_event_t));
		}

		info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP);
		if (info != NULL) {
			info->hne_nic = ill->ill_phyint->phyint_ifindex;
			info->hne_lif = 0;
			info->hne_event = NE_PLUMB;
			info->hne_family = ill->ill_isv6 ? ipv6 : ipv4;
			info->hne_data = kmem_alloc(ill->ill_name_length,
			    KM_NOSLEEP);
			if (info->hne_data != NULL) {
				info->hne_datalen = ill->ill_name_length;
				bcopy(ill->ill_name, info->hne_data,
				    info->hne_datalen);
			} else {
				ip2dbg(("ill_phyint_reinit: could not attach "
				    "ill_name information for PLUMB nic event "
				    "of %s (ENOMEM)\n", ill->ill_name));
				kmem_free(info, sizeof (hook_nic_event_t));
			}
		} else
			ip2dbg(("ill_phyint_reinit: could not attach PLUMB nic "
			    "event information for %s (ENOMEM)\n",
			    ill->ill_name));

		ill->ill_nic_event_info = info;
	}

	RELEASE_ILL_LOCKS(ill, ill_other);
	mutex_exit(&phyi->phyint_lock);
}
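
/*
 * For illustration (not in the original source), the NE_PLUMB event
 * assembled above carries the following payload for a hypothetical
 * interface "hme0" with ifindex 2:
 *
 *	hne_nic		2		(phyint_ifindex)
 *	hne_lif		0		(event is for the ill, not a lif)
 *	hne_event	NE_PLUMB
 *	hne_family	ipv4 or ipv6	(hook family of the ill)
 *	hne_data	"hme0"		(hne_datalen = ill_name_length)
 *
 * The event is stored in ill_nic_event_info rather than delivered
 * directly; allocation failures simply drop the event (KM_NOSLEEP).
 */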

/*
 * Notify any downstream modules of the name of this interface.
 * An M_IOCTL is used even though we don't expect a successful reply.
 * Any reply message from the driver (presumably an M_IOCNAK) will
 * eventually get discarded somewhere upstream. The message format is
 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
 * to IP.
 */
static void
ip_ifname_notify(ill_t *ill, queue_t *q)
{
	mblk_t *mp1, *mp2;
	struct iocblk *iocp;
	struct lifreq *lifr;

	mp1 = mkiocb(SIOCSLIFNAME);
	if (mp1 == NULL)
		return;
	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
	if (mp2 == NULL) {
		freeb(mp1);
		return;
	}

	mp1->b_cont = mp2;
	iocp = (struct iocblk *)mp1->b_rptr;
	iocp->ioc_count = sizeof (struct lifreq);

	lifr = (struct lifreq *)mp2->b_rptr;
	mp2->b_wptr += sizeof (struct lifreq);
	bzero(lifr, sizeof (struct lifreq));

	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
	lifr->lifr_ppa = ill->ill_ppa;
	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));

	putnext(q, mp1);
}

static boolean_t ip_trash_timer_started = B_FALSE;

static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	int err;

	/* Set the obsolete NDD per-interface forwarding name. */
	err = ill_set_ndd_name(ill);
	if (err != 0) {
		cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
		    err);
	}

	/* Tell downstream modules where they are. */
	ip_ifname_notify(ill, q);

	/*
	 * ill_dl_phys returns EINPROGRESS in the usual case.
	 * Error cases are ENOMEM ...
	 */
	err = ill_dl_phys(ill, ipif, mp, q);

	/*
	 * If there is no IRE expiration timer running, get one started.
	 * igmp and mld timers will be triggered by the first multicast
	 * membership.
	 */
	if (!ip_trash_timer_started) {
		/*
		 * Acquire the lock and check again.
		 */
		mutex_enter(&ip_trash_timer_lock);
		if (!ip_trash_timer_started) {
			ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL,
			    MSEC_TO_TICK(ip_timer_interval));
			ip_trash_timer_started = B_TRUE;
		}
		mutex_exit(&ip_trash_timer_lock);
	}

	if (ill->ill_isv6) {
		mutex_enter(&mld_slowtimeout_lock);
		if (mld_slowtimeout_id == 0) {
			mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&mld_slowtimeout_lock);
	} else {
		mutex_enter(&igmp_slowtimeout_lock);
		if (igmp_slowtimeout_id == 0) {
			igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&igmp_slowtimeout_lock);
	}

	return (err);
}
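
/*
 * For illustration (not in the original source): given the interface name
 * "hme1", ill_get_ppa_ptr() returns a pointer to the "1" suffix, stoi()
 * parses it to the ppa 1, and ipif_set_values() below temporarily
 * null-terminates the name at that point so the device prefix "hme" can
 * be used on its own. When the caller passes ppa == UINT_MAX, the system
 * instead picks the next free ppa and writes both the full name and the
 * chosen ppa back for SIOCSLIFNAME to return to userland.
 */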

/*
 * Common routine for ppa and ifname setting. Should be called exclusive.
 *
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ill_pending_mp and the ioctl will complete in ip_rput.
 *
 * NOTE: If ppa is UINT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SLIFNAME, we pass these values back to userland.
 */
static int
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
{
	ill_t	*ill;
	ipif_t	*ipif;
	ipsq_t	*ipsq;
	char	*ppa_ptr;
	char	*old_ptr;
	char	old_char;
	int	error;

	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
	ASSERT(q->q_next != NULL);
	ASSERT(interf_name != NULL);

	ill = (ill_t *)q->q_ptr;

	ASSERT(ill->ill_name[0] == '\0');
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
	ASSERT(ill->ill_ppa == UINT_MAX);

	/* The ppa is sent down by ifconfig or is chosen */
	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
		return (EINVAL);
	}

	/*
	 * Make sure the ppa passed in is the same as the ppa in the name.
	 * This check is not made when ppa == UINT_MAX; in that case the ppa
	 * in the name could be anything. The system will choose a ppa and
	 * update new_ppa_ptr and interf_name to contain the chosen ppa.
	 */
	if (*new_ppa_ptr != UINT_MAX) {
		/* stoi changes the pointer */
		old_ptr = ppa_ptr;
		/*
		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
		 * (they don't have an externally visible ppa). We assign one
		 * here so that we can manage the interface. Note that in
		 * the past this value was always 0 for DLPI 1 drivers.
		 */
		if (*new_ppa_ptr == 0)
			*new_ppa_ptr = stoi(&old_ptr);
		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
			return (EINVAL);
	}
	/*
	 * Terminate the string before the ppa and
	 * save the character at that location.
	 */
	old_char = ppa_ptr[0];
	ppa_ptr[0] = '\0';

	ill->ill_ppa = *new_ppa_ptr;
	/*
	 * Finish as much work now as possible before calling ill_glist_insert
	 * which makes the ill globally visible and also merges it with the
	 * other protocol instance of this phyint. The remaining work is
	 * done after entering the ipsq which may happen sometime later.
	 * ill_set_ndd_name occurs after the ill has been made globally
	 * visible.
	 */
	ipif = ill->ill_ipif;

	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
	ipif_assign_seqid(ipif);

	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
		ill->ill_flags |= ILLF_IPV4;

	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);

	if (ill->ill_flags & ILLF_IPV6) {

		ill->ill_isv6 = B_TRUE;
		if (ill->ill_rq != NULL) {
			ill->ill_rq->q_qinfo = &rinit_ipv6;
			ill->ill_wq->q_qinfo = &winit_ipv6;
		}

		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
		ipif->ipif_v6src_addr = ipv6_all_zeros;
		ipif->ipif_v6subnet = ipv6_all_zeros;
		ipif->ipif_v6net_mask = ipv6_all_zeros;
		ipif->ipif_v6brd_addr = ipv6_all_zeros;
		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
		/*
		 * Point-to-point or non-multicast-capable
		 * interfaces won't do NUD unless explicitly
		 * configured to do so.
		 */
		if (ipif->ipif_flags & IPIF_POINTOPOINT ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			ill->ill_flags |= ILLF_NONUD;
		}
		/* Make sure the IPv4-specific flag is not set on an IPv6 if */
		if (ill->ill_flags & ILLF_NOARP) {
			/*
			 * Note: xresolv interfaces will eventually need
			 * NOARP set here as well, but that will require
			 * those external resolvers to have some
			 * knowledge of that flag and act appropriately.
			 * Not to be changed at present.
			 */
			ill->ill_flags &= ~ILLF_NOARP;
		}
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv6 forwarding policy.
		 */
		if (ipv6_forward != 0)
			ill->ill_flags |= ILLF_ROUTER;
	} else if (ill->ill_flags & ILLF_IPV4) {
		ill->ill_isv6 = B_FALSE;
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv4 forwarding policy.
		 */
		if (ip_g_forward != 0)
			ill->ill_flags |= ILLF_ROUTER;
	}

	ASSERT(ill->ill_phyint != NULL);

	/*
	 * The ipv6Ifindex and ipv6IfIcmpIfIndex assignments will
	 * be completed in ill_glist_insert -> ill_phyint_reinit.
	 */
	if (ill->ill_isv6) {
		/* allocate v6 mib */
		if (!ill_allocate_mibs(ill))
			return (ENOMEM);
	}

	/*
	 * Pick a default sap until we get the DL_INFO_ACK back from
	 * the driver.
	 */
	if (ill->ill_sap == 0) {
		if (ill->ill_isv6)
			ill->ill_sap = IP6_DL_SAP;
		else
			ill->ill_sap = IP_DL_SAP;
	}

	ill->ill_ifname_pending = 1;
	ill->ill_ifname_pending_err = 0;

	ill_refhold(ill);
	rw_enter(&ill_g_lock, RW_WRITER);
	if ((error = ill_glist_insert(ill, interf_name,
	    (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
		ill->ill_ppa = UINT_MAX;
		ill->ill_name[0] = '\0';
		/*
		 * Undo the null termination done above.
		 */
		ppa_ptr[0] = old_char;
		rw_exit(&ill_g_lock);
		ill_refrele(ill);
		return (error);
	}

	ASSERT(ill->ill_name_length <= LIFNAMSIZ);

	/*
	 * When we return, the buffer pointed to by interf_name should contain
	 * the same name as in ill_name.
	 * If a ppa was chosen by the system (ppa passed in was UINT_MAX),
	 * the buffer pointed to by new_ppa_ptr would not contain the right
	 * ppa, so copy the full name and update the ppa ptr.
	 * When the ppa passed in != UINT_MAX all values are correct; just
	 * undo the null termination. This saves a bcopy.
	 */
	if (*new_ppa_ptr == UINT_MAX) {
		bcopy(ill->ill_name, interf_name, ill->ill_name_length);
		*new_ppa_ptr = ill->ill_ppa;
	} else {
		/*
		 * Undo the null termination done above.
		 */
		ppa_ptr[0] = old_char;
	}

	/* Let SCTP know about this ILL */
	sctp_update_ill(ill, SCTP_ILL_INSERT);

	/* and also about the first ipif */
	sctp_update_ipif(ipif, SCTP_IPIF_INSERT);

	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP,
	    B_TRUE);

	rw_exit(&ill_g_lock);
	ill_refrele(ill);
	if (ipsq == NULL)
		return (EINPROGRESS);

	/*
	 * Need to set the ipsq_current_ipif now, if we have changed ipsq
	 * due to the phyint merge in ill_phyint_reinit.
	 */
	ASSERT(ipsq->ipsq_current_ipif == NULL ||
	    ipsq->ipsq_current_ipif == ipif);
	ipsq->ipsq_current_ipif = ipif;
	ipsq->ipsq_last_cmd = SIOCSLIFNAME;
	error = ipif_set_values_tail(ill, ipif, mp, q);
	ipsq_exit(ipsq, B_TRUE, B_TRUE);
	if (error != 0 && error != EINPROGRESS) {
		/*
		 * Restore previous values.
		 */
		ill->ill_isv6 = B_FALSE;
	}
	return (error);
}

extern void (*ip_cleanup_func)(void);

void
ipif_init(void)
{
	hrtime_t hrt;
	int i;

	/*
	 * Can't call drv_getparm here as it is too early in the boot.
	 * As we use ipif_src_random just for picking a different
	 * source address every time, this need not be really random.
	 */
	hrt = gethrtime();
	ipif_src_random = ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff);

	for (i = 0; i < MAX_G_HEADS; i++) {
		ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ill_g_heads[i];
		ill_g_heads[i].ill_g_list_tail = (ill_if_t *)&ill_g_heads[i];
	}

	avl_create(&phyint_g_list.phyint_list_avl_by_index,
	    ill_phyint_compare_index,
	    sizeof (phyint_t),
	    offsetof(struct phyint, phyint_avl_by_index));
	avl_create(&phyint_g_list.phyint_list_avl_by_name,
	    ill_phyint_compare_name,
	    sizeof (phyint_t),
	    offsetof(struct phyint, phyint_avl_by_name));

	ip_cleanup_func = ip_thread_exit;
}

/*
 * This is called by ip_rt_add when the src_addr value is other than zero.
 * src_addr signifies the source address of the incoming packet. For a
 * reverse tunnel route we need to create a source-address-based routing
 * table. This routine creates ip_mrtun_table if it's empty and then
 * it adds the route entry hashed by source address. It verifies that
 * the outgoing interface is always a non-resolver interface (tunnel).
 */
int
ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg,
    ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func)
{
	ire_t	*ire;
	ire_t	*save_ire;
	ipif_t	*ipif;
	ill_t	*in_ill = NULL;
	ill_t	*out_ill;
	queue_t	*stq;
	mblk_t	*dlureq_mp;
	int	error;

	if (ire_arg != NULL)
		*ire_arg = NULL;
	ASSERT(in_src_addr != INADDR_ANY);

	ipif = ipif_arg;
	if (ipif != NULL) {
		out_ill = ipif->ipif_ill;
	} else {
		ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n"));
		return (EINVAL);
	}

	if (src_ipif == NULL) {
		ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n"));
		return (EINVAL);
	}
	in_ill = src_ipif->ipif_ill;

	/*
	 * Check for duplicates. We don't need to
	 * match out_ill, because the uniqueness of
	 * a route is only dependent on src_addr and
	 * in_ill.
	 */
	ire = ire_mrtun_lookup(in_src_addr, in_ill);
	if (ire != NULL) {
		ire_refrele(ire);
		return (EEXIST);
	}
	if (ipif->ipif_net_type != IRE_IF_NORESOLVER) {
		ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n",
		    ipif->ipif_net_type));
		return (EINVAL);
	}

	stq = ipif->ipif_wq;
	ASSERT(stq != NULL);

	/*
	 * The outgoing interface must be a non-resolver
	 * interface.
	 */
	dlureq_mp = ill_dlur_gen(NULL,
	    out_ill->ill_phys_addr_length, out_ill->ill_sap,
	    out_ill->ill_sap_length);

	if (dlureq_mp == NULL) {
		ip1dbg(("ip_newroute: dlureq_mp NULL\n"));
		return (ENOMEM);
	}

	/* Create the IRE. */

	ire = ire_create(
	    NULL,			/* Zero dst addr */
	    NULL,			/* Zero mask */
	    NULL,			/* Zero gateway addr */
	    NULL,			/* Zero ipif_src addr */
	    (uint8_t *)&in_src_addr,	/* in_src_addr */
	    &ipif->ipif_mtu,
	    NULL,
	    NULL,			/* rfq */
	    stq,
	    IRE_MIPRTUN,
	    dlureq_mp,
	    ipif,
	    in_ill,
	    0,
	    0,
	    0,
	    flags,
	    &ire_uinfo_null,
	    NULL,
	    NULL);

	if (ire == NULL) {
		freeb(dlureq_mp);
		return (ENOMEM);
	}
	ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n",
	    ire->ire_type));
	save_ire = ire;
	ASSERT(save_ire != NULL);
	error = ire_add_mrtun(&ire, q, mp, func);
	/*
	 * If ire_add_mrtun() failed, the ire passed in was freed,
	 * so there is no need to do so here.
	 */
	if (error != 0) {
		return (error);
	}

	/* Duplicate check */
	if (ire != save_ire) {
		/* route already exists by now */
		ire_refrele(ire);
		return (EEXIST);
	}

	if (ire_arg != NULL) {
		/*
		 * Store the ire that was just added. The caller,
		 * ip_rts_request, is responsible for doing ire_refrele()
		 * on it.
		 */
		*ire_arg = ire;
	} else {
		ire_refrele(ire);	/* held in ire_add_mrtun */
	}

	return (0);
}

/*
 * It is called by ip_rt_delete() only when mipagent requests deletion of
 * a reverse tunnel route that was added by ip_mrtun_rt_add() before.
 */
int
ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif)
{
	ire_t *ire = NULL;

	if (in_src_addr == INADDR_ANY)
		return (EINVAL);
	if (src_ipif == NULL)
		return (EINVAL);

	/* search if this route exists in the ip_mrtun_table */
	ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill);
	if (ire == NULL) {
		ip2dbg(("ip_mrtun_rt_delete: ire not found\n"));
		return (ESRCH);
	}
	ire_delete(ire);
	ire_refrele(ire);
	return (0);
}
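
/*
 * A usage sketch (not from the original source; names and the RTF_UP flag
 * are hypothetical): a Mobile-IP agent adds a reverse tunnel route keyed
 * on the mobile node's source address and later removes it:
 *
 *	error = ip_mrtun_rt_add(mn_src_addr, RTF_UP, tun_ipif,
 *	    fa_ipif, NULL, q, mp, func);
 *	...
 *	error = ip_mrtun_rt_delete(mn_src_addr, fa_ipif);
 *
 * Since uniqueness depends only on (src_addr, incoming ill), the delete
 * needs just the source address and an ipif on the incoming ill.
 */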

/*
 * Lookup the ipif corresponding to the onlink destination address. For
 * point-to-point interfaces, it matches with the remote endpoint destination
 * address. For point-to-multipoint interfaces it only tries to match the
 * destination with the interface's subnet address. The longest, most specific
 * match is found to take care of such rare network configurations as:
 *	le0: 129.146.1.1/16
 *	le1: 129.146.2.2/24
 * It is used only by SO_DONTROUTE at the moment.
 */
ipif_t *
ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid)
{
	ipif_t	*ipif, *best_ipif;
	ill_t	*ill;
	ill_walk_context_t ctx;

	ASSERT(zoneid != ALL_ZONES);
	best_ipif = NULL;

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (!IPIF_CAN_LOOKUP(ipif))
				continue;
			if (ipif->ipif_zoneid != zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			/*
			 * Point-to-point case. Look for exact match with
			 * destination address.
			 */
			if (ipif->ipif_flags & IPIF_POINTOPOINT) {
				if (ipif->ipif_pp_dst_addr == addr) {
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ill_g_lock);
					if (best_ipif != NULL)
						ipif_refrele(best_ipif);
					return (ipif);
				}
			} else if (ipif->ipif_subnet == (addr &
			    ipif->ipif_net_mask)) {
				/*
				 * Point-to-multipoint case. Loop through to
				 * find the most specific match. If there are
				 * multiple best-match ipifs then prefer ipifs
				 * that are UP. If there is only one best-match
				 * ipif and it is DOWN we must still return it.
				 */
				if ((best_ipif == NULL) ||
				    (ipif->ipif_net_mask >
				    best_ipif->ipif_net_mask) ||
				    ((ipif->ipif_net_mask ==
				    best_ipif->ipif_net_mask) &&
				    ((ipif->ipif_flags & IPIF_UP) &&
				    (!(best_ipif->ipif_flags & IPIF_UP))))) {
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ill_g_lock);
					if (best_ipif != NULL)
						ipif_refrele(best_ipif);
					best_ipif = ipif;
					rw_enter(&ill_g_lock, RW_READER);
					mutex_enter(&ill->ill_lock);
				}
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ill_g_lock);
	return (best_ipif);
}
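
/*
 * Worked example (not in the original source), using the le0/le1
 * configuration from the comment above: a lookup for 129.146.2.5 matches
 * both subnets, since (129.146.2.5 & 255.255.0.0) == 129.146.0.0 and
 * (129.146.2.5 & 255.255.255.0) == 129.146.2.0, but le1 wins because its
 * /24 netmask is numerically larger (more specific) than le0's /16.
 */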

/*
 * Save enough information so that we can recreate the IRE if
 * the interface goes down and then up.
 */
static void
ipif_save_ire(ipif_t *ipif, ire_t *ire)
{
	mblk_t	*save_mp;

	save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
	if (save_mp != NULL) {
		ifrt_t	*ifrt;

		save_mp->b_wptr += sizeof (ifrt_t);
		ifrt = (ifrt_t *)save_mp->b_rptr;
		bzero(ifrt, sizeof (ifrt_t));
		ifrt->ifrt_type = ire->ire_type;
		ifrt->ifrt_addr = ire->ire_addr;
		ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
		ifrt->ifrt_src_addr = ire->ire_src_addr;
		ifrt->ifrt_mask = ire->ire_mask;
		ifrt->ifrt_flags = ire->ire_flags;
		ifrt->ifrt_max_frag = ire->ire_max_frag;
		mutex_enter(&ipif->ipif_saved_ire_lock);
		save_mp->b_cont = ipif->ipif_saved_ire_mp;
		ipif->ipif_saved_ire_mp = save_mp;
		ipif->ipif_saved_ire_cnt++;
		mutex_exit(&ipif->ipif_saved_ire_lock);
	}
}

static void
ipif_remove_ire(ipif_t *ipif, ire_t *ire)
{
	mblk_t	**mpp;
	mblk_t	*mp;
	ifrt_t	*ifrt;

	/* Remove from ipif_saved_ire_mp list if it is there */
	mutex_enter(&ipif->ipif_saved_ire_lock);
	for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
	    mpp = &(*mpp)->b_cont) {
		/*
		 * On a given ipif, the triple of address, gateway and
		 * mask is unique for each saved IRE (in the case of
		 * ordinary interface routes, the gateway address is
		 * all-zeroes).
		 */
		mp = *mpp;
		ifrt = (ifrt_t *)mp->b_rptr;
		if (ifrt->ifrt_addr == ire->ire_addr &&
		    ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
		    ifrt->ifrt_mask == ire->ire_mask) {
			*mpp = mp->b_cont;
			ipif->ipif_saved_ire_cnt--;
			freeb(mp);
			break;
		}
	}
	mutex_exit(&ipif->ipif_saved_ire_lock);
}
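
/*
 * For illustration (not in the original source), the saved-IRE list kept
 * on each ipif is a simple b_cont-linked chain of mblks, each carrying
 * one ifrt_t, with the most recently saved route at the head:
 *
 *	ipif_saved_ire_mp -> [ifrt_t] -b_cont-> [ifrt_t] -b_cont-> NULL
 *
 * ipif_save_ire() pushes onto the head and ipif_remove_ire() unlinks the
 * first entry whose (addr, gateway, mask) triple matches, which is why
 * both update ipif_saved_ire_cnt under ipif_saved_ire_lock.
 */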

/*
 * IP multirouting broadcast routes handling.
 * Append CGTP broadcast IREs to the regular ones created
 * at ifconfig time.
 */
static void
ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst)
{
	ire_t *ire_prim;

	ASSERT(ire != NULL);
	ASSERT(ire_dst != NULL);

	ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
	    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	if (ire_prim != NULL) {
		/*
		 * We are in the special case of broadcasts for
		 * CGTP. We add an IRE_BROADCAST that holds
		 * the RTF_MULTIRT flag, the destination
		 * address of ire_dst and the low level
		 * info of ire_prim. In other words, CGTP
		 * broadcast is added to the redundant ipif.
		 */
		ipif_t *ipif_prim;
		ire_t  *bcast_ire;

		ipif_prim = ire_prim->ire_ipif;

		ip2dbg(("ip_cgtp_filter_bcast_add: "
		    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
		    (void *)ire_dst, (void *)ire_prim,
		    (void *)ipif_prim));

		bcast_ire = ire_create(
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ip_g_all_ones,
		    (uchar_t *)&ire_dst->ire_src_addr,
		    (uchar_t *)&ire->ire_gateway_addr,
		    NULL,
		    &ipif_prim->ipif_mtu,
		    NULL,
		    ipif_prim->ipif_rq,
		    ipif_prim->ipif_wq,
		    IRE_BROADCAST,
		    ipif_prim->ipif_bcast_mp,
		    ipif_prim,
		    NULL,
		    0,
		    0,
		    0,
		    ire->ire_flags,
		    &ire_uinfo_null,
		    NULL,
		    NULL);

		if (bcast_ire != NULL) {

			if (ire_add(&bcast_ire, NULL, NULL, NULL,
			    B_FALSE) == 0) {
				ip2dbg(("ip_cgtp_filter_bcast_add: "
				    "added bcast_ire %p\n",
				    (void *)bcast_ire));

				ipif_save_ire(bcast_ire->ire_ipif,
				    bcast_ire);
				ire_refrele(bcast_ire);
			}
		}
		ire_refrele(ire_prim);
	}
}

/*
 * IP multirouting broadcast routes handling.
 * Remove the broadcast ire.
 */
static void
ip_cgtp_bcast_delete(ire_t *ire)
{
	ire_t *ire_dst;

	ASSERT(ire != NULL);
	ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST,
	    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
	if (ire_dst != NULL) {
		ire_t *ire_prim;

		ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
		    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE);
		if (ire_prim != NULL) {
			ipif_t *ipif_prim;
			ire_t  *bcast_ire;

			ipif_prim = ire_prim->ire_ipif;

			ip2dbg(("ip_cgtp_filter_bcast_delete: "
			    "ire_dst %p, ire_prim %p, ipif_prim %p\n",
			    (void *)ire_dst, (void *)ire_prim,
			    (void *)ipif_prim));

			bcast_ire = ire_ctable_lookup(ire->ire_addr,
			    ire->ire_gateway_addr,
			    IRE_BROADCAST,
			    ipif_prim, ALL_ZONES,
			    NULL,
			    MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF |
			    MATCH_IRE_MASK);

			if (bcast_ire != NULL) {
				ip2dbg(("ip_cgtp_filter_bcast_delete: "
				    "looked up bcast_ire %p\n",
				    (void *)bcast_ire));
				ipif_remove_ire(bcast_ire->ire_ipif,
				    bcast_ire);
				ire_delete(bcast_ire);
			}
			ire_refrele(ire_prim);
		}
		ire_refrele(ire_dst);
	}
}

/*
 * IPsec hardware acceleration capabilities related functions.
 */

/*
 * Free a per-ill IPsec capabilities structure.
 */
static void
ill_ipsec_capab_free(ill_ipsec_capab_t *capab)
{
	if (capab->auth_hw_algs != NULL)
		kmem_free(capab->auth_hw_algs, capab->algs_size);
	if (capab->encr_hw_algs != NULL)
		kmem_free(capab->encr_hw_algs, capab->algs_size);
	if (capab->encr_algparm != NULL)
		kmem_free(capab->encr_algparm, capab->encr_algparm_size);
	kmem_free(capab, sizeof (ill_ipsec_capab_t));
}
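
/*
 * Sizing sketch for the structure allocated below (not in the original
 * source; the concrete numbers assume ipsec_capab_elem_t is a 32-bit word
 * and MAX_IPSEC_ALGS is 256):
 *
 *	nelems    = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t)
 *	          = 256 / 32 = 8 elements
 *	algs_size = 8 * sizeof (ipsec_capab_elem_t) = 32 bytes
 *
 * i.e. one bit per algorithm id, so testing support for an algorithm is a
 * constant-time bit probe (see IPSEC_ALG_IS_ENABLED in the match code
 * below).
 */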

/*
 * Allocate a new per-ill IPsec capabilities structure. This structure
 * is specific to an IPsec protocol (AH or ESP). It is implemented as
 * an array which specifies, for each algorithm, whether this algorithm
 * is supported by the ill or not.
 */
static ill_ipsec_capab_t *
ill_ipsec_capab_alloc(void)
{
	ill_ipsec_capab_t *capab;
	uint_t nelems;

	capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP);
	if (capab == NULL)
		return (NULL);

	/* we need one bit per algorithm */
	nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t);
	capab->algs_size = nelems * sizeof (ipsec_capab_elem_t);

	/* allocate memory to store algorithm flags */
	capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
	if (capab->encr_hw_algs == NULL)
		goto nomem;
	capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
	if (capab->auth_hw_algs == NULL)
		goto nomem;
	/*
	 * Leave encr_algparm NULL for now since we won't need it half
	 * the time.
	 */
	return (capab);

nomem:
	ill_ipsec_capab_free(capab);
	return (NULL);
}

/*
 * Resize the capability array. Since we're exclusive, this is OK.
 */
static boolean_t
ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid)
{
	ipsec_capab_algparm_t *nalp, *oalp;
	uint32_t olen, nlen;

	oalp = capab->encr_algparm;
	olen = capab->encr_algparm_size;

	if (oalp != NULL) {
		if (algid < capab->encr_algparm_end)
			return (B_TRUE);
	}

	nlen = (algid + 1) * sizeof (*nalp);
	nalp = kmem_zalloc(nlen, KM_NOSLEEP);
	if (nalp == NULL)
		return (B_FALSE);

	if (oalp != NULL) {
		bcopy(oalp, nalp, olen);
		kmem_free(oalp, olen);
	}
	capab->encr_algparm = nalp;
	capab->encr_algparm_size = nlen;
	capab->encr_algparm_end = algid + 1;

	return (B_TRUE);
}

/*
 * Compare the capabilities of the specified ill with the protocol
 * and algorithms specified by the SA passed as argument.
 * If they match, returns B_TRUE, B_FALSE if they do not match.
 *
 * The ill can be passed as a pointer to it, or by specifying its index
 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments).
 *
 * Called by ipsec_out_is_accelerated() to decide whether an outbound
 * packet is eligible for hardware acceleration, and by
 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down
 * to a particular ill.
 */
boolean_t
ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6,
    ipsa_t *sa)
{
	boolean_t sa_isv6;
	uint_t algid;
	struct ill_ipsec_capab_s *cpp;
	boolean_t need_refrele = B_FALSE;

	if (ill == NULL) {
		ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL,
		    NULL, NULL, NULL);
		if (ill == NULL) {
			ip0dbg(("ipsec_capab_match: ill doesn't exist\n"));
			return (B_FALSE);
		}
		need_refrele = B_TRUE;
	}

	/*
	 * Use the address length specified by the SA to determine
	 * if it corresponds to an IPv6 address, and fail the matching
	 * if the isv6 flag passed as argument does not match.
	 * Note: this check is used for SADB capability checking before
	 * sending SA information to an ill.
	 */
	sa_isv6 = (sa->ipsa_addrfam == AF_INET6);
	if (sa_isv6 != ill_isv6)
		/* protocol mismatch */
		goto done;

	/*
	 * Check if the ill supports the protocol, algorithm(s) and
	 * key size(s) specified by the SA, and get the pointers to
	 * the algorithms supported by the ill.
	 */
	switch (sa->ipsa_type) {

	case SADB_SATYPE_ESP:
		if (!(ill->ill_capabilities & ILL_CAPAB_ESP))
			/* ill does not support ESP acceleration */
			goto done;
		cpp = ill->ill_ipsec_capab_esp;
		algid = sa->ipsa_auth_alg;
		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs))
			goto done;
		algid = sa->ipsa_encr_alg;
		if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs))
			goto done;
		if (algid < cpp->encr_algparm_end) {
			ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid];
			if (sa->ipsa_encrkeybits < alp->minkeylen)
				goto done;
			if (sa->ipsa_encrkeybits > alp->maxkeylen)
				goto done;
		}
		break;

	case SADB_SATYPE_AH:
		if (!(ill->ill_capabilities & ILL_CAPAB_AH))
			/* ill does not support AH acceleration */
			goto done;
		if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg,
		    ill->ill_ipsec_capab_ah->auth_hw_algs))
			goto done;
		break;
	}

	if (need_refrele)
		ill_refrele(ill);
	return (B_TRUE);
done:
	if (need_refrele)
		ill_refrele(ill);
	return (B_FALSE);
}
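
/*
 * Usage sketch (not from the original source): callers either hold an
 * ill pointer already, or let the routine look one up by index:
 *
 *	if (ipsec_capab_match(ill, 0, ill->ill_isv6, sa))
 *		... the SA can be accelerated on this ill ...
 *
 *	if (ipsec_capab_match(NULL, ifindex, isv6, sa))
 *		... same check, with the temporary ill lookup and
 *		    ill_refrele() handled internally via need_refrele ...
 */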

/*
 * Add a new ill to the list of IPsec capable ills.
 * Called from ill_capability_ipsec_ack() when an ACK was received
 * indicating that IPsec hardware processing was enabled for an ill.
 *
 * ill must point to the ill for which acceleration was enabled.
 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP.
 */
static void
ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync)
{
	ipsec_capab_ill_t **ills, *cur_ill, *new_ill;
	uint_t sa_type;
	uint_t ipproto;

	ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) ||
	    (dl_cap == DL_CAPAB_IPSEC_ESP));

	switch (dl_cap) {
	case DL_CAPAB_IPSEC_AH:
		sa_type = SADB_SATYPE_AH;
		ills = &ipsec_capab_ills_ah;
		ipproto = IPPROTO_AH;
		break;
	case DL_CAPAB_IPSEC_ESP:
		sa_type = SADB_SATYPE_ESP;
		ills = &ipsec_capab_ills_esp;
		ipproto = IPPROTO_ESP;
		break;
	}

	rw_enter(&ipsec_capab_ills_lock, RW_WRITER);

	/*
	 * Add the ill index to the list of hardware accelerators. If
	 * it is already in the list, do nothing.
	 */
	for (cur_ill = *ills; cur_ill != NULL &&
	    (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex ||
	    cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next)
		;

	if (cur_ill == NULL) {
		/* if this is a new entry for this ill */
		new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP);
		if (new_ill == NULL) {
			rw_exit(&ipsec_capab_ills_lock);
			return;
		}

		new_ill->ill_index = ill->ill_phyint->phyint_ifindex;
		new_ill->ill_isv6 = ill->ill_isv6;
		new_ill->next = *ills;
		*ills = new_ill;
	} else if (!sadb_resync) {
		/* not resync'ing SADB and an entry exists for this ill */
		rw_exit(&ipsec_capab_ills_lock);
		return;
	}

	rw_exit(&ipsec_capab_ills_lock);

	if (ipcl_proto_fanout_v6[ipproto].connf_head != NULL)
		/*
		 * IPsec module for protocol loaded, initiate dump
		 * of the SADB to this ill.
		 */
		sadb_ill_download(ill, sa_type);
}

/*
 * Remove an ill from the list of IPsec capable ills.
 */
static void
ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap)
{
	ipsec_capab_ill_t **ills, *cur_ill, *prev_ill;

	ASSERT(dl_cap == DL_CAPAB_IPSEC_AH ||
	    dl_cap == DL_CAPAB_IPSEC_ESP);

	ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipsec_capab_ills_ah :
	    &ipsec_capab_ills_esp;

	rw_enter(&ipsec_capab_ills_lock, RW_WRITER);

	prev_ill = NULL;
	for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index !=
	    ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 !=
	    ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next)
		;
	if (cur_ill == NULL) {
		/* entry not found */
		rw_exit(&ipsec_capab_ills_lock);
		return;
	}
	if (prev_ill == NULL) {
		/*
		 * Entry at front of list; unlink just this entry rather
		 * than truncating the whole list.
		 */
		*ills = cur_ill->next;
	} else {
		prev_ill->next = cur_ill->next;
	}
	kmem_free(cur_ill, sizeof (ipsec_capab_ill_t));
	rw_exit(&ipsec_capab_ills_lock);
}

/*
 * Handling of DL_CONTROL_REQ messages that must be sent down to
 * an ill while having exclusive access.
 */
/* ARGSUSED */
static void
ill_ipsec_capab_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
{
	ill_t *ill = (ill_t *)q->q_ptr;

	ill_dlpi_send(ill, mp);
}

/*
 * Called by SADB to send a DL_CONTROL_REQ message to every ill
 * supporting the specified IPsec protocol acceleration.
 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP.
 * We free the mblk and, if sa is non-null, release the held reference.
 */
void
ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa)
{
	ipsec_capab_ill_t *ici, *cur_ici;
	ill_t *ill;
	mblk_t *nmp, *mp_ship_list = NULL, *next_mp;

	ici = (sa_type == SADB_SATYPE_AH) ? ipsec_capab_ills_ah :
	    ipsec_capab_ills_esp;

	rw_enter(&ipsec_capab_ills_lock, RW_READER);

	for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) {
		ill = ill_lookup_on_ifindex(cur_ici->ill_index,
		    cur_ici->ill_isv6, NULL, NULL, NULL, NULL);

		/*
		 * Handle the case where the ill goes away while the SADB is
		 * attempting to send messages. If it's going away, it's
		 * nuking its shadow SADB, so we don't care.
		 */
		if (ill == NULL)
			continue;

		if (sa != NULL) {
			/*
			 * Make sure capabilities match before
			 * sending SA to ill.
			 */
			if (!ipsec_capab_match(ill, cur_ici->ill_index,
			    cur_ici->ill_isv6, sa)) {
				ill_refrele(ill);
				continue;
			}

			mutex_enter(&sa->ipsa_lock);
			sa->ipsa_flags |= IPSA_F_HW;
			mutex_exit(&sa->ipsa_lock);
		}

		/*
		 * Copy the template message, and add it to the front
		 * of the mblk ship list. We want to avoid holding
		 * the ipsec_capab_ills_lock while sending the
		 * message to the ills.
		 *
		 * The b_next and b_prev are temporarily used
		 * to build a list of mblks to be sent down, and to
		 * save the ill to which they must be sent.
		 */
		nmp = copymsg(mp);
		if (nmp == NULL) {
			ill_refrele(ill);
			continue;
		}
		ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL);
		nmp->b_next = mp_ship_list;
		mp_ship_list = nmp;
		nmp->b_prev = (mblk_t *)ill;
	}

	rw_exit(&ipsec_capab_ills_lock);

	nmp = mp_ship_list;
	while (nmp != NULL) {
		/* restore the mblk to a sane state */
		next_mp = nmp->b_next;
		nmp->b_next = NULL;
		ill = (ill_t *)nmp->b_prev;
		nmp->b_prev = NULL;

		/*
		 * Ship the mblk to the ill; this must be done exclusive.
		 * Keep the reference to the ill as qwriter_ip() does an
		 * ill_refrele().
		 */
		(void) qwriter_ip(NULL, ill, ill->ill_wq, nmp,
		    ill_ipsec_capab_send_writer, NEW_OP, B_TRUE);

		nmp = next_mp;
	}

	if (sa != NULL)
		IPSA_REFRELE(sa);
	freemsg(mp);
}

/*
 * Derive an interface id from the link layer address.
 * Knows about IEEE 802 and IEEE EUI-64 mappings.
 */
static boolean_t
ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char	*addr;

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	/* Form EUI-64 like address */
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy((char *)phys_addr, addr, 3);
	addr[0] ^= 0x2;		/* Toggle Universal/Local bit */
	addr[3] = (char)0xff;
	addr[4] = (char)0xfe;
	bcopy((char *)phys_addr + 3, addr + 5, 3);
	return (B_TRUE);
}

/* ARGSUSED */
static boolean_t
ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	return (B_FALSE);
}

/* ARGSUSED */
static boolean_t
ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
	    0x00, 0x00, 0x00};

	/*
	 * Extract the low order 32 bits from the IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * second byte.
	 */
	*hw_start = 2;
	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = 0;
	v6_extract_mask->s6_addr32[2] = 0;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
	return (B_TRUE);
}
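
/*
 * Worked example for ip_ether_v6intfid() above (not in the original
 * source; the MAC address is hypothetical): for the Ethernet address
 * 00:0a:0b:0c:0d:0e, the derived interface id is
 *
 *	02:0a:0b:ff:fe:0c:0d:0e
 *
 * i.e. the first three octets with the Universal/Local bit toggled
 * (00 -> 02), then ff:fe, then the last three octets. Combined with a
 * link-local prefix this yields fe80::20a:bff:fe0c:d0e.
 */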

/*
 * Indicate by return value whether multicast is supported. If not,
 * this code should not touch/change any parameters.
 */
/* ARGSUSED */
static boolean_t
ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
	    0x00, 0x00, 0x00 };

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	*extract_mask = htonl(0x007fffff);
	*hw_start = 2;
	bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
	return (B_TRUE);
}

/*
 * Derive the IPoIB interface id from the link layer address.
 */
static boolean_t
ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char	*addr;

	if (phys_length != 20)
		return (B_FALSE);
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy(phys_addr + 12, addr, 8);
	/*
	 * In the IBA 1.1 timeframe, some vendors erroneously set the u/l bit
	 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
	 * rules. In these cases, the IBA considers these GUIDs to be in
	 * "Modified EUI-64" format, and thus toggling the u/l bit is not
	 * required; vendors are required not to assign global EUI-64's
	 * that differ only in u/l bit values, thus guaranteeing uniqueness
	 * of the interface identifier. Whether the GUID is in modified
	 * or proper EUI-64 format, the ipv6 identifier must have the u/l
	 * bit set to 1.
	 */
	addr[0] |= 2;			/* Set Universal/Local bit to 1 */
	return (B_TRUE);
}

/*
 * Note on mapping from multicast IP addresses to IPoIB multicast link
 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
 * The format of an IPoIB multicast address is:
 *
 *	    4 byte QPN      Scope Sign.  Pkey
 *	+--------------------------------------------+
 *	| 00FFFFFF | FF | 1X | X01B |  Pkey | GroupID |
 *	+--------------------------------------------+
 *
 * The Scope and Pkey components are properties of the IBA port and
 * network interface. They can be ascertained from the broadcast address.
 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
 */
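
/*
 * Worked example (not in the original source; the group address is
 * hypothetical): ip_ib_v4mapinfo() below maps the IPv4 group 224.10.11.12
 * (0xe00a0b0c) by masking with 0x0fffffff, leaving 0x000a0b0c, and OR-ing
 * those 28 bits into bytes 16..19 of the 20-byte base address, so the
 * GroupID portion of the link address becomes 00:0a:0b:0c. The scope and
 * Pkey bytes are copied from the broadcast address, and the 401B
 * signature marks the mapping as IPv4.
 */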

static boolean_t
ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Base IPoIB IPv6 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	/*
	 * Extract the low order 80 bits from the IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixth byte.
	 */
	*hw_start = 6;
	bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);

	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
	v6_extract_mask->s6_addr32[2] = 0xffffffffU;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	return (B_TRUE);
}

static boolean_t
ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Base IPoIB IPv4 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
		return (B_FALSE);

	/*
	 * Extract the low order 28 bits from the IPv4 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixteenth byte.
	 */
	*extract_mask = htonl(0x0fffffff);
	*hw_start = 16;
	bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);
	return (B_TRUE);
}

/*
 * Returns B_TRUE if an ipif is present in the given zone, matching some flags
 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there.
 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with
 * the link-local address is preferred.
 */
boolean_t
ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ipif_t	*ipif;
	ipif_t	*maybe_ipif = NULL;

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		if (ipifp != NULL)
			*ipifp = NULL;
		return (B_FALSE);
	}
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;
		if ((ipif->ipif_flags & flags) != flags)
			continue;

		if (ipifp == NULL) {
			mutex_exit(&ill->ill_lock);
			ASSERT(maybe_ipif == NULL);
			return (B_TRUE);
		}
		if (!ill->ill_isv6 ||
		    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) {
			ipif_refhold_locked(ipif);
			mutex_exit(&ill->ill_lock);
			*ipifp = ipif;
			return (B_TRUE);
		}
		if (maybe_ipif == NULL)
			maybe_ipif = ipif;
	}
	if (ipifp != NULL) {
		if (maybe_ipif != NULL)
			ipif_refhold_locked(maybe_ipif);
		*ipifp = maybe_ipif;
	}
	mutex_exit(&ill->ill_lock);
	return (maybe_ipif != NULL);
}
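
/*
 * Usage sketch (not from the original source): check whether the zone
 * has an UP ipif on this ill, optionally taking a hold on it:
 *
 *	ipif_t *ipif;
 *
 *	if (ipif_lookup_zoneid(ill, zoneid, IPIF_UP, &ipif)) {
 *		... use ipif ...
 *		ipif_refrele(ipif);
 *	}
 *
 * Passing a NULL ipifp turns this into a pure existence test with no
 * reference management.
 */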

/*
 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
 */
boolean_t
ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ill_t *illg;

	/*
	 * We look at the passed-in ill first without grabbing ill_g_lock.
	 */
	if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
		return (B_TRUE);
	}
	rw_enter(&ill_g_lock, RW_READER);
	if (ill->ill_group == NULL) {
		/* ill not in a group */
		rw_exit(&ill_g_lock);
		return (B_FALSE);
	}

	/*
	 * There's no ipif in the zone on ill, however ill is part of an IPMP
	 * group. We need to look for an ipif in the zone on all the ills in
	 * the group.
	 */
	illg = ill->ill_group->illgrp_ill;
	do {
		/*
		 * We don't call ipif_lookup_zoneid() on ill as we already know
		 * that it's not there.
		 */
		if (illg != ill &&
		    ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
			break;
		}
	} while ((illg = illg->ill_group_next) != NULL);
	rw_exit(&ill_g_lock);
	return (illg != NULL);
}

/*
 * Check if this ill is only being used to send ICMP probes for IPMP.
 */
boolean_t
ill_is_probeonly(ill_t *ill)
{
	/*
	 * Check if the interface is FAILED or INACTIVE.
	 */
	if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
 * If a pointer to an ipif_t is returned then the caller will need to do
 * an ipif_refrele() (the hold on the ill itself is released internally).
 */
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6)
{
	ipif_t *ipif;
	ill_t *ill;

	ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL);

	if (ill == NULL)
		return (NULL);

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (lifidx == ipif->ipif_id) {
			ipif_refhold_locked(ipif);
			break;
		}
	}

	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}
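
/*
 * Usage sketch (not from the original source): resolve, e.g., logical
 * interface 1 on the ill with ifindex 2 (hypothetical values):
 *
 *	ipif_t *ipif;
 *
 *	if ((ipif = ipif_getby_indexes(2, 1, B_FALSE)) != NULL) {
 *		... use ipif ...
 *		ipif_refrele(ipif);
 *	}
 */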