1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains the interface control functions for IP. 
31 */ 32 33 #include <sys/types.h> 34 #include <sys/stream.h> 35 #include <sys/dlpi.h> 36 #include <sys/stropts.h> 37 #include <sys/strsun.h> 38 #include <sys/sysmacros.h> 39 #include <sys/strlog.h> 40 #include <sys/ddi.h> 41 #include <sys/sunddi.h> 42 #include <sys/cmn_err.h> 43 #include <sys/kstat.h> 44 #include <sys/debug.h> 45 #include <sys/zone.h> 46 47 #include <sys/kmem.h> 48 #include <sys/systm.h> 49 #include <sys/param.h> 50 #include <sys/socket.h> 51 #include <sys/isa_defs.h> 52 #include <net/if.h> 53 #include <net/if_arp.h> 54 #include <net/if_types.h> 55 #include <net/if_dl.h> 56 #include <net/route.h> 57 #include <sys/sockio.h> 58 #include <netinet/in.h> 59 #include <netinet/ip6.h> 60 #include <netinet/icmp6.h> 61 #include <netinet/igmp_var.h> 62 #include <sys/strsun.h> 63 #include <sys/policy.h> 64 #include <sys/ethernet.h> 65 66 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 67 #include <inet/mi.h> 68 #include <inet/nd.h> 69 #include <inet/arp.h> 70 #include <inet/mib2.h> 71 #include <inet/ip.h> 72 #include <inet/ip6.h> 73 #include <inet/ip6_asp.h> 74 #include <inet/tcp.h> 75 #include <inet/ip_multi.h> 76 #include <inet/ip_ire.h> 77 #include <inet/ip_ftable.h> 78 #include <inet/ip_rts.h> 79 #include <inet/ip_ndp.h> 80 #include <inet/ip_if.h> 81 #include <inet/ip_impl.h> 82 #include <inet/tun.h> 83 #include <inet/sctp_ip.h> 84 85 #include <net/pfkeyv2.h> 86 #include <inet/ipsec_info.h> 87 #include <inet/sadb.h> 88 #include <inet/ipsec_impl.h> 89 #include <sys/iphada.h> 90 91 92 #include <netinet/igmp.h> 93 #include <inet/ip_listutils.h> 94 #include <inet/ipclassifier.h> 95 #include <sys/mac.h> 96 97 #include <sys/systeminfo.h> 98 #include <sys/bootconf.h> 99 100 #include <sys/tsol/tndb.h> 101 #include <sys/tsol/tnet.h> 102 103 /* The character which tells where the ill_name ends */ 104 #define IPIF_SEPARATOR_CHAR ':' 105 106 /* IP ioctl function table entry */ 107 typedef struct ipft_s { 108 int ipft_cmd; 109 pfi_t ipft_pfi; 
110 int ipft_min_size; 111 int ipft_flags; 112 } ipft_t; 113 #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ 114 #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ 115 116 typedef struct ip_sock_ar_s { 117 union { 118 area_t ip_sock_area; 119 ared_t ip_sock_ared; 120 areq_t ip_sock_areq; 121 } ip_sock_ar_u; 122 queue_t *ip_sock_ar_q; 123 } ip_sock_ar_t; 124 125 static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); 126 static int nd_ill_forward_set(queue_t *q, mblk_t *mp, 127 char *value, caddr_t cp, cred_t *ioc_cr); 128 129 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); 130 static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); 131 static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 132 mblk_t *mp, boolean_t need_up); 133 static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 134 mblk_t *mp, boolean_t need_up); 135 static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 136 queue_t *q, mblk_t *mp, boolean_t need_up); 137 static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, 138 mblk_t *mp, boolean_t need_up); 139 static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 140 mblk_t *mp); 141 static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, 142 queue_t *q, mblk_t *mp, boolean_t need_up); 143 static int ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, 144 sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl); 145 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **); 146 static void ip_wput_ioctl(queue_t *q, mblk_t *mp); 147 static void ipsq_flush(ill_t *ill); 148 static void ipsq_clean_all(ill_t *ill); 149 static void ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring); 150 static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, 151 queue_t *q, mblk_t *mp, boolean_t need_up); 152 static void ipsq_delete(ipsq_t *); 153 154 static ipif_t *ipif_allocate(ill_t *ill, 
int id, uint_t ire_type, 155 boolean_t initialize); 156 static void ipif_check_bcast_ires(ipif_t *test_ipif); 157 static void ipif_down_delete_ire(ire_t *ire, char *ipif); 158 static void ipif_delete_cache_ire(ire_t *, char *); 159 static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); 160 static void ipif_down_tail(ipif_t *ipif); 161 static void ipif_free(ipif_t *ipif); 162 static void ipif_free_tail(ipif_t *ipif); 163 static void ipif_mask_reply(ipif_t *); 164 static void ipif_mtu_change(ire_t *ire, char *ipif_arg); 165 static void ipif_multicast_down(ipif_t *ipif); 166 static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); 167 static void ipif_set_default(ipif_t *ipif); 168 static int ipif_set_values(queue_t *q, mblk_t *mp, 169 char *interf_name, uint_t *ppa); 170 static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, 171 queue_t *q); 172 static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, 173 boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, 174 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error); 175 static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); 176 static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); 177 178 static int ill_alloc_ppa(ill_if_t *, ill_t *); 179 static int ill_arp_off(ill_t *ill); 180 static int ill_arp_on(ill_t *ill); 181 static void ill_delete_interface_type(ill_if_t *); 182 static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); 183 static void ill_down(ill_t *ill); 184 static void ill_downi(ire_t *ire, char *ill_arg); 185 static void ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg); 186 static void ill_down_tail(ill_t *ill); 187 static void ill_free_mib(ill_t *ill); 188 static void ill_glist_delete(ill_t *); 189 static boolean_t ill_has_usable_ipif(ill_t *); 190 static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); 191 static void ill_nominate_bcast_rcv(ill_group_t *illgrp); 192 static void 
ill_phyint_free(ill_t *ill); 193 static void ill_phyint_reinit(ill_t *ill); 194 static void ill_set_nce_router_flags(ill_t *, boolean_t); 195 static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); 196 static boolean_t ill_split_ipsq(ipsq_t *cur_sq); 197 static void ill_stq_cache_delete(ire_t *, char *); 198 199 static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); 200 static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); 201 static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 202 in6_addr_t *); 203 static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 204 ipaddr_t *); 205 static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); 206 static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 207 in6_addr_t *); 208 static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 209 ipaddr_t *); 210 211 static void ipif_save_ire(ipif_t *, ire_t *); 212 static void ipif_remove_ire(ipif_t *, ire_t *); 213 static void ip_cgtp_bcast_add(ire_t *, ire_t *); 214 static void ip_cgtp_bcast_delete(ire_t *); 215 216 /* 217 * Per-ill IPsec capabilities management. 
218 */ 219 static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); 220 static void ill_ipsec_capab_free(ill_ipsec_capab_t *); 221 static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); 222 static void ill_ipsec_capab_delete(ill_t *, uint_t); 223 static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); 224 static void ill_capability_proto(ill_t *, int, mblk_t *); 225 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, 226 boolean_t); 227 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 228 static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 229 static void ill_capability_mdt_reset(ill_t *, mblk_t **); 230 static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 231 static void ill_capability_ipsec_reset(ill_t *, mblk_t **); 232 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 233 static void ill_capability_hcksum_reset(ill_t *, mblk_t **); 234 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, 235 dl_capability_sub_t *); 236 static void ill_capability_zerocopy_reset(ill_t *, mblk_t **); 237 238 static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 239 static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *); 240 static void ill_capability_dls_reset(ill_t *, mblk_t **); 241 static void ill_capability_dls_disable(ill_t *); 242 243 static void illgrp_cache_delete(ire_t *, char *); 244 static void illgrp_delete(ill_t *ill); 245 static void illgrp_reset_schednext(ill_t *ill); 246 247 static ill_t *ill_prev_usesrc(ill_t *); 248 static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); 249 static void ill_disband_usesrc_group(ill_t *); 250 251 static void conn_cleanup_stale_ire(conn_t *, caddr_t); 252 253 /* 254 * if we go over the memory footprint limit more than once in this msec 255 * interval, we'll start pruning aggressively. 
256 */ 257 int ip_min_frag_prune_time = 0; 258 259 /* 260 * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY 261 * and the IPsec DOI 262 */ 263 #define MAX_IPSEC_ALGS 256 264 265 #define BITSPERBYTE 8 266 #define BITS(type) (BITSPERBYTE * (long)sizeof (type)) 267 268 #define IPSEC_ALG_ENABLE(algs, algid) \ 269 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ 270 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 271 272 #define IPSEC_ALG_IS_ENABLED(algid, algs) \ 273 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ 274 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 275 276 typedef uint8_t ipsec_capab_elem_t; 277 278 /* 279 * Per-algorithm parameters. Note that at present, only encryption 280 * algorithms have variable keysize (IKE does not provide a way to negotiate 281 * auth algorithm keysize). 282 * 283 * All sizes here are in bits. 284 */ 285 typedef struct 286 { 287 uint16_t minkeylen; 288 uint16_t maxkeylen; 289 } ipsec_capab_algparm_t; 290 291 /* 292 * Per-ill capabilities. 293 */ 294 struct ill_ipsec_capab_s { 295 ipsec_capab_elem_t *encr_hw_algs; 296 ipsec_capab_elem_t *auth_hw_algs; 297 uint32_t algs_size; /* size of _hw_algs in bytes */ 298 /* algorithm key lengths */ 299 ipsec_capab_algparm_t *encr_algparm; 300 uint32_t encr_algparm_size; 301 uint32_t encr_algparm_end; 302 }; 303 304 /* 305 * List of AH and ESP IPsec acceleration capable ills 306 */ 307 typedef struct ipsec_capab_ill_s { 308 uint_t ill_index; 309 boolean_t ill_isv6; 310 struct ipsec_capab_ill_s *next; 311 } ipsec_capab_ill_t; 312 313 static ipsec_capab_ill_t *ipsec_capab_ills_ah; 314 static ipsec_capab_ill_t *ipsec_capab_ills_esp; 315 krwlock_t ipsec_capab_ills_lock; 316 317 /* 318 * The field values are larger than strictly necessary for simple 319 * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls. 
320 */ 321 static area_t ip_area_template = { 322 AR_ENTRY_ADD, /* area_cmd */ 323 sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), 324 /* area_name_offset */ 325 /* area_name_length temporarily holds this structure length */ 326 sizeof (area_t), /* area_name_length */ 327 IP_ARP_PROTO_TYPE, /* area_proto */ 328 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 329 IP_ADDR_LEN, /* area_proto_addr_length */ 330 sizeof (ip_sock_ar_t) + IP_ADDR_LEN, 331 /* area_proto_mask_offset */ 332 0, /* area_flags */ 333 sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, 334 /* area_hw_addr_offset */ 335 /* Zero length hw_addr_length means 'use your idea of the address' */ 336 0 /* area_hw_addr_length */ 337 }; 338 339 /* 340 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver 341 * support 342 */ 343 static area_t ip6_area_template = { 344 AR_ENTRY_ADD, /* area_cmd */ 345 sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), 346 /* area_name_offset */ 347 /* area_name_length temporarily holds this structure length */ 348 sizeof (area_t), /* area_name_length */ 349 IP_ARP_PROTO_TYPE, /* area_proto */ 350 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 351 IPV6_ADDR_LEN, /* area_proto_addr_length */ 352 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, 353 /* area_proto_mask_offset */ 354 0, /* area_flags */ 355 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, 356 /* area_hw_addr_offset */ 357 /* Zero length hw_addr_length means 'use your idea of the address' */ 358 0 /* area_hw_addr_length */ 359 }; 360 361 static ared_t ip_ared_template = { 362 AR_ENTRY_DELETE, 363 sizeof (ared_t) + IP_ADDR_LEN, 364 sizeof (ared_t), 365 IP_ARP_PROTO_TYPE, 366 sizeof (ared_t), 367 IP_ADDR_LEN 368 }; 369 370 static ared_t ip6_ared_template = { 371 AR_ENTRY_DELETE, 372 sizeof (ared_t) + IPV6_ADDR_LEN, 373 sizeof (ared_t), 374 IP_ARP_PROTO_TYPE, 375 sizeof (ared_t), 376 IPV6_ADDR_LEN 377 }; 378 379 /* 380 * A template for an IPv6 AR_ENTRY_QUERY 
template has not been created, as 381 * as the areq doesn't include an IP address in ill_dl_up() (the only place a 382 * areq is used). 383 */ 384 static areq_t ip_areq_template = { 385 AR_ENTRY_QUERY, /* cmd */ 386 sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ 387 sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ 388 IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ 389 sizeof (areq_t), /* target addr offset */ 390 IP_ADDR_LEN, /* target addr_length */ 391 0, /* flags */ 392 sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ 393 IP_ADDR_LEN, /* sender addr length */ 394 6, /* xmit_count */ 395 1000, /* (re)xmit_interval in milliseconds */ 396 4 /* max # of requests to buffer */ 397 /* anything else filled in by the code */ 398 }; 399 400 static arc_t ip_aru_template = { 401 AR_INTERFACE_UP, 402 sizeof (arc_t), /* Name offset */ 403 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 404 }; 405 406 static arc_t ip_ard_template = { 407 AR_INTERFACE_DOWN, 408 sizeof (arc_t), /* Name offset */ 409 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 410 }; 411 412 static arc_t ip_aron_template = { 413 AR_INTERFACE_ON, 414 sizeof (arc_t), /* Name offset */ 415 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 416 }; 417 418 static arc_t ip_aroff_template = { 419 AR_INTERFACE_OFF, 420 sizeof (arc_t), /* Name offset */ 421 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 422 }; 423 424 425 static arma_t ip_arma_multi_template = { 426 AR_MAPPING_ADD, 427 sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, 428 /* Name offset */ 429 sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ 430 IP_ARP_PROTO_TYPE, 431 sizeof (arma_t), /* proto_addr_offset */ 432 IP_ADDR_LEN, /* proto_addr_length */ 433 sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ 434 sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ 435 ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */ 436 sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ 437 
IP_MAX_HW_LEN, /* hw_addr_length */ 438 0, /* hw_mapping_start */ 439 }; 440 441 static ipft_t ip_ioctl_ftbl[] = { 442 { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, 443 { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), 444 IPFT_F_NO_REPLY }, 445 { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), 446 IPFT_F_NO_REPLY }, 447 { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, 448 { 0 } 449 }; 450 451 /* Simple ICMP IP Header Template */ 452 static ipha_t icmp_ipha = { 453 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 454 }; 455 456 /* Flag descriptors for ip_ipif_report */ 457 static nv_t ipif_nv_tbl[] = { 458 { IPIF_UP, "UP" }, 459 { IPIF_BROADCAST, "BROADCAST" }, 460 { ILLF_DEBUG, "DEBUG" }, 461 { PHYI_LOOPBACK, "LOOPBACK" }, 462 { IPIF_POINTOPOINT, "POINTOPOINT" }, 463 { ILLF_NOTRAILERS, "NOTRAILERS" }, 464 { PHYI_RUNNING, "RUNNING" }, 465 { ILLF_NOARP, "NOARP" }, 466 { PHYI_PROMISC, "PROMISC" }, 467 { PHYI_ALLMULTI, "ALLMULTI" }, 468 { PHYI_INTELLIGENT, "INTELLIGENT" }, 469 { ILLF_MULTICAST, "MULTICAST" }, 470 { PHYI_MULTI_BCAST, "MULTI_BCAST" }, 471 { IPIF_UNNUMBERED, "UNNUMBERED" }, 472 { IPIF_DHCPRUNNING, "DHCP" }, 473 { IPIF_PRIVATE, "PRIVATE" }, 474 { IPIF_NOXMIT, "NOXMIT" }, 475 { IPIF_NOLOCAL, "NOLOCAL" }, 476 { IPIF_DEPRECATED, "DEPRECATED" }, 477 { IPIF_PREFERRED, "PREFERRED" }, 478 { IPIF_TEMPORARY, "TEMPORARY" }, 479 { IPIF_ADDRCONF, "ADDRCONF" }, 480 { PHYI_VIRTUAL, "VIRTUAL" }, 481 { ILLF_ROUTER, "ROUTER" }, 482 { ILLF_NONUD, "NONUD" }, 483 { IPIF_ANYCAST, "ANYCAST" }, 484 { ILLF_NORTEXCH, "NORTEXCH" }, 485 { ILLF_IPV4, "IPV4" }, 486 { ILLF_IPV6, "IPV6" }, 487 { IPIF_MIPRUNNING, "MIP" }, 488 { IPIF_NOFAILOVER, "NOFAILOVER" }, 489 { PHYI_FAILED, "FAILED" }, 490 { PHYI_STANDBY, "STANDBY" }, 491 { PHYI_INACTIVE, "INACTIVE" }, 492 { PHYI_OFFLINE, "OFFLINE" }, 493 }; 494 495 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 496 497 static ip_m_t ip_m_tbl[] = { 498 { DL_ETHER, 
IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 499 ip_ether_v6intfid }, 500 { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 501 ip_nodef_v6intfid }, 502 { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 503 ip_nodef_v6intfid }, 504 { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 505 ip_nodef_v6intfid }, 506 { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 507 ip_ether_v6intfid }, 508 { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, 509 ip_ib_v6intfid }, 510 { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, 511 { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 512 ip_nodef_v6intfid } 513 }; 514 515 static ill_t ill_null; /* Empty ILL for init. */ 516 char ipif_loopback_name[] = "lo0"; 517 static char *ipv4_forward_suffix = ":ip_forwarding"; 518 static char *ipv6_forward_suffix = ":ip6_forwarding"; 519 static kstat_t *loopback_ksp = NULL; 520 static sin6_t sin6_null; /* Zero address for quick clears */ 521 static sin_t sin_null; /* Zero address for quick clears */ 522 static uint_t ill_index = 1; /* Used to assign interface indicies */ 523 /* When set search for unused index */ 524 static boolean_t ill_index_wrap = B_FALSE; 525 /* When set search for unused ipif_seqid */ 526 static ipif_t ipif_zero; 527 uint_t ipif_src_random; 528 529 /* 530 * For details on the protection offered by these locks please refer 531 * to the notes under the Synchronization section at the start of ip.c 532 */ 533 krwlock_t ill_g_lock; /* The global ill_g_lock */ 534 kmutex_t ip_addr_avail_lock; /* Address availability check lock */ 535 ipsq_t *ipsq_g_head; /* List of all ipsq's on the system */ 536 537 krwlock_t ill_g_usesrc_lock; /* Protects usesrc related fields */ 538 539 /* 540 * illgrp_head/ifgrp_head is protected by IP's perimeter. 
541 */ 542 static ill_group_t *illgrp_head_v4; /* Head of IPv4 ill groups */ 543 ill_group_t *illgrp_head_v6; /* Head of IPv6 ill groups */ 544 545 ill_g_head_t ill_g_heads[MAX_G_HEADS]; /* ILL List Head */ 546 547 /* 548 * ppa arena is created after these many 549 * interfaces have been plumbed. 550 */ 551 uint_t ill_no_arena = 12; 552 553 #pragma align CACHE_ALIGN_SIZE(phyint_g_list) 554 static phyint_list_t phyint_g_list; /* start of phyint list */ 555 556 /* 557 * Reflects value of FAILBACK variable in IPMP config file 558 * /etc/default/mpathd. Default value is B_TRUE. 559 * Set to B_FALSE if user disabled failback by configuring "FAILBACK=no" 560 * in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this information to kernel. 561 */ 562 static boolean_t ipmp_enable_failback = B_TRUE; 563 564 /* 565 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout 566 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is 567 * set through platform specific code (Niagara/Ontario). 568 */ 569 #define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ 570 (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) 571 572 #define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) 573 574 static uint_t 575 ipif_rand(void) 576 { 577 ipif_src_random = ipif_src_random * 1103515245 + 12345; 578 return ((ipif_src_random >> 16) & 0x7fff); 579 } 580 581 /* 582 * Allocate per-interface mibs. Only used for ipv6. 583 * Returns true if ok. False otherwise. 584 * ipsq may not yet be allocated (loopback case ). 585 */ 586 static boolean_t 587 ill_allocate_mibs(ill_t *ill) 588 { 589 ASSERT(ill->ill_isv6); 590 591 /* Already allocated? 
*/ 592 if (ill->ill_ip6_mib != NULL) { 593 ASSERT(ill->ill_icmp6_mib != NULL); 594 return (B_TRUE); 595 } 596 597 ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib), 598 KM_NOSLEEP); 599 if (ill->ill_ip6_mib == NULL) { 600 return (B_FALSE); 601 } 602 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 603 KM_NOSLEEP); 604 if (ill->ill_icmp6_mib == NULL) { 605 kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib)); 606 ill->ill_ip6_mib = NULL; 607 return (B_FALSE); 608 } 609 /* 610 * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later 611 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 612 * -> ill_phyint_reinit 613 */ 614 return (B_TRUE); 615 } 616 617 /* 618 * Common code for preparation of ARP commands. Two points to remember: 619 * 1) The ill_name is tacked on at the end of the allocated space so 620 * the templates name_offset field must contain the total space 621 * to allocate less the name length. 622 * 623 * 2) The templates name_length field should contain the *template* 624 * length. We use it as a parameter to bcopy() and then write 625 * the real ill_name_length into the name_length field of the copy. 626 * (Always called as writer.) 
627 */ 628 mblk_t * 629 ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) 630 { 631 arc_t *arc = (arc_t *)template; 632 char *cp; 633 int len; 634 mblk_t *mp; 635 uint_t name_length = ill->ill_name_length; 636 uint_t template_len = arc->arc_name_length; 637 638 len = arc->arc_name_offset + name_length; 639 mp = allocb(len, BPRI_HI); 640 if (mp == NULL) 641 return (NULL); 642 cp = (char *)mp->b_rptr; 643 mp->b_wptr = (uchar_t *)&cp[len]; 644 if (template_len) 645 bcopy(template, cp, template_len); 646 if (len > template_len) 647 bzero(&cp[template_len], len - template_len); 648 mp->b_datap->db_type = M_PROTO; 649 650 arc = (arc_t *)cp; 651 arc->arc_name_length = name_length; 652 cp = (char *)arc + arc->arc_name_offset; 653 bcopy(ill->ill_name, cp, name_length); 654 655 if (addr) { 656 area_t *area = (area_t *)mp->b_rptr; 657 658 cp = (char *)area + area->area_proto_addr_offset; 659 bcopy(addr, cp, area->area_proto_addr_length); 660 if (area->area_cmd == AR_ENTRY_ADD) { 661 cp = (char *)area; 662 len = area->area_proto_addr_length; 663 if (area->area_proto_mask_offset) 664 cp += area->area_proto_mask_offset; 665 else 666 cp += area->area_proto_addr_offset + len; 667 while (len-- > 0) 668 *cp++ = (char)~0; 669 } 670 } 671 return (mp); 672 } 673 674 /* 675 * Completely vaporize a lower level tap and all associated interfaces. 676 * ill_delete is called only out of ip_close when the device control 677 * stream is being closed. 678 */ 679 void 680 ill_delete(ill_t *ill) 681 { 682 ipif_t *ipif; 683 ill_t *prev_ill; 684 685 /* 686 * ill_delete may be forcibly entering the ipsq. The previous 687 * ioctl may not have completed and may need to be aborted. 688 * ipsq_flush takes care of it. If we don't need to enter the 689 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 690 * ill_delete_tail is sufficient. 691 */ 692 ipsq_flush(ill); 693 694 /* 695 * Nuke all interfaces. 
ipif_free will take down the interface, 696 * remove it from the list, and free the data structure. 697 * Walk down the ipif list and remove the logical interfaces 698 * first before removing the main ipif. We can't unplumb 699 * zeroth interface first in the case of IPv6 as reset_conn_ill 700 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking 701 * POINTOPOINT. 702 * 703 * If ill_ipif was not properly initialized (i.e low on memory), 704 * then no interfaces to clean up. In this case just clean up the 705 * ill. 706 */ 707 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 708 ipif_free(ipif); 709 710 /* 711 * Used only by ill_arp_on and ill_arp_off, which are writers. 712 * So nobody can be using this mp now. Free the mp allocated for 713 * honoring ILLF_NOARP 714 */ 715 freemsg(ill->ill_arp_on_mp); 716 ill->ill_arp_on_mp = NULL; 717 718 /* Clean up msgs on pending upcalls for mrouted */ 719 reset_mrt_ill(ill); 720 721 /* 722 * ipif_free -> reset_conn_ipif will remove all multicast 723 * references for IPv4. For IPv6, we need to do it here as 724 * it points only at ills. 725 */ 726 reset_conn_ill(ill); 727 728 /* 729 * ill_down will arrange to blow off any IRE's dependent on this 730 * ILL, and shut down fragmentation reassembly. 731 */ 732 ill_down(ill); 733 734 /* Let SCTP know, so that it can remove this from its list. */ 735 sctp_update_ill(ill, SCTP_ILL_REMOVE); 736 737 /* 738 * If an address on this ILL is being used as a source address then 739 * clear out the pointers in other ILLs that point to this ILL. 740 */ 741 rw_enter(&ill_g_usesrc_lock, RW_WRITER); 742 if (ill->ill_usesrc_grp_next != NULL) { 743 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? 
*/ 744 ill_disband_usesrc_group(ill); 745 } else { /* consumer of the usesrc ILL */ 746 prev_ill = ill_prev_usesrc(ill); 747 prev_ill->ill_usesrc_grp_next = 748 ill->ill_usesrc_grp_next; 749 } 750 } 751 rw_exit(&ill_g_usesrc_lock); 752 } 753 754 /* 755 * ill_delete_tail is called from ip_modclose after all references 756 * to the closing ill are gone. The wait is done in ip_modclose 757 */ 758 void 759 ill_delete_tail(ill_t *ill) 760 { 761 mblk_t **mpp; 762 ipif_t *ipif; 763 764 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 765 ipif_down_tail(ipif); 766 767 /* 768 * If polling capability is enabled (which signifies direct 769 * upcall into IP and driver has ill saved as a handle), 770 * we need to make sure that unbind has completed before we 771 * let the ill disappear and driver no longer has any reference 772 * to this ill. 773 */ 774 mutex_enter(&ill->ill_lock); 775 while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) 776 cv_wait(&ill->ill_cv, &ill->ill_lock); 777 mutex_exit(&ill->ill_lock); 778 779 /* 780 * Clean up polling and soft ring capabilities 781 */ 782 if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) 783 ill_capability_dls_disable(ill); 784 785 /* 786 * Send the detach if there's one to send (i.e., if we're above a 787 * style 2 DLPI driver). 788 */ 789 if (ill->ill_detach_mp != NULL) { 790 ill_dlpi_send(ill, ill->ill_detach_mp); 791 ill->ill_detach_mp = NULL; 792 } 793 794 if (ill->ill_net_type != IRE_LOOPBACK) 795 qprocsoff(ill->ill_rq); 796 797 /* 798 * We do an ipsq_flush once again now. New messages could have 799 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls 800 * could also have landed up if an ioctl thread had looked up 801 * the ill before we set the ILL_CONDEMNED flag, but not yet 802 * enqueued the ioctl when we did the ipsq_flush last time. 803 */ 804 ipsq_flush(ill); 805 806 /* 807 * Free capabilities. 
808 */ 809 if (ill->ill_ipsec_capab_ah != NULL) { 810 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); 811 ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); 812 ill->ill_ipsec_capab_ah = NULL; 813 } 814 815 if (ill->ill_ipsec_capab_esp != NULL) { 816 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); 817 ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); 818 ill->ill_ipsec_capab_esp = NULL; 819 } 820 821 if (ill->ill_mdt_capab != NULL) { 822 kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); 823 ill->ill_mdt_capab = NULL; 824 } 825 826 if (ill->ill_hcksum_capab != NULL) { 827 kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 828 ill->ill_hcksum_capab = NULL; 829 } 830 831 if (ill->ill_zerocopy_capab != NULL) { 832 kmem_free(ill->ill_zerocopy_capab, 833 sizeof (ill_zerocopy_capab_t)); 834 ill->ill_zerocopy_capab = NULL; 835 } 836 837 if (ill->ill_dls_capab != NULL) { 838 CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn); 839 ill->ill_dls_capab->ill_unbind_conn = NULL; 840 kmem_free(ill->ill_dls_capab, 841 sizeof (ill_dls_capab_t) + 842 (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS)); 843 ill->ill_dls_capab = NULL; 844 } 845 846 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); 847 848 while (ill->ill_ipif != NULL) 849 ipif_free_tail(ill->ill_ipif); 850 851 ill_down_tail(ill); 852 853 /* 854 * We have removed all references to ilm from conn and the ones joined 855 * within the kernel. 856 * 857 * We don't walk conns, mrts and ires because 858 * 859 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts. 860 * 2) ill_down ->ill_downi walks all the ires and cleans up 861 * ill references. 862 */ 863 ASSERT(ilm_walk_ill(ill) == 0); 864 /* 865 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free 866 * could free the phyint. No more reference to the phyint after this 867 * point. 
868 */ 869 (void) ill_glist_delete(ill); 870 871 rw_enter(&ip_g_nd_lock, RW_WRITER); 872 if (ill->ill_ndd_name != NULL) 873 nd_unload(&ip_g_nd, ill->ill_ndd_name); 874 rw_exit(&ip_g_nd_lock); 875 876 877 if (ill->ill_frag_ptr != NULL) { 878 uint_t count; 879 880 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 881 mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 882 } 883 mi_free(ill->ill_frag_ptr); 884 ill->ill_frag_ptr = NULL; 885 ill->ill_frag_hash_tbl = NULL; 886 } 887 if (ill->ill_nd_lla_mp != NULL) 888 freemsg(ill->ill_nd_lla_mp); 889 /* Free all retained control messages. */ 890 mpp = &ill->ill_first_mp_to_free; 891 do { 892 while (mpp[0]) { 893 mblk_t *mp; 894 mblk_t *mp1; 895 896 mp = mpp[0]; 897 mpp[0] = mp->b_next; 898 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 899 mp1->b_next = NULL; 900 mp1->b_prev = NULL; 901 } 902 freemsg(mp); 903 } 904 } while (mpp++ != &ill->ill_last_mp_to_free); 905 906 ill_free_mib(ill); 907 ILL_TRACE_CLEANUP(ill); 908 } 909 910 static void 911 ill_free_mib(ill_t *ill) 912 { 913 if (ill->ill_ip6_mib != NULL) { 914 kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib)); 915 ill->ill_ip6_mib = NULL; 916 } 917 if (ill->ill_icmp6_mib != NULL) { 918 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 919 ill->ill_icmp6_mib = NULL; 920 } 921 } 922 923 /* 924 * Concatenate together a physical address and a sap. 
925 * 926 * Sap_lengths are interpreted as follows: 927 * sap_length == 0 ==> no sap 928 * sap_length > 0 ==> sap is at the head of the dlpi address 929 * sap_length < 0 ==> sap is at the tail of the dlpi address 930 */ 931 static void 932 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 933 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 934 { 935 uint16_t sap_addr = (uint16_t)sap_src; 936 937 if (sap_length == 0) { 938 if (phys_src == NULL) 939 bzero(dst, phys_length); 940 else 941 bcopy(phys_src, dst, phys_length); 942 } else if (sap_length < 0) { 943 if (phys_src == NULL) 944 bzero(dst, phys_length); 945 else 946 bcopy(phys_src, dst, phys_length); 947 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 948 } else { 949 bcopy(&sap_addr, dst, sizeof (sap_addr)); 950 if (phys_src == NULL) 951 bzero((char *)dst + sap_length, phys_length); 952 else 953 bcopy(phys_src, (char *)dst + sap_length, phys_length); 954 } 955 } 956 957 /* 958 * Generate a dl_unitdata_req mblk for the device and address given. 959 * addr_length is the length of the physical portion of the address. 960 * If addr is NULL include an all zero address of the specified length. 961 * TRUE? In any case, addr_length is taken to be the entire length of the 962 * dlpi address, including the absolute value of sap_length. 
963 */ 964 mblk_t * 965 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 966 t_scalar_t sap_length) 967 { 968 dl_unitdata_req_t *dlur; 969 mblk_t *mp; 970 t_scalar_t abs_sap_length; /* absolute value */ 971 972 abs_sap_length = ABS(sap_length); 973 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 974 DL_UNITDATA_REQ); 975 if (mp == NULL) 976 return (NULL); 977 dlur = (dl_unitdata_req_t *)mp->b_rptr; 978 /* HACK: accomodate incompatible DLPI drivers */ 979 if (addr_length == 8) 980 addr_length = 6; 981 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 982 dlur->dl_dest_addr_offset = sizeof (*dlur); 983 dlur->dl_priority.dl_min = 0; 984 dlur->dl_priority.dl_max = 0; 985 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 986 (uchar_t *)&dlur[1]); 987 return (mp); 988 } 989 990 /* 991 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp 992 * Return an error if we already have 1 or more ioctls in progress. 993 * This is used only for non-exclusive ioctls. Currently this is used 994 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive 995 * and thus need to use ipsq_pending_mp_add. 996 */ 997 boolean_t 998 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) 999 { 1000 ASSERT(MUTEX_HELD(&ill->ill_lock)); 1001 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1002 /* 1003 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls. 1004 */ 1005 ASSERT((add_mp->b_datap->db_type == M_IOCDATA) || 1006 (add_mp->b_datap->db_type == M_IOCTL)); 1007 1008 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1009 /* 1010 * Return error if the conn has started closing. The conn 1011 * could have finished cleaning up the pending mp list, 1012 * If so we should not add another mp to the list negating 1013 * the cleanup. 1014 */ 1015 if (connp->conn_state_flags & CONN_CLOSING) 1016 return (B_FALSE); 1017 /* 1018 * Add the pending mp to the head of the list, chained by b_next. 
1019 * Note down the conn on which the ioctl request came, in b_prev. 1020 * This will be used to later get the conn, when we get a response 1021 * on the ill queue, from some other module (typically arp) 1022 */ 1023 add_mp->b_next = (void *)ill->ill_pending_mp; 1024 add_mp->b_queue = CONNP_TO_WQ(connp); 1025 ill->ill_pending_mp = add_mp; 1026 if (connp != NULL) 1027 connp->conn_oper_pending_ill = ill; 1028 return (B_TRUE); 1029 } 1030 1031 /* 1032 * Retrieve the ill_pending_mp and return it. We have to walk the list 1033 * of mblks starting at ill_pending_mp, and match based on the ioc_id. 1034 */ 1035 mblk_t * 1036 ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) 1037 { 1038 mblk_t *prev = NULL; 1039 mblk_t *curr = NULL; 1040 uint_t id; 1041 conn_t *connp; 1042 1043 /* 1044 * When the conn closes, conn_ioctl_cleanup needs to clean 1045 * up the pending mp, but it does not know the ioc_id and 1046 * passes in a zero for it. 1047 */ 1048 mutex_enter(&ill->ill_lock); 1049 if (ioc_id != 0) 1050 *connpp = NULL; 1051 1052 /* Search the list for the appropriate ioctl based on ioc_id */ 1053 for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; 1054 prev = curr, curr = curr->b_next) { 1055 id = ((struct iocblk *)curr->b_rptr)->ioc_id; 1056 connp = Q_TO_CONN(curr->b_queue); 1057 /* Match based on the ioc_id or based on the conn */ 1058 if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) 1059 break; 1060 } 1061 1062 if (curr != NULL) { 1063 /* Unlink the mblk from the pending mp list */ 1064 if (prev != NULL) { 1065 prev->b_next = curr->b_next; 1066 } else { 1067 ASSERT(ill->ill_pending_mp == curr); 1068 ill->ill_pending_mp = curr->b_next; 1069 } 1070 1071 /* 1072 * conn refcnt must have been bumped up at the start of 1073 * the ioctl. So we can safely access the conn. 
1074 */ 1075 ASSERT(CONN_Q(curr->b_queue)); 1076 *connpp = Q_TO_CONN(curr->b_queue); 1077 curr->b_next = NULL; 1078 curr->b_queue = NULL; 1079 } 1080 1081 mutex_exit(&ill->ill_lock); 1082 1083 return (curr); 1084 } 1085 1086 /* 1087 * Add the pending mp to the list. There can be only 1 pending mp 1088 * in the list. Any exclusive ioctl that needs to wait for a response 1089 * from another module or driver needs to use this function to set 1090 * the ipsq_pending_mp to the ioctl mblk and wait for the response from 1091 * the other module/driver. This is also used while waiting for the 1092 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 1093 */ 1094 boolean_t 1095 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 1096 int waitfor) 1097 { 1098 ipsq_t *ipsq; 1099 1100 ASSERT(IAM_WRITER_IPIF(ipif)); 1101 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 1102 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1103 /* 1104 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, 1105 * M_ERROR/M_HANGUP from driver 1106 */ 1107 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) || 1108 (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP)); 1109 1110 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 1111 if (connp != NULL) { 1112 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1113 /* 1114 * Return error if the conn has started closing. The conn 1115 * could have finished cleaning up the pending mp list, 1116 * If so we should not add another mp to the list negating 1117 * the cleanup. 1118 */ 1119 if (connp->conn_state_flags & CONN_CLOSING) 1120 return (B_FALSE); 1121 } 1122 mutex_enter(&ipsq->ipsq_lock); 1123 ipsq->ipsq_pending_ipif = ipif; 1124 /* 1125 * Note down the queue in b_queue. This will be returned by 1126 * ipsq_pending_mp_get. 
Caller will then use these values to restart 1127 * the processing 1128 */ 1129 add_mp->b_next = NULL; 1130 add_mp->b_queue = q; 1131 ipsq->ipsq_pending_mp = add_mp; 1132 ipsq->ipsq_waitfor = waitfor; 1133 /* 1134 * ipsq_current_ipif is needed to restart the operation from 1135 * ipif_ill_refrele_tail when the last reference to the ipi/ill 1136 * is gone. Since this is not an ioctl ipsq_current_ipif has not 1137 * been set until now. 1138 */ 1139 if (DB_TYPE(add_mp) == M_ERROR || DB_TYPE(add_mp) == M_HANGUP) { 1140 ASSERT(ipsq->ipsq_current_ipif == NULL); 1141 ipsq->ipsq_current_ipif = ipif; 1142 ipsq->ipsq_last_cmd = DB_TYPE(add_mp); 1143 } 1144 if (connp != NULL) 1145 connp->conn_oper_pending_ill = ipif->ipif_ill; 1146 mutex_exit(&ipsq->ipsq_lock); 1147 return (B_TRUE); 1148 } 1149 1150 /* 1151 * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp 1152 * queued in the list. 1153 */ 1154 mblk_t * 1155 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 1156 { 1157 mblk_t *curr = NULL; 1158 1159 mutex_enter(&ipsq->ipsq_lock); 1160 *connpp = NULL; 1161 if (ipsq->ipsq_pending_mp == NULL) { 1162 mutex_exit(&ipsq->ipsq_lock); 1163 return (NULL); 1164 } 1165 1166 /* There can be only 1 such excl message */ 1167 curr = ipsq->ipsq_pending_mp; 1168 ASSERT(curr != NULL && curr->b_next == NULL); 1169 ipsq->ipsq_pending_ipif = NULL; 1170 ipsq->ipsq_pending_mp = NULL; 1171 ipsq->ipsq_waitfor = 0; 1172 mutex_exit(&ipsq->ipsq_lock); 1173 1174 if (CONN_Q(curr->b_queue)) { 1175 /* 1176 * This mp did a refhold on the conn, at the start of the ioctl. 1177 * So we can safely return a pointer to the conn to the caller. 1178 */ 1179 *connpp = Q_TO_CONN(curr->b_queue); 1180 } else { 1181 *connpp = NULL; 1182 } 1183 curr->b_next = NULL; 1184 curr->b_prev = NULL; 1185 return (curr); 1186 } 1187 1188 /* 1189 * Cleanup the ioctl mp queued in ipsq_pending_mp 1190 * - Called in the ill_delete path 1191 * - Called in the M_ERROR or M_HANGUP path on the ill. 
1192 * - Called in the conn close path. 1193 */ 1194 boolean_t 1195 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 1196 { 1197 mblk_t *mp; 1198 ipsq_t *ipsq; 1199 queue_t *q; 1200 ipif_t *ipif; 1201 1202 ASSERT(IAM_WRITER_ILL(ill)); 1203 ipsq = ill->ill_phyint->phyint_ipsq; 1204 mutex_enter(&ipsq->ipsq_lock); 1205 /* 1206 * If connp is null, unconditionally clean up the ipsq_pending_mp. 1207 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 1208 * even if it is meant for another ill, since we have to enqueue 1209 * a new mp now in ipsq_pending_mp to complete the ipif_down. 1210 * If connp is non-null we are called from the conn close path. 1211 */ 1212 mp = ipsq->ipsq_pending_mp; 1213 if (mp == NULL || (connp != NULL && 1214 mp->b_queue != CONNP_TO_WQ(connp))) { 1215 mutex_exit(&ipsq->ipsq_lock); 1216 return (B_FALSE); 1217 } 1218 /* Now remove from the ipsq_pending_mp */ 1219 ipsq->ipsq_pending_mp = NULL; 1220 q = mp->b_queue; 1221 mp->b_next = NULL; 1222 mp->b_prev = NULL; 1223 mp->b_queue = NULL; 1224 1225 /* If MOVE was in progress, clear the move_in_progress fields also. */ 1226 ill = ipsq->ipsq_pending_ipif->ipif_ill; 1227 if (ill->ill_move_in_progress) { 1228 ILL_CLEAR_MOVE(ill); 1229 } else if (ill->ill_up_ipifs) { 1230 ill_group_cleanup(ill); 1231 } 1232 1233 ipif = ipsq->ipsq_pending_ipif; 1234 ipsq->ipsq_pending_ipif = NULL; 1235 ipsq->ipsq_waitfor = 0; 1236 ipsq->ipsq_current_ipif = NULL; 1237 mutex_exit(&ipsq->ipsq_lock); 1238 1239 if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 1240 ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE : 1241 NO_COPYOUT, connp != NULL ? ipif : NULL, NULL); 1242 } else { 1243 /* 1244 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 1245 * be just inet_freemsg. we have to restart it 1246 * otherwise the thread will be stuck. 1247 */ 1248 inet_freemsg(mp); 1249 } 1250 return (B_TRUE); 1251 } 1252 1253 /* 1254 * The ill is closing. Cleanup all the pending mps. 
Called exclusively 1255 * towards the end of ill_delete. The refcount has gone to 0. So nobody 1256 * knows this ill, and hence nobody can add an mp to this list 1257 */ 1258 static void 1259 ill_pending_mp_cleanup(ill_t *ill) 1260 { 1261 mblk_t *mp; 1262 queue_t *q; 1263 1264 ASSERT(IAM_WRITER_ILL(ill)); 1265 1266 mutex_enter(&ill->ill_lock); 1267 /* 1268 * Every mp on the pending mp list originating from an ioctl 1269 * added 1 to the conn refcnt, at the start of the ioctl. 1270 * So bump it down now. See comments in ip_wput_nondata() 1271 */ 1272 while (ill->ill_pending_mp != NULL) { 1273 mp = ill->ill_pending_mp; 1274 ill->ill_pending_mp = mp->b_next; 1275 mutex_exit(&ill->ill_lock); 1276 1277 q = mp->b_queue; 1278 ASSERT(CONN_Q(q)); 1279 mp->b_next = NULL; 1280 mp->b_prev = NULL; 1281 mp->b_queue = NULL; 1282 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL, NULL); 1283 mutex_enter(&ill->ill_lock); 1284 } 1285 ill->ill_pending_ipif = NULL; 1286 1287 mutex_exit(&ill->ill_lock); 1288 } 1289 1290 /* 1291 * Called in the conn close path and ill delete path 1292 */ 1293 static void 1294 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 1295 { 1296 ipsq_t *ipsq; 1297 mblk_t *prev; 1298 mblk_t *curr; 1299 mblk_t *next; 1300 queue_t *q; 1301 mblk_t *tmp_list = NULL; 1302 1303 ASSERT(IAM_WRITER_ILL(ill)); 1304 if (connp != NULL) 1305 q = CONNP_TO_WQ(connp); 1306 else 1307 q = ill->ill_wq; 1308 1309 ipsq = ill->ill_phyint->phyint_ipsq; 1310 /* 1311 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 1312 * In the case of ioctl from a conn, there can be only 1 mp 1313 * queued on the ipsq. If an ill is being unplumbed, only messages 1314 * related to this ill are flushed, like M_ERROR or M_HANGUP message. 1315 * ioctls meant for this ill form conn's are not flushed. They will 1316 * be processed during ipsq_exit and will not find the ill and will 1317 * return error. 
/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 *
 * The pending ioctl is looked for in three places, in this order:
 *	1. the non-exclusive list on the ill (ill_pending_mp_get),
 *	2. the exclusive ipsq_pending_mp slot (ipsq_pending_mp_cleanup),
 *	3. the ipsq's queue of not-yet-started operations
 *	   (ipsq_xopq_mp_cleanup).
 * If the ipsq cannot be entered, the function blocks until
 * conn_oper_pending_ill becomes NULL.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	mblk_t *curr;
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Is any exclusive ioctl pending ? If so clean it up. If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_head. If the ioctl has started the mp could be present in
	 * ipsq_pending_mp. If the ioctl timed out in the streamhead but
	 * is currently executing now the mp is not queued anywhere but
	 * conn_oper_pending_ill is null. The conn close will wait
	 * till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		/* No operation from this conn is parked on any ill */
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * An ioc_id of 0 asks ill_pending_mp_get to match on the conn
	 * passed in through &connp instead of on an ioctl id.
	 */
	curr = ill_pending_mp_get(ill, &connp, 0);
	if (curr != NULL) {
		mutex_exit(&connp->conn_lock);
		/*
		 * Presumably this drops the conn reference taken when the
		 * ioctl started - NOTE(review): confirm against the ioctl
		 * entry path.
		 */
		CONN_DEC_REF(connp);
		inet_freemsg(curr);
		return;
	}
	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE)) {
			/* Inside the ipsq; the waiter count is given back */
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending now in ipsq_pending_mp. If it is not
			 * found there then check whether this ioctl has
			 * not even started and is in the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	/*
	 * Sleep on conn_refcv until conn_oper_pending_ill is cleared; the
	 * code that clears it and signals the cv is not in this function.
	 */
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	/* ipsq_enter failed after the waiter count had been taken */
	if (refheld)
		ill_waiter_dcr(ill);
}
1439 */ 1440 static void 1441 conn_cleanup_ill(conn_t *connp, caddr_t arg) 1442 { 1443 ill_t *ill = (ill_t *)arg; 1444 ire_t *ire; 1445 1446 mutex_enter(&connp->conn_lock); 1447 if (connp->conn_multicast_ill == ill) { 1448 /* Revert to late binding */ 1449 connp->conn_multicast_ill = NULL; 1450 connp->conn_orig_multicast_ifindex = 0; 1451 } 1452 if (connp->conn_incoming_ill == ill) 1453 connp->conn_incoming_ill = NULL; 1454 if (connp->conn_outgoing_ill == ill) 1455 connp->conn_outgoing_ill = NULL; 1456 if (connp->conn_outgoing_pill == ill) 1457 connp->conn_outgoing_pill = NULL; 1458 if (connp->conn_nofailover_ill == ill) 1459 connp->conn_nofailover_ill = NULL; 1460 if (connp->conn_xmit_if_ill == ill) 1461 connp->conn_xmit_if_ill = NULL; 1462 if (connp->conn_ire_cache != NULL) { 1463 ire = connp->conn_ire_cache; 1464 /* 1465 * ip_newroute creates IRE_CACHE with ire_stq coming from 1466 * interface X and ipif coming from interface Y, if interface 1467 * X and Y are part of the same IPMPgroup. Thus whenever 1468 * interface X goes down, remove all references to it by 1469 * checking both on ire_ipif and ire_stq. 
1470 */ 1471 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1472 (ire->ire_type == IRE_CACHE && 1473 ire->ire_stq == ill->ill_wq)) { 1474 connp->conn_ire_cache = NULL; 1475 mutex_exit(&connp->conn_lock); 1476 ire_refrele_notr(ire); 1477 return; 1478 } 1479 } 1480 mutex_exit(&connp->conn_lock); 1481 1482 } 1483 1484 /* ARGSUSED */ 1485 void 1486 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 1487 { 1488 ill_t *ill = q->q_ptr; 1489 ipif_t *ipif; 1490 1491 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1492 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1493 ipif_down_tail(ipif); 1494 ill_down_tail(ill); 1495 freemsg(mp); 1496 ipsq->ipsq_current_ipif = NULL; 1497 } 1498 1499 /* 1500 * ill_down_start is called when we want to down this ill and bring it up again 1501 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down 1502 * all interfaces, but don't tear down any plumbing. 1503 */ 1504 boolean_t 1505 ill_down_start(queue_t *q, mblk_t *mp) 1506 { 1507 ill_t *ill; 1508 ipif_t *ipif; 1509 1510 ill = q->q_ptr; 1511 1512 ASSERT(IAM_WRITER_ILL(ill)); 1513 1514 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1515 (void) ipif_down(ipif, NULL, NULL); 1516 1517 ill_down(ill); 1518 1519 (void) ipsq_pending_mp_cleanup(ill, NULL); 1520 mutex_enter(&ill->ill_lock); 1521 /* 1522 * Atomically test and add the pending mp if references are 1523 * still active. 1524 */ 1525 if (!ill_is_quiescent(ill)) { 1526 /* 1527 * Get rid of any pending mps and cleanup. Call will 1528 * not fail since we are passing a null connp. 1529 */ 1530 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 1531 mp, ILL_DOWN); 1532 mutex_exit(&ill->ill_lock); 1533 return (B_FALSE); 1534 } 1535 mutex_exit(&ill->ill_lock); 1536 return (B_TRUE); 1537 } 1538 1539 static void 1540 ill_down(ill_t *ill) 1541 { 1542 /* Blow off any IREs dependent on this ILL. 
*/ 1543 ire_walk(ill_downi, (char *)ill); 1544 1545 mutex_enter(&ire_mrtun_lock); 1546 if (ire_mrtun_count != 0) { 1547 mutex_exit(&ire_mrtun_lock); 1548 ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif, 1549 (char *)ill, NULL); 1550 } else { 1551 mutex_exit(&ire_mrtun_lock); 1552 } 1553 1554 /* 1555 * If any interface based forwarding table exists 1556 * Blow off the ires there dependent on this ill 1557 */ 1558 mutex_enter(&ire_srcif_table_lock); 1559 if (ire_srcif_table_count > 0) { 1560 mutex_exit(&ire_srcif_table_lock); 1561 ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill); 1562 } else { 1563 mutex_exit(&ire_srcif_table_lock); 1564 } 1565 1566 /* Remove any conn_*_ill depending on this ill */ 1567 ipcl_walk(conn_cleanup_ill, (caddr_t)ill); 1568 1569 if (ill->ill_group != NULL) { 1570 illgrp_delete(ill); 1571 } 1572 1573 } 1574 1575 static void 1576 ill_down_tail(ill_t *ill) 1577 { 1578 int i; 1579 1580 /* Destroy ill_srcif_table if it exists */ 1581 /* Lock not reqd really because nobody should be able to access */ 1582 mutex_enter(&ill->ill_lock); 1583 if (ill->ill_srcif_table != NULL) { 1584 ill->ill_srcif_refcnt = 0; 1585 for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) { 1586 rw_destroy(&ill->ill_srcif_table[i].irb_lock); 1587 } 1588 kmem_free(ill->ill_srcif_table, 1589 IP_SRCIF_TABLE_SIZE * sizeof (irb_t)); 1590 ill->ill_srcif_table = NULL; 1591 ill->ill_srcif_refcnt = 0; 1592 ill->ill_mrtun_refcnt = 0; 1593 } 1594 mutex_exit(&ill->ill_lock); 1595 } 1596 1597 /* 1598 * ire_walk routine used to delete every IRE that depends on queues 1599 * associated with 'ill'. (Always called as writer.) 1600 */ 1601 static void 1602 ill_downi(ire_t *ire, char *ill_arg) 1603 { 1604 ill_t *ill = (ill_t *)ill_arg; 1605 1606 /* 1607 * ip_newroute creates IRE_CACHE with ire_stq coming from 1608 * interface X and ipif coming from interface Y, if interface 1609 * X and Y are part of the same IPMP group. 
Thus whenever interface 1610 * X goes down, remove all references to it by checking both 1611 * on ire_ipif and ire_stq. 1612 */ 1613 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1614 (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { 1615 ire_delete(ire); 1616 } 1617 } 1618 1619 /* 1620 * A seperate routine for deleting revtun and srcif based routes 1621 * are needed because the ires only deleted when the interface 1622 * is unplumbed. Also these ires have ire_in_ill non-null as well. 1623 * we want to keep mobile IP specific code separate. 1624 */ 1625 static void 1626 ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg) 1627 { 1628 ill_t *ill = (ill_t *)ill_arg; 1629 1630 ASSERT(ire->ire_in_ill != NULL); 1631 1632 if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) || 1633 (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) { 1634 ire_delete(ire); 1635 } 1636 } 1637 1638 /* 1639 * Remove ire/nce from the fastpath list. 1640 */ 1641 void 1642 ill_fastpath_nack(ill_t *ill) 1643 { 1644 if (ill->ill_isv6) { 1645 nce_fastpath_list_dispatch(ill, NULL, NULL); 1646 } else { 1647 ire_fastpath_list_dispatch(ill, NULL, NULL); 1648 } 1649 } 1650 1651 /* Consume an M_IOCACK of the fastpath probe. */ 1652 void 1653 ill_fastpath_ack(ill_t *ill, mblk_t *mp) 1654 { 1655 mblk_t *mp1 = mp; 1656 1657 /* 1658 * If this was the first attempt turn on the fastpath probing. 1659 */ 1660 mutex_enter(&ill->ill_lock); 1661 if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS) 1662 ill->ill_dlpi_fastpath_state = IDMS_OK; 1663 mutex_exit(&ill->ill_lock); 1664 1665 /* Free the M_IOCACK mblk, hold on to the data */ 1666 mp = mp->b_cont; 1667 freeb(mp1); 1668 if (mp == NULL) 1669 return; 1670 if (mp->b_cont != NULL) { 1671 /* 1672 * Update all IRE's or NCE's that are waiting for 1673 * fastpath update. 1674 */ 1675 if (ill->ill_isv6) { 1676 /* 1677 * update nce's in the fastpath list. 
1678 */ 1679 nce_fastpath_list_dispatch(ill, 1680 ndp_fastpath_update, mp); 1681 } else { 1682 1683 /* 1684 * update ire's in the fastpath list. 1685 */ 1686 ire_fastpath_list_dispatch(ill, 1687 ire_fastpath_update, mp); 1688 /* 1689 * Check if we need to traverse reverse tunnel table. 1690 * Since there is only single ire_type (IRE_MIPRTUN) 1691 * in the table, we don't need to match on ire_type. 1692 * We have to check ire_mrtun_count and not the 1693 * ill_mrtun_refcnt since ill_mrtun_refcnt is set 1694 * on the incoming ill and here we are dealing with 1695 * outgoing ill. 1696 */ 1697 mutex_enter(&ire_mrtun_lock); 1698 if (ire_mrtun_count != 0) { 1699 mutex_exit(&ire_mrtun_lock); 1700 ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN, 1701 (void (*)(ire_t *, void *)) 1702 ire_fastpath_update, mp, ill); 1703 } else { 1704 mutex_exit(&ire_mrtun_lock); 1705 } 1706 } 1707 mp1 = mp->b_cont; 1708 freeb(mp); 1709 mp = mp1; 1710 } else { 1711 ip0dbg(("ill_fastpath_ack: no b_cont\n")); 1712 } 1713 1714 freeb(mp); 1715 } 1716 1717 /* 1718 * Throw an M_IOCTL message downstream asking "do you know fastpath?" 1719 * The data portion of the request is a dl_unitdata_req_t template for 1720 * what we would send downstream in the absence of a fastpath confirmation. 1721 */ 1722 int 1723 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) 1724 { 1725 struct iocblk *ioc; 1726 mblk_t *mp; 1727 1728 if (dlur_mp == NULL) 1729 return (EINVAL); 1730 1731 mutex_enter(&ill->ill_lock); 1732 switch (ill->ill_dlpi_fastpath_state) { 1733 case IDMS_FAILED: 1734 /* 1735 * Driver NAKed the first fastpath ioctl - assume it doesn't 1736 * support it. 
1737 */ 1738 mutex_exit(&ill->ill_lock); 1739 return (ENOTSUP); 1740 case IDMS_UNKNOWN: 1741 /* This is the first probe */ 1742 ill->ill_dlpi_fastpath_state = IDMS_INPROGRESS; 1743 break; 1744 default: 1745 break; 1746 } 1747 mutex_exit(&ill->ill_lock); 1748 1749 if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) 1750 return (EAGAIN); 1751 1752 mp->b_cont = copyb(dlur_mp); 1753 if (mp->b_cont == NULL) { 1754 freeb(mp); 1755 return (EAGAIN); 1756 } 1757 1758 ioc = (struct iocblk *)mp->b_rptr; 1759 ioc->ioc_count = msgdsize(mp->b_cont); 1760 1761 putnext(ill->ill_wq, mp); 1762 return (0); 1763 } 1764 1765 void 1766 ill_capability_probe(ill_t *ill) 1767 { 1768 /* 1769 * Do so only if negotiation is enabled, capabilities are unknown, 1770 * and a capability negotiation is not already in progress. 1771 */ 1772 if (ill->ill_capab_state != IDMS_UNKNOWN && 1773 ill->ill_capab_state != IDMS_RENEG) 1774 return; 1775 1776 ill->ill_capab_state = IDMS_INPROGRESS; 1777 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1778 ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL); 1779 } 1780 1781 void 1782 ill_capability_reset(ill_t *ill) 1783 { 1784 mblk_t *sc_mp = NULL; 1785 mblk_t *tmp; 1786 1787 /* 1788 * Note here that we reset the state to UNKNOWN, and later send 1789 * down the DL_CAPABILITY_REQ without first setting the state to 1790 * INPROGRESS. We do this in order to distinguish the 1791 * DL_CAPABILITY_ACK response which may come back in response to 1792 * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would 1793 * also handle the case where the driver doesn't send us back 1794 * a DL_CAPABILITY_ACK in response, since the "probe" routine 1795 * requires the state to be in UNKNOWN anyway. In any case, all 1796 * features are turned off until the state reaches IDMS_OK. 1797 */ 1798 ill->ill_capab_state = IDMS_UNKNOWN; 1799 1800 /* 1801 * Disable sub-capabilities and request a list of sub-capability 1802 * messages which will be sent down to the driver. 
Each handler 1803 * allocates the corresponding dl_capability_sub_t inside an 1804 * mblk, and links it to the existing sc_mp mblk, or return it 1805 * as sc_mp if it's the first sub-capability (the passed in 1806 * sc_mp is NULL). Upon returning from all capability handlers, 1807 * sc_mp will be pulled-up, before passing it downstream. 1808 */ 1809 ill_capability_mdt_reset(ill, &sc_mp); 1810 ill_capability_hcksum_reset(ill, &sc_mp); 1811 ill_capability_zerocopy_reset(ill, &sc_mp); 1812 ill_capability_ipsec_reset(ill, &sc_mp); 1813 ill_capability_dls_reset(ill, &sc_mp); 1814 1815 /* Nothing to send down in order to disable the capabilities? */ 1816 if (sc_mp == NULL) 1817 return; 1818 1819 tmp = msgpullup(sc_mp, -1); 1820 freemsg(sc_mp); 1821 if ((sc_mp = tmp) == NULL) { 1822 cmn_err(CE_WARN, "ill_capability_reset: unable to send down " 1823 "DL_CAPABILITY_REQ (ENOMEM)\n"); 1824 return; 1825 } 1826 1827 ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); 1828 ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); 1829 } 1830 1831 /* 1832 * Request or set new-style hardware capabilities supported by DLS provider. 
1833 */ 1834 static void 1835 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) 1836 { 1837 mblk_t *mp; 1838 dl_capability_req_t *capb; 1839 size_t size = 0; 1840 uint8_t *ptr; 1841 1842 if (reqp != NULL) 1843 size = MBLKL(reqp); 1844 1845 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); 1846 if (mp == NULL) { 1847 freemsg(reqp); 1848 return; 1849 } 1850 ptr = mp->b_rptr; 1851 1852 capb = (dl_capability_req_t *)ptr; 1853 ptr += sizeof (dl_capability_req_t); 1854 1855 if (reqp != NULL) { 1856 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1857 capb->dl_sub_length = size; 1858 bcopy(reqp->b_rptr, ptr, size); 1859 ptr += size; 1860 mp->b_cont = reqp->b_cont; 1861 freeb(reqp); 1862 } 1863 ASSERT(ptr == mp->b_wptr); 1864 1865 ill_dlpi_send(ill, mp); 1866 } 1867 1868 static void 1869 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1870 { 1871 dl_capab_id_t *id_ic; 1872 uint_t sub_dl_cap = outers->dl_cap; 1873 dl_capability_sub_t *inners; 1874 uint8_t *capend; 1875 1876 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1877 1878 /* 1879 * Note: range checks here are not absolutely sufficient to 1880 * make us robust against malformed messages sent by drivers; 1881 * this is in keeping with the rest of IP's dlpi handling. 
1882 * (Remember, it's coming from something else in the kernel 1883 * address space) 1884 */ 1885 1886 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1887 if (capend > mp->b_wptr) { 1888 cmn_err(CE_WARN, "ill_capability_id_ack: " 1889 "malformed sub-capability too long for mblk"); 1890 return; 1891 } 1892 1893 id_ic = (dl_capab_id_t *)(outers + 1); 1894 1895 if (outers->dl_length < sizeof (*id_ic) || 1896 (inners = &id_ic->id_subcap, 1897 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1898 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1899 "encapsulated capab type %d too long for mblk", 1900 inners->dl_cap); 1901 return; 1902 } 1903 1904 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1905 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1906 "isn't as expected; pass-thru module(s) detected, " 1907 "discarding capability\n", inners->dl_cap)); 1908 return; 1909 } 1910 1911 /* Process the encapsulated sub-capability */ 1912 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1913 } 1914 1915 /* 1916 * Process Multidata Transmit capability negotiation ack received from a 1917 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1918 * DL_CAPABILITY_ACK message. 1919 */ 1920 static void 1921 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1922 { 1923 mblk_t *nmp = NULL; 1924 dl_capability_req_t *oc; 1925 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1926 ill_mdt_capab_t **ill_mdt_capab; 1927 uint_t sub_dl_cap = isub->dl_cap; 1928 uint8_t *capend; 1929 1930 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1931 1932 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1933 1934 /* 1935 * Note: range checks here are not absolutely sufficient to 1936 * make us robust against malformed messages sent by drivers; 1937 * this is in keeping with the rest of IP's dlpi handling. 
1938 * (Remember, it's coming from something else in the kernel 1939 * address space) 1940 */ 1941 1942 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1943 if (capend > mp->b_wptr) { 1944 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1945 "malformed sub-capability too long for mblk"); 1946 return; 1947 } 1948 1949 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1950 1951 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1952 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1953 "unsupported MDT sub-capability (version %d, expected %d)", 1954 mdt_ic->mdt_version, MDT_VERSION_2); 1955 return; 1956 } 1957 1958 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 1959 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 1960 "capability isn't as expected; pass-thru module(s) " 1961 "detected, discarding capability\n")); 1962 return; 1963 } 1964 1965 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 1966 1967 if (*ill_mdt_capab == NULL) { 1968 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 1969 KM_NOSLEEP); 1970 1971 if (*ill_mdt_capab == NULL) { 1972 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1973 "could not enable MDT version %d " 1974 "for %s (ENOMEM)\n", MDT_VERSION_2, 1975 ill->ill_name); 1976 return; 1977 } 1978 } 1979 1980 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 1981 "MDT version %d (%d bytes leading, %d bytes trailing " 1982 "header spaces, %d max pld bufs, %d span limit)\n", 1983 ill->ill_name, MDT_VERSION_2, 1984 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 1985 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 1986 1987 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 1988 (*ill_mdt_capab)->ill_mdt_on = 1; 1989 /* 1990 * Round the following values to the nearest 32-bit; ULP 1991 * may further adjust them to accomodate for additional 1992 * protocol headers. We pass these values to ULP during 1993 * bind time. 
1994 */ 1995 (*ill_mdt_capab)->ill_mdt_hdr_head = 1996 roundup(mdt_ic->mdt_hdr_head, 4); 1997 (*ill_mdt_capab)->ill_mdt_hdr_tail = 1998 roundup(mdt_ic->mdt_hdr_tail, 4); 1999 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 2000 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 2001 2002 ill->ill_capabilities |= ILL_CAPAB_MDT; 2003 } else { 2004 uint_t size; 2005 uchar_t *rptr; 2006 2007 size = sizeof (dl_capability_req_t) + 2008 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2009 2010 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2011 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2012 "could not enable MDT for %s (ENOMEM)\n", 2013 ill->ill_name); 2014 return; 2015 } 2016 2017 rptr = nmp->b_rptr; 2018 /* initialize dl_capability_req_t */ 2019 oc = (dl_capability_req_t *)nmp->b_rptr; 2020 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2021 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2022 sizeof (dl_capab_mdt_t); 2023 nmp->b_rptr += sizeof (dl_capability_req_t); 2024 2025 /* initialize dl_capability_sub_t */ 2026 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2027 nmp->b_rptr += sizeof (*isub); 2028 2029 /* initialize dl_capab_mdt_t */ 2030 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2031 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2032 2033 nmp->b_rptr = rptr; 2034 2035 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2036 "to enable MDT version %d\n", ill->ill_name, 2037 MDT_VERSION_2)); 2038 2039 /* set ENABLE flag */ 2040 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2041 2042 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2043 ill_dlpi_send(ill, nmp); 2044 } 2045 } 2046 2047 static void 2048 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2049 { 2050 mblk_t *mp; 2051 dl_capab_mdt_t *mdt_subcap; 2052 dl_capability_sub_t *dl_subcap; 2053 int size; 2054 2055 if (!ILL_MDT_CAPABLE(ill)) 2056 return; 2057 2058 ASSERT(ill->ill_mdt_capab != NULL); 2059 /* 2060 * Clear the capability flag for MDT but retain 
the ill_mdt_capab 2061 * structure since it's possible that another thread is still 2062 * referring to it. The structure only gets deallocated when 2063 * we destroy the ill. 2064 */ 2065 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2066 2067 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2068 2069 mp = allocb(size, BPRI_HI); 2070 if (mp == NULL) { 2071 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2072 "request to disable MDT\n")); 2073 return; 2074 } 2075 2076 mp->b_wptr = mp->b_rptr + size; 2077 2078 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2079 dl_subcap->dl_cap = DL_CAPAB_MDT; 2080 dl_subcap->dl_length = sizeof (*mdt_subcap); 2081 2082 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2083 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2084 mdt_subcap->mdt_flags = 0; 2085 mdt_subcap->mdt_hdr_head = 0; 2086 mdt_subcap->mdt_hdr_tail = 0; 2087 2088 if (*sc_mp != NULL) 2089 linkb(*sc_mp, mp); 2090 else 2091 *sc_mp = mp; 2092 } 2093 2094 /* 2095 * Send a DL_NOTIFY_REQ to the specified ill to enable 2096 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2097 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2098 * acceleration. 2099 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 2100 */ 2101 static boolean_t 2102 ill_enable_promisc_notify(ill_t *ill) 2103 { 2104 mblk_t *mp; 2105 dl_notify_req_t *req; 2106 2107 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2108 2109 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2110 if (mp == NULL) 2111 return (B_FALSE); 2112 2113 req = (dl_notify_req_t *)mp->b_rptr; 2114 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2115 DL_NOTE_PROMISC_OFF_PHYS; 2116 2117 ill_dlpi_send(ill, mp); 2118 2119 return (B_TRUE); 2120 } 2121 2122 2123 /* 2124 * Allocate an IPsec capability request which will be filled by our 2125 * caller to turn on support for one or more algorithms. 
2126 */ 2127 static mblk_t * 2128 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2129 { 2130 mblk_t *nmp; 2131 dl_capability_req_t *ocap; 2132 dl_capab_ipsec_t *ocip; 2133 dl_capab_ipsec_t *icip; 2134 uint8_t *ptr; 2135 icip = (dl_capab_ipsec_t *)(isub + 1); 2136 2137 /* 2138 * The first time around, we send a DL_NOTIFY_REQ to enable 2139 * PROMISC_ON/OFF notification from the provider. We need to 2140 * do this before enabling the algorithms to avoid leakage of 2141 * cleartext packets. 2142 */ 2143 2144 if (!ill_enable_promisc_notify(ill)) 2145 return (NULL); 2146 2147 /* 2148 * Allocate new mblk which will contain a new capability 2149 * request to enable the capabilities. 2150 */ 2151 2152 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2153 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2154 if (nmp == NULL) 2155 return (NULL); 2156 2157 ptr = nmp->b_rptr; 2158 2159 /* initialize dl_capability_req_t */ 2160 ocap = (dl_capability_req_t *)ptr; 2161 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2162 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2163 ptr += sizeof (dl_capability_req_t); 2164 2165 /* initialize dl_capability_sub_t */ 2166 bcopy(isub, ptr, sizeof (*isub)); 2167 ptr += sizeof (*isub); 2168 2169 /* initialize dl_capab_ipsec_t */ 2170 ocip = (dl_capab_ipsec_t *)ptr; 2171 bcopy(icip, ocip, sizeof (*icip)); 2172 2173 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2174 return (nmp); 2175 } 2176 2177 /* 2178 * Process an IPsec capability negotiation ack received from a DLS Provider. 2179 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2180 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2181 */ 2182 static void 2183 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2184 { 2185 dl_capab_ipsec_t *icip; 2186 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2187 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. 
*/ 2188 uint_t cipher, nciphers; 2189 mblk_t *nmp; 2190 uint_t alg_len; 2191 boolean_t need_sadb_dump; 2192 uint_t sub_dl_cap = isub->dl_cap; 2193 ill_ipsec_capab_t **ill_capab; 2194 uint64_t ill_capab_flag; 2195 uint8_t *capend, *ciphend; 2196 boolean_t sadb_resync; 2197 2198 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2199 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2200 2201 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2202 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2203 ill_capab_flag = ILL_CAPAB_AH; 2204 } else { 2205 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2206 ill_capab_flag = ILL_CAPAB_ESP; 2207 } 2208 2209 /* 2210 * If the ill capability structure exists, then this incoming 2211 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2212 * If this is so, then we'd need to resynchronize the SADB 2213 * after re-enabling the offloaded ciphers. 2214 */ 2215 sadb_resync = (*ill_capab != NULL); 2216 2217 /* 2218 * Note: range checks here are not absolutely sufficient to 2219 * make us robust against malformed messages sent by drivers; 2220 * this is in keeping with the rest of IP's dlpi handling. 2221 * (Remember, it's coming from something else in the kernel 2222 * address space) 2223 */ 2224 2225 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2226 if (capend > mp->b_wptr) { 2227 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2228 "malformed sub-capability too long for mblk"); 2229 return; 2230 } 2231 2232 /* 2233 * There are two types of acks we process here: 2234 * 1. acks in reply to a (first form) generic capability req 2235 * (no ENABLE flag set) 2236 * 2. acks in reply to a ENABLE capability req. 
2237 * (ENABLE flag set) 2238 * 2239 * We process the subcapability passed as argument as follows: 2240 * 1 do initializations 2241 * 1.1 initialize nmp = NULL 2242 * 1.2 set need_sadb_dump to B_FALSE 2243 * 2 for each cipher in subcapability: 2244 * 2.1 if ENABLE flag is set: 2245 * 2.1.1 update per-ill ipsec capabilities info 2246 * 2.1.2 set need_sadb_dump to B_TRUE 2247 * 2.2 if ENABLE flag is not set: 2248 * 2.2.1 if nmp is NULL: 2249 * 2.2.1.1 allocate and initialize nmp 2250 * 2.2.1.2 init current pos in nmp 2251 * 2.2.2 copy current cipher to current pos in nmp 2252 * 2.2.3 set ENABLE flag in nmp 2253 * 2.2.4 update current pos 2254 * 3 if nmp is not equal to NULL, send enable request 2255 * 3.1 send capability request 2256 * 4 if need_sadb_dump is B_TRUE 2257 * 4.1 enable promiscuous on/off notifications 2258 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2259 * AH or ESP SA's to interface. 2260 */ 2261 2262 nmp = NULL; 2263 oalg = NULL; 2264 need_sadb_dump = B_FALSE; 2265 icip = (dl_capab_ipsec_t *)(isub + 1); 2266 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2267 2268 nciphers = icip->cip_nciphers; 2269 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2270 2271 if (ciphend > capend) { 2272 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2273 "too many ciphers for sub-capability len"); 2274 return; 2275 } 2276 2277 for (cipher = 0; cipher < nciphers; cipher++) { 2278 alg_len = sizeof (dl_capab_ipsec_alg_t); 2279 2280 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2281 /* 2282 * TBD: when we provide a way to disable capabilities 2283 * from above, need to manage the request-pending state 2284 * and fail if we were not expecting this ACK. 
2285 */ 2286 IPSECHW_DEBUG(IPSECHW_CAPAB, 2287 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2288 2289 /* 2290 * Update IPsec capabilities for this ill 2291 */ 2292 2293 if (*ill_capab == NULL) { 2294 IPSECHW_DEBUG(IPSECHW_CAPAB, 2295 ("ill_capability_ipsec_ack: " 2296 "allocating ipsec_capab for ill\n")); 2297 *ill_capab = ill_ipsec_capab_alloc(); 2298 2299 if (*ill_capab == NULL) { 2300 cmn_err(CE_WARN, 2301 "ill_capability_ipsec_ack: " 2302 "could not enable IPsec Hardware " 2303 "acceleration for %s (ENOMEM)\n", 2304 ill->ill_name); 2305 return; 2306 } 2307 } 2308 2309 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2310 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2311 2312 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2313 cmn_err(CE_WARN, 2314 "ill_capability_ipsec_ack: " 2315 "malformed IPsec algorithm id %d", 2316 ialg->alg_prim); 2317 continue; 2318 } 2319 2320 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2321 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2322 ialg->alg_prim); 2323 } else { 2324 ipsec_capab_algparm_t *alp; 2325 2326 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2327 ialg->alg_prim); 2328 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2329 ialg->alg_prim)) { 2330 cmn_err(CE_WARN, 2331 "ill_capability_ipsec_ack: " 2332 "no space for IPsec alg id %d", 2333 ialg->alg_prim); 2334 continue; 2335 } 2336 alp = &((*ill_capab)->encr_algparm[ 2337 ialg->alg_prim]); 2338 alp->minkeylen = ialg->alg_minbits; 2339 alp->maxkeylen = ialg->alg_maxbits; 2340 } 2341 ill->ill_capabilities |= ill_capab_flag; 2342 /* 2343 * indicate that a capability was enabled, which 2344 * will be used below to kick off a SADB dump 2345 * to the ill. 
2346 */ 2347 need_sadb_dump = B_TRUE; 2348 } else { 2349 IPSECHW_DEBUG(IPSECHW_CAPAB, 2350 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2351 ialg->alg_prim)); 2352 2353 if (nmp == NULL) { 2354 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2355 if (nmp == NULL) { 2356 /* 2357 * Sending the PROMISC_ON/OFF 2358 * notification request failed. 2359 * We cannot enable the algorithms 2360 * since the Provider will not 2361 * notify IP of promiscous mode 2362 * changes, which could lead 2363 * to leakage of packets. 2364 */ 2365 cmn_err(CE_WARN, 2366 "ill_capability_ipsec_ack: " 2367 "could not enable IPsec Hardware " 2368 "acceleration for %s (ENOMEM)\n", 2369 ill->ill_name); 2370 return; 2371 } 2372 /* ptr to current output alg specifier */ 2373 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2374 } 2375 2376 /* 2377 * Copy current alg specifier, set ENABLE 2378 * flag, and advance to next output alg. 2379 * For now we enable all IPsec capabilities. 2380 */ 2381 ASSERT(oalg != NULL); 2382 bcopy(ialg, oalg, alg_len); 2383 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2384 nmp->b_wptr += alg_len; 2385 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2386 } 2387 2388 /* move to next input algorithm specifier */ 2389 ialg = (dl_capab_ipsec_alg_t *) 2390 ((char *)ialg + alg_len); 2391 } 2392 2393 if (nmp != NULL) 2394 /* 2395 * nmp points to a DL_CAPABILITY_REQ message to enable 2396 * IPsec hardware acceleration. 2397 */ 2398 ill_dlpi_send(ill, nmp); 2399 2400 if (need_sadb_dump) 2401 /* 2402 * An acknowledgement corresponding to a request to 2403 * enable acceleration was received, notify SADB. 2404 */ 2405 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2406 } 2407 2408 /* 2409 * Given an mblk with enough space in it, create sub-capability entries for 2410 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2411 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2412 * in preparation for the reset the DL_CAPABILITY_REQ message. 
2413 */ 2414 static void 2415 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2416 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2417 { 2418 dl_capab_ipsec_t *oipsec; 2419 dl_capab_ipsec_alg_t *oalg; 2420 dl_capability_sub_t *dl_subcap; 2421 int i, k; 2422 2423 ASSERT(nciphers > 0); 2424 ASSERT(ill_cap != NULL); 2425 ASSERT(mp != NULL); 2426 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2427 2428 /* dl_capability_sub_t for "stype" */ 2429 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2430 dl_subcap->dl_cap = stype; 2431 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2432 mp->b_wptr += sizeof (dl_capability_sub_t); 2433 2434 /* dl_capab_ipsec_t for "stype" */ 2435 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2436 oipsec->cip_version = 1; 2437 oipsec->cip_nciphers = nciphers; 2438 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2439 2440 /* create entries for "stype" AUTH ciphers */ 2441 for (i = 0; i < ill_cap->algs_size; i++) { 2442 for (k = 0; k < BITSPERBYTE; k++) { 2443 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2444 continue; 2445 2446 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2447 bzero((void *)oalg, sizeof (*oalg)); 2448 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2449 oalg->alg_prim = k + (BITSPERBYTE * i); 2450 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2451 } 2452 } 2453 /* create entries for "stype" ENCR ciphers */ 2454 for (i = 0; i < ill_cap->algs_size; i++) { 2455 for (k = 0; k < BITSPERBYTE; k++) { 2456 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2457 continue; 2458 2459 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2460 bzero((void *)oalg, sizeof (*oalg)); 2461 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2462 oalg->alg_prim = k + (BITSPERBYTE * i); 2463 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2464 } 2465 } 2466 } 2467 2468 /* 2469 * Macro to count number of 1s in a byte (8-bit word). The total count is 2470 * accumulated into the passed-in argument (sum). 
We could use SPARCv9's 2471 * POPC instruction, but our macro is more flexible for an arbitrary length 2472 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2473 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2474 * stays that way, we can reduce the number of iterations required. 2475 */ 2476 #define COUNT_1S(val, sum) { \ 2477 uint8_t x = val & 0xff; \ 2478 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2479 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2480 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2481 } 2482 2483 /* ARGSUSED */ 2484 static void 2485 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2486 { 2487 mblk_t *mp; 2488 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2489 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2490 uint64_t ill_capabilities = ill->ill_capabilities; 2491 int ah_cnt = 0, esp_cnt = 0; 2492 int ah_len = 0, esp_len = 0; 2493 int i, size = 0; 2494 2495 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2496 return; 2497 2498 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2499 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2500 2501 /* Find out the number of ciphers for AH */ 2502 if (cap_ah != NULL) { 2503 for (i = 0; i < cap_ah->algs_size; i++) { 2504 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2505 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2506 } 2507 if (ah_cnt > 0) { 2508 size += sizeof (dl_capability_sub_t) + 2509 sizeof (dl_capab_ipsec_t); 2510 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2511 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2512 size += ah_len; 2513 } 2514 } 2515 2516 /* Find out the number of ciphers for ESP */ 2517 if (cap_esp != NULL) { 2518 for (i = 0; i < cap_esp->algs_size; i++) { 2519 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2520 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2521 } 2522 if (esp_cnt > 0) { 2523 size += sizeof (dl_capability_sub_t) + 2524 sizeof (dl_capab_ipsec_t); 2525 /* 
dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2526 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2527 size += esp_len; 2528 } 2529 } 2530 2531 if (size == 0) { 2532 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2533 "there's nothing to reset\n")); 2534 return; 2535 } 2536 2537 mp = allocb(size, BPRI_HI); 2538 if (mp == NULL) { 2539 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2540 "request to disable IPSEC Hardware Acceleration\n")); 2541 return; 2542 } 2543 2544 /* 2545 * Clear the capability flags for IPSec HA but retain the ill 2546 * capability structures since it's possible that another thread 2547 * is still referring to them. The structures only get deallocated 2548 * when we destroy the ill. 2549 * 2550 * Various places check the flags to see if the ill is capable of 2551 * hardware acceleration, and by clearing them we ensure that new 2552 * outbound IPSec packets are sent down encrypted. 2553 */ 2554 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2555 2556 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2557 if (ah_cnt > 0) { 2558 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2559 cap_ah, mp); 2560 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2561 } 2562 2563 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2564 if (esp_cnt > 0) { 2565 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2566 cap_esp, mp); 2567 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2568 } 2569 2570 /* 2571 * At this point we've composed a bunch of sub-capabilities to be 2572 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2573 * by the caller. Upon receiving this reset message, the driver 2574 * must stop inbound decryption (by destroying all inbound SAs) 2575 * and let the corresponding packets come in encrypted. 
2576 */ 2577 2578 if (*sc_mp != NULL) 2579 linkb(*sc_mp, mp); 2580 else 2581 *sc_mp = mp; 2582 } 2583 2584 static void 2585 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2586 boolean_t encapsulated) 2587 { 2588 boolean_t legacy = B_FALSE; 2589 2590 /* 2591 * If this DL_CAPABILITY_ACK came in as a response to our "reset" 2592 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just 2593 * instructed the driver to disable its advertised capabilities, 2594 * so there's no point in accepting any response at this moment. 2595 */ 2596 if (ill->ill_capab_state == IDMS_UNKNOWN) 2597 return; 2598 2599 /* 2600 * Note that only the following two sub-capabilities may be 2601 * considered as "legacy", since their original definitions 2602 * do not incorporate the dl_mid_t module ID token, and hence 2603 * may require the use of the wrapper sub-capability. 2604 */ 2605 switch (subp->dl_cap) { 2606 case DL_CAPAB_IPSEC_AH: 2607 case DL_CAPAB_IPSEC_ESP: 2608 legacy = B_TRUE; 2609 break; 2610 } 2611 2612 /* 2613 * For legacy sub-capabilities which don't incorporate a queue_t 2614 * pointer in their structures, discard them if we detect that 2615 * there are intermediate modules in between IP and the driver. 
2616 */ 2617 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2618 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2619 "%d discarded; %d module(s) present below IP\n", 2620 subp->dl_cap, ill->ill_lmod_cnt)); 2621 return; 2622 } 2623 2624 switch (subp->dl_cap) { 2625 case DL_CAPAB_IPSEC_AH: 2626 case DL_CAPAB_IPSEC_ESP: 2627 ill_capability_ipsec_ack(ill, mp, subp); 2628 break; 2629 case DL_CAPAB_MDT: 2630 ill_capability_mdt_ack(ill, mp, subp); 2631 break; 2632 case DL_CAPAB_HCKSUM: 2633 ill_capability_hcksum_ack(ill, mp, subp); 2634 break; 2635 case DL_CAPAB_ZEROCOPY: 2636 ill_capability_zerocopy_ack(ill, mp, subp); 2637 break; 2638 case DL_CAPAB_POLL: 2639 if (!SOFT_RINGS_ENABLED()) 2640 ill_capability_dls_ack(ill, mp, subp); 2641 break; 2642 case DL_CAPAB_SOFT_RING: 2643 if (SOFT_RINGS_ENABLED()) 2644 ill_capability_dls_ack(ill, mp, subp); 2645 break; 2646 default: 2647 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2648 subp->dl_cap)); 2649 } 2650 } 2651 2652 /* 2653 * As part of negotiating polling capability, the driver tells us 2654 * the default (or normal) blanking interval and packet threshold 2655 * (the receive timer fires if blanking interval is reached or 2656 * the packet threshold is reached). 2657 * 2658 * As part of manipulating the polling interval, we always use our 2659 * estimated interval (avg service time * number of packets queued 2660 * on the squeue) but we try to blank for a minimum of 2661 * rr_normal_blank_time * rr_max_blank_ratio. We disable the 2662 * packet threshold during this time. When we are not in polling mode 2663 * we set the blank interval typically lower, rr_normal_pkt_cnt * 2664 * rr_min_blank_ratio but up the packet cnt by a ratio of 2665 * rr_min_pkt_cnt_ratio so that we are still getting chains if 2666 * possible although for a shorter interval. 
2667 */ 2668 #define RR_MAX_BLANK_RATIO 20 2669 #define RR_MIN_BLANK_RATIO 10 2670 #define RR_MAX_PKT_CNT_RATIO 3 2671 #define RR_MIN_PKT_CNT_RATIO 3 2672 2673 /* 2674 * These can be tuned via /etc/system. 2675 */ 2676 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; 2677 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; 2678 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; 2679 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; 2680 2681 static mac_resource_handle_t 2682 ill_ring_add(void *arg, mac_resource_t *mrp) 2683 { 2684 ill_t *ill = (ill_t *)arg; 2685 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2686 ill_rx_ring_t *rx_ring; 2687 int ip_rx_index; 2688 2689 ASSERT(mrp != NULL); 2690 if (mrp->mr_type != MAC_RX_FIFO) { 2691 return (NULL); 2692 } 2693 ASSERT(ill != NULL); 2694 ASSERT(ill->ill_dls_capab != NULL); 2695 2696 mutex_enter(&ill->ill_lock); 2697 for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { 2698 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; 2699 ASSERT(rx_ring != NULL); 2700 2701 if (rx_ring->rr_ring_state == ILL_RING_FREE) { 2702 time_t normal_blank_time = 2703 mrfp->mrf_normal_blank_time; 2704 uint_t normal_pkt_cnt = 2705 mrfp->mrf_normal_pkt_count; 2706 2707 bzero(rx_ring, sizeof (ill_rx_ring_t)); 2708 2709 rx_ring->rr_blank = mrfp->mrf_blank; 2710 rx_ring->rr_handle = mrfp->mrf_arg; 2711 rx_ring->rr_ill = ill; 2712 rx_ring->rr_normal_blank_time = normal_blank_time; 2713 rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; 2714 2715 rx_ring->rr_max_blank_time = 2716 normal_blank_time * rr_max_blank_ratio; 2717 rx_ring->rr_min_blank_time = 2718 normal_blank_time * rr_min_blank_ratio; 2719 rx_ring->rr_max_pkt_cnt = 2720 normal_pkt_cnt * rr_max_pkt_cnt_ratio; 2721 rx_ring->rr_min_pkt_cnt = 2722 normal_pkt_cnt * rr_min_pkt_cnt_ratio; 2723 2724 rx_ring->rr_ring_state = ILL_RING_INUSE; 2725 mutex_exit(&ill->ill_lock); 2726 2727 DTRACE_PROBE2(ill__ring__add, (void *), ill, 2728 (int), ip_rx_index); 2729 return 
((mac_resource_handle_t)rx_ring); 2730 } 2731 } 2732 2733 /* 2734 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If 2735 * we have devices which can overwhelm this limit, ILL_MAX_RING 2736 * should be made configurable. Meanwhile it cause no panic because 2737 * driver will pass ip_input a NULL handle which will make 2738 * IP allocate the default squeue and Polling mode will not 2739 * be used for this ring. 2740 */ 2741 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2742 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2743 2744 mutex_exit(&ill->ill_lock); 2745 return (NULL); 2746 } 2747 2748 static boolean_t 2749 ill_capability_dls_init(ill_t *ill) 2750 { 2751 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2752 conn_t *connp; 2753 size_t sz; 2754 2755 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2756 if (ill_dls == NULL) { 2757 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2758 "soft_ring enabled for ill=%s (%p) but data " 2759 "structs uninitialized\n", ill->ill_name, 2760 (void *)ill); 2761 } 2762 return (B_TRUE); 2763 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2764 if (ill_dls == NULL) { 2765 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2766 "polling enabled for ill=%s (%p) but data " 2767 "structs uninitialized\n", ill->ill_name, 2768 (void *)ill); 2769 } 2770 return (B_TRUE); 2771 } 2772 2773 if (ill_dls != NULL) { 2774 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2775 /* Soft_Ring or polling is being re-enabled */ 2776 2777 connp = ill_dls->ill_unbind_conn; 2778 ASSERT(rx_ring != NULL); 2779 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2780 bzero((void *)rx_ring, 2781 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2782 ill_dls->ill_ring_tbl = rx_ring; 2783 ill_dls->ill_unbind_conn = connp; 2784 return (B_TRUE); 2785 } 2786 2787 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 2788 return (B_FALSE); 2789 2790 sz = sizeof (ill_dls_capab_t); 2791 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2792 2793 
ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2794 if (ill_dls == NULL) { 2795 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2796 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2797 (void *)ill); 2798 CONN_DEC_REF(connp); 2799 return (B_FALSE); 2800 } 2801 2802 /* Allocate space to hold ring table */ 2803 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2804 ill->ill_dls_capab = ill_dls; 2805 ill_dls->ill_unbind_conn = connp; 2806 return (B_TRUE); 2807 } 2808 2809 /* 2810 * ill_capability_dls_disable: disable soft_ring and/or polling 2811 * capability. Since any of the rings might already be in use, need 2812 * to call ipsq_clean_all() which gets behind the squeue to disable 2813 * direct calls if necessary. 2814 */ 2815 static void 2816 ill_capability_dls_disable(ill_t *ill) 2817 { 2818 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2819 2820 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2821 ipsq_clean_all(ill); 2822 ill_dls->ill_tx = NULL; 2823 ill_dls->ill_tx_handle = NULL; 2824 ill_dls->ill_dls_change_status = NULL; 2825 ill_dls->ill_dls_bind = NULL; 2826 ill_dls->ill_dls_unbind = NULL; 2827 } 2828 2829 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2830 } 2831 2832 static void 2833 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2834 dl_capability_sub_t *isub) 2835 { 2836 uint_t size; 2837 uchar_t *rptr; 2838 dl_capab_dls_t dls, *odls; 2839 ill_dls_capab_t *ill_dls; 2840 mblk_t *nmp = NULL; 2841 dl_capability_req_t *ocap; 2842 uint_t sub_dl_cap = isub->dl_cap; 2843 2844 if (!ill_capability_dls_init(ill)) 2845 return; 2846 ill_dls = ill->ill_dls_capab; 2847 2848 /* Copy locally to get the members aligned */ 2849 bcopy((void *)idls, (void *)&dls, 2850 sizeof (dl_capab_dls_t)); 2851 2852 /* Get the tx function and handle from dld */ 2853 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2854 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2855 2856 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2857 ill_dls->ill_dls_change_status = 2858 
(ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2859 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2860 ill_dls->ill_dls_unbind = 2861 (ip_dls_unbind_t)dls.dls_ring_unbind; 2862 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2863 } 2864 2865 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2866 isub->dl_length; 2867 2868 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2869 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2870 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2871 ill->ill_name, (void *)ill); 2872 return; 2873 } 2874 2875 /* initialize dl_capability_req_t */ 2876 rptr = nmp->b_rptr; 2877 ocap = (dl_capability_req_t *)rptr; 2878 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2879 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2880 rptr += sizeof (dl_capability_req_t); 2881 2882 /* initialize dl_capability_sub_t */ 2883 bcopy(isub, rptr, sizeof (*isub)); 2884 rptr += sizeof (*isub); 2885 2886 odls = (dl_capab_dls_t *)rptr; 2887 rptr += sizeof (dl_capab_dls_t); 2888 2889 /* initialize dl_capab_dls_t to be sent down */ 2890 dls.dls_rx_handle = (uintptr_t)ill; 2891 dls.dls_rx = (uintptr_t)ip_input; 2892 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2893 2894 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2895 dls.dls_ring_cnt = ip_soft_rings_cnt; 2896 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2897 dls.dls_flags = SOFT_RING_ENABLE; 2898 } else { 2899 dls.dls_flags = POLL_ENABLE; 2900 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2901 "to enable polling\n", ill->ill_name)); 2902 } 2903 bcopy((void *)&dls, (void *)odls, 2904 sizeof (dl_capab_dls_t)); 2905 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2906 /* 2907 * nmp points to a DL_CAPABILITY_REQ message to 2908 * enable either soft_ring or polling 2909 */ 2910 ill_dlpi_send(ill, nmp); 2911 } 2912 2913 static void 2914 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2915 { 2916 mblk_t *mp; 2917 
dl_capab_dls_t *idls; 2918 dl_capability_sub_t *dl_subcap; 2919 int size; 2920 2921 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2922 return; 2923 2924 ASSERT(ill->ill_dls_capab != NULL); 2925 2926 size = sizeof (*dl_subcap) + sizeof (*idls); 2927 2928 mp = allocb(size, BPRI_HI); 2929 if (mp == NULL) { 2930 ip1dbg(("ill_capability_dls_reset: unable to allocate " 2931 "request to disable soft_ring\n")); 2932 return; 2933 } 2934 2935 mp->b_wptr = mp->b_rptr + size; 2936 2937 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2938 dl_subcap->dl_length = sizeof (*idls); 2939 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2940 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; 2941 else 2942 dl_subcap->dl_cap = DL_CAPAB_POLL; 2943 2944 idls = (dl_capab_dls_t *)(dl_subcap + 1); 2945 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2946 idls->dls_flags = SOFT_RING_DISABLE; 2947 else 2948 idls->dls_flags = POLL_DISABLE; 2949 2950 if (*sc_mp != NULL) 2951 linkb(*sc_mp, mp); 2952 else 2953 *sc_mp = mp; 2954 } 2955 2956 /* 2957 * Process a soft_ring/poll capability negotiation ack received 2958 * from a DLS Provider.isub must point to the sub-capability 2959 * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. 2960 */ 2961 static void 2962 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2963 { 2964 dl_capab_dls_t *idls; 2965 uint_t sub_dl_cap = isub->dl_cap; 2966 uint8_t *capend; 2967 2968 ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || 2969 sub_dl_cap == DL_CAPAB_POLL); 2970 2971 if (ill->ill_isv6) 2972 return; 2973 2974 /* 2975 * Note: range checks here are not absolutely sufficient to 2976 * make us robust against malformed messages sent by drivers; 2977 * this is in keeping with the rest of IP's dlpi handling. 
2978 * (Remember, it's coming from something else in the kernel 2979 * address space) 2980 */ 2981 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2982 if (capend > mp->b_wptr) { 2983 cmn_err(CE_WARN, "ill_capability_dls_ack: " 2984 "malformed sub-capability too long for mblk"); 2985 return; 2986 } 2987 2988 /* 2989 * There are two types of acks we process here: 2990 * 1. acks in reply to a (first form) generic capability req 2991 * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) 2992 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE 2993 * capability req. 2994 */ 2995 idls = (dl_capab_dls_t *)(isub + 1); 2996 2997 if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { 2998 ip1dbg(("ill_capability_dls_ack: mid token for dls " 2999 "capability isn't as expected; pass-thru " 3000 "module(s) detected, discarding capability\n")); 3001 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 3002 /* 3003 * This is a capability renegotitation case. 3004 * The interface better be unusable at this 3005 * point other wise bad things will happen 3006 * if we disable direct calls on a running 3007 * and up interface. 3008 */ 3009 ill_capability_dls_disable(ill); 3010 } 3011 return; 3012 } 3013 3014 switch (idls->dls_flags) { 3015 default: 3016 /* Disable if unknown flag */ 3017 case SOFT_RING_DISABLE: 3018 case POLL_DISABLE: 3019 ill_capability_dls_disable(ill); 3020 break; 3021 case SOFT_RING_CAPABLE: 3022 case POLL_CAPABLE: 3023 /* 3024 * If the capability was already enabled, its safe 3025 * to disable it first to get rid of stale information 3026 * and then start enabling it again. 
3027 */ 3028 ill_capability_dls_disable(ill); 3029 ill_capability_dls_capable(ill, idls, isub); 3030 break; 3031 case SOFT_RING_ENABLE: 3032 case POLL_ENABLE: 3033 mutex_enter(&ill->ill_lock); 3034 if (sub_dl_cap == DL_CAPAB_SOFT_RING && 3035 !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { 3036 ASSERT(ill->ill_dls_capab != NULL); 3037 ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; 3038 } 3039 if (sub_dl_cap == DL_CAPAB_POLL && 3040 !(ill->ill_capabilities & ILL_CAPAB_POLL)) { 3041 ASSERT(ill->ill_dls_capab != NULL); 3042 ill->ill_capabilities |= ILL_CAPAB_POLL; 3043 ip1dbg(("ill_capability_dls_ack: interface %s " 3044 "has enabled polling\n", ill->ill_name)); 3045 } 3046 mutex_exit(&ill->ill_lock); 3047 break; 3048 } 3049 } 3050 3051 /* 3052 * Process a hardware checksum offload capability negotiation ack received 3053 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 3054 * of a DL_CAPABILITY_ACK message. 3055 */ 3056 static void 3057 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3058 { 3059 dl_capability_req_t *ocap; 3060 dl_capab_hcksum_t *ihck, *ohck; 3061 ill_hcksum_capab_t **ill_hcksum; 3062 mblk_t *nmp = NULL; 3063 uint_t sub_dl_cap = isub->dl_cap; 3064 uint8_t *capend; 3065 3066 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 3067 3068 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 3069 3070 /* 3071 * Note: range checks here are not absolutely sufficient to 3072 * make us robust against malformed messages sent by drivers; 3073 * this is in keeping with the rest of IP's dlpi handling. 3074 * (Remember, it's coming from something else in the kernel 3075 * address space) 3076 */ 3077 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3078 if (capend > mp->b_wptr) { 3079 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3080 "malformed sub-capability too long for mblk"); 3081 return; 3082 } 3083 3084 /* 3085 * There are two types of acks we process here: 3086 * 1. 
acks in reply to a (first form) generic capability req 3087 * (no ENABLE flag set) 3088 * 2. acks in reply to a ENABLE capability req. 3089 * (ENABLE flag set) 3090 */ 3091 ihck = (dl_capab_hcksum_t *)(isub + 1); 3092 3093 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3094 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3095 "unsupported hardware checksum " 3096 "sub-capability (version %d, expected %d)", 3097 ihck->hcksum_version, HCKSUM_VERSION_1); 3098 return; 3099 } 3100 3101 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3102 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3103 "checksum capability isn't as expected; pass-thru " 3104 "module(s) detected, discarding capability\n")); 3105 return; 3106 } 3107 3108 #define CURR_HCKSUM_CAPAB \ 3109 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3110 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3111 3112 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3113 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3114 /* do ENABLE processing */ 3115 if (*ill_hcksum == NULL) { 3116 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3117 KM_NOSLEEP); 3118 3119 if (*ill_hcksum == NULL) { 3120 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3121 "could not enable hcksum version %d " 3122 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3123 ill->ill_name); 3124 return; 3125 } 3126 } 3127 3128 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3129 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3130 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3131 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3132 "has enabled hardware checksumming\n ", 3133 ill->ill_name)); 3134 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3135 /* 3136 * Enabling hardware checksum offload 3137 * Currently IP supports {TCP,UDP}/IPv4 3138 * partial and full cksum offload and 3139 * IPv4 header checksum offload. 
3140 * Allocate new mblk which will 3141 * contain a new capability request 3142 * to enable hardware checksum offload. 3143 */ 3144 uint_t size; 3145 uchar_t *rptr; 3146 3147 size = sizeof (dl_capability_req_t) + 3148 sizeof (dl_capability_sub_t) + isub->dl_length; 3149 3150 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3151 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3152 "could not enable hardware cksum for %s (ENOMEM)\n", 3153 ill->ill_name); 3154 return; 3155 } 3156 3157 rptr = nmp->b_rptr; 3158 /* initialize dl_capability_req_t */ 3159 ocap = (dl_capability_req_t *)nmp->b_rptr; 3160 ocap->dl_sub_offset = 3161 sizeof (dl_capability_req_t); 3162 ocap->dl_sub_length = 3163 sizeof (dl_capability_sub_t) + 3164 isub->dl_length; 3165 nmp->b_rptr += sizeof (dl_capability_req_t); 3166 3167 /* initialize dl_capability_sub_t */ 3168 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3169 nmp->b_rptr += sizeof (*isub); 3170 3171 /* initialize dl_capab_hcksum_t */ 3172 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3173 bcopy(ihck, ohck, sizeof (*ihck)); 3174 3175 nmp->b_rptr = rptr; 3176 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3177 3178 /* Set ENABLE flag */ 3179 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3180 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3181 3182 /* 3183 * nmp points to a DL_CAPABILITY_REQ message to enable 3184 * hardware checksum acceleration. 
3185 */ 3186 ill_dlpi_send(ill, nmp); 3187 } else { 3188 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3189 "advertised %x hardware checksum capability flags\n", 3190 ill->ill_name, ihck->hcksum_txflags)); 3191 } 3192 } 3193 3194 static void 3195 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3196 { 3197 mblk_t *mp; 3198 dl_capab_hcksum_t *hck_subcap; 3199 dl_capability_sub_t *dl_subcap; 3200 int size; 3201 3202 if (!ILL_HCKSUM_CAPABLE(ill)) 3203 return; 3204 3205 ASSERT(ill->ill_hcksum_capab != NULL); 3206 /* 3207 * Clear the capability flag for hardware checksum offload but 3208 * retain the ill_hcksum_capab structure since it's possible that 3209 * another thread is still referring to it. The structure only 3210 * gets deallocated when we destroy the ill. 3211 */ 3212 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3213 3214 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3215 3216 mp = allocb(size, BPRI_HI); 3217 if (mp == NULL) { 3218 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3219 "request to disable hardware checksum offload\n")); 3220 return; 3221 } 3222 3223 mp->b_wptr = mp->b_rptr + size; 3224 3225 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3226 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3227 dl_subcap->dl_length = sizeof (*hck_subcap); 3228 3229 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3230 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3231 hck_subcap->hcksum_txflags = 0; 3232 3233 if (*sc_mp != NULL) 3234 linkb(*sc_mp, mp); 3235 else 3236 *sc_mp = mp; 3237 } 3238 3239 static void 3240 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3241 { 3242 mblk_t *nmp = NULL; 3243 dl_capability_req_t *oc; 3244 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3245 ill_zerocopy_capab_t **ill_zerocopy_capab; 3246 uint_t sub_dl_cap = isub->dl_cap; 3247 uint8_t *capend; 3248 3249 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3250 3251 ill_zerocopy_capab = (ill_zerocopy_capab_t 
**)&ill->ill_zerocopy_capab; 3252 3253 /* 3254 * Note: range checks here are not absolutely sufficient to 3255 * make us robust against malformed messages sent by drivers; 3256 * this is in keeping with the rest of IP's dlpi handling. 3257 * (Remember, it's coming from something else in the kernel 3258 * address space) 3259 */ 3260 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3261 if (capend > mp->b_wptr) { 3262 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3263 "malformed sub-capability too long for mblk"); 3264 return; 3265 } 3266 3267 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3268 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3269 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3270 "unsupported ZEROCOPY sub-capability (version %d, " 3271 "expected %d)", zc_ic->zerocopy_version, 3272 ZEROCOPY_VERSION_1); 3273 return; 3274 } 3275 3276 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3277 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3278 "capability isn't as expected; pass-thru module(s) " 3279 "detected, discarding capability\n")); 3280 return; 3281 } 3282 3283 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3284 if (*ill_zerocopy_capab == NULL) { 3285 *ill_zerocopy_capab = 3286 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3287 KM_NOSLEEP); 3288 3289 if (*ill_zerocopy_capab == NULL) { 3290 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3291 "could not enable Zero-copy version %d " 3292 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3293 ill->ill_name); 3294 return; 3295 } 3296 } 3297 3298 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3299 "supports Zero-copy version %d\n", ill->ill_name, 3300 ZEROCOPY_VERSION_1)); 3301 3302 (*ill_zerocopy_capab)->ill_zerocopy_version = 3303 zc_ic->zerocopy_version; 3304 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3305 zc_ic->zerocopy_flags; 3306 3307 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3308 } else { 3309 uint_t size; 3310 uchar_t *rptr; 3311 3312 size = sizeof 
(dl_capability_req_t) + 3313 sizeof (dl_capability_sub_t) + 3314 sizeof (dl_capab_zerocopy_t); 3315 3316 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3317 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3318 "could not enable zerocopy for %s (ENOMEM)\n", 3319 ill->ill_name); 3320 return; 3321 } 3322 3323 rptr = nmp->b_rptr; 3324 /* initialize dl_capability_req_t */ 3325 oc = (dl_capability_req_t *)rptr; 3326 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3327 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3328 sizeof (dl_capab_zerocopy_t); 3329 rptr += sizeof (dl_capability_req_t); 3330 3331 /* initialize dl_capability_sub_t */ 3332 bcopy(isub, rptr, sizeof (*isub)); 3333 rptr += sizeof (*isub); 3334 3335 /* initialize dl_capab_zerocopy_t */ 3336 zc_oc = (dl_capab_zerocopy_t *)rptr; 3337 *zc_oc = *zc_ic; 3338 3339 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3340 "to enable zero-copy version %d\n", ill->ill_name, 3341 ZEROCOPY_VERSION_1)); 3342 3343 /* set VMSAFE_MEM flag */ 3344 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3345 3346 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3347 ill_dlpi_send(ill, nmp); 3348 } 3349 } 3350 3351 static void 3352 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3353 { 3354 mblk_t *mp; 3355 dl_capab_zerocopy_t *zerocopy_subcap; 3356 dl_capability_sub_t *dl_subcap; 3357 int size; 3358 3359 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3360 return; 3361 3362 ASSERT(ill->ill_zerocopy_capab != NULL); 3363 /* 3364 * Clear the capability flag for Zero-copy but retain the 3365 * ill_zerocopy_capab structure since it's possible that another 3366 * thread is still referring to it. The structure only gets 3367 * deallocated when we destroy the ill. 
3368 */ 3369 ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; 3370 3371 size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 3372 3373 mp = allocb(size, BPRI_HI); 3374 if (mp == NULL) { 3375 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " 3376 "request to disable Zero-copy\n")); 3377 return; 3378 } 3379 3380 mp->b_wptr = mp->b_rptr + size; 3381 3382 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3383 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 3384 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 3385 3386 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 3387 zerocopy_subcap->zerocopy_version = 3388 ill->ill_zerocopy_capab->ill_zerocopy_version; 3389 zerocopy_subcap->zerocopy_flags = 0; 3390 3391 if (*sc_mp != NULL) 3392 linkb(*sc_mp, mp); 3393 else 3394 *sc_mp = mp; 3395 } 3396 3397 /* 3398 * Consume a new-style hardware capabilities negotiation ack. 3399 * Called from ip_rput_dlpi_writer(). 3400 */ 3401 void 3402 ill_capability_ack(ill_t *ill, mblk_t *mp) 3403 { 3404 dl_capability_ack_t *capp; 3405 dl_capability_sub_t *subp, *endp; 3406 3407 if (ill->ill_capab_state == IDMS_INPROGRESS) 3408 ill->ill_capab_state = IDMS_OK; 3409 3410 capp = (dl_capability_ack_t *)mp->b_rptr; 3411 3412 if (capp->dl_sub_length == 0) 3413 /* no new-style capabilities */ 3414 return; 3415 3416 /* make sure the driver supplied correct dl_sub_length */ 3417 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3418 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3419 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3420 return; 3421 } 3422 3423 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3424 /* 3425 * There are sub-capabilities. Process the ones we know about. 3426 * Loop until we don't have room for another sub-cap header.. 
3427 */ 3428 for (subp = SC(capp, capp->dl_sub_offset), 3429 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3430 subp <= endp; 3431 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3432 3433 switch (subp->dl_cap) { 3434 case DL_CAPAB_ID_WRAPPER: 3435 ill_capability_id_ack(ill, mp, subp); 3436 break; 3437 default: 3438 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3439 break; 3440 } 3441 } 3442 #undef SC 3443 } 3444 3445 /* 3446 * This routine is called to scan the fragmentation reassembly table for 3447 * the specified ILL for any packets that are starting to smell. 3448 * dead_interval is the maximum time in seconds that will be tolerated. It 3449 * will either be the value specified in ip_g_frag_timeout, or zero if the 3450 * ILL is shutting down and it is time to blow everything off. 3451 * 3452 * It returns the number of seconds (as a time_t) that the next frag timer 3453 * should be scheduled for, 0 meaning that the timer doesn't need to be 3454 * re-started. Note that the method of calculating next_timeout isn't 3455 * entirely accurate since time will flow between the time we grab 3456 * current_time and the time we schedule the next timeout. This isn't a 3457 * big problem since this is the timer for sending an ICMP reassembly time 3458 * exceeded messages, and it doesn't have to be exactly accurate. 3459 * 3460 * This function is 3461 * sometimes called as writer, although this is not required. 3462 */ 3463 time_t 3464 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3465 { 3466 ipfb_t *ipfb; 3467 ipfb_t *endp; 3468 ipf_t *ipf; 3469 ipf_t *ipfnext; 3470 mblk_t *mp; 3471 time_t current_time = gethrestime_sec(); 3472 time_t next_timeout = 0; 3473 uint32_t hdr_length; 3474 mblk_t *send_icmp_head; 3475 mblk_t *send_icmp_head_v6; 3476 3477 ipfb = ill->ill_frag_hash_tbl; 3478 if (ipfb == NULL) 3479 return (B_FALSE); 3480 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3481 /* Walk the frag hash table. 
*/ 3482 for (; ipfb < endp; ipfb++) { 3483 send_icmp_head = NULL; 3484 send_icmp_head_v6 = NULL; 3485 mutex_enter(&ipfb->ipfb_lock); 3486 while ((ipf = ipfb->ipfb_ipf) != 0) { 3487 time_t frag_time = current_time - ipf->ipf_timestamp; 3488 time_t frag_timeout; 3489 3490 if (frag_time < dead_interval) { 3491 /* 3492 * There are some outstanding fragments 3493 * that will timeout later. Make note of 3494 * the time so that we can reschedule the 3495 * next timeout appropriately. 3496 */ 3497 frag_timeout = dead_interval - frag_time; 3498 if (next_timeout == 0 || 3499 frag_timeout < next_timeout) { 3500 next_timeout = frag_timeout; 3501 } 3502 break; 3503 } 3504 /* Time's up. Get it out of here. */ 3505 hdr_length = ipf->ipf_nf_hdr_len; 3506 ipfnext = ipf->ipf_hash_next; 3507 if (ipfnext) 3508 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3509 *ipf->ipf_ptphn = ipfnext; 3510 mp = ipf->ipf_mp->b_cont; 3511 for (; mp; mp = mp->b_cont) { 3512 /* Extra points for neatness. */ 3513 IP_REASS_SET_START(mp, 0); 3514 IP_REASS_SET_END(mp, 0); 3515 } 3516 mp = ipf->ipf_mp->b_cont; 3517 ill->ill_frag_count -= ipf->ipf_count; 3518 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3519 ipfb->ipfb_count -= ipf->ipf_count; 3520 ASSERT(ipfb->ipfb_frag_pkts > 0); 3521 ipfb->ipfb_frag_pkts--; 3522 /* 3523 * We do not send any icmp message from here because 3524 * we currently are holding the ipfb_lock for this 3525 * hash chain. If we try and send any icmp messages 3526 * from here we may end up via a put back into ip 3527 * trying to get the same lock, causing a recursive 3528 * mutex panic. Instead we build a list and send all 3529 * the icmp messages after we have dropped the lock. 
3530 */ 3531 if (ill->ill_isv6) { 3532 BUMP_MIB(ill->ill_ip6_mib, ipv6ReasmFails); 3533 if (hdr_length != 0) { 3534 mp->b_next = send_icmp_head_v6; 3535 send_icmp_head_v6 = mp; 3536 } else { 3537 freemsg(mp); 3538 } 3539 } else { 3540 BUMP_MIB(&ip_mib, ipReasmFails); 3541 if (hdr_length != 0) { 3542 mp->b_next = send_icmp_head; 3543 send_icmp_head = mp; 3544 } else { 3545 freemsg(mp); 3546 } 3547 } 3548 freeb(ipf->ipf_mp); 3549 } 3550 mutex_exit(&ipfb->ipfb_lock); 3551 /* 3552 * Now need to send any icmp messages that we delayed from 3553 * above. 3554 */ 3555 while (send_icmp_head_v6 != NULL) { 3556 mp = send_icmp_head_v6; 3557 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3558 mp->b_next = NULL; 3559 icmp_time_exceeded_v6(ill->ill_wq, mp, 3560 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, B_FALSE); 3561 } 3562 while (send_icmp_head != NULL) { 3563 mp = send_icmp_head; 3564 send_icmp_head = send_icmp_head->b_next; 3565 mp->b_next = NULL; 3566 icmp_time_exceeded(ill->ill_wq, mp, 3567 ICMP_REASSEMBLY_TIME_EXCEEDED); 3568 } 3569 } 3570 /* 3571 * A non-dying ILL will use the return value to decide whether to 3572 * restart the frag timer, and for how long. 3573 */ 3574 return (next_timeout); 3575 } 3576 3577 /* 3578 * This routine is called when the approximate count of mblk memory used 3579 * for the specified ILL has exceeded max_count. 3580 */ 3581 void 3582 ill_frag_prune(ill_t *ill, uint_t max_count) 3583 { 3584 ipfb_t *ipfb; 3585 ipf_t *ipf; 3586 size_t count; 3587 3588 /* 3589 * If we are here within ip_min_frag_prune_time msecs remove 3590 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3591 * ill_frag_free_num_pkts. 3592 */ 3593 mutex_enter(&ill->ill_lock); 3594 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3595 (ip_min_frag_prune_time != 0 ? 
3596 ip_min_frag_prune_time : msec_per_tick)) { 3597 3598 ill->ill_frag_free_num_pkts++; 3599 3600 } else { 3601 ill->ill_frag_free_num_pkts = 0; 3602 } 3603 ill->ill_last_frag_clean_time = lbolt; 3604 mutex_exit(&ill->ill_lock); 3605 3606 /* 3607 * free ill_frag_free_num_pkts oldest packets from each bucket. 3608 */ 3609 if (ill->ill_frag_free_num_pkts != 0) { 3610 int ix; 3611 3612 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3613 ipfb = &ill->ill_frag_hash_tbl[ix]; 3614 mutex_enter(&ipfb->ipfb_lock); 3615 if (ipfb->ipfb_ipf != NULL) { 3616 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3617 ill->ill_frag_free_num_pkts); 3618 } 3619 mutex_exit(&ipfb->ipfb_lock); 3620 } 3621 } 3622 /* 3623 * While the reassembly list for this ILL is too big, prune a fragment 3624 * queue by age, oldest first. Note that the per ILL count is 3625 * approximate, while the per frag hash bucket counts are accurate. 3626 */ 3627 while (ill->ill_frag_count > max_count) { 3628 int ix; 3629 ipfb_t *oipfb = NULL; 3630 uint_t oldest = UINT_MAX; 3631 3632 count = 0; 3633 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3634 ipfb = &ill->ill_frag_hash_tbl[ix]; 3635 mutex_enter(&ipfb->ipfb_lock); 3636 ipf = ipfb->ipfb_ipf; 3637 if (ipf != NULL && ipf->ipf_gen < oldest) { 3638 oldest = ipf->ipf_gen; 3639 oipfb = ipfb; 3640 } 3641 count += ipfb->ipfb_count; 3642 mutex_exit(&ipfb->ipfb_lock); 3643 } 3644 /* Refresh the per ILL count */ 3645 ill->ill_frag_count = count; 3646 if (oipfb == NULL) { 3647 ill->ill_frag_count = 0; 3648 break; 3649 } 3650 if (count <= max_count) 3651 return; /* Somebody beat us to it, nothing to do */ 3652 mutex_enter(&oipfb->ipfb_lock); 3653 ipf = oipfb->ipfb_ipf; 3654 if (ipf != NULL) { 3655 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3656 } 3657 mutex_exit(&oipfb->ipfb_lock); 3658 } 3659 } 3660 3661 /* 3662 * free 'free_cnt' fragmented packets starting at ipf. 
3663 */ 3664 void 3665 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3666 { 3667 size_t count; 3668 mblk_t *mp; 3669 mblk_t *tmp; 3670 ipf_t **ipfp = ipf->ipf_ptphn; 3671 3672 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3673 ASSERT(ipfp != NULL); 3674 ASSERT(ipf != NULL); 3675 3676 while (ipf != NULL && free_cnt-- > 0) { 3677 count = ipf->ipf_count; 3678 mp = ipf->ipf_mp; 3679 ipf = ipf->ipf_hash_next; 3680 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3681 IP_REASS_SET_START(tmp, 0); 3682 IP_REASS_SET_END(tmp, 0); 3683 } 3684 ill->ill_frag_count -= count; 3685 ASSERT(ipfb->ipfb_count >= count); 3686 ipfb->ipfb_count -= count; 3687 ASSERT(ipfb->ipfb_frag_pkts > 0); 3688 ipfb->ipfb_frag_pkts--; 3689 freemsg(mp); 3690 BUMP_MIB(&ip_mib, ipReasmFails); 3691 } 3692 3693 if (ipf) 3694 ipf->ipf_ptphn = ipfp; 3695 ipfp[0] = ipf; 3696 } 3697 3698 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3699 "obsolete and may be removed in a future release of Solaris. Use " \ 3700 "ifconfig(1M) to manipulate the forwarding status of an interface." 3701 3702 /* 3703 * For obsolete per-interface forwarding configuration; 3704 * called in response to ND_GET. 3705 */ 3706 /* ARGSUSED */ 3707 static int 3708 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3709 { 3710 ill_t *ill = (ill_t *)cp; 3711 3712 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3713 3714 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3715 return (0); 3716 } 3717 3718 /* 3719 * For obsolete per-interface forwarding configuration; 3720 * called in response to ND_SET. 
3721 */ 3722 /* ARGSUSED */ 3723 static int 3724 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3725 cred_t *ioc_cr) 3726 { 3727 long value; 3728 int retval; 3729 3730 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3731 3732 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3733 value < 0 || value > 1) { 3734 return (EINVAL); 3735 } 3736 3737 rw_enter(&ill_g_lock, RW_READER); 3738 retval = ill_forward_set(q, mp, (value != 0), cp); 3739 rw_exit(&ill_g_lock); 3740 return (retval); 3741 } 3742 3743 /* 3744 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3745 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3746 * up RTS_IFINFO routing socket messages for each interface whose flags we 3747 * change. 3748 */ 3749 /* ARGSUSED */ 3750 int 3751 ill_forward_set(queue_t *q, mblk_t *mp, boolean_t enable, caddr_t cp) 3752 { 3753 ill_t *ill = (ill_t *)cp; 3754 ill_group_t *illgrp; 3755 3756 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ill_g_lock)); 3757 3758 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3759 (!enable && !(ill->ill_flags & ILLF_ROUTER)) || 3760 (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) 3761 return (EINVAL); 3762 3763 /* 3764 * If the ill is in an IPMP group, set the forwarding policy on all 3765 * members of the group to the same value. 3766 */ 3767 illgrp = ill->ill_group; 3768 if (illgrp != NULL) { 3769 ill_t *tmp_ill; 3770 3771 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 3772 tmp_ill = tmp_ill->ill_group_next) { 3773 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3774 (enable ? "Enabling" : "Disabling"), 3775 (tmp_ill->ill_isv6 ? 
"IPv6" : "IPv4"), 3776 tmp_ill->ill_name)); 3777 mutex_enter(&tmp_ill->ill_lock); 3778 if (enable) 3779 tmp_ill->ill_flags |= ILLF_ROUTER; 3780 else 3781 tmp_ill->ill_flags &= ~ILLF_ROUTER; 3782 mutex_exit(&tmp_ill->ill_lock); 3783 if (tmp_ill->ill_isv6) 3784 ill_set_nce_router_flags(tmp_ill, enable); 3785 /* Notify routing socket listeners of this change. */ 3786 ip_rts_ifmsg(tmp_ill->ill_ipif); 3787 } 3788 } else { 3789 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3790 (enable ? "Enabling" : "Disabling"), 3791 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3792 mutex_enter(&ill->ill_lock); 3793 if (enable) 3794 ill->ill_flags |= ILLF_ROUTER; 3795 else 3796 ill->ill_flags &= ~ILLF_ROUTER; 3797 mutex_exit(&ill->ill_lock); 3798 if (ill->ill_isv6) 3799 ill_set_nce_router_flags(ill, enable); 3800 /* Notify routing socket listeners of this change. */ 3801 ip_rts_ifmsg(ill->ill_ipif); 3802 } 3803 3804 return (0); 3805 } 3806 3807 /* 3808 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3809 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3810 * set or clear. 3811 */ 3812 static void 3813 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3814 { 3815 ipif_t *ipif; 3816 nce_t *nce; 3817 3818 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3819 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 3820 if (nce != NULL) { 3821 mutex_enter(&nce->nce_lock); 3822 if (enable) 3823 nce->nce_flags |= NCE_F_ISROUTER; 3824 else 3825 nce->nce_flags &= ~NCE_F_ISROUTER; 3826 mutex_exit(&nce->nce_lock); 3827 NCE_REFRELE(nce); 3828 } 3829 } 3830 } 3831 3832 /* 3833 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3834 * for this ill. Make sure the v6/v4 question has been answered about this 3835 * ill. The creation of this ndd variable is only for backwards compatibility. 
3836 * The preferred way to control per-interface IP forwarding is through the 3837 * ILLF_ROUTER interface flag. 3838 */ 3839 static int 3840 ill_set_ndd_name(ill_t *ill) 3841 { 3842 char *suffix; 3843 3844 ASSERT(IAM_WRITER_ILL(ill)); 3845 3846 if (ill->ill_isv6) 3847 suffix = ipv6_forward_suffix; 3848 else 3849 suffix = ipv4_forward_suffix; 3850 3851 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 3852 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 3853 /* 3854 * Copies over the '\0'. 3855 * Note that strlen(suffix) is always bounded. 3856 */ 3857 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 3858 strlen(suffix) + 1); 3859 3860 /* 3861 * Use of the nd table requires holding the reader lock. 3862 * Modifying the nd table thru nd_load/nd_unload requires 3863 * the writer lock. 3864 */ 3865 rw_enter(&ip_g_nd_lock, RW_WRITER); 3866 if (!nd_load(&ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 3867 nd_ill_forward_set, (caddr_t)ill)) { 3868 /* 3869 * If the nd_load failed, it only meant that it could not 3870 * allocate a new bunch of room for further NDD expansion. 3871 * Because of that, the ill_ndd_name will be set to 0, and 3872 * this interface is at the mercy of the global ip_forwarding 3873 * variable. 3874 */ 3875 rw_exit(&ip_g_nd_lock); 3876 ill->ill_ndd_name = NULL; 3877 return (ENOMEM); 3878 } 3879 rw_exit(&ip_g_nd_lock); 3880 return (0); 3881 } 3882 3883 /* 3884 * Intializes the context structure and returns the first ill in the list 3885 * cuurently start_list and end_list can have values: 3886 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 3887 * IP_V4_G_HEAD Traverse IPV4 list only. 3888 * IP_V6_G_HEAD Traverse IPV6 list only. 3889 */ 3890 3891 /* 3892 * We don't check for CONDEMNED ills here. Caller must do that if 3893 * necessary under the ill lock. 
3894 */ 3895 ill_t * 3896 ill_first(int start_list, int end_list, ill_walk_context_t *ctx) 3897 { 3898 ill_if_t *ifp; 3899 ill_t *ill; 3900 avl_tree_t *avl_tree; 3901 3902 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3903 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3904 3905 /* 3906 * setup the lists to search 3907 */ 3908 if (end_list != MAX_G_HEADS) { 3909 ctx->ctx_current_list = start_list; 3910 ctx->ctx_last_list = end_list; 3911 } else { 3912 ctx->ctx_last_list = MAX_G_HEADS - 1; 3913 ctx->ctx_current_list = 0; 3914 } 3915 3916 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3917 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 3918 if (ifp != (ill_if_t *) 3919 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 3920 avl_tree = &ifp->illif_avl_by_ppa; 3921 ill = avl_first(avl_tree); 3922 /* 3923 * ill is guaranteed to be non NULL or ifp should have 3924 * not existed. 3925 */ 3926 ASSERT(ill != NULL); 3927 return (ill); 3928 } 3929 ctx->ctx_current_list++; 3930 } 3931 3932 return (NULL); 3933 } 3934 3935 /* 3936 * returns the next ill in the list. ill_first() must have been called 3937 * before calling ill_next() or bad things will happen. 3938 */ 3939 3940 /* 3941 * We don't check for CONDEMNED ills here. Caller must do that if 3942 * necessary under the ill lock. 3943 */ 3944 ill_t * 3945 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 3946 { 3947 ill_if_t *ifp; 3948 ill_t *ill; 3949 3950 3951 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3952 ASSERT(lastill->ill_ifptr != (ill_if_t *) 3953 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)); 3954 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 3955 AVL_AFTER)) != NULL) { 3956 return (ill); 3957 } 3958 3959 /* goto next ill_ifp in the list. 
*/ 3960 ifp = lastill->ill_ifptr->illif_next; 3961 3962 /* make sure not at end of circular list */ 3963 while (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 3964 if (++ctx->ctx_current_list > ctx->ctx_last_list) 3965 return (NULL); 3966 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 3967 } 3968 3969 return (avl_first(&ifp->illif_avl_by_ppa)); 3970 } 3971 3972 /* 3973 * Check interface name for correct format which is name+ppa. 3974 * name can contain characters and digits, the right most digits 3975 * make up the ppa number. use of octal is not allowed, name must contain 3976 * a ppa, return pointer to the start of ppa. 3977 * In case of error return NULL. 3978 */ 3979 static char * 3980 ill_get_ppa_ptr(char *name) 3981 { 3982 int namelen = mi_strlen(name); 3983 3984 int len = namelen; 3985 3986 name += len; 3987 while (len > 0) { 3988 name--; 3989 if (*name < '0' || *name > '9') 3990 break; 3991 len--; 3992 } 3993 3994 /* empty string, all digits, or no trailing digits */ 3995 if (len == 0 || len == (int)namelen) 3996 return (NULL); 3997 3998 name++; 3999 /* check for attempted use of octal */ 4000 if (*name == '0' && len != (int)namelen - 1) 4001 return (NULL); 4002 return (name); 4003 } 4004 4005 /* 4006 * use avl tree to locate the ill. 
4007 */ 4008 static ill_t * 4009 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4010 ipsq_func_t func, int *error) 4011 { 4012 char *ppa_ptr = NULL; 4013 int len; 4014 uint_t ppa; 4015 ill_t *ill = NULL; 4016 ill_if_t *ifp; 4017 int list; 4018 ipsq_t *ipsq; 4019 4020 if (error != NULL) 4021 *error = 0; 4022 4023 /* 4024 * get ppa ptr 4025 */ 4026 if (isv6) 4027 list = IP_V6_G_HEAD; 4028 else 4029 list = IP_V4_G_HEAD; 4030 4031 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4032 if (error != NULL) 4033 *error = ENXIO; 4034 return (NULL); 4035 } 4036 4037 len = ppa_ptr - name + 1; 4038 4039 ppa = stoi(&ppa_ptr); 4040 4041 ifp = IP_VX_ILL_G_LIST(list); 4042 4043 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4044 /* 4045 * match is done on len - 1 as the name is not null 4046 * terminated it contains ppa in addition to the interface 4047 * name. 4048 */ 4049 if ((ifp->illif_name_len == len) && 4050 bcmp(ifp->illif_name, name, len - 1) == 0) { 4051 break; 4052 } else { 4053 ifp = ifp->illif_next; 4054 } 4055 } 4056 4057 4058 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4059 /* 4060 * Even the interface type does not exist. 
4061 */ 4062 if (error != NULL) 4063 *error = ENXIO; 4064 return (NULL); 4065 } 4066 4067 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4068 if (ill != NULL) { 4069 /* 4070 * The block comment at the start of ipif_down 4071 * explains the use of the macros used below 4072 */ 4073 GRAB_CONN_LOCK(q); 4074 mutex_enter(&ill->ill_lock); 4075 if (ILL_CAN_LOOKUP(ill)) { 4076 ill_refhold_locked(ill); 4077 mutex_exit(&ill->ill_lock); 4078 RELEASE_CONN_LOCK(q); 4079 return (ill); 4080 } else if (ILL_CAN_WAIT(ill, q)) { 4081 ipsq = ill->ill_phyint->phyint_ipsq; 4082 mutex_enter(&ipsq->ipsq_lock); 4083 mutex_exit(&ill->ill_lock); 4084 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4085 mutex_exit(&ipsq->ipsq_lock); 4086 RELEASE_CONN_LOCK(q); 4087 *error = EINPROGRESS; 4088 return (NULL); 4089 } 4090 mutex_exit(&ill->ill_lock); 4091 RELEASE_CONN_LOCK(q); 4092 } 4093 if (error != NULL) 4094 *error = ENXIO; 4095 return (NULL); 4096 } 4097 4098 /* 4099 * comparison function for use with avl. 4100 */ 4101 static int 4102 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4103 { 4104 uint_t ppa; 4105 uint_t ill_ppa; 4106 4107 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4108 4109 ppa = *((uint_t *)ppa_ptr); 4110 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4111 /* 4112 * We want the ill with the lowest ppa to be on the 4113 * top. 4114 */ 4115 if (ill_ppa < ppa) 4116 return (1); 4117 if (ill_ppa > ppa) 4118 return (-1); 4119 return (0); 4120 } 4121 4122 /* 4123 * remove an interface type from the global list. 
4124 */ 4125 static void 4126 ill_delete_interface_type(ill_if_t *interface) 4127 { 4128 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4129 4130 ASSERT(interface != NULL); 4131 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4132 4133 avl_destroy(&interface->illif_avl_by_ppa); 4134 if (interface->illif_ppa_arena != NULL) 4135 vmem_destroy(interface->illif_ppa_arena); 4136 4137 remque(interface); 4138 4139 mi_free(interface); 4140 } 4141 4142 /* 4143 * remove ill from the global list. 4144 */ 4145 static void 4146 ill_glist_delete(ill_t *ill) 4147 { 4148 if (ill == NULL) 4149 return; 4150 4151 rw_enter(&ill_g_lock, RW_WRITER); 4152 /* 4153 * If the ill was never inserted into the AVL tree 4154 * we skip the if branch. 4155 */ 4156 if (ill->ill_ifptr != NULL) { 4157 /* 4158 * remove from AVL tree and free ppa number 4159 */ 4160 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4161 4162 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4163 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4164 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4165 } 4166 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4167 ill_delete_interface_type(ill->ill_ifptr); 4168 } 4169 4170 /* 4171 * Indicate ill is no longer in the list. 4172 */ 4173 ill->ill_ifptr = NULL; 4174 ill->ill_name_length = 0; 4175 ill->ill_name[0] = '\0'; 4176 ill->ill_ppa = UINT_MAX; 4177 } 4178 ill_phyint_free(ill); 4179 rw_exit(&ill_g_lock); 4180 } 4181 4182 /* 4183 * allocate a ppa, if the number of plumbed interfaces of this type are 4184 * less than ill_no_arena do a linear search to find a unused ppa. 4185 * When the number goes beyond ill_no_arena switch to using an arena. 4186 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4187 * is the return value for an error condition, so allocation starts at one 4188 * and is decremented by one. 
4189 */ 4190 static int 4191 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4192 { 4193 ill_t *tmp_ill; 4194 uint_t start, end; 4195 int ppa; 4196 4197 if (ifp->illif_ppa_arena == NULL && 4198 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4199 /* 4200 * Create an arena. 4201 */ 4202 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4203 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4204 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4205 /* allocate what has already been assigned */ 4206 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4207 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4208 tmp_ill, AVL_AFTER)) { 4209 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4210 1, /* size */ 4211 1, /* align/quantum */ 4212 0, /* phase */ 4213 0, /* nocross */ 4214 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), /* minaddr */ 4215 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), /* maxaddr */ 4216 VM_NOSLEEP|VM_FIRSTFIT); 4217 if (ppa == 0) { 4218 ip1dbg(("ill_alloc_ppa: ppa allocation" 4219 " failed while switching")); 4220 vmem_destroy(ifp->illif_ppa_arena); 4221 ifp->illif_ppa_arena = NULL; 4222 break; 4223 } 4224 } 4225 } 4226 4227 if (ifp->illif_ppa_arena != NULL) { 4228 if (ill->ill_ppa == UINT_MAX) { 4229 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4230 1, VM_NOSLEEP|VM_FIRSTFIT); 4231 if (ppa == 0) 4232 return (EAGAIN); 4233 ill->ill_ppa = --ppa; 4234 } else { 4235 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4236 1, /* size */ 4237 1, /* align/quantum */ 4238 0, /* phase */ 4239 0, /* nocross */ 4240 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4241 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4242 VM_NOSLEEP|VM_FIRSTFIT); 4243 /* 4244 * Most likely the allocation failed because 4245 * the requested ppa was in use. 4246 */ 4247 if (ppa == 0) 4248 return (EEXIST); 4249 } 4250 return (0); 4251 } 4252 4253 /* 4254 * No arena is in use and not enough (>ill_no_arena) interfaces have 4255 * been plumbed to create one. 
Do a linear search to get a unused ppa. 4256 */ 4257 if (ill->ill_ppa == UINT_MAX) { 4258 end = UINT_MAX - 1; 4259 start = 0; 4260 } else { 4261 end = start = ill->ill_ppa; 4262 } 4263 4264 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4265 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4266 if (start++ >= end) { 4267 if (ill->ill_ppa == UINT_MAX) 4268 return (EAGAIN); 4269 else 4270 return (EEXIST); 4271 } 4272 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4273 } 4274 ill->ill_ppa = start; 4275 return (0); 4276 } 4277 4278 /* 4279 * Insert ill into the list of configured ill's. Once this function completes, 4280 * the ill is globally visible and is available through lookups. More precisely 4281 * this happens after the caller drops the ill_g_lock. 4282 */ 4283 static int 4284 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4285 { 4286 ill_if_t *ill_interface; 4287 avl_index_t where = 0; 4288 int error; 4289 int name_length; 4290 int index; 4291 boolean_t check_length = B_FALSE; 4292 4293 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4294 4295 name_length = mi_strlen(name) + 1; 4296 4297 if (isv6) 4298 index = IP_V6_G_HEAD; 4299 else 4300 index = IP_V4_G_HEAD; 4301 4302 ill_interface = IP_VX_ILL_G_LIST(index); 4303 /* 4304 * Search for interface type based on name 4305 */ 4306 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4307 if ((ill_interface->illif_name_len == name_length) && 4308 (strcmp(ill_interface->illif_name, name) == 0)) { 4309 break; 4310 } 4311 ill_interface = ill_interface->illif_next; 4312 } 4313 4314 /* 4315 * Interface type not found, create one. 
4316 */ 4317 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4318 4319 ill_g_head_t ghead; 4320 4321 /* 4322 * allocate ill_if_t structure 4323 */ 4324 4325 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4326 if (ill_interface == NULL) { 4327 return (ENOMEM); 4328 } 4329 4330 4331 4332 (void) strcpy(ill_interface->illif_name, name); 4333 ill_interface->illif_name_len = name_length; 4334 4335 avl_create(&ill_interface->illif_avl_by_ppa, 4336 ill_compare_ppa, sizeof (ill_t), 4337 offsetof(struct ill_s, ill_avl_byppa)); 4338 4339 /* 4340 * link the structure in the back to maintain order 4341 * of configuration for ifconfig output. 4342 */ 4343 ghead = ill_g_heads[index]; 4344 insque(ill_interface, ghead.ill_g_list_tail); 4345 4346 } 4347 4348 if (ill->ill_ppa == UINT_MAX) 4349 check_length = B_TRUE; 4350 4351 error = ill_alloc_ppa(ill_interface, ill); 4352 if (error != 0) { 4353 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4354 ill_delete_interface_type(ill->ill_ifptr); 4355 return (error); 4356 } 4357 4358 /* 4359 * When the ppa is choosen by the system, check that there is 4360 * enough space to insert ppa. if a specific ppa was passed in this 4361 * check is not required as the interface name passed in will have 4362 * the right ppa in it. 4363 */ 4364 if (check_length) { 4365 /* 4366 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4367 */ 4368 char buf[sizeof (uint_t) * 3]; 4369 4370 /* 4371 * convert ppa to string to calculate the amount of space 4372 * required for it in the name. 4373 */ 4374 numtos(ill->ill_ppa, buf); 4375 4376 /* Do we have enough space to insert ppa ? 
*/ 4377 4378 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4379 /* Free ppa and interface type struct */ 4380 if (ill_interface->illif_ppa_arena != NULL) { 4381 vmem_free(ill_interface->illif_ppa_arena, 4382 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4383 } 4384 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4385 0) { 4386 ill_delete_interface_type(ill->ill_ifptr); 4387 } 4388 4389 return (EINVAL); 4390 } 4391 } 4392 4393 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4394 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4395 4396 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4397 &where); 4398 ill->ill_ifptr = ill_interface; 4399 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4400 4401 ill_phyint_reinit(ill); 4402 return (0); 4403 } 4404 4405 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4406 static boolean_t 4407 ipsq_init(ill_t *ill) 4408 { 4409 ipsq_t *ipsq; 4410 4411 /* Init the ipsq and impicitly enter as writer */ 4412 ill->ill_phyint->phyint_ipsq = 4413 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4414 if (ill->ill_phyint->phyint_ipsq == NULL) 4415 return (B_FALSE); 4416 ipsq = ill->ill_phyint->phyint_ipsq; 4417 ipsq->ipsq_phyint_list = ill->ill_phyint; 4418 ill->ill_phyint->phyint_ipsq_next = NULL; 4419 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4420 ipsq->ipsq_refs = 1; 4421 ipsq->ipsq_writer = curthread; 4422 ipsq->ipsq_reentry_cnt = 1; 4423 #ifdef ILL_DEBUG 4424 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH); 4425 #endif 4426 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4427 return (B_TRUE); 4428 } 4429 4430 /* 4431 * ill_init is called by ip_open when a device control stream is opened. 4432 * It does a few initializations, and shoots a DL_INFO_REQ message down 4433 * to the driver. The response is later picked up in ip_rput_dlpi and 4434 * used to set up default mechanisms for talking to the driver. 
(Always 4435 * called as writer.) 4436 * 4437 * If this function returns error, ip_open will call ip_close which in 4438 * turn will call ill_delete to clean up any memory allocated here that 4439 * is not yet freed. 4440 */ 4441 int 4442 ill_init(queue_t *q, ill_t *ill) 4443 { 4444 int count; 4445 dl_info_req_t *dlir; 4446 mblk_t *info_mp; 4447 uchar_t *frag_ptr; 4448 4449 /* 4450 * The ill is initialized to zero by mi_alloc*(). In addition 4451 * some fields already contain valid values, initialized in 4452 * ip_open(), before we reach here. 4453 */ 4454 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4455 4456 ill->ill_rq = q; 4457 ill->ill_wq = WR(q); 4458 4459 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4460 BPRI_HI); 4461 if (info_mp == NULL) 4462 return (ENOMEM); 4463 4464 /* 4465 * Allocate sufficient space to contain our fragment hash table and 4466 * the device name. 4467 */ 4468 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4469 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4470 if (frag_ptr == NULL) { 4471 freemsg(info_mp); 4472 return (ENOMEM); 4473 } 4474 ill->ill_frag_ptr = frag_ptr; 4475 ill->ill_frag_free_num_pkts = 0; 4476 ill->ill_last_frag_clean_time = 0; 4477 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4478 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4479 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4480 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4481 NULL, MUTEX_DEFAULT, NULL); 4482 } 4483 4484 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4485 if (ill->ill_phyint == NULL) { 4486 freemsg(info_mp); 4487 mi_free(frag_ptr); 4488 return (ENOMEM); 4489 } 4490 4491 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4492 /* 4493 * For now pretend this is a v4 ill. We need to set phyint_ill* 4494 * at this point because of the following reason. 
If we can't 4495 * enter the ipsq at some point and cv_wait, the writer that 4496 * wakes us up tries to locate us using the list of all phyints 4497 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4498 * If we don't set it now, we risk a missed wakeup. 4499 */ 4500 ill->ill_phyint->phyint_illv4 = ill; 4501 ill->ill_ppa = UINT_MAX; 4502 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4503 4504 if (!ipsq_init(ill)) { 4505 freemsg(info_mp); 4506 mi_free(frag_ptr); 4507 mi_free(ill->ill_phyint); 4508 return (ENOMEM); 4509 } 4510 4511 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4512 4513 4514 /* Frag queue limit stuff */ 4515 ill->ill_frag_count = 0; 4516 ill->ill_ipf_gen = 0; 4517 4518 ill->ill_global_timer = INFINITY; 4519 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4520 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4521 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4522 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4523 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4524 4525 /* 4526 * Initialize IPv6 configuration variables. The IP module is always 4527 * opened as an IPv4 module. Instead tracking down the cases where 4528 * it switches to do ipv6, we'll just initialize the IPv6 configuration 4529 * here for convenience, this has no effect until the ill is set to do 4530 * IPv6. 4531 */ 4532 ill->ill_reachable_time = ND_REACHABLE_TIME; 4533 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4534 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4535 ill->ill_max_buf = ND_MAX_Q; 4536 ill->ill_refcnt = 0; 4537 4538 /* Send down the Info Request to the driver. 
*/ 4539 info_mp->b_datap->db_type = M_PCPROTO; 4540 dlir = (dl_info_req_t *)info_mp->b_rptr; 4541 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4542 dlir->dl_primitive = DL_INFO_REQ; 4543 4544 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4545 4546 qprocson(q); 4547 ill_dlpi_send(ill, info_mp); 4548 4549 return (0); 4550 } 4551 4552 /* 4553 * ill_dls_info 4554 * creates datalink socket info from the device. 4555 */ 4556 int 4557 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4558 { 4559 size_t length; 4560 ill_t *ill = ipif->ipif_ill; 4561 4562 sdl->sdl_family = AF_LINK; 4563 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4564 sdl->sdl_type = ipif->ipif_type; 4565 (void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4566 length = mi_strlen(sdl->sdl_data); 4567 ASSERT(length < 256); 4568 sdl->sdl_nlen = (uchar_t)length; 4569 sdl->sdl_alen = ill->ill_phys_addr_length; 4570 mutex_enter(&ill->ill_lock); 4571 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) { 4572 bcopy(ill->ill_phys_addr, &sdl->sdl_data[length], 4573 ill->ill_phys_addr_length); 4574 } 4575 mutex_exit(&ill->ill_lock); 4576 sdl->sdl_slen = 0; 4577 return (sizeof (struct sockaddr_dl)); 4578 } 4579 4580 /* 4581 * ill_xarp_info 4582 * creates xarp info from the device. 
4583 */ 4584 static int 4585 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4586 { 4587 sdl->sdl_family = AF_LINK; 4588 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4589 sdl->sdl_type = ill->ill_type; 4590 (void) ipif_get_name(ill->ill_ipif, sdl->sdl_data, 4591 sizeof (sdl->sdl_data)); 4592 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4593 sdl->sdl_alen = ill->ill_phys_addr_length; 4594 sdl->sdl_slen = 0; 4595 return (sdl->sdl_nlen); 4596 } 4597 4598 static int 4599 loopback_kstat_update(kstat_t *ksp, int rw) 4600 { 4601 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 4602 4603 if (rw == KSTAT_WRITE) 4604 return (EACCES); 4605 kn[0].value.ui32 = loopback_packets; 4606 kn[1].value.ui32 = loopback_packets; 4607 return (0); 4608 } 4609 4610 4611 /* 4612 * Has ifindex been plumbed already. 4613 */ 4614 static boolean_t 4615 phyint_exists(uint_t index) 4616 { 4617 phyint_t *phyi; 4618 4619 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4620 /* 4621 * Indexes are stored in the phyint - a common structure 4622 * to both IPv4 and IPv6. 4623 */ 4624 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4625 (void *) &index, NULL); 4626 return (phyi != NULL); 4627 } 4628 4629 /* 4630 * Assign a unique interface index for the phyint. 4631 */ 4632 static boolean_t 4633 phyint_assign_ifindex(phyint_t *phyi) 4634 { 4635 uint_t starting_index; 4636 4637 ASSERT(phyi->phyint_ifindex == 0); 4638 if (!ill_index_wrap) { 4639 phyi->phyint_ifindex = ill_index++; 4640 if (ill_index == 0) { 4641 /* Reached the uint_t limit Next time wrap */ 4642 ill_index_wrap = B_TRUE; 4643 } 4644 return (B_TRUE); 4645 } 4646 4647 /* 4648 * Start reusing unused indexes. Note that we hold the ill_g_lock 4649 * at this point and don't want to call any function that attempts 4650 * to get the lock again. 
4651 */ 4652 starting_index = ill_index++; 4653 for (; ill_index != starting_index; ill_index++) { 4654 if (ill_index != 0 && !phyint_exists(ill_index)) { 4655 /* found unused index - use it */ 4656 phyi->phyint_ifindex = ill_index; 4657 return (B_TRUE); 4658 } 4659 } 4660 4661 /* 4662 * all interface indicies are inuse. 4663 */ 4664 return (B_FALSE); 4665 } 4666 4667 /* 4668 * Return a pointer to the ill which matches the supplied name. Note that 4669 * the ill name length includes the null termination character. (May be 4670 * called as writer.) 4671 * If do_alloc and the interface is "lo0" it will be automatically created. 4672 * Cannot bump up reference on condemned ills. So dup detect can't be done 4673 * using this func. 4674 */ 4675 ill_t * 4676 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4677 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc) 4678 { 4679 ill_t *ill; 4680 ipif_t *ipif; 4681 kstat_named_t *kn; 4682 boolean_t isloopback; 4683 ipsq_t *old_ipsq; 4684 4685 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4686 4687 rw_enter(&ill_g_lock, RW_READER); 4688 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4689 rw_exit(&ill_g_lock); 4690 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4691 return (ill); 4692 4693 /* 4694 * Couldn't find it. Does this happen to be a lookup for the 4695 * loopback device and are we allowed to allocate it? 
4696 */ 4697 if (!isloopback || !do_alloc) 4698 return (NULL); 4699 4700 rw_enter(&ill_g_lock, RW_WRITER); 4701 4702 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4703 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4704 rw_exit(&ill_g_lock); 4705 return (ill); 4706 } 4707 4708 /* Create the loopback device on demand */ 4709 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4710 sizeof (ipif_loopback_name), BPRI_MED)); 4711 if (ill == NULL) 4712 goto done; 4713 4714 *ill = ill_null; 4715 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4716 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4717 if (ill->ill_phyint == NULL) 4718 goto done; 4719 4720 if (isv6) 4721 ill->ill_phyint->phyint_illv6 = ill; 4722 else 4723 ill->ill_phyint->phyint_illv4 = ill; 4724 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4725 ill->ill_max_frag = IP_LOOPBACK_MTU; 4726 /* Add room for tcp+ip headers */ 4727 if (isv6) { 4728 ill->ill_isv6 = B_TRUE; 4729 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4730 if (!ill_allocate_mibs(ill)) 4731 goto done; 4732 } else { 4733 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4734 } 4735 ill->ill_max_mtu = ill->ill_max_frag; 4736 /* 4737 * ipif_loopback_name can't be pointed at directly because its used 4738 * by both the ipv4 and ipv6 interfaces. When the ill is removed 4739 * from the glist, ill_glist_delete() sets the first character of 4740 * ill_name to '\0'. 
4741 */ 4742 ill->ill_name = (char *)ill + sizeof (*ill); 4743 (void) strcpy(ill->ill_name, ipif_loopback_name); 4744 ill->ill_name_length = sizeof (ipif_loopback_name); 4745 /* Set ill_name_set for ill_phyint_reinit to work properly */ 4746 4747 ill->ill_global_timer = INFINITY; 4748 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4749 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4750 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4751 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4752 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4753 4754 /* No resolver here. */ 4755 ill->ill_net_type = IRE_LOOPBACK; 4756 4757 /* Initialize the ipsq */ 4758 if (!ipsq_init(ill)) 4759 goto done; 4760 4761 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 4762 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 4763 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 4764 #ifdef ILL_DEBUG 4765 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 4766 #endif 4767 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 4768 if (ipif == NULL) 4769 goto done; 4770 4771 ill->ill_flags = ILLF_MULTICAST; 4772 4773 /* Set up default loopback address and mask. */ 4774 if (!isv6) { 4775 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4776 4777 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4778 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4779 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4780 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4781 ipif->ipif_v6subnet); 4782 ill->ill_flags |= ILLF_IPV4; 4783 } else { 4784 ipif->ipif_v6lcl_addr = ipv6_loopback; 4785 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4786 ipif->ipif_v6net_mask = ipv6_all_ones; 4787 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4788 ipif->ipif_v6subnet); 4789 ill->ill_flags |= ILLF_IPV6; 4790 } 4791 4792 /* 4793 * Chain us in at the end of the ill list. hold the ill 4794 * before we make it globally visible. 1 for the lookup. 
4795 */ 4796 ill->ill_refcnt = 0; 4797 ill_refhold(ill); 4798 4799 ill->ill_frag_count = 0; 4800 ill->ill_frag_free_num_pkts = 0; 4801 ill->ill_last_frag_clean_time = 0; 4802 4803 old_ipsq = ill->ill_phyint->phyint_ipsq; 4804 4805 if (ill_glist_insert(ill, "lo", isv6) != 0) 4806 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4807 4808 /* Let SCTP know so that it can add this to its list */ 4809 sctp_update_ill(ill, SCTP_ILL_INSERT); 4810 4811 /* Let SCTP know about this IPIF, so that it can add it to its list */ 4812 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 4813 4814 /* 4815 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 4816 */ 4817 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 4818 /* Loopback ills aren't in any IPMP group */ 4819 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 4820 ipsq_delete(old_ipsq); 4821 } 4822 4823 /* 4824 * Delay this till the ipif is allocated as ipif_allocate 4825 * de-references ill_phyint for getting the ifindex. We 4826 * can't do this before ipif_allocate because ill_phyint_reinit 4827 * -> phyint_assign_ifindex expects ipif to be present. 
4828 */ 4829 mutex_enter(&ill->ill_phyint->phyint_lock); 4830 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 4831 mutex_exit(&ill->ill_phyint->phyint_lock); 4832 4833 if (loopback_ksp == NULL) { 4834 /* Export loopback interface statistics */ 4835 loopback_ksp = kstat_create("lo", 0, ipif_loopback_name, "net", 4836 KSTAT_TYPE_NAMED, 2, 0); 4837 if (loopback_ksp != NULL) { 4838 loopback_ksp->ks_update = loopback_kstat_update; 4839 kn = KSTAT_NAMED_PTR(loopback_ksp); 4840 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4841 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4842 kstat_install(loopback_ksp); 4843 } 4844 } 4845 4846 if (error != NULL) 4847 *error = 0; 4848 *did_alloc = B_TRUE; 4849 rw_exit(&ill_g_lock); 4850 return (ill); 4851 done: 4852 if (ill != NULL) { 4853 if (ill->ill_phyint != NULL) { 4854 ipsq_t *ipsq; 4855 4856 ipsq = ill->ill_phyint->phyint_ipsq; 4857 if (ipsq != NULL) 4858 kmem_free(ipsq, sizeof (ipsq_t)); 4859 mi_free(ill->ill_phyint); 4860 } 4861 ill_free_mib(ill); 4862 mi_free(ill); 4863 } 4864 rw_exit(&ill_g_lock); 4865 if (error != NULL) 4866 *error = ENOMEM; 4867 return (NULL); 4868 } 4869 4870 /* 4871 * Return a pointer to the ill which matches the index and IP version type. 4872 */ 4873 ill_t * 4874 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 4875 ipsq_func_t func, int *err) 4876 { 4877 ill_t *ill; 4878 ipsq_t *ipsq; 4879 phyint_t *phyi; 4880 4881 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 4882 (q != NULL && mp != NULL && func != NULL && err != NULL)); 4883 4884 if (err != NULL) 4885 *err = 0; 4886 4887 /* 4888 * Indexes are stored in the phyint - a common structure 4889 * to both IPv4 and IPv6. 4890 */ 4891 rw_enter(&ill_g_lock, RW_READER); 4892 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4893 (void *) &index, NULL); 4894 if (phyi != NULL) { 4895 ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; 4896 if (ill != NULL) { 4897 /* 4898 * The block comment at the start of ipif_down 4899 * explains the use of the macros used below 4900 */ 4901 GRAB_CONN_LOCK(q); 4902 mutex_enter(&ill->ill_lock); 4903 if (ILL_CAN_LOOKUP(ill)) { 4904 ill_refhold_locked(ill); 4905 mutex_exit(&ill->ill_lock); 4906 RELEASE_CONN_LOCK(q); 4907 rw_exit(&ill_g_lock); 4908 return (ill); 4909 } else if (ILL_CAN_WAIT(ill, q)) { 4910 ipsq = ill->ill_phyint->phyint_ipsq; 4911 mutex_enter(&ipsq->ipsq_lock); 4912 rw_exit(&ill_g_lock); 4913 mutex_exit(&ill->ill_lock); 4914 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4915 mutex_exit(&ipsq->ipsq_lock); 4916 RELEASE_CONN_LOCK(q); 4917 *err = EINPROGRESS; 4918 return (NULL); 4919 } 4920 RELEASE_CONN_LOCK(q); 4921 mutex_exit(&ill->ill_lock); 4922 } 4923 } 4924 rw_exit(&ill_g_lock); 4925 if (err != NULL) 4926 *err = ENXIO; 4927 return (NULL); 4928 } 4929 4930 /* 4931 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4932 * that gives a running thread a reference to the ill. This reference must be 4933 * released by the thread when it is done accessing the ill and related 4934 * objects. ill_refcnt can not be used to account for static references 4935 * such as other structures pointing to an ill. Callers must generally 4936 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4937 * or be sure that the ill is not being deleted or changing state before 4938 * calling the refhold functions. A non-zero ill_refcnt ensures that the 4939 * ill won't change any of its critical state such as address, netmask etc. 
4940 */ 4941 void 4942 ill_refhold(ill_t *ill) 4943 { 4944 mutex_enter(&ill->ill_lock); 4945 ill->ill_refcnt++; 4946 ILL_TRACE_REF(ill); 4947 mutex_exit(&ill->ill_lock); 4948 } 4949 4950 void 4951 ill_refhold_locked(ill_t *ill) 4952 { 4953 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4954 ill->ill_refcnt++; 4955 ILL_TRACE_REF(ill); 4956 } 4957 4958 int 4959 ill_check_and_refhold(ill_t *ill) 4960 { 4961 mutex_enter(&ill->ill_lock); 4962 if (ILL_CAN_LOOKUP(ill)) { 4963 ill_refhold_locked(ill); 4964 mutex_exit(&ill->ill_lock); 4965 return (0); 4966 } 4967 mutex_exit(&ill->ill_lock); 4968 return (ILL_LOOKUP_FAILED); 4969 } 4970 4971 /* 4972 * Must not be called while holding any locks. Otherwise if this is 4973 * the last reference to be released, there is a chance of recursive mutex 4974 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4975 * to restart an ioctl. 4976 */ 4977 void 4978 ill_refrele(ill_t *ill) 4979 { 4980 mutex_enter(&ill->ill_lock); 4981 ASSERT(ill->ill_refcnt != 0); 4982 ill->ill_refcnt--; 4983 ILL_UNTRACE_REF(ill); 4984 if (ill->ill_refcnt != 0) { 4985 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4986 mutex_exit(&ill->ill_lock); 4987 return; 4988 } 4989 4990 /* Drops the ill_lock */ 4991 ipif_ill_refrele_tail(ill); 4992 } 4993 4994 /* 4995 * Obtain a weak reference count on the ill. This reference ensures the 4996 * ill won't be freed, but the ill may change any of its critical state 4997 * such as netmask, address etc. Returns an error if the ill has started 4998 * closing. 
4999 */ 5000 boolean_t 5001 ill_waiter_inc(ill_t *ill) 5002 { 5003 mutex_enter(&ill->ill_lock); 5004 if (ill->ill_state_flags & ILL_CONDEMNED) { 5005 mutex_exit(&ill->ill_lock); 5006 return (B_FALSE); 5007 } 5008 ill->ill_waiters++; 5009 mutex_exit(&ill->ill_lock); 5010 return (B_TRUE); 5011 } 5012 5013 void 5014 ill_waiter_dcr(ill_t *ill) 5015 { 5016 mutex_enter(&ill->ill_lock); 5017 ill->ill_waiters--; 5018 if (ill->ill_waiters == 0) 5019 cv_broadcast(&ill->ill_cv); 5020 mutex_exit(&ill->ill_lock); 5021 } 5022 5023 /* 5024 * Named Dispatch routine to produce a formatted report on all ILLs. 5025 * This report is accessed by using the ndd utility to "get" ND variable 5026 * "ip_ill_status". 5027 */ 5028 /* ARGSUSED */ 5029 int 5030 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5031 { 5032 ill_t *ill; 5033 ill_walk_context_t ctx; 5034 5035 (void) mi_mpprintf(mp, 5036 "ILL " MI_COL_HDRPAD_STR 5037 /* 01234567[89ABCDEF] */ 5038 "rq " MI_COL_HDRPAD_STR 5039 /* 01234567[89ABCDEF] */ 5040 "wq " MI_COL_HDRPAD_STR 5041 /* 01234567[89ABCDEF] */ 5042 "upcnt mxfrg err name"); 5043 /* 12345 12345 123 xxxxxxxx */ 5044 5045 rw_enter(&ill_g_lock, RW_READER); 5046 ill = ILL_START_WALK_ALL(&ctx); 5047 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5048 (void) mi_mpprintf(mp, 5049 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5050 "%05u %05u %03d %s", 5051 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5052 ill->ill_ipif_up_count, 5053 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5054 } 5055 rw_exit(&ill_g_lock); 5056 5057 return (0); 5058 } 5059 5060 /* 5061 * Named Dispatch routine to produce a formatted report on all IPIFs. 5062 * This report is accessed by using the ndd utility to "get" ND variable 5063 * "ip_ipif_status". 
5064 */ 5065 /* ARGSUSED */ 5066 int 5067 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5068 { 5069 char buf1[INET6_ADDRSTRLEN]; 5070 char buf2[INET6_ADDRSTRLEN]; 5071 char buf3[INET6_ADDRSTRLEN]; 5072 char buf4[INET6_ADDRSTRLEN]; 5073 char buf5[INET6_ADDRSTRLEN]; 5074 char buf6[INET6_ADDRSTRLEN]; 5075 char buf[LIFNAMSIZ]; 5076 ill_t *ill; 5077 ipif_t *ipif; 5078 nv_t *nvp; 5079 uint64_t flags; 5080 zoneid_t zoneid; 5081 ill_walk_context_t ctx; 5082 5083 (void) mi_mpprintf(mp, 5084 "IPIF metric mtu in/out/forward name zone flags...\n" 5085 "\tlocal address\n" 5086 "\tsrc address\n" 5087 "\tsubnet\n" 5088 "\tmask\n" 5089 "\tbroadcast\n" 5090 "\tp-p-dst"); 5091 5092 ASSERT(q->q_next == NULL); 5093 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5094 5095 rw_enter(&ill_g_lock, RW_READER); 5096 ill = ILL_START_WALK_ALL(&ctx); 5097 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5098 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 5099 if (zoneid != GLOBAL_ZONEID && 5100 zoneid != ipif->ipif_zoneid && 5101 ipif->ipif_zoneid != ALL_ZONES) 5102 continue; 5103 (void) mi_mpprintf(mp, 5104 MI_COL_PTRFMT_STR 5105 "%04u %05u %u/%u/%u %s %d", 5106 (void *)ipif, 5107 ipif->ipif_metric, ipif->ipif_mtu, 5108 ipif->ipif_ib_pkt_count, 5109 ipif->ipif_ob_pkt_count, 5110 ipif->ipif_fo_pkt_count, 5111 ipif_get_name(ipif, buf, sizeof (buf)), 5112 ipif->ipif_zoneid); 5113 5114 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5115 ipif->ipif_ill->ill_phyint->phyint_flags; 5116 5117 /* Tack on text strings for any flags. 
*/ 5118 nvp = ipif_nv_tbl; 5119 for (; nvp < A_END(ipif_nv_tbl); nvp++) { 5120 if (nvp->nv_value & flags) 5121 (void) mi_mpprintf_nr(mp, " %s", 5122 nvp->nv_name); 5123 } 5124 (void) mi_mpprintf(mp, 5125 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s", 5126 inet_ntop(AF_INET6, 5127 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)), 5128 inet_ntop(AF_INET6, 5129 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)), 5130 inet_ntop(AF_INET6, 5131 &ipif->ipif_v6subnet, buf3, sizeof (buf3)), 5132 inet_ntop(AF_INET6, 5133 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)), 5134 inet_ntop(AF_INET6, 5135 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)), 5136 inet_ntop(AF_INET6, 5137 &ipif->ipif_v6pp_dst_addr, 5138 buf6, sizeof (buf6))); 5139 } 5140 } 5141 rw_exit(&ill_g_lock); 5142 return (0); 5143 } 5144 5145 /* 5146 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5147 * driver. We construct best guess defaults for lower level information that 5148 * we need. If an interface is brought up without injection of any overriding 5149 * information from outside, we have to be ready to go with these defaults. 5150 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5151 * we primarely want the dl_provider_style. 5152 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5153 * at which point we assume the other part of the information is valid. 5154 */ 5155 void 5156 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5157 { 5158 uchar_t *brdcst_addr; 5159 uint_t brdcst_addr_length, phys_addr_length; 5160 t_scalar_t sap_length; 5161 dl_info_ack_t *dlia; 5162 ip_m_t *ipm; 5163 dl_qos_cl_sel1_t *sel1; 5164 5165 ASSERT(IAM_WRITER_ILL(ill)); 5166 5167 /* 5168 * Till the ill is fully up ILL_CHANGING will be set and 5169 * the ill is not globally visible. So no need for a lock. 
5170 */ 5171 dlia = (dl_info_ack_t *)mp->b_rptr; 5172 ill->ill_mactype = dlia->dl_mac_type; 5173 5174 ipm = ip_m_lookup(dlia->dl_mac_type); 5175 if (ipm == NULL) { 5176 ipm = ip_m_lookup(DL_OTHER); 5177 ASSERT(ipm != NULL); 5178 } 5179 ill->ill_media = ipm; 5180 5181 /* 5182 * When the new DLPI stuff is ready we'll pull lengths 5183 * from dlia. 5184 */ 5185 if (dlia->dl_version == DL_VERSION_2) { 5186 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5187 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5188 brdcst_addr_length); 5189 if (brdcst_addr == NULL) { 5190 brdcst_addr_length = 0; 5191 } 5192 sap_length = dlia->dl_sap_length; 5193 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5194 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5195 brdcst_addr_length, sap_length, phys_addr_length)); 5196 } else { 5197 brdcst_addr_length = 6; 5198 brdcst_addr = ip_six_byte_all_ones; 5199 sap_length = -2; 5200 phys_addr_length = brdcst_addr_length; 5201 } 5202 5203 ill->ill_bcast_addr_length = brdcst_addr_length; 5204 ill->ill_phys_addr_length = phys_addr_length; 5205 ill->ill_sap_length = sap_length; 5206 ill->ill_max_frag = dlia->dl_max_sdu; 5207 ill->ill_max_mtu = ill->ill_max_frag; 5208 5209 ill->ill_type = ipm->ip_m_type; 5210 5211 if (!ill->ill_dlpi_style_set) { 5212 if (dlia->dl_provider_style == DL_STYLE2) 5213 ill->ill_needs_attach = 1; 5214 5215 /* 5216 * Allocate the first ipif on this ill. We don't delay it 5217 * further as ioctl handling assumes atleast one ipif to 5218 * be present. 5219 * 5220 * At this point we don't know whether the ill is v4 or v6. 5221 * We will know this whan the SIOCSLIFNAME happens and 5222 * the correct value for ill_isv6 will be assigned in 5223 * ipif_set_values(). We need to hold the ill lock and 5224 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5225 * the wakeup. 5226 */ 5227 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5228 dlia->dl_provider_style == DL_STYLE2 ? 
B_FALSE : B_TRUE); 5229 mutex_enter(&ill->ill_lock); 5230 ASSERT(ill->ill_dlpi_style_set == 0); 5231 ill->ill_dlpi_style_set = 1; 5232 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5233 cv_broadcast(&ill->ill_cv); 5234 mutex_exit(&ill->ill_lock); 5235 freemsg(mp); 5236 return; 5237 } 5238 ASSERT(ill->ill_ipif != NULL); 5239 /* 5240 * We know whether it is IPv4 or IPv6 now, as this is the 5241 * second DL_INFO_ACK we are recieving in response to the 5242 * DL_INFO_REQ sent in ipif_set_values. 5243 */ 5244 if (ill->ill_isv6) 5245 ill->ill_sap = IP6_DL_SAP; 5246 else 5247 ill->ill_sap = IP_DL_SAP; 5248 /* 5249 * Set ipif_mtu which is used to set the IRE's 5250 * ire_max_frag value. The driver could have sent 5251 * a different mtu from what it sent last time. No 5252 * need to call ipif_mtu_change because IREs have 5253 * not yet been created. 5254 */ 5255 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5256 /* 5257 * Clear all the flags that were set based on ill_bcast_addr_length 5258 * and ill_phys_addr_length (in ipif_set_values) as these could have 5259 * changed now and we need to re-evaluate. 5260 */ 5261 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5262 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5263 5264 /* 5265 * Free ill_resolver_mp and ill_bcast_mp as things could have 5266 * changed now. 
5267 */ 5268 if (ill->ill_bcast_addr_length == 0) { 5269 if (ill->ill_resolver_mp != NULL) 5270 freemsg(ill->ill_resolver_mp); 5271 if (ill->ill_bcast_mp != NULL) 5272 freemsg(ill->ill_bcast_mp); 5273 if (ill->ill_flags & ILLF_XRESOLV) 5274 ill->ill_net_type = IRE_IF_RESOLVER; 5275 else 5276 ill->ill_net_type = IRE_IF_NORESOLVER; 5277 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5278 ill->ill_phys_addr_length, 5279 ill->ill_sap, 5280 ill->ill_sap_length); 5281 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5282 5283 if (ill->ill_isv6) 5284 /* 5285 * Note: xresolv interfaces will eventually need NOARP 5286 * set here as well, but that will require those 5287 * external resolvers to have some knowledge of 5288 * that flag and act appropriately. Not to be changed 5289 * at present. 5290 */ 5291 ill->ill_flags |= ILLF_NONUD; 5292 else 5293 ill->ill_flags |= ILLF_NOARP; 5294 5295 if (ill->ill_phys_addr_length == 0) { 5296 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 5297 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5298 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL; 5299 } else { 5300 /* pt-pt supports multicast. */ 5301 ill->ill_flags |= ILLF_MULTICAST; 5302 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5303 } 5304 } 5305 } else { 5306 ill->ill_net_type = IRE_IF_RESOLVER; 5307 if (ill->ill_bcast_mp != NULL) 5308 freemsg(ill->ill_bcast_mp); 5309 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5310 ill->ill_bcast_addr_length, ill->ill_sap, 5311 ill->ill_sap_length); 5312 /* 5313 * Later detect lack of DLPI driver multicast 5314 * capability by catching DL_ENABMULTI errors in 5315 * ip_rput_dlpi. 5316 */ 5317 ill->ill_flags |= ILLF_MULTICAST; 5318 if (!ill->ill_isv6) 5319 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5320 } 5321 /* By default an interface does not support any CoS marking */ 5322 ill->ill_flags &= ~ILLF_COS_ENABLED; 5323 5324 /* 5325 * If we get QoS information in DL_INFO_ACK, the device supports 5326 * some form of CoS marking, set ILLF_COS_ENABLED. 
5327 */ 5328 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5329 dlia->dl_qos_length); 5330 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5331 ill->ill_flags |= ILLF_COS_ENABLED; 5332 } 5333 5334 /* Clear any previous error indication. */ 5335 ill->ill_error = 0; 5336 freemsg(mp); 5337 } 5338 5339 /* 5340 * Perform various checks to verify that an address would make sense as a 5341 * local, remote, or subnet interface address. 5342 */ 5343 static boolean_t 5344 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5345 { 5346 ipaddr_t net_mask; 5347 5348 /* 5349 * Don't allow all zeroes, all ones or experimental address, but allow 5350 * all ones netmask. 5351 */ 5352 if ((net_mask = ip_net_mask(addr)) == 0) 5353 return (B_FALSE); 5354 /* A given netmask overrides the "guess" netmask */ 5355 if (subnet_mask != 0) 5356 net_mask = subnet_mask; 5357 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5358 (addr == (addr | ~net_mask)))) { 5359 return (B_FALSE); 5360 } 5361 if (CLASSD(addr)) 5362 return (B_FALSE); 5363 5364 return (B_TRUE); 5365 } 5366 5367 /* 5368 * ipif_lookup_group 5369 * Returns held ipif 5370 */ 5371 ipif_t * 5372 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid) 5373 { 5374 ire_t *ire; 5375 ipif_t *ipif; 5376 5377 ire = ire_lookup_multi(group, zoneid); 5378 if (ire == NULL) 5379 return (NULL); 5380 ipif = ire->ire_ipif; 5381 ipif_refhold(ipif); 5382 ire_refrele(ire); 5383 return (ipif); 5384 } 5385 5386 /* 5387 * Look for an ipif with the specified interface address and destination. 5388 * The destination address is used only for matching point-to-point interfaces. 
5389 */ 5390 ipif_t * 5391 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5392 ipsq_func_t func, int *error) 5393 { 5394 ipif_t *ipif; 5395 ill_t *ill; 5396 ill_walk_context_t ctx; 5397 ipsq_t *ipsq; 5398 5399 if (error != NULL) 5400 *error = 0; 5401 5402 /* 5403 * First match all the point-to-point interfaces 5404 * before looking at non-point-to-point interfaces. 5405 * This is done to avoid returning non-point-to-point 5406 * ipif instead of unnumbered point-to-point ipif. 5407 */ 5408 rw_enter(&ill_g_lock, RW_READER); 5409 ill = ILL_START_WALK_V4(&ctx); 5410 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5411 GRAB_CONN_LOCK(q); 5412 mutex_enter(&ill->ill_lock); 5413 for (ipif = ill->ill_ipif; ipif != NULL; 5414 ipif = ipif->ipif_next) { 5415 /* Allow the ipif to be down */ 5416 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5417 (ipif->ipif_lcl_addr == if_addr) && 5418 (ipif->ipif_pp_dst_addr == dst)) { 5419 /* 5420 * The block comment at the start of ipif_down 5421 * explains the use of the macros used below 5422 */ 5423 if (IPIF_CAN_LOOKUP(ipif)) { 5424 ipif_refhold_locked(ipif); 5425 mutex_exit(&ill->ill_lock); 5426 RELEASE_CONN_LOCK(q); 5427 rw_exit(&ill_g_lock); 5428 return (ipif); 5429 } else if (IPIF_CAN_WAIT(ipif, q)) { 5430 ipsq = ill->ill_phyint->phyint_ipsq; 5431 mutex_enter(&ipsq->ipsq_lock); 5432 mutex_exit(&ill->ill_lock); 5433 rw_exit(&ill_g_lock); 5434 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5435 ill); 5436 mutex_exit(&ipsq->ipsq_lock); 5437 RELEASE_CONN_LOCK(q); 5438 *error = EINPROGRESS; 5439 return (NULL); 5440 } 5441 } 5442 } 5443 mutex_exit(&ill->ill_lock); 5444 RELEASE_CONN_LOCK(q); 5445 } 5446 rw_exit(&ill_g_lock); 5447 5448 /* lookup the ipif based on interface address */ 5449 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error); 5450 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5451 return (ipif); 5452 } 5453 5454 /* 5455 * Look for an ipif with the specified address. 
For point-point links 5456 * we look for matches on either the destination address and the local 5457 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5458 * is set. 5459 * Matches on a specific ill if match_ill is set. 5460 */ 5461 ipif_t * 5462 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 5463 mblk_t *mp, ipsq_func_t func, int *error) 5464 { 5465 ipif_t *ipif; 5466 ill_t *ill; 5467 boolean_t ptp = B_FALSE; 5468 ipsq_t *ipsq; 5469 ill_walk_context_t ctx; 5470 5471 if (error != NULL) 5472 *error = 0; 5473 5474 rw_enter(&ill_g_lock, RW_READER); 5475 /* 5476 * Repeat twice, first based on local addresses and 5477 * next time for pointopoint. 5478 */ 5479 repeat: 5480 ill = ILL_START_WALK_V4(&ctx); 5481 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5482 if (match_ill != NULL && ill != match_ill) { 5483 continue; 5484 } 5485 GRAB_CONN_LOCK(q); 5486 mutex_enter(&ill->ill_lock); 5487 for (ipif = ill->ill_ipif; ipif != NULL; 5488 ipif = ipif->ipif_next) { 5489 if (zoneid != ALL_ZONES && 5490 zoneid != ipif->ipif_zoneid && 5491 ipif->ipif_zoneid != ALL_ZONES) 5492 continue; 5493 /* Allow the ipif to be down */ 5494 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5495 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5496 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5497 (ipif->ipif_pp_dst_addr == addr))) { 5498 /* 5499 * The block comment at the start of ipif_down 5500 * explains the use of the macros used below 5501 */ 5502 if (IPIF_CAN_LOOKUP(ipif)) { 5503 ipif_refhold_locked(ipif); 5504 mutex_exit(&ill->ill_lock); 5505 RELEASE_CONN_LOCK(q); 5506 rw_exit(&ill_g_lock); 5507 return (ipif); 5508 } else if (IPIF_CAN_WAIT(ipif, q)) { 5509 ipsq = ill->ill_phyint->phyint_ipsq; 5510 mutex_enter(&ipsq->ipsq_lock); 5511 mutex_exit(&ill->ill_lock); 5512 rw_exit(&ill_g_lock); 5513 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5514 ill); 5515 mutex_exit(&ipsq->ipsq_lock); 5516 RELEASE_CONN_LOCK(q); 5517 *error = EINPROGRESS; 5518 
return (NULL); 5519 } 5520 } 5521 } 5522 mutex_exit(&ill->ill_lock); 5523 RELEASE_CONN_LOCK(q); 5524 } 5525 5526 /* Now try the ptp case */ 5527 if (ptp) { 5528 rw_exit(&ill_g_lock); 5529 if (error != NULL) 5530 *error = ENXIO; 5531 return (NULL); 5532 } 5533 ptp = B_TRUE; 5534 goto repeat; 5535 } 5536 5537 /* 5538 * Look for an ipif that matches the specified remote address i.e. the 5539 * ipif that would receive the specified packet. 5540 * First look for directly connected interfaces and then do a recursive 5541 * IRE lookup and pick the first ipif corresponding to the source address in the 5542 * ire. 5543 * Returns: held ipif 5544 */ 5545 ipif_t * 5546 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 5547 { 5548 ipif_t *ipif; 5549 ire_t *ire; 5550 5551 ASSERT(!ill->ill_isv6); 5552 5553 /* 5554 * Someone could be changing this ipif currently or change it 5555 * after we return this. Thus a few packets could use the old 5556 * old values. However structure updates/creates (ire, ilg, ilm etc) 5557 * will atomically be updated or cleaned up with the new value 5558 * Thus we don't need a lock to check the flags or other attrs below. 
5559 */ 5560 mutex_enter(&ill->ill_lock); 5561 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5562 if (!IPIF_CAN_LOOKUP(ipif)) 5563 continue; 5564 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 5565 ipif->ipif_zoneid != ALL_ZONES) 5566 continue; 5567 /* Allow the ipif to be down */ 5568 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 5569 if ((ipif->ipif_pp_dst_addr == addr) || 5570 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 5571 ipif->ipif_lcl_addr == addr)) { 5572 ipif_refhold_locked(ipif); 5573 mutex_exit(&ill->ill_lock); 5574 return (ipif); 5575 } 5576 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 5577 ipif_refhold_locked(ipif); 5578 mutex_exit(&ill->ill_lock); 5579 return (ipif); 5580 } 5581 } 5582 mutex_exit(&ill->ill_lock); 5583 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 5584 NULL, MATCH_IRE_RECURSIVE); 5585 if (ire != NULL) { 5586 /* 5587 * The callers of this function wants to know the 5588 * interface on which they have to send the replies 5589 * back. For IRE_CACHES that have ire_stq and ire_ipif 5590 * derived from different ills, we really don't care 5591 * what we return here. 5592 */ 5593 ipif = ire->ire_ipif; 5594 if (ipif != NULL) { 5595 ipif_refhold(ipif); 5596 ire_refrele(ire); 5597 return (ipif); 5598 } 5599 ire_refrele(ire); 5600 } 5601 /* Pick the first interface */ 5602 ipif = ipif_get_next_ipif(NULL, ill); 5603 return (ipif); 5604 } 5605 5606 /* 5607 * This func does not prevent refcnt from increasing. 
But if 5608 * the caller has taken steps to that effect, then this func 5609 * can be used to determine whether the ill has become quiescent 5610 */ 5611 boolean_t 5612 ill_is_quiescent(ill_t *ill) 5613 { 5614 ipif_t *ipif; 5615 5616 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5617 5618 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5619 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5620 return (B_FALSE); 5621 } 5622 } 5623 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 5624 ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 || 5625 ill->ill_mrtun_refcnt != 0) { 5626 return (B_FALSE); 5627 } 5628 return (B_TRUE); 5629 } 5630 5631 /* 5632 * This func does not prevent refcnt from increasing. But if 5633 * the caller has taken steps to that effect, then this func 5634 * can be used to determine whether the ipif has become quiescent 5635 */ 5636 static boolean_t 5637 ipif_is_quiescent(ipif_t *ipif) 5638 { 5639 ill_t *ill; 5640 5641 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5642 5643 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5644 return (B_FALSE); 5645 } 5646 5647 ill = ipif->ipif_ill; 5648 if (ill->ill_ipif_up_count != 0 || ill->ill_logical_down) 5649 return (B_TRUE); 5650 5651 /* This is the last ipif going down or being deleted on this ill */ 5652 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 5653 return (B_FALSE); 5654 } 5655 5656 return (B_TRUE); 5657 } 5658 5659 /* 5660 * This func does not prevent refcnt from increasing. But if 5661 * the caller has taken steps to that effect, then this func 5662 * can be used to determine whether the ipifs marked with IPIF_MOVING 5663 * have become quiescent and can be moved in a failover/failback. 
5664 */ 5665 static ipif_t * 5666 ill_quiescent_to_move(ill_t *ill) 5667 { 5668 ipif_t *ipif; 5669 5670 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5671 5672 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5673 if (ipif->ipif_state_flags & IPIF_MOVING) { 5674 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5675 return (ipif); 5676 } 5677 } 5678 } 5679 return (NULL); 5680 } 5681 5682 /* 5683 * The ipif/ill/ire has been refreled. Do the tail processing. 5684 * Determine if the ipif or ill in question has become quiescent and if so 5685 * wakeup close and/or restart any queued pending ioctl that is waiting 5686 * for the ipif_down (or ill_down) 5687 */ 5688 void 5689 ipif_ill_refrele_tail(ill_t *ill) 5690 { 5691 mblk_t *mp; 5692 conn_t *connp; 5693 ipsq_t *ipsq; 5694 ipif_t *ipif; 5695 5696 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5697 5698 if ((ill->ill_state_flags & ILL_CONDEMNED) && 5699 ill_is_quiescent(ill)) { 5700 /* ill_close may be waiting */ 5701 cv_broadcast(&ill->ill_cv); 5702 } 5703 5704 /* ipsq can't change because ill_lock is held */ 5705 ipsq = ill->ill_phyint->phyint_ipsq; 5706 if (ipsq->ipsq_waitfor == 0) { 5707 /* Not waiting for anything, just return. */ 5708 mutex_exit(&ill->ill_lock); 5709 return; 5710 } 5711 ASSERT(ipsq->ipsq_pending_mp != NULL && 5712 ipsq->ipsq_pending_ipif != NULL); 5713 /* 5714 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 5715 * Last ipif going down needs to down the ill, so ill_ire_cnt must 5716 * be zero for restarting an ioctl that ends up downing the ill. 5717 */ 5718 ipif = ipsq->ipsq_pending_ipif; 5719 if (ipif->ipif_ill != ill) { 5720 /* The ioctl is pending on some other ill. 
*/ 5721 mutex_exit(&ill->ill_lock); 5722 return; 5723 } 5724 5725 switch (ipsq->ipsq_waitfor) { 5726 case IPIF_DOWN: 5727 case IPIF_FREE: 5728 if (!ipif_is_quiescent(ipif)) { 5729 mutex_exit(&ill->ill_lock); 5730 return; 5731 } 5732 break; 5733 5734 case ILL_DOWN: 5735 case ILL_FREE: 5736 /* 5737 * case ILL_FREE arises only for loopback. otherwise ill_delete 5738 * waits synchronously in ip_close, and no message is queued in 5739 * ipsq_pending_mp at all in this case 5740 */ 5741 if (!ill_is_quiescent(ill)) { 5742 mutex_exit(&ill->ill_lock); 5743 return; 5744 } 5745 5746 break; 5747 5748 case ILL_MOVE_OK: 5749 if (ill_quiescent_to_move(ill) != NULL) { 5750 mutex_exit(&ill->ill_lock); 5751 return; 5752 } 5753 5754 break; 5755 default: 5756 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", 5757 (void *)ipsq, ipsq->ipsq_waitfor); 5758 } 5759 5760 /* 5761 * Incr refcnt for the qwriter_ip call below which 5762 * does a refrele 5763 */ 5764 ill_refhold_locked(ill); 5765 mutex_exit(&ill->ill_lock); 5766 5767 mp = ipsq_pending_mp_get(ipsq, &connp); 5768 ASSERT(mp != NULL); 5769 5770 switch (mp->b_datap->db_type) { 5771 case M_ERROR: 5772 case M_HANGUP: 5773 (void) qwriter_ip(NULL, ill, ill->ill_rq, mp, 5774 ipif_all_down_tail, CUR_OP, B_TRUE); 5775 return; 5776 5777 case M_IOCTL: 5778 case M_IOCDATA: 5779 (void) qwriter_ip(NULL, ill, 5780 (connp != NULL ? 
CONNP_TO_WQ(connp) : ill->ill_wq), mp, 5781 ip_reprocess_ioctl, CUR_OP, B_TRUE); 5782 return; 5783 5784 default: 5785 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5786 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5787 } 5788 } 5789 5790 #ifdef ILL_DEBUG 5791 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5792 void 5793 th_trace_rrecord(th_trace_t *th_trace) 5794 { 5795 tr_buf_t *tr_buf; 5796 uint_t lastref; 5797 5798 lastref = th_trace->th_trace_lastref; 5799 lastref++; 5800 if (lastref == TR_BUF_MAX) 5801 lastref = 0; 5802 th_trace->th_trace_lastref = lastref; 5803 tr_buf = &th_trace->th_trbuf[lastref]; 5804 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH); 5805 } 5806 5807 th_trace_t * 5808 th_trace_ipif_lookup(ipif_t *ipif) 5809 { 5810 int bucket_id; 5811 th_trace_t *th_trace; 5812 5813 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5814 5815 bucket_id = IP_TR_HASH(curthread); 5816 ASSERT(bucket_id < IP_TR_HASH_MAX); 5817 5818 for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL; 5819 th_trace = th_trace->th_next) { 5820 if (th_trace->th_id == curthread) 5821 return (th_trace); 5822 } 5823 return (NULL); 5824 } 5825 5826 void 5827 ipif_trace_ref(ipif_t *ipif) 5828 { 5829 int bucket_id; 5830 th_trace_t *th_trace; 5831 5832 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5833 5834 if (ipif->ipif_trace_disable) 5835 return; 5836 5837 /* 5838 * Attempt to locate the trace buffer for the curthread. 
5839 * If it does not exist, then allocate a new trace buffer 5840 * and link it in list of trace bufs for this ipif, at the head 5841 */ 5842 th_trace = th_trace_ipif_lookup(ipif); 5843 if (th_trace == NULL) { 5844 bucket_id = IP_TR_HASH(curthread); 5845 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 5846 KM_NOSLEEP); 5847 if (th_trace == NULL) { 5848 ipif->ipif_trace_disable = B_TRUE; 5849 ipif_trace_cleanup(ipif); 5850 return; 5851 } 5852 th_trace->th_id = curthread; 5853 th_trace->th_next = ipif->ipif_trace[bucket_id]; 5854 th_trace->th_prev = &ipif->ipif_trace[bucket_id]; 5855 if (th_trace->th_next != NULL) 5856 th_trace->th_next->th_prev = &th_trace->th_next; 5857 ipif->ipif_trace[bucket_id] = th_trace; 5858 } 5859 ASSERT(th_trace->th_refcnt >= 0 && 5860 th_trace->th_refcnt < TR_BUF_MAX -1); 5861 th_trace->th_refcnt++; 5862 th_trace_rrecord(th_trace); 5863 } 5864 5865 void 5866 ipif_untrace_ref(ipif_t *ipif) 5867 { 5868 th_trace_t *th_trace; 5869 5870 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5871 5872 if (ipif->ipif_trace_disable) 5873 return; 5874 th_trace = th_trace_ipif_lookup(ipif); 5875 ASSERT(th_trace != NULL); 5876 ASSERT(th_trace->th_refcnt > 0); 5877 5878 th_trace->th_refcnt--; 5879 th_trace_rrecord(th_trace); 5880 } 5881 5882 th_trace_t * 5883 th_trace_ill_lookup(ill_t *ill) 5884 { 5885 th_trace_t *th_trace; 5886 int bucket_id; 5887 5888 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5889 5890 bucket_id = IP_TR_HASH(curthread); 5891 ASSERT(bucket_id < IP_TR_HASH_MAX); 5892 5893 for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL; 5894 th_trace = th_trace->th_next) { 5895 if (th_trace->th_id == curthread) 5896 return (th_trace); 5897 } 5898 return (NULL); 5899 } 5900 5901 void 5902 ill_trace_ref(ill_t *ill) 5903 { 5904 int bucket_id; 5905 th_trace_t *th_trace; 5906 5907 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5908 if (ill->ill_trace_disable) 5909 return; 5910 /* 5911 * Attempt to locate the trace buffer for the curthread. 
5912 * If it does not exist, then allocate a new trace buffer 5913 * and link it in list of trace bufs for this ill, at the head 5914 */ 5915 th_trace = th_trace_ill_lookup(ill); 5916 if (th_trace == NULL) { 5917 bucket_id = IP_TR_HASH(curthread); 5918 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 5919 KM_NOSLEEP); 5920 if (th_trace == NULL) { 5921 ill->ill_trace_disable = B_TRUE; 5922 ill_trace_cleanup(ill); 5923 return; 5924 } 5925 th_trace->th_id = curthread; 5926 th_trace->th_next = ill->ill_trace[bucket_id]; 5927 th_trace->th_prev = &ill->ill_trace[bucket_id]; 5928 if (th_trace->th_next != NULL) 5929 th_trace->th_next->th_prev = &th_trace->th_next; 5930 ill->ill_trace[bucket_id] = th_trace; 5931 } 5932 ASSERT(th_trace->th_refcnt >= 0 && 5933 th_trace->th_refcnt < TR_BUF_MAX - 1); 5934 5935 th_trace->th_refcnt++; 5936 th_trace_rrecord(th_trace); 5937 } 5938 5939 void 5940 ill_untrace_ref(ill_t *ill) 5941 { 5942 th_trace_t *th_trace; 5943 5944 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5945 5946 if (ill->ill_trace_disable) 5947 return; 5948 th_trace = th_trace_ill_lookup(ill); 5949 ASSERT(th_trace != NULL); 5950 ASSERT(th_trace->th_refcnt > 0); 5951 5952 th_trace->th_refcnt--; 5953 th_trace_rrecord(th_trace); 5954 } 5955 5956 /* 5957 * Verify that this thread has no refs to the ipif and free 5958 * the trace buffers 5959 */ 5960 /* ARGSUSED */ 5961 void 5962 ipif_thread_exit(ipif_t *ipif, void *dummy) 5963 { 5964 th_trace_t *th_trace; 5965 5966 mutex_enter(&ipif->ipif_ill->ill_lock); 5967 5968 th_trace = th_trace_ipif_lookup(ipif); 5969 if (th_trace == NULL) { 5970 mutex_exit(&ipif->ipif_ill->ill_lock); 5971 return; 5972 } 5973 ASSERT(th_trace->th_refcnt == 0); 5974 /* unlink th_trace and free it */ 5975 *th_trace->th_prev = th_trace->th_next; 5976 if (th_trace->th_next != NULL) 5977 th_trace->th_next->th_prev = th_trace->th_prev; 5978 th_trace->th_next = NULL; 5979 th_trace->th_prev = NULL; 5980 kmem_free(th_trace, sizeof (th_trace_t)); 5981 5982 
mutex_exit(&ipif->ipif_ill->ill_lock); 5983 } 5984 5985 /* 5986 * Verify that this thread has no refs to the ill and free 5987 * the trace buffers 5988 */ 5989 /* ARGSUSED */ 5990 void 5991 ill_thread_exit(ill_t *ill, void *dummy) 5992 { 5993 th_trace_t *th_trace; 5994 5995 mutex_enter(&ill->ill_lock); 5996 5997 th_trace = th_trace_ill_lookup(ill); 5998 if (th_trace == NULL) { 5999 mutex_exit(&ill->ill_lock); 6000 return; 6001 } 6002 ASSERT(th_trace->th_refcnt == 0); 6003 /* unlink th_trace and free it */ 6004 *th_trace->th_prev = th_trace->th_next; 6005 if (th_trace->th_next != NULL) 6006 th_trace->th_next->th_prev = th_trace->th_prev; 6007 th_trace->th_next = NULL; 6008 th_trace->th_prev = NULL; 6009 kmem_free(th_trace, sizeof (th_trace_t)); 6010 6011 mutex_exit(&ill->ill_lock); 6012 } 6013 #endif 6014 6015 #ifdef ILL_DEBUG 6016 void 6017 ip_thread_exit(void) 6018 { 6019 ill_t *ill; 6020 ipif_t *ipif; 6021 ill_walk_context_t ctx; 6022 6023 rw_enter(&ill_g_lock, RW_READER); 6024 ill = ILL_START_WALK_ALL(&ctx); 6025 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6026 for (ipif = ill->ill_ipif; ipif != NULL; 6027 ipif = ipif->ipif_next) { 6028 ipif_thread_exit(ipif, NULL); 6029 } 6030 ill_thread_exit(ill, NULL); 6031 } 6032 rw_exit(&ill_g_lock); 6033 6034 ire_walk(ire_thread_exit, NULL); 6035 ndp_walk_common(&ndp4, NULL, nce_thread_exit, NULL, B_FALSE); 6036 ndp_walk_common(&ndp6, NULL, nce_thread_exit, NULL, B_FALSE); 6037 } 6038 6039 /* 6040 * Called when ipif is unplumbed or when memory alloc fails 6041 */ 6042 void 6043 ipif_trace_cleanup(ipif_t *ipif) 6044 { 6045 int i; 6046 th_trace_t *th_trace; 6047 th_trace_t *th_trace_next; 6048 6049 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6050 for (th_trace = ipif->ipif_trace[i]; th_trace != NULL; 6051 th_trace = th_trace_next) { 6052 th_trace_next = th_trace->th_next; 6053 kmem_free(th_trace, sizeof (th_trace_t)); 6054 } 6055 ipif->ipif_trace[i] = NULL; 6056 } 6057 } 6058 6059 /* 6060 * Called when ill is unplumbed or 
when memory alloc fails 6061 */ 6062 void 6063 ill_trace_cleanup(ill_t *ill) 6064 { 6065 int i; 6066 th_trace_t *th_trace; 6067 th_trace_t *th_trace_next; 6068 6069 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6070 for (th_trace = ill->ill_trace[i]; th_trace != NULL; 6071 th_trace = th_trace_next) { 6072 th_trace_next = th_trace->th_next; 6073 kmem_free(th_trace, sizeof (th_trace_t)); 6074 } 6075 ill->ill_trace[i] = NULL; 6076 } 6077 } 6078 6079 #else 6080 void ip_thread_exit(void) {} 6081 #endif 6082 6083 void 6084 ipif_refhold_locked(ipif_t *ipif) 6085 { 6086 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6087 ipif->ipif_refcnt++; 6088 IPIF_TRACE_REF(ipif); 6089 } 6090 6091 void 6092 ipif_refhold(ipif_t *ipif) 6093 { 6094 ill_t *ill; 6095 6096 ill = ipif->ipif_ill; 6097 mutex_enter(&ill->ill_lock); 6098 ipif->ipif_refcnt++; 6099 IPIF_TRACE_REF(ipif); 6100 mutex_exit(&ill->ill_lock); 6101 } 6102 6103 /* 6104 * Must not be called while holding any locks. Otherwise if this is 6105 * the last reference to be released there is a chance of recursive mutex 6106 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6107 * to restart an ioctl. 6108 */ 6109 void 6110 ipif_refrele(ipif_t *ipif) 6111 { 6112 ill_t *ill; 6113 6114 ill = ipif->ipif_ill; 6115 6116 mutex_enter(&ill->ill_lock); 6117 ASSERT(ipif->ipif_refcnt != 0); 6118 ipif->ipif_refcnt--; 6119 IPIF_UNTRACE_REF(ipif); 6120 if (ipif->ipif_refcnt != 0) { 6121 mutex_exit(&ill->ill_lock); 6122 return; 6123 } 6124 6125 /* Drops the ill_lock */ 6126 ipif_ill_refrele_tail(ill); 6127 } 6128 6129 ipif_t * 6130 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6131 { 6132 ipif_t *ipif; 6133 6134 mutex_enter(&ill->ill_lock); 6135 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 6136 ipif != NULL; ipif = ipif->ipif_next) { 6137 if (!IPIF_CAN_LOOKUP(ipif)) 6138 continue; 6139 ipif_refhold_locked(ipif); 6140 mutex_exit(&ill->ill_lock); 6141 return (ipif); 6142 } 6143 mutex_exit(&ill->ill_lock); 6144 return (NULL); 6145 } 6146 6147 /* 6148 * TODO: make this table extendible at run time 6149 * Return a pointer to the mac type info for 'mac_type' 6150 */ 6151 static ip_m_t * 6152 ip_m_lookup(t_uscalar_t mac_type) 6153 { 6154 ip_m_t *ipm; 6155 6156 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6157 if (ipm->ip_m_mac_type == mac_type) 6158 return (ipm); 6159 return (NULL); 6160 } 6161 6162 /* 6163 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6164 * ipif_arg is passed in to associate it with the correct interface. 6165 * We may need to restart this operation if the ipif cannot be looked up 6166 * due to an exclusive operation that is currently in progress. The restart 6167 * entry point is specified by 'func' 6168 */ 6169 int 6170 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6171 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6172 ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp, 6173 ipsq_func_t func, struct rtsa_s *sp) 6174 { 6175 ire_t *ire; 6176 ire_t *gw_ire = NULL; 6177 ipif_t *ipif = NULL; 6178 boolean_t ipif_refheld = B_FALSE; 6179 uint_t type; 6180 int match_flags = MATCH_IRE_TYPE; 6181 int error; 6182 tsol_gc_t *gc = NULL; 6183 tsol_gcgrp_t *gcgrp = NULL; 6184 boolean_t gcgrp_xtraref = B_FALSE; 6185 6186 ip1dbg(("ip_rt_add:")); 6187 6188 if (ire_arg != NULL) 6189 *ire_arg = NULL; 6190 6191 /* 6192 * If this is the case of RTF_HOST being set, then we set the netmask 6193 * to all ones (regardless if one was supplied). 
6194 */ 6195 if (flags & RTF_HOST) 6196 mask = IP_HOST_MASK; 6197 6198 /* 6199 * Prevent routes with a zero gateway from being created (since 6200 * interfaces can currently be plumbed and brought up no assigned 6201 * address). 6202 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0. 6203 */ 6204 if (gw_addr == 0 && src_ipif == NULL) 6205 return (ENETUNREACH); 6206 /* 6207 * Get the ipif, if any, corresponding to the gw_addr 6208 */ 6209 if (gw_addr != 0) { 6210 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, 6211 &error); 6212 if (ipif != NULL) { 6213 if (IS_VNI(ipif->ipif_ill)) { 6214 ipif_refrele(ipif); 6215 return (EINVAL); 6216 } 6217 ipif_refheld = B_TRUE; 6218 } else if (error == EINPROGRESS) { 6219 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6220 return (EINPROGRESS); 6221 } else { 6222 error = 0; 6223 } 6224 } 6225 6226 if (ipif != NULL) { 6227 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6228 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6229 } else { 6230 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6231 } 6232 6233 /* 6234 * GateD will attempt to create routes with a loopback interface 6235 * address as the gateway and with RTF_GATEWAY set. We allow 6236 * these routes to be added, but create them as interface routes 6237 * since the gateway is an interface address. 
6238 */ 6239 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6240 flags &= ~RTF_GATEWAY; 6241 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6242 mask == IP_HOST_MASK) { 6243 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6244 ALL_ZONES, NULL, match_flags); 6245 if (ire != NULL) { 6246 ire_refrele(ire); 6247 if (ipif_refheld) 6248 ipif_refrele(ipif); 6249 return (EEXIST); 6250 } 6251 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x" 6252 "for 0x%x\n", (void *)ipif, 6253 ipif->ipif_ire_type, 6254 ntohl(ipif->ipif_lcl_addr))); 6255 ire = ire_create( 6256 (uchar_t *)&dst_addr, /* dest address */ 6257 (uchar_t *)&mask, /* mask */ 6258 (uchar_t *)&ipif->ipif_src_addr, 6259 NULL, /* no gateway */ 6260 NULL, 6261 &ipif->ipif_mtu, 6262 NULL, 6263 ipif->ipif_rq, /* recv-from queue */ 6264 NULL, /* no send-to queue */ 6265 ipif->ipif_ire_type, /* LOOPBACK */ 6266 NULL, 6267 ipif, 6268 NULL, 6269 0, 6270 0, 6271 0, 6272 (ipif->ipif_flags & IPIF_PRIVATE) ? 6273 RTF_PRIVATE : 0, 6274 &ire_uinfo_null, 6275 NULL, 6276 NULL); 6277 6278 if (ire == NULL) { 6279 if (ipif_refheld) 6280 ipif_refrele(ipif); 6281 return (ENOMEM); 6282 } 6283 error = ire_add(&ire, q, mp, func, B_FALSE); 6284 if (error == 0) 6285 goto save_ire; 6286 if (ipif_refheld) 6287 ipif_refrele(ipif); 6288 return (error); 6289 6290 } 6291 } 6292 6293 /* 6294 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6295 * and the gateway address provided is one of the system's interface 6296 * addresses. By using the routing socket interface and supplying an 6297 * RTA_IFP sockaddr with an interface index, an alternate method of 6298 * specifying an interface route to be created is available which uses 6299 * the interface index that specifies the outgoing interface rather than 6300 * the address of an outgoing interface (which may not be able to 6301 * uniquely identify an interface). 
When coupled with the RTF_GATEWAY 6302 * flag, routes can be specified which not only specify the next-hop to 6303 * be used when routing to a certain prefix, but also which outgoing 6304 * interface should be used. 6305 * 6306 * Previously, interfaces would have unique addresses assigned to them 6307 * and so the address assigned to a particular interface could be used 6308 * to identify a particular interface. One exception to this was the 6309 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6310 * 6311 * With the advent of IPv6 and its link-local addresses, this 6312 * restriction was relaxed and interfaces could share addresses between 6313 * themselves. In fact, typically all of the link-local interfaces on 6314 * an IPv6 node or router will have the same link-local address. In 6315 * order to differentiate between these interfaces, the use of an 6316 * interface index is necessary and this index can be carried inside a 6317 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6318 * of using the interface index, however, is that all of the ipif's that 6319 * are part of an ill have the same index and so the RTA_IFP sockaddr 6320 * cannot be used to differentiate between ipif's (or logical 6321 * interfaces) that belong to the same ill (physical interface). 6322 * 6323 * For example, in the following case involving IPv4 interfaces and 6324 * logical interfaces 6325 * 6326 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6327 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6328 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6329 * 6330 * the ipif's corresponding to each of these interface routes can be 6331 * uniquely identified by the "gateway" (actually interface address). 
6332 * 6333 * In this case involving multiple IPv6 default routes to a particular 6334 * link-local gateway, the use of RTA_IFP is necessary to specify which 6335 * default route is of interest: 6336 * 6337 * default fe80::123:4567:89ab:cdef U if0 6338 * default fe80::123:4567:89ab:cdef U if1 6339 */ 6340 6341 /* RTF_GATEWAY not set */ 6342 if (!(flags & RTF_GATEWAY)) { 6343 queue_t *stq; 6344 queue_t *rfq = NULL; 6345 ill_t *in_ill = NULL; 6346 6347 if (sp != NULL) { 6348 ip2dbg(("ip_rt_add: gateway security attributes " 6349 "cannot be set with interface route\n")); 6350 if (ipif_refheld) 6351 ipif_refrele(ipif); 6352 return (EINVAL); 6353 } 6354 6355 /* 6356 * As the interface index specified with the RTA_IFP sockaddr is 6357 * the same for all ipif's off of an ill, the matching logic 6358 * below uses MATCH_IRE_ILL if such an index was specified. 6359 * This means that routes sharing the same prefix when added 6360 * using a RTA_IFP sockaddr must have distinct interface 6361 * indices (namely, they must be on distinct ill's). 6362 * 6363 * On the other hand, since the gateway address will usually be 6364 * different for each ipif on the system, the matching logic 6365 * uses MATCH_IRE_IPIF in the case of a traditional interface 6366 * route. This means that interface routes for the same prefix 6367 * can be created if they belong to distinct ipif's and if a 6368 * RTA_IFP sockaddr is not present. 
6369 */ 6370 if (ipif_arg != NULL) { 6371 if (ipif_refheld) { 6372 ipif_refrele(ipif); 6373 ipif_refheld = B_FALSE; 6374 } 6375 ipif = ipif_arg; 6376 match_flags |= MATCH_IRE_ILL; 6377 } else { 6378 /* 6379 * Check the ipif corresponding to the gw_addr 6380 */ 6381 if (ipif == NULL) 6382 return (ENETUNREACH); 6383 match_flags |= MATCH_IRE_IPIF; 6384 } 6385 ASSERT(ipif != NULL); 6386 /* 6387 * If src_ipif is not NULL, we have to create 6388 * an ire with non-null ire_in_ill value 6389 */ 6390 if (src_ipif != NULL) { 6391 in_ill = src_ipif->ipif_ill; 6392 } 6393 6394 /* 6395 * We check for an existing entry at this point. 6396 * 6397 * Since a netmask isn't passed in via the ioctl interface 6398 * (SIOCADDRT), we don't check for a matching netmask in that 6399 * case. 6400 */ 6401 if (!ioctl_msg) 6402 match_flags |= MATCH_IRE_MASK; 6403 if (src_ipif != NULL) { 6404 /* Look up in the special table */ 6405 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6406 ipif, src_ipif->ipif_ill, match_flags); 6407 } else { 6408 ire = ire_ftable_lookup(dst_addr, mask, 0, 6409 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6410 NULL, match_flags); 6411 } 6412 if (ire != NULL) { 6413 ire_refrele(ire); 6414 if (ipif_refheld) 6415 ipif_refrele(ipif); 6416 return (EEXIST); 6417 } 6418 6419 if (src_ipif != NULL) { 6420 /* 6421 * Create the special ire for the IRE table 6422 * which hangs out of ire_in_ill. This ire 6423 * is in-between IRE_CACHE and IRE_INTERFACE. 6424 * Thus rfq is non-NULL. 6425 */ 6426 rfq = ipif->ipif_rq; 6427 } 6428 /* Create the usual interface ires */ 6429 6430 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6431 ? ipif->ipif_rq : ipif->ipif_wq; 6432 6433 /* 6434 * Create a copy of the IRE_LOOPBACK, 6435 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6436 * the modified address and netmask. 
6437 */ 6438 ire = ire_create( 6439 (uchar_t *)&dst_addr, 6440 (uint8_t *)&mask, 6441 (uint8_t *)&ipif->ipif_src_addr, 6442 NULL, 6443 NULL, 6444 &ipif->ipif_mtu, 6445 NULL, 6446 rfq, 6447 stq, 6448 ipif->ipif_net_type, 6449 ipif->ipif_resolver_mp, 6450 ipif, 6451 in_ill, 6452 0, 6453 0, 6454 0, 6455 flags, 6456 &ire_uinfo_null, 6457 NULL, 6458 NULL); 6459 if (ire == NULL) { 6460 if (ipif_refheld) 6461 ipif_refrele(ipif); 6462 return (ENOMEM); 6463 } 6464 6465 /* 6466 * Some software (for example, GateD and Sun Cluster) attempts 6467 * to create (what amount to) IRE_PREFIX routes with the 6468 * loopback address as the gateway. This is primarily done to 6469 * set up prefixes with the RTF_REJECT flag set (for example, 6470 * when generating aggregate routes.) 6471 * 6472 * If the IRE type (as defined by ipif->ipif_net_type) is 6473 * IRE_LOOPBACK, then we map the request into a 6474 * IRE_IF_NORESOLVER. 6475 * 6476 * Needless to say, the real IRE_LOOPBACK is NOT created by this 6477 * routine, but rather using ire_create() directly. 6478 * 6479 */ 6480 if (ipif->ipif_net_type == IRE_LOOPBACK) 6481 ire->ire_type = IRE_IF_NORESOLVER; 6482 6483 error = ire_add(&ire, q, mp, func, B_FALSE); 6484 if (error == 0) 6485 goto save_ire; 6486 6487 /* 6488 * In the result of failure, ire_add() will have already 6489 * deleted the ire in question, so there is no need to 6490 * do that here. 6491 */ 6492 if (ipif_refheld) 6493 ipif_refrele(ipif); 6494 return (error); 6495 } 6496 if (ipif_refheld) { 6497 ipif_refrele(ipif); 6498 ipif_refheld = B_FALSE; 6499 } 6500 6501 if (src_ipif != NULL) { 6502 /* RTA_SRCIFP is not supported on RTF_GATEWAY */ 6503 ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n")); 6504 return (EINVAL); 6505 } 6506 /* 6507 * Get an interface IRE for the specified gateway. 6508 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 6509 * gateway, it is currently unreachable and we fail the request 6510 * accordingly. 
6511 */ 6512 ipif = ipif_arg; 6513 if (ipif_arg != NULL) 6514 match_flags |= MATCH_IRE_ILL; 6515 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 6516 ALL_ZONES, 0, NULL, match_flags); 6517 if (gw_ire == NULL) 6518 return (ENETUNREACH); 6519 6520 /* 6521 * We create one of three types of IREs as a result of this request 6522 * based on the netmask. A netmask of all ones (which is automatically 6523 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 6524 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 6525 * created. Otherwise, an IRE_PREFIX route is created for the 6526 * destination prefix. 6527 */ 6528 if (mask == IP_HOST_MASK) 6529 type = IRE_HOST; 6530 else if (mask == 0) 6531 type = IRE_DEFAULT; 6532 else 6533 type = IRE_PREFIX; 6534 6535 /* check for a duplicate entry */ 6536 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 6537 NULL, ALL_ZONES, 0, NULL, 6538 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW); 6539 if (ire != NULL) { 6540 ire_refrele(gw_ire); 6541 ire_refrele(ire); 6542 return (EEXIST); 6543 } 6544 6545 /* Security attribute exists */ 6546 if (sp != NULL) { 6547 tsol_gcgrp_addr_t ga; 6548 6549 /* find or create the gateway credentials group */ 6550 ga.ga_af = AF_INET; 6551 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 6552 6553 /* we hold reference to it upon success */ 6554 gcgrp = gcgrp_lookup(&ga, B_TRUE); 6555 if (gcgrp == NULL) { 6556 ire_refrele(gw_ire); 6557 return (ENOMEM); 6558 } 6559 6560 /* 6561 * Create and add the security attribute to the group; a 6562 * reference to the group is made upon allocating a new 6563 * entry successfully. If it finds an already-existing 6564 * entry for the security attribute in the group, it simply 6565 * returns it and no new reference is made to the group. 
6566 */ 6567 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 6568 if (gc == NULL) { 6569 /* release reference held by gcgrp_lookup */ 6570 GCGRP_REFRELE(gcgrp); 6571 ire_refrele(gw_ire); 6572 return (ENOMEM); 6573 } 6574 } 6575 6576 /* Create the IRE. */ 6577 ire = ire_create( 6578 (uchar_t *)&dst_addr, /* dest address */ 6579 (uchar_t *)&mask, /* mask */ 6580 /* src address assigned by the caller? */ 6581 (uchar_t *)(((src_addr != INADDR_ANY) && 6582 (flags & RTF_SETSRC)) ? &src_addr : NULL), 6583 (uchar_t *)&gw_addr, /* gateway address */ 6584 NULL, /* no in-srcaddress */ 6585 &gw_ire->ire_max_frag, 6586 NULL, /* no Fast Path header */ 6587 NULL, /* no recv-from queue */ 6588 NULL, /* no send-to queue */ 6589 (ushort_t)type, /* IRE type */ 6590 NULL, 6591 ipif_arg, 6592 NULL, 6593 0, 6594 0, 6595 0, 6596 flags, 6597 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 6598 gc, /* security attribute */ 6599 NULL); 6600 /* 6601 * The ire holds a reference to the 'gc' and the 'gc' holds a 6602 * reference to the 'gcgrp'. We can now release the extra reference 6603 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 6604 */ 6605 if (gcgrp_xtraref) 6606 GCGRP_REFRELE(gcgrp); 6607 if (ire == NULL) { 6608 if (gc != NULL) 6609 GC_REFRELE(gc); 6610 ire_refrele(gw_ire); 6611 return (ENOMEM); 6612 } 6613 6614 /* 6615 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 6616 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 6617 */ 6618 6619 /* Add the new IRE. */ 6620 error = ire_add(&ire, q, mp, func, B_FALSE); 6621 if (error != 0) { 6622 /* 6623 * In the result of failure, ire_add() will have already 6624 * deleted the ire in question, so there is no need to 6625 * do that here. 6626 */ 6627 ire_refrele(gw_ire); 6628 return (error); 6629 } 6630 6631 if (flags & RTF_MULTIRT) { 6632 /* 6633 * Invoke the CGTP (multirouting) filtering module 6634 * to add the dst address in the filtering database. 
6635 * Replicated inbound packets coming from that address 6636 * will be filtered to discard the duplicates. 6637 * It is not necessary to call the CGTP filter hook 6638 * when the dst address is a broadcast or multicast, 6639 * because an IP source address cannot be a broadcast 6640 * or a multicast. 6641 */ 6642 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 6643 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 6644 if (ire_dst != NULL) { 6645 ip_cgtp_bcast_add(ire, ire_dst); 6646 ire_refrele(ire_dst); 6647 goto save_ire; 6648 } 6649 if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr)) { 6650 int res = ip_cgtp_filter_ops->cfo_add_dest_v4( 6651 ire->ire_addr, 6652 ire->ire_gateway_addr, 6653 ire->ire_src_addr, 6654 gw_ire->ire_src_addr); 6655 if (res != 0) { 6656 ire_refrele(gw_ire); 6657 ire_delete(ire); 6658 return (res); 6659 } 6660 } 6661 } 6662 6663 /* 6664 * Now that the prefix IRE entry has been created, delete any 6665 * existing gateway IRE cache entries as well as any IRE caches 6666 * using the gateway, and force them to be created through 6667 * ip_newroute. 6668 */ 6669 if (gc != NULL) { 6670 ASSERT(gcgrp != NULL); 6671 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES); 6672 } 6673 6674 save_ire: 6675 if (gw_ire != NULL) { 6676 ire_refrele(gw_ire); 6677 } 6678 /* 6679 * We do not do save_ire for the routes added with RTA_SRCIFP 6680 * flag. This route is only added and deleted by mipagent. 6681 * So, for simplicity of design, we refrain from saving 6682 * ires that are created with srcif value. This may change 6683 * in future if we find more usage of srcifp feature. 6684 */ 6685 if (ipif != NULL && src_ipif == NULL) { 6686 /* 6687 * Save enough information so that we can recreate the IRE if 6688 * the interface goes down and then up. The metrics associated 6689 * with the route will be saved as well when rts_setmetrics() is 6690 * called after the IRE has been created. 
In the case where 6691 * memory cannot be allocated, none of this information will be 6692 * saved. 6693 */ 6694 ipif_save_ire(ipif, ire); 6695 } 6696 if (ioctl_msg) 6697 ip_rts_rtmsg(RTM_OLDADD, ire, 0); 6698 if (ire_arg != NULL) { 6699 /* 6700 * Store the ire that was successfully added into where ire_arg 6701 * points to so that callers don't have to look it up 6702 * themselves (but they are responsible for ire_refrele()ing 6703 * the ire when they are finished with it). 6704 */ 6705 *ire_arg = ire; 6706 } else { 6707 ire_refrele(ire); /* Held in ire_add */ 6708 } 6709 if (ipif_refheld) 6710 ipif_refrele(ipif); 6711 return (0); 6712 } 6713 6714 /* 6715 * ip_rt_delete is called to delete an IPv4 route. 6716 * ipif_arg is passed in to associate it with the correct interface. 6717 * src_ipif is passed to associate the incoming interface of the packet. 6718 * We may need to restart this operation if the ipif cannot be looked up 6719 * due to an exclusive operation that is currently in progress. The restart 6720 * entry point is specified by 'func' 6721 */ 6722 /* ARGSUSED4 */ 6723 int 6724 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6725 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6726 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func) 6727 { 6728 ire_t *ire = NULL; 6729 ipif_t *ipif; 6730 boolean_t ipif_refheld = B_FALSE; 6731 uint_t type; 6732 uint_t match_flags = MATCH_IRE_TYPE; 6733 int err = 0; 6734 6735 ip1dbg(("ip_rt_delete:")); 6736 /* 6737 * If this is the case of RTF_HOST being set, then we set the netmask 6738 * to all ones. Otherwise, we use the netmask if one was supplied. 
6739 */ 6740 if (flags & RTF_HOST) { 6741 mask = IP_HOST_MASK; 6742 match_flags |= MATCH_IRE_MASK; 6743 } else if (rtm_addrs & RTA_NETMASK) { 6744 match_flags |= MATCH_IRE_MASK; 6745 } 6746 6747 /* 6748 * Note that RTF_GATEWAY is never set on a delete, therefore 6749 * we check if the gateway address is one of our interfaces first, 6750 * and fall back on RTF_GATEWAY routes. 6751 * 6752 * This makes it possible to delete an original 6753 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 6754 * 6755 * As the interface index specified with the RTA_IFP sockaddr is the 6756 * same for all ipif's off of an ill, the matching logic below uses 6757 * MATCH_IRE_ILL if such an index was specified. This means a route 6758 * sharing the same prefix and interface index as the the route 6759 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 6760 * is specified in the request. 6761 * 6762 * On the other hand, since the gateway address will usually be 6763 * different for each ipif on the system, the matching logic 6764 * uses MATCH_IRE_IPIF in the case of a traditional interface 6765 * route. This means that interface routes for the same prefix can be 6766 * uniquely identified if they belong to distinct ipif's and if a 6767 * RTA_IFP sockaddr is not present. 6768 * 6769 * For more detail on specifying routes by gateway address and by 6770 * interface index, see the comments in ip_rt_add(). 6771 * gw_addr could be zero in some cases when both RTA_SRCIFP and 6772 * RTA_IFP are specified. If RTA_SRCIFP is specified and both 6773 * RTA_IFP and gateway_addr are NULL/zero, then delete will not 6774 * succeed. 6775 */ 6776 if (src_ipif != NULL) { 6777 if (ipif_arg == NULL && gw_addr != 0) { 6778 ipif_arg = ipif_lookup_interface(gw_addr, dst_addr, 6779 q, mp, func, &err); 6780 if (ipif_arg != NULL) 6781 ipif_refheld = B_TRUE; 6782 } 6783 if (ipif_arg == NULL) { 6784 err = (err == EINPROGRESS) ? 
err : ESRCH; 6785 return (err); 6786 } 6787 ipif = ipif_arg; 6788 } else { 6789 ipif = ipif_lookup_interface(gw_addr, dst_addr, 6790 q, mp, func, &err); 6791 if (ipif != NULL) 6792 ipif_refheld = B_TRUE; 6793 else if (err == EINPROGRESS) 6794 return (err); 6795 else 6796 err = 0; 6797 } 6798 if (ipif != NULL) { 6799 if (ipif_arg != NULL) { 6800 if (ipif_refheld) { 6801 ipif_refrele(ipif); 6802 ipif_refheld = B_FALSE; 6803 } 6804 ipif = ipif_arg; 6805 match_flags |= MATCH_IRE_ILL; 6806 } else { 6807 match_flags |= MATCH_IRE_IPIF; 6808 } 6809 if (src_ipif != NULL) { 6810 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6811 ipif, src_ipif->ipif_ill, match_flags); 6812 } else { 6813 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 6814 ire = ire_ctable_lookup(dst_addr, 0, 6815 IRE_LOOPBACK, ipif, ALL_ZONES, NULL, 6816 match_flags); 6817 } 6818 if (ire == NULL) { 6819 ire = ire_ftable_lookup(dst_addr, mask, 0, 6820 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6821 NULL, match_flags); 6822 } 6823 } 6824 } 6825 6826 if (ire == NULL) { 6827 /* 6828 * At this point, the gateway address is not one of our own 6829 * addresses or a matching interface route was not found. We 6830 * set the IRE type to lookup based on whether 6831 * this is a host route, a default route or just a prefix. 6832 * 6833 * If an ipif_arg was passed in, then the lookup is based on an 6834 * interface index so MATCH_IRE_ILL is added to match_flags. 6835 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 6836 * set as the route being looked up is not a traditional 6837 * interface route. 6838 * Since we do not add gateway route with srcipif, we don't 6839 * expect to find it either. 
6840 */ 6841 if (src_ipif != NULL) { 6842 if (ipif_refheld) 6843 ipif_refrele(ipif); 6844 return (ESRCH); 6845 } else { 6846 match_flags &= ~MATCH_IRE_IPIF; 6847 match_flags |= MATCH_IRE_GW; 6848 if (ipif_arg != NULL) 6849 match_flags |= MATCH_IRE_ILL; 6850 if (mask == IP_HOST_MASK) 6851 type = IRE_HOST; 6852 else if (mask == 0) 6853 type = IRE_DEFAULT; 6854 else 6855 type = IRE_PREFIX; 6856 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, 6857 ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags); 6858 if (ire == NULL && type == IRE_HOST) { 6859 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, 6860 IRE_HOST_REDIRECT, ipif_arg, NULL, 6861 ALL_ZONES, 0, NULL, match_flags); 6862 } 6863 } 6864 } 6865 6866 if (ipif_refheld) 6867 ipif_refrele(ipif); 6868 6869 /* ipif is not refheld anymore */ 6870 if (ire == NULL) 6871 return (ESRCH); 6872 6873 if (ire->ire_flags & RTF_MULTIRT) { 6874 /* 6875 * Invoke the CGTP (multirouting) filtering module 6876 * to remove the dst address from the filtering database. 6877 * Packets coming from that address will no longer be 6878 * filtered to remove duplicates. 6879 */ 6880 if (ip_cgtp_filter_ops != NULL) { 6881 err = ip_cgtp_filter_ops->cfo_del_dest_v4(ire->ire_addr, 6882 ire->ire_gateway_addr); 6883 } 6884 ip_cgtp_bcast_delete(ire); 6885 } 6886 6887 ipif = ire->ire_ipif; 6888 /* 6889 * Removing from ipif_saved_ire_mp is not necessary 6890 * when src_ipif being non-NULL. ip_rt_add does not 6891 * save the ires which src_ipif being non-NULL. 6892 */ 6893 if (ipif != NULL && src_ipif == NULL) { 6894 ipif_remove_ire(ipif, ire); 6895 } 6896 if (ioctl_msg) 6897 ip_rts_rtmsg(RTM_OLDDEL, ire, 0); 6898 ire_delete(ire); 6899 ire_refrele(ire); 6900 return (err); 6901 } 6902 6903 /* 6904 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
6905 */ 6906 /* ARGSUSED */ 6907 int 6908 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6909 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6910 { 6911 ipaddr_t dst_addr; 6912 ipaddr_t gw_addr; 6913 ipaddr_t mask; 6914 int error = 0; 6915 mblk_t *mp1; 6916 struct rtentry *rt; 6917 ipif_t *ipif = NULL; 6918 6919 ip1dbg(("ip_siocaddrt:")); 6920 /* Existence of mp1 verified in ip_wput_nondata */ 6921 mp1 = mp->b_cont->b_cont; 6922 rt = (struct rtentry *)mp1->b_rptr; 6923 6924 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6925 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6926 6927 /* 6928 * If the RTF_HOST flag is on, this is a request to assign a gateway 6929 * to a particular host address. In this case, we set the netmask to 6930 * all ones for the particular destination address. Otherwise, 6931 * determine the netmask to be used based on dst_addr and the interfaces 6932 * in use. 6933 */ 6934 if (rt->rt_flags & RTF_HOST) { 6935 mask = IP_HOST_MASK; 6936 } else { 6937 /* 6938 * Note that ip_subnet_mask returns a zero mask in the case of 6939 * default (an all-zeroes address). 6940 */ 6941 mask = ip_subnet_mask(dst_addr, &ipif); 6942 } 6943 6944 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6945 NULL, B_TRUE, q, mp, ip_process_ioctl, NULL); 6946 if (ipif != NULL) 6947 ipif_refrele(ipif); 6948 return (error); 6949 } 6950 6951 /* 6952 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 
6953 */ 6954 /* ARGSUSED */ 6955 int 6956 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6957 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6958 { 6959 ipaddr_t dst_addr; 6960 ipaddr_t gw_addr; 6961 ipaddr_t mask; 6962 int error; 6963 mblk_t *mp1; 6964 struct rtentry *rt; 6965 ipif_t *ipif = NULL; 6966 6967 ip1dbg(("ip_siocdelrt:")); 6968 /* Existence of mp1 verified in ip_wput_nondata */ 6969 mp1 = mp->b_cont->b_cont; 6970 rt = (struct rtentry *)mp1->b_rptr; 6971 6972 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6973 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6974 6975 /* 6976 * If the RTF_HOST flag is on, this is a request to delete a gateway 6977 * to a particular host address. In this case, we set the netmask to 6978 * all ones for the particular destination address. Otherwise, 6979 * determine the netmask to be used based on dst_addr and the interfaces 6980 * in use. 6981 */ 6982 if (rt->rt_flags & RTF_HOST) { 6983 mask = IP_HOST_MASK; 6984 } else { 6985 /* 6986 * Note that ip_subnet_mask returns a zero mask in the case of 6987 * default (an all-zeroes address). 6988 */ 6989 mask = ip_subnet_mask(dst_addr, &ipif); 6990 } 6991 6992 error = ip_rt_delete(dst_addr, mask, gw_addr, 6993 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL, 6994 B_TRUE, q, mp, ip_process_ioctl); 6995 if (ipif != NULL) 6996 ipif_refrele(ipif); 6997 return (error); 6998 } 6999 7000 /* 7001 * Enqueue the mp onto the ipsq, chained by b_next. 7002 * b_prev stores the function to be executed later, and b_queue the queue 7003 * where this mp originated. 
7004 */ 7005 void 7006 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7007 ill_t *pending_ill) 7008 { 7009 conn_t *connp = NULL; 7010 7011 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7012 ASSERT(func != NULL); 7013 7014 mp->b_queue = q; 7015 mp->b_prev = (void *)func; 7016 mp->b_next = NULL; 7017 7018 switch (type) { 7019 case CUR_OP: 7020 if (ipsq->ipsq_mptail != NULL) { 7021 ASSERT(ipsq->ipsq_mphead != NULL); 7022 ipsq->ipsq_mptail->b_next = mp; 7023 } else { 7024 ASSERT(ipsq->ipsq_mphead == NULL); 7025 ipsq->ipsq_mphead = mp; 7026 } 7027 ipsq->ipsq_mptail = mp; 7028 break; 7029 7030 case NEW_OP: 7031 if (ipsq->ipsq_xopq_mptail != NULL) { 7032 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7033 ipsq->ipsq_xopq_mptail->b_next = mp; 7034 } else { 7035 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7036 ipsq->ipsq_xopq_mphead = mp; 7037 } 7038 ipsq->ipsq_xopq_mptail = mp; 7039 break; 7040 default: 7041 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7042 } 7043 7044 if (CONN_Q(q) && pending_ill != NULL) { 7045 connp = Q_TO_CONN(q); 7046 7047 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7048 connp->conn_oper_pending_ill = pending_ill; 7049 } 7050 } 7051 7052 /* 7053 * Return the mp at the head of the ipsq. After emptying the ipsq 7054 * look at the next ioctl, if this ioctl is complete. Otherwise 7055 * return, we will resume when we complete the current ioctl. 7056 * The current ioctl will wait till it gets a response from the 7057 * driver below. 
7058 */ 7059 static mblk_t * 7060 ipsq_dq(ipsq_t *ipsq) 7061 { 7062 mblk_t *mp; 7063 7064 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7065 7066 mp = ipsq->ipsq_mphead; 7067 if (mp != NULL) { 7068 ipsq->ipsq_mphead = mp->b_next; 7069 if (ipsq->ipsq_mphead == NULL) 7070 ipsq->ipsq_mptail = NULL; 7071 mp->b_next = NULL; 7072 return (mp); 7073 } 7074 if (ipsq->ipsq_current_ipif != NULL) 7075 return (NULL); 7076 mp = ipsq->ipsq_xopq_mphead; 7077 if (mp != NULL) { 7078 ipsq->ipsq_xopq_mphead = mp->b_next; 7079 if (ipsq->ipsq_xopq_mphead == NULL) 7080 ipsq->ipsq_xopq_mptail = NULL; 7081 mp->b_next = NULL; 7082 return (mp); 7083 } 7084 return (NULL); 7085 } 7086 7087 /* 7088 * Enter the ipsq corresponding to ill, by waiting synchronously till 7089 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7090 * will have to drain completely before ipsq_enter returns success. 7091 * ipsq_current_ipif will be set if some exclusive ioctl is in progress, 7092 * and the ipsq_exit logic will start the next enqueued ioctl after 7093 * completion of the current ioctl. If 'force' is used, we don't wait 7094 * for the enqueued ioctls. This is needed when a conn_close wants to 7095 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7096 * of an ill can also use this option. But we dont' use it currently. 7097 */ 7098 #define ENTER_SQ_WAIT_TICKS 100 7099 boolean_t 7100 ipsq_enter(ill_t *ill, boolean_t force) 7101 { 7102 ipsq_t *ipsq; 7103 boolean_t waited_enough = B_FALSE; 7104 7105 /* 7106 * Holding the ill_lock prevents <ill-ipsq> assocs from changing. 7107 * Since the <ill-ipsq> assocs could change while we wait for the 7108 * writer, it is easier to wait on a fixed global rather than try to 7109 * cv_wait on a changing ipsq. 
7110 */ 7111 mutex_enter(&ill->ill_lock); 7112 for (;;) { 7113 if (ill->ill_state_flags & ILL_CONDEMNED) { 7114 mutex_exit(&ill->ill_lock); 7115 return (B_FALSE); 7116 } 7117 7118 ipsq = ill->ill_phyint->phyint_ipsq; 7119 mutex_enter(&ipsq->ipsq_lock); 7120 if (ipsq->ipsq_writer == NULL && 7121 (ipsq->ipsq_current_ipif == NULL || waited_enough)) { 7122 break; 7123 } else if (ipsq->ipsq_writer != NULL) { 7124 mutex_exit(&ipsq->ipsq_lock); 7125 cv_wait(&ill->ill_cv, &ill->ill_lock); 7126 } else { 7127 mutex_exit(&ipsq->ipsq_lock); 7128 if (force) { 7129 (void) cv_timedwait(&ill->ill_cv, 7130 &ill->ill_lock, 7131 lbolt + ENTER_SQ_WAIT_TICKS); 7132 waited_enough = B_TRUE; 7133 continue; 7134 } else { 7135 cv_wait(&ill->ill_cv, &ill->ill_lock); 7136 } 7137 } 7138 } 7139 7140 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); 7141 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7142 ipsq->ipsq_writer = curthread; 7143 ipsq->ipsq_reentry_cnt++; 7144 #ifdef ILL_DEBUG 7145 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7146 #endif 7147 mutex_exit(&ipsq->ipsq_lock); 7148 mutex_exit(&ill->ill_lock); 7149 return (B_TRUE); 7150 } 7151 7152 /* 7153 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7154 * certain critical operations like plumbing (i.e. most set ioctls), 7155 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP 7156 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per 7157 * IPMP group. The ipsq serializes exclusive ioctls issued by applications 7158 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple 7159 * threads executing in the ipsq. Responses from the driver pertain to the 7160 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated 7161 * as part of bringing up the interface) and are enqueued in ipsq_mphead. 
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point to be called
 * later when the ipsq is empty, nor any code path starting from the specified
 * reentry point, ever tries to enter the ipsq again. Otherwise it can lead
 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (arp or the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
 * ipsq_current_ipif is clear which happens only on ioctl completion.
 */

/*
 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
 * ipif or ill can be specified). The caller ensures ipif or ill is valid by
 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
 * for later completion.
7186 */ 7187 ipsq_t * 7188 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7189 ipsq_func_t func, int type, boolean_t reentry_ok) 7190 { 7191 ipsq_t *ipsq; 7192 7193 /* Only 1 of ipif or ill can be specified */ 7194 ASSERT((ipif != NULL) ^ (ill != NULL)); 7195 if (ipif != NULL) 7196 ill = ipif->ipif_ill; 7197 7198 /* 7199 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 7200 * ipsq of an ill can't change when ill_lock is held. 7201 */ 7202 GRAB_CONN_LOCK(q); 7203 mutex_enter(&ill->ill_lock); 7204 ipsq = ill->ill_phyint->phyint_ipsq; 7205 mutex_enter(&ipsq->ipsq_lock); 7206 7207 /* 7208 * 1. Enter the ipsq if we are already writer and reentry is ok. 7209 * (Note: If the caller does not specify reentry_ok then neither 7210 * 'func' nor any of its callees must ever attempt to enter the ipsq 7211 * again. Otherwise it can lead to an infinite loop 7212 * 2. Enter the ipsq if there is no current writer and this attempted 7213 * entry is part of the current ioctl or operation 7214 * 3. Enter the ipsq if there is no current writer and this is a new 7215 * ioctl (or operation) and the ioctl (or operation) queue is 7216 * empty and there is no ioctl (or operation) currently in progress 7217 */ 7218 if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || 7219 (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && 7220 ipsq->ipsq_current_ipif == NULL))) || 7221 (ipsq->ipsq_writer == curthread && reentry_ok)) { 7222 /* Success. 
*/ 7223 ipsq->ipsq_reentry_cnt++; 7224 ipsq->ipsq_writer = curthread; 7225 mutex_exit(&ipsq->ipsq_lock); 7226 mutex_exit(&ill->ill_lock); 7227 RELEASE_CONN_LOCK(q); 7228 #ifdef ILL_DEBUG 7229 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7230 #endif 7231 return (ipsq); 7232 } 7233 7234 ipsq_enq(ipsq, q, mp, func, type, ill); 7235 7236 mutex_exit(&ipsq->ipsq_lock); 7237 mutex_exit(&ill->ill_lock); 7238 RELEASE_CONN_LOCK(q); 7239 return (NULL); 7240 } 7241 7242 /* 7243 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of 7244 * ipif or ill can be specified). The caller ensures ipif or ill is valid by 7245 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7246 * completion. 7247 * 7248 * This function does a refrele on the ipif/ill. 7249 */ 7250 void 7251 qwriter_ip(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7252 ipsq_func_t func, int type, boolean_t reentry_ok) 7253 { 7254 ipsq_t *ipsq; 7255 7256 ipsq = ipsq_try_enter(ipif, ill, q, mp, func, type, reentry_ok); 7257 /* 7258 * Caller must have done a refhold on the ipif. ipif_refrele 7259 * happens on the passed ipif. We can do this since we are 7260 * already exclusive, or we won't access ipif henceforth, Both 7261 * this func and caller will just return if we ipsq_try_enter 7262 * fails above. This is needed because func needs to 7263 * see the correct refcount. Eg. removeif can work only then. 7264 */ 7265 if (ipif != NULL) 7266 ipif_refrele(ipif); 7267 else 7268 ill_refrele(ill); 7269 if (ipsq != NULL) { 7270 (*func)(ipsq, q, mp, NULL); 7271 ipsq_exit(ipsq, B_TRUE, B_TRUE); 7272 } 7273 } 7274 7275 /* 7276 * If there are more than ILL_GRP_CNT ills in a group, 7277 * we use kmem alloc'd buffers, else use the stack 7278 */ 7279 #define ILL_GRP_CNT 14 7280 /* 7281 * Drain the ipsq, if there are messages on it, and then leave the ipsq. 7282 * Called by a thread that is currently exclusive on this ipsq. 
7283 */ 7284 void 7285 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer) 7286 { 7287 queue_t *q; 7288 mblk_t *mp; 7289 ipsq_func_t func; 7290 int next; 7291 ill_t **ill_list = NULL; 7292 size_t ill_list_size = 0; 7293 int cnt = 0; 7294 boolean_t need_ipsq_free = B_FALSE; 7295 7296 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7297 mutex_enter(&ipsq->ipsq_lock); 7298 ASSERT(ipsq->ipsq_reentry_cnt >= 1); 7299 if (ipsq->ipsq_reentry_cnt != 1) { 7300 ipsq->ipsq_reentry_cnt--; 7301 mutex_exit(&ipsq->ipsq_lock); 7302 return; 7303 } 7304 7305 mp = ipsq_dq(ipsq); 7306 while (mp != NULL) { 7307 again: 7308 mutex_exit(&ipsq->ipsq_lock); 7309 func = (ipsq_func_t)mp->b_prev; 7310 q = (queue_t *)mp->b_queue; 7311 mp->b_prev = NULL; 7312 mp->b_queue = NULL; 7313 7314 /* 7315 * If 'q' is an conn queue, it is valid, since we did a 7316 * a refhold on the connp, at the start of the ioctl. 7317 * If 'q' is an ill queue, it is valid, since close of an 7318 * ill will clean up the 'ipsq'. 7319 */ 7320 (*func)(ipsq, q, mp, NULL); 7321 7322 mutex_enter(&ipsq->ipsq_lock); 7323 mp = ipsq_dq(ipsq); 7324 } 7325 7326 mutex_exit(&ipsq->ipsq_lock); 7327 7328 /* 7329 * Need to grab the locks in the right order. Need to 7330 * atomically check (under ipsq_lock) that there are no 7331 * messages before relinquishing the ipsq. Also need to 7332 * atomically wakeup waiters on ill_cv while holding ill_lock. 7333 * Holding ill_g_lock ensures that ipsq list of ills is stable. 7334 * If we need to call ill_split_ipsq and change <ill-ipsq> we need 7335 * to grab ill_g_lock as writer. 7336 */ 7337 rw_enter(&ill_g_lock, ipsq->ipsq_split ? 
RW_WRITER : RW_READER); 7338 7339 /* ipsq_refs can't change while ill_g_lock is held as reader */ 7340 if (ipsq->ipsq_refs != 0) { 7341 /* At most 2 ills v4/v6 per phyint */ 7342 cnt = ipsq->ipsq_refs << 1; 7343 ill_list_size = cnt * sizeof (ill_t *); 7344 /* 7345 * If memory allocation fails, we will do the split 7346 * the next time ipsq_exit is called for whatever reason. 7347 * As long as the ipsq_split flag is set the need to 7348 * split is remembered. 7349 */ 7350 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 7351 if (ill_list != NULL) 7352 cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); 7353 } 7354 mutex_enter(&ipsq->ipsq_lock); 7355 mp = ipsq_dq(ipsq); 7356 if (mp != NULL) { 7357 /* oops, some message has landed up, we can't get out */ 7358 if (ill_list != NULL) 7359 ill_unlock_ills(ill_list, cnt); 7360 rw_exit(&ill_g_lock); 7361 if (ill_list != NULL) 7362 kmem_free(ill_list, ill_list_size); 7363 ill_list = NULL; 7364 ill_list_size = 0; 7365 cnt = 0; 7366 goto again; 7367 } 7368 7369 /* 7370 * Split only if no ioctl is pending and if memory alloc succeeded 7371 * above. 7372 */ 7373 if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && 7374 ill_list != NULL) { 7375 /* 7376 * No new ill can join this ipsq since we are holding the 7377 * ill_g_lock. Hence ill_split_ipsq can safely traverse the 7378 * ipsq. ill_split_ipsq may fail due to memory shortage. 7379 * If so we will retry on the next ipsq_exit. 7380 */ 7381 ipsq->ipsq_split = ill_split_ipsq(ipsq); 7382 } 7383 7384 /* 7385 * We are holding the ipsq lock, hence no new messages can 7386 * land up on the ipsq, and there are no messages currently. 7387 * Now safe to get out. Wake up waiters and relinquish ipsq 7388 * atomically while holding ill locks. 
7389 */ 7390 ipsq->ipsq_writer = NULL; 7391 ipsq->ipsq_reentry_cnt--; 7392 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7393 #ifdef ILL_DEBUG 7394 ipsq->ipsq_depth = 0; 7395 #endif 7396 mutex_exit(&ipsq->ipsq_lock); 7397 /* 7398 * For IPMP this should wake up all ills in this ipsq. 7399 * We need to hold the ill_lock while waking up waiters to 7400 * avoid missed wakeups. But there is no need to acquire all 7401 * the ill locks and then wakeup. If we have not acquired all 7402 * the locks (due to memory failure above) ill_signal_ipsq_ills 7403 * wakes up ills one at a time after getting the right ill_lock 7404 */ 7405 ill_signal_ipsq_ills(ipsq, ill_list != NULL); 7406 if (ill_list != NULL) 7407 ill_unlock_ills(ill_list, cnt); 7408 if (ipsq->ipsq_refs == 0) 7409 need_ipsq_free = B_TRUE; 7410 rw_exit(&ill_g_lock); 7411 if (ill_list != 0) 7412 kmem_free(ill_list, ill_list_size); 7413 7414 if (need_ipsq_free) { 7415 /* 7416 * Free the ipsq. ipsq_refs can't increase because ipsq can't be 7417 * looked up. ipsq can be looked up only thru ill or phyint 7418 * and there are no ills/phyint on this ipsq. 7419 */ 7420 ipsq_delete(ipsq); 7421 } 7422 /* 7423 * Now start any igmp or mld timers that could not be started 7424 * while inside the ipsq. The timers can't be started while inside 7425 * the ipsq, since igmp_start_timers may need to call untimeout() 7426 * which can't be done while holding a lock i.e. the ipsq. Otherwise 7427 * there could be a deadlock since the timeout handlers 7428 * mld_timeout_handler / igmp_timeout_handler also synchronously 7429 * wait in ipsq_enter() trying to get the ipsq. 7430 * 7431 * However there is one exception to the above. If this thread is 7432 * itself the igmp/mld timeout handler thread, then we don't want 7433 * to start any new timer until the current handler is done. The 7434 * handler thread passes in B_FALSE for start_igmp/mld_timers, while 7435 * all others pass B_TRUE. 
7436 */ 7437 if (start_igmp_timer) { 7438 mutex_enter(&igmp_timer_lock); 7439 next = igmp_deferred_next; 7440 igmp_deferred_next = INFINITY; 7441 mutex_exit(&igmp_timer_lock); 7442 7443 if (next != INFINITY) 7444 igmp_start_timers(next); 7445 } 7446 7447 if (start_mld_timer) { 7448 mutex_enter(&mld_timer_lock); 7449 next = mld_deferred_next; 7450 mld_deferred_next = INFINITY; 7451 mutex_exit(&mld_timer_lock); 7452 7453 if (next != INFINITY) 7454 mld_start_timers(next); 7455 } 7456 } 7457 7458 /* 7459 * The ill is closing. Flush all messages on the ipsq that originated 7460 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 7461 * for this ill since ipsq_enter could not have entered until then. 7462 * New messages can't be queued since the CONDEMNED flag is set. 7463 */ 7464 static void 7465 ipsq_flush(ill_t *ill) 7466 { 7467 queue_t *q; 7468 mblk_t *prev; 7469 mblk_t *mp; 7470 mblk_t *mp_next; 7471 ipsq_t *ipsq; 7472 7473 ASSERT(IAM_WRITER_ILL(ill)); 7474 ipsq = ill->ill_phyint->phyint_ipsq; 7475 /* 7476 * Flush any messages sent up by the driver. 7477 */ 7478 mutex_enter(&ipsq->ipsq_lock); 7479 for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { 7480 mp_next = mp->b_next; 7481 q = mp->b_queue; 7482 if (q == ill->ill_rq || q == ill->ill_wq) { 7483 /* Remove the mp from the ipsq */ 7484 if (prev == NULL) 7485 ipsq->ipsq_mphead = mp->b_next; 7486 else 7487 prev->b_next = mp->b_next; 7488 if (ipsq->ipsq_mptail == mp) { 7489 ASSERT(mp_next == NULL); 7490 ipsq->ipsq_mptail = prev; 7491 } 7492 inet_freemsg(mp); 7493 } else { 7494 prev = mp; 7495 } 7496 } 7497 mutex_exit(&ipsq->ipsq_lock); 7498 (void) ipsq_pending_mp_cleanup(ill, NULL); 7499 ipsq_xopq_mp_cleanup(ill, NULL); 7500 ill_pending_mp_cleanup(ill); 7501 } 7502 7503 /* 7504 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock. 
7505 * The real cleanup happens behind the squeue via ip_squeue_clean function but 7506 * we need to protect ourselfs from 2 threads trying to cleanup at the same 7507 * time (possible with one port going down for aggr and someone tearing down the 7508 * entire aggr simultaneously. So we use ill_inuse_ref protected by ill_lock 7509 * to indicate when the cleanup has started (1 ref) and when the cleanup 7510 * is done (0 ref). When a new ring gets assigned to squeue, we start by 7511 * putting 2 ref on ill_inuse_ref. 7512 */ 7513 static void 7514 ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) 7515 { 7516 conn_t *connp; 7517 squeue_t *sqp; 7518 mblk_t *mp; 7519 7520 ASSERT(rx_ring != NULL); 7521 7522 /* Just clean one squeue */ 7523 mutex_enter(&ill->ill_lock); 7524 /* 7525 * Reset the ILL_SOFT_RING_ASSIGN bit so that 7526 * ip_squeue_soft_ring_affinty() will not go 7527 * ahead with assigning rings. 7528 */ 7529 ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; 7530 while (rx_ring->rr_ring_state == ILL_RING_INPROC) 7531 /* Some operations pending on the ring. Wait */ 7532 cv_wait(&ill->ill_cv, &ill->ill_lock); 7533 7534 if (rx_ring->rr_ring_state != ILL_RING_INUSE) { 7535 /* 7536 * Someone already trying to clean 7537 * this squeue or its already been cleaned. 7538 */ 7539 mutex_exit(&ill->ill_lock); 7540 return; 7541 } 7542 sqp = rx_ring->rr_sqp; 7543 7544 if (sqp == NULL) { 7545 /* 7546 * The rx_ring never had a squeue assigned to it. 7547 * We are under ill_lock so we can clean it up 7548 * here itself since no one can get to it. 
7549 */ 7550 rx_ring->rr_blank = NULL; 7551 rx_ring->rr_handle = NULL; 7552 rx_ring->rr_sqp = NULL; 7553 rx_ring->rr_ring_state = ILL_RING_FREE; 7554 mutex_exit(&ill->ill_lock); 7555 return; 7556 } 7557 7558 /* Set the state that its being cleaned */ 7559 rx_ring->rr_ring_state = ILL_RING_BEING_FREED; 7560 ASSERT(sqp != NULL); 7561 mutex_exit(&ill->ill_lock); 7562 7563 /* 7564 * Use the preallocated ill_unbind_conn for this purpose 7565 */ 7566 connp = ill->ill_dls_capab->ill_unbind_conn; 7567 mp = &connp->conn_tcp->tcp_closemp; 7568 CONN_INC_REF(connp); 7569 squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); 7570 7571 mutex_enter(&ill->ill_lock); 7572 while (rx_ring->rr_ring_state != ILL_RING_FREE) 7573 cv_wait(&ill->ill_cv, &ill->ill_lock); 7574 7575 mutex_exit(&ill->ill_lock); 7576 } 7577 7578 static void 7579 ipsq_clean_all(ill_t *ill) 7580 { 7581 int idx; 7582 7583 /* 7584 * No need to clean if poll_capab isn't set for this ill 7585 */ 7586 if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) 7587 return; 7588 7589 for (idx = 0; idx < ILL_MAX_RINGS; idx++) { 7590 ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; 7591 ipsq_clean_ring(ill, ipr); 7592 } 7593 7594 ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); 7595 } 7596 7597 /* ARGSUSED */ 7598 int 7599 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7600 ip_ioctl_cmd_t *ipip, void *ifreq) 7601 { 7602 ill_t *ill; 7603 struct lifreq *lifr = (struct lifreq *)ifreq; 7604 boolean_t isv6; 7605 conn_t *connp; 7606 7607 connp = Q_TO_CONN(q); 7608 isv6 = connp->conn_af_isv6; 7609 /* 7610 * Set original index. 7611 * Failover and failback move logical interfaces 7612 * from one physical interface to another. The 7613 * original index indicates the parent of a logical 7614 * interface, in other words, the physical interface 7615 * the logical interface will be moved back to on 7616 * failback. 
7617 */ 7618 7619 /* 7620 * Don't allow the original index to be changed 7621 * for non-failover addresses, autoconfigured 7622 * addresses, or IPv6 link local addresses. 7623 */ 7624 if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || 7625 (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { 7626 return (EINVAL); 7627 } 7628 /* 7629 * The new original index must be in use by some 7630 * physical interface. 7631 */ 7632 ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, 7633 NULL, NULL); 7634 if (ill == NULL) 7635 return (ENXIO); 7636 ill_refrele(ill); 7637 7638 ipif->ipif_orig_ifindex = lifr->lifr_index; 7639 /* 7640 * When this ipif gets failed back, don't 7641 * preserve the original id, as it is no 7642 * longer applicable. 7643 */ 7644 ipif->ipif_orig_ipifid = 0; 7645 /* 7646 * For IPv4, change the original index of any 7647 * multicast addresses associated with the 7648 * ipif to the new value. 7649 */ 7650 if (!isv6) { 7651 ilm_t *ilm; 7652 7653 mutex_enter(&ipif->ipif_ill->ill_lock); 7654 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 7655 ilm = ilm->ilm_next) { 7656 if (ilm->ilm_ipif == ipif) { 7657 ilm->ilm_orig_ifindex = lifr->lifr_index; 7658 } 7659 } 7660 mutex_exit(&ipif->ipif_ill->ill_lock); 7661 } 7662 return (0); 7663 } 7664 7665 /* ARGSUSED */ 7666 int 7667 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7668 ip_ioctl_cmd_t *ipip, void *ifreq) 7669 { 7670 struct lifreq *lifr = (struct lifreq *)ifreq; 7671 7672 /* 7673 * Get the original interface index i.e the one 7674 * before FAILOVER if it ever happened. 
7675 */ 7676 lifr->lifr_index = ipif->ipif_orig_ifindex; 7677 return (0); 7678 } 7679 7680 /* 7681 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, 7682 * refhold and return the associated ipif 7683 */ 7684 int 7685 ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func) 7686 { 7687 boolean_t exists; 7688 struct iftun_req *ta; 7689 ipif_t *ipif; 7690 ill_t *ill; 7691 boolean_t isv6; 7692 mblk_t *mp1; 7693 int error; 7694 conn_t *connp; 7695 7696 /* Existence verified in ip_wput_nondata */ 7697 mp1 = mp->b_cont->b_cont; 7698 ta = (struct iftun_req *)mp1->b_rptr; 7699 /* 7700 * Null terminate the string to protect against buffer 7701 * overrun. String was generated by user code and may not 7702 * be trusted. 7703 */ 7704 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 7705 7706 connp = Q_TO_CONN(q); 7707 isv6 = connp->conn_af_isv6; 7708 7709 /* Disallows implicit create */ 7710 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 7711 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 7712 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error); 7713 if (ipif == NULL) 7714 return (error); 7715 7716 if (ipif->ipif_id != 0) { 7717 /* 7718 * We really don't want to set/get tunnel parameters 7719 * on virtual tunnel interfaces. Only allow the 7720 * base tunnel to do these. 7721 */ 7722 ipif_refrele(ipif); 7723 return (EINVAL); 7724 } 7725 7726 /* 7727 * Send down to tunnel mod for ioctl processing. 7728 * Will finish ioctl in ip_rput_other(). 7729 */ 7730 ill = ipif->ipif_ill; 7731 if (ill->ill_net_type == IRE_LOOPBACK) { 7732 ipif_refrele(ipif); 7733 return (EOPNOTSUPP); 7734 } 7735 7736 if (ill->ill_wq == NULL) { 7737 ipif_refrele(ipif); 7738 return (ENXIO); 7739 } 7740 /* 7741 * Mark the ioctl as coming from an IPv6 interface for 7742 * tun's convenience. 
7743 */ 7744 if (ill->ill_isv6) 7745 ta->ifta_flags |= 0x80000000; 7746 *ipifp = ipif; 7747 return (0); 7748 } 7749 7750 /* 7751 * Parse an ifreq or lifreq struct coming down ioctls and refhold 7752 * and return the associated ipif. 7753 * Return value: 7754 * Non zero: An error has occurred. ci may not be filled out. 7755 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7756 * a held ipif in ci.ci_ipif. 7757 */ 7758 int 7759 ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags, 7760 cmd_info_t *ci, ipsq_func_t func) 7761 { 7762 sin_t *sin; 7763 sin6_t *sin6; 7764 char *name; 7765 struct ifreq *ifr; 7766 struct lifreq *lifr; 7767 ipif_t *ipif = NULL; 7768 ill_t *ill; 7769 conn_t *connp; 7770 boolean_t isv6; 7771 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7772 boolean_t exists; 7773 int err; 7774 mblk_t *mp1; 7775 zoneid_t zoneid; 7776 7777 if (q->q_next != NULL) { 7778 ill = (ill_t *)q->q_ptr; 7779 isv6 = ill->ill_isv6; 7780 connp = NULL; 7781 zoneid = ALL_ZONES; 7782 } else { 7783 ill = NULL; 7784 connp = Q_TO_CONN(q); 7785 isv6 = connp->conn_af_isv6; 7786 zoneid = connp->conn_zoneid; 7787 if (zoneid == GLOBAL_ZONEID) { 7788 /* global zone can access ipifs in all zones */ 7789 zoneid = ALL_ZONES; 7790 } 7791 } 7792 7793 /* Has been checked in ip_wput_nondata */ 7794 mp1 = mp->b_cont->b_cont; 7795 7796 7797 if (cmd_type == IF_CMD) { 7798 /* This a old style SIOC[GS]IF* command */ 7799 ifr = (struct ifreq *)mp1->b_rptr; 7800 /* 7801 * Null terminate the string to protect against buffer 7802 * overrun. String was generated by user code and may not 7803 * be trusted. 
7804 */ 7805 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7806 sin = (sin_t *)&ifr->ifr_addr; 7807 name = ifr->ifr_name; 7808 ci->ci_sin = sin; 7809 ci->ci_sin6 = NULL; 7810 ci->ci_lifr = (struct lifreq *)ifr; 7811 } else { 7812 /* This a new style SIOC[GS]LIF* command */ 7813 ASSERT(cmd_type == LIF_CMD); 7814 lifr = (struct lifreq *)mp1->b_rptr; 7815 /* 7816 * Null terminate the string to protect against buffer 7817 * overrun. String was generated by user code and may not 7818 * be trusted. 7819 */ 7820 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7821 name = lifr->lifr_name; 7822 sin = (sin_t *)&lifr->lifr_addr; 7823 sin6 = (sin6_t *)&lifr->lifr_addr; 7824 if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) { 7825 (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, 7826 LIFNAMSIZ); 7827 } 7828 ci->ci_sin = sin; 7829 ci->ci_sin6 = sin6; 7830 ci->ci_lifr = lifr; 7831 } 7832 7833 7834 if (iocp->ioc_cmd == SIOCSLIFNAME) { 7835 /* 7836 * The ioctl will be failed if the ioctl comes down 7837 * an conn stream 7838 */ 7839 if (ill == NULL) { 7840 /* 7841 * Not an ill queue, return EINVAL same as the 7842 * old error code. 7843 */ 7844 return (ENXIO); 7845 } 7846 ipif = ill->ill_ipif; 7847 ipif_refhold(ipif); 7848 } else { 7849 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 7850 &exists, isv6, zoneid, 7851 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err); 7852 if (ipif == NULL) { 7853 if (err == EINPROGRESS) 7854 return (err); 7855 if (iocp->ioc_cmd == SIOCLIFFAILOVER || 7856 iocp->ioc_cmd == SIOCLIFFAILBACK) { 7857 /* 7858 * Need to try both v4 and v6 since this 7859 * ioctl can come down either v4 or v6 7860 * socket. The lifreq.lifr_family passed 7861 * down by this ioctl is AF_UNSPEC. 7862 */ 7863 ipif = ipif_lookup_on_name(name, 7864 mi_strlen(name), B_FALSE, &exists, !isv6, 7865 zoneid, (connp == NULL) ? 
q : 7866 CONNP_TO_WQ(connp), mp, func, &err); 7867 if (err == EINPROGRESS) 7868 return (err); 7869 } 7870 err = 0; /* Ensure we don't use it below */ 7871 } 7872 } 7873 7874 /* 7875 * Old style [GS]IFCMD does not admit IPv6 ipif 7876 */ 7877 if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) { 7878 ipif_refrele(ipif); 7879 return (ENXIO); 7880 } 7881 7882 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7883 name[0] == '\0') { 7884 /* 7885 * Handle a or a SIOC?IF* with a null name 7886 * during plumb (on the ill queue before the I_PLINK). 7887 */ 7888 ipif = ill->ill_ipif; 7889 ipif_refhold(ipif); 7890 } 7891 7892 if (ipif == NULL) 7893 return (ENXIO); 7894 7895 /* 7896 * Allow only GET operations if this ipif has been created 7897 * temporarily due to a MOVE operation. 7898 */ 7899 if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) { 7900 ipif_refrele(ipif); 7901 return (EINVAL); 7902 } 7903 7904 ci->ci_ipif = ipif; 7905 return (0); 7906 } 7907 7908 /* 7909 * Return the total number of ipifs. 7910 */ 7911 static uint_t 7912 ip_get_numifs(zoneid_t zoneid) 7913 { 7914 uint_t numifs = 0; 7915 ill_t *ill; 7916 ill_walk_context_t ctx; 7917 ipif_t *ipif; 7918 7919 rw_enter(&ill_g_lock, RW_READER); 7920 ill = ILL_START_WALK_V4(&ctx); 7921 7922 while (ill != NULL) { 7923 for (ipif = ill->ill_ipif; ipif != NULL; 7924 ipif = ipif->ipif_next) { 7925 if (ipif->ipif_zoneid == zoneid || 7926 ipif->ipif_zoneid == ALL_ZONES) 7927 numifs++; 7928 } 7929 ill = ill_next(&ctx, ill); 7930 } 7931 rw_exit(&ill_g_lock); 7932 return (numifs); 7933 } 7934 7935 /* 7936 * Return the total number of ipifs. 
7937 */ 7938 static uint_t 7939 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid) 7940 { 7941 uint_t numifs = 0; 7942 ill_t *ill; 7943 ipif_t *ipif; 7944 ill_walk_context_t ctx; 7945 7946 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7947 7948 rw_enter(&ill_g_lock, RW_READER); 7949 if (family == AF_INET) 7950 ill = ILL_START_WALK_V4(&ctx); 7951 else if (family == AF_INET6) 7952 ill = ILL_START_WALK_V6(&ctx); 7953 else 7954 ill = ILL_START_WALK_ALL(&ctx); 7955 7956 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7957 for (ipif = ill->ill_ipif; ipif != NULL; 7958 ipif = ipif->ipif_next) { 7959 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7960 !(lifn_flags & LIFC_NOXMIT)) 7961 continue; 7962 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7963 !(lifn_flags & LIFC_TEMPORARY)) 7964 continue; 7965 if (((ipif->ipif_flags & 7966 (IPIF_NOXMIT|IPIF_NOLOCAL| 7967 IPIF_DEPRECATED)) || 7968 (ill->ill_phyint->phyint_flags & 7969 PHYI_LOOPBACK) || 7970 !(ipif->ipif_flags & IPIF_UP)) && 7971 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7972 continue; 7973 7974 if (zoneid != ipif->ipif_zoneid && 7975 ipif->ipif_zoneid != ALL_ZONES && 7976 (zoneid != GLOBAL_ZONEID || 7977 !(lifn_flags & LIFC_ALLZONES))) 7978 continue; 7979 7980 numifs++; 7981 } 7982 } 7983 rw_exit(&ill_g_lock); 7984 return (numifs); 7985 } 7986 7987 uint_t 7988 ip_get_lifsrcofnum(ill_t *ill) 7989 { 7990 uint_t numifs = 0; 7991 ill_t *ill_head = ill; 7992 7993 /* 7994 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7995 * other thread may be trying to relink the ILLs in this usesrc group 7996 * and adjusting the ill_usesrc_grp_next pointers 7997 */ 7998 rw_enter(&ill_g_usesrc_lock, RW_READER); 7999 if ((ill->ill_usesrc_ifindex == 0) && 8000 (ill->ill_usesrc_grp_next != NULL)) { 8001 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8002 ill = ill->ill_usesrc_grp_next) 8003 numifs++; 8004 } 8005 rw_exit(&ill_g_usesrc_lock); 8006 8007 return (numifs); 8008 } 8009 
8010 /* Null values are passed in for ipif, sin, and ifreq */ 8011 /* ARGSUSED */ 8012 int 8013 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8014 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8015 { 8016 int *nump; 8017 8018 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8019 8020 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8021 nump = (int *)mp->b_cont->b_cont->b_rptr; 8022 8023 *nump = ip_get_numifs(Q_TO_CONN(q)->conn_zoneid); 8024 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8025 return (0); 8026 } 8027 8028 /* Null values are passed in for ipif, sin, and ifreq */ 8029 /* ARGSUSED */ 8030 int 8031 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8032 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8033 { 8034 struct lifnum *lifn; 8035 mblk_t *mp1; 8036 8037 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8038 8039 /* Existence checked in ip_wput_nondata */ 8040 mp1 = mp->b_cont->b_cont; 8041 8042 lifn = (struct lifnum *)mp1->b_rptr; 8043 switch (lifn->lifn_family) { 8044 case AF_UNSPEC: 8045 case AF_INET: 8046 case AF_INET6: 8047 break; 8048 default: 8049 return (EAFNOSUPPORT); 8050 } 8051 8052 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8053 Q_TO_CONN(q)->conn_zoneid); 8054 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8055 return (0); 8056 } 8057 8058 /* ARGSUSED */ 8059 int 8060 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8061 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8062 { 8063 STRUCT_HANDLE(ifconf, ifc); 8064 mblk_t *mp1; 8065 struct iocblk *iocp; 8066 struct ifreq *ifr; 8067 ill_walk_context_t ctx; 8068 ill_t *ill; 8069 ipif_t *ipif; 8070 struct sockaddr_in *sin; 8071 int32_t ifclen; 8072 zoneid_t zoneid; 8073 8074 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8075 8076 ip1dbg(("ip_sioctl_get_ifconf")); 8077 /* Existence verified in 
ip_wput_nondata */ 8078 mp1 = mp->b_cont->b_cont; 8079 iocp = (struct iocblk *)mp->b_rptr; 8080 zoneid = Q_TO_CONN(q)->conn_zoneid; 8081 8082 /* 8083 * The original SIOCGIFCONF passed in a struct ifconf which specified 8084 * the user buffer address and length into which the list of struct 8085 * ifreqs was to be copied. Since AT&T Streams does not seem to 8086 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8087 * the SIOCGIFCONF operation was redefined to simply provide 8088 * a large output buffer into which we are supposed to jam the ifreq 8089 * array. The same ioctl command code was used, despite the fact that 8090 * both the applications and the kernel code had to change, thus making 8091 * it impossible to support both interfaces. 8092 * 8093 * For reasons not good enough to try to explain, the following 8094 * algorithm is used for deciding what to do with one of these: 8095 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8096 * form with the output buffer coming down as the continuation message. 8097 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8098 * and we have to copy in the ifconf structure to find out how big the 8099 * output buffer is and where to copy out to. Sure no problem... 8100 * 8101 */ 8102 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8103 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8104 int numifs = 0; 8105 size_t ifc_bufsize; 8106 8107 /* 8108 * Must be (better be!) continuation of a TRANSPARENT 8109 * IOCTL. We just copied in the ifconf structure. 8110 */ 8111 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8112 (struct ifconf *)mp1->b_rptr); 8113 8114 /* 8115 * Allocate a buffer to hold requested information. 8116 * 8117 * If ifc_len is larger than what is needed, we only 8118 * allocate what we will use. 8119 * 8120 * If ifc_len is smaller than what is needed, return 8121 * EINVAL. 
8122 * 8123 * XXX: the ill_t structure can hava 2 counters, for 8124 * v4 and v6 (not just ill_ipif_up_count) to store the 8125 * number of interfaces for a device, so we don't need 8126 * to count them here... 8127 */ 8128 numifs = ip_get_numifs(zoneid); 8129 8130 ifclen = STRUCT_FGET(ifc, ifc_len); 8131 ifc_bufsize = numifs * sizeof (struct ifreq); 8132 if (ifc_bufsize > ifclen) { 8133 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8134 /* old behaviour */ 8135 return (EINVAL); 8136 } else { 8137 ifc_bufsize = ifclen; 8138 } 8139 } 8140 8141 mp1 = mi_copyout_alloc(q, mp, 8142 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8143 if (mp1 == NULL) 8144 return (ENOMEM); 8145 8146 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8147 } 8148 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8149 /* 8150 * the SIOCGIFCONF ioctl only knows about 8151 * IPv4 addresses, so don't try to tell 8152 * it about interfaces with IPv6-only 8153 * addresses. (Last parm 'isv6' is B_FALSE) 8154 */ 8155 8156 ifr = (struct ifreq *)mp1->b_rptr; 8157 8158 rw_enter(&ill_g_lock, RW_READER); 8159 ill = ILL_START_WALK_V4(&ctx); 8160 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8161 for (ipif = ill->ill_ipif; ipif; 8162 ipif = ipif->ipif_next) { 8163 if (zoneid != ipif->ipif_zoneid && 8164 ipif->ipif_zoneid != ALL_ZONES) 8165 continue; 8166 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8167 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8168 /* old behaviour */ 8169 rw_exit(&ill_g_lock); 8170 return (EINVAL); 8171 } else { 8172 goto if_copydone; 8173 } 8174 } 8175 (void) ipif_get_name(ipif, 8176 ifr->ifr_name, 8177 sizeof (ifr->ifr_name)); 8178 sin = (sin_t *)&ifr->ifr_addr; 8179 *sin = sin_null; 8180 sin->sin_family = AF_INET; 8181 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8182 ifr++; 8183 } 8184 } 8185 if_copydone: 8186 rw_exit(&ill_g_lock); 8187 mp1->b_wptr = (uchar_t *)ifr; 8188 8189 if (STRUCT_BUF(ifc) != NULL) { 8190 STRUCT_FSET(ifc, ifc_len, 8191 (int)((uchar_t *)ifr - mp1->b_rptr)); 8192 } 8193 return (0); 
8194 } 8195 8196 /* 8197 * Get the interfaces using the address hosted on the interface passed in, 8198 * as a source adddress 8199 */ 8200 /* ARGSUSED */ 8201 int 8202 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8203 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8204 { 8205 mblk_t *mp1; 8206 ill_t *ill, *ill_head; 8207 ipif_t *ipif, *orig_ipif; 8208 int numlifs = 0; 8209 size_t lifs_bufsize, lifsmaxlen; 8210 struct lifreq *lifr; 8211 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8212 uint_t ifindex; 8213 zoneid_t zoneid; 8214 int err = 0; 8215 boolean_t isv6 = B_FALSE; 8216 struct sockaddr_in *sin; 8217 struct sockaddr_in6 *sin6; 8218 8219 STRUCT_HANDLE(lifsrcof, lifs); 8220 8221 ASSERT(q->q_next == NULL); 8222 8223 zoneid = Q_TO_CONN(q)->conn_zoneid; 8224 8225 /* Existence verified in ip_wput_nondata */ 8226 mp1 = mp->b_cont->b_cont; 8227 8228 /* 8229 * Must be (better be!) continuation of a TRANSPARENT 8230 * IOCTL. We just copied in the lifsrcof structure. 
8231 */ 8232 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8233 (struct lifsrcof *)mp1->b_rptr); 8234 8235 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8236 return (EINVAL); 8237 8238 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8239 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8240 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8241 ip_process_ioctl, &err); 8242 if (ipif == NULL) { 8243 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8244 ifindex)); 8245 return (err); 8246 } 8247 8248 8249 /* Allocate a buffer to hold requested information */ 8250 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8251 lifs_bufsize = numlifs * sizeof (struct lifreq); 8252 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8253 /* The actual size needed is always returned in lifs_len */ 8254 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8255 8256 /* If the amount we need is more than what is passed in, abort */ 8257 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8258 ipif_refrele(ipif); 8259 return (0); 8260 } 8261 8262 mp1 = mi_copyout_alloc(q, mp, 8263 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8264 if (mp1 == NULL) { 8265 ipif_refrele(ipif); 8266 return (ENOMEM); 8267 } 8268 8269 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8270 bzero(mp1->b_rptr, lifs_bufsize); 8271 8272 lifr = (struct lifreq *)mp1->b_rptr; 8273 8274 ill = ill_head = ipif->ipif_ill; 8275 orig_ipif = ipif; 8276 8277 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8278 rw_enter(&ill_g_usesrc_lock, RW_READER); 8279 rw_enter(&ill_g_lock, RW_READER); 8280 8281 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8282 for (; (ill != NULL) && (ill != ill_head); 8283 ill = ill->ill_usesrc_grp_next) { 8284 8285 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8286 break; 8287 8288 ipif = ill->ill_ipif; 8289 (void) ipif_get_name(ipif, 8290 lifr->lifr_name, sizeof (lifr->lifr_name)); 8291 if (ipif->ipif_isv6) { 8292 sin6 = (sin6_t *)&lifr->lifr_addr; 8293 *sin6 = sin6_null; 8294 sin6->sin6_family = AF_INET6; 8295 
sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8296 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8297 &ipif->ipif_v6net_mask); 8298 } else { 8299 sin = (sin_t *)&lifr->lifr_addr; 8300 *sin = sin_null; 8301 sin->sin_family = AF_INET; 8302 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8303 lifr->lifr_addrlen = ip_mask_to_plen( 8304 ipif->ipif_net_mask); 8305 } 8306 lifr++; 8307 } 8308 rw_exit(&ill_g_usesrc_lock); 8309 rw_exit(&ill_g_lock); 8310 ipif_refrele(orig_ipif); 8311 mp1->b_wptr = (uchar_t *)lifr; 8312 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8313 8314 return (0); 8315 } 8316 8317 /* ARGSUSED */ 8318 int 8319 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8320 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8321 { 8322 mblk_t *mp1; 8323 int list; 8324 ill_t *ill; 8325 ipif_t *ipif; 8326 int flags; 8327 int numlifs = 0; 8328 size_t lifc_bufsize; 8329 struct lifreq *lifr; 8330 sa_family_t family; 8331 struct sockaddr_in *sin; 8332 struct sockaddr_in6 *sin6; 8333 ill_walk_context_t ctx; 8334 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8335 int32_t lifclen; 8336 zoneid_t zoneid; 8337 STRUCT_HANDLE(lifconf, lifc); 8338 8339 ip1dbg(("ip_sioctl_get_lifconf")); 8340 8341 ASSERT(q->q_next == NULL); 8342 8343 zoneid = Q_TO_CONN(q)->conn_zoneid; 8344 8345 /* Existence verified in ip_wput_nondata */ 8346 mp1 = mp->b_cont->b_cont; 8347 8348 /* 8349 * An extended version of SIOCGIFCONF that takes an 8350 * additional address family and flags field. 8351 * AF_UNSPEC retrieve both IPv4 and IPv6. 8352 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8353 * interfaces are omitted. 8354 * Similarly, IPIF_TEMPORARY interfaces are omitted 8355 * unless LIFC_TEMPORARY is specified. 8356 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8357 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8358 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8359 * has priority over LIFC_NOXMIT. 
8360 */ 8361 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8362 8363 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8364 return (EINVAL); 8365 8366 /* 8367 * Must be (better be!) continuation of a TRANSPARENT 8368 * IOCTL. We just copied in the lifconf structure. 8369 */ 8370 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8371 8372 family = STRUCT_FGET(lifc, lifc_family); 8373 flags = STRUCT_FGET(lifc, lifc_flags); 8374 8375 switch (family) { 8376 case AF_UNSPEC: 8377 /* 8378 * walk all ILL's. 8379 */ 8380 list = MAX_G_HEADS; 8381 break; 8382 case AF_INET: 8383 /* 8384 * walk only IPV4 ILL's. 8385 */ 8386 list = IP_V4_G_HEAD; 8387 break; 8388 case AF_INET6: 8389 /* 8390 * walk only IPV6 ILL's. 8391 */ 8392 list = IP_V6_G_HEAD; 8393 break; 8394 default: 8395 return (EAFNOSUPPORT); 8396 } 8397 8398 /* 8399 * Allocate a buffer to hold requested information. 8400 * 8401 * If lifc_len is larger than what is needed, we only 8402 * allocate what we will use. 8403 * 8404 * If lifc_len is smaller than what is needed, return 8405 * EINVAL. 
8406 */ 8407 numlifs = ip_get_numlifs(family, flags, zoneid); 8408 lifc_bufsize = numlifs * sizeof (struct lifreq); 8409 lifclen = STRUCT_FGET(lifc, lifc_len); 8410 if (lifc_bufsize > lifclen) { 8411 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8412 return (EINVAL); 8413 else 8414 lifc_bufsize = lifclen; 8415 } 8416 8417 mp1 = mi_copyout_alloc(q, mp, 8418 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8419 if (mp1 == NULL) 8420 return (ENOMEM); 8421 8422 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8423 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8424 8425 lifr = (struct lifreq *)mp1->b_rptr; 8426 8427 rw_enter(&ill_g_lock, RW_READER); 8428 ill = ill_first(list, list, &ctx); 8429 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8430 for (ipif = ill->ill_ipif; ipif != NULL; 8431 ipif = ipif->ipif_next) { 8432 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8433 !(flags & LIFC_NOXMIT)) 8434 continue; 8435 8436 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8437 !(flags & LIFC_TEMPORARY)) 8438 continue; 8439 8440 if (((ipif->ipif_flags & 8441 (IPIF_NOXMIT|IPIF_NOLOCAL| 8442 IPIF_DEPRECATED)) || 8443 (ill->ill_phyint->phyint_flags & 8444 PHYI_LOOPBACK) || 8445 !(ipif->ipif_flags & IPIF_UP)) && 8446 (flags & LIFC_EXTERNAL_SOURCE)) 8447 continue; 8448 8449 if (zoneid != ipif->ipif_zoneid && 8450 ipif->ipif_zoneid != ALL_ZONES && 8451 (zoneid != GLOBAL_ZONEID || 8452 !(flags & LIFC_ALLZONES))) 8453 continue; 8454 8455 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8456 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8457 rw_exit(&ill_g_lock); 8458 return (EINVAL); 8459 } else { 8460 goto lif_copydone; 8461 } 8462 } 8463 8464 (void) ipif_get_name(ipif, 8465 lifr->lifr_name, 8466 sizeof (lifr->lifr_name)); 8467 if (ipif->ipif_isv6) { 8468 sin6 = (sin6_t *)&lifr->lifr_addr; 8469 *sin6 = sin6_null; 8470 sin6->sin6_family = AF_INET6; 8471 sin6->sin6_addr = 8472 ipif->ipif_v6lcl_addr; 8473 lifr->lifr_addrlen = 8474 ip_mask_to_plen_v6( 8475 &ipif->ipif_v6net_mask); 8476 } else { 8477 sin = (sin_t 
*)&lifr->lifr_addr; 8478 *sin = sin_null; 8479 sin->sin_family = AF_INET; 8480 sin->sin_addr.s_addr = 8481 ipif->ipif_lcl_addr; 8482 lifr->lifr_addrlen = 8483 ip_mask_to_plen( 8484 ipif->ipif_net_mask); 8485 } 8486 lifr++; 8487 } 8488 } 8489 lif_copydone: 8490 rw_exit(&ill_g_lock); 8491 8492 mp1->b_wptr = (uchar_t *)lifr; 8493 if (STRUCT_BUF(lifc) != NULL) { 8494 STRUCT_FSET(lifc, lifc_len, 8495 (int)((uchar_t *)lifr - mp1->b_rptr)); 8496 } 8497 return (0); 8498 } 8499 8500 /* ARGSUSED */ 8501 int 8502 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 8503 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8504 { 8505 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8506 ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 8507 return (0); 8508 } 8509 8510 static void 8511 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8512 { 8513 ip6_asp_t *table; 8514 size_t table_size; 8515 mblk_t *data_mp; 8516 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8517 8518 /* These two ioctls are I_STR only */ 8519 if (iocp->ioc_count == TRANSPARENT) { 8520 miocnak(q, mp, 0, EINVAL); 8521 return; 8522 } 8523 8524 data_mp = mp->b_cont; 8525 if (data_mp == NULL) { 8526 /* The user passed us a NULL argument */ 8527 table = NULL; 8528 table_size = iocp->ioc_count; 8529 } else { 8530 /* 8531 * The user provided a table. The stream head 8532 * may have copied in the user data in chunks, 8533 * so make sure everything is pulled up 8534 * properly. 
8535 */ 8536 if (MBLKL(data_mp) < iocp->ioc_count) { 8537 mblk_t *new_data_mp; 8538 if ((new_data_mp = msgpullup(data_mp, -1)) == 8539 NULL) { 8540 miocnak(q, mp, 0, ENOMEM); 8541 return; 8542 } 8543 freemsg(data_mp); 8544 data_mp = new_data_mp; 8545 mp->b_cont = data_mp; 8546 } 8547 table = (ip6_asp_t *)data_mp->b_rptr; 8548 table_size = iocp->ioc_count; 8549 } 8550 8551 switch (iocp->ioc_cmd) { 8552 case SIOCGIP6ADDRPOLICY: 8553 iocp->ioc_rval = ip6_asp_get(table, table_size); 8554 if (iocp->ioc_rval == -1) 8555 iocp->ioc_error = EINVAL; 8556 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8557 else if (table != NULL && 8558 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8559 ip6_asp_t *src = table; 8560 ip6_asp32_t *dst = (void *)table; 8561 int count = table_size / sizeof (ip6_asp_t); 8562 int i; 8563 8564 /* 8565 * We need to do an in-place shrink of the array 8566 * to match the alignment attributes of the 8567 * 32-bit ABI looking at it. 8568 */ 8569 /* LINTED: logical expression always true: op "||" */ 8570 ASSERT(sizeof (*src) > sizeof (*dst)); 8571 for (i = 1; i < count; i++) 8572 bcopy(src + i, dst + i, sizeof (*dst)); 8573 } 8574 #endif 8575 break; 8576 8577 case SIOCSIP6ADDRPOLICY: 8578 ASSERT(mp->b_prev == NULL); 8579 mp->b_prev = (void *)q; 8580 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8581 /* 8582 * We pass in the datamodel here so that the ip6_asp_replace() 8583 * routine can handle converting from 32-bit to native formats 8584 * where necessary. 8585 * 8586 * A better way to handle this might be to convert the inbound 8587 * data structure here, and hang it off a new 'mp'; thus the 8588 * ip6_asp_replace() logic would always be dealing with native 8589 * format data structures.. 8590 * 8591 * (An even simpler way to handle these ioctls is to just 8592 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8593 * and just recompile everything that depends on it.) 
8594 */ 8595 #endif 8596 ip6_asp_replace(mp, table, table_size, B_FALSE, 8597 iocp->ioc_flag & IOC_MODELS); 8598 return; 8599 } 8600 8601 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 8602 qreply(q, mp); 8603 } 8604 8605 static void 8606 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8607 { 8608 mblk_t *data_mp; 8609 struct dstinforeq *dir; 8610 uint8_t *end, *cur; 8611 in6_addr_t *daddr, *saddr; 8612 ipaddr_t v4daddr; 8613 ire_t *ire; 8614 char *slabel, *dlabel; 8615 boolean_t isipv4; 8616 int match_ire; 8617 ill_t *dst_ill; 8618 ipif_t *src_ipif, *ire_ipif; 8619 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8620 zoneid_t zoneid; 8621 8622 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8623 zoneid = Q_TO_CONN(q)->conn_zoneid; 8624 8625 /* 8626 * This ioctl is I_STR only, and must have a 8627 * data mblk following the M_IOCTL mblk. 8628 */ 8629 data_mp = mp->b_cont; 8630 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8631 miocnak(q, mp, 0, EINVAL); 8632 return; 8633 } 8634 8635 if (MBLKL(data_mp) < iocp->ioc_count) { 8636 mblk_t *new_data_mp; 8637 8638 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8639 miocnak(q, mp, 0, ENOMEM); 8640 return; 8641 } 8642 freemsg(data_mp); 8643 data_mp = new_data_mp; 8644 mp->b_cont = data_mp; 8645 } 8646 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8647 8648 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8649 end - cur >= sizeof (struct dstinforeq); 8650 cur += sizeof (struct dstinforeq)) { 8651 dir = (struct dstinforeq *)cur; 8652 daddr = &dir->dir_daddr; 8653 saddr = &dir->dir_saddr; 8654 8655 /* 8656 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8657 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8658 * and ipif_select_source[_v6]() do not. 
8659 */ 8660 dir->dir_dscope = ip_addr_scope_v6(daddr); 8661 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence); 8662 8663 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 8664 if (isipv4) { 8665 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 8666 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 8667 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8668 } else { 8669 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 8670 0, NULL, NULL, zoneid, 0, NULL, match_ire); 8671 } 8672 if (ire == NULL) { 8673 dir->dir_dreachable = 0; 8674 8675 /* move on to next dst addr */ 8676 continue; 8677 } 8678 dir->dir_dreachable = 1; 8679 8680 ire_ipif = ire->ire_ipif; 8681 if (ire_ipif == NULL) 8682 goto next_dst; 8683 8684 /* 8685 * We expect to get back an interface ire or a 8686 * gateway ire cache entry. For both types, the 8687 * output interface is ire_ipif->ipif_ill. 8688 */ 8689 dst_ill = ire_ipif->ipif_ill; 8690 dir->dir_dmactype = dst_ill->ill_mactype; 8691 8692 if (isipv4) { 8693 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 8694 } else { 8695 src_ipif = ipif_select_source_v6(dst_ill, 8696 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 8697 zoneid); 8698 } 8699 if (src_ipif == NULL) 8700 goto next_dst; 8701 8702 *saddr = src_ipif->ipif_v6lcl_addr; 8703 dir->dir_sscope = ip_addr_scope_v6(saddr); 8704 slabel = ip6_asp_lookup(saddr, NULL); 8705 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 8706 dir->dir_sdeprecated = 8707 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 8708 ipif_refrele(src_ipif); 8709 next_dst: 8710 ire_refrele(ire); 8711 } 8712 miocack(q, mp, iocp->ioc_count, 0); 8713 } 8714 8715 8716 /* 8717 * Check if this is an address assigned to this machine. 8718 * Skips interfaces that are down by using ire checks. 8719 * Translates mapped addresses to v4 addresses and then 8720 * treats them as such, returning true if the v4 address 8721 * associated with this mapped address is configured. 
8722 * Note: Applications will have to be careful what they do 8723 * with the response; use of mapped addresses limits 8724 * what can be done with the socket, especially with 8725 * respect to socket options and ioctls - neither IPv4 8726 * options nor IPv6 sticky options/ancillary data options 8727 * may be used. 8728 */ 8729 /* ARGSUSED */ 8730 int 8731 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8732 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8733 { 8734 struct sioc_addrreq *sia; 8735 sin_t *sin; 8736 ire_t *ire; 8737 mblk_t *mp1; 8738 zoneid_t zoneid; 8739 8740 ip1dbg(("ip_sioctl_tmyaddr")); 8741 8742 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8743 zoneid = Q_TO_CONN(q)->conn_zoneid; 8744 8745 /* Existence verified in ip_wput_nondata */ 8746 mp1 = mp->b_cont->b_cont; 8747 sia = (struct sioc_addrreq *)mp1->b_rptr; 8748 sin = (sin_t *)&sia->sa_addr; 8749 switch (sin->sin_family) { 8750 case AF_INET6: { 8751 sin6_t *sin6 = (sin6_t *)sin; 8752 8753 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8754 ipaddr_t v4_addr; 8755 8756 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8757 v4_addr); 8758 ire = ire_ctable_lookup(v4_addr, 0, 8759 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8760 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8761 } else { 8762 in6_addr_t v6addr; 8763 8764 v6addr = sin6->sin6_addr; 8765 ire = ire_ctable_lookup_v6(&v6addr, 0, 8766 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8767 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8768 } 8769 break; 8770 } 8771 case AF_INET: { 8772 ipaddr_t v4addr; 8773 8774 v4addr = sin->sin_addr.s_addr; 8775 ire = ire_ctable_lookup(v4addr, 0, 8776 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8777 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8778 break; 8779 } 8780 default: 8781 return (EAFNOSUPPORT); 8782 } 8783 if (ire != NULL) { 8784 sia->sa_res = 1; 8785 ire_refrele(ire); 8786 } else { 8787 sia->sa_res = 0; 8788 } 8789 return (0); 8790 } 8791 8792 /* 8793 * Check if this is an address 
assigned on-link i.e. neighbor, 8794 * and makes sure it's reachable from the current zone. 8795 * Returns true for my addresses as well. 8796 * Translates mapped addresses to v4 addresses and then 8797 * treats them as such, returning true if the v4 address 8798 * associated with this mapped address is configured. 8799 * Note: Applications will have to be careful what they do 8800 * with the response; use of mapped addresses limits 8801 * what can be done with the socket, especially with 8802 * respect to socket options and ioctls - neither IPv4 8803 * options nor IPv6 sticky options/ancillary data options 8804 * may be used. 8805 */ 8806 /* ARGSUSED */ 8807 int 8808 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8809 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq) 8810 { 8811 struct sioc_addrreq *sia; 8812 sin_t *sin; 8813 mblk_t *mp1; 8814 ire_t *ire = NULL; 8815 zoneid_t zoneid; 8816 8817 ip1dbg(("ip_sioctl_tonlink")); 8818 8819 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8820 zoneid = Q_TO_CONN(q)->conn_zoneid; 8821 8822 /* Existence verified in ip_wput_nondata */ 8823 mp1 = mp->b_cont->b_cont; 8824 sia = (struct sioc_addrreq *)mp1->b_rptr; 8825 sin = (sin_t *)&sia->sa_addr; 8826 8827 /* 8828 * Match addresses with a zero gateway field to avoid 8829 * routes going through a router. 8830 * Exclude broadcast and multicast addresses. 
8831 */ 8832 switch (sin->sin_family) { 8833 case AF_INET6: { 8834 sin6_t *sin6 = (sin6_t *)sin; 8835 8836 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8837 ipaddr_t v4_addr; 8838 8839 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8840 v4_addr); 8841 if (!CLASSD(v4_addr)) { 8842 ire = ire_route_lookup(v4_addr, 0, 0, 0, 8843 NULL, NULL, zoneid, NULL, 8844 MATCH_IRE_GW); 8845 } 8846 } else { 8847 in6_addr_t v6addr; 8848 in6_addr_t v6gw; 8849 8850 v6addr = sin6->sin6_addr; 8851 v6gw = ipv6_all_zeros; 8852 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8853 ire = ire_route_lookup_v6(&v6addr, 0, 8854 &v6gw, 0, NULL, NULL, zoneid, 8855 NULL, MATCH_IRE_GW); 8856 } 8857 } 8858 break; 8859 } 8860 case AF_INET: { 8861 ipaddr_t v4addr; 8862 8863 v4addr = sin->sin_addr.s_addr; 8864 if (!CLASSD(v4addr)) { 8865 ire = ire_route_lookup(v4addr, 0, 0, 0, 8866 NULL, NULL, zoneid, NULL, 8867 MATCH_IRE_GW); 8868 } 8869 break; 8870 } 8871 default: 8872 return (EAFNOSUPPORT); 8873 } 8874 sia->sa_res = 0; 8875 if (ire != NULL) { 8876 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 8877 IRE_LOCAL|IRE_LOOPBACK)) { 8878 sia->sa_res = 1; 8879 } 8880 ire_refrele(ire); 8881 } 8882 return (0); 8883 } 8884 8885 /* 8886 * TBD: implement when kernel maintaines a list of site prefixes. 
8887 */ 8888 /* ARGSUSED */ 8889 int 8890 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8891 ip_ioctl_cmd_t *ipip, void *ifreq) 8892 { 8893 return (ENXIO); 8894 } 8895 8896 /* ARGSUSED */ 8897 int 8898 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8899 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8900 { 8901 ill_t *ill; 8902 mblk_t *mp1; 8903 conn_t *connp; 8904 boolean_t success; 8905 8906 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 8907 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 8908 /* ioctl comes down on an conn */ 8909 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8910 connp = Q_TO_CONN(q); 8911 8912 mp->b_datap->db_type = M_IOCTL; 8913 8914 /* 8915 * Send down a copy. (copymsg does not copy b_next/b_prev). 8916 * The original mp contains contaminated b_next values due to 'mi', 8917 * which is needed to do the mi_copy_done. Unfortunately if we 8918 * send down the original mblk itself and if we are popped due to an 8919 * an unplumb before the response comes back from tunnel, 8920 * the streamhead (which does a freemsg) will see this contaminated 8921 * message and the assertion in freemsg about non-null b_next/b_prev 8922 * will panic a DEBUG kernel. 
8923 */ 8924 mp1 = copymsg(mp); 8925 if (mp1 == NULL) 8926 return (ENOMEM); 8927 8928 ill = ipif->ipif_ill; 8929 mutex_enter(&connp->conn_lock); 8930 mutex_enter(&ill->ill_lock); 8931 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 8932 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 8933 mp, 0); 8934 } else { 8935 success = ill_pending_mp_add(ill, connp, mp); 8936 } 8937 mutex_exit(&ill->ill_lock); 8938 mutex_exit(&connp->conn_lock); 8939 8940 if (success) { 8941 ip1dbg(("sending down tunparam request ")); 8942 putnext(ill->ill_wq, mp1); 8943 return (EINPROGRESS); 8944 } else { 8945 /* The conn has started closing */ 8946 freemsg(mp1); 8947 return (EINTR); 8948 } 8949 } 8950 8951 static int 8952 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, 8953 boolean_t x_arp_ioctl, boolean_t if_arp_ioctl) 8954 { 8955 mblk_t *mp1; 8956 mblk_t *mp2; 8957 mblk_t *pending_mp; 8958 ipaddr_t ipaddr; 8959 area_t *area; 8960 struct iocblk *iocp; 8961 conn_t *connp; 8962 struct arpreq *ar; 8963 struct xarpreq *xar; 8964 boolean_t success; 8965 int flags, alength; 8966 char *lladdr; 8967 8968 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8969 connp = Q_TO_CONN(q); 8970 8971 iocp = (struct iocblk *)mp->b_rptr; 8972 /* 8973 * ill has already been set depending on whether 8974 * bsd style or interface style ioctl. 8975 */ 8976 ASSERT(ill != NULL); 8977 8978 /* 8979 * Is this one of the new SIOC*XARP ioctls? 8980 */ 8981 if (x_arp_ioctl) { 8982 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 8983 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 8984 ar = NULL; 8985 8986 flags = xar->xarp_flags; 8987 lladdr = LLADDR(&xar->xarp_ha); 8988 /* 8989 * Validate against user's link layer address length 8990 * input and name and addr length limits. 
8991 */ 8992 alength = ill->ill_phys_addr_length; 8993 if (iocp->ioc_cmd == SIOCSXARP) { 8994 if (alength != xar->xarp_ha.sdl_alen || 8995 (alength + xar->xarp_ha.sdl_nlen > 8996 sizeof (xar->xarp_ha.sdl_data))) 8997 return (EINVAL); 8998 } 8999 } else { 9000 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9001 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9002 xar = NULL; 9003 9004 flags = ar->arp_flags; 9005 lladdr = ar->arp_ha.sa_data; 9006 /* 9007 * Theoretically, the sa_family could tell us what link 9008 * layer type this operation is trying to deal with. By 9009 * common usage AF_UNSPEC means ethernet. We'll assume 9010 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9011 * for now. Our new SIOC*XARP ioctls can be used more 9012 * generally. 9013 * 9014 * If the underlying media happens to have a non 6 byte 9015 * address, arp module will fail set/get, but the del 9016 * operation will succeed. 9017 */ 9018 alength = 6; 9019 if ((iocp->ioc_cmd != SIOCDARP) && 9020 (alength != ill->ill_phys_addr_length)) { 9021 return (EINVAL); 9022 } 9023 } 9024 9025 /* 9026 * We are going to pass up to ARP a packet chain that looks 9027 * like: 9028 * 9029 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9030 * 9031 * Get a copy of the original IOCTL mblk to head the chain, 9032 * to be sent up (in mp1). Also get another copy to store 9033 * in the ill_pending_mp list, for matching the response 9034 * when it comes back from ARP. 9035 */ 9036 mp1 = copyb(mp); 9037 pending_mp = copymsg(mp); 9038 if (mp1 == NULL || pending_mp == NULL) { 9039 if (mp1 != NULL) 9040 freeb(mp1); 9041 if (pending_mp != NULL) 9042 inet_freemsg(pending_mp); 9043 return (ENOMEM); 9044 } 9045 9046 ipaddr = sin->sin_addr.s_addr; 9047 9048 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9049 (caddr_t)&ipaddr); 9050 if (mp2 == NULL) { 9051 freeb(mp1); 9052 inet_freemsg(pending_mp); 9053 return (ENOMEM); 9054 } 9055 /* Put together the chain. 
*/ 9056 mp1->b_cont = mp2; 9057 mp1->b_datap->db_type = M_IOCTL; 9058 mp2->b_cont = mp; 9059 mp2->b_datap->db_type = M_DATA; 9060 9061 iocp = (struct iocblk *)mp1->b_rptr; 9062 9063 /* 9064 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9065 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9066 * cp_private field (or cp_rval on 32-bit systems) in place of the 9067 * ioc_count field; set ioc_count to be correct. 9068 */ 9069 iocp->ioc_count = MBLKL(mp1->b_cont); 9070 9071 /* 9072 * Set the proper command in the ARP message. 9073 * Convert the SIOC{G|S|D}ARP calls into our 9074 * AR_ENTRY_xxx calls. 9075 */ 9076 area = (area_t *)mp2->b_rptr; 9077 switch (iocp->ioc_cmd) { 9078 case SIOCDARP: 9079 case SIOCDXARP: 9080 /* 9081 * We defer deleting the corresponding IRE until 9082 * we return from arp. 9083 */ 9084 area->area_cmd = AR_ENTRY_DELETE; 9085 area->area_proto_mask_offset = 0; 9086 break; 9087 case SIOCGARP: 9088 case SIOCGXARP: 9089 area->area_cmd = AR_ENTRY_SQUERY; 9090 area->area_proto_mask_offset = 0; 9091 break; 9092 case SIOCSARP: 9093 case SIOCSXARP: { 9094 /* 9095 * Delete the corresponding ire to make sure IP will 9096 * pick up any change from arp. 9097 */ 9098 if (!if_arp_ioctl) { 9099 (void) ip_ire_clookup_and_delete(ipaddr, NULL); 9100 break; 9101 } else { 9102 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9103 if (ipif != NULL) { 9104 (void) ip_ire_clookup_and_delete(ipaddr, ipif); 9105 ipif_refrele(ipif); 9106 } 9107 break; 9108 } 9109 } 9110 } 9111 iocp->ioc_cmd = area->area_cmd; 9112 9113 /* 9114 * Before sending 'mp' to ARP, we have to clear the b_next 9115 * and b_prev. Otherwise if STREAMS encounters such a message 9116 * in freemsg(), (because ARP can close any time) it can cause 9117 * a panic. But mi code needs the b_next and b_prev values of 9118 * mp->b_cont, to complete the ioctl. 
So we store it here 9119 * in pending_mp->bcont, and restore it in ip_sioctl_iocack() 9120 * when the response comes down from ARP. 9121 */ 9122 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9123 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9124 mp->b_cont->b_next = NULL; 9125 mp->b_cont->b_prev = NULL; 9126 9127 mutex_enter(&connp->conn_lock); 9128 mutex_enter(&ill->ill_lock); 9129 /* conn has not yet started closing, hence this can't fail */ 9130 success = ill_pending_mp_add(ill, connp, pending_mp); 9131 ASSERT(success); 9132 mutex_exit(&ill->ill_lock); 9133 mutex_exit(&connp->conn_lock); 9134 9135 /* 9136 * Fill in the rest of the ARP operation fields. 9137 */ 9138 area->area_hw_addr_length = alength; 9139 bcopy(lladdr, 9140 (char *)area + area->area_hw_addr_offset, 9141 area->area_hw_addr_length); 9142 /* Translate the flags. */ 9143 if (flags & ATF_PERM) 9144 area->area_flags |= ACE_F_PERMANENT; 9145 if (flags & ATF_PUBL) 9146 area->area_flags |= ACE_F_PUBLISH; 9147 9148 /* 9149 * Up to ARP it goes. The response will come 9150 * back in ip_wput as an M_IOCACK message, and 9151 * will be handed to ip_sioctl_iocack for 9152 * completion. 
9153 */ 9154 putnext(ill->ill_rq, mp1); 9155 return (EINPROGRESS); 9156 } 9157 9158 /* ARGSUSED */ 9159 int 9160 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9161 ip_ioctl_cmd_t *ipip, void *ifreq) 9162 { 9163 struct xarpreq *xar; 9164 boolean_t isv6; 9165 mblk_t *mp1; 9166 int err; 9167 conn_t *connp; 9168 int ifnamelen; 9169 ire_t *ire = NULL; 9170 ill_t *ill = NULL; 9171 struct sockaddr_in *sin; 9172 boolean_t if_arp_ioctl = B_FALSE; 9173 9174 /* ioctl comes down on an conn */ 9175 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9176 connp = Q_TO_CONN(q); 9177 isv6 = connp->conn_af_isv6; 9178 9179 /* Existance verified in ip_wput_nondata */ 9180 mp1 = mp->b_cont->b_cont; 9181 9182 ASSERT(MBLKL(mp1) >= sizeof (*xar)); 9183 xar = (struct xarpreq *)mp1->b_rptr; 9184 sin = (sin_t *)&xar->xarp_pa; 9185 9186 if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) || 9187 (xar->xarp_pa.ss_family != AF_INET)) 9188 return (ENXIO); 9189 9190 ifnamelen = xar->xarp_ha.sdl_nlen; 9191 if (ifnamelen != 0) { 9192 char *cptr, cval; 9193 9194 if (ifnamelen >= LIFNAMSIZ) 9195 return (EINVAL); 9196 9197 /* 9198 * Instead of bcopying a bunch of bytes, 9199 * null-terminate the string in-situ. 9200 */ 9201 cptr = xar->xarp_ha.sdl_data + ifnamelen; 9202 cval = *cptr; 9203 *cptr = '\0'; 9204 ill = ill_lookup_on_name(xar->xarp_ha.sdl_data, 9205 B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl, 9206 &err, NULL); 9207 *cptr = cval; 9208 if (ill == NULL) 9209 return (err); 9210 if (ill->ill_net_type != IRE_IF_RESOLVER) { 9211 ill_refrele(ill); 9212 return (ENXIO); 9213 } 9214 9215 if_arp_ioctl = B_TRUE; 9216 } else { 9217 /* 9218 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves 9219 * as an extended BSD ioctl. The kernel uses the IP address 9220 * to figure out the network interface. 
9221 */ 9222 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9223 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9224 ((ill = ire_to_ill(ire)) == NULL) || 9225 (ill->ill_net_type != IRE_IF_RESOLVER)) { 9226 if (ire != NULL) 9227 ire_refrele(ire); 9228 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9229 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9230 NULL, MATCH_IRE_TYPE); 9231 if ((ire == NULL) || 9232 ((ill = ire_to_ill(ire)) == NULL)) { 9233 if (ire != NULL) 9234 ire_refrele(ire); 9235 return (ENXIO); 9236 } 9237 } 9238 ASSERT(ire != NULL && ill != NULL); 9239 } 9240 9241 err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl); 9242 if (if_arp_ioctl) 9243 ill_refrele(ill); 9244 if (ire != NULL) 9245 ire_refrele(ire); 9246 9247 return (err); 9248 } 9249 9250 /* 9251 * ARP IOCTLs. 9252 * How does IP get in the business of fronting ARP configuration/queries? 9253 * Well its like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9254 * are by tradition passed in through a datagram socket. That lands in IP. 9255 * As it happens, this is just as well since the interface is quite crude in 9256 * that it passes in no information about protocol or hardware types, or 9257 * interface association. After making the protocol assumption, IP is in 9258 * the position to look up the name of the ILL, which ARP will need, and 9259 * format a request that can be handled by ARP. The request is passed up 9260 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9261 * back a response. ARP supports its own set of more general IOCTLs, in 9262 * case anyone is interested. 
9263 */ 9264 /* ARGSUSED */ 9265 int 9266 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9267 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9268 { 9269 struct arpreq *ar; 9270 struct sockaddr_in *sin; 9271 ire_t *ire; 9272 boolean_t isv6; 9273 mblk_t *mp1; 9274 int err; 9275 conn_t *connp; 9276 ill_t *ill; 9277 9278 /* ioctl comes down on an conn */ 9279 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9280 connp = Q_TO_CONN(q); 9281 isv6 = connp->conn_af_isv6; 9282 if (isv6) 9283 return (ENXIO); 9284 9285 /* Existance verified in ip_wput_nondata */ 9286 mp1 = mp->b_cont->b_cont; 9287 9288 ar = (struct arpreq *)mp1->b_rptr; 9289 sin = (sin_t *)&ar->arp_pa; 9290 9291 /* 9292 * We need to let ARP know on which interface the IP 9293 * address has an ARP mapping. In the IPMP case, a 9294 * simple forwarding table lookup will return the 9295 * IRE_IF_RESOLVER for the first interface in the group, 9296 * which might not be the interface on which the 9297 * requested IP address was resolved due to the ill 9298 * selection algorithm (see ip_newroute_get_dst_ill()). 9299 * So we do a cache table lookup first: if the IRE cache 9300 * entry for the IP address is still there, it will 9301 * contain the ill pointer for the right interface, so 9302 * we use that. If the cache entry has been flushed, we 9303 * fall back to the forwarding table lookup. This should 9304 * be rare enough since IRE cache entries have a longer 9305 * life expectancy than ARP cache entries. 
9306 */ 9307 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9308 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9309 ((ill = ire_to_ill(ire)) == NULL)) { 9310 if (ire != NULL) 9311 ire_refrele(ire); 9312 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9313 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9314 NULL, MATCH_IRE_TYPE); 9315 if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) { 9316 if (ire != NULL) 9317 ire_refrele(ire); 9318 return (ENXIO); 9319 } 9320 } 9321 ASSERT(ire != NULL && ill != NULL); 9322 9323 err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE); 9324 ire_refrele(ire); 9325 return (err); 9326 } 9327 9328 /* 9329 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9330 * atomically set/clear the muxids. Also complete the ioctl by acking or 9331 * naking it. Note that the code is structured such that the link type, 9332 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9333 * its clones use the persistent link, while pppd(1M) and perhaps many 9334 * other daemons may use non-persistent link. When combined with some 9335 * ill_t states, linking and unlinking lower streams may be used as 9336 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9337 */ 9338 /* ARGSUSED */ 9339 void 9340 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9341 { 9342 mblk_t *mp1; 9343 mblk_t *mp2; 9344 struct linkblk *li; 9345 queue_t *ipwq; 9346 char *name; 9347 struct qinit *qinfo; 9348 struct ipmx_s *ipmxp; 9349 ill_t *ill = NULL; 9350 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9351 int err = 0; 9352 boolean_t entered_ipsq = B_FALSE; 9353 boolean_t islink; 9354 queue_t *dwq = NULL; 9355 9356 ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK || 9357 iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK); 9358 9359 islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ? 
9360 B_TRUE : B_FALSE; 9361 9362 mp1 = mp->b_cont; /* This is the linkblk info */ 9363 li = (struct linkblk *)mp1->b_rptr; 9364 9365 /* 9366 * ARP has added this special mblk, and the utility is asking us 9367 * to perform consistency checks, and also atomically set the 9368 * muxid. Ifconfig is an example. It achieves this by using 9369 * /dev/arp as the mux to plink the arp stream, and pushes arp on 9370 * to /dev/udp[6] stream for use as the mux when plinking the IP 9371 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9372 * and other comments in this routine for more details. 9373 */ 9374 mp2 = mp1->b_cont; /* This is added by ARP */ 9375 9376 /* 9377 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9378 * ifconfig which didn't push ARP on top of the dummy mux, we won't 9379 * get the special mblk above. For backward compatibility, we just 9380 * return success. The utility will use SIOCSLIFMUXID to store 9381 * the muxids. This is not atomic, and can leave the streams 9382 * unplumbable if the utility is interrrupted, before it does the 9383 * SIOCSLIFMUXID. 9384 */ 9385 if (mp2 == NULL) { 9386 /* 9387 * At this point we don't know whether or not this is the 9388 * IP module stream or the ARP device stream. We need to 9389 * walk the lower stream in order to find this out, since 9390 * the capability negotiation is done only on the IP module 9391 * stream. IP module instance is identified by the module 9392 * name IP, non-null q_next, and it's wput not being ip_lwput. 9393 * STREAMS ensures that the lower stream (l_qbot) will not 9394 * vanish until this ioctl completes. So we can safely walk 9395 * the stream or refer to the q_ptr. 
9396 */ 9397 ipwq = li->l_qbot; 9398 while (ipwq != NULL) { 9399 qinfo = ipwq->q_qinfo; 9400 name = qinfo->qi_minfo->mi_idname; 9401 if (name != NULL && name[0] != NULL && 9402 (strcmp(name, ip_mod_info.mi_idname) == 0) && 9403 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 9404 (ipwq->q_next != NULL)) { 9405 break; 9406 } 9407 ipwq = ipwq->q_next; 9408 } 9409 /* 9410 * This looks like an IP module stream, so trigger 9411 * the capability reset or re-negotiation if necessary. 9412 */ 9413 if (ipwq != NULL) { 9414 ill = ipwq->q_ptr; 9415 ASSERT(ill != NULL); 9416 9417 if (ipsq == NULL) { 9418 ipsq = ipsq_try_enter(NULL, ill, q, mp, 9419 ip_sioctl_plink, NEW_OP, B_TRUE); 9420 if (ipsq == NULL) 9421 return; 9422 entered_ipsq = B_TRUE; 9423 } 9424 ASSERT(IAM_WRITER_ILL(ill)); 9425 /* 9426 * Store the upper read queue of the module 9427 * immediately below IP, and count the total 9428 * number of lower modules. Do this only 9429 * for I_PLINK or I_LINK event. 9430 */ 9431 ill->ill_lmod_rq = NULL; 9432 ill->ill_lmod_cnt = 0; 9433 if (islink && (dwq = ipwq->q_next) != NULL) { 9434 ill->ill_lmod_rq = RD(dwq); 9435 9436 while (dwq != NULL) { 9437 ill->ill_lmod_cnt++; 9438 dwq = dwq->q_next; 9439 } 9440 } 9441 /* 9442 * There's no point in resetting or re-negotiating if 9443 * we are not bound to the driver, so only do this if 9444 * the DLPI state is idle (up); we assume such state 9445 * since ill_ipif_up_count gets incremented in 9446 * ipif_up_done(), which is after we are bound to the 9447 * driver. Note that in the case of logical 9448 * interfaces, IP won't rebind to the driver unless 9449 * the ill_ipif_up_count is 0, meaning that all other 9450 * IP interfaces (including the main ipif) are in the 9451 * down state. Because of this, we use such counter 9452 * as an indicator, instead of relying on the IPIF_UP 9453 * flag, which is per ipif instance. 
9454 */ 9455 if (ill->ill_ipif_up_count > 0) { 9456 if (islink) 9457 ill_capability_probe(ill); 9458 else 9459 ill_capability_reset(ill); 9460 } 9461 } 9462 goto done; 9463 } 9464 9465 /* 9466 * This is an I_{P}LINK sent down by ifconfig on 9467 * /dev/arp. ARP has appended this last (3rd) mblk, 9468 * giving more info. STREAMS ensures that the lower 9469 * stream (l_qbot) will not vanish until this ioctl 9470 * completes. So we can safely walk the stream or refer 9471 * to the q_ptr. 9472 */ 9473 ipmxp = (struct ipmx_s *)mp2->b_rptr; 9474 if (ipmxp->ipmx_arpdev_stream) { 9475 /* 9476 * The operation is occuring on the arp-device 9477 * stream. 9478 */ 9479 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 9480 q, mp, ip_sioctl_plink, &err, NULL); 9481 if (ill == NULL) { 9482 if (err == EINPROGRESS) { 9483 return; 9484 } else { 9485 err = EINVAL; 9486 goto done; 9487 } 9488 } 9489 9490 if (ipsq == NULL) { 9491 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9492 NEW_OP, B_TRUE); 9493 if (ipsq == NULL) { 9494 ill_refrele(ill); 9495 return; 9496 } 9497 entered_ipsq = B_TRUE; 9498 } 9499 ASSERT(IAM_WRITER_ILL(ill)); 9500 ill_refrele(ill); 9501 /* 9502 * To ensure consistency between IP and ARP, 9503 * the following LIFO scheme is used in 9504 * plink/punlink. (IP first, ARP last). 9505 * This is because the muxid's are stored 9506 * in the IP stream on the ill. 9507 * 9508 * I_{P}LINK: ifconfig plinks the IP stream before 9509 * plinking the ARP stream. On an arp-dev 9510 * stream, IP checks that it is not yet 9511 * plinked, and it also checks that the 9512 * corresponding IP stream is already plinked. 9513 * 9514 * I_{P}UNLINK: ifconfig punlinks the ARP stream 9515 * before punlinking the IP stream. IP does 9516 * not allow punlink of the IP stream unless 9517 * the arp stream has been punlinked. 
9518 * 9519 */ 9520 if ((islink && 9521 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 9522 (!islink && 9523 ill->ill_arp_muxid != li->l_index)) { 9524 err = EINVAL; 9525 goto done; 9526 } 9527 if (islink) { 9528 ill->ill_arp_muxid = li->l_index; 9529 } else { 9530 ill->ill_arp_muxid = 0; 9531 } 9532 } else { 9533 /* 9534 * This must be the IP module stream with or 9535 * without arp. Walk the stream and locate the 9536 * IP module. An IP module instance is 9537 * identified by the module name IP, non-null 9538 * q_next, and it's wput not being ip_lwput. 9539 */ 9540 ipwq = li->l_qbot; 9541 while (ipwq != NULL) { 9542 qinfo = ipwq->q_qinfo; 9543 name = qinfo->qi_minfo->mi_idname; 9544 if (name != NULL && name[0] != NULL && 9545 (strcmp(name, ip_mod_info.mi_idname) == 0) && 9546 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 9547 (ipwq->q_next != NULL)) { 9548 break; 9549 } 9550 ipwq = ipwq->q_next; 9551 } 9552 if (ipwq != NULL) { 9553 ill = ipwq->q_ptr; 9554 ASSERT(ill != NULL); 9555 9556 if (ipsq == NULL) { 9557 ipsq = ipsq_try_enter(NULL, ill, q, mp, 9558 ip_sioctl_plink, NEW_OP, B_TRUE); 9559 if (ipsq == NULL) 9560 return; 9561 entered_ipsq = B_TRUE; 9562 } 9563 ASSERT(IAM_WRITER_ILL(ill)); 9564 /* 9565 * Return error if the ip_mux_id is 9566 * non-zero and command is I_{P}LINK. 9567 * If command is I_{P}UNLINK, return 9568 * error if the arp-devstr is not 9569 * yet punlinked. 9570 */ 9571 if ((islink && ill->ill_ip_muxid != 0) || 9572 (!islink && ill->ill_arp_muxid != 0)) { 9573 err = EINVAL; 9574 goto done; 9575 } 9576 ill->ill_lmod_rq = NULL; 9577 ill->ill_lmod_cnt = 0; 9578 if (islink) { 9579 /* 9580 * Store the upper read queue of the module 9581 * immediately below IP, and count the total 9582 * number of lower modules. 
9583 */ 9584 if ((dwq = ipwq->q_next) != NULL) { 9585 ill->ill_lmod_rq = RD(dwq); 9586 9587 while (dwq != NULL) { 9588 ill->ill_lmod_cnt++; 9589 dwq = dwq->q_next; 9590 } 9591 } 9592 ill->ill_ip_muxid = li->l_index; 9593 } else { 9594 ill->ill_ip_muxid = 0; 9595 } 9596 9597 /* 9598 * See comments above about resetting/re- 9599 * negotiating driver sub-capabilities. 9600 */ 9601 if (ill->ill_ipif_up_count > 0) { 9602 if (islink) 9603 ill_capability_probe(ill); 9604 else 9605 ill_capability_reset(ill); 9606 } 9607 } 9608 } 9609 done: 9610 iocp->ioc_count = 0; 9611 iocp->ioc_error = err; 9612 if (err == 0) 9613 mp->b_datap->db_type = M_IOCACK; 9614 else 9615 mp->b_datap->db_type = M_IOCNAK; 9616 qreply(q, mp); 9617 9618 /* Conn was refheld in ip_sioctl_copyin_setup */ 9619 if (CONN_Q(q)) 9620 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 9621 if (entered_ipsq) 9622 ipsq_exit(ipsq, B_TRUE, B_TRUE); 9623 } 9624 9625 /* 9626 * Search the ioctl command in the ioctl tables and return a pointer 9627 * to the ioctl command information. The ioctl command tables are 9628 * static and fully populated at compile time. 9629 */ 9630 ip_ioctl_cmd_t * 9631 ip_sioctl_lookup(int ioc_cmd) 9632 { 9633 int index; 9634 ip_ioctl_cmd_t *ipip; 9635 ip_ioctl_cmd_t *ipip_end; 9636 9637 if (ioc_cmd == IPI_DONTCARE) 9638 return (NULL); 9639 9640 /* 9641 * Do a 2 step search. First search the indexed table 9642 * based on the least significant byte of the ioctl cmd. 9643 * If we don't find a match, then search the misc table 9644 * serially. 
9645 */ 9646 index = ioc_cmd & 0xFF; 9647 if (index < ip_ndx_ioctl_count) { 9648 ipip = &ip_ndx_ioctl_table[index]; 9649 if (ipip->ipi_cmd == ioc_cmd) { 9650 /* Found a match in the ndx table */ 9651 return (ipip); 9652 } 9653 } 9654 9655 /* Search the misc table */ 9656 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 9657 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 9658 if (ipip->ipi_cmd == ioc_cmd) 9659 /* Found a match in the misc table */ 9660 return (ipip); 9661 } 9662 9663 return (NULL); 9664 } 9665 9666 /* 9667 * Wrapper function for resuming deferred ioctl processing 9668 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 9669 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 9670 */ 9671 /* ARGSUSED */ 9672 void 9673 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 9674 void *dummy_arg) 9675 { 9676 ip_sioctl_copyin_setup(q, mp); 9677 } 9678 9679 /* 9680 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 9681 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 9682 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 9683 * We establish here the size of the block to be copied in. mi_copyin 9684 * arranges for this to happen, an processing continues in ip_wput with 9685 * an M_IOCDATA message. 9686 */ 9687 void 9688 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 9689 { 9690 int copyin_size; 9691 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9692 ip_ioctl_cmd_t *ipip; 9693 cred_t *cr; 9694 9695 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 9696 if (ipip == NULL) { 9697 /* 9698 * The ioctl is not one we understand or own. 9699 * Pass it along to be processed down stream, 9700 * if this is a module instance of IP, else nak 9701 * the ioctl. 
9702 */ 9703 if (q->q_next == NULL) { 9704 goto nak; 9705 } else { 9706 putnext(q, mp); 9707 return; 9708 } 9709 } 9710 9711 /* 9712 * If this is deferred, then we will do all the checks when we 9713 * come back. 9714 */ 9715 if ((iocp->ioc_cmd == SIOCGDSTINFO || 9716 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup()) { 9717 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 9718 return; 9719 } 9720 9721 /* 9722 * Only allow a very small subset of IP ioctls on this stream if 9723 * IP is a module and not a driver. Allowing ioctls to be processed 9724 * in this case may cause assert failures or data corruption. 9725 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 9726 * ioctls allowed on an IP module stream, after which this stream 9727 * normally becomes a multiplexor (at which time the stream head 9728 * will fail all ioctls). 9729 */ 9730 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 9731 if (ipip->ipi_flags & IPI_PASS_DOWN) { 9732 /* 9733 * Pass common Streams ioctls which the IP 9734 * module does not own or consume along to 9735 * be processed down stream. 9736 */ 9737 putnext(q, mp); 9738 return; 9739 } else { 9740 goto nak; 9741 } 9742 } 9743 9744 /* Make sure we have ioctl data to process. */ 9745 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 9746 goto nak; 9747 9748 /* 9749 * Prefer dblk credential over ioctl credential; some synthesized 9750 * ioctls have kcred set because there's no way to crhold() 9751 * a credential in some contexts. (ioc_cr is not crfree() by 9752 * the framework; the caller of ioctl needs to hold the reference 9753 * for the duration of the call). 
9754 */ 9755 cr = DB_CREDDEF(mp, iocp->ioc_cr); 9756 9757 /* Make sure normal users don't send down privileged ioctls */ 9758 if ((ipip->ipi_flags & IPI_PRIV) && 9759 (cr != NULL) && secpolicy_net_config(cr, B_TRUE) != 0) { 9760 /* We checked the privilege earlier but log it here */ 9761 miocnak(q, mp, 0, secpolicy_net_config(cr, B_FALSE)); 9762 return; 9763 } 9764 9765 /* 9766 * The ioctl command tables can only encode fixed length 9767 * ioctl data. If the length is variable, the table will 9768 * encode the length as zero. Such special cases are handled 9769 * below in the switch. 9770 */ 9771 if (ipip->ipi_copyin_size != 0) { 9772 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 9773 return; 9774 } 9775 9776 switch (iocp->ioc_cmd) { 9777 case O_SIOCGIFCONF: 9778 case SIOCGIFCONF: 9779 /* 9780 * This IOCTL is hilarious. See comments in 9781 * ip_sioctl_get_ifconf for the story. 9782 */ 9783 if (iocp->ioc_count == TRANSPARENT) 9784 copyin_size = SIZEOF_STRUCT(ifconf, 9785 iocp->ioc_flag); 9786 else 9787 copyin_size = iocp->ioc_count; 9788 mi_copyin(q, mp, NULL, copyin_size); 9789 return; 9790 9791 case O_SIOCGLIFCONF: 9792 case SIOCGLIFCONF: 9793 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 9794 mi_copyin(q, mp, NULL, copyin_size); 9795 return; 9796 9797 case SIOCGLIFSRCOF: 9798 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 9799 mi_copyin(q, mp, NULL, copyin_size); 9800 return; 9801 case SIOCGIP6ADDRPOLICY: 9802 ip_sioctl_ip6addrpolicy(q, mp); 9803 ip6_asp_table_refrele(); 9804 return; 9805 9806 case SIOCSIP6ADDRPOLICY: 9807 ip_sioctl_ip6addrpolicy(q, mp); 9808 return; 9809 9810 case SIOCGDSTINFO: 9811 ip_sioctl_dstinfo(q, mp); 9812 ip6_asp_table_refrele(); 9813 return; 9814 9815 case I_PLINK: 9816 case I_PUNLINK: 9817 case I_LINK: 9818 case I_UNLINK: 9819 /* 9820 * We treat non-persistent link similarly as the persistent 9821 * link case, in terms of plumbing/unplumbing, as well as 9822 * dynamic re-plumbing events indicator. 
See comments 9823 * in ip_sioctl_plink() for more. 9824 * 9825 * Request can be enqueued in the 'ipsq' while waiting 9826 * to become exclusive. So bump up the conn ref. 9827 */ 9828 if (CONN_Q(q)) 9829 CONN_INC_REF(Q_TO_CONN(q)); 9830 ip_sioctl_plink(NULL, q, mp, NULL); 9831 return; 9832 9833 case ND_GET: 9834 case ND_SET: 9835 /* 9836 * Use of the nd table requires holding the reader lock. 9837 * Modifying the nd table thru nd_load/nd_unload requires 9838 * the writer lock. 9839 */ 9840 rw_enter(&ip_g_nd_lock, RW_READER); 9841 if (nd_getset(q, ip_g_nd, mp)) { 9842 rw_exit(&ip_g_nd_lock); 9843 9844 if (iocp->ioc_error) 9845 iocp->ioc_count = 0; 9846 mp->b_datap->db_type = M_IOCACK; 9847 qreply(q, mp); 9848 return; 9849 } 9850 rw_exit(&ip_g_nd_lock); 9851 /* 9852 * We don't understand this subioctl of ND_GET / ND_SET. 9853 * Maybe intended for some driver / module below us 9854 */ 9855 if (q->q_next) { 9856 putnext(q, mp); 9857 } else { 9858 iocp->ioc_error = ENOENT; 9859 mp->b_datap->db_type = M_IOCNAK; 9860 iocp->ioc_count = 0; 9861 qreply(q, mp); 9862 } 9863 return; 9864 9865 case IP_IOCTL: 9866 ip_wput_ioctl(q, mp); 9867 return; 9868 default: 9869 cmn_err(CE_PANIC, "should not happen "); 9870 } 9871 nak: 9872 if (mp->b_cont != NULL) { 9873 freemsg(mp->b_cont); 9874 mp->b_cont = NULL; 9875 } 9876 iocp->ioc_error = EINVAL; 9877 mp->b_datap->db_type = M_IOCNAK; 9878 iocp->ioc_count = 0; 9879 qreply(q, mp); 9880 } 9881 9882 /* ip_wput hands off ARP IOCTL responses to us */ 9883 void 9884 ip_sioctl_iocack(queue_t *q, mblk_t *mp) 9885 { 9886 struct arpreq *ar; 9887 struct xarpreq *xar; 9888 area_t *area; 9889 mblk_t *area_mp; 9890 struct iocblk *iocp; 9891 mblk_t *orig_ioc_mp, *tmp; 9892 struct iocblk *orig_iocp; 9893 ill_t *ill; 9894 conn_t *connp = NULL; 9895 uint_t ioc_id; 9896 mblk_t *pending_mp; 9897 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 9898 int *flagsp; 9899 char *storage = NULL; 9900 sin_t *sin; 9901 ipaddr_t addr; 9902 int err; 9903 9904 ill = 
q->q_ptr; 9905 ASSERT(ill != NULL); 9906 9907 /* 9908 * We should get back from ARP a packet chain that looks like: 9909 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9910 */ 9911 if (!(area_mp = mp->b_cont) || 9912 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 9913 !(orig_ioc_mp = area_mp->b_cont) || 9914 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 9915 freemsg(mp); 9916 return; 9917 } 9918 9919 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 9920 9921 tmp = (orig_ioc_mp->b_cont)->b_cont; 9922 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 9923 (orig_iocp->ioc_cmd == SIOCSXARP) || 9924 (orig_iocp->ioc_cmd == SIOCDXARP)) { 9925 x_arp_ioctl = B_TRUE; 9926 xar = (struct xarpreq *)tmp->b_rptr; 9927 sin = (sin_t *)&xar->xarp_pa; 9928 flagsp = &xar->xarp_flags; 9929 storage = xar->xarp_ha.sdl_data; 9930 if (xar->xarp_ha.sdl_nlen != 0) 9931 ifx_arp_ioctl = B_TRUE; 9932 } else { 9933 ar = (struct arpreq *)tmp->b_rptr; 9934 sin = (sin_t *)&ar->arp_pa; 9935 flagsp = &ar->arp_flags; 9936 storage = ar->arp_ha.sa_data; 9937 } 9938 9939 iocp = (struct iocblk *)mp->b_rptr; 9940 9941 /* 9942 * Pick out the originating queue based on the ioc_id. 9943 */ 9944 ioc_id = iocp->ioc_id; 9945 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); 9946 if (pending_mp == NULL) { 9947 ASSERT(connp == NULL); 9948 inet_freemsg(mp); 9949 return; 9950 } 9951 ASSERT(connp != NULL); 9952 q = CONNP_TO_WQ(connp); 9953 9954 /* Uncouple the internally generated IOCTL from the original one */ 9955 area = (area_t *)area_mp->b_rptr; 9956 area_mp->b_cont = NULL; 9957 9958 /* 9959 * Restore the b_next and b_prev used by mi code. This is needed 9960 * to complete the ioctl using mi* functions. We stored them in 9961 * the pending mp prior to sending the request to ARP. 
9962 */ 9963 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 9964 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 9965 inet_freemsg(pending_mp); 9966 9967 /* 9968 * We're done if there was an error or if this is not an SIOCG{X}ARP 9969 * Catch the case where there is an IRE_CACHE by no entry in the 9970 * arp table. 9971 */ 9972 addr = sin->sin_addr.s_addr; 9973 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 9974 ire_t *ire; 9975 dl_unitdata_req_t *dlup; 9976 mblk_t *llmp; 9977 int addr_len; 9978 ill_t *ipsqill = NULL; 9979 9980 if (ifx_arp_ioctl) { 9981 /* 9982 * There's no need to lookup the ill, since 9983 * we've already done that when we started 9984 * processing the ioctl and sent the message 9985 * to ARP on that ill. So use the ill that 9986 * is stored in q->q_ptr. 9987 */ 9988 ipsqill = ill; 9989 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 9990 ipsqill->ill_ipif, ALL_ZONES, 9991 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL); 9992 } else { 9993 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 9994 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 9995 if (ire != NULL) 9996 ipsqill = ire_to_ill(ire); 9997 } 9998 9999 if ((x_arp_ioctl) && (ipsqill != NULL)) 10000 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10001 10002 if (ire != NULL) { 10003 /* 10004 * Since the ire obtained from cachetable is used for 10005 * mac addr copying below, treat an incomplete ire as if 10006 * as if we never found it. 10007 */ 10008 if (ire->ire_nce != NULL && 10009 ire->ire_nce->nce_state != ND_REACHABLE) { 10010 ire_refrele(ire); 10011 ire = NULL; 10012 ipsqill = NULL; 10013 goto errack; 10014 } 10015 *flagsp = ATF_INUSE; 10016 llmp = (ire->ire_nce != NULL ? 
10017 ire->ire_nce->nce_res_mp : NULL); 10018 if (llmp != NULL && ipsqill != NULL) { 10019 uchar_t *macaddr; 10020 10021 addr_len = ipsqill->ill_phys_addr_length; 10022 if (x_arp_ioctl && ((addr_len + 10023 ipsqill->ill_name_length) > 10024 sizeof (xar->xarp_ha.sdl_data))) { 10025 ire_refrele(ire); 10026 freemsg(mp); 10027 ip_ioctl_finish(q, orig_ioc_mp, 10028 EINVAL, NO_COPYOUT, NULL, NULL); 10029 return; 10030 } 10031 *flagsp |= ATF_COM; 10032 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10033 if (ipsqill->ill_sap_length < 0) 10034 macaddr = llmp->b_rptr + 10035 dlup->dl_dest_addr_offset; 10036 else 10037 macaddr = llmp->b_rptr + 10038 dlup->dl_dest_addr_offset + 10039 ipsqill->ill_sap_length; 10040 /* 10041 * For SIOCGARP, MAC address length 10042 * validation has already been done 10043 * before the ioctl was issued to ARP to 10044 * allow it to progress only on 6 byte 10045 * addressable (ethernet like) media. Thus 10046 * the mac address copying can not overwrite 10047 * the sa_data area below. 10048 */ 10049 bcopy(macaddr, storage, addr_len); 10050 } 10051 /* Ditch the internal IOCTL. */ 10052 freemsg(mp); 10053 ire_refrele(ire); 10054 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL); 10055 return; 10056 } 10057 } 10058 10059 /* 10060 * Delete the coresponding IRE_CACHE if any. 10061 * Reset the error if there was one (in case there was no entry 10062 * in arp.) 10063 */ 10064 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10065 ipif_t *ipintf = NULL; 10066 10067 if (ifx_arp_ioctl) { 10068 /* 10069 * There's no need to lookup the ill, since 10070 * we've already done that when we started 10071 * processing the ioctl and sent the message 10072 * to ARP on that ill. So use the ill that 10073 * is stored in q->q_ptr. 10074 */ 10075 ipintf = ill->ill_ipif; 10076 } 10077 if (ip_ire_clookup_and_delete(addr, ipintf)) { 10078 /* 10079 * The address in "addr" may be an entry for a 10080 * router. 
If that's true, then any off-net 10081 * IRE_CACHE entries that go through the router 10082 * with address "addr" must be clobbered. Use 10083 * ire_walk to achieve this goal. 10084 */ 10085 if (ifx_arp_ioctl) 10086 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10087 ire_delete_cache_gw, (char *)&addr, ill); 10088 else 10089 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10090 ALL_ZONES); 10091 iocp->ioc_error = 0; 10092 } 10093 } 10094 errack: 10095 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10096 err = iocp->ioc_error; 10097 freemsg(mp); 10098 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL, NULL); 10099 return; 10100 } 10101 10102 /* 10103 * Completion of an SIOCG{X}ARP. Translate the information from 10104 * the area_t into the struct {x}arpreq. 10105 */ 10106 if (x_arp_ioctl) { 10107 storage += ill_xarp_info(&xar->xarp_ha, ill); 10108 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10109 sizeof (xar->xarp_ha.sdl_data)) { 10110 freemsg(mp); 10111 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, 10112 NO_COPYOUT, NULL, NULL); 10113 return; 10114 } 10115 } 10116 *flagsp = ATF_INUSE; 10117 if (area->area_flags & ACE_F_PERMANENT) 10118 *flagsp |= ATF_PERM; 10119 if (area->area_flags & ACE_F_PUBLISH) 10120 *flagsp |= ATF_PUBL; 10121 if (area->area_hw_addr_length != 0) { 10122 *flagsp |= ATF_COM; 10123 /* 10124 * For SIOCGARP, MAC address length validation has 10125 * already been done before the ioctl was issued to ARP 10126 * to allow it to progress only on 6 byte addressable 10127 * (ethernet like) media. Thus the mac address copying 10128 * can not overwrite the sa_data area below. 10129 */ 10130 bcopy((char *)area + area->area_hw_addr_offset, 10131 storage, area->area_hw_addr_length); 10132 } 10133 10134 /* Ditch the internal IOCTL. */ 10135 freemsg(mp); 10136 /* Complete the original. */ 10137 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL); 10138 } 10139 10140 /* 10141 * Create a new logical interface. If ipif_id is zero (i.e. 
not a logical 10142 * interface) create the next available logical interface for this 10143 * physical interface. 10144 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10145 * ipif with the specified name. 10146 * 10147 * If the address family is not AF_UNSPEC then set the address as well. 10148 * 10149 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10150 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 10151 * 10152 * Executed as a writer on the ill or ill group. 10153 * So no lock is needed to traverse the ipif chain, or examine the 10154 * phyint flags. 10155 */ 10156 /* ARGSUSED */ 10157 int 10158 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10159 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10160 { 10161 mblk_t *mp1; 10162 struct lifreq *lifr; 10163 boolean_t isv6; 10164 boolean_t exists; 10165 char *name; 10166 char *endp; 10167 char *cp; 10168 int namelen; 10169 ipif_t *ipif; 10170 long id; 10171 ipsq_t *ipsq; 10172 ill_t *ill; 10173 sin_t *sin; 10174 int err = 0; 10175 boolean_t found_sep = B_FALSE; 10176 conn_t *connp; 10177 zoneid_t zoneid; 10178 int orig_ifindex = 0; 10179 10180 ip1dbg(("ip_sioctl_addif\n")); 10181 /* Existence of mp1 has been checked in ip_wput_nondata */ 10182 mp1 = mp->b_cont->b_cont; 10183 /* 10184 * Null terminate the string to protect against buffer 10185 * overrun. String was generated by user code and may not 10186 * be trusted. 
10187 */ 10188 lifr = (struct lifreq *)mp1->b_rptr; 10189 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10190 name = lifr->lifr_name; 10191 ASSERT(CONN_Q(q)); 10192 connp = Q_TO_CONN(q); 10193 isv6 = connp->conn_af_isv6; 10194 zoneid = connp->conn_zoneid; 10195 namelen = mi_strlen(name); 10196 if (namelen == 0) 10197 return (EINVAL); 10198 10199 exists = B_FALSE; 10200 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10201 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10202 /* 10203 * Allow creating lo0 using SIOCLIFADDIF. 10204 * can't be any other writer thread. So can pass null below 10205 * for the last 4 args to ipif_lookup_name. 10206 */ 10207 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, 10208 B_TRUE, &exists, isv6, zoneid, NULL, NULL, NULL, NULL); 10209 /* Prevent any further action */ 10210 if (ipif == NULL) { 10211 return (ENOBUFS); 10212 } else if (!exists) { 10213 /* We created the ipif now and as writer */ 10214 ipif_refrele(ipif); 10215 return (0); 10216 } else { 10217 ill = ipif->ipif_ill; 10218 ill_refhold(ill); 10219 ipif_refrele(ipif); 10220 } 10221 } else { 10222 /* Look for a colon in the name. */ 10223 endp = &name[namelen]; 10224 for (cp = endp; --cp > name; ) { 10225 if (*cp == IPIF_SEPARATOR_CHAR) { 10226 found_sep = B_TRUE; 10227 /* 10228 * Reject any non-decimal aliases for plumbing 10229 * of logical interfaces. Aliases with leading 10230 * zeroes are also rejected as they introduce 10231 * ambiguity in the naming of the interfaces. 10232 * Comparing with "0" takes care of all such 10233 * cases. 
10234 */ 10235 if ((strncmp("0", cp+1, 1)) == 0) 10236 return (EINVAL); 10237 10238 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10239 id <= 0 || *endp != '\0') { 10240 return (EINVAL); 10241 } 10242 *cp = '\0'; 10243 break; 10244 } 10245 } 10246 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10247 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL); 10248 if (found_sep) 10249 *cp = IPIF_SEPARATOR_CHAR; 10250 if (ill == NULL) 10251 return (err); 10252 } 10253 10254 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10255 B_TRUE); 10256 10257 /* 10258 * Release the refhold due to the lookup, now that we are excl 10259 * or we are just returning 10260 */ 10261 ill_refrele(ill); 10262 10263 if (ipsq == NULL) 10264 return (EINPROGRESS); 10265 10266 /* 10267 * If the interface is failed, inactive or offlined, look for a working 10268 * interface in the ill group and create the ipif there. If we can't 10269 * find a good interface, create the ipif anyway so that in.mpathd can 10270 * move it to the first repaired interface. 10271 */ 10272 if ((ill->ill_phyint->phyint_flags & 10273 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10274 ill->ill_phyint->phyint_groupname_len != 0) { 10275 phyint_t *phyi; 10276 char *groupname = ill->ill_phyint->phyint_groupname; 10277 10278 /* 10279 * We're looking for a working interface, but it doesn't matter 10280 * if it's up or down; so instead of following the group lists, 10281 * we look at each physical interface and compare the groupname. 10282 * We're only interested in interfaces with IPv4 (resp. IPv6) 10283 * plumbed when we're adding an IPv4 (resp. IPv6) ipif. 10284 * Otherwise we create the ipif on the failed interface. 
10285 */ 10286 rw_enter(&ill_g_lock, RW_READER); 10287 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 10288 for (; phyi != NULL; 10289 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 10290 phyi, AVL_AFTER)) { 10291 if (phyi->phyint_groupname_len == 0) 10292 continue; 10293 ASSERT(phyi->phyint_groupname != NULL); 10294 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10295 !(phyi->phyint_flags & 10296 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10297 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10298 (phyi->phyint_illv4 != NULL))) { 10299 break; 10300 } 10301 } 10302 rw_exit(&ill_g_lock); 10303 10304 if (phyi != NULL) { 10305 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10306 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10307 phyi->phyint_illv4); 10308 } 10309 } 10310 10311 /* 10312 * We are now exclusive on the ipsq, so an ill move will be serialized 10313 * before or after us. 10314 */ 10315 ASSERT(IAM_WRITER_ILL(ill)); 10316 ASSERT(ill->ill_move_in_progress == B_FALSE); 10317 10318 if (found_sep && orig_ifindex == 0) { 10319 /* Now see if there is an IPIF with this unit number. */ 10320 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 10321 if (ipif->ipif_id == id) { 10322 err = EEXIST; 10323 goto done; 10324 } 10325 } 10326 } 10327 10328 /* 10329 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10330 * of lo0. We never come here when we plumb lo0:0. It 10331 * happens in ipif_lookup_on_name. 10332 * The specified unit number is ignored when we create the ipif on a 10333 * different interface. However, we save it in ipif_orig_ipifid below so 10334 * that the ipif fails back to the right position. 10335 */ 10336 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 
10337 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10338 err = ENOBUFS; 10339 goto done; 10340 } 10341 10342 /* Return created name with ioctl */ 10343 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10344 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10345 ip1dbg(("created %s\n", lifr->lifr_name)); 10346 10347 /* Set address */ 10348 sin = (sin_t *)&lifr->lifr_addr; 10349 if (sin->sin_family != AF_UNSPEC) { 10350 err = ip_sioctl_addr(ipif, sin, q, mp, 10351 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10352 } 10353 10354 /* Set ifindex and unit number for failback */ 10355 if (err == 0 && orig_ifindex != 0) { 10356 ipif->ipif_orig_ifindex = orig_ifindex; 10357 if (found_sep) { 10358 ipif->ipif_orig_ipifid = id; 10359 } 10360 } 10361 10362 done: 10363 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10364 return (err); 10365 } 10366 10367 /* 10368 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10369 * interface) delete it based on the IP address (on this physical interface). 10370 * Otherwise delete it based on the ipif_id. 10371 * Also, special handling to allow a removeif of lo0. 10372 */ 10373 /* ARGSUSED */ 10374 int 10375 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10376 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10377 { 10378 conn_t *connp; 10379 ill_t *ill = ipif->ipif_ill; 10380 boolean_t success; 10381 10382 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10383 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10384 ASSERT(IAM_WRITER_IPIF(ipif)); 10385 10386 connp = Q_TO_CONN(q); 10387 /* 10388 * Special case for unplumbing lo0 (the loopback physical interface). 10389 * If unplumbing lo0, the incoming address structure has been 10390 * initialized to all zeros. When unplumbing lo0, all its logical 10391 * interfaces must be removed too. 10392 * 10393 * Note that this interface may be called to remove a specific 10394 * loopback logical interface (eg, lo0:1). 
But in that case 10395 * ipif->ipif_id != 0 so that the code path for that case is the 10396 * same as any other interface (meaning it skips the code directly 10397 * below). 10398 */ 10399 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10400 if (sin->sin_family == AF_UNSPEC && 10401 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10402 /* 10403 * Mark it condemned. No new ref. will be made to ill. 10404 */ 10405 mutex_enter(&ill->ill_lock); 10406 ill->ill_state_flags |= ILL_CONDEMNED; 10407 for (ipif = ill->ill_ipif; ipif != NULL; 10408 ipif = ipif->ipif_next) { 10409 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10410 } 10411 mutex_exit(&ill->ill_lock); 10412 10413 ipif = ill->ill_ipif; 10414 /* unplumb the loopback interface */ 10415 ill_delete(ill); 10416 mutex_enter(&connp->conn_lock); 10417 mutex_enter(&ill->ill_lock); 10418 ASSERT(ill->ill_group == NULL); 10419 10420 /* Are any references to this ill active */ 10421 if (ill_is_quiescent(ill)) { 10422 mutex_exit(&ill->ill_lock); 10423 mutex_exit(&connp->conn_lock); 10424 ill_delete_tail(ill); 10425 mi_free(ill); 10426 return (0); 10427 } 10428 success = ipsq_pending_mp_add(connp, ipif, 10429 CONNP_TO_WQ(connp), mp, ILL_FREE); 10430 mutex_exit(&connp->conn_lock); 10431 mutex_exit(&ill->ill_lock); 10432 if (success) 10433 return (EINPROGRESS); 10434 else 10435 return (EINTR); 10436 } 10437 } 10438 10439 /* 10440 * We are exclusive on the ipsq, so an ill move will be serialized 10441 * before or after us. 
10442 */ 10443 ASSERT(ill->ill_move_in_progress == B_FALSE); 10444 10445 if (ipif->ipif_id == 0) { 10446 /* Find based on address */ 10447 if (ipif->ipif_isv6) { 10448 sin6_t *sin6; 10449 10450 if (sin->sin_family != AF_INET6) 10451 return (EAFNOSUPPORT); 10452 10453 sin6 = (sin6_t *)sin; 10454 /* We are a writer, so we should be able to lookup */ 10455 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10456 ill, ALL_ZONES, NULL, NULL, NULL, NULL); 10457 if (ipif == NULL) { 10458 /* 10459 * Maybe the address in on another interface in 10460 * the same IPMP group? We check this below. 10461 */ 10462 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10463 NULL, ALL_ZONES, NULL, NULL, NULL, NULL); 10464 } 10465 } else { 10466 ipaddr_t addr; 10467 10468 if (sin->sin_family != AF_INET) 10469 return (EAFNOSUPPORT); 10470 10471 addr = sin->sin_addr.s_addr; 10472 /* We are a writer, so we should be able to lookup */ 10473 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, 10474 NULL, NULL, NULL); 10475 if (ipif == NULL) { 10476 /* 10477 * Maybe the address in on another interface in 10478 * the same IPMP group? We check this below. 10479 */ 10480 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, 10481 NULL, NULL, NULL, NULL); 10482 } 10483 } 10484 if (ipif == NULL) { 10485 return (EADDRNOTAVAIL); 10486 } 10487 /* 10488 * When the address to be removed is hosted on a different 10489 * interface, we check if the interface is in the same IPMP 10490 * group as the specified one; if so we proceed with the 10491 * removal. 10492 * ill->ill_group is NULL when the ill is down, so we have to 10493 * compare the group names instead. 
10494 */ 10495 if (ipif->ipif_ill != ill && 10496 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || 10497 ill->ill_phyint->phyint_groupname_len == 0 || 10498 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, 10499 ill->ill_phyint->phyint_groupname) != 0)) { 10500 ipif_refrele(ipif); 10501 return (EADDRNOTAVAIL); 10502 } 10503 10504 /* This is a writer */ 10505 ipif_refrele(ipif); 10506 } 10507 10508 /* 10509 * Can not delete instance zero since it is tied to the ill. 10510 */ 10511 if (ipif->ipif_id == 0) 10512 return (EBUSY); 10513 10514 mutex_enter(&ill->ill_lock); 10515 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10516 mutex_exit(&ill->ill_lock); 10517 10518 ipif_free(ipif); 10519 10520 mutex_enter(&connp->conn_lock); 10521 mutex_enter(&ill->ill_lock); 10522 10523 /* Are any references to this ipif active */ 10524 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { 10525 mutex_exit(&ill->ill_lock); 10526 mutex_exit(&connp->conn_lock); 10527 ipif_down_tail(ipif); 10528 ipif_free_tail(ipif); 10529 return (0); 10530 } 10531 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10532 IPIF_FREE); 10533 mutex_exit(&ill->ill_lock); 10534 mutex_exit(&connp->conn_lock); 10535 if (success) 10536 return (EINPROGRESS); 10537 else 10538 return (EINTR); 10539 } 10540 10541 /* 10542 * Restart the removeif ioctl. The refcnt has gone down to 0. 10543 * The ipif is already condemned. So can't find it thru lookups. 
10544 */ 10545 /* ARGSUSED */ 10546 int 10547 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10548 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10549 { 10550 ill_t *ill; 10551 10552 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10553 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10554 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10555 ill = ipif->ipif_ill; 10556 ASSERT(IAM_WRITER_ILL(ill)); 10557 ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) && 10558 (ill->ill_state_flags & IPIF_CONDEMNED)); 10559 ill_delete_tail(ill); 10560 mi_free(ill); 10561 return (0); 10562 } 10563 10564 ill = ipif->ipif_ill; 10565 ASSERT(IAM_WRITER_IPIF(ipif)); 10566 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10567 10568 ipif_down_tail(ipif); 10569 ipif_free_tail(ipif); 10570 10571 ILL_UNMARK_CHANGING(ill); 10572 return (0); 10573 } 10574 10575 /* 10576 * Set the local interface address. 10577 * Allow an address of all zero when the interface is down. 10578 */ 10579 /* ARGSUSED */ 10580 int 10581 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10582 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10583 { 10584 int err = 0; 10585 in6_addr_t v6addr; 10586 boolean_t need_up = B_FALSE; 10587 10588 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 10589 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10590 10591 ASSERT(IAM_WRITER_IPIF(ipif)); 10592 10593 if (ipif->ipif_isv6) { 10594 sin6_t *sin6; 10595 ill_t *ill; 10596 phyint_t *phyi; 10597 10598 if (sin->sin_family != AF_INET6) 10599 return (EAFNOSUPPORT); 10600 10601 sin6 = (sin6_t *)sin; 10602 v6addr = sin6->sin6_addr; 10603 ill = ipif->ipif_ill; 10604 phyi = ill->ill_phyint; 10605 10606 /* 10607 * Enforce that true multicast interfaces have a link-local 10608 * address for logical unit 0. 
10609 */ 10610 if (ipif->ipif_id == 0 && 10611 (ill->ill_flags & ILLF_MULTICAST) && 10612 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 10613 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 10614 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 10615 return (EADDRNOTAVAIL); 10616 } 10617 10618 /* 10619 * up interfaces shouldn't have the unspecified address 10620 * unless they also have the IPIF_NOLOCAL flags set and 10621 * have a subnet assigned. 10622 */ 10623 if ((ipif->ipif_flags & IPIF_UP) && 10624 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 10625 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 10626 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 10627 return (EADDRNOTAVAIL); 10628 } 10629 10630 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10631 return (EADDRNOTAVAIL); 10632 } else { 10633 ipaddr_t addr; 10634 10635 if (sin->sin_family != AF_INET) 10636 return (EAFNOSUPPORT); 10637 10638 addr = sin->sin_addr.s_addr; 10639 10640 /* Allow 0 as the local address. */ 10641 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10642 return (EADDRNOTAVAIL); 10643 10644 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10645 } 10646 10647 10648 /* 10649 * Even if there is no change we redo things just to rerun 10650 * ipif_set_default. 10651 */ 10652 if (ipif->ipif_flags & IPIF_UP) { 10653 /* 10654 * Setting a new local address, make sure 10655 * we have net and subnet bcast ire's for 10656 * the old address if we need them. 10657 */ 10658 if (!ipif->ipif_isv6) 10659 ipif_check_bcast_ires(ipif); 10660 /* 10661 * If the interface is already marked up, 10662 * we call ipif_down which will take care 10663 * of ditching any IREs that have been set 10664 * up based on the old interface address. 
10665 */ 10666 err = ipif_logical_down(ipif, q, mp); 10667 if (err == EINPROGRESS) 10668 return (err); 10669 ipif_down_tail(ipif); 10670 need_up = 1; 10671 } 10672 10673 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 10674 return (err); 10675 } 10676 10677 int 10678 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10679 boolean_t need_up) 10680 { 10681 in6_addr_t v6addr; 10682 ipaddr_t addr; 10683 sin6_t *sin6; 10684 int err = 0; 10685 10686 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 10687 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10688 ASSERT(IAM_WRITER_IPIF(ipif)); 10689 if (ipif->ipif_isv6) { 10690 sin6 = (sin6_t *)sin; 10691 v6addr = sin6->sin6_addr; 10692 } else { 10693 addr = sin->sin_addr.s_addr; 10694 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10695 } 10696 mutex_enter(&ipif->ipif_ill->ill_lock); 10697 ipif->ipif_v6lcl_addr = v6addr; 10698 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 10699 ipif->ipif_v6src_addr = ipv6_all_zeros; 10700 } else { 10701 ipif->ipif_v6src_addr = v6addr; 10702 } 10703 10704 if ((ipif->ipif_isv6) && IN6_IS_ADDR_6TO4(&v6addr) && 10705 (!ipif->ipif_ill->ill_is_6to4tun)) { 10706 queue_t *wqp = ipif->ipif_ill->ill_wq; 10707 10708 /* 10709 * The local address of this interface is a 6to4 address, 10710 * check if this interface is in fact a 6to4 tunnel or just 10711 * an interface configured with a 6to4 address. We are only 10712 * interested in the former. 
10713 */ 10714 if (wqp != NULL) { 10715 while ((wqp->q_next != NULL) && 10716 (wqp->q_next->q_qinfo != NULL) && 10717 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 10718 10719 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 10720 == TUN6TO4_MODID) { 10721 /* set for use in IP */ 10722 ipif->ipif_ill->ill_is_6to4tun = 1; 10723 break; 10724 } 10725 wqp = wqp->q_next; 10726 } 10727 } 10728 } 10729 10730 ipif_set_default(ipif); 10731 mutex_exit(&ipif->ipif_ill->ill_lock); 10732 10733 if (need_up) { 10734 /* 10735 * Now bring the interface back up. If this 10736 * is the only IPIF for the ILL, ipif_up 10737 * will have to re-bind to the device, so 10738 * we may get back EINPROGRESS, in which 10739 * case, this IOCTL will get completed in 10740 * ip_rput_dlpi when we see the DL_BIND_ACK. 10741 */ 10742 err = ipif_up(ipif, q, mp); 10743 } else { 10744 /* 10745 * Update the IPIF list in SCTP, ipif_up_done() will do it 10746 * if need_up is true. 10747 */ 10748 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10749 } 10750 10751 return (err); 10752 } 10753 10754 10755 /* 10756 * Restart entry point to restart the address set operation after the 10757 * refcounts have dropped to zero. 
10758 */ 10759 /* ARGSUSED */ 10760 int 10761 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10762 ip_ioctl_cmd_t *ipip, void *ifreq) 10763 { 10764 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 10765 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10766 ASSERT(IAM_WRITER_IPIF(ipif)); 10767 ipif_down_tail(ipif); 10768 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 10769 } 10770 10771 /* ARGSUSED */ 10772 int 10773 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10774 ip_ioctl_cmd_t *ipip, void *if_req) 10775 { 10776 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 10777 struct lifreq *lifr = (struct lifreq *)if_req; 10778 10779 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 10780 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10781 /* 10782 * The net mask and address can't change since we have a 10783 * reference to the ipif. So no lock is necessary. 10784 */ 10785 if (ipif->ipif_isv6) { 10786 *sin6 = sin6_null; 10787 sin6->sin6_family = AF_INET6; 10788 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 10789 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10790 lifr->lifr_addrlen = 10791 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10792 } else { 10793 *sin = sin_null; 10794 sin->sin_family = AF_INET; 10795 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 10796 if (ipip->ipi_cmd_type == LIF_CMD) { 10797 lifr->lifr_addrlen = 10798 ip_mask_to_plen(ipif->ipif_net_mask); 10799 } 10800 } 10801 return (0); 10802 } 10803 10804 /* 10805 * Set the destination address for a pt-pt interface. 
10806 */ 10807 /* ARGSUSED */ 10808 int 10809 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10810 ip_ioctl_cmd_t *ipip, void *if_req) 10811 { 10812 int err = 0; 10813 in6_addr_t v6addr; 10814 boolean_t need_up = B_FALSE; 10815 10816 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 10817 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10818 ASSERT(IAM_WRITER_IPIF(ipif)); 10819 10820 if (ipif->ipif_isv6) { 10821 sin6_t *sin6; 10822 10823 if (sin->sin_family != AF_INET6) 10824 return (EAFNOSUPPORT); 10825 10826 sin6 = (sin6_t *)sin; 10827 v6addr = sin6->sin6_addr; 10828 10829 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10830 return (EADDRNOTAVAIL); 10831 } else { 10832 ipaddr_t addr; 10833 10834 if (sin->sin_family != AF_INET) 10835 return (EAFNOSUPPORT); 10836 10837 addr = sin->sin_addr.s_addr; 10838 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10839 return (EADDRNOTAVAIL); 10840 10841 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10842 } 10843 10844 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 10845 return (0); /* No change */ 10846 10847 if (ipif->ipif_flags & IPIF_UP) { 10848 /* 10849 * If the interface is already marked up, 10850 * we call ipif_down which will take care 10851 * of ditching any IREs that have been set 10852 * up based on the old pp dst address. 10853 */ 10854 err = ipif_logical_down(ipif, q, mp); 10855 if (err == EINPROGRESS) 10856 return (err); 10857 ipif_down_tail(ipif); 10858 need_up = B_TRUE; 10859 } 10860 /* 10861 * could return EINPROGRESS. 
If so ioctl will complete in 10862 * ip_rput_dlpi_writer 10863 */ 10864 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 10865 return (err); 10866 } 10867 10868 static int 10869 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10870 boolean_t need_up) 10871 { 10872 in6_addr_t v6addr; 10873 ill_t *ill = ipif->ipif_ill; 10874 int err = 0; 10875 10876 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", 10877 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10878 if (ipif->ipif_isv6) { 10879 sin6_t *sin6; 10880 10881 sin6 = (sin6_t *)sin; 10882 v6addr = sin6->sin6_addr; 10883 } else { 10884 ipaddr_t addr; 10885 10886 addr = sin->sin_addr.s_addr; 10887 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10888 } 10889 mutex_enter(&ill->ill_lock); 10890 /* Set point to point destination address. */ 10891 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10892 /* 10893 * Allow this as a means of creating logical 10894 * pt-pt interfaces on top of e.g. an Ethernet. 10895 * XXX Undocumented HACK for testing. 10896 * pt-pt interfaces are created with NUD disabled. 10897 */ 10898 ipif->ipif_flags |= IPIF_POINTOPOINT; 10899 ipif->ipif_flags &= ~IPIF_BROADCAST; 10900 if (ipif->ipif_isv6) 10901 ipif->ipif_ill->ill_flags |= ILLF_NONUD; 10902 } 10903 10904 /* Set the new address. */ 10905 ipif->ipif_v6pp_dst_addr = v6addr; 10906 /* Make sure subnet tracks pp_dst */ 10907 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 10908 mutex_exit(&ill->ill_lock); 10909 10910 if (need_up) { 10911 /* 10912 * Now bring the interface back up. If this 10913 * is the only IPIF for the ILL, ipif_up 10914 * will have to re-bind to the device, so 10915 * we may get back EINPROGRESS, in which 10916 * case, this IOCTL will get completed in 10917 * ip_rput_dlpi when we see the DL_BIND_ACK. 
10918 */ 10919 err = ipif_up(ipif, q, mp); 10920 } 10921 return (err); 10922 } 10923 10924 /* 10925 * Restart entry point to restart the dstaddress set operation after the 10926 * refcounts have dropped to zero. 10927 */ 10928 /* ARGSUSED */ 10929 int 10930 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10931 ip_ioctl_cmd_t *ipip, void *ifreq) 10932 { 10933 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 10934 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10935 ipif_down_tail(ipif); 10936 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 10937 } 10938 10939 /* ARGSUSED */ 10940 int 10941 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10942 ip_ioctl_cmd_t *ipip, void *if_req) 10943 { 10944 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 10945 10946 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 10947 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10948 /* 10949 * Get point to point destination address. The addresses can't 10950 * change since we hold a reference to the ipif. 10951 */ 10952 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 10953 return (EADDRNOTAVAIL); 10954 10955 if (ipif->ipif_isv6) { 10956 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10957 *sin6 = sin6_null; 10958 sin6->sin6_family = AF_INET6; 10959 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 10960 } else { 10961 *sin = sin_null; 10962 sin->sin_family = AF_INET; 10963 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 10964 } 10965 return (0); 10966 } 10967 10968 /* 10969 * part of ipmp, make this func return the active/inactive state and 10970 * caller can set once atomically instead of multiple mutex_enter/mutex_exit 10971 */ 10972 /* 10973 * This function either sets or clears the IFF_INACTIVE flag. 
10974 * 10975 * As long as there are some addresses or multicast memberships on the 10976 * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we 10977 * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface 10978 * will be used for outbound packets. 10979 * 10980 * Caller needs to verify the validity of setting IFF_INACTIVE. 10981 */ 10982 static void 10983 phyint_inactive(phyint_t *phyi) 10984 { 10985 ill_t *ill_v4; 10986 ill_t *ill_v6; 10987 ipif_t *ipif; 10988 ilm_t *ilm; 10989 10990 ill_v4 = phyi->phyint_illv4; 10991 ill_v6 = phyi->phyint_illv6; 10992 10993 /* 10994 * No need for a lock while traversing the list since iam 10995 * a writer 10996 */ 10997 if (ill_v4 != NULL) { 10998 ASSERT(IAM_WRITER_ILL(ill_v4)); 10999 for (ipif = ill_v4->ill_ipif; ipif != NULL; 11000 ipif = ipif->ipif_next) { 11001 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11002 mutex_enter(&phyi->phyint_lock); 11003 phyi->phyint_flags &= ~PHYI_INACTIVE; 11004 mutex_exit(&phyi->phyint_lock); 11005 return; 11006 } 11007 } 11008 for (ilm = ill_v4->ill_ilm; ilm != NULL; 11009 ilm = ilm->ilm_next) { 11010 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11011 mutex_enter(&phyi->phyint_lock); 11012 phyi->phyint_flags &= ~PHYI_INACTIVE; 11013 mutex_exit(&phyi->phyint_lock); 11014 return; 11015 } 11016 } 11017 } 11018 if (ill_v6 != NULL) { 11019 ill_v6 = phyi->phyint_illv6; 11020 for (ipif = ill_v6->ill_ipif; ipif != NULL; 11021 ipif = ipif->ipif_next) { 11022 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11023 mutex_enter(&phyi->phyint_lock); 11024 phyi->phyint_flags &= ~PHYI_INACTIVE; 11025 mutex_exit(&phyi->phyint_lock); 11026 return; 11027 } 11028 } 11029 for (ilm = ill_v6->ill_ilm; ilm != NULL; 11030 ilm = ilm->ilm_next) { 11031 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11032 mutex_enter(&phyi->phyint_lock); 11033 phyi->phyint_flags &= ~PHYI_INACTIVE; 11034 mutex_exit(&phyi->phyint_lock); 11035 return; 11036 } 11037 } 11038 } 
11039 mutex_enter(&phyi->phyint_lock); 11040 phyi->phyint_flags |= PHYI_INACTIVE; 11041 mutex_exit(&phyi->phyint_lock); 11042 } 11043 11044 /* 11045 * This function is called only when the phyint flags change. Currently 11046 * called from ip_sioctl_flags. We re-do the broadcast nomination so 11047 * that we can select a good ill. 11048 */ 11049 static void 11050 ip_redo_nomination(phyint_t *phyi) 11051 { 11052 ill_t *ill_v4; 11053 11054 ill_v4 = phyi->phyint_illv4; 11055 11056 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 11057 ASSERT(IAM_WRITER_ILL(ill_v4)); 11058 if (ill_v4->ill_group->illgrp_ill_count > 1) 11059 ill_nominate_bcast_rcv(ill_v4->ill_group); 11060 } 11061 } 11062 11063 /* 11064 * Heuristic to check if ill is INACTIVE. 11065 * Checks if ill has an ipif with an usable ip address. 11066 * 11067 * Return values: 11068 * B_TRUE - ill is INACTIVE; has no usable ipif 11069 * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif 11070 */ 11071 static boolean_t 11072 ill_is_inactive(ill_t *ill) 11073 { 11074 ipif_t *ipif; 11075 11076 /* Check whether it is in an IPMP group */ 11077 if (ill->ill_phyint->phyint_groupname == NULL) 11078 return (B_FALSE); 11079 11080 if (ill->ill_ipif_up_count == 0) 11081 return (B_TRUE); 11082 11083 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11084 uint64_t flags = ipif->ipif_flags; 11085 11086 /* 11087 * This ipif is usable if it is IPIF_UP and not a 11088 * dedicated test address. A dedicated test address 11089 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED 11090 * (note in particular that V6 test addresses are 11091 * link-local data addresses and thus are marked 11092 * IPIF_NOFAILOVER but not IPIF_DEPRECATED). 11093 */ 11094 if ((flags & IPIF_UP) && 11095 ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != 11096 (IPIF_DEPRECATED|IPIF_NOFAILOVER))) 11097 return (B_FALSE); 11098 } 11099 return (B_TRUE); 11100 } 11101 11102 /* 11103 * Set interface flags. 
11104 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, 11105 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, 11106 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. 11107 * 11108 * NOTE : We really don't enforce that ipif_id zero should be used 11109 * for setting any flags other than IFF_LOGINT_FLAGS. This 11110 * is because applications generally does SICGLIFFLAGS and 11111 * ORs in the new flags (that affects the logical) and does a 11112 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 11113 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 11114 * flags that will be turned on is correct with respect to 11115 * ipif_id 0. For backward compatibility reasons, it is not done. 11116 */ 11117 /* ARGSUSED */ 11118 int 11119 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11120 ip_ioctl_cmd_t *ipip, void *if_req) 11121 { 11122 uint64_t turn_on; 11123 uint64_t turn_off; 11124 int err; 11125 boolean_t need_up = B_FALSE; 11126 phyint_t *phyi; 11127 ill_t *ill; 11128 uint64_t intf_flags; 11129 boolean_t phyint_flags_modified = B_FALSE; 11130 uint64_t flags; 11131 struct ifreq *ifr; 11132 struct lifreq *lifr; 11133 boolean_t set_linklocal = B_FALSE; 11134 boolean_t zero_source = B_FALSE; 11135 11136 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 11137 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11138 11139 ASSERT(IAM_WRITER_IPIF(ipif)); 11140 11141 ill = ipif->ipif_ill; 11142 phyi = ill->ill_phyint; 11143 11144 if (ipip->ipi_cmd_type == IF_CMD) { 11145 ifr = (struct ifreq *)if_req; 11146 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 11147 } else { 11148 lifr = (struct lifreq *)if_req; 11149 flags = lifr->lifr_flags; 11150 } 11151 11152 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11153 11154 /* 11155 * Has the flags been set correctly till now ? 
11156 */ 11157 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11158 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11159 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11160 /* 11161 * Compare the new flags to the old, and partition 11162 * into those coming on and those going off. 11163 * For the 16 bit command keep the bits above bit 16 unchanged. 11164 */ 11165 if (ipip->ipi_cmd == SIOCSIFFLAGS) 11166 flags |= intf_flags & ~0xFFFF; 11167 11168 /* 11169 * First check which bits will change and then which will 11170 * go on and off 11171 */ 11172 turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; 11173 if (!turn_on) 11174 return (0); /* No change */ 11175 11176 turn_off = intf_flags & turn_on; 11177 turn_on ^= turn_off; 11178 err = 0; 11179 11180 /* 11181 * Don't allow any bits belonging to the logical interface 11182 * to be set or cleared on the replacement ipif that was 11183 * created temporarily during a MOVE. 11184 */ 11185 if (ipif->ipif_replace_zero && 11186 ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { 11187 return (EINVAL); 11188 } 11189 11190 /* 11191 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 11192 * IPv6 interfaces. 11193 */ 11194 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 11195 return (EINVAL); 11196 11197 /* 11198 * Don't allow the IFF_ROUTER flag to be turned on on loopback 11199 * interfaces. It makes no sense in that context. 11200 */ 11201 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11202 return (EINVAL); 11203 11204 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11205 zero_source = B_TRUE; 11206 11207 /* 11208 * For IPv6 ipif_id 0, don't allow the interface to be up without 11209 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11210 * If the link local address isn't set, and can be set, it will get 11211 * set later on in this function. 
11212 */ 11213 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11214 (flags & IFF_UP) && !zero_source && 11215 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11216 if (ipif_cant_setlinklocal(ipif)) 11217 return (EINVAL); 11218 set_linklocal = B_TRUE; 11219 } 11220 11221 /* 11222 * ILL cannot be part of a usesrc group and and IPMP group at the 11223 * same time. No need to grab ill_g_usesrc_lock here, see 11224 * synchronization notes in ip.c 11225 */ 11226 if (turn_on & PHYI_STANDBY && 11227 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 11228 return (EINVAL); 11229 } 11230 11231 /* 11232 * If we modify physical interface flags, we'll potentially need to 11233 * send up two routing socket messages for the changes (one for the 11234 * IPv4 ill, and another for the IPv6 ill). Note that here. 11235 */ 11236 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11237 phyint_flags_modified = B_TRUE; 11238 11239 /* 11240 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 11241 * we need to flush the IRE_CACHES belonging to this ill. 11242 * We handle this case here without doing the DOWN/UP dance 11243 * like it is done for other flags. If some other flags are 11244 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 11245 * below will handle it by bringing it down and then 11246 * bringing it UP. 11247 */ 11248 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 11249 ill_t *ill_v4, *ill_v6; 11250 11251 ill_v4 = phyi->phyint_illv4; 11252 ill_v6 = phyi->phyint_illv6; 11253 11254 /* 11255 * First set the INACTIVE flag if needed. Then delete the ires. 11256 * ire_add will atomically prevent creating new IRE_CACHEs 11257 * unless hidden flag is set. 
11258 * PHYI_FAILED and PHYI_INACTIVE are exclusive 11259 */ 11260 if ((turn_on & PHYI_FAILED) && 11261 ((intf_flags & PHYI_STANDBY) || !ipmp_enable_failback)) { 11262 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 11263 phyi->phyint_flags &= ~PHYI_INACTIVE; 11264 } 11265 if ((turn_off & PHYI_FAILED) && 11266 ((intf_flags & PHYI_STANDBY) || 11267 (!ipmp_enable_failback && ill_is_inactive(ill)))) { 11268 phyint_inactive(phyi); 11269 } 11270 11271 if (turn_on & PHYI_STANDBY) { 11272 /* 11273 * We implicitly set INACTIVE only when STANDBY is set. 11274 * INACTIVE is also set on non-STANDBY phyint when user 11275 * disables FAILBACK using configuration file. 11276 * Do not allow STANDBY to be set on such INACTIVE 11277 * phyint 11278 */ 11279 if (phyi->phyint_flags & PHYI_INACTIVE) 11280 return (EINVAL); 11281 if (!(phyi->phyint_flags & PHYI_FAILED)) 11282 phyint_inactive(phyi); 11283 } 11284 if (turn_off & PHYI_STANDBY) { 11285 if (ipmp_enable_failback) { 11286 /* 11287 * Reset PHYI_INACTIVE. 11288 */ 11289 phyi->phyint_flags &= ~PHYI_INACTIVE; 11290 } else if (ill_is_inactive(ill) && 11291 !(phyi->phyint_flags & PHYI_FAILED)) { 11292 /* 11293 * Need to set INACTIVE, when user sets 11294 * STANDBY on a non-STANDBY phyint and 11295 * later resets STANDBY 11296 */ 11297 phyint_inactive(phyi); 11298 } 11299 } 11300 /* 11301 * We should always send up a message so that the 11302 * daemons come to know of it. Note that the zeroth 11303 * interface can be down and the check below for IPIF_UP 11304 * will not make sense as we are actually setting 11305 * a phyint flag here. We assume that the ipif used 11306 * is always the zeroth ipif. (ip_rts_ifmsg does not 11307 * send up any message for non-zero ipifs). 
11308 */ 11309 phyint_flags_modified = B_TRUE; 11310 11311 if (ill_v4 != NULL) { 11312 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11313 IRE_CACHE, ill_stq_cache_delete, 11314 (char *)ill_v4, ill_v4); 11315 illgrp_reset_schednext(ill_v4); 11316 } 11317 if (ill_v6 != NULL) { 11318 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11319 IRE_CACHE, ill_stq_cache_delete, 11320 (char *)ill_v6, ill_v6); 11321 illgrp_reset_schednext(ill_v6); 11322 } 11323 } 11324 11325 /* 11326 * If ILLF_ROUTER changes, we need to change the ip forwarding 11327 * status of the interface and, if the interface is part of an IPMP 11328 * group, all other interfaces that are part of the same IPMP 11329 * group. 11330 */ 11331 if ((turn_on | turn_off) & ILLF_ROUTER) { 11332 (void) ill_forward_set(q, mp, ((turn_on & ILLF_ROUTER) != 0), 11333 (caddr_t)ill); 11334 } 11335 11336 /* 11337 * If the interface is not UP and we are not going to 11338 * bring it UP, record the flags and return. When the 11339 * interface comes UP later, the right actions will be 11340 * taken. 11341 */ 11342 if (!(ipif->ipif_flags & IPIF_UP) && 11343 !(turn_on & IPIF_UP)) { 11344 /* Record new flags in their respective places. */ 11345 mutex_enter(&ill->ill_lock); 11346 mutex_enter(&ill->ill_phyint->phyint_lock); 11347 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11348 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11349 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11350 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11351 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11352 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11353 mutex_exit(&ill->ill_lock); 11354 mutex_exit(&ill->ill_phyint->phyint_lock); 11355 11356 /* 11357 * We do the broadcast and nomination here rather 11358 * than waiting for a FAILOVER/FAILBACK to happen. In 11359 * the case of FAILBACK from INACTIVE standby to the 11360 * interface that has been repaired, PHYI_FAILED has not 11361 * been cleared yet. 
If there are only two interfaces in 11362 * that group, all we have is a FAILED and INACTIVE 11363 * interface. If we do the nomination soon after a failback, 11364 * the broadcast nomination code would select the 11365 * INACTIVE interface for receiving broadcasts as FAILED is 11366 * not yet cleared. As we don't want STANDBY/INACTIVE to 11367 * receive broadcast packets, we need to redo nomination 11368 * when the FAILED is cleared here. Thus, in general we 11369 * always do the nomination here for FAILED, STANDBY 11370 * and OFFLINE. 11371 */ 11372 if (((turn_on | turn_off) & 11373 (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { 11374 ip_redo_nomination(phyi); 11375 } 11376 if (phyint_flags_modified) { 11377 if (phyi->phyint_illv4 != NULL) { 11378 ip_rts_ifmsg(phyi->phyint_illv4-> 11379 ill_ipif); 11380 } 11381 if (phyi->phyint_illv6 != NULL) { 11382 ip_rts_ifmsg(phyi->phyint_illv6-> 11383 ill_ipif); 11384 } 11385 } 11386 return (0); 11387 } else if (set_linklocal || zero_source) { 11388 mutex_enter(&ill->ill_lock); 11389 if (set_linklocal) 11390 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 11391 if (zero_source) 11392 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 11393 mutex_exit(&ill->ill_lock); 11394 } 11395 11396 /* 11397 * Disallow IPv6 interfaces coming up that have the unspecified address, 11398 * or point-to-point interfaces with an unspecified destination. We do 11399 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 11400 * have a subnet assigned, which is how in.ndpd currently manages its 11401 * onlink prefix list when no addresses are configured with those 11402 * prefixes. 
11403 */ 11404 if (ipif->ipif_isv6 && 11405 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 11406 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 11407 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 11408 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11409 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 11410 return (EINVAL); 11411 } 11412 11413 /* 11414 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 11415 * from being brought up. 11416 */ 11417 if (!ipif->ipif_isv6 && 11418 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11419 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 11420 return (EINVAL); 11421 } 11422 11423 /* 11424 * The only flag changes that we currently take specific action on 11425 * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, 11426 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and 11427 * IPIF_PREFERRED. This is done by bring the ipif down, changing 11428 * the flags and bringing it back up again. 11429 */ 11430 if ((turn_on|turn_off) & 11431 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 11432 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { 11433 /* 11434 * Taking this ipif down, make sure we have 11435 * valid net and subnet bcast ire's for other 11436 * logical interfaces, if we need them. 
11437 */ 11438 if (!ipif->ipif_isv6) 11439 ipif_check_bcast_ires(ipif); 11440 11441 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11442 !(turn_off & IPIF_UP)) { 11443 need_up = B_TRUE; 11444 if (ipif->ipif_flags & IPIF_UP) 11445 ill->ill_logical_down = 1; 11446 turn_on &= ~IPIF_UP; 11447 } 11448 err = ipif_down(ipif, q, mp); 11449 ip1dbg(("ipif_down returns %d err ", err)); 11450 if (err == EINPROGRESS) 11451 return (err); 11452 ipif_down_tail(ipif); 11453 } 11454 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 11455 } 11456 11457 static int 11458 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 11459 boolean_t need_up) 11460 { 11461 ill_t *ill; 11462 phyint_t *phyi; 11463 uint64_t turn_on; 11464 uint64_t turn_off; 11465 uint64_t intf_flags; 11466 boolean_t phyint_flags_modified = B_FALSE; 11467 int err = 0; 11468 boolean_t set_linklocal = B_FALSE; 11469 boolean_t zero_source = B_FALSE; 11470 11471 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 11472 ipif->ipif_ill->ill_name, ipif->ipif_id)); 11473 11474 ASSERT(IAM_WRITER_IPIF(ipif)); 11475 11476 ill = ipif->ipif_ill; 11477 phyi = ill->ill_phyint; 11478 11479 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11480 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 11481 11482 turn_off = intf_flags & turn_on; 11483 turn_on ^= turn_off; 11484 11485 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 11486 phyint_flags_modified = B_TRUE; 11487 11488 /* 11489 * Now we change the flags. Track current value of 11490 * other flags in their respective places. 
11491 */ 11492 mutex_enter(&ill->ill_lock); 11493 mutex_enter(&phyi->phyint_lock); 11494 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11495 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11496 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11497 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11498 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11499 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11500 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 11501 set_linklocal = B_TRUE; 11502 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 11503 } 11504 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 11505 zero_source = B_TRUE; 11506 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 11507 } 11508 mutex_exit(&ill->ill_lock); 11509 mutex_exit(&phyi->phyint_lock); 11510 11511 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 11512 ip_redo_nomination(phyi); 11513 11514 if (set_linklocal) 11515 (void) ipif_setlinklocal(ipif); 11516 11517 if (zero_source) 11518 ipif->ipif_v6src_addr = ipv6_all_zeros; 11519 else 11520 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 11521 11522 if (need_up) { 11523 /* 11524 * XXX ipif_up really does not know whether a phyint flags 11525 * was modified or not. So, it sends up information on 11526 * only one routing sockets message. As we don't bring up 11527 * the interface and also set STANDBY/FAILED simultaneously 11528 * it should be okay. 11529 */ 11530 err = ipif_up(ipif, q, mp); 11531 } else { 11532 /* 11533 * Make sure routing socket sees all changes to the flags. 11534 * ipif_up_done* handles this when we use ipif_up. 
11535 */ 11536 if (phyint_flags_modified) { 11537 if (phyi->phyint_illv4 != NULL) { 11538 ip_rts_ifmsg(phyi->phyint_illv4-> 11539 ill_ipif); 11540 } 11541 if (phyi->phyint_illv6 != NULL) { 11542 ip_rts_ifmsg(phyi->phyint_illv6-> 11543 ill_ipif); 11544 } 11545 } else { 11546 ip_rts_ifmsg(ipif); 11547 } 11548 } 11549 return (err); 11550 } 11551 11552 /* 11553 * Restart entry point to restart the flags restart operation after the 11554 * refcounts have dropped to zero. 11555 */ 11556 /* ARGSUSED */ 11557 int 11558 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11559 ip_ioctl_cmd_t *ipip, void *if_req) 11560 { 11561 int err; 11562 struct ifreq *ifr = (struct ifreq *)if_req; 11563 struct lifreq *lifr = (struct lifreq *)if_req; 11564 11565 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 11566 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11567 11568 ipif_down_tail(ipif); 11569 if (ipip->ipi_cmd_type == IF_CMD) { 11570 /* 11571 * Since ip_sioctl_flags expects an int and ifr_flags 11572 * is a short we need to cast ifr_flags into an int 11573 * to avoid having sign extension cause bits to get 11574 * set that should not be. 11575 */ 11576 err = ip_sioctl_flags_tail(ipif, 11577 (uint64_t)(ifr->ifr_flags & 0x0000ffff), 11578 q, mp, B_TRUE); 11579 } else { 11580 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags, 11581 q, mp, B_TRUE); 11582 } 11583 return (err); 11584 } 11585 11586 /* ARGSUSED */ 11587 int 11588 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11589 ip_ioctl_cmd_t *ipip, void *if_req) 11590 { 11591 /* 11592 * Has the flags been set correctly till now ? 
11593 */ 11594 ill_t *ill = ipif->ipif_ill; 11595 phyint_t *phyi = ill->ill_phyint; 11596 11597 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 11598 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11599 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11600 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11601 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11602 11603 /* 11604 * Need a lock since some flags can be set even when there are 11605 * references to the ipif. 11606 */ 11607 mutex_enter(&ill->ill_lock); 11608 if (ipip->ipi_cmd_type == IF_CMD) { 11609 struct ifreq *ifr = (struct ifreq *)if_req; 11610 11611 /* Get interface flags (low 16 only). */ 11612 ifr->ifr_flags = ((ipif->ipif_flags | 11613 ill->ill_flags | phyi->phyint_flags) & 0xffff); 11614 } else { 11615 struct lifreq *lifr = (struct lifreq *)if_req; 11616 11617 /* Get interface flags. */ 11618 lifr->lifr_flags = ipif->ipif_flags | 11619 ill->ill_flags | phyi->phyint_flags; 11620 } 11621 mutex_exit(&ill->ill_lock); 11622 return (0); 11623 } 11624 11625 /* ARGSUSED */ 11626 int 11627 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11628 ip_ioctl_cmd_t *ipip, void *if_req) 11629 { 11630 int mtu; 11631 int ip_min_mtu; 11632 struct ifreq *ifr; 11633 struct lifreq *lifr; 11634 ire_t *ire; 11635 11636 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 11637 ipif->ipif_id, (void *)ipif)); 11638 if (ipip->ipi_cmd_type == IF_CMD) { 11639 ifr = (struct ifreq *)if_req; 11640 mtu = ifr->ifr_metric; 11641 } else { 11642 lifr = (struct lifreq *)if_req; 11643 mtu = lifr->lifr_mtu; 11644 } 11645 11646 if (ipif->ipif_isv6) 11647 ip_min_mtu = IPV6_MIN_MTU; 11648 else 11649 ip_min_mtu = IP_MIN_MTU; 11650 11651 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 11652 return (EINVAL); 11653 11654 /* 11655 * Change the MTU size in all relevant ire's. 11656 * Mtu change Vs. new ire creation - protocol below. 
11657 * First change ipif_mtu and the ire_max_frag of the 11658 * interface ire. Then do an ire walk and change the 11659 * ire_max_frag of all affected ires. During ire_add 11660 * under the bucket lock, set the ire_max_frag of the 11661 * new ire being created from the ipif/ire from which 11662 * it is being derived. If an mtu change happens after 11663 * the ire is added, the new ire will be cleaned up. 11664 * Conversely if the mtu change happens before the ire 11665 * is added, ire_add will see the new value of the mtu. 11666 */ 11667 ipif->ipif_mtu = mtu; 11668 ipif->ipif_flags |= IPIF_FIXEDMTU; 11669 11670 if (ipif->ipif_isv6) 11671 ire = ipif_to_ire_v6(ipif); 11672 else 11673 ire = ipif_to_ire(ipif); 11674 if (ire != NULL) { 11675 ire->ire_max_frag = ipif->ipif_mtu; 11676 ire_refrele(ire); 11677 } 11678 if (ipif->ipif_flags & IPIF_UP) { 11679 if (ipif->ipif_isv6) 11680 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES); 11681 else 11682 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES); 11683 } 11684 /* Update the MTU in SCTP's list */ 11685 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11686 return (0); 11687 } 11688 11689 /* Get interface MTU. */ 11690 /* ARGSUSED */ 11691 int 11692 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11693 ip_ioctl_cmd_t *ipip, void *if_req) 11694 { 11695 struct ifreq *ifr; 11696 struct lifreq *lifr; 11697 11698 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 11699 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11700 if (ipip->ipi_cmd_type == IF_CMD) { 11701 ifr = (struct ifreq *)if_req; 11702 ifr->ifr_metric = ipif->ipif_mtu; 11703 } else { 11704 lifr = (struct lifreq *)if_req; 11705 lifr->lifr_mtu = ipif->ipif_mtu; 11706 } 11707 return (0); 11708 } 11709 11710 /* Set interface broadcast address. 
*/
/* ARGSUSED2 */
int
ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ipaddr_t	brd_addr;
	ire_t		*bcast_ire;

	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));
	if ((ipif->ipif_flags & IPIF_BROADCAST) == 0)
		return (EADDRNOTAVAIL);

	ASSERT(!(ipif->ipif_isv6));	/* No IPv6 broadcast */

	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);

	brd_addr = sin->sin_addr.s_addr;
	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * The interface is already up, so a sensible broadcast
		 * address must already have an IRE.  Broadcast ires are
		 * shared, hence match on the ill rather than the ipif.
		 * MATCH_IRE_ILL_GROUP is deliberately not used: we want
		 * the broadcast ire on this ill, and each ill in a
		 * group has its own.
		 */
		bcast_ire = ire_ctable_lookup(brd_addr, 0, IRE_BROADCAST,
		    ipif, ALL_ZONES, NULL,
		    (MATCH_IRE_ILL | MATCH_IRE_TYPE));
		if (bcast_ire == NULL)
			return (EINVAL);
		ire_refrele(bcast_ire);
	}

	/*
	 * The broadcast address of this ipif is changing.  Make sure
	 * the other logical interfaces keep valid net and subnet
	 * broadcast ires, if they need them.
	 */
	if (brd_addr != ipif->ipif_brd_addr)
		ipif_check_bcast_ires(ipif);
	IN6_IPADDR_TO_V4MAPPED(brd_addr, &ipif->ipif_v6brd_addr);
	return (0);
}

/* Get interface broadcast address.
*/ 11764 /* ARGSUSED */ 11765 int 11766 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11767 ip_ioctl_cmd_t *ipip, void *if_req) 11768 { 11769 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 11770 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11771 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 11772 return (EADDRNOTAVAIL); 11773 11774 /* IPIF_BROADCAST not possible with IPv6 */ 11775 ASSERT(!ipif->ipif_isv6); 11776 *sin = sin_null; 11777 sin->sin_family = AF_INET; 11778 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 11779 return (0); 11780 } 11781 11782 /* 11783 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 11784 */ 11785 /* ARGSUSED */ 11786 int 11787 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11788 ip_ioctl_cmd_t *ipip, void *if_req) 11789 { 11790 int err = 0; 11791 in6_addr_t v6mask; 11792 11793 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 11794 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11795 11796 ASSERT(IAM_WRITER_IPIF(ipif)); 11797 11798 if (ipif->ipif_isv6) { 11799 sin6_t *sin6; 11800 11801 if (sin->sin_family != AF_INET6) 11802 return (EAFNOSUPPORT); 11803 11804 sin6 = (sin6_t *)sin; 11805 v6mask = sin6->sin6_addr; 11806 } else { 11807 ipaddr_t mask; 11808 11809 if (sin->sin_family != AF_INET) 11810 return (EAFNOSUPPORT); 11811 11812 mask = sin->sin_addr.s_addr; 11813 V4MASK_TO_V6(mask, v6mask); 11814 } 11815 11816 /* 11817 * No big deal if the interface isn't already up, or the mask 11818 * isn't really changing, or this is pt-pt. 
11819 */ 11820 if (!(ipif->ipif_flags & IPIF_UP) || 11821 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 11822 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 11823 ipif->ipif_v6net_mask = v6mask; 11824 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11825 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 11826 ipif->ipif_v6net_mask, 11827 ipif->ipif_v6subnet); 11828 } 11829 return (0); 11830 } 11831 /* 11832 * Make sure we have valid net and subnet broadcast ire's 11833 * for the old netmask, if needed by other logical interfaces. 11834 */ 11835 if (!ipif->ipif_isv6) 11836 ipif_check_bcast_ires(ipif); 11837 11838 err = ipif_logical_down(ipif, q, mp); 11839 if (err == EINPROGRESS) 11840 return (err); 11841 ipif_down_tail(ipif); 11842 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 11843 return (err); 11844 } 11845 11846 static int 11847 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 11848 { 11849 in6_addr_t v6mask; 11850 int err = 0; 11851 11852 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 11853 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11854 11855 if (ipif->ipif_isv6) { 11856 sin6_t *sin6; 11857 11858 sin6 = (sin6_t *)sin; 11859 v6mask = sin6->sin6_addr; 11860 } else { 11861 ipaddr_t mask; 11862 11863 mask = sin->sin_addr.s_addr; 11864 V4MASK_TO_V6(mask, v6mask); 11865 } 11866 11867 ipif->ipif_v6net_mask = v6mask; 11868 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11869 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 11870 ipif->ipif_v6subnet); 11871 } 11872 err = ipif_up(ipif, q, mp); 11873 11874 if (err == 0 || err == EINPROGRESS) { 11875 /* 11876 * The interface must be DL_BOUND if this packet has to 11877 * go out on the wire. Since we only go through a logical 11878 * down and are bound with the driver during an internal 11879 * down/up that is satisfied. 11880 */ 11881 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 11882 /* Potentially broadcast an address mask reply. 
*/ 11883 ipif_mask_reply(ipif); 11884 } 11885 } 11886 return (err); 11887 } 11888 11889 /* ARGSUSED */ 11890 int 11891 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11892 ip_ioctl_cmd_t *ipip, void *if_req) 11893 { 11894 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 11895 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11896 ipif_down_tail(ipif); 11897 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 11898 } 11899 11900 /* Get interface net mask. */ 11901 /* ARGSUSED */ 11902 int 11903 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11904 ip_ioctl_cmd_t *ipip, void *if_req) 11905 { 11906 struct lifreq *lifr = (struct lifreq *)if_req; 11907 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 11908 11909 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 11910 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11911 11912 /* 11913 * net mask can't change since we have a reference to the ipif. 11914 */ 11915 if (ipif->ipif_isv6) { 11916 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11917 *sin6 = sin6_null; 11918 sin6->sin6_family = AF_INET6; 11919 sin6->sin6_addr = ipif->ipif_v6net_mask; 11920 lifr->lifr_addrlen = 11921 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11922 } else { 11923 *sin = sin_null; 11924 sin->sin_family = AF_INET; 11925 sin->sin_addr.s_addr = ipif->ipif_net_mask; 11926 if (ipip->ipi_cmd_type == LIF_CMD) { 11927 lifr->lifr_addrlen = 11928 ip_mask_to_plen(ipif->ipif_net_mask); 11929 } 11930 } 11931 return (0); 11932 } 11933 11934 /* ARGSUSED */ 11935 int 11936 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11937 ip_ioctl_cmd_t *ipip, void *if_req) 11938 { 11939 11940 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 11941 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11942 /* 11943 * Set interface metric. We don't use this for 11944 * anything but we keep track of it in case it is 11945 * important to routing applications or such. 
11946 */ 11947 if (ipip->ipi_cmd_type == IF_CMD) { 11948 struct ifreq *ifr; 11949 11950 ifr = (struct ifreq *)if_req; 11951 ipif->ipif_metric = ifr->ifr_metric; 11952 } else { 11953 struct lifreq *lifr; 11954 11955 lifr = (struct lifreq *)if_req; 11956 ipif->ipif_metric = lifr->lifr_metric; 11957 } 11958 return (0); 11959 } 11960 11961 11962 /* ARGSUSED */ 11963 int 11964 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11965 ip_ioctl_cmd_t *ipip, void *if_req) 11966 { 11967 11968 /* Get interface metric. */ 11969 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 11970 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11971 if (ipip->ipi_cmd_type == IF_CMD) { 11972 struct ifreq *ifr; 11973 11974 ifr = (struct ifreq *)if_req; 11975 ifr->ifr_metric = ipif->ipif_metric; 11976 } else { 11977 struct lifreq *lifr; 11978 11979 lifr = (struct lifreq *)if_req; 11980 lifr->lifr_metric = ipif->ipif_metric; 11981 } 11982 11983 return (0); 11984 } 11985 11986 /* ARGSUSED */ 11987 int 11988 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11989 ip_ioctl_cmd_t *ipip, void *if_req) 11990 { 11991 11992 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 11993 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11994 /* 11995 * Set the muxid returned from I_PLINK. 
11996 */ 11997 if (ipip->ipi_cmd_type == IF_CMD) { 11998 struct ifreq *ifr = (struct ifreq *)if_req; 11999 12000 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12001 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12002 } else { 12003 struct lifreq *lifr = (struct lifreq *)if_req; 12004 12005 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12006 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12007 } 12008 return (0); 12009 } 12010 12011 /* ARGSUSED */ 12012 int 12013 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12014 ip_ioctl_cmd_t *ipip, void *if_req) 12015 { 12016 12017 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12018 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12019 /* 12020 * Get the muxid saved in ill for I_PUNLINK. 12021 */ 12022 if (ipip->ipi_cmd_type == IF_CMD) { 12023 struct ifreq *ifr = (struct ifreq *)if_req; 12024 12025 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12026 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12027 } else { 12028 struct lifreq *lifr = (struct lifreq *)if_req; 12029 12030 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12031 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12032 } 12033 return (0); 12034 } 12035 12036 /* 12037 * Set the subnet prefix. Does not modify the broadcast address. 
12038 */ 12039 /* ARGSUSED */ 12040 int 12041 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12042 ip_ioctl_cmd_t *ipip, void *if_req) 12043 { 12044 int err = 0; 12045 in6_addr_t v6addr; 12046 in6_addr_t v6mask; 12047 boolean_t need_up = B_FALSE; 12048 int addrlen; 12049 12050 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12051 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12052 12053 ASSERT(IAM_WRITER_IPIF(ipif)); 12054 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12055 12056 if (ipif->ipif_isv6) { 12057 sin6_t *sin6; 12058 12059 if (sin->sin_family != AF_INET6) 12060 return (EAFNOSUPPORT); 12061 12062 sin6 = (sin6_t *)sin; 12063 v6addr = sin6->sin6_addr; 12064 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12065 return (EADDRNOTAVAIL); 12066 } else { 12067 ipaddr_t addr; 12068 12069 if (sin->sin_family != AF_INET) 12070 return (EAFNOSUPPORT); 12071 12072 addr = sin->sin_addr.s_addr; 12073 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12074 return (EADDRNOTAVAIL); 12075 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12076 /* Add 96 bits */ 12077 addrlen += IPV6_ABITS - IP_ABITS; 12078 } 12079 12080 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12081 return (EINVAL); 12082 12083 /* Check if bits in the address is set past the mask */ 12084 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12085 return (EINVAL); 12086 12087 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12088 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12089 return (0); /* No change */ 12090 12091 if (ipif->ipif_flags & IPIF_UP) { 12092 /* 12093 * If the interface is already marked up, 12094 * we call ipif_down which will take care 12095 * of ditching any IREs that have been set 12096 * up based on the old interface address. 
12097 */ 12098 err = ipif_logical_down(ipif, q, mp); 12099 if (err == EINPROGRESS) 12100 return (err); 12101 ipif_down_tail(ipif); 12102 need_up = B_TRUE; 12103 } 12104 12105 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12106 return (err); 12107 } 12108 12109 static int 12110 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12111 queue_t *q, mblk_t *mp, boolean_t need_up) 12112 { 12113 ill_t *ill = ipif->ipif_ill; 12114 int err = 0; 12115 12116 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12117 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12118 12119 /* Set the new address. */ 12120 mutex_enter(&ill->ill_lock); 12121 ipif->ipif_v6net_mask = v6mask; 12122 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12123 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12124 ipif->ipif_v6subnet); 12125 } 12126 mutex_exit(&ill->ill_lock); 12127 12128 if (need_up) { 12129 /* 12130 * Now bring the interface back up. If this 12131 * is the only IPIF for the ILL, ipif_up 12132 * will have to re-bind to the device, so 12133 * we may get back EINPROGRESS, in which 12134 * case, this IOCTL will get completed in 12135 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12136 */ 12137 err = ipif_up(ipif, q, mp); 12138 if (err == EINPROGRESS) 12139 return (err); 12140 } 12141 return (err); 12142 } 12143 12144 /* ARGSUSED */ 12145 int 12146 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12147 ip_ioctl_cmd_t *ipip, void *if_req) 12148 { 12149 int addrlen; 12150 in6_addr_t v6addr; 12151 in6_addr_t v6mask; 12152 struct lifreq *lifr = (struct lifreq *)if_req; 12153 12154 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12155 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12156 ipif_down_tail(ipif); 12157 12158 addrlen = lifr->lifr_addrlen; 12159 if (ipif->ipif_isv6) { 12160 sin6_t *sin6; 12161 12162 sin6 = (sin6_t *)sin; 12163 v6addr = sin6->sin6_addr; 12164 } else { 12165 ipaddr_t addr; 12166 12167 addr = sin->sin_addr.s_addr; 12168 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12169 addrlen += IPV6_ABITS - IP_ABITS; 12170 } 12171 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12172 12173 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12174 } 12175 12176 /* ARGSUSED */ 12177 int 12178 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12179 ip_ioctl_cmd_t *ipip, void *if_req) 12180 { 12181 struct lifreq *lifr = (struct lifreq *)if_req; 12182 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12183 12184 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12185 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12186 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12187 12188 if (ipif->ipif_isv6) { 12189 *sin6 = sin6_null; 12190 sin6->sin6_family = AF_INET6; 12191 sin6->sin6_addr = ipif->ipif_v6subnet; 12192 lifr->lifr_addrlen = 12193 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12194 } else { 12195 *sin = sin_null; 12196 sin->sin_family = AF_INET; 12197 sin->sin_addr.s_addr = ipif->ipif_subnet; 12198 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12199 } 12200 return (0); 12201 } 12202 12203 /* 12204 * Set the IPv6 address token. 
12205 */ 12206 /* ARGSUSED */ 12207 int 12208 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12209 ip_ioctl_cmd_t *ipi, void *if_req) 12210 { 12211 ill_t *ill = ipif->ipif_ill; 12212 int err; 12213 in6_addr_t v6addr; 12214 in6_addr_t v6mask; 12215 boolean_t need_up = B_FALSE; 12216 int i; 12217 sin6_t *sin6 = (sin6_t *)sin; 12218 struct lifreq *lifr = (struct lifreq *)if_req; 12219 int addrlen; 12220 12221 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12222 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12223 ASSERT(IAM_WRITER_IPIF(ipif)); 12224 12225 addrlen = lifr->lifr_addrlen; 12226 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12227 if (ipif->ipif_id != 0) 12228 return (EINVAL); 12229 12230 if (!ipif->ipif_isv6) 12231 return (EINVAL); 12232 12233 if (addrlen > IPV6_ABITS) 12234 return (EINVAL); 12235 12236 v6addr = sin6->sin6_addr; 12237 12238 /* 12239 * The length of the token is the length from the end. To get 12240 * the proper mask for this, compute the mask of the bits not 12241 * in the token; ie. the prefix, and then xor to get the mask. 
12242 */ 12243 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12244 return (EINVAL); 12245 for (i = 0; i < 4; i++) { 12246 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12247 } 12248 12249 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12250 ill->ill_token_length == addrlen) 12251 return (0); /* No change */ 12252 12253 if (ipif->ipif_flags & IPIF_UP) { 12254 err = ipif_logical_down(ipif, q, mp); 12255 if (err == EINPROGRESS) 12256 return (err); 12257 ipif_down_tail(ipif); 12258 need_up = B_TRUE; 12259 } 12260 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12261 return (err); 12262 } 12263 12264 static int 12265 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12266 mblk_t *mp, boolean_t need_up) 12267 { 12268 in6_addr_t v6addr; 12269 in6_addr_t v6mask; 12270 ill_t *ill = ipif->ipif_ill; 12271 int i; 12272 int err = 0; 12273 12274 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12275 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12276 v6addr = sin6->sin6_addr; 12277 /* 12278 * The length of the token is the length from the end. To get 12279 * the proper mask for this, compute the mask of the bits not 12280 * in the token; ie. the prefix, and then xor to get the mask. 12281 */ 12282 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12283 for (i = 0; i < 4; i++) 12284 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12285 12286 mutex_enter(&ill->ill_lock); 12287 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12288 ill->ill_token_length = addrlen; 12289 mutex_exit(&ill->ill_lock); 12290 12291 if (need_up) { 12292 /* 12293 * Now bring the interface back up. If this 12294 * is the only IPIF for the ILL, ipif_up 12295 * will have to re-bind to the device, so 12296 * we may get back EINPROGRESS, in which 12297 * case, this IOCTL will get completed in 12298 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12299 */ 12300 err = ipif_up(ipif, q, mp); 12301 if (err == EINPROGRESS) 12302 return (err); 12303 } 12304 return (err); 12305 } 12306 12307 /* ARGSUSED */ 12308 int 12309 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12310 ip_ioctl_cmd_t *ipi, void *if_req) 12311 { 12312 ill_t *ill; 12313 sin6_t *sin6 = (sin6_t *)sin; 12314 struct lifreq *lifr = (struct lifreq *)if_req; 12315 12316 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 12317 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12318 if (ipif->ipif_id != 0) 12319 return (EINVAL); 12320 12321 ill = ipif->ipif_ill; 12322 if (!ill->ill_isv6) 12323 return (ENXIO); 12324 12325 *sin6 = sin6_null; 12326 sin6->sin6_family = AF_INET6; 12327 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 12328 sin6->sin6_addr = ill->ill_token; 12329 lifr->lifr_addrlen = ill->ill_token_length; 12330 return (0); 12331 } 12332 12333 /* 12334 * Set (hardware) link specific information that might override 12335 * what was acquired through the DL_INFO_ACK. 12336 * The logic is as follows. 12337 * 12338 * become exclusive 12339 * set CHANGING flag 12340 * change mtu on affected IREs 12341 * clear CHANGING flag 12342 * 12343 * An ire add that occurs before the CHANGING flag is set will have its mtu 12344 * changed by the ip_sioctl_lnkinfo. 12345 * 12346 * During the time the CHANGING flag is set, no new ires will be added to the 12347 * bucket, and ire add will fail (due the CHANGING flag). 12348 * 12349 * An ire add that occurs after the CHANGING flag is set will have the right mtu 12350 * before it is added to the bucket. 12351 * 12352 * Obviously only 1 thread can set the CHANGING flag and we need to become 12353 * exclusive to set the flag. 
12354 */ 12355 /* ARGSUSED */ 12356 int 12357 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12358 ip_ioctl_cmd_t *ipi, void *if_req) 12359 { 12360 ill_t *ill = ipif->ipif_ill; 12361 ipif_t *nipif; 12362 int ip_min_mtu; 12363 boolean_t mtu_walk = B_FALSE; 12364 struct lifreq *lifr = (struct lifreq *)if_req; 12365 lif_ifinfo_req_t *lir; 12366 ire_t *ire; 12367 12368 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 12369 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12370 lir = &lifr->lifr_ifinfo; 12371 ASSERT(IAM_WRITER_IPIF(ipif)); 12372 12373 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12374 if (ipif->ipif_id != 0) 12375 return (EINVAL); 12376 12377 /* Set interface MTU. */ 12378 if (ipif->ipif_isv6) 12379 ip_min_mtu = IPV6_MIN_MTU; 12380 else 12381 ip_min_mtu = IP_MIN_MTU; 12382 12383 /* 12384 * Verify values before we set anything. Allow zero to 12385 * mean unspecified. 12386 */ 12387 if (lir->lir_maxmtu != 0 && 12388 (lir->lir_maxmtu > ill->ill_max_frag || 12389 lir->lir_maxmtu < ip_min_mtu)) 12390 return (EINVAL); 12391 if (lir->lir_reachtime != 0 && 12392 lir->lir_reachtime > ND_MAX_REACHTIME) 12393 return (EINVAL); 12394 if (lir->lir_reachretrans != 0 && 12395 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12396 return (EINVAL); 12397 12398 mutex_enter(&ill->ill_lock); 12399 ill->ill_state_flags |= ILL_CHANGING; 12400 for (nipif = ill->ill_ipif; nipif != NULL; 12401 nipif = nipif->ipif_next) { 12402 nipif->ipif_state_flags |= IPIF_CHANGING; 12403 } 12404 12405 mutex_exit(&ill->ill_lock); 12406 12407 if (lir->lir_maxmtu != 0) { 12408 ill->ill_max_mtu = lir->lir_maxmtu; 12409 ill->ill_mtu_userspecified = 1; 12410 mtu_walk = B_TRUE; 12411 } 12412 12413 if (lir->lir_reachtime != 0) 12414 ill->ill_reachable_time = lir->lir_reachtime; 12415 12416 if (lir->lir_reachretrans != 0) 12417 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12418 12419 ill->ill_max_hops = lir->lir_maxhops; 12420 12421 ill->ill_max_buf = 
ND_MAX_Q; 12422 12423 if (mtu_walk) { 12424 /* 12425 * Set the MTU on all ipifs associated with this ill except 12426 * for those whose MTU was fixed via SIOCSLIFMTU. 12427 */ 12428 for (nipif = ill->ill_ipif; nipif != NULL; 12429 nipif = nipif->ipif_next) { 12430 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12431 continue; 12432 12433 nipif->ipif_mtu = ill->ill_max_mtu; 12434 12435 if (!(nipif->ipif_flags & IPIF_UP)) 12436 continue; 12437 12438 if (nipif->ipif_isv6) 12439 ire = ipif_to_ire_v6(nipif); 12440 else 12441 ire = ipif_to_ire(nipif); 12442 if (ire != NULL) { 12443 ire->ire_max_frag = ipif->ipif_mtu; 12444 ire_refrele(ire); 12445 } 12446 if (ill->ill_isv6) { 12447 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 12448 ipif_mtu_change, (char *)nipif, 12449 ill); 12450 } else { 12451 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 12452 ipif_mtu_change, (char *)nipif, 12453 ill); 12454 } 12455 } 12456 } 12457 12458 mutex_enter(&ill->ill_lock); 12459 for (nipif = ill->ill_ipif; nipif != NULL; 12460 nipif = nipif->ipif_next) { 12461 nipif->ipif_state_flags &= ~IPIF_CHANGING; 12462 } 12463 ILL_UNMARK_CHANGING(ill); 12464 mutex_exit(&ill->ill_lock); 12465 12466 return (0); 12467 } 12468 12469 /* ARGSUSED */ 12470 int 12471 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12472 ip_ioctl_cmd_t *ipi, void *if_req) 12473 { 12474 struct lif_ifinfo_req *lir; 12475 ill_t *ill = ipif->ipif_ill; 12476 12477 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 12478 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12479 if (ipif->ipif_id != 0) 12480 return (EINVAL); 12481 12482 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 12483 lir->lir_maxhops = ill->ill_max_hops; 12484 lir->lir_reachtime = ill->ill_reachable_time; 12485 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 12486 lir->lir_maxmtu = ill->ill_max_mtu; 12487 12488 return (0); 12489 } 12490 12491 /* 12492 * Return best guess as to the subnet mask for the specified address. 
12493 * Based on the subnet masks for all the configured interfaces. 12494 * 12495 * We end up returning a zero mask in the case of default, multicast or 12496 * experimental. 12497 */ 12498 static ipaddr_t 12499 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp) 12500 { 12501 ipaddr_t net_mask; 12502 ill_t *ill; 12503 ipif_t *ipif; 12504 ill_walk_context_t ctx; 12505 ipif_t *fallback_ipif = NULL; 12506 12507 net_mask = ip_net_mask(addr); 12508 if (net_mask == 0) { 12509 *ipifp = NULL; 12510 return (0); 12511 } 12512 12513 /* Let's check to see if this is maybe a local subnet route. */ 12514 /* this function only applies to IPv4 interfaces */ 12515 rw_enter(&ill_g_lock, RW_READER); 12516 ill = ILL_START_WALK_V4(&ctx); 12517 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 12518 mutex_enter(&ill->ill_lock); 12519 for (ipif = ill->ill_ipif; ipif != NULL; 12520 ipif = ipif->ipif_next) { 12521 if (!IPIF_CAN_LOOKUP(ipif)) 12522 continue; 12523 if (!(ipif->ipif_flags & IPIF_UP)) 12524 continue; 12525 if ((ipif->ipif_subnet & net_mask) == 12526 (addr & net_mask)) { 12527 /* 12528 * Don't trust pt-pt interfaces if there are 12529 * other interfaces. 12530 */ 12531 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 12532 if (fallback_ipif == NULL) { 12533 ipif_refhold_locked(ipif); 12534 fallback_ipif = ipif; 12535 } 12536 continue; 12537 } 12538 12539 /* 12540 * Fine. Just assume the same net mask as the 12541 * directly attached subnet interface is using. 12542 */ 12543 ipif_refhold_locked(ipif); 12544 mutex_exit(&ill->ill_lock); 12545 rw_exit(&ill_g_lock); 12546 if (fallback_ipif != NULL) 12547 ipif_refrele(fallback_ipif); 12548 *ipifp = ipif; 12549 return (ipif->ipif_net_mask); 12550 } 12551 } 12552 mutex_exit(&ill->ill_lock); 12553 } 12554 rw_exit(&ill_g_lock); 12555 12556 *ipifp = fallback_ipif; 12557 return ((fallback_ipif != NULL) ? 
12558 fallback_ipif->ipif_net_mask : net_mask); 12559 } 12560 12561 /* 12562 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 12563 */ 12564 static void 12565 ip_wput_ioctl(queue_t *q, mblk_t *mp) 12566 { 12567 IOCP iocp; 12568 ipft_t *ipft; 12569 ipllc_t *ipllc; 12570 mblk_t *mp1; 12571 cred_t *cr; 12572 int error = 0; 12573 conn_t *connp; 12574 12575 ip1dbg(("ip_wput_ioctl")); 12576 iocp = (IOCP)mp->b_rptr; 12577 mp1 = mp->b_cont; 12578 if (mp1 == NULL) { 12579 iocp->ioc_error = EINVAL; 12580 mp->b_datap->db_type = M_IOCNAK; 12581 iocp->ioc_count = 0; 12582 qreply(q, mp); 12583 return; 12584 } 12585 12586 /* 12587 * These IOCTLs provide various control capabilities to 12588 * upstream agents such as ULPs and processes. There 12589 * are currently two such IOCTLs implemented. They 12590 * are used by TCP to provide update information for 12591 * existing IREs and to forcibly delete an IRE for a 12592 * host that is not responding, thereby forcing an 12593 * attempt at a new route. 12594 */ 12595 iocp->ioc_error = EINVAL; 12596 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 12597 goto done; 12598 12599 ipllc = (ipllc_t *)mp1->b_rptr; 12600 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 12601 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 12602 break; 12603 } 12604 /* 12605 * prefer credential from mblk over ioctl; 12606 * see ip_sioctl_copyin_setup 12607 */ 12608 cr = DB_CREDDEF(mp, iocp->ioc_cr); 12609 12610 /* 12611 * Refhold the conn in case the request gets queued up in some lookup 12612 */ 12613 ASSERT(CONN_Q(q)); 12614 connp = Q_TO_CONN(q); 12615 CONN_INC_REF(connp); 12616 if (ipft->ipft_pfi && 12617 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 12618 pullupmsg(mp1, ipft->ipft_min_size))) { 12619 error = (*ipft->ipft_pfi)(q, 12620 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? 
mp : mp1, cr); 12621 } 12622 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 12623 /* 12624 * CONN_OPER_PENDING_DONE happens in the function called 12625 * through ipft_pfi above. 12626 */ 12627 return; 12628 } 12629 12630 CONN_OPER_PENDING_DONE(connp); 12631 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 12632 freemsg(mp); 12633 return; 12634 } 12635 iocp->ioc_error = error; 12636 12637 done: 12638 mp->b_datap->db_type = M_IOCACK; 12639 if (iocp->ioc_error) 12640 iocp->ioc_count = 0; 12641 qreply(q, mp); 12642 } 12643 12644 /* 12645 * Lookup an ipif using the sequence id (ipif_seqid) 12646 */ 12647 ipif_t * 12648 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 12649 { 12650 ipif_t *ipif; 12651 12652 ASSERT(MUTEX_HELD(&ill->ill_lock)); 12653 12654 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12655 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 12656 return (ipif); 12657 } 12658 return (NULL); 12659 } 12660 12661 uint64_t ipif_g_seqid; 12662 12663 /* 12664 * Assign a unique id for the ipif. This is used later when we send 12665 * IRES to ARP for resolution where we initialize ire_ipif_seqid 12666 * to the value pointed by ire_ipif->ipif_seqid. Later when the 12667 * IRE is added, we verify that ipif has not disappeared. 12668 */ 12669 12670 static void 12671 ipif_assign_seqid(ipif_t *ipif) 12672 { 12673 ipif->ipif_seqid = atomic_add_64_nv(&ipif_g_seqid, 1); 12674 } 12675 12676 /* 12677 * Insert the ipif, so that the list of ipifs on the ill will be sorted 12678 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 12679 * be inserted into the first space available in the list. The value of 12680 * ipif_id will then be set to the appropriate value for its position. 
12681 */ 12682 static int 12683 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 12684 { 12685 ill_t *ill; 12686 ipif_t *tipif; 12687 ipif_t **tipifp; 12688 int id; 12689 12690 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 12691 IAM_WRITER_IPIF(ipif)); 12692 12693 ill = ipif->ipif_ill; 12694 ASSERT(ill != NULL); 12695 12696 /* 12697 * In the case of lo0:0 we already hold the ill_g_lock. 12698 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 12699 * ipif_insert. Another such caller is ipif_move. 12700 */ 12701 if (acquire_g_lock) 12702 rw_enter(&ill_g_lock, RW_WRITER); 12703 if (acquire_ill_lock) 12704 mutex_enter(&ill->ill_lock); 12705 id = ipif->ipif_id; 12706 tipifp = &(ill->ill_ipif); 12707 if (id == -1) { /* need to find a real id */ 12708 id = 0; 12709 while ((tipif = *tipifp) != NULL) { 12710 ASSERT(tipif->ipif_id >= id); 12711 if (tipif->ipif_id != id) 12712 break; /* non-consecutive id */ 12713 id++; 12714 tipifp = &(tipif->ipif_next); 12715 } 12716 /* limit number of logical interfaces */ 12717 if (id >= ip_addrs_per_if) { 12718 if (acquire_ill_lock) 12719 mutex_exit(&ill->ill_lock); 12720 if (acquire_g_lock) 12721 rw_exit(&ill_g_lock); 12722 return (-1); 12723 } 12724 ipif->ipif_id = id; /* assign new id */ 12725 } else if (id < ip_addrs_per_if) { 12726 /* we have a real id; insert ipif in the right place */ 12727 while ((tipif = *tipifp) != NULL) { 12728 ASSERT(tipif->ipif_id != id); 12729 if (tipif->ipif_id > id) 12730 break; /* found correct location */ 12731 tipifp = &(tipif->ipif_next); 12732 } 12733 } else { 12734 if (acquire_ill_lock) 12735 mutex_exit(&ill->ill_lock); 12736 if (acquire_g_lock) 12737 rw_exit(&ill_g_lock); 12738 return (-1); 12739 } 12740 12741 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 12742 12743 ipif->ipif_next = tipif; 12744 *tipifp = ipif; 12745 if (acquire_ill_lock) 12746 mutex_exit(&ill->ill_lock); 12747 if (acquire_g_lock) 12748 rw_exit(&ill_g_lock); 12749 return (0); 
12750 } 12751 12752 /* 12753 * Allocate and initialize a new interface control structure. (Always 12754 * called as writer.) 12755 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 12756 * is not part of the global linked list of ills. ipif_seqid is unique 12757 * in the system and to preserve the uniqueness, it is assigned only 12758 * when ill becomes part of the global list. At that point ill will 12759 * have a name. If it doesn't get assigned here, it will get assigned 12760 * in ipif_set_values() as part of SIOCSLIFNAME processing. 12761 * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set 12762 * the interface flags or any other information from the DL_INFO_ACK for 12763 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 12764 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 12765 * second DL_INFO_ACK comes in from the driver. 12766 */ 12767 static ipif_t * 12768 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) 12769 { 12770 ipif_t *ipif; 12771 phyint_t *phyi; 12772 12773 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 12774 ill->ill_name, id, (void *)ill)); 12775 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 12776 12777 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 12778 return (NULL); 12779 *ipif = ipif_zero; /* start clean */ 12780 12781 ipif->ipif_ill = ill; 12782 ipif->ipif_id = id; /* could be -1 */ 12783 ipif->ipif_zoneid = GLOBAL_ZONEID; 12784 12785 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 12786 12787 ipif->ipif_refcnt = 0; 12788 ipif->ipif_saved_ire_cnt = 0; 12789 12790 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { 12791 mi_free(ipif); 12792 return (NULL); 12793 } 12794 /* -1 id should have been replaced by real id */ 12795 id = ipif->ipif_id; 12796 ASSERT(id >= 0); 12797 12798 if (ill->ill_name[0] != '\0') { 12799 ipif_assign_seqid(ipif); 12800 if 
(ill->ill_phyint->phyint_ifindex != 0) 12801 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 12802 } 12803 /* 12804 * Keep a copy of original id in ipif_orig_ipifid. Failback 12805 * will attempt to restore the original id. The SIOCSLIFOINDEX 12806 * ioctl sets ipif_orig_ipifid to zero. 12807 */ 12808 ipif->ipif_orig_ipifid = id; 12809 12810 /* 12811 * We grab the ill_lock and phyint_lock to protect the flag changes. 12812 * The ipif is still not up and can't be looked up until the 12813 * ioctl completes and the IPIF_CHANGING flag is cleared. 12814 */ 12815 mutex_enter(&ill->ill_lock); 12816 mutex_enter(&ill->ill_phyint->phyint_lock); 12817 /* 12818 * Set the running flag when logical interface zero is created. 12819 * For subsequent logical interfaces, a DLPI link down 12820 * notification message may have cleared the running flag to 12821 * indicate the link is down, so we shouldn't just blindly set it. 12822 */ 12823 if (id == 0) 12824 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 12825 ipif->ipif_ire_type = ire_type; 12826 phyi = ill->ill_phyint; 12827 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 12828 12829 if (ipif->ipif_isv6) { 12830 ill->ill_flags |= ILLF_IPV6; 12831 } else { 12832 ipaddr_t inaddr_any = INADDR_ANY; 12833 12834 ill->ill_flags |= ILLF_IPV4; 12835 12836 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 12837 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12838 &ipif->ipif_v6lcl_addr); 12839 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12840 &ipif->ipif_v6src_addr); 12841 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12842 &ipif->ipif_v6subnet); 12843 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12844 &ipif->ipif_v6net_mask); 12845 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12846 &ipif->ipif_v6brd_addr); 12847 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12848 &ipif->ipif_v6pp_dst_addr); 12849 } 12850 12851 /* 12852 * Don't set the interface flags etc. now, will do it in 12853 * ip_ll_subnet_defaults. 
12854 */ 12855 if (!initialize) { 12856 mutex_exit(&ill->ill_lock); 12857 mutex_exit(&ill->ill_phyint->phyint_lock); 12858 return (ipif); 12859 } 12860 ipif->ipif_mtu = ill->ill_max_mtu; 12861 12862 if (ill->ill_bcast_addr_length != 0) { 12863 /* 12864 * Later detect lack of DLPI driver multicast 12865 * capability by catching DL_ENABMULTI errors in 12866 * ip_rput_dlpi. 12867 */ 12868 ill->ill_flags |= ILLF_MULTICAST; 12869 if (!ipif->ipif_isv6) 12870 ipif->ipif_flags |= IPIF_BROADCAST; 12871 } else { 12872 if (ill->ill_net_type != IRE_LOOPBACK) { 12873 if (ipif->ipif_isv6) 12874 /* 12875 * Note: xresolv interfaces will eventually need 12876 * NOARP set here as well, but that will require 12877 * those external resolvers to have some 12878 * knowledge of that flag and act appropriately. 12879 * Not to be changed at present. 12880 */ 12881 ill->ill_flags |= ILLF_NONUD; 12882 else 12883 ill->ill_flags |= ILLF_NOARP; 12884 } 12885 if (ill->ill_phys_addr_length == 0) { 12886 if (ill->ill_media && 12887 ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 12888 ipif->ipif_flags |= IPIF_NOXMIT; 12889 phyi->phyint_flags |= PHYI_VIRTUAL; 12890 } else { 12891 /* pt-pt supports multicast. */ 12892 ill->ill_flags |= ILLF_MULTICAST; 12893 if (ill->ill_net_type == IRE_LOOPBACK) { 12894 phyi->phyint_flags |= 12895 (PHYI_LOOPBACK | PHYI_VIRTUAL); 12896 } else { 12897 ipif->ipif_flags |= IPIF_POINTOPOINT; 12898 } 12899 } 12900 } 12901 } 12902 mutex_exit(&ill->ill_lock); 12903 mutex_exit(&ill->ill_phyint->phyint_lock); 12904 return (ipif); 12905 } 12906 12907 /* 12908 * If appropriate, send a message up to the resolver delete the entry 12909 * for the address of this interface which is going out of business. 12910 * (Always called as writer). 12911 * 12912 * NOTE : We need to check for NULL mps as some of the fields are 12913 * initialized only for some interface types. See ipif_resolver_up() 12914 * for details. 
12915 */ 12916 void 12917 ipif_arp_down(ipif_t *ipif) 12918 { 12919 mblk_t *mp; 12920 12921 ip1dbg(("ipif_arp_down(%s:%u)\n", 12922 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12923 ASSERT(IAM_WRITER_IPIF(ipif)); 12924 12925 /* Delete the mapping for the local address */ 12926 mp = ipif->ipif_arp_del_mp; 12927 if (mp != NULL) { 12928 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 12929 dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 12930 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12931 putnext(ipif->ipif_ill->ill_rq, mp); 12932 ipif->ipif_arp_del_mp = NULL; 12933 } 12934 12935 /* 12936 * If this is the last ipif that is going down, we need 12937 * to clean up ARP completely. 12938 */ 12939 if (ipif->ipif_ill->ill_ipif_up_count == 0) { 12940 12941 /* Send up AR_INTERFACE_DOWN message */ 12942 mp = ipif->ipif_ill->ill_arp_down_mp; 12943 if (mp != NULL) { 12944 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 12945 dlpi_prim_str(*(int *)mp->b_rptr), 12946 *(int *)mp->b_rptr, ipif->ipif_ill->ill_name, 12947 ipif->ipif_id)); 12948 putnext(ipif->ipif_ill->ill_rq, mp); 12949 ipif->ipif_ill->ill_arp_down_mp = NULL; 12950 } 12951 12952 /* Tell ARP to delete the multicast mappings */ 12953 mp = ipif->ipif_ill->ill_arp_del_mapping_mp; 12954 if (mp != NULL) { 12955 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 12956 dlpi_prim_str(*(int *)mp->b_rptr), 12957 *(int *)mp->b_rptr, ipif->ipif_ill->ill_name, 12958 ipif->ipif_id)); 12959 putnext(ipif->ipif_ill->ill_rq, mp); 12960 ipif->ipif_ill->ill_arp_del_mapping_mp = NULL; 12961 } 12962 } 12963 } 12964 12965 /* 12966 * This function sets up the multicast mappings in ARP. When ipif_resolver_up 12967 * calls this function, it passes a non-NULL arp_add_mapping_mp indicating 12968 * that it wants the add_mp allocated in this function to be returned 12969 * wihtout sending it to arp. When ip_rput_dlpi_writer calls this to 12970 * just re-do the multicast, it wants us to send the add_mp to ARP also. 
12971 * ipif_resolver_up does not want us to do the "add" i.e sending to ARP, 12972 * as it does a ipif_arp_down after calling this function - which will 12973 * remove what we add here. 12974 * 12975 * Returns -1 on failures and 0 on success. 12976 */ 12977 int 12978 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) 12979 { 12980 mblk_t *del_mp = NULL; 12981 mblk_t *add_mp = NULL; 12982 mblk_t *mp; 12983 ill_t *ill = ipif->ipif_ill; 12984 phyint_t *phyi = ill->ill_phyint; 12985 ipaddr_t addr, mask, extract_mask = 0; 12986 arma_t *arma; 12987 uint8_t *maddr, *bphys_addr; 12988 uint32_t hw_start; 12989 dl_unitdata_req_t *dlur; 12990 12991 ASSERT(IAM_WRITER_IPIF(ipif)); 12992 if (ipif->ipif_flags & IPIF_POINTOPOINT) 12993 return (0); 12994 12995 /* 12996 * Delete the existing mapping from ARP. Normally ipif_down 12997 * -> ipif_arp_down should send this up to ARP. The only 12998 * reason we would find this when we are switching from 12999 * Multicast to Broadcast where we did not do a down. 13000 */ 13001 mp = ill->ill_arp_del_mapping_mp; 13002 if (mp != NULL) { 13003 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 13004 dlpi_prim_str(*(int *)mp->b_rptr), 13005 *(int *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13006 putnext(ill->ill_rq, mp); 13007 ill->ill_arp_del_mapping_mp = NULL; 13008 } 13009 13010 if (arp_add_mapping_mp != NULL) 13011 *arp_add_mapping_mp = NULL; 13012 13013 /* 13014 * Check that the address is not to long for the constant 13015 * length reserved in the template arma_t. 
13016 */ 13017 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) 13018 return (-1); 13019 13020 /* Add mapping mblk */ 13021 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); 13022 mask = (ipaddr_t)htonl(IN_CLASSD_NET); 13023 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, 13024 (caddr_t)&addr); 13025 if (add_mp == NULL) 13026 return (-1); 13027 arma = (arma_t *)add_mp->b_rptr; 13028 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; 13029 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); 13030 arma->arma_hw_addr_length = ill->ill_phys_addr_length; 13031 13032 /* 13033 * Determine the broadcast address. 13034 */ 13035 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 13036 if (ill->ill_sap_length < 0) 13037 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 13038 else 13039 bphys_addr = (uchar_t *)dlur + 13040 dlur->dl_dest_addr_offset + ill->ill_sap_length; 13041 /* 13042 * Check PHYI_MULTI_BCAST and length of physical 13043 * address to determine if we use the mapping or the 13044 * broadcast address. 13045 */ 13046 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 13047 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 13048 bphys_addr, maddr, &hw_start, &extract_mask)) 13049 phyi->phyint_flags |= PHYI_MULTI_BCAST; 13050 13051 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 13052 (ill->ill_flags & ILLF_MULTICAST)) { 13053 /* Make sure this will not match the "exact" entry. 
*/ 13054 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13055 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13056 (caddr_t)&addr); 13057 if (del_mp == NULL) { 13058 freemsg(add_mp); 13059 return (-1); 13060 } 13061 bcopy(&extract_mask, (char *)arma + 13062 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13063 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13064 /* Use link-layer broadcast address for MULTI_BCAST */ 13065 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13066 ip2dbg(("ipif_arp_setup_multicast: adding" 13067 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13068 } else { 13069 arma->arma_hw_mapping_start = hw_start; 13070 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13071 " ARP setup for %s\n", ill->ill_name)); 13072 } 13073 } else { 13074 freemsg(add_mp); 13075 ASSERT(del_mp == NULL); 13076 /* It is neither MULTICAST nor MULTI_BCAST */ 13077 return (0); 13078 } 13079 ASSERT(add_mp != NULL && del_mp != NULL); 13080 ill->ill_arp_del_mapping_mp = del_mp; 13081 if (arp_add_mapping_mp != NULL) { 13082 /* The caller just wants the mblks allocated */ 13083 *arp_add_mapping_mp = add_mp; 13084 } else { 13085 /* The caller wants us to send it to arp */ 13086 putnext(ill->ill_rq, add_mp); 13087 } 13088 return (0); 13089 } 13090 13091 /* 13092 * Get the resolver set up for a new interface address. 13093 * (Always called as writer.) 13094 * Called both for IPv4 and IPv6 interfaces, 13095 * though it only sets up the resolver for v6 13096 * if it's an xresolv interface (one using an external resolver). 13097 * Honors ILLF_NOARP. 13098 * The boolean value arp_just_publish, if B_TRUE, indicates that 13099 * it only needs to send an AR_ENTRY_ADD message up to ARP for 13100 * IPv4 interfaces. Currently, B_TRUE is only set when this 13101 * function is called by ip_rput_dlpi_writer() to handle 13102 * asynchronous hardware address change notification. 13103 * Returns error on failure. 
*/
int
ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish)
{
	caddr_t	addr;
	mblk_t	*arp_up_mp = NULL;
	mblk_t	*arp_down_mp = NULL;
	mblk_t	*arp_add_mp = NULL;
	mblk_t	*arp_del_mp = NULL;
	mblk_t	*arp_add_mapping_mp = NULL;
	/* Never assigned in this function, so its "done" branch is dead. */
	mblk_t	*arp_del_mapping_mp = NULL;
	ill_t	*ill = ipif->ipif_ill;
	uchar_t	*area_p = NULL;
	uchar_t	*ared_p = NULL;
	/* Default error for the allocation failures that "goto failed". */
	int	err = ENOMEM;

	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id,
	    (uint_t)ipif->ipif_flags));
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Nothing to do unless this is a resolver ill (v4, or v6 xresolv). */
	if ((ill->ill_net_type != IRE_IF_RESOLVER) ||
	    (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))) {
		return (0);
	}

	/*
	 * Pick the address and the add/delete templates.  area_p stays
	 * NULL (and addr stays unset) when the local address is unspecified.
	 */
	if (ill->ill_isv6) {
		/*
		 * External resolver for IPv6
		 */
		ASSERT(!arp_just_publish);
		if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
			addr = (caddr_t)&ipif->ipif_v6lcl_addr;
			area_p = (uchar_t *)&ip6_area_template;
			ared_p = (uchar_t *)&ip6_ared_template;
		}
	} else {
		/*
		 * IPv4 arp case. If the ARP stream has already started
		 * closing, fail this request for ARP bringup. Else
		 * record the fact that an ARP bringup is pending.
		 */
		mutex_enter(&ill->ill_lock);
		if (ill->ill_arp_closing) {
			mutex_exit(&ill->ill_lock);
			err = EINVAL;
			goto failed;
		} else {
			if (ill->ill_ipif_up_count == 0)
				ill->ill_arp_bringup_pending = 1;
			mutex_exit(&ill->ill_lock);
		}
		if (ipif->ipif_lcl_addr != INADDR_ANY) {
			addr = (caddr_t)&ipif->ipif_lcl_addr;
			area_p = (uchar_t *)&ip_area_template;
			ared_p = (uchar_t *)&ip_ared_template;
		}
	}

	/*
	 * Add an entry for the local address in ARP only if it
	 * is not UNNUMBERED and the address is not INADDR_ANY.
	 */
	if (((ipif->ipif_flags & IPIF_UNNUMBERED) == 0) && area_p != NULL) {
		/* Now ask ARP to publish our address. */
		arp_add_mp = ill_arp_alloc(ill, area_p, addr);
		if (arp_add_mp == NULL)
			goto failed;
		if (arp_just_publish) {
			/*
			 * Copy the new hardware address and length into
			 * arp_add_mp to be sent to ARP.
			 */
			area_t *area = (area_t *)arp_add_mp->b_rptr;
			area->area_hw_addr_length =
			    ill->ill_phys_addr_length;
			bcopy((char *)ill->ill_phys_addr,
			    ((char *)area + area->area_hw_addr_offset),
			    area->area_hw_addr_length);
		}

		((area_t *)arp_add_mp->b_rptr)->area_flags =
		    ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR;

		if (arp_just_publish)
			goto arp_setup_multicast;

		/*
		 * Allocate an ARP deletion message so we know we can tell ARP
		 * when the interface goes down.
		 */
		arp_del_mp = ill_arp_alloc(ill, ared_p, addr);
		if (arp_del_mp == NULL)
			goto failed;

	} else {
		if (arp_just_publish)
			goto done;
	}
	/*
	 * Need to bring up ARP or setup multicast mapping only
	 * when the first interface is coming UP.
	 */
	if (ill->ill_ipif_up_count != 0)
		goto done;

	/*
	 * Allocate an ARP down message (to be saved) and an ARP up
	 * message.
	 */
	arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
	if (arp_down_mp == NULL)
		goto failed;

	arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0);
	if (arp_up_mp == NULL)
		goto failed;

	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		goto done;

arp_setup_multicast:
	/*
	 * Setup the multicast mappings. This function initializes
	 * ill_arp_del_mapping_mp also. This does not need to be done for
	 * IPv6.
	 */
	if (!ill->ill_isv6) {
		/*
		 * On failure err carries ipif_arp_setup_multicast()'s return
		 * value and is returned unchanged from the "failed" label.
		 */
		err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
		if (err != 0)
			goto failed;
		ASSERT(ill->ill_arp_del_mapping_mp != NULL);
		ASSERT(arp_add_mapping_mp != NULL);
	}

done:;
	/*
	 * Everything needed has been allocated.  First stash the messages
	 * that are kept for later, then send the up/add messages onward.
	 */
	if (arp_del_mp != NULL) {
		ASSERT(ipif->ipif_arp_del_mp == NULL);
		ipif->ipif_arp_del_mp = arp_del_mp;
	}
	if (arp_down_mp != NULL) {
		ASSERT(ill->ill_arp_down_mp == NULL);
		ill->ill_arp_down_mp = arp_down_mp;
	}
	if (arp_del_mapping_mp != NULL) {
		ASSERT(ill->ill_arp_del_mapping_mp == NULL);
		ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
	}
	if (arp_up_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
		    ipif->ipif_ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_up_mp);
	}
	if (arp_add_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
		    ipif->ipif_ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_add_mp);
	}
	if (arp_add_mapping_mp != NULL) {
		ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
		    ipif->ipif_ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_add_mapping_mp);
	}
	if (arp_just_publish)
		return (0);

	if (ill->ill_flags & ILLF_NOARP)
		err = ill_arp_off(ill);
	else
		err = ill_arp_on(ill);
	if (err) {
		/*
		 * The up/add messages have already been sent; only the
		 * saved messages are released here.
		 * NOTE(review): ill_arp_down_mp is set to NULL below even
		 * when this call did not allocate it (arp_down_mp == NULL)
		 * -- confirm no earlier saved message can be pending here.
		 */
		ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err));
		freemsg(ipif->ipif_arp_del_mp);
		if (arp_down_mp != NULL)
			freemsg(ill->ill_arp_down_mp);
		if (ill->ill_arp_del_mapping_mp != NULL)
			freemsg(ill->ill_arp_del_mapping_mp);
		ipif->ipif_arp_del_mp = NULL;
		ill->ill_arp_down_mp = NULL;
		ill->ill_arp_del_mapping_mp = NULL;
		return (err);
	}
	/* EINPROGRESS when no ipif on this ill is counted as up yet. */
	return (ill->ill_ipif_up_count != 0 ? 0 : EINPROGRESS);

failed:;
	/* Nothing has been sent or saved yet; free whatever was allocated. */
	ip1dbg(("ipif_resolver_up: FAILED\n"));
	freemsg(arp_add_mp);
	freemsg(arp_del_mp);
	freemsg(arp_add_mapping_mp);
	freemsg(arp_up_mp);
	freemsg(arp_down_mp);
	ill->ill_arp_bringup_pending = 0;
	return (err);
}

/*
 * Wakeup all threads waiting to enter the ipsq, and sleeping
 * on any of the ills in this ipsq. The ill_lock of the ill
 * must be held so that waiters don't miss wakeups
 *
 * If caller_holds_lock is false, each ill_lock is taken and dropped
 * here around the cv_broadcast; otherwise the caller must already
 * hold it (asserted).
 */
static void
ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
{
	phyint_t *phyint;

	/* Walk every phyint on the ipsq and signal its v4 and v6 ills. */
	phyint = ipsq->ipsq_phyint_list;
	while (phyint != NULL) {
		if (phyint->phyint_illv4) {
			if (!caller_holds_lock)
				mutex_enter(&phyint->phyint_illv4->ill_lock);
			ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
			cv_broadcast(&phyint->phyint_illv4->ill_cv);
			if (!caller_holds_lock)
				mutex_exit(&phyint->phyint_illv4->ill_lock);
		}
		if (phyint->phyint_illv6) {
			if (!caller_holds_lock)
				mutex_enter(&phyint->phyint_illv6->ill_lock);
			ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
			cv_broadcast(&phyint->phyint_illv6->ill_cv);
			if (!caller_holds_lock)
				mutex_exit(&phyint->phyint_illv6->ill_lock);
		}
		phyint = phyint->phyint_ipsq_next;
	}
}

/*
 * Allocate a zeroed ipsq, mark it IPSQ_GROUP and push it on the head of
 * the global ipsq_g_head list.  The name is copied from groupname, or
 * left empty when groupname is NULL.  The caller must hold ill_g_lock
 * as writer (asserted).  Returns NULL if the KM_NOSLEEP allocation fails.
 */
static ipsq_t *
ipsq_create(char *groupname)
{
	ipsq_t	*ipsq;

	ASSERT(RW_WRITE_HELD(&ill_g_lock));
	ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
	if (ipsq == NULL) {
		return (NULL);
	}

	/*
	 * NOTE(review): unbounded copy -- assumes groupname always fits in
	 * ipsq_name; confirm the callers enforce the length.
	 */
	if (groupname != NULL)
		(void) strcpy(ipsq->ipsq_name, groupname);
	else
		ipsq->ipsq_name[0] = '\0';

	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
	ipsq->ipsq_flags |= IPSQ_GROUP;
	ipsq->ipsq_next = ipsq_g_head;
	ipsq_g_head = ipsq;
	return (ipsq);
}

/*
 * Return an
ipsq corresponding to the groupname. If 'create' is true
 * allocate a new ipsq if one does not exist. Usually an ipsq is associated
 * uniquely with an IPMP group. However during IPMP groupname operations,
 * multiple IPMP groups may be associated with a single ipsq. But no
 * IPMP group can be associated with more than 1 ipsq at any time.
 * For example
 *	Interfaces		IPMP grpname	ipsq	ipsq_name	ipsq_refs
 *	hme1, hme2		mpk17-84	ipsq1	mpk17-84	2
 *	hme3, hme4		mpk17-85	ipsq2	mpk17-85	2
 *
 * Now the command ifconfig hme3 group mpk17-84 results in the temporary
 * status shown below during the execution of the above command.
 *	hme1, hme2, hme3, hme4	mpk17-84, mpk17-85	ipsq1	mpk17-84  4
 *
 * After the completion of the above groupname command we return to the stable
 * state shown below.
 *	hme1, hme2, hme3	mpk17-84	ipsq1	mpk17-84	3
 *	hme4			mpk17-85	ipsq2	mpk17-85	1
 *
 * Because of the above, we don't search based on the ipsq_name since that
 * would miss the correct ipsq during certain windows as shown above.
 * The ipsq_name is only used during split of an ipsq to return the ipsq to its
 * natural state.
 *
 * 'exclude_ipsq', if it is on the list, is skipped during the search.
 * Returns NULL when nothing matches and either 'create' is false or
 * ipsq_create() fails.
 */
static ipsq_t *
ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq)
{
	ipsq_t	*ipsq;
	int	group_len;
	phyint_t *phyint;

	ASSERT(RW_LOCK_HELD(&ill_g_lock));

	/* group_len counts the terminating NUL from here on. */
	group_len = strlen(groupname);
	ASSERT(group_len != 0);
	group_len++;

	for (ipsq = ipsq_g_head; ipsq != NULL; ipsq = ipsq->ipsq_next) {
		/*
		 * When an ipsq is being split, and ill_split_ipsq
		 * calls this function, we exclude it from being considered.
		 */
		if (ipsq == exclude_ipsq)
			continue;

		/*
		 * Compare against the ipsq_name. The groupname change happens
		 * in 2 phases. The 1st phase merges the from group into
		 * the to group's ipsq, by calling ill_merge_groups and restarts
		 * the ioctl. The 2nd phase then locates the ipsq again thru
		 * ipsq_name. At this point the phyint_groupname has not been
		 * updated.
		 */
		if ((group_len == strlen(ipsq->ipsq_name) + 1) &&
		    (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) {
			/*
			 * Verify that an ipmp groupname is exactly
			 * part of 1 ipsq and is not found in any other
			 * ipsq.
			 */
			ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) ==
			    NULL);
			return (ipsq);
		}

		/*
		 * Comparison against ipsq_name alone is not sufficient.
		 * In the case when groups are currently being
		 * merged, the ipsq could hold other IPMP groups temporarily.
		 * so we walk the phyint list and compare against the
		 * phyint_groupname as well.
		 */
		phyint = ipsq->ipsq_phyint_list;
		while (phyint != NULL) {
			if ((group_len == phyint->phyint_groupname_len) &&
			    (bcmp(phyint->phyint_groupname, groupname,
			    group_len) == 0)) {
				/*
				 * Verify that an ipmp groupname is exactly
				 * part of 1 ipsq and is not found in any other
				 * ipsq.
				 */
				ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq)
					== NULL);
				return (ipsq);
			}
			phyint = phyint->phyint_ipsq_next;
		}
	}
	/* No match: ipsq is NULL here unless we are asked to create one. */
	if (create)
		ipsq = ipsq_create(groupname);
	return (ipsq);
}

/*
 * Free an ipsq whose reference count has dropped to zero.  An ipsq
 * flagged IPSQ_GROUP is first unlinked from the global ipsq_g_head list
 * while holding ill_g_lock as writer; any other ipsq is simply freed.
 */
static void
ipsq_delete(ipsq_t *ipsq)
{
	ipsq_t *nipsq;
	ipsq_t *pipsq = NULL;

	/*
	 * We don't hold the ipsq lock, but we are sure no new
	 * messages can land up, since the ipsq_refs is zero.
	 * i.e. this ipsq is unnamed and no phyint or phyint group
	 * is associated with this ipsq. (Lookups are based on ill_name
	 * or phyint_group_name)
	 */
	ASSERT(ipsq->ipsq_refs == 0);
	ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL);
	ASSERT(ipsq->ipsq_pending_mp == NULL);
	if (!(ipsq->ipsq_flags & IPSQ_GROUP)) {
		/*
		 * This is not the ipsq of an IPMP group.
		 */
		kmem_free(ipsq, sizeof (ipsq_t));
		return;
	}

	rw_enter(&ill_g_lock, RW_WRITER);

	/*
	 * Locate the ipsq before we can remove it from
	 * the singly linked list of ipsq's.
	 */
	for (nipsq = ipsq_g_head; nipsq != NULL; nipsq = nipsq->ipsq_next) {
		if (nipsq == ipsq) {
			break;
		}
		pipsq = nipsq;
	}

	ASSERT(nipsq == ipsq);

	/* unlink ipsq from the list */
	if (pipsq != NULL)
		pipsq->ipsq_next = ipsq->ipsq_next;
	else
		ipsq_g_head = ipsq->ipsq_next;
	kmem_free(ipsq, sizeof (ipsq_t));
	rw_exit(&ill_g_lock);
}

/*
 * Queue current_mp on new_ipsq (to be processed by ip_process_ioctl as a
 * NEW_OP) and then append old_ipsq's ipsq_xopq message list behind it,
 * leaving old_ipsq's list empty.  The caller must hold
 * new_ipsq->ipsq_lock, and old_ipsq must have no queued or pending
 * messages other than its xopq list (both asserted).
 */
static void
ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp,
    queue_t *q)

{

	ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock));
	ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL);
	ASSERT(old_ipsq->ipsq_pending_ipif == NULL);
	ASSERT(old_ipsq->ipsq_pending_mp == NULL);
	ASSERT(current_mp != NULL);

	ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl,
		NEW_OP, NULL);

	ASSERT(new_ipsq->ipsq_xopq_mptail != NULL &&
	    new_ipsq->ipsq_xopq_mphead != NULL);

	/*
	 * move from old ipsq to the new ipsq.
	 */
	new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead;
	/* The tail only moves if the old list actually had entries. */
	if (old_ipsq->ipsq_xopq_mphead != NULL)
		new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail;

	old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL;
}

/*
 * For both the IPv4 and IPv6 ill (whichever exist) of this ill's phyint:
 * apply IPIF_UNMARK_MOVING to every ipif and reset ill_up_ipifs to
 * B_FALSE, all under that ill's ill_lock.
 */
void
ill_group_cleanup(ill_t *ill)
{
	ill_t *ill_v4;
	ill_t *ill_v6;
	ipif_t *ipif;

	ill_v4 = ill->ill_phyint->phyint_illv4;
	ill_v6 = ill->ill_phyint->phyint_illv6;

	if (ill_v4 != NULL) {
		mutex_enter(&ill_v4->ill_lock);
		for (ipif = ill_v4->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			IPIF_UNMARK_MOVING(ipif);
		}
		ill_v4->ill_up_ipifs = B_FALSE;
		mutex_exit(&ill_v4->ill_lock);
	}

	if (ill_v6 != NULL) {
		mutex_enter(&ill_v6->ill_lock);
		for (ipif = ill_v6->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			IPIF_UNMARK_MOVING(ipif);
		}
		ill_v6->ill_up_ipifs = B_FALSE;
		mutex_exit(&ill_v6->ill_lock);
	}
}
/*
 * This function is called when an ill has had a change in its group status
 * to bring up all the ipifs that were up before the change.
 *
 * Both the v4 and v6 ill of the phyint are processed.  Returns 0 when
 * every ipif has been handled, or the non-zero value from ipif_up()
 * (asserted to be EINPROGRESS) as soon as one is returned; in that case
 * the remaining ipifs and the ILL_CHANGING / move cleanup are not
 * processed by this call.
 */
int
ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
	ipif_t *ipif;
	ill_t *ill_v4;
	ill_t *ill_v6;
	ill_t *from_ill;
	int err = 0;


	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Except for ipif_state_flags and ill_state_flags the other
	 * fields of the ipif/ill that are modified below are protected
	 * implicitly since we are a writer. We would have tried to down
	 * even an ipif that was already down, in ill_down_ipifs. So we
	 * just blindly clear the IPIF_CHANGING flag here on all ipifs.
	 */
	ill_v4 = ill->ill_phyint->phyint_illv4;
	ill_v6 = ill->ill_phyint->phyint_illv6;
	if (ill_v4 != NULL) {
		ill_v4->ill_up_ipifs = B_TRUE;
		for (ipif = ill_v4->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			mutex_enter(&ill_v4->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			IPIF_UNMARK_MOVING(ipif);
			mutex_exit(&ill_v4->ill_lock);
			if (ipif->ipif_was_up) {
				if (!(ipif->ipif_flags & IPIF_UP))
					err = ipif_up(ipif, q, mp);
				ipif->ipif_was_up = B_FALSE;
				if (err != 0) {
					/*
					 * Can there be any other error ?
					 */
					ASSERT(err == EINPROGRESS);
					return (err);
				}
			}
		}
		mutex_enter(&ill_v4->ill_lock);
		ill_v4->ill_state_flags &= ~ILL_CHANGING;
		mutex_exit(&ill_v4->ill_lock);
		ill_v4->ill_up_ipifs = B_FALSE;
		/* Finish a pending move: clear state on both ends of it. */
		if (ill_v4->ill_move_in_progress) {
			ASSERT(ill_v4->ill_move_peer != NULL);
			ill_v4->ill_move_in_progress = B_FALSE;
			from_ill = ill_v4->ill_move_peer;
			from_ill->ill_move_in_progress = B_FALSE;
			from_ill->ill_move_peer = NULL;
			mutex_enter(&from_ill->ill_lock);
			from_ill->ill_state_flags &= ~ILL_CHANGING;
			mutex_exit(&from_ill->ill_lock);
			/*
			 * With a v6 ill present, the standby check is left
			 * to the v6 pass below.
			 */
			if (ill_v6 == NULL) {
				if (from_ill->ill_phyint->phyint_flags &
				    PHYI_STANDBY) {
					phyint_inactive(from_ill->ill_phyint);
				}
				if (ill_v4->ill_phyint->phyint_flags &
				    PHYI_STANDBY) {
					phyint_inactive(ill_v4->ill_phyint);
				}
			}
			ill_v4->ill_move_peer = NULL;
		}
	}

	if (ill_v6 != NULL) {
		ill_v6->ill_up_ipifs = B_TRUE;
		for (ipif = ill_v6->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			mutex_enter(&ill_v6->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			IPIF_UNMARK_MOVING(ipif);
			mutex_exit(&ill_v6->ill_lock);
			if (ipif->ipif_was_up) {
				if (!(ipif->ipif_flags & IPIF_UP))
					err = ipif_up(ipif, q, mp);
				ipif->ipif_was_up = B_FALSE;
				if (err != 0) {
					/*
					 * Can there be any other error ?
					 */
					ASSERT(err == EINPROGRESS);
					return (err);
				}
			}
		}
		mutex_enter(&ill_v6->ill_lock);
		ill_v6->ill_state_flags &= ~ILL_CHANGING;
		mutex_exit(&ill_v6->ill_lock);
		ill_v6->ill_up_ipifs = B_FALSE;
		if (ill_v6->ill_move_in_progress) {
			ASSERT(ill_v6->ill_move_peer != NULL);
			ill_v6->ill_move_in_progress = B_FALSE;
			from_ill = ill_v6->ill_move_peer;
			from_ill->ill_move_in_progress = B_FALSE;
			from_ill->ill_move_peer = NULL;
			mutex_enter(&from_ill->ill_lock);
			from_ill->ill_state_flags &= ~ILL_CHANGING;
			mutex_exit(&from_ill->ill_lock);
			if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
				phyint_inactive(from_ill->ill_phyint);
			}
			if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
				phyint_inactive(ill_v6->ill_phyint);
			}
			ill_v6->ill_move_peer = NULL;
		}
	}
	return (0);
}

/*
 * Bring down all the appropriate ipifs.
 *
 * An ipif is selected when 'index' is 0 or equals its ipif_orig_ifindex;
 * with chk_nofailover set, ipifs flagged IPIF_NOFAILOVER are skipped and
 * the selected ones are additionally marked IPIF_MOVING.  'mp' is unused.
 */
/* ARGSUSED */
static void
ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover)
{
	ipif_t *ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Except for ipif_state_flags the other fields of the ipif/ill that
	 * are modified below are protected implicitly since we are a writer
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER))
			continue;
		if (index == 0 || index == ipif->ipif_orig_ifindex) {
			/*
			 * We go through the ipif_down logic even if the ipif
			 * is already down, since routes can be added based
			 * on down ipifs. Going through ipif_down once again
			 * will delete any IREs created based on these routes.
			 */
			/* Remembered so ill_up_ipifs() can bring it back up. */
			if (ipif->ipif_flags & IPIF_UP)
				ipif->ipif_was_up = B_TRUE;
			/*
			 * If called with chk_nofailover true ipif is moving.
			 */
			mutex_enter(&ill->ill_lock);
			if (chk_nofailover) {
				ipif->ipif_state_flags |=
					IPIF_MOVING | IPIF_CHANGING;
			} else {
				ipif->ipif_state_flags |= IPIF_CHANGING;
			}
			mutex_exit(&ill->ill_lock);
			/*
			 * Need to re-create net/subnet bcast ires if
			 * they are dependent on ipif.
			 */
			if (!ipif->ipif_isv6)
				ipif_check_bcast_ires(ipif);
			(void) ipif_logical_down(ipif, NULL, NULL);
			ipif_down_tail(ipif);
			/*
			 * We don't do ipif_multicast_down for IPv4 in
			 * ipif_down. We need to set this so that
			 * ipif_multicast_up will join the
			 * ALLHOSTS_GROUP on to_ill.
			 */
			ipif->ipif_multicast_up = B_FALSE;
		}
	}
}

/*
 * Reference counting for an ipsq.  Both macros require ill_g_lock to be
 * held as writer (asserted).  Dropping the last reference also clears
 * ipsq_name.  Each macro expands to a brace-enclosed block, not a single
 * statement.
 */
#define	IPSQ_INC_REF(ipsq)	{			\
	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
	(ipsq)->ipsq_refs++;				\
}

#define	IPSQ_DEC_REF(ipsq)	{			\
	ASSERT(RW_WRITE_HELD(&ill_g_lock));		\
	(ipsq)->ipsq_refs--;				\
	if ((ipsq)->ipsq_refs == 0)				\
		(ipsq)->ipsq_name[0] = '\0'; 		\
}

/*
 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to
 * new_ipsq.
 */
static void
ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq)
{
	phyint_t *phyint;
	phyint_t *next_phyint;

	/*
	 * To change the ipsq of an ill, we need to hold the ill_g_lock as
	 * writer and the ill_lock of the ill in question. Also the dest
	 * ipsq can't vanish while we hold the ill_g_lock as writer.
	 */
	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	/*
	 * Detach the whole phyint list from cur_ipsq, then push each phyint
	 * onto the head of new_ipsq's list, moving one reference per phyint.
	 */
	phyint = cur_ipsq->ipsq_phyint_list;
	cur_ipsq->ipsq_phyint_list = NULL;
	while (phyint != NULL) {
		next_phyint = phyint->phyint_ipsq_next;
		IPSQ_DEC_REF(cur_ipsq);
		phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list;
		new_ipsq->ipsq_phyint_list = phyint;
		IPSQ_INC_REF(new_ipsq);
		phyint->phyint_ipsq = new_ipsq;
		phyint = next_phyint;
	}
}

/* Result codes returned by the ill_split_to_*_ipsq() functions. */
#define	SPLIT_SUCCESS		0
#define	SPLIT_NOT_NEEDED	1
#define	SPLIT_FAILED		2

/*
 * If the phyint's group name differs from cur_ipsq's name, move the
 * phyint to the ipsq found (or, unless need_retry is set, created) for
 * its own group name and return SPLIT_SUCCESS.  Returns SPLIT_FAILED
 * when no such ipsq could be found or created, and SPLIT_NOT_NEEDED when
 * the names already match.
 */
int
ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry)
{
	ipsq_t *newipsq = NULL;

	/*
	 * Assertions denote pre-requisites for changing the ipsq of
	 * a phyint
	 */
	ASSERT(RW_WRITE_HELD(&ill_g_lock));
	/*
	 * <ill-phyint> assocs can't change while ill_g_lock
	 * is held as writer. See ill_phyint_reinit()
	 */
	ASSERT(phyint->phyint_illv4 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
	ASSERT(phyint->phyint_illv6 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv6->ill_lock));

	if ((phyint->phyint_groupname_len !=
	    (strlen(cur_ipsq->ipsq_name) + 1) ||
	    bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name,
		phyint->phyint_groupname_len) != 0)) {
		/*
		 * Once we fail in creating a new ipsq due to memory shortage,
		 * don't attempt to create new ipsq again, based on another
		 * phyint, since we want all phyints belonging to an IPMP group
		 * to be in the same ipsq even in the event of mem alloc fails.
		 */
		newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry,
		    cur_ipsq);
		if (newipsq == NULL) {
			/* Memory allocation failure */
			return (SPLIT_FAILED);
		} else {
			/* ipsq_refs protected by ill_g_lock (writer) */
			IPSQ_DEC_REF(cur_ipsq);
			phyint->phyint_ipsq = newipsq;
			phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list;
			newipsq->ipsq_phyint_list = phyint;
			IPSQ_INC_REF(newipsq);
			return (SPLIT_SUCCESS);
		}
	}
	return (SPLIT_NOT_NEEDED);
}

/*
 * The ill locks of the phyint and the ill_g_lock (writer) must be held
 * to do this split
 */
static int
ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq)
{
	ipsq_t *newipsq;

	ASSERT(RW_WRITE_HELD(&ill_g_lock));
	/*
	 * <ill-phyint> assocs can't change while ill_g_lock
	 * is held as writer. See ill_phyint_reinit()
	 */

	ASSERT(phyint->phyint_illv4 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
	ASSERT(phyint->phyint_illv6 == NULL ||
	    MUTEX_HELD(&phyint->phyint_illv6->ill_lock));

	if (!ipsq_init((phyint->phyint_illv4 != NULL) ?
	    phyint->phyint_illv4: phyint->phyint_illv6)) {
		/*
		 * ipsq_init failed due to no memory
		 * caller will use the same ipsq
		 */
		return (SPLIT_FAILED);
	}

	/* ipsq_ref is protected by ill_g_lock (writer) */
	IPSQ_DEC_REF(cur_ipsq);

	/*
	 * This is a new ipsq that is unknown to the world.
13858 * So we don't need to hold ipsq_lock, 13859 */ 13860 newipsq = phyint->phyint_ipsq; 13861 newipsq->ipsq_writer = NULL; 13862 newipsq->ipsq_reentry_cnt--; 13863 ASSERT(newipsq->ipsq_reentry_cnt == 0); 13864 #ifdef ILL_DEBUG 13865 newipsq->ipsq_depth = 0; 13866 #endif 13867 13868 return (SPLIT_SUCCESS); 13869 } 13870 13871 /* 13872 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 13873 * ipsq's representing their individual groups or themselves. Return 13874 * whether split needs to be retried again later. 13875 */ 13876 static boolean_t 13877 ill_split_ipsq(ipsq_t *cur_ipsq) 13878 { 13879 phyint_t *phyint; 13880 phyint_t *next_phyint; 13881 int error; 13882 boolean_t need_retry = B_FALSE; 13883 13884 phyint = cur_ipsq->ipsq_phyint_list; 13885 cur_ipsq->ipsq_phyint_list = NULL; 13886 while (phyint != NULL) { 13887 next_phyint = phyint->phyint_ipsq_next; 13888 /* 13889 * 'created' will tell us whether the callee actually 13890 * created an ipsq. Lack of memory may force the callee 13891 * to return without creating an ipsq. 13892 */ 13893 if (phyint->phyint_groupname == NULL) { 13894 error = ill_split_to_own_ipsq(phyint, cur_ipsq); 13895 } else { 13896 error = ill_split_to_grp_ipsq(phyint, cur_ipsq, 13897 need_retry); 13898 } 13899 13900 switch (error) { 13901 case SPLIT_FAILED: 13902 need_retry = B_TRUE; 13903 /* FALLTHRU */ 13904 case SPLIT_NOT_NEEDED: 13905 /* 13906 * Keep it on the list. 13907 */ 13908 phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; 13909 cur_ipsq->ipsq_phyint_list = phyint; 13910 break; 13911 case SPLIT_SUCCESS: 13912 break; 13913 default: 13914 ASSERT(0); 13915 } 13916 13917 phyint = next_phyint; 13918 } 13919 return (need_retry); 13920 } 13921 13922 /* 13923 * given an ipsq 'ipsq' lock all ills associated with this ipsq. 13924 * and return the ills in the list. This list will be 13925 * needed to unlock all the ills later on by the caller. 
13926 * The <ill-ipsq> associations could change between the 13927 * lock and unlock. Hence the unlock can't traverse the 13928 * ipsq to get the list of ills. 13929 */ 13930 static int 13931 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) 13932 { 13933 int cnt = 0; 13934 phyint_t *phyint; 13935 13936 /* 13937 * The caller holds ill_g_lock to ensure that the ill memberships 13938 * of the ipsq don't change 13939 */ 13940 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 13941 13942 phyint = ipsq->ipsq_phyint_list; 13943 while (phyint != NULL) { 13944 if (phyint->phyint_illv4 != NULL) { 13945 ASSERT(cnt < list_max); 13946 list[cnt++] = phyint->phyint_illv4; 13947 } 13948 if (phyint->phyint_illv6 != NULL) { 13949 ASSERT(cnt < list_max); 13950 list[cnt++] = phyint->phyint_illv6; 13951 } 13952 phyint = phyint->phyint_ipsq_next; 13953 } 13954 ill_lock_ills(list, cnt); 13955 return (cnt); 13956 } 13957 13958 void 13959 ill_lock_ills(ill_t **list, int cnt) 13960 { 13961 int i; 13962 13963 if (cnt > 1) { 13964 boolean_t try_again; 13965 do { 13966 try_again = B_FALSE; 13967 for (i = 0; i < cnt - 1; i++) { 13968 if (list[i] < list[i + 1]) { 13969 ill_t *tmp; 13970 13971 /* swap the elements */ 13972 tmp = list[i]; 13973 list[i] = list[i + 1]; 13974 list[i + 1] = tmp; 13975 try_again = B_TRUE; 13976 } 13977 } 13978 } while (try_again); 13979 } 13980 13981 for (i = 0; i < cnt; i++) { 13982 if (i == 0) { 13983 if (list[i] != NULL) 13984 mutex_enter(&list[i]->ill_lock); 13985 else 13986 return; 13987 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 13988 mutex_enter(&list[i]->ill_lock); 13989 } 13990 } 13991 } 13992 13993 void 13994 ill_unlock_ills(ill_t **list, int cnt) 13995 { 13996 int i; 13997 13998 for (i = 0; i < cnt; i++) { 13999 if ((i == 0) && (list[i] != NULL)) { 14000 mutex_exit(&list[i]->ill_lock); 14001 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14002 mutex_exit(&list[i]->ill_lock); 14003 } 14004 } 14005 } 14006 14007 /* 14008 * Merge all the 
ills from 1 ipsq group into another ipsq group. 14009 * The source ipsq group is specified by the ipsq associated with 14010 * 'from_ill'. The destination ipsq group is specified by the ipsq 14011 * associated with 'to_ill' or 'groupname' respectively. 14012 * Note that ipsq itself does not have a reference count mechanism 14013 * and functions don't look up an ipsq and pass it around. Instead 14014 * functions pass around an ill or groupname, and the ipsq is looked 14015 * up from the ill or groupname and the required operation performed 14016 * atomically with the lookup on the ipsq. 14017 */ 14018 static int 14019 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, 14020 queue_t *q) 14021 { 14022 ipsq_t *old_ipsq; 14023 ipsq_t *new_ipsq; 14024 ill_t **ill_list; 14025 int cnt; 14026 size_t ill_list_size; 14027 boolean_t became_writer_on_new_sq = B_FALSE; 14028 14029 /* Exactly 1 of 'to_ill' and groupname can be specified. */ 14030 ASSERT((to_ill != NULL) ^ (groupname != NULL)); 14031 14032 /* 14033 * Need to hold ill_g_lock as writer and also the ill_lock to 14034 * change the <ill-ipsq> assoc of an ill. Need to hold the 14035 * ipsq_lock to prevent new messages from landing on an ipsq. 14036 */ 14037 rw_enter(&ill_g_lock, RW_WRITER); 14038 14039 old_ipsq = from_ill->ill_phyint->phyint_ipsq; 14040 if (groupname != NULL) 14041 new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL); 14042 else { 14043 new_ipsq = to_ill->ill_phyint->phyint_ipsq; 14044 } 14045 14046 ASSERT(old_ipsq != NULL && new_ipsq != NULL); 14047 14048 /* 14049 * both groups are on the same ipsq. 
14050 */ 14051 if (old_ipsq == new_ipsq) { 14052 rw_exit(&ill_g_lock); 14053 return (0); 14054 } 14055 14056 cnt = old_ipsq->ipsq_refs << 1; 14057 ill_list_size = cnt * sizeof (ill_t *); 14058 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 14059 if (ill_list == NULL) { 14060 rw_exit(&ill_g_lock); 14061 return (ENOMEM); 14062 } 14063 cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); 14064 14065 /* Need ipsq lock to enque messages on new ipsq or to become writer */ 14066 mutex_enter(&new_ipsq->ipsq_lock); 14067 if ((new_ipsq->ipsq_writer == NULL && 14068 new_ipsq->ipsq_current_ipif == NULL) || 14069 (new_ipsq->ipsq_writer == curthread)) { 14070 new_ipsq->ipsq_writer = curthread; 14071 new_ipsq->ipsq_reentry_cnt++; 14072 became_writer_on_new_sq = B_TRUE; 14073 } 14074 14075 /* 14076 * We are holding ill_g_lock as writer and all the ill locks of 14077 * the old ipsq. So the old_ipsq can't be looked up, and hence no new 14078 * message can land up on the old ipsq even though we don't hold the 14079 * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. 14080 */ 14081 ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); 14082 14083 /* 14084 * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. 14085 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> 14086 * assocs. till we release the ill_g_lock, and hence it can't vanish. 14087 */ 14088 ill_merge_ipsq(old_ipsq, new_ipsq); 14089 14090 /* 14091 * Mark the new ipsq as needing a split since it is currently 14092 * being shared by more than 1 IPMP group. 
The split will 14093 * occur at the end of ipsq_exit 14094 */ 14095 new_ipsq->ipsq_split = B_TRUE; 14096 14097 /* Now release all the locks */ 14098 mutex_exit(&new_ipsq->ipsq_lock); 14099 ill_unlock_ills(ill_list, cnt); 14100 rw_exit(&ill_g_lock); 14101 14102 kmem_free(ill_list, ill_list_size); 14103 14104 /* 14105 * If we succeeded in becoming writer on the new ipsq, then 14106 * drain the new ipsq and start processing all enqueued messages 14107 * including the current ioctl we are processing which is either 14108 * a set groupname or failover/failback. 14109 */ 14110 if (became_writer_on_new_sq) 14111 ipsq_exit(new_ipsq, B_TRUE, B_TRUE); 14112 14113 /* 14114 * syncq has been changed and all the messages have been moved. 14115 */ 14116 mutex_enter(&old_ipsq->ipsq_lock); 14117 old_ipsq->ipsq_current_ipif = NULL; 14118 mutex_exit(&old_ipsq->ipsq_lock); 14119 return (EINPROGRESS); 14120 } 14121 14122 /* 14123 * Delete and add the loopback copy and non-loopback copy of 14124 * the BROADCAST ire corresponding to ill and addr. Used to 14125 * group broadcast ires together when ill becomes part of 14126 * a group. 14127 * 14128 * This function is also called when ill is leaving the group 14129 * so that the ires belonging to the group gets re-grouped. 14130 */ 14131 static void 14132 ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) 14133 { 14134 ire_t *ire, *nire, *nire_next, *ire_head = NULL; 14135 ire_t **ire_ptpn = &ire_head; 14136 14137 /* 14138 * The loopback and non-loopback IREs are inserted in the order in which 14139 * they're found, on the basis that they are correctly ordered (loopback 14140 * first). 14141 */ 14142 for (;;) { 14143 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 14144 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL); 14145 if (ire == NULL) 14146 break; 14147 14148 /* 14149 * we are passing in KM_SLEEP because it is not easy to 14150 * go back to a sane state in case of memory failure. 
14151 */ 14152 nire = kmem_cache_alloc(ire_cache, KM_SLEEP); 14153 ASSERT(nire != NULL); 14154 bzero(nire, sizeof (ire_t)); 14155 /* 14156 * Don't use ire_max_frag directly since we don't 14157 * hold on to 'ire' until we add the new ire 'nire' and 14158 * we don't want the new ire to have a dangling reference 14159 * to 'ire'. The ire_max_frag of a broadcast ire must 14160 * be in sync with the ipif_mtu of the associate ipif. 14161 * For eg. this happens as a result of SIOCSLIFNAME, 14162 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by 14163 * the driver. A change in ire_max_frag triggered as 14164 * as a result of path mtu discovery, or due to an 14165 * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a 14166 * route change -mtu command does not apply to broadcast ires. 14167 * 14168 * XXX We need a recovery strategy here if ire_init fails 14169 */ 14170 if (ire_init(nire, 14171 (uchar_t *)&ire->ire_addr, 14172 (uchar_t *)&ire->ire_mask, 14173 (uchar_t *)&ire->ire_src_addr, 14174 (uchar_t *)&ire->ire_gateway_addr, 14175 (uchar_t *)&ire->ire_in_src_addr, 14176 ire->ire_stq == NULL ? &ip_loopback_mtu : 14177 &ire->ire_ipif->ipif_mtu, 14178 (ire->ire_nce != NULL ? ire->ire_nce->nce_fp_mp : NULL), 14179 ire->ire_rfq, 14180 ire->ire_stq, 14181 ire->ire_type, 14182 (ire->ire_nce != NULL? ire->ire_nce->nce_res_mp : NULL), 14183 ire->ire_ipif, 14184 ire->ire_in_ill, 14185 ire->ire_cmask, 14186 ire->ire_phandle, 14187 ire->ire_ihandle, 14188 ire->ire_flags, 14189 &ire->ire_uinfo, 14190 NULL, 14191 NULL) == NULL) { 14192 cmn_err(CE_PANIC, "ire_init() failed"); 14193 } 14194 ire_delete(ire); 14195 ire_refrele(ire); 14196 14197 /* 14198 * The newly created IREs are inserted at the tail of the list 14199 * starting with ire_head. As we've just allocated them no one 14200 * knows about them so it's safe. 
14201 */ 14202 *ire_ptpn = nire; 14203 ire_ptpn = &nire->ire_next; 14204 } 14205 14206 for (nire = ire_head; nire != NULL; nire = nire_next) { 14207 int error; 14208 ire_t *oire; 14209 /* unlink the IRE from our list before calling ire_add() */ 14210 nire_next = nire->ire_next; 14211 nire->ire_next = NULL; 14212 14213 /* ire_add adds the ire at the right place in the list */ 14214 oire = nire; 14215 error = ire_add(&nire, NULL, NULL, NULL, B_FALSE); 14216 ASSERT(error == 0); 14217 ASSERT(oire == nire); 14218 ire_refrele(nire); /* Held in ire_add */ 14219 } 14220 } 14221 14222 /* 14223 * This function is usually called when an ill is inserted in 14224 * a group and all the ipifs are already UP. As all the ipifs 14225 * are already UP, the broadcast ires have already been created 14226 * and been inserted. But, ire_add_v4 would not have grouped properly. 14227 * We need to re-group for the benefit of ip_wput_ire which 14228 * expects BROADCAST ires to be grouped properly to avoid sending 14229 * more than one copy of the broadcast packet per group. 14230 * 14231 * NOTE : We don't check for ill_ipif_up_count to be non-zero here 14232 * because when ipif_up_done ends up calling this, ires have 14233 * already been added before illgrp_insert i.e before ill_group 14234 * has been initialized. 14235 */ 14236 static void 14237 ill_group_bcast_for_xmit(ill_t *ill) 14238 { 14239 ill_group_t *illgrp; 14240 ipif_t *ipif; 14241 ipaddr_t addr; 14242 ipaddr_t net_mask; 14243 ipaddr_t subnet_netmask; 14244 14245 illgrp = ill->ill_group; 14246 14247 /* 14248 * This function is called even when an ill is deleted from 14249 * the group. Hence, illgrp could be null. 14250 */ 14251 if (illgrp != NULL && illgrp->illgrp_ill_count == 1) 14252 return; 14253 14254 /* 14255 * Delete all the BROADCAST ires matching this ill and add 14256 * them back. This time, ire_add_v4 should take care of 14257 * grouping them with others because ill is part of the 14258 * group. 
14259 */ 14260 ill_bcast_delete_and_add(ill, 0); 14261 ill_bcast_delete_and_add(ill, INADDR_BROADCAST); 14262 14263 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14264 14265 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14266 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14267 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14268 } else { 14269 net_mask = htonl(IN_CLASSA_NET); 14270 } 14271 addr = net_mask & ipif->ipif_subnet; 14272 ill_bcast_delete_and_add(ill, addr); 14273 ill_bcast_delete_and_add(ill, ~net_mask | addr); 14274 14275 subnet_netmask = ipif->ipif_net_mask; 14276 addr = ipif->ipif_subnet; 14277 ill_bcast_delete_and_add(ill, addr); 14278 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); 14279 } 14280 } 14281 14282 /* 14283 * This function is called from illgrp_delete when ill is being deleted 14284 * from the group. 14285 * 14286 * As ill is not there in the group anymore, any address belonging 14287 * to this ill should be cleared of IRE_MARK_NORECV. 14288 */ 14289 static void 14290 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) 14291 { 14292 ire_t *ire; 14293 irb_t *irb; 14294 14295 ASSERT(ill->ill_group == NULL); 14296 14297 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 14298 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL); 14299 14300 if (ire != NULL) { 14301 /* 14302 * IPMP and plumbing operations are serialized on the ipsq, so 14303 * no one will insert or delete a broadcast ire under our feet. 14304 */ 14305 irb = ire->ire_bucket; 14306 rw_enter(&irb->irb_lock, RW_READER); 14307 ire_refrele(ire); 14308 14309 for (; ire != NULL; ire = ire->ire_next) { 14310 if (ire->ire_addr != addr) 14311 break; 14312 if (ire_to_ill(ire) != ill) 14313 continue; 14314 14315 ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); 14316 ire->ire_marks &= ~IRE_MARK_NORECV; 14317 } 14318 rw_exit(&irb->irb_lock); 14319 } 14320 } 14321 14322 /* 14323 * This function must be called only after the broadcast ires 14324 * have been grouped together. 
For a given address addr, nominate 14325 * only one of the ires whose interface is not FAILED or OFFLINE. 14326 * 14327 * This is also called when an ipif goes down, so that we can nominate 14328 * a different ire with the same address for receiving. 14329 */ 14330 static void 14331 ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr) 14332 { 14333 irb_t *irb; 14334 ire_t *ire; 14335 ire_t *ire1; 14336 ire_t *save_ire; 14337 ire_t **irep = NULL; 14338 boolean_t first = B_TRUE; 14339 ire_t *clear_ire = NULL; 14340 ire_t *start_ire = NULL; 14341 ire_t *new_lb_ire; 14342 ire_t *new_nlb_ire; 14343 boolean_t new_lb_ire_used = B_FALSE; 14344 boolean_t new_nlb_ire_used = B_FALSE; 14345 uint64_t match_flags; 14346 uint64_t phyi_flags; 14347 boolean_t fallback = B_FALSE; 14348 14349 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, 14350 NULL, MATCH_IRE_TYPE); 14351 /* 14352 * We may not be able to find some ires if a previous 14353 * ire_create failed. This happens when an ipif goes 14354 * down and we are unable to create BROADCAST ires due 14355 * to memory failure. Thus, we have to check for NULL 14356 * below. This should handle the case for LOOPBACK, 14357 * POINTOPOINT and interfaces with some POINTOPOINT 14358 * logicals for which there are no BROADCAST ires. 14359 */ 14360 if (ire == NULL) 14361 return; 14362 /* 14363 * Currently IRE_BROADCASTS are deleted when an ipif 14364 * goes down which runs exclusively. Thus, setting 14365 * IRE_MARK_RCVD should not race with ire_delete marking 14366 * IRE_MARK_CONDEMNED. We grab the lock below just to 14367 * be consistent with other parts of the code that walks 14368 * a given bucket. 
14369 */ 14370 save_ire = ire; 14371 irb = ire->ire_bucket; 14372 new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 14373 if (new_lb_ire == NULL) { 14374 ire_refrele(ire); 14375 return; 14376 } 14377 new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 14378 if (new_nlb_ire == NULL) { 14379 ire_refrele(ire); 14380 kmem_cache_free(ire_cache, new_lb_ire); 14381 return; 14382 } 14383 IRB_REFHOLD(irb); 14384 rw_enter(&irb->irb_lock, RW_WRITER); 14385 /* 14386 * Get to the first ire matching the address and the 14387 * group. If the address does not match we are done 14388 * as we could not find the IRE. If the address matches 14389 * we should get to the first one matching the group. 14390 */ 14391 while (ire != NULL) { 14392 if (ire->ire_addr != addr || 14393 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 14394 break; 14395 } 14396 ire = ire->ire_next; 14397 } 14398 match_flags = PHYI_FAILED | PHYI_INACTIVE; 14399 start_ire = ire; 14400 redo: 14401 while (ire != NULL && ire->ire_addr == addr && 14402 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 14403 /* 14404 * The first ire for any address within a group 14405 * should always be the one with IRE_MARK_NORECV cleared 14406 * so that ip_wput_ire can avoid searching for one. 14407 * Note down the insertion point which will be used 14408 * later. 14409 */ 14410 if (first && (irep == NULL)) 14411 irep = ire->ire_ptpn; 14412 /* 14413 * PHYI_FAILED is set when the interface fails. 14414 * This interface might have become good, but the 14415 * daemon has not yet detected. We should still 14416 * not receive on this. PHYI_OFFLINE should never 14417 * be picked as this has been offlined and soon 14418 * be removed. 
14419 */ 14420 phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; 14421 if (phyi_flags & PHYI_OFFLINE) { 14422 ire->ire_marks |= IRE_MARK_NORECV; 14423 ire = ire->ire_next; 14424 continue; 14425 } 14426 if (phyi_flags & match_flags) { 14427 ire->ire_marks |= IRE_MARK_NORECV; 14428 ire = ire->ire_next; 14429 if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == 14430 PHYI_INACTIVE) { 14431 fallback = B_TRUE; 14432 } 14433 continue; 14434 } 14435 if (first) { 14436 /* 14437 * We will move this to the front of the list later 14438 * on. 14439 */ 14440 clear_ire = ire; 14441 ire->ire_marks &= ~IRE_MARK_NORECV; 14442 } else { 14443 ire->ire_marks |= IRE_MARK_NORECV; 14444 } 14445 first = B_FALSE; 14446 ire = ire->ire_next; 14447 } 14448 /* 14449 * If we never nominated anybody, try nominating at least 14450 * an INACTIVE, if we found one. Do it only once though. 14451 */ 14452 if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && 14453 fallback) { 14454 match_flags = PHYI_FAILED; 14455 ire = start_ire; 14456 irep = NULL; 14457 goto redo; 14458 } 14459 ire_refrele(save_ire); 14460 14461 /* 14462 * irep non-NULL indicates that we entered the while loop 14463 * above. If clear_ire is at the insertion point, we don't 14464 * have to do anything. clear_ire will be NULL if all the 14465 * interfaces are failed. 14466 * 14467 * We cannot unlink and reinsert the ire at the right place 14468 * in the list since there can be other walkers of this bucket. 14469 * Instead we delete and recreate the ire 14470 */ 14471 if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { 14472 ire_t *clear_ire_stq = NULL; 14473 mblk_t *fp_mp = NULL, *res_mp = NULL; 14474 14475 bzero(new_lb_ire, sizeof (ire_t)); 14476 if (clear_ire->ire_nce != NULL) { 14477 fp_mp = clear_ire->ire_nce->nce_fp_mp; 14478 res_mp = clear_ire->ire_nce->nce_res_mp; 14479 } 14480 /* XXX We need a recovery strategy here. 
*/ 14481 if (ire_init(new_lb_ire, 14482 (uchar_t *)&clear_ire->ire_addr, 14483 (uchar_t *)&clear_ire->ire_mask, 14484 (uchar_t *)&clear_ire->ire_src_addr, 14485 (uchar_t *)&clear_ire->ire_gateway_addr, 14486 (uchar_t *)&clear_ire->ire_in_src_addr, 14487 &clear_ire->ire_max_frag, 14488 fp_mp, 14489 clear_ire->ire_rfq, 14490 clear_ire->ire_stq, 14491 clear_ire->ire_type, 14492 res_mp, 14493 clear_ire->ire_ipif, 14494 clear_ire->ire_in_ill, 14495 clear_ire->ire_cmask, 14496 clear_ire->ire_phandle, 14497 clear_ire->ire_ihandle, 14498 clear_ire->ire_flags, 14499 &clear_ire->ire_uinfo, 14500 NULL, 14501 NULL) == NULL) 14502 cmn_err(CE_PANIC, "ire_init() failed"); 14503 if (clear_ire->ire_stq == NULL) { 14504 ire_t *ire_next = clear_ire->ire_next; 14505 if (ire_next != NULL && 14506 ire_next->ire_stq != NULL && 14507 ire_next->ire_addr == clear_ire->ire_addr && 14508 ire_next->ire_ipif->ipif_ill == 14509 clear_ire->ire_ipif->ipif_ill) { 14510 clear_ire_stq = ire_next; 14511 14512 bzero(new_nlb_ire, sizeof (ire_t)); 14513 if (clear_ire_stq->ire_nce != NULL) { 14514 fp_mp = 14515 clear_ire_stq->ire_nce->nce_fp_mp; 14516 res_mp = 14517 clear_ire_stq->ire_nce->nce_res_mp; 14518 } else { 14519 fp_mp = res_mp = NULL; 14520 } 14521 /* XXX We need a recovery strategy here. 
*/ 14522 if (ire_init(new_nlb_ire, 14523 (uchar_t *)&clear_ire_stq->ire_addr, 14524 (uchar_t *)&clear_ire_stq->ire_mask, 14525 (uchar_t *)&clear_ire_stq->ire_src_addr, 14526 (uchar_t *)&clear_ire_stq->ire_gateway_addr, 14527 (uchar_t *)&clear_ire_stq->ire_in_src_addr, 14528 &clear_ire_stq->ire_max_frag, 14529 fp_mp, 14530 clear_ire_stq->ire_rfq, 14531 clear_ire_stq->ire_stq, 14532 clear_ire_stq->ire_type, 14533 res_mp, 14534 clear_ire_stq->ire_ipif, 14535 clear_ire_stq->ire_in_ill, 14536 clear_ire_stq->ire_cmask, 14537 clear_ire_stq->ire_phandle, 14538 clear_ire_stq->ire_ihandle, 14539 clear_ire_stq->ire_flags, 14540 &clear_ire_stq->ire_uinfo, 14541 NULL, 14542 NULL) == NULL) 14543 cmn_err(CE_PANIC, "ire_init() failed"); 14544 } 14545 } 14546 14547 /* 14548 * Delete the ire. We can't call ire_delete() since 14549 * we are holding the bucket lock. We can't release the 14550 * bucket lock since we can't allow irep to change. So just 14551 * mark it CONDEMNED. The IRB_REFRELE will delete the 14552 * ire from the list and do the refrele. 14553 */ 14554 clear_ire->ire_marks |= IRE_MARK_CONDEMNED; 14555 irb->irb_marks |= IRB_MARK_CONDEMNED; 14556 14557 if (clear_ire_stq != NULL) { 14558 ire_fastpath_list_delete( 14559 (ill_t *)clear_ire_stq->ire_stq->q_ptr, 14560 clear_ire_stq); 14561 clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; 14562 } 14563 14564 /* 14565 * Also take care of otherfields like ib/ob pkt count 14566 * etc. Need to dup them. ditto in ill_bcast_delete_and_add 14567 */ 14568 14569 /* Add the new ire's. Insert at *irep */ 14570 new_lb_ire->ire_bucket = clear_ire->ire_bucket; 14571 ire1 = *irep; 14572 if (ire1 != NULL) 14573 ire1->ire_ptpn = &new_lb_ire->ire_next; 14574 new_lb_ire->ire_next = ire1; 14575 /* Link the new one in. 
*/ 14576 new_lb_ire->ire_ptpn = irep; 14577 membar_producer(); 14578 *irep = new_lb_ire; 14579 new_lb_ire_used = B_TRUE; 14580 BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted); 14581 new_lb_ire->ire_bucket->irb_ire_cnt++; 14582 new_lb_ire->ire_ipif->ipif_ire_cnt++; 14583 14584 if (clear_ire_stq != NULL) { 14585 new_nlb_ire->ire_bucket = clear_ire->ire_bucket; 14586 irep = &new_lb_ire->ire_next; 14587 /* Add the new ire. Insert at *irep */ 14588 ire1 = *irep; 14589 if (ire1 != NULL) 14590 ire1->ire_ptpn = &new_nlb_ire->ire_next; 14591 new_nlb_ire->ire_next = ire1; 14592 /* Link the new one in. */ 14593 new_nlb_ire->ire_ptpn = irep; 14594 membar_producer(); 14595 *irep = new_nlb_ire; 14596 new_nlb_ire_used = B_TRUE; 14597 BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted); 14598 new_nlb_ire->ire_bucket->irb_ire_cnt++; 14599 new_nlb_ire->ire_ipif->ipif_ire_cnt++; 14600 ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++; 14601 } 14602 } 14603 rw_exit(&irb->irb_lock); 14604 if (!new_lb_ire_used) 14605 kmem_cache_free(ire_cache, new_lb_ire); 14606 if (!new_nlb_ire_used) 14607 kmem_cache_free(ire_cache, new_nlb_ire); 14608 IRB_REFRELE(irb); 14609 } 14610 14611 /* 14612 * Whenever an ipif goes down we have to renominate a different 14613 * broadcast ire to receive. Whenever an ipif comes up, we need 14614 * to make sure that we have only one nominated to receive. 14615 */ 14616 static void 14617 ipif_renominate_bcast(ipif_t *ipif) 14618 { 14619 ill_t *ill = ipif->ipif_ill; 14620 ipaddr_t subnet_addr; 14621 ipaddr_t net_addr; 14622 ipaddr_t net_mask = 0; 14623 ipaddr_t subnet_netmask; 14624 ipaddr_t addr; 14625 ill_group_t *illgrp; 14626 14627 illgrp = ill->ill_group; 14628 /* 14629 * If this is the last ipif going down, it might take 14630 * the ill out of the group. In that case ipif_down -> 14631 * illgrp_delete takes care of doing the nomination. 14632 * ipif_down does not call for this case. 
14633 */ 14634 ASSERT(illgrp != NULL); 14635 14636 /* There could not have been any ires associated with this */ 14637 if (ipif->ipif_subnet == 0) 14638 return; 14639 14640 ill_mark_bcast(illgrp, 0); 14641 ill_mark_bcast(illgrp, INADDR_BROADCAST); 14642 14643 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14644 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14645 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14646 } else { 14647 net_mask = htonl(IN_CLASSA_NET); 14648 } 14649 addr = net_mask & ipif->ipif_subnet; 14650 ill_mark_bcast(illgrp, addr); 14651 14652 net_addr = ~net_mask | addr; 14653 ill_mark_bcast(illgrp, net_addr); 14654 14655 subnet_netmask = ipif->ipif_net_mask; 14656 addr = ipif->ipif_subnet; 14657 ill_mark_bcast(illgrp, addr); 14658 14659 subnet_addr = ~subnet_netmask | addr; 14660 ill_mark_bcast(illgrp, subnet_addr); 14661 } 14662 14663 /* 14664 * Whenever we form or delete ill groups, we need to nominate one set of 14665 * BROADCAST ires for receiving in the group. 14666 * 14667 * 1) When ipif_up_done -> ilgrp_insert calls this function, BROADCAST ires 14668 * have been added, but ill_ipif_up_count is 0. Thus, we don't assert 14669 * for ill_ipif_up_count to be non-zero. This is the only case where 14670 * ill_ipif_up_count is zero and we would still find the ires. 14671 * 14672 * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one 14673 * ipif is UP and we just have to do the nomination. 14674 * 14675 * 3) When ill_handoff_responsibility calls us, some ill has been removed 14676 * from the group. So, we have to do the nomination. 14677 * 14678 * Because of (3), there could be just one ill in the group. But we have 14679 * to nominate still as IRE_MARK_NORCV may have been marked on this. 14680 * Thus, this function does not optimize when there is only one ill as 14681 * it is not correct for (3). 
14682 */ 14683 static void 14684 ill_nominate_bcast_rcv(ill_group_t *illgrp) 14685 { 14686 ill_t *ill; 14687 ipif_t *ipif; 14688 ipaddr_t subnet_addr; 14689 ipaddr_t prev_subnet_addr = 0; 14690 ipaddr_t net_addr; 14691 ipaddr_t prev_net_addr = 0; 14692 ipaddr_t net_mask = 0; 14693 ipaddr_t subnet_netmask; 14694 ipaddr_t addr; 14695 14696 /* 14697 * When the last memeber is leaving, there is nothing to 14698 * nominate. 14699 */ 14700 if (illgrp->illgrp_ill_count == 0) { 14701 ASSERT(illgrp->illgrp_ill == NULL); 14702 return; 14703 } 14704 14705 ill = illgrp->illgrp_ill; 14706 ASSERT(!ill->ill_isv6); 14707 /* 14708 * We assume that ires with same address and belonging to the 14709 * same group, has been grouped together. Nominating a *single* 14710 * ill in the group for sending and receiving broadcast is done 14711 * by making sure that the first BROADCAST ire (which will be 14712 * the one returned by ire_ctable_lookup for ip_rput and the 14713 * one that will be used in ip_wput_ire) will be the one that 14714 * will not have IRE_MARK_NORECV set. 14715 * 14716 * 1) ip_rput checks and discards packets received on ires marked 14717 * with IRE_MARK_NORECV. Thus, we don't send up duplicate 14718 * broadcast packets. We need to clear IRE_MARK_NORECV on the 14719 * first ire in the group for every broadcast address in the group. 14720 * ip_rput will accept packets only on the first ire i.e only 14721 * one copy of the ill. 14722 * 14723 * 2) ip_wput_ire needs to send out just one copy of the broadcast 14724 * packet for the whole group. It needs to send out on the ill 14725 * whose ire has not been marked with IRE_MARK_NORECV. If it sends 14726 * on the one marked with IRE_MARK_NORECV, ip_rput will accept 14727 * the copy echoed back on other port where the ire is not marked 14728 * with IRE_MARK_NORECV. 
14729 * 14730 * Note that we just need to have the first IRE either loopback or 14731 * non-loopback (either of them may not exist if ire_create failed 14732 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will 14733 * always hit the first one and hence will always accept one copy. 14734 * 14735 * We have a broadcast ire per ill for all the unique prefixes 14736 * hosted on that ill. As we don't have a way of knowing the 14737 * unique prefixes on a given ill and hence in the whole group, 14738 * we just call ill_mark_bcast on all the prefixes that exist 14739 * in the group. For the common case of one prefix, the code 14740 * below optimizes by remebering the last address used for 14741 * markng. In the case of multiple prefixes, this will still 14742 * optimize depending the order of prefixes. 14743 * 14744 * The only unique address across the whole group is 0.0.0.0 and 14745 * 255.255.255.255 and thus we call only once. ill_mark_bcast enables 14746 * the first ire in the bucket for receiving and disables the 14747 * others. 
14748 */ 14749 ill_mark_bcast(illgrp, 0); 14750 ill_mark_bcast(illgrp, INADDR_BROADCAST); 14751 for (; ill != NULL; ill = ill->ill_group_next) { 14752 14753 for (ipif = ill->ill_ipif; ipif != NULL; 14754 ipif = ipif->ipif_next) { 14755 14756 if (!(ipif->ipif_flags & IPIF_UP) || 14757 ipif->ipif_subnet == 0) { 14758 continue; 14759 } 14760 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14761 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14762 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14763 } else { 14764 net_mask = htonl(IN_CLASSA_NET); 14765 } 14766 addr = net_mask & ipif->ipif_subnet; 14767 if (prev_net_addr == 0 || prev_net_addr != addr) { 14768 ill_mark_bcast(illgrp, addr); 14769 net_addr = ~net_mask | addr; 14770 ill_mark_bcast(illgrp, net_addr); 14771 } 14772 prev_net_addr = addr; 14773 14774 subnet_netmask = ipif->ipif_net_mask; 14775 addr = ipif->ipif_subnet; 14776 if (prev_subnet_addr == 0 || 14777 prev_subnet_addr != addr) { 14778 ill_mark_bcast(illgrp, addr); 14779 subnet_addr = ~subnet_netmask | addr; 14780 ill_mark_bcast(illgrp, subnet_addr); 14781 } 14782 prev_subnet_addr = addr; 14783 } 14784 } 14785 } 14786 14787 /* 14788 * This function is called while forming ill groups. 14789 * 14790 * Currently, we handle only allmulti groups. We want to join 14791 * allmulti on only one of the ills in the groups. In future, 14792 * when we have link aggregation, we may have to join normal 14793 * multicast groups on multiple ills as switch does inbound load 14794 * balancing. Following are the functions that calls this 14795 * function : 14796 * 14797 * 1) ill_recover_multicast : Interface is coming back UP. 14798 * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 14799 * will call ill_recover_multicast to recover all the multicast 14800 * groups. We need to make sure that only one member is joined 14801 * in the ill group. 14802 * 14803 * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. 14804 * Somebody is joining allmulti. 
We need to make sure that only one 14805 * member is joined in the group. 14806 * 14807 * 3) illgrp_insert : If allmulti has already joined, we need to make 14808 * sure that only one member is joined in the group. 14809 * 14810 * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving 14811 * allmulti who we have nominated. We need to pick someother ill. 14812 * 14813 * 5) illgrp_delete : The ill we nominated is leaving the group, 14814 * we need to pick a new ill to join the group. 14815 * 14816 * For (1), (2), (5) - we just have to check whether there is 14817 * a good ill joined in the group. If we could not find any ills 14818 * joined the group, we should join. 14819 * 14820 * For (4), the one that was nominated to receive, left the group. 14821 * There could be nobody joined in the group when this function is 14822 * called. 14823 * 14824 * For (3) - we need to explicitly check whether there are multiple 14825 * ills joined in the group. 14826 * 14827 * For simplicity, we don't differentiate any of the above cases. We 14828 * just leave the group if it is joined on any of them and join on 14829 * the first good ill. 14830 */ 14831 int 14832 ill_nominate_mcast_rcv(ill_group_t *illgrp) 14833 { 14834 ilm_t *ilm; 14835 ill_t *ill; 14836 ill_t *fallback_inactive_ill = NULL; 14837 ill_t *fallback_failed_ill = NULL; 14838 int ret = 0; 14839 14840 /* 14841 * Leave the allmulti on all the ills and start fresh. 14842 */ 14843 for (ill = illgrp->illgrp_ill; ill != NULL; 14844 ill = ill->ill_group_next) { 14845 if (ill->ill_join_allmulti) 14846 (void) ip_leave_allmulti(ill->ill_ipif); 14847 } 14848 14849 /* 14850 * Choose a good ill. Fallback to inactive or failed if 14851 * none available. We need to fallback to FAILED in the 14852 * case where we have 2 interfaces in a group - where 14853 * one of them is failed and another is a good one and 14854 * the good one (not marked inactive) is leaving the group. 
14855 */ 14856 ret = 0; 14857 for (ill = illgrp->illgrp_ill; ill != NULL; 14858 ill = ill->ill_group_next) { 14859 /* Never pick an offline interface */ 14860 if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) 14861 continue; 14862 14863 if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { 14864 fallback_failed_ill = ill; 14865 continue; 14866 } 14867 if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { 14868 fallback_inactive_ill = ill; 14869 continue; 14870 } 14871 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14872 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 14873 ret = ip_join_allmulti(ill->ill_ipif); 14874 /* 14875 * ip_join_allmulti can fail because of memory 14876 * failures. So, make sure we join at least 14877 * on one ill. 14878 */ 14879 if (ill->ill_join_allmulti) 14880 return (0); 14881 } 14882 } 14883 } 14884 if (ret != 0) { 14885 /* 14886 * If we tried nominating above and failed to do so, 14887 * return error. We might have tried multiple times. 14888 * But, return the latest error. 14889 */ 14890 return (ret); 14891 } 14892 if ((ill = fallback_inactive_ill) != NULL) { 14893 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14894 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 14895 ret = ip_join_allmulti(ill->ill_ipif); 14896 return (ret); 14897 } 14898 } 14899 } else if ((ill = fallback_failed_ill) != NULL) { 14900 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14901 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 14902 ret = ip_join_allmulti(ill->ill_ipif); 14903 return (ret); 14904 } 14905 } 14906 } 14907 return (0); 14908 } 14909 14910 /* 14911 * This function is called from illgrp_delete after it is 14912 * deleted from the group to reschedule responsibilities 14913 * to a different ill. 
14914 */ 14915 static void 14916 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 14917 { 14918 ilm_t *ilm; 14919 ipif_t *ipif; 14920 ipaddr_t subnet_addr; 14921 ipaddr_t net_addr; 14922 ipaddr_t net_mask = 0; 14923 ipaddr_t subnet_netmask; 14924 ipaddr_t addr; 14925 14926 ASSERT(ill->ill_group == NULL); 14927 /* 14928 * Broadcast Responsibility: 14929 * 14930 * 1. If this ill has been nominated for receiving broadcast 14931 * packets, we need to find a new one. Before we find a new 14932 * one, we need to re-group the ires that are part of this new 14933 * group (assumed by ill_nominate_bcast_rcv). We do this by 14934 * calling ill_group_bcast_for_xmit(ill) which will do the right 14935 * thing for us. 14936 * 14937 * 2. If this ill was not nominated for receiving broadcast 14938 * packets, we need to clear the IRE_MARK_NORECV flag 14939 * so that we continue to send up broadcast packets. 14940 */ 14941 if (!ill->ill_isv6) { 14942 /* 14943 * Case 1 above : No optimization here. Just redo the 14944 * nomination. 14945 */ 14946 ill_group_bcast_for_xmit(ill); 14947 ill_nominate_bcast_rcv(illgrp); 14948 14949 /* 14950 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 
14951 */ 14952 ill_clear_bcast_mark(ill, 0); 14953 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 14954 14955 for (ipif = ill->ill_ipif; ipif != NULL; 14956 ipif = ipif->ipif_next) { 14957 14958 if (!(ipif->ipif_flags & IPIF_UP) || 14959 ipif->ipif_subnet == 0) { 14960 continue; 14961 } 14962 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14963 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14964 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14965 } else { 14966 net_mask = htonl(IN_CLASSA_NET); 14967 } 14968 addr = net_mask & ipif->ipif_subnet; 14969 ill_clear_bcast_mark(ill, addr); 14970 14971 net_addr = ~net_mask | addr; 14972 ill_clear_bcast_mark(ill, net_addr); 14973 14974 subnet_netmask = ipif->ipif_net_mask; 14975 addr = ipif->ipif_subnet; 14976 ill_clear_bcast_mark(ill, addr); 14977 14978 subnet_addr = ~subnet_netmask | addr; 14979 ill_clear_bcast_mark(ill, subnet_addr); 14980 } 14981 } 14982 14983 /* 14984 * Multicast Responsibility. 14985 * 14986 * If we have joined allmulti on this one, find a new member 14987 * in the group to join allmulti. As this ill is already part 14988 * of allmulti, we don't have to join on this one. 14989 * 14990 * If we have not joined allmulti on this one, there is no 14991 * responsibility to handoff. But we need to take new 14992 * responsibility i.e, join allmulti on this one if we need 14993 * to. 14994 */ 14995 if (ill->ill_join_allmulti) { 14996 (void) ill_nominate_mcast_rcv(illgrp); 14997 } else { 14998 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14999 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15000 (void) ip_join_allmulti(ill->ill_ipif); 15001 break; 15002 } 15003 } 15004 } 15005 15006 /* 15007 * We intentionally do the flushing of IRE_CACHES only matching 15008 * on the ill and not on groups. Note that we are already deleted 15009 * from the group. 
15010 * 15011 * This will make sure that all IRE_CACHES whose stq is pointing 15012 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 15013 * deleted and IRE_CACHES that are not pointing at this ill will 15014 * be left alone. 15015 */ 15016 if (ill->ill_isv6) { 15017 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15018 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15019 } else { 15020 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15021 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15022 } 15023 15024 /* 15025 * Some conn may have cached one of the IREs deleted above. By removing 15026 * the ire reference, we clean up the extra reference to the ill held in 15027 * ire->ire_stq. 15028 */ 15029 ipcl_walk(conn_cleanup_stale_ire, NULL); 15030 15031 /* 15032 * Re-do source address selection for all the members in the 15033 * group, if they borrowed source address from one of the ipifs 15034 * in this ill. 15035 */ 15036 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15037 if (ill->ill_isv6) { 15038 ipif_update_other_ipifs_v6(ipif, illgrp); 15039 } else { 15040 ipif_update_other_ipifs(ipif, illgrp); 15041 } 15042 } 15043 } 15044 15045 /* 15046 * Delete the ill from the group. The caller makes sure that it is 15047 * in a group and it okay to delete from the group. So, we always 15048 * delete here. 15049 */ 15050 static void 15051 illgrp_delete(ill_t *ill) 15052 { 15053 ill_group_t *illgrp; 15054 ill_group_t *tmpg; 15055 ill_t *tmp_ill; 15056 15057 /* 15058 * Reset illgrp_ill_schednext if it was pointing at us. 15059 * We need to do this before we set ill_group to NULL. 15060 */ 15061 rw_enter(&ill_g_lock, RW_WRITER); 15062 mutex_enter(&ill->ill_lock); 15063 15064 illgrp_reset_schednext(ill); 15065 15066 illgrp = ill->ill_group; 15067 15068 /* Delete the ill from illgrp. 
*/ 15069 if (illgrp->illgrp_ill == ill) { 15070 illgrp->illgrp_ill = ill->ill_group_next; 15071 } else { 15072 tmp_ill = illgrp->illgrp_ill; 15073 while (tmp_ill->ill_group_next != ill) { 15074 tmp_ill = tmp_ill->ill_group_next; 15075 ASSERT(tmp_ill != NULL); 15076 } 15077 tmp_ill->ill_group_next = ill->ill_group_next; 15078 } 15079 ill->ill_group = NULL; 15080 ill->ill_group_next = NULL; 15081 15082 illgrp->illgrp_ill_count--; 15083 mutex_exit(&ill->ill_lock); 15084 rw_exit(&ill_g_lock); 15085 15086 /* 15087 * As this ill is leaving the group, we need to hand off 15088 * the responsibilities to the other ills in the group, if 15089 * this ill had some responsibilities. 15090 */ 15091 15092 ill_handoff_responsibility(ill, illgrp); 15093 15094 rw_enter(&ill_g_lock, RW_WRITER); 15095 15096 if (illgrp->illgrp_ill_count == 0) { 15097 15098 ASSERT(illgrp->illgrp_ill == NULL); 15099 if (ill->ill_isv6) { 15100 if (illgrp == illgrp_head_v6) { 15101 illgrp_head_v6 = illgrp->illgrp_next; 15102 } else { 15103 tmpg = illgrp_head_v6; 15104 while (tmpg->illgrp_next != illgrp) { 15105 tmpg = tmpg->illgrp_next; 15106 ASSERT(tmpg != NULL); 15107 } 15108 tmpg->illgrp_next = illgrp->illgrp_next; 15109 } 15110 } else { 15111 if (illgrp == illgrp_head_v4) { 15112 illgrp_head_v4 = illgrp->illgrp_next; 15113 } else { 15114 tmpg = illgrp_head_v4; 15115 while (tmpg->illgrp_next != illgrp) { 15116 tmpg = tmpg->illgrp_next; 15117 ASSERT(tmpg != NULL); 15118 } 15119 tmpg->illgrp_next = illgrp->illgrp_next; 15120 } 15121 } 15122 mutex_destroy(&illgrp->illgrp_lock); 15123 mi_free(illgrp); 15124 } 15125 rw_exit(&ill_g_lock); 15126 15127 /* 15128 * Even though the ill is out of the group its not necessary 15129 * to set ipsq_split as TRUE as the ipifs could be down temporarily 15130 * We will split the ipsq when phyint_groupname is set to NULL. 15131 */ 15132 15133 /* 15134 * Send a routing sockets message if we are deleting from 15135 * groups with names. 
15136 */ 15137 if (ill->ill_phyint->phyint_groupname_len != 0) 15138 ip_rts_ifmsg(ill->ill_ipif); 15139 } 15140 15141 /* 15142 * Re-do source address selection. This is normally called when 15143 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 15144 * ipif comes up. 15145 */ 15146 void 15147 ill_update_source_selection(ill_t *ill) 15148 { 15149 ipif_t *ipif; 15150 15151 ASSERT(IAM_WRITER_ILL(ill)); 15152 15153 if (ill->ill_group != NULL) 15154 ill = ill->ill_group->illgrp_ill; 15155 15156 for (; ill != NULL; ill = ill->ill_group_next) { 15157 for (ipif = ill->ill_ipif; ipif != NULL; 15158 ipif = ipif->ipif_next) { 15159 if (ill->ill_isv6) 15160 ipif_recreate_interface_routes_v6(NULL, ipif); 15161 else 15162 ipif_recreate_interface_routes(NULL, ipif); 15163 } 15164 } 15165 } 15166 15167 /* 15168 * Insert ill in a group headed by illgrp_head. The caller can either 15169 * pass a groupname in which case we search for a group with the 15170 * same name to insert in or pass a group to insert in. This function 15171 * would only search groups with names. 15172 * 15173 * NOTE : The caller should make sure that there is at least one ipif 15174 * UP on this ill so that illgrp_scheduler can pick this ill 15175 * for outbound packets. If ill_ipif_up_count is zero, we have 15176 * already sent a DL_UNBIND to the driver and we don't want to 15177 * send anymore packets. We don't assert for ipif_up_count 15178 * to be greater than zero, because ipif_up_done wants to call 15179 * this function before bumping up the ipif_up_count. See 15180 * ipif_up_done() for details. 
15181 */ 15182 int 15183 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 15184 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 15185 { 15186 ill_group_t *illgrp; 15187 ill_t *prev_ill; 15188 phyint_t *phyi; 15189 15190 ASSERT(ill->ill_group == NULL); 15191 15192 rw_enter(&ill_g_lock, RW_WRITER); 15193 mutex_enter(&ill->ill_lock); 15194 15195 if (groupname != NULL) { 15196 /* 15197 * Look for a group with a matching groupname to insert. 15198 */ 15199 for (illgrp = *illgrp_head; illgrp != NULL; 15200 illgrp = illgrp->illgrp_next) { 15201 15202 ill_t *tmp_ill; 15203 15204 /* 15205 * If we have an ill_group_t in the list which has 15206 * no ill_t assigned then we must be in the process of 15207 * removing this group. We skip this as illgrp_delete() 15208 * will remove it from the list. 15209 */ 15210 if ((tmp_ill = illgrp->illgrp_ill) == NULL) { 15211 ASSERT(illgrp->illgrp_ill_count == 0); 15212 continue; 15213 } 15214 15215 ASSERT(tmp_ill->ill_phyint != NULL); 15216 phyi = tmp_ill->ill_phyint; 15217 /* 15218 * Look at groups which has names only. 15219 */ 15220 if (phyi->phyint_groupname_len == 0) 15221 continue; 15222 /* 15223 * Names are stored in the phyint common to both 15224 * IPv4 and IPv6. 15225 */ 15226 if (mi_strcmp(phyi->phyint_groupname, 15227 groupname) == 0) { 15228 break; 15229 } 15230 } 15231 } else { 15232 /* 15233 * If the caller passes in a NULL "grp_to_insert", we 15234 * allocate one below and insert this singleton. 
15235 */ 15236 illgrp = grp_to_insert; 15237 } 15238 15239 ill->ill_group_next = NULL; 15240 15241 if (illgrp == NULL) { 15242 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); 15243 if (illgrp == NULL) { 15244 return (ENOMEM); 15245 } 15246 illgrp->illgrp_next = *illgrp_head; 15247 *illgrp_head = illgrp; 15248 illgrp->illgrp_ill = ill; 15249 illgrp->illgrp_ill_count = 1; 15250 ill->ill_group = illgrp; 15251 /* 15252 * Used in illgrp_scheduler to protect multiple threads 15253 * from traversing the list. 15254 */ 15255 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); 15256 } else { 15257 ASSERT(ill->ill_net_type == 15258 illgrp->illgrp_ill->ill_net_type); 15259 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); 15260 15261 /* Insert ill at tail of this group */ 15262 prev_ill = illgrp->illgrp_ill; 15263 while (prev_ill->ill_group_next != NULL) 15264 prev_ill = prev_ill->ill_group_next; 15265 prev_ill->ill_group_next = ill; 15266 ill->ill_group = illgrp; 15267 illgrp->illgrp_ill_count++; 15268 /* 15269 * Inherit group properties. Currently only forwarding 15270 * is the property we try to keep the same with all the 15271 * ills. When there are more, we will abstract this into 15272 * a function. 15273 */ 15274 ill->ill_flags &= ~ILLF_ROUTER; 15275 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); 15276 } 15277 mutex_exit(&ill->ill_lock); 15278 rw_exit(&ill_g_lock); 15279 15280 /* 15281 * 1) When ipif_up_done() calls this function, ipif_up_count 15282 * may be zero as it has not yet been bumped. But the ires 15283 * have already been added. So, we do the nomination here 15284 * itself. But, when ip_sioctl_groupname calls this, it checks 15285 * for ill_ipif_up_count != 0. Thus we don't check for 15286 * ill_ipif_up_count here while nominating broadcast ires for 15287 * receive. 
15288 * 15289 * 2) Similarly, we need to call ill_group_bcast_for_xmit here 15290 * to group them properly as ire_add() has already happened 15291 * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert 15292 * case, we need to do it here anyway. 15293 */ 15294 if (!ill->ill_isv6) { 15295 ill_group_bcast_for_xmit(ill); 15296 ill_nominate_bcast_rcv(illgrp); 15297 } 15298 15299 if (!ipif_is_coming_up) { 15300 /* 15301 * When ipif_up_done() calls this function, the multicast 15302 * groups have not been joined yet. So, there is no point in 15303 * nomination. ip_join_allmulti will handle groups when 15304 * ill_recover_multicast is called from ipif_up_done() later. 15305 */ 15306 (void) ill_nominate_mcast_rcv(illgrp); 15307 /* 15308 * ipif_up_done calls ill_update_source_selection 15309 * anyway. Moreover, we don't want to re-create 15310 * interface routes while ipif_up_done() still has reference 15311 * to them. Refer to ipif_up_done() for more details. 15312 */ 15313 ill_update_source_selection(ill); 15314 } 15315 15316 /* 15317 * Send a routing sockets message if we are inserting into 15318 * groups with names. 15319 */ 15320 if (groupname != NULL) 15321 ip_rts_ifmsg(ill->ill_ipif); 15322 return (0); 15323 } 15324 15325 /* 15326 * Return the first phyint matching the groupname. There could 15327 * be more than one when there are ill groups. 15328 * 15329 * Needs work: called only from ip_sioctl_groupname 15330 */ 15331 static phyint_t * 15332 phyint_lookup_group(char *groupname) 15333 { 15334 phyint_t *phyi; 15335 15336 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 15337 /* 15338 * Group names are stored in the phyint - a common structure 15339 * to both IPv4 and IPv6. 
15340 */ 15341 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 15342 for (; phyi != NULL; 15343 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 15344 phyi, AVL_AFTER)) { 15345 if (phyi->phyint_groupname_len == 0) 15346 continue; 15347 ASSERT(phyi->phyint_groupname != NULL); 15348 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) 15349 return (phyi); 15350 } 15351 return (NULL); 15352 } 15353 15354 15355 15356 /* 15357 * MT notes on creation and deletion of IPMP groups 15358 * 15359 * Creation and deletion of IPMP groups introduce the need to merge or 15360 * split the associated serialization objects i.e the ipsq's. Normally all 15361 * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled 15362 * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during 15363 * the execution of the SIOCSLIFGROUPNAME command the picture changes. There 15364 * is a need to change the <ill-ipsq> association and we have to operate on both 15365 * the source and destination IPMP groups. For eg. attempting to set the 15366 * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to 15367 * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the 15368 * source or destination IPMP group are mapped to a single ipsq for executing 15369 * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. 15370 * The <ill-ipsq> mapping is restored back to normal at a later point. This is 15371 * termed as a split of the ipsq. The converse of the merge i.e. a split of the 15372 * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname 15373 * occurred on the ipsq, then the ipsq_split flag is set. This indicates the 15374 * ipsq has to be examined for redoing the <ill-ipsq> associations. 15375 * 15376 * In the above example the ioctl handling code locates the current ipsq of hme0 15377 * which is ipsq(mpk17-84). 
It then enters the above ipsq immediately or 15378 * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates 15379 * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into 15380 * the destination ipsq. If the destination ipsq is not busy, it also enters 15381 * the destination ipsq exclusively. Now the actual groupname setting operation 15382 * can proceed. If the destination ipsq is busy, the operation is enqueued 15383 * on the destination (merged) ipsq and will be handled in the unwind from 15384 * ipsq_exit. 15385 * 15386 * To prevent other threads accessing the ill while the group name change is 15387 * in progres, we bring down the ipifs which also removes the ill from the 15388 * group. The group is changed in phyint and when the first ipif on the ill 15389 * is brought up, the ill is inserted into the right IPMP group by 15390 * illgrp_insert. 15391 */ 15392 /* ARGSUSED */ 15393 int 15394 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15395 ip_ioctl_cmd_t *ipip, void *ifreq) 15396 { 15397 int i; 15398 char *tmp; 15399 int namelen; 15400 ill_t *ill = ipif->ipif_ill; 15401 ill_t *ill_v4, *ill_v6; 15402 int err = 0; 15403 phyint_t *phyi; 15404 phyint_t *phyi_tmp; 15405 struct lifreq *lifr; 15406 mblk_t *mp1; 15407 char *groupname; 15408 ipsq_t *ipsq; 15409 15410 ASSERT(IAM_WRITER_IPIF(ipif)); 15411 15412 /* Existance verified in ip_wput_nondata */ 15413 mp1 = mp->b_cont->b_cont; 15414 lifr = (struct lifreq *)mp1->b_rptr; 15415 groupname = lifr->lifr_groupname; 15416 15417 if (ipif->ipif_id != 0) 15418 return (EINVAL); 15419 15420 phyi = ill->ill_phyint; 15421 ASSERT(phyi != NULL); 15422 15423 if (phyi->phyint_flags & PHYI_VIRTUAL) 15424 return (EINVAL); 15425 15426 tmp = groupname; 15427 for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) 15428 ; 15429 15430 if (i == LIFNAMSIZ) { 15431 /* no null termination */ 15432 return (EINVAL); 15433 } 15434 15435 /* 15436 * Calculate the namelen exclusive 
of the null 15437 * termination character. 15438 */ 15439 namelen = tmp - groupname; 15440 15441 ill_v4 = phyi->phyint_illv4; 15442 ill_v6 = phyi->phyint_illv6; 15443 15444 /* 15445 * ILL cannot be part of a usesrc group and and IPMP group at the 15446 * same time. No need to grab the ill_g_usesrc_lock here, see 15447 * synchronization notes in ip.c 15448 */ 15449 if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 15450 return (EINVAL); 15451 } 15452 15453 /* 15454 * mark the ill as changing. 15455 * this should queue all new requests on the syncq. 15456 */ 15457 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15458 15459 if (ill_v4 != NULL) 15460 ill_v4->ill_state_flags |= ILL_CHANGING; 15461 if (ill_v6 != NULL) 15462 ill_v6->ill_state_flags |= ILL_CHANGING; 15463 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15464 15465 if (namelen == 0) { 15466 /* 15467 * Null string means remove this interface from the 15468 * existing group. 15469 */ 15470 if (phyi->phyint_groupname_len == 0) { 15471 /* 15472 * Never was in a group. 15473 */ 15474 err = 0; 15475 goto done; 15476 } 15477 15478 /* 15479 * IPv4 or IPv6 may be temporarily out of the group when all 15480 * the ipifs are down. Thus, we need to check for ill_group to 15481 * be non-NULL. 
15482 */ 15483 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 15484 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 15485 mutex_enter(&ill_v4->ill_lock); 15486 if (!ill_is_quiescent(ill_v4)) { 15487 /* 15488 * ipsq_pending_mp_add will not fail since 15489 * connp is NULL 15490 */ 15491 (void) ipsq_pending_mp_add(NULL, 15492 ill_v4->ill_ipif, q, mp, ILL_DOWN); 15493 mutex_exit(&ill_v4->ill_lock); 15494 err = EINPROGRESS; 15495 goto done; 15496 } 15497 mutex_exit(&ill_v4->ill_lock); 15498 } 15499 15500 if (ill_v6 != NULL && ill_v6->ill_group != NULL) { 15501 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 15502 mutex_enter(&ill_v6->ill_lock); 15503 if (!ill_is_quiescent(ill_v6)) { 15504 (void) ipsq_pending_mp_add(NULL, 15505 ill_v6->ill_ipif, q, mp, ILL_DOWN); 15506 mutex_exit(&ill_v6->ill_lock); 15507 err = EINPROGRESS; 15508 goto done; 15509 } 15510 mutex_exit(&ill_v6->ill_lock); 15511 } 15512 15513 rw_enter(&ill_g_lock, RW_WRITER); 15514 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15515 mutex_enter(&phyi->phyint_lock); 15516 ASSERT(phyi->phyint_groupname != NULL); 15517 mi_free(phyi->phyint_groupname); 15518 phyi->phyint_groupname = NULL; 15519 phyi->phyint_groupname_len = 0; 15520 mutex_exit(&phyi->phyint_lock); 15521 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15522 rw_exit(&ill_g_lock); 15523 err = ill_up_ipifs(ill, q, mp); 15524 15525 /* 15526 * set the split flag so that the ipsq can be split 15527 */ 15528 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15529 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15530 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15531 15532 } else { 15533 if (phyi->phyint_groupname_len != 0) { 15534 ASSERT(phyi->phyint_groupname != NULL); 15535 /* Are we inserting in the same group ? */ 15536 if (mi_strcmp(groupname, 15537 phyi->phyint_groupname) == 0) { 15538 err = 0; 15539 goto done; 15540 } 15541 } 15542 15543 rw_enter(&ill_g_lock, RW_READER); 15544 /* 15545 * Merge ipsq for the group's. 
15546 * This check is here as multiple groups/ills might be 15547 * sharing the same ipsq. 15548 * If we have to merege than the operation is restarted 15549 * on the new ipsq. 15550 */ 15551 ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL); 15552 if (phyi->phyint_ipsq != ipsq) { 15553 rw_exit(&ill_g_lock); 15554 err = ill_merge_groups(ill, NULL, groupname, mp, q); 15555 goto done; 15556 } 15557 /* 15558 * Running exclusive on new ipsq. 15559 */ 15560 15561 ASSERT(ipsq != NULL); 15562 ASSERT(ipsq->ipsq_writer == curthread); 15563 15564 /* 15565 * Check whether the ill_type and ill_net_type matches before 15566 * we allocate any memory so that the cleanup is easier. 15567 * 15568 * We can't group dissimilar ones as we can't load spread 15569 * packets across the group because of potential link-level 15570 * header differences. 15571 */ 15572 phyi_tmp = phyint_lookup_group(groupname); 15573 if (phyi_tmp != NULL) { 15574 if ((ill_v4 != NULL && 15575 phyi_tmp->phyint_illv4 != NULL) && 15576 ((ill_v4->ill_net_type != 15577 phyi_tmp->phyint_illv4->ill_net_type) || 15578 (ill_v4->ill_type != 15579 phyi_tmp->phyint_illv4->ill_type))) { 15580 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15581 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15582 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15583 rw_exit(&ill_g_lock); 15584 return (EINVAL); 15585 } 15586 if ((ill_v6 != NULL && 15587 phyi_tmp->phyint_illv6 != NULL) && 15588 ((ill_v6->ill_net_type != 15589 phyi_tmp->phyint_illv6->ill_net_type) || 15590 (ill_v6->ill_type != 15591 phyi_tmp->phyint_illv6->ill_type))) { 15592 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15593 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15594 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15595 rw_exit(&ill_g_lock); 15596 return (EINVAL); 15597 } 15598 } 15599 15600 rw_exit(&ill_g_lock); 15601 15602 /* 15603 * bring down all v4 ipifs. 
15604 */ 15605 if (ill_v4 != NULL) { 15606 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 15607 } 15608 15609 /* 15610 * bring down all v6 ipifs. 15611 */ 15612 if (ill_v6 != NULL) { 15613 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 15614 } 15615 15616 /* 15617 * make sure all ipifs are down and there are no active 15618 * references. Call to ipsq_pending_mp_add will not fail 15619 * since connp is NULL. 15620 */ 15621 if (ill_v4 != NULL) { 15622 mutex_enter(&ill_v4->ill_lock); 15623 if (!ill_is_quiescent(ill_v4)) { 15624 (void) ipsq_pending_mp_add(NULL, 15625 ill_v4->ill_ipif, q, mp, ILL_DOWN); 15626 mutex_exit(&ill_v4->ill_lock); 15627 err = EINPROGRESS; 15628 goto done; 15629 } 15630 mutex_exit(&ill_v4->ill_lock); 15631 } 15632 15633 if (ill_v6 != NULL) { 15634 mutex_enter(&ill_v6->ill_lock); 15635 if (!ill_is_quiescent(ill_v6)) { 15636 (void) ipsq_pending_mp_add(NULL, 15637 ill_v6->ill_ipif, q, mp, ILL_DOWN); 15638 mutex_exit(&ill_v6->ill_lock); 15639 err = EINPROGRESS; 15640 goto done; 15641 } 15642 mutex_exit(&ill_v6->ill_lock); 15643 } 15644 15645 /* 15646 * allocate including space for null terminator 15647 * before we insert. 15648 */ 15649 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 15650 if (tmp == NULL) 15651 return (ENOMEM); 15652 15653 rw_enter(&ill_g_lock, RW_WRITER); 15654 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15655 mutex_enter(&phyi->phyint_lock); 15656 if (phyi->phyint_groupname_len != 0) { 15657 ASSERT(phyi->phyint_groupname != NULL); 15658 mi_free(phyi->phyint_groupname); 15659 } 15660 15661 /* 15662 * setup the new group name. 15663 */ 15664 phyi->phyint_groupname = tmp; 15665 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 15666 phyi->phyint_groupname_len = namelen + 1; 15667 mutex_exit(&phyi->phyint_lock); 15668 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15669 rw_exit(&ill_g_lock); 15670 15671 err = ill_up_ipifs(ill, q, mp); 15672 } 15673 15674 done: 15675 /* 15676 * normally ILL_CHANGING is cleared in ill_up_ipifs. 
15677 */ 15678 if (err != EINPROGRESS) { 15679 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15680 if (ill_v4 != NULL) 15681 ill_v4->ill_state_flags &= ~ILL_CHANGING; 15682 if (ill_v6 != NULL) 15683 ill_v6->ill_state_flags &= ~ILL_CHANGING; 15684 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15685 } 15686 return (err); 15687 } 15688 15689 /* ARGSUSED */ 15690 int 15691 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 15692 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15693 { 15694 ill_t *ill; 15695 phyint_t *phyi; 15696 struct lifreq *lifr; 15697 mblk_t *mp1; 15698 15699 /* Existence verified in ip_wput_nondata */ 15700 mp1 = mp->b_cont->b_cont; 15701 lifr = (struct lifreq *)mp1->b_rptr; 15702 ill = ipif->ipif_ill; 15703 phyi = ill->ill_phyint; 15704 15705 lifr->lifr_groupname[0] = '\0'; 15706 /* 15707 * ill_group may be null if all the interfaces 15708 * are down. But still, the phyint should always 15709 * hold the name. 15710 */ 15711 if (phyi->phyint_groupname_len != 0) { 15712 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 15713 phyi->phyint_groupname_len); 15714 } 15715 15716 return (0); 15717 } 15718 15719 15720 typedef struct conn_move_s { 15721 ill_t *cm_from_ill; 15722 ill_t *cm_to_ill; 15723 int cm_ifindex; 15724 } conn_move_t; 15725 15726 /* 15727 * ipcl_walk function for moving conn_multicast_ill for a given ill. 15728 */ 15729 static void 15730 conn_move(conn_t *connp, caddr_t arg) 15731 { 15732 conn_move_t *connm; 15733 int ifindex; 15734 int i; 15735 ill_t *from_ill; 15736 ill_t *to_ill; 15737 ilg_t *ilg; 15738 ilm_t *ret_ilm; 15739 15740 connm = (conn_move_t *)arg; 15741 ifindex = connm->cm_ifindex; 15742 from_ill = connm->cm_from_ill; 15743 to_ill = connm->cm_to_ill; 15744 15745 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. 
*/ 15746 15747 /* All multicast fields protected by conn_lock */ 15748 mutex_enter(&connp->conn_lock); 15749 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 15750 if ((connp->conn_outgoing_ill == from_ill) && 15751 (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { 15752 connp->conn_outgoing_ill = to_ill; 15753 connp->conn_incoming_ill = to_ill; 15754 } 15755 15756 /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ 15757 15758 if ((connp->conn_multicast_ill == from_ill) && 15759 (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { 15760 connp->conn_multicast_ill = connm->cm_to_ill; 15761 } 15762 15763 /* Change IP_XMIT_IF associations */ 15764 if ((connp->conn_xmit_if_ill == from_ill) && 15765 (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) { 15766 connp->conn_xmit_if_ill = to_ill; 15767 } 15768 /* 15769 * Change the ilg_ill to point to the new one. This assumes 15770 * ilm_move_v6 has moved the ilms to new_ill and the driver 15771 * has been told to receive packets on this interface. 15772 * ilm_move_v6 FAILBACKS all the ilms successfully always. 15773 * But when doing a FAILOVER, it might fail with ENOMEM and so 15774 * some ilms may not have moved. We check to see whether 15775 * the ilms have moved to to_ill. We can't check on from_ill 15776 * as in the process of moving, we could have split an ilm 15777 * in to two - which has the same orig_ifindex and v6group. 15778 * 15779 * For IPv4, ilg_ipif moves implicitly. The code below really 15780 * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
15781 */ 15782 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 15783 ilg = &connp->conn_ilg[i]; 15784 if ((ilg->ilg_ill == from_ill) && 15785 (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { 15786 /* ifindex != 0 indicates failback */ 15787 if (ifindex != 0) { 15788 connp->conn_ilg[i].ilg_ill = to_ill; 15789 continue; 15790 } 15791 15792 ret_ilm = ilm_lookup_ill_index_v6(to_ill, 15793 &ilg->ilg_v6group, ilg->ilg_orig_ifindex, 15794 connp->conn_zoneid); 15795 15796 if (ret_ilm != NULL) 15797 connp->conn_ilg[i].ilg_ill = to_ill; 15798 } 15799 } 15800 mutex_exit(&connp->conn_lock); 15801 } 15802 15803 static void 15804 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) 15805 { 15806 conn_move_t connm; 15807 15808 connm.cm_from_ill = from_ill; 15809 connm.cm_to_ill = to_ill; 15810 connm.cm_ifindex = ifindex; 15811 15812 ipcl_walk(conn_move, (caddr_t)&connm); 15813 } 15814 15815 /* 15816 * ilm has been moved from from_ill to to_ill. 15817 * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill. 15818 * appropriately. 15819 * 15820 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because 15821 * the code there de-references ipif_ill to get the ill to 15822 * send multicast requests. It does not work as ipif is on its 15823 * move and already moved when this function is called. 15824 * Thus, we need to use from_ill and to_ill send down multicast 15825 * requests. 15826 */ 15827 static void 15828 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) 15829 { 15830 ipif_t *ipif; 15831 ilm_t *ilm; 15832 15833 /* 15834 * See whether we need to send down DL_ENABMULTI_REQ on 15835 * to_ill as ilm has just been added. 
15836 */ 15837 ASSERT(IAM_WRITER_ILL(to_ill)); 15838 ASSERT(IAM_WRITER_ILL(from_ill)); 15839 15840 ILM_WALKER_HOLD(to_ill); 15841 for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15842 15843 if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) 15844 continue; 15845 /* 15846 * no locks held, ill/ipif cannot dissappear as long 15847 * as we are writer. 15848 */ 15849 ipif = to_ill->ill_ipif; 15850 /* 15851 * No need to hold any lock as we are the writer and this 15852 * can only be changed by a writer. 15853 */ 15854 ilm->ilm_is_new = B_FALSE; 15855 15856 if (to_ill->ill_net_type != IRE_IF_RESOLVER || 15857 ipif->ipif_flags & IPIF_POINTOPOINT) { 15858 ip1dbg(("ilm_send_multicast_reqs: to_ill not " 15859 "resolver\n")); 15860 continue; /* Must be IRE_IF_NORESOLVER */ 15861 } 15862 15863 15864 if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 15865 ip1dbg(("ilm_send_multicast_reqs: " 15866 "to_ill MULTI_BCAST\n")); 15867 goto from; 15868 } 15869 15870 if (to_ill->ill_isv6) 15871 mld_joingroup(ilm); 15872 else 15873 igmp_joingroup(ilm); 15874 15875 if (to_ill->ill_ipif_up_count == 0) { 15876 /* 15877 * Nobody there. All multicast addresses will be 15878 * re-joined when we get the DL_BIND_ACK bringing the 15879 * interface up. 15880 */ 15881 ilm->ilm_notify_driver = B_FALSE; 15882 ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); 15883 goto from; 15884 } 15885 15886 /* 15887 * For allmulti address, we want to join on only one interface. 15888 * Checking for ilm_numentries_v6 is not correct as you may 15889 * find an ilm with zero address on to_ill, but we may not 15890 * have nominated to_ill for receiving. Thus, if we have 15891 * nominated from_ill (ill_join_allmulti is set), nominate 15892 * only if to_ill is not already nominated (to_ill normally 15893 * should not have been nominated if "from_ill" has already 15894 * been nominated. As we don't prevent failovers from happening 15895 * across groups, we don't assert). 
15896 */ 15897 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15898 /* 15899 * There is no need to hold ill locks as we are 15900 * writer on both ills and when ill_join_allmulti 15901 * is changed the thread is always a writer. 15902 */ 15903 if (from_ill->ill_join_allmulti && 15904 !to_ill->ill_join_allmulti) { 15905 (void) ip_join_allmulti(to_ill->ill_ipif); 15906 } 15907 } else if (ilm->ilm_notify_driver) { 15908 15909 /* 15910 * This is a newly moved ilm so we need to tell the 15911 * driver about the new group. There can be more than 15912 * one ilm's for the same group in the list each with a 15913 * different orig_ifindex. We have to inform the driver 15914 * once. In ilm_move_v[4,6] we only set the flag 15915 * ilm_notify_driver for the first ilm. 15916 */ 15917 15918 (void) ip_ll_send_enabmulti_req(to_ill, 15919 &ilm->ilm_v6addr); 15920 } 15921 15922 ilm->ilm_notify_driver = B_FALSE; 15923 15924 /* 15925 * See whether we need to send down DL_DISABMULTI_REQ on 15926 * from_ill as ilm has just been removed. 15927 */ 15928 from: 15929 ipif = from_ill->ill_ipif; 15930 if (from_ill->ill_net_type != IRE_IF_RESOLVER || 15931 ipif->ipif_flags & IPIF_POINTOPOINT) { 15932 ip1dbg(("ilm_send_multicast_reqs: " 15933 "from_ill not resolver\n")); 15934 continue; /* Must be IRE_IF_NORESOLVER */ 15935 } 15936 15937 if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 15938 ip1dbg(("ilm_send_multicast_reqs: " 15939 "from_ill MULTI_BCAST\n")); 15940 continue; 15941 } 15942 15943 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15944 if (from_ill->ill_join_allmulti) 15945 (void) ip_leave_allmulti(from_ill->ill_ipif); 15946 } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { 15947 (void) ip_ll_send_disabmulti_req(from_ill, 15948 &ilm->ilm_v6addr); 15949 } 15950 } 15951 ILM_WALKER_RELE(to_ill); 15952 } 15953 15954 /* 15955 * This function is called when all multicast memberships needs 15956 * to be moved from "from_ill" to "to_ill" for IPv6. 
This function is 15957 * called only once unlike the IPv4 counterpart where it is called after 15958 * every logical interface is moved. The reason is due to multicast 15959 * memberships are joined using an interface address in IPv4 while in 15960 * IPv6, interface index is used. 15961 */ 15962 static void 15963 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) 15964 { 15965 ilm_t *ilm; 15966 ilm_t *ilm_next; 15967 ilm_t *new_ilm; 15968 ilm_t **ilmp; 15969 int count; 15970 char buf[INET6_ADDRSTRLEN]; 15971 in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; 15972 15973 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 15974 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 15975 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 15976 15977 if (ifindex == 0) { 15978 /* 15979 * Form the solicited node mcast address which is used later. 15980 */ 15981 ipif_t *ipif; 15982 15983 ipif = from_ill->ill_ipif; 15984 ASSERT(ipif->ipif_id == 0); 15985 15986 ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 15987 } 15988 15989 ilmp = &from_ill->ill_ilm; 15990 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 15991 ilm_next = ilm->ilm_next; 15992 15993 if (ilm->ilm_flags & ILM_DELETED) { 15994 ilmp = &ilm->ilm_next; 15995 continue; 15996 } 15997 15998 new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, 15999 ilm->ilm_orig_ifindex, ilm->ilm_zoneid); 16000 ASSERT(ilm->ilm_orig_ifindex != 0); 16001 if (ilm->ilm_orig_ifindex == ifindex) { 16002 /* 16003 * We are failing back multicast memberships. 16004 * If the same ilm exists in to_ill, it means somebody 16005 * has joined the same group there e.g. ff02::1 16006 * is joined within the kernel when the interfaces 16007 * came UP. 
16008 */ 16009 ASSERT(ilm->ilm_ipif == NULL); 16010 if (new_ilm != NULL) { 16011 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16012 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16013 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16014 new_ilm->ilm_is_new = B_TRUE; 16015 } 16016 } else { 16017 /* 16018 * check if we can just move the ilm 16019 */ 16020 if (from_ill->ill_ilm_walker_cnt != 0) { 16021 /* 16022 * We have walkers we cannot move 16023 * the ilm, so allocate a new ilm, 16024 * this (old) ilm will be marked 16025 * ILM_DELETED at the end of the loop 16026 * and will be freed when the 16027 * last walker exits. 16028 */ 16029 new_ilm = (ilm_t *)mi_zalloc 16030 (sizeof (ilm_t)); 16031 if (new_ilm == NULL) { 16032 ip0dbg(("ilm_move_v6: " 16033 "FAILBACK of IPv6" 16034 " multicast address %s : " 16035 "from %s to" 16036 " %s failed : ENOMEM \n", 16037 inet_ntop(AF_INET6, 16038 &ilm->ilm_v6addr, buf, 16039 sizeof (buf)), 16040 from_ill->ill_name, 16041 to_ill->ill_name)); 16042 16043 ilmp = &ilm->ilm_next; 16044 continue; 16045 } 16046 *new_ilm = *ilm; 16047 /* 16048 * we don't want new_ilm linked to 16049 * ilm's filter list. 16050 */ 16051 new_ilm->ilm_filter = NULL; 16052 } else { 16053 /* 16054 * No walkers we can move the ilm. 16055 * lets take it out of the list. 16056 */ 16057 *ilmp = ilm->ilm_next; 16058 ilm->ilm_next = NULL; 16059 new_ilm = ilm; 16060 } 16061 16062 /* 16063 * if this is the first ilm for the group 16064 * set ilm_notify_driver so that we notify the 16065 * driver in ilm_send_multicast_reqs. 16066 */ 16067 if (ilm_lookup_ill_v6(to_ill, 16068 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16069 new_ilm->ilm_notify_driver = B_TRUE; 16070 16071 new_ilm->ilm_ill = to_ill; 16072 /* Add to the to_ill's list */ 16073 new_ilm->ilm_next = to_ill->ill_ilm; 16074 to_ill->ill_ilm = new_ilm; 16075 /* 16076 * set the flag so that mld_joingroup is 16077 * called in ilm_send_multicast_reqs(). 
16078 */ 16079 new_ilm->ilm_is_new = B_TRUE; 16080 } 16081 goto bottom; 16082 } else if (ifindex != 0) { 16083 /* 16084 * If this is FAILBACK (ifindex != 0) and the ifindex 16085 * has not matched above, look at the next ilm. 16086 */ 16087 ilmp = &ilm->ilm_next; 16088 continue; 16089 } 16090 /* 16091 * If we are here, it means ifindex is 0. Failover 16092 * everything. 16093 * 16094 * We need to handle solicited node mcast address 16095 * and all_nodes mcast address differently as they 16096 * are joined witin the kenrel (ipif_multicast_up) 16097 * and potentially from the userland. We are called 16098 * after the ipifs of from_ill has been moved. 16099 * If we still find ilms on ill with solicited node 16100 * mcast address or all_nodes mcast address, it must 16101 * belong to the UP interface that has not moved e.g. 16102 * ipif_id 0 with the link local prefix does not move. 16103 * We join this on the new ill accounting for all the 16104 * userland memberships so that applications don't 16105 * see any failure. 16106 * 16107 * We need to make sure that we account only for the 16108 * solicited node and all node multicast addresses 16109 * that was brought UP on these. In the case of 16110 * a failover from A to B, we might have ilms belonging 16111 * to A (ilm_orig_ifindex pointing at A) on B accounting 16112 * for the membership from the userland. If we are failing 16113 * over from B to C now, we will find the ones belonging 16114 * to A on B. These don't account for the ill_ipif_up_count. 16115 * They just move from B to C. The check below on 16116 * ilm_orig_ifindex ensures that. 
16117 */ 16118 if ((ilm->ilm_orig_ifindex == 16119 from_ill->ill_phyint->phyint_ifindex) && 16120 (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || 16121 IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, 16122 &ilm->ilm_v6addr))) { 16123 ASSERT(ilm->ilm_refcnt > 0); 16124 count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; 16125 /* 16126 * For indentation reasons, we are not using a 16127 * "else" here. 16128 */ 16129 if (count == 0) { 16130 ilmp = &ilm->ilm_next; 16131 continue; 16132 } 16133 ilm->ilm_refcnt -= count; 16134 if (new_ilm != NULL) { 16135 /* 16136 * Can find one with the same 16137 * ilm_orig_ifindex, if we are failing 16138 * over to a STANDBY. This happens 16139 * when somebody wants to join a group 16140 * on a STANDBY interface and we 16141 * internally join on a different one. 16142 * If we had joined on from_ill then, a 16143 * failover now will find a new ilm 16144 * with this index. 16145 */ 16146 ip1dbg(("ilm_move_v6: FAILOVER, found" 16147 " new ilm on %s, group address %s\n", 16148 to_ill->ill_name, 16149 inet_ntop(AF_INET6, 16150 &ilm->ilm_v6addr, buf, 16151 sizeof (buf)))); 16152 new_ilm->ilm_refcnt += count; 16153 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16154 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16155 new_ilm->ilm_is_new = B_TRUE; 16156 } 16157 } else { 16158 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 16159 if (new_ilm == NULL) { 16160 ip0dbg(("ilm_move_v6: FAILOVER of IPv6" 16161 " multicast address %s : from %s to" 16162 " %s failed : ENOMEM \n", 16163 inet_ntop(AF_INET6, 16164 &ilm->ilm_v6addr, buf, 16165 sizeof (buf)), from_ill->ill_name, 16166 to_ill->ill_name)); 16167 ilmp = &ilm->ilm_next; 16168 continue; 16169 } 16170 *new_ilm = *ilm; 16171 new_ilm->ilm_filter = NULL; 16172 new_ilm->ilm_refcnt = count; 16173 new_ilm->ilm_timer = INFINITY; 16174 new_ilm->ilm_rtx.rtx_timer = INFINITY; 16175 new_ilm->ilm_is_new = B_TRUE; 16176 /* 16177 * If the to_ill has not joined this 16178 * group we need to tell the driver in 16179 * 
ill_send_multicast_reqs. 16180 */ 16181 if (ilm_lookup_ill_v6(to_ill, 16182 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16183 new_ilm->ilm_notify_driver = B_TRUE; 16184 16185 new_ilm->ilm_ill = to_ill; 16186 /* Add to the to_ill's list */ 16187 new_ilm->ilm_next = to_ill->ill_ilm; 16188 to_ill->ill_ilm = new_ilm; 16189 ASSERT(new_ilm->ilm_ipif == NULL); 16190 } 16191 if (ilm->ilm_refcnt == 0) { 16192 goto bottom; 16193 } else { 16194 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16195 CLEAR_SLIST(new_ilm->ilm_filter); 16196 ilmp = &ilm->ilm_next; 16197 } 16198 continue; 16199 } else { 16200 /* 16201 * ifindex = 0 means, move everything pointing at 16202 * from_ill. We are doing this becuase ill has 16203 * either FAILED or became INACTIVE. 16204 * 16205 * As we would like to move things later back to 16206 * from_ill, we want to retain the identity of this 16207 * ilm. Thus, we don't blindly increment the reference 16208 * count on the ilms matching the address alone. We 16209 * need to match on the ilm_orig_index also. new_ilm 16210 * was obtained by matching ilm_orig_index also. 16211 */ 16212 if (new_ilm != NULL) { 16213 /* 16214 * This is possible only if a previous restore 16215 * was incomplete i.e restore to 16216 * ilm_orig_ifindex left some ilms because 16217 * of some failures. Thus when we are failing 16218 * again, we might find our old friends there. 
16219 */ 16220 ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" 16221 " on %s, group address %s\n", 16222 to_ill->ill_name, 16223 inet_ntop(AF_INET6, 16224 &ilm->ilm_v6addr, buf, 16225 sizeof (buf)))); 16226 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16227 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16228 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16229 new_ilm->ilm_is_new = B_TRUE; 16230 } 16231 } else { 16232 if (from_ill->ill_ilm_walker_cnt != 0) { 16233 new_ilm = (ilm_t *) 16234 mi_zalloc(sizeof (ilm_t)); 16235 if (new_ilm == NULL) { 16236 ip0dbg(("ilm_move_v6: " 16237 "FAILOVER of IPv6" 16238 " multicast address %s : " 16239 "from %s to" 16240 " %s failed : ENOMEM \n", 16241 inet_ntop(AF_INET6, 16242 &ilm->ilm_v6addr, buf, 16243 sizeof (buf)), 16244 from_ill->ill_name, 16245 to_ill->ill_name)); 16246 16247 ilmp = &ilm->ilm_next; 16248 continue; 16249 } 16250 *new_ilm = *ilm; 16251 new_ilm->ilm_filter = NULL; 16252 } else { 16253 *ilmp = ilm->ilm_next; 16254 new_ilm = ilm; 16255 } 16256 /* 16257 * If the to_ill has not joined this 16258 * group we need to tell the driver in 16259 * ill_send_multicast_reqs. 16260 */ 16261 if (ilm_lookup_ill_v6(to_ill, 16262 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16263 new_ilm->ilm_notify_driver = B_TRUE; 16264 16265 /* Add to the to_ill's list */ 16266 new_ilm->ilm_next = to_ill->ill_ilm; 16267 to_ill->ill_ilm = new_ilm; 16268 ASSERT(ilm->ilm_ipif == NULL); 16269 new_ilm->ilm_ill = to_ill; 16270 new_ilm->ilm_is_new = B_TRUE; 16271 } 16272 16273 } 16274 16275 bottom: 16276 /* 16277 * Revert multicast filter state to (EXCLUDE, NULL). 16278 * new_ilm->ilm_is_new should already be set if needed. 16279 */ 16280 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16281 CLEAR_SLIST(new_ilm->ilm_filter); 16282 /* 16283 * We allocated/got a new ilm, free the old one. 
16284 */ 16285 if (new_ilm != ilm) { 16286 if (from_ill->ill_ilm_walker_cnt == 0) { 16287 *ilmp = ilm->ilm_next; 16288 ilm->ilm_next = NULL; 16289 FREE_SLIST(ilm->ilm_filter); 16290 FREE_SLIST(ilm->ilm_pendsrcs); 16291 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 16292 FREE_SLIST(ilm->ilm_rtx.rtx_block); 16293 mi_free((char *)ilm); 16294 } else { 16295 ilm->ilm_flags |= ILM_DELETED; 16296 from_ill->ill_ilm_cleanup_reqd = 1; 16297 ilmp = &ilm->ilm_next; 16298 } 16299 } 16300 } 16301 } 16302 16303 /* 16304 * Move all the multicast memberships to to_ill. Called when 16305 * an ipif moves from "from_ill" to "to_ill". This function is slightly 16306 * different from IPv6 counterpart as multicast memberships are associated 16307 * with ills in IPv6. This function is called after every ipif is moved 16308 * unlike IPv6, where it is moved only once. 16309 */ 16310 static void 16311 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) 16312 { 16313 ilm_t *ilm; 16314 ilm_t *ilm_next; 16315 ilm_t *new_ilm; 16316 ilm_t **ilmp; 16317 16318 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16319 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16320 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 16321 16322 ilmp = &from_ill->ill_ilm; 16323 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 16324 ilm_next = ilm->ilm_next; 16325 16326 if (ilm->ilm_flags & ILM_DELETED) { 16327 ilmp = &ilm->ilm_next; 16328 continue; 16329 } 16330 16331 ASSERT(ilm->ilm_ipif != NULL); 16332 16333 if (ilm->ilm_ipif != ipif) { 16334 ilmp = &ilm->ilm_next; 16335 continue; 16336 } 16337 16338 if (V4_PART_OF_V6(ilm->ilm_v6addr) == 16339 htonl(INADDR_ALLHOSTS_GROUP)) { 16340 /* 16341 * We joined this in ipif_multicast_up 16342 * and we never did an ipif_multicast_down 16343 * for IPv4. If nobody else from the userland 16344 * has reference, we free the ilm, and later 16345 * when this ipif comes up on the new ill, 16346 * we will join this again. 
16347 */ 16348 if (--ilm->ilm_refcnt == 0) 16349 goto delete_ilm; 16350 16351 new_ilm = ilm_lookup_ipif(ipif, 16352 V4_PART_OF_V6(ilm->ilm_v6addr)); 16353 if (new_ilm != NULL) { 16354 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16355 /* 16356 * We still need to deal with the from_ill. 16357 */ 16358 new_ilm->ilm_is_new = B_TRUE; 16359 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16360 CLEAR_SLIST(new_ilm->ilm_filter); 16361 goto delete_ilm; 16362 } 16363 /* 16364 * If we could not find one e.g. ipif is 16365 * still down on to_ill, we add this ilm 16366 * on ill_new to preserve the reference 16367 * count. 16368 */ 16369 } 16370 /* 16371 * When ipifs move, ilms always move with it 16372 * to the NEW ill. Thus we should never be 16373 * able to find ilm till we really move it here. 16374 */ 16375 ASSERT(ilm_lookup_ipif(ipif, 16376 V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); 16377 16378 if (from_ill->ill_ilm_walker_cnt != 0) { 16379 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 16380 if (new_ilm == NULL) { 16381 char buf[INET6_ADDRSTRLEN]; 16382 ip0dbg(("ilm_move_v4: FAILBACK of IPv4" 16383 " multicast address %s : " 16384 "from %s to" 16385 " %s failed : ENOMEM \n", 16386 inet_ntop(AF_INET, 16387 &ilm->ilm_v6addr, buf, 16388 sizeof (buf)), 16389 from_ill->ill_name, 16390 to_ill->ill_name)); 16391 16392 ilmp = &ilm->ilm_next; 16393 continue; 16394 } 16395 *new_ilm = *ilm; 16396 /* We don't want new_ilm linked to ilm's filter list */ 16397 new_ilm->ilm_filter = NULL; 16398 } else { 16399 /* Remove from the list */ 16400 *ilmp = ilm->ilm_next; 16401 new_ilm = ilm; 16402 } 16403 16404 /* 16405 * If we have never joined this group on the to_ill 16406 * make sure we tell the driver. 
16407 */ 16408 if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, 16409 ALL_ZONES) == NULL) 16410 new_ilm->ilm_notify_driver = B_TRUE; 16411 16412 /* Add to the to_ill's list */ 16413 new_ilm->ilm_next = to_ill->ill_ilm; 16414 to_ill->ill_ilm = new_ilm; 16415 new_ilm->ilm_is_new = B_TRUE; 16416 16417 /* 16418 * Revert multicast filter state to (EXCLUDE, NULL) 16419 */ 16420 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16421 CLEAR_SLIST(new_ilm->ilm_filter); 16422 16423 /* 16424 * Delete only if we have allocated a new ilm. 16425 */ 16426 if (new_ilm != ilm) { 16427 delete_ilm: 16428 if (from_ill->ill_ilm_walker_cnt == 0) { 16429 /* Remove from the list */ 16430 *ilmp = ilm->ilm_next; 16431 ilm->ilm_next = NULL; 16432 FREE_SLIST(ilm->ilm_filter); 16433 FREE_SLIST(ilm->ilm_pendsrcs); 16434 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 16435 FREE_SLIST(ilm->ilm_rtx.rtx_block); 16436 mi_free((char *)ilm); 16437 } else { 16438 ilm->ilm_flags |= ILM_DELETED; 16439 from_ill->ill_ilm_cleanup_reqd = 1; 16440 ilmp = &ilm->ilm_next; 16441 } 16442 } 16443 } 16444 } 16445 16446 static uint_t 16447 ipif_get_id(ill_t *ill, uint_t id) 16448 { 16449 uint_t unit; 16450 ipif_t *tipif; 16451 boolean_t found = B_FALSE; 16452 16453 /* 16454 * During failback, we want to go back to the same id 16455 * instead of the smallest id so that the original 16456 * configuration is maintained. id is non-zero in that 16457 * case. 16458 */ 16459 if (id != 0) { 16460 /* 16461 * While failing back, if we still have an ipif with 16462 * MAX_ADDRS_PER_IF, it means this will be replaced 16463 * as soon as we return from this function. It was 16464 * to set to MAX_ADDRS_PER_IF by the caller so that 16465 * we can choose the smallest id. Thus we return zero 16466 * in that case ignoring the hint. 
16467 */ 16468 if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) 16469 return (0); 16470 for (tipif = ill->ill_ipif; tipif != NULL; 16471 tipif = tipif->ipif_next) { 16472 if (tipif->ipif_id == id) { 16473 found = B_TRUE; 16474 break; 16475 } 16476 } 16477 /* 16478 * If somebody already plumbed another logical 16479 * with the same id, we won't be able to find it. 16480 */ 16481 if (!found) 16482 return (id); 16483 } 16484 for (unit = 0; unit <= ip_addrs_per_if; unit++) { 16485 found = B_FALSE; 16486 for (tipif = ill->ill_ipif; tipif != NULL; 16487 tipif = tipif->ipif_next) { 16488 if (tipif->ipif_id == unit) { 16489 found = B_TRUE; 16490 break; 16491 } 16492 } 16493 if (!found) 16494 break; 16495 } 16496 return (unit); 16497 } 16498 16499 /* ARGSUSED */ 16500 static int 16501 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, 16502 ipif_t **rep_ipif_ptr) 16503 { 16504 ill_t *from_ill; 16505 ipif_t *rep_ipif; 16506 ipif_t **ipifp; 16507 uint_t unit; 16508 int err = 0; 16509 ipif_t *to_ipif; 16510 struct iocblk *iocp; 16511 boolean_t failback_cmd; 16512 boolean_t remove_ipif; 16513 int rc; 16514 16515 ASSERT(IAM_WRITER_ILL(to_ill)); 16516 ASSERT(IAM_WRITER_IPIF(ipif)); 16517 16518 iocp = (struct iocblk *)mp->b_rptr; 16519 failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); 16520 remove_ipif = B_FALSE; 16521 16522 from_ill = ipif->ipif_ill; 16523 16524 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16525 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16526 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 16527 16528 /* 16529 * Don't move LINK LOCAL addresses as they are tied to 16530 * physical interface. 
16531 */ 16532 if (from_ill->ill_isv6 && 16533 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { 16534 ipif->ipif_was_up = B_FALSE; 16535 IPIF_UNMARK_MOVING(ipif); 16536 return (0); 16537 } 16538 16539 /* 16540 * We set the ipif_id to maximum so that the search for 16541 * ipif_id will pick the lowest number i.e 0 in the 16542 * following 2 cases : 16543 * 16544 * 1) We have a replacement ipif at the head of to_ill. 16545 * We can't remove it yet as we can exceed ip_addrs_per_if 16546 * on to_ill and hence the MOVE might fail. We want to 16547 * remove it only if we could move the ipif. Thus, by 16548 * setting it to the MAX value, we make the search in 16549 * ipif_get_id return the zeroth id. 16550 * 16551 * 2) When DR pulls out the NIC and re-plumbs the interface, 16552 * we might just have a zero address plumbed on the ipif 16553 * with zero id in the case of IPv4. We remove that while 16554 * doing the failback. We want to remove it only if we 16555 * could move the ipif. Thus, by setting it to the MAX 16556 * value, we make the search in ipif_get_id return the 16557 * zeroth id. 16558 * 16559 * Both (1) and (2) are done only when when we are moving 16560 * an ipif (either due to failover/failback) which originally 16561 * belonged to this interface i.e the ipif_orig_ifindex is 16562 * the same as to_ill's ifindex. This is needed so that 16563 * FAILOVER from A -> B ( A failed) followed by FAILOVER 16564 * from B -> A (B is being removed from the group) and 16565 * FAILBACK from A -> B restores the original configuration. 16566 * Without the check for orig_ifindex, the second FAILOVER 16567 * could make the ipif belonging to B replace the A's zeroth 16568 * ipif and the subsequent failback re-creating the replacement 16569 * ipif again. 16570 * 16571 * NOTE : We created the replacement ipif when we did a 16572 * FAILOVER (See below). We could check for FAILBACK and 16573 * then look for replacement ipif to be removed. 
But we don't 16574 * want to do that because we wan't to allow the possibility 16575 * of a FAILOVER from A -> B (which creates the replacement ipif), 16576 * followed by a *FAILOVER* from B -> A instead of a FAILBACK 16577 * from B -> A. 16578 */ 16579 to_ipif = to_ill->ill_ipif; 16580 if ((to_ill->ill_phyint->phyint_ifindex == 16581 ipif->ipif_orig_ifindex) && 16582 IPIF_REPL_CHECK(to_ipif, failback_cmd)) { 16583 ASSERT(to_ipif->ipif_id == 0); 16584 remove_ipif = B_TRUE; 16585 to_ipif->ipif_id = MAX_ADDRS_PER_IF; 16586 } 16587 /* 16588 * Find the lowest logical unit number on the to_ill. 16589 * If we are failing back, try to get the original id 16590 * rather than the lowest one so that the original 16591 * configuration is maintained. 16592 * 16593 * XXX need a better scheme for this. 16594 */ 16595 if (failback_cmd) { 16596 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); 16597 } else { 16598 unit = ipif_get_id(to_ill, 0); 16599 } 16600 16601 /* Reset back to zero in case we fail below */ 16602 if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) 16603 to_ipif->ipif_id = 0; 16604 16605 if (unit == ip_addrs_per_if) { 16606 ipif->ipif_was_up = B_FALSE; 16607 IPIF_UNMARK_MOVING(ipif); 16608 return (EINVAL); 16609 } 16610 16611 /* 16612 * ipif is ready to move from "from_ill" to "to_ill". 16613 * 16614 * 1) If we are moving ipif with id zero, create a 16615 * replacement ipif for this ipif on from_ill. If this fails 16616 * fail the MOVE operation. 16617 * 16618 * 2) Remove the replacement ipif on to_ill if any. 16619 * We could remove the replacement ipif when we are moving 16620 * the ipif with id zero. But what if somebody already 16621 * unplumbed it ? Thus we always remove it if it is present. 16622 * We want to do it only if we are sure we are going to 16623 * move the ipif to to_ill which is why there are no 16624 * returns due to error till ipif is linked to to_ill. 16625 * Note that the first ipif that we failback will always 16626 * be zero if it is present. 
16627 */ 16628 if (ipif->ipif_id == 0) { 16629 ipaddr_t inaddr_any = INADDR_ANY; 16630 16631 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); 16632 if (rep_ipif == NULL) { 16633 ipif->ipif_was_up = B_FALSE; 16634 IPIF_UNMARK_MOVING(ipif); 16635 return (ENOMEM); 16636 } 16637 *rep_ipif = ipif_zero; 16638 /* 16639 * Before we put the ipif on the list, store the addresses 16640 * as mapped addresses as some of the ioctls e.g SIOCGIFADDR 16641 * assumes so. This logic is not any different from what 16642 * ipif_allocate does. 16643 */ 16644 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16645 &rep_ipif->ipif_v6lcl_addr); 16646 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16647 &rep_ipif->ipif_v6src_addr); 16648 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16649 &rep_ipif->ipif_v6subnet); 16650 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16651 &rep_ipif->ipif_v6net_mask); 16652 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16653 &rep_ipif->ipif_v6brd_addr); 16654 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16655 &rep_ipif->ipif_v6pp_dst_addr); 16656 /* 16657 * We mark IPIF_NOFAILOVER so that this can never 16658 * move. 16659 */ 16660 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; 16661 rep_ipif->ipif_flags &= ~IPIF_UP; 16662 rep_ipif->ipif_replace_zero = B_TRUE; 16663 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, 16664 MUTEX_DEFAULT, NULL); 16665 rep_ipif->ipif_id = 0; 16666 rep_ipif->ipif_ire_type = ipif->ipif_ire_type; 16667 rep_ipif->ipif_ill = from_ill; 16668 rep_ipif->ipif_orig_ifindex = 16669 from_ill->ill_phyint->phyint_ifindex; 16670 /* Insert at head */ 16671 rep_ipif->ipif_next = from_ill->ill_ipif; 16672 from_ill->ill_ipif = rep_ipif; 16673 /* 16674 * We don't really care to let apps know about 16675 * this interface. 16676 */ 16677 } 16678 16679 if (remove_ipif) { 16680 /* 16681 * We set to a max value above for this case to get 16682 * id zero. ASSERT that we did get one. 
16683 */ 16684 ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); 16685 rep_ipif = to_ipif; 16686 to_ill->ill_ipif = rep_ipif->ipif_next; 16687 rep_ipif->ipif_next = NULL; 16688 /* 16689 * If some apps scanned and find this interface, 16690 * it is time to let them know, so that they can 16691 * delete it. 16692 */ 16693 16694 *rep_ipif_ptr = rep_ipif; 16695 } 16696 16697 /* Get it out of the ILL interface list. */ 16698 ipifp = &ipif->ipif_ill->ill_ipif; 16699 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 16700 if (*ipifp == ipif) { 16701 *ipifp = ipif->ipif_next; 16702 break; 16703 } 16704 } 16705 16706 /* Assign the new ill */ 16707 ipif->ipif_ill = to_ill; 16708 ipif->ipif_id = unit; 16709 /* id has already been checked */ 16710 rc = ipif_insert(ipif, B_FALSE, B_FALSE); 16711 ASSERT(rc == 0); 16712 /* Let SCTP update its list */ 16713 sctp_move_ipif(ipif, from_ill, to_ill); 16714 /* 16715 * Handle the failover and failback of ipif_t between 16716 * ill_t that have differing maximum mtu values. 16717 */ 16718 if (ipif->ipif_mtu > to_ill->ill_max_mtu) { 16719 if (ipif->ipif_saved_mtu == 0) { 16720 /* 16721 * As this ipif_t is moving to an ill_t 16722 * that has a lower ill_max_mtu, its 16723 * ipif_mtu needs to be saved so it can 16724 * be restored during failback or during 16725 * failover to an ill_t which has a 16726 * higher ill_max_mtu. 16727 */ 16728 ipif->ipif_saved_mtu = ipif->ipif_mtu; 16729 ipif->ipif_mtu = to_ill->ill_max_mtu; 16730 } else { 16731 /* 16732 * The ipif_t is, once again, moving to 16733 * an ill_t that has a lower maximum mtu 16734 * value. 16735 */ 16736 ipif->ipif_mtu = to_ill->ill_max_mtu; 16737 } 16738 } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && 16739 ipif->ipif_saved_mtu != 0) { 16740 /* 16741 * The mtu of this ipif_t had to be reduced 16742 * during an earlier failover; this is an 16743 * opportunity for it to be increased (either as 16744 * part of another failover or a failback). 
16745 */ 16746 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { 16747 ipif->ipif_mtu = ipif->ipif_saved_mtu; 16748 ipif->ipif_saved_mtu = 0; 16749 } else { 16750 ipif->ipif_mtu = to_ill->ill_max_mtu; 16751 } 16752 } 16753 16754 /* 16755 * We preserve all the other fields of the ipif including 16756 * ipif_saved_ire_mp. The routes that are saved here will 16757 * be recreated on the new interface and back on the old 16758 * interface when we move back. 16759 */ 16760 ASSERT(ipif->ipif_arp_del_mp == NULL); 16761 16762 return (err); 16763 } 16764 16765 static int 16766 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, 16767 int ifindex, ipif_t **rep_ipif_ptr) 16768 { 16769 ipif_t *mipif; 16770 ipif_t *ipif_next; 16771 int err; 16772 16773 /* 16774 * We don't really try to MOVE back things if some of the 16775 * operations fail. The daemon will take care of moving again 16776 * later on. 16777 */ 16778 for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { 16779 ipif_next = mipif->ipif_next; 16780 if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && 16781 (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { 16782 16783 err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); 16784 16785 /* 16786 * When the MOVE fails, it is the job of the 16787 * application to take care of this properly 16788 * i.e try again if it is ENOMEM. 16789 */ 16790 if (mipif->ipif_ill != from_ill) { 16791 /* 16792 * ipif has moved. 16793 * 16794 * Move the multicast memberships associated 16795 * with this ipif to the new ill. For IPv6, we 16796 * do it once after all the ipifs are moved 16797 * (in ill_move) as they are not associated 16798 * with ipifs. 16799 * 16800 * We need to move the ilms as the ipif has 16801 * already been moved to a new ill even 16802 * in the case of errors. Neither 16803 * ilm_free(ipif) will find the ilm 16804 * when somebody unplumbs this ipif nor 16805 * ilm_delete(ilm) will be able to find the 16806 * ilm, if we don't move now. 
16807 */ 16808 if (!from_ill->ill_isv6) 16809 ilm_move_v4(from_ill, to_ill, mipif); 16810 } 16811 16812 if (err != 0) 16813 return (err); 16814 } 16815 } 16816 return (0); 16817 } 16818 16819 static int 16820 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) 16821 { 16822 int ifindex; 16823 int err; 16824 struct iocblk *iocp; 16825 ipif_t *ipif; 16826 ipif_t *rep_ipif_ptr = NULL; 16827 ipif_t *from_ipif = NULL; 16828 boolean_t check_rep_if = B_FALSE; 16829 16830 iocp = (struct iocblk *)mp->b_rptr; 16831 if (iocp->ioc_cmd == SIOCLIFFAILOVER) { 16832 /* 16833 * Move everything pointing at from_ill to to_ill. 16834 * We acheive this by passing in 0 as ifindex. 16835 */ 16836 ifindex = 0; 16837 } else { 16838 /* 16839 * Move everything pointing at from_ill whose original 16840 * ifindex of connp, ipif, ilm points at to_ill->ill_index. 16841 * We acheive this by passing in ifindex rather than 0. 16842 * Multicast vifs, ilgs move implicitly because ipifs move. 16843 */ 16844 ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); 16845 ifindex = to_ill->ill_phyint->phyint_ifindex; 16846 } 16847 16848 /* 16849 * Determine if there is at least one ipif that would move from 16850 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement 16851 * ipif (if it exists) on the to_ill would be consumed as a result of 16852 * the move, in which case we need to quiesce the replacement ipif also. 
16853 */ 16854 for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; 16855 from_ipif = from_ipif->ipif_next) { 16856 if (((ifindex == 0) || 16857 (ifindex == from_ipif->ipif_orig_ifindex)) && 16858 !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { 16859 check_rep_if = B_TRUE; 16860 break; 16861 } 16862 } 16863 16864 16865 ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); 16866 16867 GRAB_ILL_LOCKS(from_ill, to_ill); 16868 if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { 16869 (void) ipsq_pending_mp_add(NULL, ipif, q, 16870 mp, ILL_MOVE_OK); 16871 RELEASE_ILL_LOCKS(from_ill, to_ill); 16872 return (EINPROGRESS); 16873 } 16874 16875 /* Check if the replacement ipif is quiescent to delete */ 16876 if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, 16877 (iocp->ioc_cmd == SIOCLIFFAILBACK))) { 16878 to_ill->ill_ipif->ipif_state_flags |= 16879 IPIF_MOVING | IPIF_CHANGING; 16880 if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { 16881 (void) ipsq_pending_mp_add(NULL, ipif, q, 16882 mp, ILL_MOVE_OK); 16883 RELEASE_ILL_LOCKS(from_ill, to_ill); 16884 return (EINPROGRESS); 16885 } 16886 } 16887 RELEASE_ILL_LOCKS(from_ill, to_ill); 16888 16889 ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); 16890 rw_enter(&ill_g_lock, RW_WRITER); 16891 GRAB_ILL_LOCKS(from_ill, to_ill); 16892 err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); 16893 16894 /* ilm_move is done inside ipif_move for IPv4 */ 16895 if (err == 0 && from_ill->ill_isv6) 16896 ilm_move_v6(from_ill, to_ill, ifindex); 16897 16898 RELEASE_ILL_LOCKS(from_ill, to_ill); 16899 rw_exit(&ill_g_lock); 16900 16901 /* 16902 * send rts messages and multicast messages. 
16903 */ 16904 if (rep_ipif_ptr != NULL) { 16905 ip_rts_ifmsg(rep_ipif_ptr); 16906 ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); 16907 IPIF_TRACE_CLEANUP(rep_ipif_ptr); 16908 mi_free(rep_ipif_ptr); 16909 } 16910 16911 conn_move_ill(from_ill, to_ill, ifindex); 16912 16913 return (err); 16914 } 16915 16916 /* 16917 * Used to extract arguments for FAILOVER/FAILBACK ioctls. 16918 * Also checks for the validity of the arguments. 16919 * Note: We are already exclusive inside the from group. 16920 * It is upto the caller to release refcnt on the to_ill's. 16921 */ 16922 static int 16923 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, 16924 ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) 16925 { 16926 int dst_index; 16927 ipif_t *ipif_v4, *ipif_v6; 16928 struct lifreq *lifr; 16929 mblk_t *mp1; 16930 boolean_t exists; 16931 sin_t *sin; 16932 int err = 0; 16933 16934 if ((mp1 = mp->b_cont) == NULL) 16935 return (EPROTO); 16936 16937 if ((mp1 = mp1->b_cont) == NULL) 16938 return (EPROTO); 16939 16940 lifr = (struct lifreq *)mp1->b_rptr; 16941 sin = (sin_t *)&lifr->lifr_addr; 16942 16943 /* 16944 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 16945 * specific operations. 16946 */ 16947 if (sin->sin_family != AF_UNSPEC) 16948 return (EINVAL); 16949 16950 /* 16951 * Get ipif with id 0. We are writer on the from ill. So we can pass 16952 * NULLs for the last 4 args and we know the lookup won't fail 16953 * with EINPROGRESS. 
16954 */ 16955 ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, 16956 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, 16957 ALL_ZONES, NULL, NULL, NULL, NULL); 16958 ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, 16959 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, 16960 ALL_ZONES, NULL, NULL, NULL, NULL); 16961 16962 if (ipif_v4 == NULL && ipif_v6 == NULL) 16963 return (ENXIO); 16964 16965 if (ipif_v4 != NULL) { 16966 ASSERT(ipif_v4->ipif_refcnt != 0); 16967 if (ipif_v4->ipif_id != 0) { 16968 err = EINVAL; 16969 goto done; 16970 } 16971 16972 ASSERT(IAM_WRITER_IPIF(ipif_v4)); 16973 *ill_from_v4 = ipif_v4->ipif_ill; 16974 } 16975 16976 if (ipif_v6 != NULL) { 16977 ASSERT(ipif_v6->ipif_refcnt != 0); 16978 if (ipif_v6->ipif_id != 0) { 16979 err = EINVAL; 16980 goto done; 16981 } 16982 16983 ASSERT(IAM_WRITER_IPIF(ipif_v6)); 16984 *ill_from_v6 = ipif_v6->ipif_ill; 16985 } 16986 16987 err = 0; 16988 dst_index = lifr->lifr_movetoindex; 16989 *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, 16990 q, mp, ip_process_ioctl, &err); 16991 if (err != 0) { 16992 /* 16993 * There could be only v6. 16994 */ 16995 if (err != ENXIO) 16996 goto done; 16997 err = 0; 16998 } 16999 17000 *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, 17001 q, mp, ip_process_ioctl, &err); 17002 if (err != 0) { 17003 if (err != ENXIO) 17004 goto done; 17005 if (*ill_to_v4 == NULL) { 17006 err = ENXIO; 17007 goto done; 17008 } 17009 err = 0; 17010 } 17011 17012 /* 17013 * If we have something to MOVE i.e "from" not NULL, 17014 * "to" should be non-NULL. 17015 */ 17016 if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || 17017 (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { 17018 err = EINVAL; 17019 } 17020 17021 done: 17022 if (ipif_v4 != NULL) 17023 ipif_refrele(ipif_v4); 17024 if (ipif_v6 != NULL) 17025 ipif_refrele(ipif_v6); 17026 return (err); 17027 } 17028 17029 /* 17030 * FAILOVER and FAILBACK are modelled as MOVE operations. 
17031 * 17032 * We don't check whether the MOVE is within the same group or 17033 * not, because this ioctl can be used as a generic mechanism 17034 * to failover from interface A to B, though things will function 17035 * only if they are really part of the same group. Moreover, 17036 * all ipifs may be down and hence temporarily out of the group. 17037 * 17038 * ipif's that need to be moved are first brought down; V4 ipifs are brought 17039 * down first and then V6. For each we wait for the ipif's to become quiescent. 17040 * Bringing down the ipifs ensures that all ires pointing to these ipifs's 17041 * have been deleted and there are no active references. Once quiescent the 17042 * ipif's are moved and brought up on the new ill. 17043 * 17044 * Normally the source ill and destination ill belong to the same IPMP group 17045 * and hence the same ipsq_t. In the event they don't belong to the same 17046 * same group the two ipsq's are first merged into one ipsq - that of the 17047 * to_ill. The multicast memberships on the source and destination ill cannot 17048 * change during the move operation since multicast joins/leaves also have to 17049 * execute on the same ipsq and are hence serialized. 17050 */ 17051 /* ARGSUSED */ 17052 int 17053 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17054 ip_ioctl_cmd_t *ipip, void *ifreq) 17055 { 17056 ill_t *ill_to_v4 = NULL; 17057 ill_t *ill_to_v6 = NULL; 17058 ill_t *ill_from_v4 = NULL; 17059 ill_t *ill_from_v6 = NULL; 17060 int err = 0; 17061 17062 /* 17063 * setup from and to ill's, we can get EINPROGRESS only for 17064 * to_ill's. 17065 */ 17066 err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, 17067 &ill_to_v4, &ill_to_v6); 17068 17069 if (err != 0) { 17070 ip0dbg(("ip_sioctl_move: extract args failed\n")); 17071 goto done; 17072 } 17073 17074 /* 17075 * nothing to do. 
17076 */ 17077 if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { 17078 goto done; 17079 } 17080 17081 /* 17082 * nothing to do. 17083 */ 17084 if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { 17085 goto done; 17086 } 17087 17088 /* 17089 * Mark the ill as changing. 17090 * ILL_CHANGING flag is cleared when the ipif's are brought up 17091 * in ill_up_ipifs in case of error they are cleared below. 17092 */ 17093 17094 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 17095 if (ill_from_v4 != NULL) 17096 ill_from_v4->ill_state_flags |= ILL_CHANGING; 17097 if (ill_from_v6 != NULL) 17098 ill_from_v6->ill_state_flags |= ILL_CHANGING; 17099 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 17100 17101 /* 17102 * Make sure that both src and dst are 17103 * in the same syncq group. If not make it happen. 17104 * We are not holding any locks because we are the writer 17105 * on the from_ipsq and we will hold locks in ill_merge_groups 17106 * to protect to_ipsq against changing. 17107 */ 17108 if (ill_from_v4 != NULL) { 17109 if (ill_from_v4->ill_phyint->phyint_ipsq != 17110 ill_to_v4->ill_phyint->phyint_ipsq) { 17111 err = ill_merge_groups(ill_from_v4, ill_to_v4, 17112 NULL, mp, q); 17113 goto err_ret; 17114 17115 } 17116 ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); 17117 } else { 17118 17119 if (ill_from_v6->ill_phyint->phyint_ipsq != 17120 ill_to_v6->ill_phyint->phyint_ipsq) { 17121 err = ill_merge_groups(ill_from_v6, ill_to_v6, 17122 NULL, mp, q); 17123 goto err_ret; 17124 17125 } 17126 ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); 17127 } 17128 17129 /* 17130 * Now that the ipsq's have been merged and we are the writer 17131 * lets mark to_ill as changing as well. 
17132 */ 17133 17134 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 17135 if (ill_to_v4 != NULL) 17136 ill_to_v4->ill_state_flags |= ILL_CHANGING; 17137 if (ill_to_v6 != NULL) 17138 ill_to_v6->ill_state_flags |= ILL_CHANGING; 17139 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 17140 17141 /* 17142 * Its ok for us to proceed with the move even if 17143 * ill_pending_mp is non null on one of the from ill's as the reply 17144 * should not be looking at the ipif, it should only care about the 17145 * ill itself. 17146 */ 17147 17148 /* 17149 * lets move ipv4 first. 17150 */ 17151 if (ill_from_v4 != NULL) { 17152 ASSERT(IAM_WRITER_ILL(ill_to_v4)); 17153 ill_from_v4->ill_move_in_progress = B_TRUE; 17154 ill_to_v4->ill_move_in_progress = B_TRUE; 17155 ill_to_v4->ill_move_peer = ill_from_v4; 17156 ill_from_v4->ill_move_peer = ill_to_v4; 17157 err = ill_move(ill_from_v4, ill_to_v4, q, mp); 17158 } 17159 17160 /* 17161 * Now lets move ipv6. 17162 */ 17163 if (err == 0 && ill_from_v6 != NULL) { 17164 ASSERT(IAM_WRITER_ILL(ill_to_v6)); 17165 ill_from_v6->ill_move_in_progress = B_TRUE; 17166 ill_to_v6->ill_move_in_progress = B_TRUE; 17167 ill_to_v6->ill_move_peer = ill_from_v6; 17168 ill_from_v6->ill_move_peer = ill_to_v6; 17169 err = ill_move(ill_from_v6, ill_to_v6, q, mp); 17170 } 17171 17172 err_ret: 17173 /* 17174 * EINPROGRESS means we are waiting for the ipif's that need to be 17175 * moved to become quiescent. 17176 */ 17177 if (err == EINPROGRESS) { 17178 goto done; 17179 } 17180 17181 /* 17182 * if err is set ill_up_ipifs will not be called 17183 * lets clear the flags. 17184 */ 17185 17186 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 17187 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 17188 /* 17189 * Some of the clearing may be redundant. But it is simple 17190 * not making any extra checks. 
17191 */ 17192 if (ill_from_v6 != NULL) { 17193 ill_from_v6->ill_move_in_progress = B_FALSE; 17194 ill_from_v6->ill_move_peer = NULL; 17195 ill_from_v6->ill_state_flags &= ~ILL_CHANGING; 17196 } 17197 if (ill_from_v4 != NULL) { 17198 ill_from_v4->ill_move_in_progress = B_FALSE; 17199 ill_from_v4->ill_move_peer = NULL; 17200 ill_from_v4->ill_state_flags &= ~ILL_CHANGING; 17201 } 17202 if (ill_to_v6 != NULL) { 17203 ill_to_v6->ill_move_in_progress = B_FALSE; 17204 ill_to_v6->ill_move_peer = NULL; 17205 ill_to_v6->ill_state_flags &= ~ILL_CHANGING; 17206 } 17207 if (ill_to_v4 != NULL) { 17208 ill_to_v4->ill_move_in_progress = B_FALSE; 17209 ill_to_v4->ill_move_peer = NULL; 17210 ill_to_v4->ill_state_flags &= ~ILL_CHANGING; 17211 } 17212 17213 /* 17214 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. 17215 * Do this always to maintain proper state i.e even in case of errors. 17216 * As phyint_inactive looks at both v4 and v6 interfaces, 17217 * we need not call on both v4 and v6 interfaces. 17218 */ 17219 if (ill_from_v4 != NULL) { 17220 if ((ill_from_v4->ill_phyint->phyint_flags & 17221 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 17222 phyint_inactive(ill_from_v4->ill_phyint); 17223 } 17224 } else if (ill_from_v6 != NULL) { 17225 if ((ill_from_v6->ill_phyint->phyint_flags & 17226 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 17227 phyint_inactive(ill_from_v6->ill_phyint); 17228 } 17229 } 17230 17231 if (ill_to_v4 != NULL) { 17232 if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { 17233 ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 17234 } 17235 } else if (ill_to_v6 != NULL) { 17236 if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { 17237 ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 17238 } 17239 } 17240 17241 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 17242 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 17243 17244 no_err: 17245 /* 17246 * lets bring the interfaces up on the to_ill. 
17247 */ 17248 if (err == 0) { 17249 err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4, 17250 q, mp); 17251 } 17252 17253 if (err == 0) { 17254 if (ill_from_v4 != NULL && ill_to_v4 != NULL) 17255 ilm_send_multicast_reqs(ill_from_v4, ill_to_v4); 17256 17257 if (ill_from_v6 != NULL && ill_to_v6 != NULL) 17258 ilm_send_multicast_reqs(ill_from_v6, ill_to_v6); 17259 } 17260 done: 17261 17262 if (ill_to_v4 != NULL) { 17263 ill_refrele(ill_to_v4); 17264 } 17265 if (ill_to_v6 != NULL) { 17266 ill_refrele(ill_to_v6); 17267 } 17268 17269 return (err); 17270 } 17271 17272 static void 17273 ill_dl_down(ill_t *ill) 17274 { 17275 /* 17276 * The ill is down; unbind but stay attached since we're still 17277 * associated with a PPA. 17278 */ 17279 mblk_t *mp = ill->ill_unbind_mp; 17280 17281 ill->ill_unbind_mp = NULL; 17282 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); 17283 if (mp != NULL) { 17284 ip1dbg(("ill_dl_down: %s (%u) for %s\n", 17285 dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 17286 ill->ill_name)); 17287 mutex_enter(&ill->ill_lock); 17288 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS; 17289 mutex_exit(&ill->ill_lock); 17290 ill_dlpi_send(ill, mp); 17291 } 17292 17293 /* 17294 * Toss all of our multicast memberships. We could keep them, but 17295 * then we'd have to do bookkeeping of any joins and leaves performed 17296 * by the application while the the interface is down (we can't just 17297 * issue them because arp cannot currently process AR_ENTRY_SQUERY's 17298 * on a downed interface). 
17299 */ 17300 ill_leave_multicast(ill); 17301 17302 mutex_enter(&ill->ill_lock); 17303 ill->ill_dl_up = 0; 17304 mutex_exit(&ill->ill_lock); 17305 } 17306 17307 void 17308 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 17309 { 17310 union DL_primitives *dlp; 17311 t_uscalar_t prim; 17312 17313 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 17314 17315 dlp = (union DL_primitives *)mp->b_rptr; 17316 prim = dlp->dl_primitive; 17317 17318 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 17319 dlpi_prim_str(prim), prim, ill->ill_name)); 17320 17321 switch (prim) { 17322 case DL_PHYS_ADDR_REQ: 17323 { 17324 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 17325 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 17326 break; 17327 } 17328 case DL_BIND_REQ: 17329 mutex_enter(&ill->ill_lock); 17330 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 17331 mutex_exit(&ill->ill_lock); 17332 break; 17333 } 17334 17335 ill->ill_dlpi_pending = prim; 17336 17337 /* 17338 * Some drivers send M_FLUSH up to IP as part of unbind 17339 * request. When this M_FLUSH is sent back to the driver, 17340 * this can go after we send the detach request if the 17341 * M_FLUSH ends up in IP's syncq. To avoid that, we reply 17342 * to the M_FLUSH in ip_rput and locally generate another 17343 * M_FLUSH for the correctness. This will get freed in 17344 * ip_wput_nondata. 17345 */ 17346 if (prim == DL_UNBIND_REQ) 17347 (void) putnextctl1(ill->ill_rq, M_FLUSH, FLUSHRW); 17348 17349 putnext(ill->ill_wq, mp); 17350 } 17351 17352 /* 17353 * Send a DLPI control message to the driver but make sure there 17354 * is only one outstanding message. Uses ill_dlpi_pending to tell 17355 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 17356 * when an ACK or a NAK is received to process the next queued message. 17357 * 17358 * We don't protect ill_dlpi_pending with any lock. 
This is okay as 17359 * every place where its accessed, ip is exclusive while accessing 17360 * ill_dlpi_pending except when this function is called from ill_init() 17361 */ 17362 void 17363 ill_dlpi_send(ill_t *ill, mblk_t *mp) 17364 { 17365 mblk_t **mpp; 17366 17367 ASSERT(IAM_WRITER_ILL(ill)); 17368 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 17369 17370 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 17371 /* Must queue message. Tail insertion */ 17372 mpp = &ill->ill_dlpi_deferred; 17373 while (*mpp != NULL) 17374 mpp = &((*mpp)->b_next); 17375 17376 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 17377 ill->ill_name)); 17378 17379 *mpp = mp; 17380 return; 17381 } 17382 17383 ill_dlpi_dispatch(ill, mp); 17384 } 17385 17386 /* 17387 * Called when an DLPI control message has been acked or nacked to 17388 * send down the next queued message (if any). 17389 */ 17390 void 17391 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 17392 { 17393 mblk_t *mp; 17394 17395 ASSERT(IAM_WRITER_ILL(ill)); 17396 17397 ASSERT(prim != DL_PRIM_INVAL); 17398 if (ill->ill_dlpi_pending != prim) { 17399 if (ill->ill_dlpi_pending == DL_PRIM_INVAL) { 17400 (void) mi_strlog(ill->ill_rq, 1, 17401 SL_CONSOLE|SL_ERROR|SL_TRACE, 17402 "ill_dlpi_done: unsolicited ack for %s from %s\n", 17403 dlpi_prim_str(prim), ill->ill_name); 17404 } else { 17405 (void) mi_strlog(ill->ill_rq, 1, 17406 SL_CONSOLE|SL_ERROR|SL_TRACE, 17407 "ill_dlpi_done: unexpected ack for %s from %s " 17408 "(expecting ack for %s)\n", 17409 dlpi_prim_str(prim), ill->ill_name, 17410 dlpi_prim_str(ill->ill_dlpi_pending)); 17411 } 17412 return; 17413 } 17414 17415 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 17416 dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 17417 17418 if ((mp = ill->ill_dlpi_deferred) == NULL) { 17419 ill->ill_dlpi_pending = DL_PRIM_INVAL; 17420 return; 17421 } 17422 17423 ill->ill_dlpi_deferred = mp->b_next; 17424 mp->b_next = NULL; 17425 17426 
ill_dlpi_dispatch(ill, mp); 17427 } 17428 17429 void 17430 conn_delete_ire(conn_t *connp, caddr_t arg) 17431 { 17432 ipif_t *ipif = (ipif_t *)arg; 17433 ire_t *ire; 17434 17435 /* 17436 * Look at the cached ires on conns which has pointers to ipifs. 17437 * We just call ire_refrele which clears up the reference 17438 * to ire. Called when a conn closes. Also called from ipif_free 17439 * to cleanup indirect references to the stale ipif via the cached ire. 17440 */ 17441 mutex_enter(&connp->conn_lock); 17442 ire = connp->conn_ire_cache; 17443 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { 17444 connp->conn_ire_cache = NULL; 17445 mutex_exit(&connp->conn_lock); 17446 IRE_REFRELE_NOTR(ire); 17447 return; 17448 } 17449 mutex_exit(&connp->conn_lock); 17450 17451 } 17452 17453 /* 17454 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number 17455 * of IREs. Those IREs may have been previously cached in the conn structure. 17456 * This ipcl_walk() walker function releases all references to such IREs based 17457 * on the condemned flag. 17458 */ 17459 /* ARGSUSED */ 17460 void 17461 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) 17462 { 17463 ire_t *ire; 17464 17465 mutex_enter(&connp->conn_lock); 17466 ire = connp->conn_ire_cache; 17467 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { 17468 connp->conn_ire_cache = NULL; 17469 mutex_exit(&connp->conn_lock); 17470 IRE_REFRELE_NOTR(ire); 17471 return; 17472 } 17473 mutex_exit(&connp->conn_lock); 17474 } 17475 17476 /* 17477 * Take down a specific interface, but don't lose any information about it. 17478 * Also delete interface from its interface group (ifgrp). 17479 * (Always called as writer.) 17480 * This function goes through the down sequence even if the interface is 17481 * already down. There are 2 reasons. 17482 * a. Currently we permit interface routes that depend on down interfaces 17483 * to be added. This behaviour itself is questionable. 
However it appears 17484 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 17485 * time. We go thru the cleanup in order to remove these routes. 17486 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 17487 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 17488 * down, but we need to cleanup i.e. do ill_dl_down and 17489 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 17490 * 17491 * IP-MT notes: 17492 * 17493 * Model of reference to interfaces. 17494 * 17495 * The following members in ipif_t track references to the ipif. 17496 * int ipif_refcnt; Active reference count 17497 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif 17498 * The following members in ill_t track references to the ill. 17499 * int ill_refcnt; active refcnt 17500 * uint_t ill_ire_cnt; Number of ires referencing ill 17501 * uint_t ill_nce_cnt; Number of nces referencing ill 17502 * 17503 * Reference to an ipif or ill can be obtained in any of the following ways. 17504 * 17505 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 17506 * Pointers to ipif / ill from other data structures viz ire and conn. 17507 * Implicit reference to the ipif / ill by holding a reference to the ire. 17508 * 17509 * The ipif/ill lookup functions return a reference held ipif / ill. 17510 * ipif_refcnt and ill_refcnt track the reference counts respectively. 17511 * This is a purely dynamic reference count associated with threads holding 17512 * references to the ipif / ill. Pointers from other structures do not 17513 * count towards this reference count. 17514 * 17515 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the 17516 * ipif/ill. This is incremented whenever a new ire is created referencing the 17517 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is 17518 * actually added to the ire hash table. The count is decremented in 17519 * ire_inactive where the ire is destroyed.
17520 * 17521 * nce's reference ill's thru nce_ill and the count of nce's associated with 17522 * an ill is recorded in ill_nce_cnt. This is incremented atomically in 17523 * ndp_add() where the nce is actually added to the table. Similarly it is 17524 * decremented in ndp_inactive where the nce is destroyed. 17525 * 17526 * Flow of ioctls involving interface down/up 17527 * 17528 * The following is the sequence of an attempt to set some critical flags on an 17529 * up interface. 17530 * ip_sioctl_flags 17531 * ipif_down 17532 * wait for ipif to be quiescent 17533 * ipif_down_tail 17534 * ip_sioctl_flags_tail 17535 * 17536 * All set ioctls that involve down/up sequence would have a skeleton similar 17537 * to the above. All the *tail functions are called after the refcounts have 17538 * dropped to the appropriate values. 17539 * 17540 * The mechanism to quiesce an ipif is as follows. 17541 * 17542 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed 17543 * on the ipif. Callers either pass a flag requesting wait or the lookup 17544 * functions will return NULL. 17545 * 17546 * Delete all ires referencing this ipif 17547 * 17548 * Any thread attempting to do an ipif_refhold on an ipif that has been 17549 * obtained thru a cached pointer will first make sure that 17550 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then 17551 * increment the refcount. 17552 * 17553 * The above guarantees that the ipif refcount will eventually come down to 17554 * zero and the ipif will quiesce, once all threads that currently hold a 17555 * reference to the ipif refrelease the ipif. The ipif is quiescent after the 17556 * ipif_refcount has dropped to zero and all ire's associated with this ipif 17557 * have also been ire_inactive'd. i.e. when ipif_ire_cnt and ipif_refcnt both 17558 * drop to zero. 17559 * 17560 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval. 
17561 * 17562 * Threads trying to lookup an ipif or ill can pass a flag requesting 17563 * wait and restart if the ipif / ill cannot be looked up currently. 17564 * For eg. bind, and route operations (Eg. route add / delete) cannot return 17565 * failure if the ipif is currently undergoing an exclusive operation, and 17566 * hence pass the flag. The mblk is then enqueued in the ipsq and the operation 17567 * is restarted by ipsq_exit() when the currently exclusive ioctl completes. 17568 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The 17569 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 17570 * change while the ill_lock is held. Before dropping the ill_lock we acquire 17571 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 17572 * until we release the ipsq_lock, even though the ill/ipif state flags 17573 * can change after we drop the ill_lock. 17574 * 17575 * An attempt to send out a packet using an ipif that is currently 17576 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this 17577 * operation and restart it later when the exclusive condition on the ipif ends. 17578 * This is an example of not passing the wait flag to the lookup functions. For 17579 * example an attempt to refhold and use conn->conn_multicast_ipif and send 17580 * out a multicast packet on that ipif will fail while the ipif is 17581 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is 17582 * currently IPIF_CHANGING will also fail.
17583 */ 17584 int 17585 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 17586 { 17587 ill_t *ill = ipif->ipif_ill; 17588 phyint_t *phyi; 17589 conn_t *connp; 17590 boolean_t success; 17591 boolean_t ipif_was_up = B_FALSE; 17592 17593 ASSERT(IAM_WRITER_IPIF(ipif)); 17594 17595 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 17596 17597 if (ipif->ipif_flags & IPIF_UP) { 17598 mutex_enter(&ill->ill_lock); 17599 ipif->ipif_flags &= ~IPIF_UP; 17600 ASSERT(ill->ill_ipif_up_count > 0); 17601 --ill->ill_ipif_up_count; 17602 mutex_exit(&ill->ill_lock); 17603 ipif_was_up = B_TRUE; 17604 /* Update status in SCTP's list */ 17605 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 17606 } 17607 17608 /* 17609 * Blow away v6 memberships we established in ipif_multicast_up(); the 17610 * v4 ones are left alone (as is the ipif_multicast_up flag, so we 17611 * know not to rejoin when the interface is brought back up). 17612 */ 17613 if (ipif->ipif_isv6) 17614 ipif_multicast_down(ipif); 17615 /* 17616 * Remove from the mapping for __sin6_src_id. We insert only 17617 * when the address is not INADDR_ANY. As IPv4 addresses are 17618 * stored as mapped addresses, we need to check for mapped 17619 * INADDR_ANY also. 17620 */ 17621 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 17622 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 17623 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 17624 int err; 17625 17626 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 17627 ipif->ipif_zoneid); 17628 if (err != 0) { 17629 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 17630 } 17631 } 17632 17633 /* 17634 * Before we delete the ill from the group (if any), we need 17635 * to make sure that we delete all the routes dependent on 17636 * this and also any ipifs dependent on this ipif for 17637 * source address. We need to do before we delete from 17638 * the group because 17639 * 17640 * 1) ipif_down_delete_ire de-references ill->ill_group. 
17641 * 17642 * 2) ipif_update_other_ipifs needs to walk the whole group 17643 * for re-doing source address selection. Note that 17644 * ipif_select_source[_v6] called from 17645 * ipif_update_other_ipifs[_v6] will not pick this ipif 17646 * because we have already marked down here i.e cleared 17647 * IPIF_UP. 17648 */ 17649 if (ipif->ipif_isv6) 17650 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES); 17651 else 17652 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES); 17653 17654 /* 17655 * Need to add these also to be saved and restored when the 17656 * ipif is brought down and up 17657 */ 17658 mutex_enter(&ire_mrtun_lock); 17659 if (ire_mrtun_count != 0) { 17660 mutex_exit(&ire_mrtun_lock); 17661 ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire, 17662 (char *)ipif, NULL); 17663 } else { 17664 mutex_exit(&ire_mrtun_lock); 17665 } 17666 17667 mutex_enter(&ire_srcif_table_lock); 17668 if (ire_srcif_table_count > 0) { 17669 mutex_exit(&ire_srcif_table_lock); 17670 ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif); 17671 } else { 17672 mutex_exit(&ire_srcif_table_lock); 17673 } 17674 17675 /* 17676 * Cleaning up the conn_ire_cache or conns must be done only after the 17677 * ires have been deleted above. Otherwise a thread could end up 17678 * caching an ire in a conn after we have finished the cleanup of the 17679 * conn. The caching is done after making sure that the ire is not yet 17680 * condemned. Also documented in the block comment above ip_output 17681 */ 17682 ipcl_walk(conn_cleanup_stale_ire, NULL); 17683 /* Also, delete the ires cached in SCTP */ 17684 sctp_ire_cache_flush(ipif); 17685 17686 /* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */ 17687 nattymod_clean_ipif(ipif); 17688 17689 /* 17690 * Update any other ipifs which have used "our" local address as 17691 * a source address. This entails removing and recreating IRE_INTERFACE 17692 * entries for such ipifs. 
17693 */ 17694 if (ipif->ipif_isv6) 17695 ipif_update_other_ipifs_v6(ipif, ill->ill_group); 17696 else 17697 ipif_update_other_ipifs(ipif, ill->ill_group); 17698 17699 if (ipif_was_up) { 17700 /* 17701 * Check whether it is last ipif to leave this group. 17702 * If this is the last ipif to leave, we should remove 17703 * this ill from the group as ipif_select_source will not 17704 * be able to find any useful ipifs if this ill is selected 17705 * for load balancing. 17706 * 17707 * For nameless groups, we should call ifgrp_delete if this 17708 * belongs to some group. As this ipif is going down, we may 17709 * need to reconstruct groups. 17710 */ 17711 phyi = ill->ill_phyint; 17712 /* 17713 * If the phyint_groupname_len is 0, it may or may not 17714 * be in the nameless group. If the phyint_groupname_len is 17715 * not 0, then this ill should be part of some group. 17716 * As we always insert this ill in the group if 17717 * phyint_groupname_len is not zero when the first ipif 17718 * comes up (in ipif_up_done), it should be in a group 17719 * when the namelen is not 0. 17720 * 17721 * NOTE : When we delete the ill from the group,it will 17722 * blow away all the IRE_CACHES pointing either at this ipif or 17723 * ill_wq (illgrp_cache_delete does this). Thus, no IRES 17724 * should be pointing at this ill. 17725 */ 17726 ASSERT(phyi->phyint_groupname_len == 0 || 17727 (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); 17728 17729 if (phyi->phyint_groupname_len != 0) { 17730 if (ill->ill_ipif_up_count == 0) 17731 illgrp_delete(ill); 17732 } 17733 17734 /* 17735 * If we have deleted some of the broadcast ires associated 17736 * with this ipif, we need to re-nominate somebody else if 17737 * the ires that we deleted were the nominated ones. 17738 */ 17739 if (ill->ill_group != NULL && !ill->ill_isv6) 17740 ipif_renominate_bcast(ipif); 17741 } 17742 17743 /* 17744 * neighbor-discovery or arp entries for this interface. 
17745 */ 17746 ipif_ndp_down(ipif); 17747 17748 /* 17749 * If mp is NULL the caller will wait for the appropriate refcnt. 17750 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down 17751 * and ill_delete -> ipif_free -> ipif_down 17752 */ 17753 if (mp == NULL) { 17754 ASSERT(q == NULL); 17755 return (0); 17756 } 17757 17758 if (CONN_Q(q)) { 17759 connp = Q_TO_CONN(q); 17760 mutex_enter(&connp->conn_lock); 17761 } else { 17762 connp = NULL; 17763 } 17764 mutex_enter(&ill->ill_lock); 17765 /* 17766 * Are there any ire's pointing to this ipif that are still active ? 17767 * If this is the last ipif going down, are there any ire's pointing 17768 * to this ill that are still active ? 17769 */ 17770 if (ipif_is_quiescent(ipif)) { 17771 mutex_exit(&ill->ill_lock); 17772 if (connp != NULL) 17773 mutex_exit(&connp->conn_lock); 17774 return (0); 17775 } 17776 17777 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 17778 ill->ill_name, (void *)ill)); 17779 /* 17780 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 17781 * drops down, the operation will be restarted by ipif_ill_refrele_tail 17782 * which in turn is called by the last refrele on the ipif/ill/ire. 17783 */ 17784 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 17785 if (!success) { 17786 /* The conn is closing. So just return */ 17787 ASSERT(connp != NULL); 17788 mutex_exit(&ill->ill_lock); 17789 mutex_exit(&connp->conn_lock); 17790 return (EINTR); 17791 } 17792 17793 mutex_exit(&ill->ill_lock); 17794 if (connp != NULL) 17795 mutex_exit(&connp->conn_lock); 17796 return (EINPROGRESS); 17797 } 17798 17799 static void 17800 ipif_down_tail(ipif_t *ipif) 17801 { 17802 ill_t *ill = ipif->ipif_ill; 17803 17804 /* 17805 * Skip any loopback interface (null wq). 17806 * If this is the last logical interface on the ill 17807 * have ill_dl_down tell the driver we are gone (unbind) 17808 * Note that lun 0 can ipif_down even though 17809 * there are other logical units that are up. 
17810 * This occurs e.g. when we change a "significant" IFF_ flag. 17811 */ 17812 if (ipif->ipif_ill->ill_wq != NULL) { 17813 if (!ill->ill_logical_down && (ill->ill_ipif_up_count == 0) && 17814 ill->ill_dl_up) { 17815 ill_dl_down(ill); 17816 } 17817 } 17818 ill->ill_logical_down = 0; 17819 17820 /* 17821 * Have to be after removing the routes in ipif_down_delete_ire. 17822 */ 17823 if (ipif->ipif_isv6) { 17824 if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) 17825 ipif_arp_down(ipif); 17826 } else { 17827 ipif_arp_down(ipif); 17828 } 17829 17830 ip_rts_ifmsg(ipif); 17831 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); 17832 } 17833 17834 /* 17835 * Bring interface logically down without bringing the physical interface 17836 * down e.g. when the netmask is changed. This avoids long lasting link 17837 * negotiations between an ethernet interface and a certain switches. 17838 */ 17839 static int 17840 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 17841 { 17842 /* 17843 * The ill_logical_down flag is a transient flag. It is set here 17844 * and is cleared once the down has completed in ipif_down_tail. 17845 * This flag does not indicate whether the ill stream is in the 17846 * DL_BOUND state with the driver. Instead this flag is used by 17847 * ipif_down_tail to determine whether to DL_UNBIND the stream with 17848 * the driver. The state of the ill stream i.e. whether it is 17849 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 17850 */ 17851 ipif->ipif_ill->ill_logical_down = 1; 17852 return (ipif_down(ipif, q, mp)); 17853 } 17854 17855 /* 17856 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 17857 * If the usesrc client ILL is already part of a usesrc group or not, 17858 * in either case a ire_stq with the matching usesrc client ILL will 17859 * locate the IRE's that need to be deleted. We want IREs to be created 17860 * with the new source address. 
17861 */ 17862 static void 17863 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 17864 { 17865 ill_t *ucill = (ill_t *)ill_arg; 17866 17867 ASSERT(IAM_WRITER_ILL(ucill)); 17868 17869 if (ire->ire_stq == NULL) 17870 return; 17871 17872 if ((ire->ire_type == IRE_CACHE) && 17873 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 17874 ire_delete(ire); 17875 } 17876 17877 /* 17878 * ire_walk routine to delete every IRE dependent on the interface 17879 * address that is going down. (Always called as writer.) 17880 * Works for both v4 and v6. 17881 * In addition for checking for ire_ipif matches it also checks for 17882 * IRE_CACHE entries which have the same source address as the 17883 * disappearing ipif since ipif_select_source might have picked 17884 * that source. Note that ipif_down/ipif_update_other_ipifs takes 17885 * care of any IRE_INTERFACE with the disappearing source address. 17886 */ 17887 static void 17888 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 17889 { 17890 ipif_t *ipif = (ipif_t *)ipif_arg; 17891 ill_t *ire_ill; 17892 ill_t *ipif_ill; 17893 17894 ASSERT(IAM_WRITER_IPIF(ipif)); 17895 if (ire->ire_ipif == NULL) 17896 return; 17897 17898 /* 17899 * For IPv4, we derive source addresses for an IRE from ipif's 17900 * belonging to the same IPMP group as the IRE's outgoing 17901 * interface. If an IRE's outgoing interface isn't in the 17902 * same IPMP group as a particular ipif, then that ipif 17903 * couldn't have been used as a source address for this IRE. 17904 * 17905 * For IPv6, source addresses are only restricted to the IPMP group 17906 * if the IRE is for a link-local address or a multicast address. 17907 * Otherwise, source addresses for an IRE can be chosen from 17908 * interfaces other than the the outgoing interface for that IRE. 17909 * 17910 * For source address selection details, see ipif_select_source() 17911 * and ipif_select_source_v6(). 
17912 */ 17913 if (ire->ire_ipversion == IPV4_VERSION || 17914 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || 17915 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 17916 ire_ill = ire->ire_ipif->ipif_ill; 17917 ipif_ill = ipif->ipif_ill; 17918 17919 if (ire_ill->ill_group != ipif_ill->ill_group) { 17920 return; 17921 } 17922 } 17923 17924 17925 if (ire->ire_ipif != ipif) { 17926 /* 17927 * Look for a matching source address. 17928 */ 17929 if (ire->ire_type != IRE_CACHE) 17930 return; 17931 if (ipif->ipif_flags & IPIF_NOLOCAL) 17932 return; 17933 17934 if (ire->ire_ipversion == IPV4_VERSION) { 17935 if (ire->ire_src_addr != ipif->ipif_src_addr) 17936 return; 17937 } else { 17938 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 17939 &ipif->ipif_v6lcl_addr)) 17940 return; 17941 } 17942 ire_delete(ire); 17943 return; 17944 } 17945 /* 17946 * ire_delete() will do an ire_flush_cache which will delete 17947 * all ire_ipif matches 17948 */ 17949 ire_delete(ire); 17950 } 17951 17952 /* 17953 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 17954 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 17955 * 2) when an interface is brought up or down (on that ill). 17956 * This ensures that the IRE_CACHE entries don't retain stale source 17957 * address selection results. 17958 */ 17959 void 17960 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 17961 { 17962 ill_t *ill = (ill_t *)ill_arg; 17963 ill_t *ipif_ill; 17964 17965 ASSERT(IAM_WRITER_ILL(ill)); 17966 /* 17967 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 17968 * Hence this should be IRE_CACHE. 17969 */ 17970 ASSERT(ire->ire_type == IRE_CACHE); 17971 17972 /* 17973 * We are called for IRE_CACHES whose ire_ipif matches ill. 17974 * We are only interested in IRE_CACHES that has borrowed 17975 * the source address from ill_arg e.g. ipif_up_done[_v6] 17976 * for which we need to look at ire_ipif->ipif_ill match 17977 * with ill. 
17978 */ 17979 ASSERT(ire->ire_ipif != NULL); 17980 ipif_ill = ire->ire_ipif->ipif_ill; 17981 if (ipif_ill == ill || (ill->ill_group != NULL && 17982 ipif_ill->ill_group == ill->ill_group)) { 17983 ire_delete(ire); 17984 } 17985 } 17986 17987 /* 17988 * Delete all the ire whose stq references ill_arg. 17989 */ 17990 static void 17991 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 17992 { 17993 ill_t *ill = (ill_t *)ill_arg; 17994 ill_t *ire_ill; 17995 17996 ASSERT(IAM_WRITER_ILL(ill)); 17997 /* 17998 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 17999 * Hence this should be IRE_CACHE. 18000 */ 18001 ASSERT(ire->ire_type == IRE_CACHE); 18002 18003 /* 18004 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18005 * matches ill. We are only interested in IRE_CACHES that 18006 * has ire_stq->q_ptr pointing at ill_arg. Thus we do the 18007 * filtering here. 18008 */ 18009 ire_ill = (ill_t *)ire->ire_stq->q_ptr; 18010 18011 if (ire_ill == ill) 18012 ire_delete(ire); 18013 } 18014 18015 /* 18016 * This is called when an ill leaves the group. We want to delete 18017 * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is 18018 * pointing at ill. 18019 */ 18020 static void 18021 illgrp_cache_delete(ire_t *ire, char *ill_arg) 18022 { 18023 ill_t *ill = (ill_t *)ill_arg; 18024 18025 ASSERT(IAM_WRITER_ILL(ill)); 18026 ASSERT(ill->ill_group == NULL); 18027 /* 18028 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18029 * Hence this should be IRE_CACHE. 18030 */ 18031 ASSERT(ire->ire_type == IRE_CACHE); 18032 /* 18033 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18034 * matches ill. We are interested in both. 18035 */ 18036 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || 18037 (ire->ire_ipif->ipif_ill == ill)); 18038 18039 ire_delete(ire); 18040 } 18041 18042 /* 18043 * Initiate deallocate of an IPIF. Always called as writer. Called by 18044 * ill_delete or ip_sioctl_removeif. 
18045 */ 18046 static void 18047 ipif_free(ipif_t *ipif) 18048 { 18049 ASSERT(IAM_WRITER_IPIF(ipif)); 18050 18051 /* Remove conn references */ 18052 reset_conn_ipif(ipif); 18053 18054 /* 18055 * Make sure we have valid net and subnet broadcast ire's for the 18056 * other ipif's which share them with this ipif. 18057 */ 18058 if (!ipif->ipif_isv6) 18059 ipif_check_bcast_ires(ipif); 18060 18061 /* 18062 * Take down the interface. We can be called either from ill_delete 18063 * or from ip_sioctl_removeif. 18064 */ 18065 (void) ipif_down(ipif, NULL, NULL); 18066 18067 rw_enter(&ill_g_lock, RW_WRITER); 18068 /* Remove pointers to this ill in the multicast routing tables */ 18069 reset_mrt_vif_ipif(ipif); 18070 rw_exit(&ill_g_lock); 18071 } 18072 18073 static void 18074 ipif_free_tail(ipif_t *ipif) 18075 { 18076 mblk_t *mp; 18077 ipif_t **ipifp; 18078 18079 /* 18080 * Free state for addition IRE_IF_[NO]RESOLVER ire's. 18081 */ 18082 mutex_enter(&ipif->ipif_saved_ire_lock); 18083 mp = ipif->ipif_saved_ire_mp; 18084 ipif->ipif_saved_ire_mp = NULL; 18085 mutex_exit(&ipif->ipif_saved_ire_lock); 18086 freemsg(mp); 18087 18088 /* 18089 * Need to hold both ill_g_lock and ill_lock while 18090 * inserting or removing an ipif from the linked list 18091 * of ipifs hanging off the ill. 18092 */ 18093 rw_enter(&ill_g_lock, RW_WRITER); 18094 /* 18095 * Remove all multicast memberships on the interface now. 18096 * This removes IPv4 multicast memberships joined within 18097 * the kernel as ipif_down does not do ipif_multicast_down 18098 * for IPv4. IPv6 is not handled here as the multicast memberships 18099 * are based on ill and not on ipif. 18100 */ 18101 ilm_free(ipif); 18102 18103 /* 18104 * Since we held the ill_g_lock while doing the ilm_free above, 18105 * we can assert the ilms were really deleted and not just marked 18106 * ILM_DELETED. 
18107 */ 18108 ASSERT(ilm_walk_ipif(ipif) == 0); 18109 18110 18111 IPIF_TRACE_CLEANUP(ipif); 18112 18113 /* Ask SCTP to take it out of it list */ 18114 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 18115 18116 mutex_enter(&ipif->ipif_ill->ill_lock); 18117 /* Get it out of the ILL interface list. */ 18118 ipifp = &ipif->ipif_ill->ill_ipif; 18119 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 18120 if (*ipifp == ipif) { 18121 *ipifp = ipif->ipif_next; 18122 break; 18123 } 18124 } 18125 18126 mutex_exit(&ipif->ipif_ill->ill_lock); 18127 rw_exit(&ill_g_lock); 18128 18129 mutex_destroy(&ipif->ipif_saved_ire_lock); 18130 /* Free the memory. */ 18131 mi_free((char *)ipif); 18132 } 18133 18134 /* 18135 * Returns an ipif name in the form "ill_name/unit" if ipif_id is not zero, 18136 * "ill_name" otherwise. 18137 */ 18138 char * 18139 ipif_get_name(const ipif_t *ipif, char *buf, int len) 18140 { 18141 char lbuf[32]; 18142 char *name; 18143 size_t name_len; 18144 18145 buf[0] = '\0'; 18146 if (!ipif) 18147 return (buf); 18148 name = ipif->ipif_ill->ill_name; 18149 name_len = ipif->ipif_ill->ill_name_length; 18150 if (ipif->ipif_id != 0) { 18151 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 18152 ipif->ipif_id); 18153 name = lbuf; 18154 name_len = mi_strlen(name) + 1; 18155 } 18156 len -= 1; 18157 buf[len] = '\0'; 18158 len = MIN(len, name_len); 18159 bcopy(name, buf, len); 18160 return (buf); 18161 } 18162 18163 /* 18164 * Find an IPIF based on the name passed in. Names can be of the 18165 * form <phys> (e.g., le0), <phys>:<#> (e.g., le0:1), 18166 * The <phys> string can have forms like <dev><#> (e.g., le0), 18167 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3). 18168 * When there is no colon, the implied unit id is zero. <phys> must 18169 * correspond to the name of an ILL. (May be called as writer.) 
18170 */ 18171 static ipif_t * 18172 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 18173 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 18174 mblk_t *mp, ipsq_func_t func, int *error) 18175 { 18176 char *cp; 18177 char *endp; 18178 long id; 18179 ill_t *ill; 18180 ipif_t *ipif; 18181 uint_t ire_type; 18182 boolean_t did_alloc = B_FALSE; 18183 ipsq_t *ipsq; 18184 18185 if (error != NULL) 18186 *error = 0; 18187 18188 /* 18189 * If the caller wants to us to create the ipif, make sure we have a 18190 * valid zoneid 18191 */ 18192 ASSERT(!do_alloc || zoneid != ALL_ZONES); 18193 18194 if (namelen == 0) { 18195 if (error != NULL) 18196 *error = ENXIO; 18197 return (NULL); 18198 } 18199 18200 *exists = B_FALSE; 18201 /* Look for a colon in the name. */ 18202 endp = &name[namelen]; 18203 for (cp = endp; --cp > name; ) { 18204 if (*cp == IPIF_SEPARATOR_CHAR) 18205 break; 18206 } 18207 18208 if (*cp == IPIF_SEPARATOR_CHAR) { 18209 /* 18210 * Reject any non-decimal aliases for logical 18211 * interfaces. Aliases with leading zeroes 18212 * are also rejected as they introduce ambiguity 18213 * in the naming of the interfaces. 18214 * In order to confirm with existing semantics, 18215 * and to not break any programs/script relying 18216 * on that behaviour, if<0>:0 is considered to be 18217 * a valid interface. 18218 * 18219 * If alias has two or more digits and the first 18220 * is zero, fail. 18221 */ 18222 if (&cp[2] < endp && cp[1] == '0') 18223 return (NULL); 18224 } 18225 18226 if (cp <= name) { 18227 cp = endp; 18228 } else { 18229 *cp = '\0'; 18230 } 18231 18232 /* 18233 * Look up the ILL, based on the portion of the name 18234 * before the slash. ill_lookup_on_name returns a held ill. 18235 * Temporary to check whether ill exists already. If so 18236 * ill_lookup_on_name will clear it. 
18237 */ 18238 ill = ill_lookup_on_name(name, do_alloc, isv6, 18239 q, mp, func, error, &did_alloc); 18240 if (cp != endp) 18241 *cp = IPIF_SEPARATOR_CHAR; 18242 if (ill == NULL) 18243 return (NULL); 18244 18245 /* Establish the unit number in the name. */ 18246 id = 0; 18247 if (cp < endp && *endp == '\0') { 18248 /* If there was a colon, the unit number follows. */ 18249 cp++; 18250 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 18251 ill_refrele(ill); 18252 if (error != NULL) 18253 *error = ENXIO; 18254 return (NULL); 18255 } 18256 } 18257 18258 GRAB_CONN_LOCK(q); 18259 mutex_enter(&ill->ill_lock); 18260 /* Now see if there is an IPIF with this unit number. */ 18261 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 18262 if (ipif->ipif_id == id) { 18263 if (zoneid != ALL_ZONES && 18264 zoneid != ipif->ipif_zoneid && 18265 ipif->ipif_zoneid != ALL_ZONES) { 18266 mutex_exit(&ill->ill_lock); 18267 RELEASE_CONN_LOCK(q); 18268 ill_refrele(ill); 18269 if (error != NULL) 18270 *error = ENXIO; 18271 return (NULL); 18272 } 18273 /* 18274 * The block comment at the start of ipif_down 18275 * explains the use of the macros used below 18276 */ 18277 if (IPIF_CAN_LOOKUP(ipif)) { 18278 ipif_refhold_locked(ipif); 18279 mutex_exit(&ill->ill_lock); 18280 if (!did_alloc) 18281 *exists = B_TRUE; 18282 /* 18283 * Drop locks before calling ill_refrele 18284 * since it can potentially call into 18285 * ipif_ill_refrele_tail which can end up 18286 * in trying to acquire any lock. 
18287 */ 18288 RELEASE_CONN_LOCK(q); 18289 ill_refrele(ill); 18290 return (ipif); 18291 } else if (IPIF_CAN_WAIT(ipif, q)) { 18292 ipsq = ill->ill_phyint->phyint_ipsq; 18293 mutex_enter(&ipsq->ipsq_lock); 18294 mutex_exit(&ill->ill_lock); 18295 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 18296 mutex_exit(&ipsq->ipsq_lock); 18297 RELEASE_CONN_LOCK(q); 18298 ill_refrele(ill); 18299 *error = EINPROGRESS; 18300 return (NULL); 18301 } 18302 } 18303 } 18304 RELEASE_CONN_LOCK(q); 18305 18306 if (!do_alloc) { 18307 mutex_exit(&ill->ill_lock); 18308 ill_refrele(ill); 18309 if (error != NULL) 18310 *error = ENXIO; 18311 return (NULL); 18312 } 18313 18314 /* 18315 * If none found, atomically allocate and return a new one. 18316 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 18317 * to support "receive only" use of lo0:1 etc. as is still done 18318 * below as an initial guess. 18319 * However, this is now likely to be overriden later in ipif_up_done() 18320 * when we know for sure what address has been configured on the 18321 * interface, since we might have more than one loopback interface 18322 * with a loopback address, e.g. in the case of zones, and all the 18323 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 18324 */ 18325 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 18326 ire_type = IRE_LOOPBACK; 18327 else 18328 ire_type = IRE_LOCAL; 18329 ipif = ipif_allocate(ill, id, ire_type, B_TRUE); 18330 if (ipif != NULL) 18331 ipif_refhold_locked(ipif); 18332 else if (error != NULL) 18333 *error = ENOMEM; 18334 mutex_exit(&ill->ill_lock); 18335 ill_refrele(ill); 18336 return (ipif); 18337 } 18338 18339 /* 18340 * This routine is called whenever a new address comes up on an ipif. If 18341 * we are configured to respond to address mask requests, then we are supposed 18342 * to broadcast an address mask reply at this time. This routine is also 18343 * called if we are already up, but a netmask change is made. 
This is legal 18344 * but might not make the system manager very popular. (May be called 18345 * as writer.) 18346 */ 18347 static void 18348 ipif_mask_reply(ipif_t *ipif) 18349 { 18350 icmph_t *icmph; 18351 ipha_t *ipha; 18352 mblk_t *mp; 18353 18354 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 18355 18356 if (!ip_respond_to_address_mask_broadcast) 18357 return; 18358 18359 /* ICMP mask reply is IPv4 only */ 18360 ASSERT(!ipif->ipif_isv6); 18361 /* ICMP mask reply is not for a loopback interface */ 18362 ASSERT(ipif->ipif_ill->ill_wq != NULL); 18363 18364 mp = allocb(REPLY_LEN, BPRI_HI); 18365 if (mp == NULL) 18366 return; 18367 mp->b_wptr = mp->b_rptr + REPLY_LEN; 18368 18369 ipha = (ipha_t *)mp->b_rptr; 18370 bzero(ipha, REPLY_LEN); 18371 *ipha = icmp_ipha; 18372 ipha->ipha_ttl = ip_broadcast_ttl; 18373 ipha->ipha_src = ipif->ipif_src_addr; 18374 ipha->ipha_dst = ipif->ipif_brd_addr; 18375 ipha->ipha_length = htons(REPLY_LEN); 18376 ipha->ipha_ident = 0; 18377 18378 icmph = (icmph_t *)&ipha[1]; 18379 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 18380 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 18381 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 18382 if (icmph->icmph_checksum == 0) 18383 icmph->icmph_checksum = 0xffff; 18384 18385 put(ipif->ipif_wq, mp); 18386 18387 #undef REPLY_LEN 18388 } 18389 18390 /* 18391 * When the mtu in the ipif changes, we call this routine through ire_walk 18392 * to update all the relevant IREs. 18393 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 18394 */ 18395 static void 18396 ipif_mtu_change(ire_t *ire, char *ipif_arg) 18397 { 18398 ipif_t *ipif = (ipif_t *)ipif_arg; 18399 18400 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 18401 return; 18402 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 18403 } 18404 18405 /* 18406 * When the mtu in the ill changes, we call this routine through ire_walk 18407 * to update all the relevant IREs. 
18408 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 18409 */ 18410 void 18411 ill_mtu_change(ire_t *ire, char *ill_arg) 18412 { 18413 ill_t *ill = (ill_t *)ill_arg; 18414 18415 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 18416 return; 18417 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 18418 } 18419 18420 /* 18421 * Join the ipif specific multicast groups. 18422 * Must be called after a mapping has been set up in the resolver. (Always 18423 * called as writer.) 18424 */ 18425 void 18426 ipif_multicast_up(ipif_t *ipif) 18427 { 18428 int err, index; 18429 ill_t *ill; 18430 18431 ASSERT(IAM_WRITER_IPIF(ipif)); 18432 18433 ill = ipif->ipif_ill; 18434 index = ill->ill_phyint->phyint_ifindex; 18435 18436 ip1dbg(("ipif_multicast_up\n")); 18437 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 18438 return; 18439 18440 if (ipif->ipif_isv6) { 18441 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 18442 return; 18443 18444 /* Join the all hosts multicast address */ 18445 ip1dbg(("ipif_multicast_up - addmulti\n")); 18446 /* 18447 * Passing B_TRUE means we have to join the multicast 18448 * membership on this interface even though this is 18449 * FAILED. If we join on a different one in the group, 18450 * we will not be able to delete the membership later 18451 * as we currently don't track where we join when we 18452 * join within the kernel unlike applications where 18453 * we have ilg/ilg_orig_index. See ip_addmulti_v6 18454 * for more on this. 
18455 */ 18456 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, 18457 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 18458 if (err != 0) { 18459 ip0dbg(("ipif_multicast_up: " 18460 "all_hosts_mcast failed %d\n", 18461 err)); 18462 return; 18463 } 18464 /* 18465 * Enable multicast for the solicited node multicast address 18466 */ 18467 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 18468 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 18469 18470 ipv6_multi.s6_addr32[3] |= 18471 ipif->ipif_v6lcl_addr.s6_addr32[3]; 18472 18473 err = ip_addmulti_v6(&ipv6_multi, ill, index, 18474 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, 18475 NULL); 18476 if (err != 0) { 18477 ip0dbg(("ipif_multicast_up: solicited MC" 18478 " failed %d\n", err)); 18479 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, 18480 ill, ill->ill_phyint->phyint_ifindex, 18481 ipif->ipif_zoneid, B_TRUE, B_TRUE); 18482 return; 18483 } 18484 } 18485 } else { 18486 if (ipif->ipif_lcl_addr == INADDR_ANY) 18487 return; 18488 18489 /* Join the all hosts multicast address */ 18490 ip1dbg(("ipif_multicast_up - addmulti\n")); 18491 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 18492 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 18493 if (err) { 18494 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 18495 return; 18496 } 18497 } 18498 ipif->ipif_multicast_up = 1; 18499 } 18500 18501 /* 18502 * Blow away any IPv6 multicast groups that we joined in ipif_multicast_up(); 18503 * any explicit memberships are blown away in ill_leave_multicast() when the 18504 * ill is brought down. 18505 */ 18506 static void 18507 ipif_multicast_down(ipif_t *ipif) 18508 { 18509 int err; 18510 18511 ASSERT(IAM_WRITER_IPIF(ipif)); 18512 18513 ip1dbg(("ipif_multicast_down\n")); 18514 if (!ipif->ipif_multicast_up) 18515 return; 18516 18517 ASSERT(ipif->ipif_isv6); 18518 18519 ip1dbg(("ipif_multicast_down - delmulti\n")); 18520 18521 /* 18522 * Leave the all hosts multicast address. 
Similar to ip_addmulti_v6, 18523 * we should look for ilms on this ill rather than the ones that have 18524 * been failed over here. They are here temporarily. As 18525 * ipif_multicast_up has joined on this ill, we should delete only 18526 * from this ill. 18527 */ 18528 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 18529 ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, 18530 B_TRUE, B_TRUE); 18531 if (err != 0) { 18532 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", 18533 err)); 18534 } 18535 /* 18536 * Disable multicast for the solicited node multicast address 18537 */ 18538 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 18539 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 18540 18541 ipv6_multi.s6_addr32[3] |= 18542 ipif->ipif_v6lcl_addr.s6_addr32[3]; 18543 18544 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 18545 ipif->ipif_ill->ill_phyint->phyint_ifindex, 18546 ipif->ipif_zoneid, B_TRUE, B_TRUE); 18547 18548 if (err != 0) { 18549 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 18550 err)); 18551 } 18552 } 18553 18554 ipif->ipif_multicast_up = 0; 18555 } 18556 18557 /* 18558 * Used when an interface comes up to recreate any extra routes on this 18559 * interface. 
18560 */ 18561 static ire_t ** 18562 ipif_recover_ire(ipif_t *ipif) 18563 { 18564 mblk_t *mp; 18565 ire_t **ipif_saved_irep; 18566 ire_t **irep; 18567 18568 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 18569 ipif->ipif_id)); 18570 18571 mutex_enter(&ipif->ipif_saved_ire_lock); 18572 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 18573 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 18574 if (ipif_saved_irep == NULL) { 18575 mutex_exit(&ipif->ipif_saved_ire_lock); 18576 return (NULL); 18577 } 18578 18579 irep = ipif_saved_irep; 18580 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 18581 ire_t *ire; 18582 queue_t *rfq; 18583 queue_t *stq; 18584 ifrt_t *ifrt; 18585 uchar_t *src_addr; 18586 uchar_t *gateway_addr; 18587 mblk_t *resolver_mp; 18588 ushort_t type; 18589 18590 /* 18591 * When the ire was initially created and then added in 18592 * ip_rt_add(), it was created either using ipif->ipif_net_type 18593 * in the case of a traditional interface route, or as one of 18594 * the IRE_OFFSUBNET types (with the exception of 18595 * IRE_HOST_REDIRECT which is created by icmp_redirect() and 18596 * which we don't need to save or recover). In the case where 18597 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 18598 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 18599 * to satisfy software like GateD and Sun Cluster which creates 18600 * routes using the the loopback interface's address as a 18601 * gateway. 
18602 * 18603 * As ifrt->ifrt_type reflects the already updated ire_type and 18604 * since ire_create() expects that IRE_IF_NORESOLVER will have 18605 * a valid nce_res_mp field (which doesn't make sense for a 18606 * IRE_LOOPBACK), ire_create() will be called in the same way 18607 * here as in ip_rt_add(), namely using ipif->ipif_net_type when 18608 * the route looks like a traditional interface route (where 18609 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 18610 * the saved ifrt->ifrt_type. This means that in the case where 18611 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 18612 * ire_create() will be an IRE_LOOPBACK, it will then be turned 18613 * into an IRE_IF_NORESOLVER and then added by ire_add(). 18614 */ 18615 ifrt = (ifrt_t *)mp->b_rptr; 18616 if (ifrt->ifrt_type & IRE_INTERFACE) { 18617 rfq = NULL; 18618 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 18619 ? ipif->ipif_rq : ipif->ipif_wq; 18620 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 18621 ? (uint8_t *)&ifrt->ifrt_src_addr 18622 : (uint8_t *)&ipif->ipif_src_addr; 18623 gateway_addr = NULL; 18624 resolver_mp = ipif->ipif_resolver_mp; 18625 type = ipif->ipif_net_type; 18626 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 18627 /* Recover multiroute broadcast IRE. */ 18628 rfq = ipif->ipif_rq; 18629 stq = ipif->ipif_wq; 18630 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 18631 ? (uint8_t *)&ifrt->ifrt_src_addr 18632 : (uint8_t *)&ipif->ipif_src_addr; 18633 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 18634 resolver_mp = ipif->ipif_bcast_mp; 18635 type = ifrt->ifrt_type; 18636 } else { 18637 rfq = NULL; 18638 stq = NULL; 18639 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 18640 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 18641 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 18642 resolver_mp = NULL; 18643 type = ifrt->ifrt_type; 18644 } 18645 18646 /* 18647 * Create a copy of the IRE with the saved address and netmask. 
18648 */ 18649 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 18650 "0x%x/0x%x\n", 18651 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 18652 ntohl(ifrt->ifrt_addr), 18653 ntohl(ifrt->ifrt_mask))); 18654 ire = ire_create( 18655 (uint8_t *)&ifrt->ifrt_addr, 18656 (uint8_t *)&ifrt->ifrt_mask, 18657 src_addr, 18658 gateway_addr, 18659 NULL, 18660 &ifrt->ifrt_max_frag, 18661 NULL, 18662 rfq, 18663 stq, 18664 type, 18665 resolver_mp, 18666 ipif, 18667 NULL, 18668 0, 18669 0, 18670 0, 18671 ifrt->ifrt_flags, 18672 &ifrt->ifrt_iulp_info, 18673 NULL, 18674 NULL); 18675 18676 if (ire == NULL) { 18677 mutex_exit(&ipif->ipif_saved_ire_lock); 18678 kmem_free(ipif_saved_irep, 18679 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 18680 return (NULL); 18681 } 18682 18683 /* 18684 * Some software (for example, GateD and Sun Cluster) attempts 18685 * to create (what amount to) IRE_PREFIX routes with the 18686 * loopback address as the gateway. This is primarily done to 18687 * set up prefixes with the RTF_REJECT flag set (for example, 18688 * when generating aggregate routes.) 18689 * 18690 * If the IRE type (as defined by ipif->ipif_net_type) is 18691 * IRE_LOOPBACK, then we map the request into a 18692 * IRE_IF_NORESOLVER. 18693 */ 18694 if (ipif->ipif_net_type == IRE_LOOPBACK) 18695 ire->ire_type = IRE_IF_NORESOLVER; 18696 /* 18697 * ire held by ire_add, will be refreled' towards the 18698 * the end of ipif_up_done 18699 */ 18700 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 18701 *irep = ire; 18702 irep++; 18703 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 18704 } 18705 mutex_exit(&ipif->ipif_saved_ire_lock); 18706 return (ipif_saved_irep); 18707 } 18708 18709 /* 18710 * Used to set the netmask and broadcast address to default values when the 18711 * interface is brought up. (Always called as writer.) 
18712 */ 18713 static void 18714 ipif_set_default(ipif_t *ipif) 18715 { 18716 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 18717 18718 if (!ipif->ipif_isv6) { 18719 /* 18720 * Interface holds an IPv4 address. Default 18721 * mask is the natural netmask. 18722 */ 18723 if (!ipif->ipif_net_mask) { 18724 ipaddr_t v4mask; 18725 18726 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 18727 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 18728 } 18729 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 18730 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 18731 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 18732 } else { 18733 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 18734 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 18735 } 18736 /* 18737 * NOTE: SunOS 4.X does this even if the broadcast address 18738 * has been already set thus we do the same here. 18739 */ 18740 if (ipif->ipif_flags & IPIF_BROADCAST) { 18741 ipaddr_t v4addr; 18742 18743 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 18744 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 18745 } 18746 } else { 18747 /* 18748 * Interface holds an IPv6-only address. Default 18749 * mask is all-ones. 18750 */ 18751 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 18752 ipif->ipif_v6net_mask = ipv6_all_ones; 18753 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 18754 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 18755 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 18756 } else { 18757 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 18758 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 18759 } 18760 } 18761 } 18762 18763 /* 18764 * Return 0 if this address can be used as local address without causing 18765 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 18766 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 18767 * Special checks are needed to allow the same IPv6 link-local address 18768 * on different ills. 18769 * TODO: allowing the same site-local address on different ill's. 
18770 */ 18771 int 18772 ip_addr_availability_check(ipif_t *new_ipif) 18773 { 18774 in6_addr_t our_v6addr; 18775 ill_t *ill; 18776 ipif_t *ipif; 18777 ill_walk_context_t ctx; 18778 18779 ASSERT(IAM_WRITER_IPIF(new_ipif)); 18780 ASSERT(MUTEX_HELD(&ip_addr_avail_lock)); 18781 ASSERT(RW_READ_HELD(&ill_g_lock)); 18782 18783 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 18784 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 18785 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 18786 return (0); 18787 18788 our_v6addr = new_ipif->ipif_v6lcl_addr; 18789 18790 if (new_ipif->ipif_isv6) 18791 ill = ILL_START_WALK_V6(&ctx); 18792 else 18793 ill = ILL_START_WALK_V4(&ctx); 18794 18795 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18796 for (ipif = ill->ill_ipif; ipif != NULL; 18797 ipif = ipif->ipif_next) { 18798 if ((ipif == new_ipif) || 18799 !(ipif->ipif_flags & IPIF_UP) || 18800 (ipif->ipif_flags & IPIF_UNNUMBERED)) 18801 continue; 18802 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 18803 &our_v6addr)) { 18804 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 18805 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 18806 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 18807 ipif->ipif_flags |= IPIF_UNNUMBERED; 18808 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && 18809 new_ipif->ipif_ill != ill) 18810 continue; 18811 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && 18812 new_ipif->ipif_ill != ill) 18813 continue; 18814 else if (new_ipif->ipif_zoneid != 18815 ipif->ipif_zoneid && 18816 ipif->ipif_zoneid != ALL_ZONES && 18817 (ill->ill_phyint->phyint_flags & 18818 PHYI_LOOPBACK)) 18819 continue; 18820 else if (new_ipif->ipif_ill == ill) 18821 return (EADDRINUSE); 18822 else 18823 return (EADDRNOTAVAIL); 18824 } 18825 } 18826 } 18827 18828 return (0); 18829 } 18830 18831 /* 18832 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 18833 * IREs for the ipif. 
18834 * When the routine returns EINPROGRESS then mp has been consumed and 18835 * the ioctl will be acked from ip_rput_dlpi. 18836 */ 18837 static int 18838 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 18839 { 18840 ill_t *ill = ipif->ipif_ill; 18841 boolean_t isv6 = ipif->ipif_isv6; 18842 int err = 0; 18843 boolean_t success; 18844 18845 ASSERT(IAM_WRITER_IPIF(ipif)); 18846 18847 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 18848 18849 /* Shouldn't get here if it is already up. */ 18850 if (ipif->ipif_flags & IPIF_UP) 18851 return (EALREADY); 18852 18853 /* Skip arp/ndp for any loopback interface. */ 18854 if (ill->ill_wq != NULL) { 18855 conn_t *connp = Q_TO_CONN(q); 18856 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 18857 18858 if (!ill->ill_dl_up) { 18859 /* 18860 * ill_dl_up is not yet set. i.e. we are yet to 18861 * DL_BIND with the driver and this is the first 18862 * logical interface on the ill to become "up". 18863 * Tell the driver to get going (via DL_BIND_REQ). 18864 * Note that changing "significant" IFF_ flags 18865 * address/netmask etc cause a down/up dance, but 18866 * does not cause an unbind (DL_UNBIND) with the driver 18867 */ 18868 return (ill_dl_up(ill, ipif, mp, q)); 18869 } 18870 18871 /* 18872 * ipif_resolver_up may end up sending an 18873 * AR_INTERFACE_UP message to ARP, which would, in 18874 * turn send a DLPI message to the driver. ioctls are 18875 * serialized and so we cannot send more than one 18876 * interface up message at a time. If ipif_resolver_up 18877 * does send an interface up message to ARP, we get 18878 * EINPROGRESS and we will complete in ip_arp_done. 
18879 */ 18880 18881 ASSERT(connp != NULL); 18882 ASSERT(ipsq->ipsq_pending_mp == NULL); 18883 mutex_enter(&connp->conn_lock); 18884 mutex_enter(&ill->ill_lock); 18885 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 18886 mutex_exit(&ill->ill_lock); 18887 mutex_exit(&connp->conn_lock); 18888 if (!success) 18889 return (EINTR); 18890 18891 /* 18892 * Crank up IPv6 neighbor discovery 18893 * Unlike ARP, this should complete when 18894 * ipif_ndp_up returns. However, for 18895 * ILLF_XRESOLV interfaces we also send a 18896 * AR_INTERFACE_UP to the external resolver. 18897 * That ioctl will complete in ip_rput. 18898 */ 18899 if (isv6) { 18900 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr, 18901 B_FALSE); 18902 if (err != 0) { 18903 mp = ipsq_pending_mp_get(ipsq, &connp); 18904 return (err); 18905 } 18906 } 18907 /* Now, ARP */ 18908 if ((err = ipif_resolver_up(ipif, B_FALSE)) == 18909 EINPROGRESS) { 18910 /* We will complete it in ip_arp_done */ 18911 return (err); 18912 } 18913 mp = ipsq_pending_mp_get(ipsq, &connp); 18914 ASSERT(mp != NULL); 18915 if (err != 0) 18916 return (err); 18917 } 18918 return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 18919 } 18920 18921 /* 18922 * Perform a bind for the physical device. 18923 * When the routine returns EINPROGRESS then mp has been consumed and 18924 * the ioctl will be acked from ip_rput_dlpi. 18925 * Allocate an unbind message and save it until ipif_down. 
18926 */ 18927 static int 18928 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 18929 { 18930 mblk_t *areq_mp = NULL; 18931 mblk_t *bind_mp = NULL; 18932 mblk_t *unbind_mp = NULL; 18933 conn_t *connp; 18934 boolean_t success; 18935 18936 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 18937 ASSERT(IAM_WRITER_ILL(ill)); 18938 18939 ASSERT(mp != NULL); 18940 18941 /* Create a resolver cookie for ARP */ 18942 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 18943 areq_t *areq; 18944 uint16_t sap_addr; 18945 18946 areq_mp = ill_arp_alloc(ill, 18947 (uchar_t *)&ip_areq_template, 0); 18948 if (areq_mp == NULL) { 18949 return (ENOMEM); 18950 } 18951 freemsg(ill->ill_resolver_mp); 18952 ill->ill_resolver_mp = areq_mp; 18953 areq = (areq_t *)areq_mp->b_rptr; 18954 sap_addr = ill->ill_sap; 18955 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 18956 /* 18957 * Wait till we call ill_pending_mp_add to determine 18958 * the success before we free the ill_resolver_mp and 18959 * attach areq_mp in it's place. 18960 */ 18961 } 18962 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 18963 DL_BIND_REQ); 18964 if (bind_mp == NULL) 18965 goto bad; 18966 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 18967 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 18968 18969 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 18970 if (unbind_mp == NULL) 18971 goto bad; 18972 18973 /* 18974 * Record state needed to complete this operation when the 18975 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 
18976 */ 18977 if (WR(q)->q_next == NULL) { 18978 connp = Q_TO_CONN(q); 18979 mutex_enter(&connp->conn_lock); 18980 } else { 18981 connp = NULL; 18982 } 18983 mutex_enter(&ipif->ipif_ill->ill_lock); 18984 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 18985 mutex_exit(&ipif->ipif_ill->ill_lock); 18986 if (connp != NULL) 18987 mutex_exit(&connp->conn_lock); 18988 if (!success) 18989 goto bad; 18990 18991 /* 18992 * Save the unbind message for ill_dl_down(); it will be consumed when 18993 * the interface goes down. 18994 */ 18995 ASSERT(ill->ill_unbind_mp == NULL); 18996 ill->ill_unbind_mp = unbind_mp; 18997 18998 ill_dlpi_send(ill, bind_mp); 18999 /* Send down link-layer capabilities probe if not already done. */ 19000 ill_capability_probe(ill); 19001 19002 /* 19003 * Sysid used to rely on the fact that netboots set domainname 19004 * and the like. Now that miniroot boots aren't strictly netboots 19005 * and miniroot network configuration is driven from userland 19006 * these things still need to be set. This situation can be detected 19007 * by comparing the interface being configured here to the one 19008 * dhcack was set to reference by the boot loader. Once sysid is 19009 * converted to use dhcp_ipc_getinfo() this call can go away. 19010 */ 19011 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) && 19012 (strcmp(ill->ill_name, dhcack) == 0) && 19013 (strlen(srpc_domain) == 0)) { 19014 if (dhcpinit() != 0) 19015 cmn_err(CE_WARN, "no cached dhcp response"); 19016 } 19017 19018 /* 19019 * This operation will complete in ip_rput_dlpi with either 19020 * a DL_BIND_ACK or DL_ERROR_ACK. 19021 */ 19022 return (EINPROGRESS); 19023 bad: 19024 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 19025 /* 19026 * We don't have to check for possible removal from illgrp 19027 * as we have not yet inserted in illgrp. 
For groups 19028 * without names, this ipif is still not UP and hence 19029 * this could not have possibly had any influence in forming 19030 * groups. 19031 */ 19032 19033 if (bind_mp != NULL) 19034 freemsg(bind_mp); 19035 if (unbind_mp != NULL) 19036 freemsg(unbind_mp); 19037 return (ENOMEM); 19038 } 19039 19040 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 19041 19042 /* 19043 * DLPI and ARP is up. 19044 * Create all the IREs associated with an interface bring up multicast. 19045 * Set the interface flag and finish other initialization 19046 * that potentially had to be differed to after DL_BIND_ACK. 19047 */ 19048 int 19049 ipif_up_done(ipif_t *ipif) 19050 { 19051 ire_t *ire_array[20]; 19052 ire_t **irep = ire_array; 19053 ire_t **irep1; 19054 ipaddr_t net_mask = 0; 19055 ipaddr_t subnet_mask, route_mask; 19056 ill_t *ill = ipif->ipif_ill; 19057 queue_t *stq; 19058 ipif_t *src_ipif; 19059 ipif_t *tmp_ipif; 19060 boolean_t flush_ire_cache = B_TRUE; 19061 int err = 0; 19062 phyint_t *phyi; 19063 ire_t **ipif_saved_irep = NULL; 19064 int ipif_saved_ire_cnt; 19065 int cnt; 19066 boolean_t src_ipif_held = B_FALSE; 19067 boolean_t ire_added = B_FALSE; 19068 boolean_t loopback = B_FALSE; 19069 19070 ip1dbg(("ipif_up_done(%s:%u)\n", 19071 ipif->ipif_ill->ill_name, ipif->ipif_id)); 19072 /* Check if this is a loopback interface */ 19073 if (ipif->ipif_ill->ill_wq == NULL) 19074 loopback = B_TRUE; 19075 19076 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19077 /* 19078 * If all other interfaces for this ill are down or DEPRECATED, 19079 * or otherwise unsuitable for source address selection, remove 19080 * any IRE_CACHE entries for this ill to make sure source 19081 * address selection gets to take this new ipif into account. 
19082 * No need to hold ill_lock while traversing the ipif list since 19083 * we are writer 19084 */ 19085 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 19086 tmp_ipif = tmp_ipif->ipif_next) { 19087 if (((tmp_ipif->ipif_flags & 19088 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 19089 !(tmp_ipif->ipif_flags & IPIF_UP)) || 19090 (tmp_ipif == ipif)) 19091 continue; 19092 /* first useable pre-existing interface */ 19093 flush_ire_cache = B_FALSE; 19094 break; 19095 } 19096 if (flush_ire_cache) 19097 ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, 19098 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 19099 19100 /* 19101 * Figure out which way the send-to queue should go. Only 19102 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 19103 * should show up here. 19104 */ 19105 switch (ill->ill_net_type) { 19106 case IRE_IF_RESOLVER: 19107 stq = ill->ill_rq; 19108 break; 19109 case IRE_IF_NORESOLVER: 19110 case IRE_LOOPBACK: 19111 stq = ill->ill_wq; 19112 break; 19113 default: 19114 return (EINVAL); 19115 } 19116 19117 if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) { 19118 /* 19119 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 19120 * ipif_lookup_on_name(), but in the case of zones we can have 19121 * several loopback addresses on lo0. So all the interfaces with 19122 * loopback addresses need to be marked IRE_LOOPBACK. 19123 */ 19124 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 19125 htonl(INADDR_LOOPBACK)) 19126 ipif->ipif_ire_type = IRE_LOOPBACK; 19127 else 19128 ipif->ipif_ire_type = IRE_LOCAL; 19129 } 19130 19131 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { 19132 /* 19133 * Can't use our source address. 
Select a different 19134 * source address for the IRE_INTERFACE and IRE_LOCAL 19135 */ 19136 src_ipif = ipif_select_source(ipif->ipif_ill, 19137 ipif->ipif_subnet, ipif->ipif_zoneid); 19138 if (src_ipif == NULL) 19139 src_ipif = ipif; /* Last resort */ 19140 else 19141 src_ipif_held = B_TRUE; 19142 } else { 19143 src_ipif = ipif; 19144 } 19145 19146 /* Create all the IREs associated with this interface */ 19147 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 19148 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 19149 19150 /* 19151 * If we're on a labeled system then make sure that zone- 19152 * private addresses have proper remote host database entries. 19153 */ 19154 if (is_system_labeled() && 19155 ipif->ipif_ire_type != IRE_LOOPBACK && 19156 !tsol_check_interface_address(ipif)) 19157 return (EINVAL); 19158 19159 /* Register the source address for __sin6_src_id */ 19160 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 19161 ipif->ipif_zoneid); 19162 if (err != 0) { 19163 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 19164 return (err); 19165 } 19166 19167 /* If the interface address is set, create the local IRE. */ 19168 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 19169 (void *)ipif, 19170 ipif->ipif_ire_type, 19171 ntohl(ipif->ipif_lcl_addr))); 19172 *irep++ = ire_create( 19173 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 19174 (uchar_t *)&ip_g_all_ones, /* mask */ 19175 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 19176 NULL, /* no gateway */ 19177 NULL, 19178 &ip_loopback_mtuplus, /* max frag size */ 19179 NULL, 19180 ipif->ipif_rq, /* recv-from queue */ 19181 NULL, /* no send-to queue */ 19182 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 19183 NULL, 19184 ipif, 19185 NULL, 19186 0, 19187 0, 19188 0, 19189 (ipif->ipif_flags & IPIF_PRIVATE) ? 
19190 RTF_PRIVATE : 0, 19191 &ire_uinfo_null, 19192 NULL, 19193 NULL); 19194 } else { 19195 ip1dbg(( 19196 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 19197 ipif->ipif_ire_type, 19198 ntohl(ipif->ipif_lcl_addr), 19199 (uint_t)ipif->ipif_flags)); 19200 } 19201 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 19202 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 19203 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 19204 } else { 19205 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 19206 } 19207 19208 subnet_mask = ipif->ipif_net_mask; 19209 19210 /* 19211 * If mask was not specified, use natural netmask of 19212 * interface address. Also, store this mask back into the 19213 * ipif struct. 19214 */ 19215 if (subnet_mask == 0) { 19216 subnet_mask = net_mask; 19217 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 19218 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 19219 ipif->ipif_v6subnet); 19220 } 19221 19222 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 19223 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 19224 ipif->ipif_subnet != INADDR_ANY) { 19225 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19226 19227 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19228 route_mask = IP_HOST_MASK; 19229 } else { 19230 route_mask = subnet_mask; 19231 } 19232 19233 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 19234 "creating if IRE ill_net_type 0x%x for 0x%x\n", 19235 (void *)ipif, (void *)ill, 19236 ill->ill_net_type, 19237 ntohl(ipif->ipif_subnet))); 19238 *irep++ = ire_create( 19239 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 19240 (uchar_t *)&route_mask, /* mask */ 19241 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 19242 NULL, /* no gateway */ 19243 NULL, 19244 &ipif->ipif_mtu, /* max frag */ 19245 NULL, 19246 NULL, /* no recv queue */ 19247 stq, /* send-to queue */ 19248 ill->ill_net_type, /* IF_[NO]RESOLVER */ 19249 ill->ill_resolver_mp, /* xmit header */ 19250 ipif, 19251 NULL, 19252 0, 19253 0, 19254 0, 19255 
(ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 19256 &ire_uinfo_null, 19257 NULL, 19258 NULL); 19259 } 19260 19261 /* 19262 * If the interface address is set, create the broadcast IREs. 19263 * 19264 * ire_create_bcast checks if the proposed new IRE matches 19265 * any existing IRE's with the same physical interface (ILL). 19266 * This should get rid of duplicates. 19267 * ire_create_bcast also check IPIF_NOXMIT and does not create 19268 * any broadcast ires. 19269 */ 19270 if ((ipif->ipif_subnet != INADDR_ANY) && 19271 (ipif->ipif_flags & IPIF_BROADCAST)) { 19272 ipaddr_t addr; 19273 19274 ip1dbg(("ipif_up_done: creating broadcast IRE\n")); 19275 irep = ire_check_and_create_bcast(ipif, 0, irep, 19276 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19277 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, 19278 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19279 19280 /* 19281 * For backward compatibility, we need to create net 19282 * broadcast ire's based on the old "IP address class 19283 * system." The reason is that some old machines only 19284 * respond to these class derived net broadcast. 19285 * 19286 * But we should not create these net broadcast ire's if 19287 * the subnet_mask is shorter than the IP address class based 19288 * derived netmask. Otherwise, we may create a net 19289 * broadcast address which is the same as an IP address 19290 * on the subnet. Then TCP will refuse to talk to that 19291 * address. 19292 * 19293 * Nor do we need IRE_BROADCAST ire's for the interface 19294 * with the netmask as 0xFFFFFFFF, as IRE_LOCAL for that 19295 * interface is already created. Creating these broadcast 19296 * ire's will only create confusion as the "addr" is going 19297 * to be same as that of the IP address of the interface. 
19298 */ 19299 if (net_mask < subnet_mask) { 19300 addr = net_mask & ipif->ipif_subnet; 19301 irep = ire_check_and_create_bcast(ipif, addr, irep, 19302 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19303 irep = ire_check_and_create_bcast(ipif, 19304 ~net_mask | addr, irep, 19305 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19306 } 19307 19308 if (subnet_mask != 0xFFFFFFFF) { 19309 addr = ipif->ipif_subnet; 19310 irep = ire_check_and_create_bcast(ipif, addr, irep, 19311 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19312 irep = ire_check_and_create_bcast(ipif, 19313 ~subnet_mask|addr, irep, 19314 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19315 } 19316 } 19317 19318 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19319 19320 /* If an earlier ire_create failed, get out now */ 19321 for (irep1 = irep; irep1 > ire_array; ) { 19322 irep1--; 19323 if (*irep1 == NULL) { 19324 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 19325 err = ENOMEM; 19326 goto bad; 19327 } 19328 } 19329 19330 /* 19331 * Need to atomically check for ip_addr_availablity_check 19332 * under ip_addr_avail_lock, and if it fails got bad, and remove 19333 * from group also.The ill_g_lock is grabbed as reader 19334 * just to make sure no new ills or new ipifs are being added 19335 * to the system while we are checking the uniqueness of addresses. 19336 */ 19337 rw_enter(&ill_g_lock, RW_READER); 19338 mutex_enter(&ip_addr_avail_lock); 19339 /* Mark it up, and increment counters. */ 19340 ill->ill_ipif_up_count++; 19341 ipif->ipif_flags |= IPIF_UP; 19342 err = ip_addr_availability_check(ipif); 19343 mutex_exit(&ip_addr_avail_lock); 19344 rw_exit(&ill_g_lock); 19345 19346 if (err != 0) { 19347 /* 19348 * Our address may already be up on the same ill. In this case, 19349 * the ARP entry for our ipif replaced the one for the other 19350 * ipif. So we don't want to delete it (otherwise the other ipif 19351 * would be unable to send packets). 
19352 * ip_addr_availability_check() identifies this case for us and 19353 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 19354 * which is the expected error code. 19355 */ 19356 if (err == EADDRINUSE) { 19357 freemsg(ipif->ipif_arp_del_mp); 19358 ipif->ipif_arp_del_mp = NULL; 19359 err = EADDRNOTAVAIL; 19360 } 19361 ill->ill_ipif_up_count--; 19362 ipif->ipif_flags &= ~IPIF_UP; 19363 goto bad; 19364 } 19365 19366 /* 19367 * Add in all newly created IREs. ire_create_bcast() has 19368 * already checked for duplicates of the IRE_BROADCAST type. 19369 * We want to add before we call ifgrp_insert which wants 19370 * to know whether IRE_IF_RESOLVER exists or not. 19371 * 19372 * NOTE : We refrele the ire though we may branch to "bad" 19373 * later on where we do ire_delete. This is okay 19374 * because nobody can delete it as we are running 19375 * exclusively. 19376 */ 19377 for (irep1 = irep; irep1 > ire_array; ) { 19378 irep1--; 19379 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 19380 /* 19381 * refheld by ire_add. refele towards the end of the func 19382 */ 19383 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 19384 } 19385 ire_added = B_TRUE; 19386 /* 19387 * Form groups if possible. 19388 * 19389 * If we are supposed to be in a ill_group with a name, insert it 19390 * now as we know that at least one ipif is UP. Otherwise form 19391 * nameless groups. 19392 * 19393 * If ip_enable_group_ifs is set and ipif address is not 0, insert 19394 * this ipif into the appropriate interface group, or create a 19395 * new one. If this is already in a nameless group, we try to form 19396 * a bigger group looking at other ills potentially sharing this 19397 * ipif's prefix. 
19398 */ 19399 phyi = ill->ill_phyint; 19400 if (phyi->phyint_groupname_len != 0) { 19401 ASSERT(phyi->phyint_groupname != NULL); 19402 if (ill->ill_ipif_up_count == 1) { 19403 ASSERT(ill->ill_group == NULL); 19404 err = illgrp_insert(&illgrp_head_v4, ill, 19405 phyi->phyint_groupname, NULL, B_TRUE); 19406 if (err != 0) { 19407 ip1dbg(("ipif_up_done: illgrp allocation " 19408 "failed, error %d\n", err)); 19409 goto bad; 19410 } 19411 } 19412 ASSERT(ill->ill_group != NULL); 19413 } 19414 19415 /* 19416 * When this is part of group, we need to make sure that 19417 * any broadcast ires created because of this ipif coming 19418 * UP gets marked/cleared with IRE_MARK_NORECV appropriately 19419 * so that we don't receive duplicate broadcast packets. 19420 */ 19421 if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) 19422 ipif_renominate_bcast(ipif); 19423 19424 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 19425 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 19426 ipif_saved_irep = ipif_recover_ire(ipif); 19427 19428 if (!loopback) { 19429 /* 19430 * If the broadcast address has been set, make sure it makes 19431 * sense based on the interface address. 19432 * Only match on ill since we are sharing broadcast addresses. 19433 */ 19434 if ((ipif->ipif_brd_addr != INADDR_ANY) && 19435 (ipif->ipif_flags & IPIF_BROADCAST)) { 19436 ire_t *ire; 19437 19438 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 19439 IRE_BROADCAST, ipif, ALL_ZONES, 19440 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19441 19442 if (ire == NULL) { 19443 /* 19444 * If there isn't a matching broadcast IRE, 19445 * revert to the default for this netmask. 
19446 */ 19447 ipif->ipif_v6brd_addr = ipv6_all_zeros; 19448 mutex_enter(&ipif->ipif_ill->ill_lock); 19449 ipif_set_default(ipif); 19450 mutex_exit(&ipif->ipif_ill->ill_lock); 19451 } else { 19452 ire_refrele(ire); 19453 } 19454 } 19455 19456 } 19457 19458 19459 /* This is the first interface on this ill */ 19460 if (ipif->ipif_ipif_up_count == 1 && !loopback) { 19461 /* 19462 * Need to recover all multicast memberships in the driver. 19463 * This had to be deferred until we had attached. 19464 */ 19465 ill_recover_multicast(ill); 19466 } 19467 /* Join the allhosts multicast address */ 19468 ipif_multicast_up(ipif); 19469 19470 if (!loopback) { 19471 /* 19472 * See whether anybody else would benefit from the 19473 * new ipif that we added. We call this always rather 19474 * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST 19475 * ipif is for the benefit of illgrp_insert (done above) 19476 * which does not do source address selection as it does 19477 * not want to re-create interface routes that we are 19478 * having reference to it here. 19479 */ 19480 ill_update_source_selection(ill); 19481 } 19482 19483 for (irep1 = irep; irep1 > ire_array; ) { 19484 irep1--; 19485 if (*irep1 != NULL) { 19486 /* was held in ire_add */ 19487 ire_refrele(*irep1); 19488 } 19489 } 19490 19491 cnt = ipif_saved_ire_cnt; 19492 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 19493 if (*irep1 != NULL) { 19494 /* was held in ire_add */ 19495 ire_refrele(*irep1); 19496 } 19497 } 19498 19499 /* 19500 * This had to be deferred until we had bound. 19501 * tell routing sockets that this interface is up 19502 */ 19503 ip_rts_ifmsg(ipif); 19504 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 19505 19506 if (!loopback) { 19507 /* Broadcast an address mask reply. 
*/ 19508 ipif_mask_reply(ipif); 19509 } 19510 if (ipif_saved_irep != NULL) { 19511 kmem_free(ipif_saved_irep, 19512 ipif_saved_ire_cnt * sizeof (ire_t *)); 19513 } 19514 if (src_ipif_held) 19515 ipif_refrele(src_ipif); 19516 /* Let SCTP update the status for this ipif */ 19517 sctp_update_ipif(ipif, SCTP_IPIF_UP); 19518 return (0); 19519 19520 bad: 19521 ip1dbg(("ipif_up_done: FAILED \n")); 19522 /* 19523 * We don't have to bother removing from ill groups because 19524 * 19525 * 1) For groups with names, we insert only when the first ipif 19526 * comes up. In that case if it fails, it will not be in any 19527 * group. So, we need not try to remove for that case. 19528 * 19529 * 2) For groups without names, either we tried to insert ipif_ill 19530 * in a group as singleton or found some other group to become 19531 * a bigger group. For the former, if it fails we don't have 19532 * anything to do as ipif_ill is not in the group and for the 19533 * latter, there are no failures in illgrp_insert/illgrp_delete 19534 * (ENOMEM can't occur for this. Check ifgrp_insert). 19535 */ 19536 while (irep > ire_array) { 19537 irep--; 19538 if (*irep != NULL) { 19539 ire_delete(*irep); 19540 if (ire_added) 19541 ire_refrele(*irep); 19542 } 19543 } 19544 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid); 19545 19546 if (ipif_saved_irep != NULL) { 19547 kmem_free(ipif_saved_irep, 19548 ipif_saved_ire_cnt * sizeof (ire_t *)); 19549 } 19550 if (src_ipif_held) 19551 ipif_refrele(src_ipif); 19552 19553 ipif_arp_down(ipif); 19554 return (err); 19555 } 19556 19557 /* 19558 * Turn off the ARP with the ILLF_NOARP flag. 
19559 */ 19560 static int 19561 ill_arp_off(ill_t *ill) 19562 { 19563 mblk_t *arp_off_mp = NULL; 19564 mblk_t *arp_on_mp = NULL; 19565 19566 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 19567 19568 ASSERT(IAM_WRITER_ILL(ill)); 19569 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 19570 19571 /* 19572 * If the on message is still around we've already done 19573 * an arp_off without doing an arp_on thus there is no 19574 * work needed. 19575 */ 19576 if (ill->ill_arp_on_mp != NULL) 19577 return (0); 19578 19579 /* 19580 * Allocate an ARP on message (to be saved) and an ARP off message 19581 */ 19582 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 19583 if (!arp_off_mp) 19584 return (ENOMEM); 19585 19586 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 19587 if (!arp_on_mp) 19588 goto failed; 19589 19590 ASSERT(ill->ill_arp_on_mp == NULL); 19591 ill->ill_arp_on_mp = arp_on_mp; 19592 19593 /* Send an AR_INTERFACE_OFF request */ 19594 putnext(ill->ill_rq, arp_off_mp); 19595 return (0); 19596 failed: 19597 19598 if (arp_off_mp) 19599 freemsg(arp_off_mp); 19600 return (ENOMEM); 19601 } 19602 19603 /* 19604 * Turn on ARP by turning off the ILLF_NOARP flag. 19605 */ 19606 static int 19607 ill_arp_on(ill_t *ill) 19608 { 19609 mblk_t *mp; 19610 19611 ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); 19612 19613 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 19614 19615 ASSERT(IAM_WRITER_ILL(ill)); 19616 /* 19617 * Send an AR_INTERFACE_ON request if we have already done 19618 * an arp_off (which allocated the message). 19619 */ 19620 if (ill->ill_arp_on_mp != NULL) { 19621 mp = ill->ill_arp_on_mp; 19622 ill->ill_arp_on_mp = NULL; 19623 putnext(ill->ill_rq, mp); 19624 } 19625 return (0); 19626 } 19627 19628 /* 19629 * Called after either deleting ill from the group or when setting 19630 * FAILED or STANDBY on the interface. 
19631 */ 19632 static void 19633 illgrp_reset_schednext(ill_t *ill) 19634 { 19635 ill_group_t *illgrp; 19636 ill_t *save_ill; 19637 19638 ASSERT(IAM_WRITER_ILL(ill)); 19639 /* 19640 * When called from illgrp_delete, ill_group will be non-NULL. 19641 * But when called from ip_sioctl_flags, it could be NULL if 19642 * somebody is setting FAILED/INACTIVE on some interface which 19643 * is not part of a group. 19644 */ 19645 illgrp = ill->ill_group; 19646 if (illgrp == NULL) 19647 return; 19648 if (illgrp->illgrp_ill_schednext != ill) 19649 return; 19650 19651 illgrp->illgrp_ill_schednext = NULL; 19652 save_ill = ill; 19653 /* 19654 * Choose a good ill to be the next one for 19655 * outbound traffic. As the flags FAILED/STANDBY is 19656 * not yet marked when called from ip_sioctl_flags, 19657 * we check for ill separately. 19658 */ 19659 for (ill = illgrp->illgrp_ill; ill != NULL; 19660 ill = ill->ill_group_next) { 19661 if ((ill != save_ill) && 19662 !(ill->ill_phyint->phyint_flags & 19663 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { 19664 illgrp->illgrp_ill_schednext = ill; 19665 return; 19666 } 19667 } 19668 } 19669 19670 /* 19671 * Given an ill, find the next ill in the group to be scheduled. 19672 * (This should be called by ip_newroute() before ire_create().) 19673 * The passed in ill may be pulled out of the group, after we have picked 19674 * up a different outgoing ill from the same group. However ire add will 19675 * atomically check this. 19676 */ 19677 ill_t * 19678 illgrp_scheduler(ill_t *ill) 19679 { 19680 ill_t *retill; 19681 ill_group_t *illgrp; 19682 int illcnt; 19683 int i; 19684 uint64_t flags; 19685 19686 /* 19687 * We don't use a lock to check for the ill_group. If this ill 19688 * is currently being inserted we may end up just returning this 19689 * ill itself. That is ok. 
19690 */ 19691 if (ill->ill_group == NULL) { 19692 ill_refhold(ill); 19693 return (ill); 19694 } 19695 19696 /* 19697 * Grab the ill_g_lock as reader to make sure we are dealing with 19698 * a set of stable ills. No ill can be added or deleted or change 19699 * group while we hold the reader lock. 19700 */ 19701 rw_enter(&ill_g_lock, RW_READER); 19702 if ((illgrp = ill->ill_group) == NULL) { 19703 rw_exit(&ill_g_lock); 19704 ill_refhold(ill); 19705 return (ill); 19706 } 19707 19708 illcnt = illgrp->illgrp_ill_count; 19709 mutex_enter(&illgrp->illgrp_lock); 19710 retill = illgrp->illgrp_ill_schednext; 19711 19712 if (retill == NULL) 19713 retill = illgrp->illgrp_ill; 19714 19715 /* 19716 * We do a circular search beginning at illgrp_ill_schednext 19717 * or illgrp_ill. We don't check the flags against the ill lock 19718 * since it can change anytime. The ire creation will be atomic 19719 * and will fail if the ill is FAILED or OFFLINE. 19720 */ 19721 for (i = 0; i < illcnt; i++) { 19722 flags = retill->ill_phyint->phyint_flags; 19723 19724 if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 19725 ILL_CAN_LOOKUP(retill)) { 19726 illgrp->illgrp_ill_schednext = retill->ill_group_next; 19727 ill_refhold(retill); 19728 break; 19729 } 19730 retill = retill->ill_group_next; 19731 if (retill == NULL) 19732 retill = illgrp->illgrp_ill; 19733 } 19734 mutex_exit(&illgrp->illgrp_lock); 19735 rw_exit(&ill_g_lock); 19736 19737 return (i == illcnt ? NULL : retill); 19738 } 19739 19740 /* 19741 * Checks for availbility of a usable source address (if there is one) when the 19742 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 19743 * this selection is done regardless of the destination. 
19744 */ 19745 boolean_t 19746 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 19747 { 19748 uint_t ifindex; 19749 ipif_t *ipif = NULL; 19750 ill_t *uill; 19751 boolean_t isv6; 19752 19753 ASSERT(ill != NULL); 19754 19755 isv6 = ill->ill_isv6; 19756 ifindex = ill->ill_usesrc_ifindex; 19757 if (ifindex != 0) { 19758 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 19759 NULL); 19760 if (uill == NULL) 19761 return (NULL); 19762 mutex_enter(&uill->ill_lock); 19763 for (ipif = uill->ill_ipif; ipif != NULL; 19764 ipif = ipif->ipif_next) { 19765 if (!IPIF_CAN_LOOKUP(ipif)) 19766 continue; 19767 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 19768 continue; 19769 if (!(ipif->ipif_flags & IPIF_UP)) 19770 continue; 19771 if (ipif->ipif_zoneid != zoneid) 19772 continue; 19773 if ((isv6 && 19774 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 19775 (ipif->ipif_lcl_addr == INADDR_ANY)) 19776 continue; 19777 mutex_exit(&uill->ill_lock); 19778 ill_refrele(uill); 19779 return (B_TRUE); 19780 } 19781 mutex_exit(&uill->ill_lock); 19782 ill_refrele(uill); 19783 } 19784 return (B_FALSE); 19785 } 19786 19787 /* 19788 * Determine the best source address given a destination address and an ill. 19789 * Prefers non-deprecated over deprecated but will return a deprecated 19790 * address if there is no other choice. If there is a usable source address 19791 * on the interface pointed to by ill_usesrc_ifindex then that is given 19792 * first preference. 19793 * 19794 * Returns NULL if there is no suitable source address for the ill. 19795 * This only occurs when there is no valid source address for the ill. 
19796 */ 19797 ipif_t * 19798 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 19799 { 19800 ipif_t *ipif; 19801 ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ 19802 ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; 19803 int index = 0; 19804 boolean_t wrapped = B_FALSE; 19805 boolean_t same_subnet_only = B_FALSE; 19806 boolean_t ipif_same_found, ipif_other_found; 19807 boolean_t specific_found; 19808 ill_t *till, *usill = NULL; 19809 tsol_tpc_t *src_rhtp, *dst_rhtp; 19810 19811 if (ill->ill_usesrc_ifindex != 0) { 19812 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, B_FALSE, 19813 NULL, NULL, NULL, NULL); 19814 if (usill != NULL) 19815 ill = usill; /* Select source from usesrc ILL */ 19816 else 19817 return (NULL); 19818 } 19819 19820 /* 19821 * If we're dealing with an unlabeled destination on a labeled system, 19822 * make sure that we ignore source addresses that are incompatible with 19823 * the destination's default label. That destination's default label 19824 * must dominate the minimum label on the source address. 19825 */ 19826 dst_rhtp = NULL; 19827 if (is_system_labeled()) { 19828 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 19829 if (dst_rhtp == NULL) 19830 return (NULL); 19831 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 19832 TPC_RELE(dst_rhtp); 19833 dst_rhtp = NULL; 19834 } 19835 } 19836 19837 /* 19838 * Holds the ill_g_lock as reader. This makes sure that no ipif/ill 19839 * can be deleted. But an ipif/ill can get CONDEMNED any time. 19840 * After selecting the right ipif, under ill_lock make sure ipif is 19841 * not condemned, and increment refcnt. If ipif is CONDEMNED, 19842 * we retry. Inside the loop we still need to check for CONDEMNED, 19843 * but not under a lock. 
19844 */ 19845 rw_enter(&ill_g_lock, RW_READER); 19846 19847 retry: 19848 till = ill; 19849 ipif_arr[0] = NULL; 19850 19851 if (till->ill_group != NULL) 19852 till = till->ill_group->illgrp_ill; 19853 19854 /* 19855 * Choose one good source address from each ill across the group. 19856 * If possible choose a source address in the same subnet as 19857 * the destination address. 19858 * 19859 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE 19860 * This is okay because of the following. 19861 * 19862 * If PHYI_FAILED is set and we still have non-deprecated 19863 * addresses, it means the addresses have not yet been 19864 * failed over to a different interface. We potentially 19865 * select them to create IRE_CACHES, which will be later 19866 * flushed when the addresses move over. 19867 * 19868 * If PHYI_INACTIVE is set and we still have non-deprecated 19869 * addresses, it means either the user has configured them 19870 * or PHYI_INACTIVE has not been cleared after the addresses 19871 * been moved over. For the former, in.mpathd does a failover 19872 * when the interface becomes INACTIVE and hence we should 19873 * not find them. Once INACTIVE is set, we don't allow them 19874 * to create logical interfaces anymore. For the latter, a 19875 * flush will happen when INACTIVE is cleared which will 19876 * flush the IRE_CACHES. 19877 * 19878 * If PHYI_OFFLINE is set, all the addresses will be failed 19879 * over soon. We potentially select them to create IRE_CACHEs, 19880 * which will be later flushed when the addresses move over. 19881 * 19882 * NOTE : As ipif_select_source is called to borrow source address 19883 * for an ipif that is part of a group, source address selection 19884 * will be re-done whenever the group changes i.e either an 19885 * insertion/deletion in the group. 19886 * 19887 * Fill ipif_arr[] with source addresses, using these rules: 19888 * 19889 * 1. 
At most one source address from a given ill ends up 19890 * in ipif_arr[] -- that is, at most one of the ipif's 19891 * associated with a given ill ends up in ipif_arr[]. 19892 * 19893 * 2. If there is at least one non-deprecated ipif in the 19894 * IPMP group with a source address on the same subnet as 19895 * our destination, then fill ipif_arr[] only with 19896 * source addresses on the same subnet as our destination. 19897 * Note that because of (1), only the first 19898 * non-deprecated ipif found with a source address 19899 * matching the destination ends up in ipif_arr[]. 19900 * 19901 * 3. Otherwise, fill ipif_arr[] with non-deprecated source 19902 * addresses not in the same subnet as our destination. 19903 * Again, because of (1), only the first off-subnet source 19904 * address will be chosen. 19905 * 19906 * 4. If there are no non-deprecated ipifs, then just use 19907 * the source address associated with the last deprecated 19908 * one we find that happens to be on the same subnet, 19909 * otherwise the first one not in the same subnet. 19910 */ 19911 specific_found = B_FALSE; 19912 for (; till != NULL; till = till->ill_group_next) { 19913 ipif_same_found = B_FALSE; 19914 ipif_other_found = B_FALSE; 19915 for (ipif = till->ill_ipif; ipif != NULL; 19916 ipif = ipif->ipif_next) { 19917 if (!IPIF_CAN_LOOKUP(ipif)) 19918 continue; 19919 /* Always skip NOLOCAL and ANYCAST interfaces */ 19920 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 19921 continue; 19922 if (!(ipif->ipif_flags & IPIF_UP)) 19923 continue; 19924 if (ipif->ipif_zoneid != zoneid && 19925 ipif->ipif_zoneid != ALL_ZONES) 19926 continue; 19927 /* 19928 * Interfaces with 0.0.0.0 address are allowed to be UP, 19929 * but are not valid as source addresses. 19930 */ 19931 if (ipif->ipif_lcl_addr == INADDR_ANY) 19932 continue; 19933 19934 /* 19935 * Check compatibility of local address for 19936 * destination's default label if we're on a labeled 19937 * system. 
Incompatible addresses can't be used at 19938 * all. 19939 */ 19940 if (dst_rhtp != NULL) { 19941 boolean_t incompat; 19942 19943 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 19944 IPV4_VERSION, B_FALSE); 19945 if (src_rhtp == NULL) 19946 continue; 19947 incompat = 19948 src_rhtp->tpc_tp.host_type != SUN_CIPSO || 19949 src_rhtp->tpc_tp.tp_doi != 19950 dst_rhtp->tpc_tp.tp_doi || 19951 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 19952 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 19953 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 19954 src_rhtp->tpc_tp.tp_sl_set_cipso)); 19955 TPC_RELE(src_rhtp); 19956 if (incompat) 19957 continue; 19958 } 19959 19960 /* 19961 * We prefer not to use all all-zones addresses, if we 19962 * can avoid it, as they pose problems with unlabeled 19963 * destinations. 19964 */ 19965 if (ipif->ipif_zoneid != ALL_ZONES) { 19966 if (!specific_found && 19967 (!same_subnet_only || 19968 (ipif->ipif_net_mask & dst) == 19969 ipif->ipif_subnet)) { 19970 index = 0; 19971 specific_found = B_TRUE; 19972 ipif_other_found = B_FALSE; 19973 } 19974 } else { 19975 if (specific_found) 19976 continue; 19977 } 19978 if (ipif->ipif_flags & IPIF_DEPRECATED) { 19979 if (ipif_dep == NULL || 19980 (ipif->ipif_net_mask & dst) == 19981 ipif->ipif_subnet) 19982 ipif_dep = ipif; 19983 continue; 19984 } 19985 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 19986 /* found a source address in the same subnet */ 19987 if (!same_subnet_only) { 19988 same_subnet_only = B_TRUE; 19989 index = 0; 19990 } 19991 ipif_same_found = B_TRUE; 19992 } else { 19993 if (same_subnet_only || ipif_other_found) 19994 continue; 19995 ipif_other_found = B_TRUE; 19996 } 19997 ipif_arr[index++] = ipif; 19998 if (index == MAX_IPIF_SELECT_SOURCE) { 19999 wrapped = B_TRUE; 20000 index = 0; 20001 } 20002 if (ipif_same_found) 20003 break; 20004 } 20005 } 20006 20007 if (ipif_arr[0] == NULL) { 20008 ipif = ipif_dep; 20009 } else { 20010 if (wrapped) 20011 index = MAX_IPIF_SELECT_SOURCE; 20012 ipif = 
ipif_arr[ipif_rand() % index]; 20013 ASSERT(ipif != NULL); 20014 } 20015 20016 if (ipif != NULL) { 20017 mutex_enter(&ipif->ipif_ill->ill_lock); 20018 if (!IPIF_CAN_LOOKUP(ipif)) { 20019 mutex_exit(&ipif->ipif_ill->ill_lock); 20020 goto retry; 20021 } 20022 ipif_refhold_locked(ipif); 20023 mutex_exit(&ipif->ipif_ill->ill_lock); 20024 } 20025 20026 rw_exit(&ill_g_lock); 20027 if (usill != NULL) 20028 ill_refrele(usill); 20029 if (dst_rhtp != NULL) 20030 TPC_RELE(dst_rhtp); 20031 20032 #ifdef DEBUG 20033 if (ipif == NULL) { 20034 char buf1[INET6_ADDRSTRLEN]; 20035 20036 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 20037 ill->ill_name, 20038 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 20039 } else { 20040 char buf1[INET6_ADDRSTRLEN]; 20041 char buf2[INET6_ADDRSTRLEN]; 20042 20043 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 20044 ipif->ipif_ill->ill_name, 20045 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 20046 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 20047 buf2, sizeof (buf2)))); 20048 } 20049 #endif /* DEBUG */ 20050 return (ipif); 20051 } 20052 20053 20054 /* 20055 * If old_ipif is not NULL, see if ipif was derived from old 20056 * ipif and if so, recreate the interface route by re-doing 20057 * source address selection. This happens when ipif_down -> 20058 * ipif_update_other_ipifs calls us. 20059 * 20060 * If old_ipif is NULL, just redo the source address selection 20061 * if needed. This happens when illgrp_insert or ipif_up_done 20062 * calls us. 
20063 */ 20064 static void 20065 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 20066 { 20067 ire_t *ire; 20068 ire_t *ipif_ire; 20069 queue_t *stq; 20070 ipif_t *nipif; 20071 ill_t *ill; 20072 boolean_t need_rele = B_FALSE; 20073 20074 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 20075 ASSERT(IAM_WRITER_IPIF(ipif)); 20076 20077 ill = ipif->ipif_ill; 20078 if (!(ipif->ipif_flags & 20079 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 20080 /* 20081 * Can't possibly have borrowed the source 20082 * from old_ipif. 20083 */ 20084 return; 20085 } 20086 20087 /* 20088 * Is there any work to be done? No work if the address 20089 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 20090 * ipif_select_source() does not borrow addresses from 20091 * NOLOCAL and ANYCAST interfaces). 20092 */ 20093 if ((old_ipif != NULL) && 20094 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 20095 (old_ipif->ipif_ill->ill_wq == NULL) || 20096 (old_ipif->ipif_flags & 20097 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 20098 return; 20099 } 20100 20101 /* 20102 * Perform the same checks as when creating the 20103 * IRE_INTERFACE in ipif_up_done. 20104 */ 20105 if (!(ipif->ipif_flags & IPIF_UP)) 20106 return; 20107 20108 if ((ipif->ipif_flags & IPIF_NOXMIT) || 20109 (ipif->ipif_subnet == INADDR_ANY)) 20110 return; 20111 20112 ipif_ire = ipif_to_ire(ipif); 20113 if (ipif_ire == NULL) 20114 return; 20115 20116 /* 20117 * We know that ipif uses some other source for its 20118 * IRE_INTERFACE. Is it using the source of this 20119 * old_ipif? 20120 */ 20121 if (old_ipif != NULL && 20122 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 20123 ire_refrele(ipif_ire); 20124 return; 20125 } 20126 if (ip_debug > 2) { 20127 /* ip1dbg */ 20128 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 20129 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 20130 } 20131 20132 stq = ipif_ire->ire_stq; 20133 20134 /* 20135 * Can't use our source address. 
Select a different 20136 * source address for the IRE_INTERFACE. 20137 */ 20138 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 20139 if (nipif == NULL) { 20140 /* Last resort - all ipif's have IPIF_NOLOCAL */ 20141 nipif = ipif; 20142 } else { 20143 need_rele = B_TRUE; 20144 } 20145 20146 ire = ire_create( 20147 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 20148 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 20149 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 20150 NULL, /* no gateway */ 20151 NULL, 20152 &ipif->ipif_mtu, /* max frag */ 20153 NULL, /* fast path header */ 20154 NULL, /* no recv from queue */ 20155 stq, /* send-to queue */ 20156 ill->ill_net_type, /* IF_[NO]RESOLVER */ 20157 ill->ill_resolver_mp, /* xmit header */ 20158 ipif, 20159 NULL, 20160 0, 20161 0, 20162 0, 20163 0, 20164 &ire_uinfo_null, 20165 NULL, 20166 NULL); 20167 20168 if (ire != NULL) { 20169 ire_t *ret_ire; 20170 int error; 20171 20172 /* 20173 * We don't need ipif_ire anymore. We need to delete 20174 * before we add so that ire_add does not detect 20175 * duplicates. 20176 */ 20177 ire_delete(ipif_ire); 20178 ret_ire = ire; 20179 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); 20180 ASSERT(error == 0); 20181 ASSERT(ire == ret_ire); 20182 /* Held in ire_add */ 20183 ire_refrele(ret_ire); 20184 } 20185 /* 20186 * Either we are falling through from above or could not 20187 * allocate a replacement. 20188 */ 20189 ire_refrele(ipif_ire); 20190 if (need_rele) 20191 ipif_refrele(nipif); 20192 } 20193 20194 /* 20195 * This old_ipif is going away. 20196 * 20197 * Determine if any other ipif's is using our address as 20198 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 20199 * IPIF_DEPRECATED). 20200 * Find the IRE_INTERFACE for such ipifs and recreate them 20201 * to use an different source address following the rules in 20202 * ipif_up_done. 
20203 * 20204 * This function takes an illgrp as an argument so that illgrp_delete 20205 * can call this to update source address even after deleting the 20206 * old_ipif->ipif_ill from the ill group. 20207 */ 20208 static void 20209 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) 20210 { 20211 ipif_t *ipif; 20212 ill_t *ill; 20213 char buf[INET6_ADDRSTRLEN]; 20214 20215 ASSERT(IAM_WRITER_IPIF(old_ipif)); 20216 ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); 20217 20218 ill = old_ipif->ipif_ill; 20219 20220 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", 20221 ill->ill_name, 20222 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, 20223 buf, sizeof (buf)))); 20224 /* 20225 * If this part of a group, look at all ills as ipif_select_source 20226 * borrows source address across all the ills in the group. 20227 */ 20228 if (illgrp != NULL) 20229 ill = illgrp->illgrp_ill; 20230 20231 for (; ill != NULL; ill = ill->ill_group_next) { 20232 for (ipif = ill->ill_ipif; ipif != NULL; 20233 ipif = ipif->ipif_next) { 20234 20235 if (ipif == old_ipif) 20236 continue; 20237 20238 ipif_recreate_interface_routes(old_ipif, ipif); 20239 } 20240 } 20241 } 20242 20243 /* ARGSUSED */ 20244 int 20245 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 20246 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 20247 { 20248 /* 20249 * ill_phyint_reinit merged the v4 and v6 into a single 20250 * ipsq. Could also have become part of a ipmp group in the 20251 * process, and we might not have been able to complete the 20252 * operation in ipif_set_values, if we could not become 20253 * exclusive. If so restart it here. 
20254 */ 20255 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 20256 } 20257 20258 20259 /* ARGSUSED */ 20260 int 20261 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 20262 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 20263 { 20264 queue_t *q1 = q; 20265 char *cp; 20266 char interf_name[LIFNAMSIZ]; 20267 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 20268 20269 if (!q->q_next) { 20270 ip1dbg(( 20271 "if_unitsel: IF_UNITSEL: no q_next\n")); 20272 return (EINVAL); 20273 } 20274 20275 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 20276 return (EALREADY); 20277 20278 do { 20279 q1 = q1->q_next; 20280 } while (q1->q_next); 20281 cp = q1->q_qinfo->qi_minfo->mi_idname; 20282 (void) sprintf(interf_name, "%s%d", cp, ppa); 20283 20284 /* 20285 * Here we are not going to delay the ioack until after 20286 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 20287 * original ioctl message before sending the requests. 20288 */ 20289 return (ipif_set_values(q, mp, interf_name, &ppa)); 20290 } 20291 20292 /* ARGSUSED */ 20293 int 20294 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 20295 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 20296 { 20297 return (ENXIO); 20298 } 20299 20300 /* 20301 * Net and subnet broadcast ire's are now specific to the particular 20302 * physical interface (ill) and not to any one locigal interface (ipif). 20303 * However, if a particular logical interface is being taken down, it's 20304 * associated ire's will be taken down as well. Hence, when we go to 20305 * take down or change the local address, broadcast address or netmask 20306 * of a specific logical interface, we must check to make sure that we 20307 * have valid net and subnet broadcast ire's for the other logical 20308 * interfaces which may have been shared with the logical interface 20309 * being brought down or changed. 20310 * 20311 * There is one set of 0.0.0.0 and 255.255.255.255 per ill. 
Usually it 20312 * is tied to the first interface coming UP. If that ipif is going down, 20313 * we need to recreate them on the next valid ipif. 20314 * 20315 * Note: assume that the ipif passed in is still up so that it's IRE 20316 * entries are still valid. 20317 */ 20318 static void 20319 ipif_check_bcast_ires(ipif_t *test_ipif) 20320 { 20321 ipif_t *ipif; 20322 ire_t *test_subnet_ire, *test_net_ire; 20323 ire_t *test_allzero_ire, *test_allone_ire; 20324 ire_t *ire_array[12]; 20325 ire_t **irep = &ire_array[0]; 20326 ire_t **irep1; 20327 20328 ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask; 20329 ipaddr_t test_net_addr, test_subnet_addr; 20330 ipaddr_t test_net_mask, test_subnet_mask; 20331 boolean_t need_net_bcast_ire = B_FALSE; 20332 boolean_t need_subnet_bcast_ire = B_FALSE; 20333 boolean_t allzero_bcast_ire_created = B_FALSE; 20334 boolean_t allone_bcast_ire_created = B_FALSE; 20335 boolean_t net_bcast_ire_created = B_FALSE; 20336 boolean_t subnet_bcast_ire_created = B_FALSE; 20337 20338 ipif_t *backup_ipif_net = (ipif_t *)NULL; 20339 ipif_t *backup_ipif_subnet = (ipif_t *)NULL; 20340 ipif_t *backup_ipif_allzeros = (ipif_t *)NULL; 20341 ipif_t *backup_ipif_allones = (ipif_t *)NULL; 20342 uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; 20343 20344 ASSERT(!test_ipif->ipif_isv6); 20345 ASSERT(IAM_WRITER_IPIF(test_ipif)); 20346 20347 /* 20348 * No broadcast IREs for the LOOPBACK interface 20349 * or others such as point to point and IPIF_NOXMIT. 
20350 */ 20351 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || 20352 (test_ipif->ipif_flags & IPIF_NOXMIT)) 20353 return; 20354 20355 test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST, 20356 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 20357 20358 test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST, 20359 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 20360 20361 test_net_mask = ip_net_mask(test_ipif->ipif_subnet); 20362 test_subnet_mask = test_ipif->ipif_net_mask; 20363 20364 /* 20365 * If no net mask set, assume the default based on net class. 20366 */ 20367 if (test_subnet_mask == 0) 20368 test_subnet_mask = test_net_mask; 20369 20370 /* 20371 * Check if there is a network broadcast ire associated with this ipif 20372 */ 20373 test_net_addr = test_net_mask & test_ipif->ipif_subnet; 20374 test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST, 20375 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 20376 20377 /* 20378 * Check if there is a subnet broadcast IRE associated with this ipif 20379 */ 20380 test_subnet_addr = test_subnet_mask & test_ipif->ipif_subnet; 20381 test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST, 20382 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 20383 20384 /* 20385 * No broadcast ire's associated with this ipif. 20386 */ 20387 if ((test_subnet_ire == NULL) && (test_net_ire == NULL) && 20388 (test_allzero_ire == NULL) && (test_allone_ire == NULL)) { 20389 return; 20390 } 20391 20392 /* 20393 * We have established which bcast ires have to be replaced. 20394 * Next we try to locate ipifs that match there ires. 20395 * The rules are simple: If we find an ipif that matches on the subnet 20396 * address it will also match on the net address, the allzeros and 20397 * allones address. Any ipif that matches only on the net address will 20398 * also match the allzeros and allones addresses. 
20399 * The other criterion is the ipif_flags. We look for non-deprecated 20400 * (and non-anycast and non-nolocal) ipifs as the best choice. 20401 * ipifs with check_flags matching (deprecated, etc) are used only 20402 * if good ipifs are not available. While looping, we save existing 20403 * deprecated ipifs as backup_ipif. 20404 * We loop through all the ipifs for this ill looking for ipifs 20405 * whose broadcast addr match the ipif passed in, but do not have 20406 * their own broadcast ires. For creating 0.0.0.0 and 20407 * 255.255.255.255 we just need an ipif on this ill to create. 20408 */ 20409 for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL; 20410 ipif = ipif->ipif_next) { 20411 20412 ASSERT(!ipif->ipif_isv6); 20413 /* 20414 * Already checked the ipif passed in. 20415 */ 20416 if (ipif == test_ipif) { 20417 continue; 20418 } 20419 20420 /* 20421 * We only need to recreate broadcast ires if another ipif in 20422 * the same zone uses them. The new ires must be created in the 20423 * same zone. 20424 */ 20425 if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) { 20426 continue; 20427 } 20428 20429 /* 20430 * Only interested in logical interfaces with valid local 20431 * addresses or with the ability to broadcast. 20432 */ 20433 if ((ipif->ipif_subnet == 0) || 20434 !(ipif->ipif_flags & IPIF_BROADCAST) || 20435 (ipif->ipif_flags & IPIF_NOXMIT) || 20436 !(ipif->ipif_flags & IPIF_UP)) { 20437 continue; 20438 } 20439 /* 20440 * Check if there is a net broadcast ire for this 20441 * net address. If it turns out that the ipif we are 20442 * about to take down owns this ire, we must make a 20443 * new one because it is potentially going away. 20444 */ 20445 if (test_net_ire && (!net_bcast_ire_created)) { 20446 net_mask = ip_net_mask(ipif->ipif_subnet); 20447 net_addr = net_mask & ipif->ipif_subnet; 20448 if (net_addr == test_net_addr) { 20449 need_net_bcast_ire = B_TRUE; 20450 /* 20451 * Use DEPRECATED ipif only if no good 20452 * ires are available. 
subnet_addr is 20453 * a better match than net_addr. 20454 */ 20455 if ((ipif->ipif_flags & check_flags) && 20456 (backup_ipif_net == NULL)) { 20457 backup_ipif_net = ipif; 20458 } 20459 } 20460 } 20461 /* 20462 * Check if there is a subnet broadcast ire for this 20463 * net address. If it turns out that the ipif we are 20464 * about to take down owns this ire, we must make a 20465 * new one because it is potentially going away. 20466 */ 20467 if (test_subnet_ire && (!subnet_bcast_ire_created)) { 20468 subnet_mask = ipif->ipif_net_mask; 20469 subnet_addr = ipif->ipif_subnet; 20470 if (subnet_addr == test_subnet_addr) { 20471 need_subnet_bcast_ire = B_TRUE; 20472 if ((ipif->ipif_flags & check_flags) && 20473 (backup_ipif_subnet == NULL)) { 20474 backup_ipif_subnet = ipif; 20475 } 20476 } 20477 } 20478 20479 20480 /* Short circuit here if this ipif is deprecated */ 20481 if (ipif->ipif_flags & check_flags) { 20482 if ((test_allzero_ire != NULL) && 20483 (!allzero_bcast_ire_created) && 20484 (backup_ipif_allzeros == NULL)) { 20485 backup_ipif_allzeros = ipif; 20486 } 20487 if ((test_allone_ire != NULL) && 20488 (!allone_bcast_ire_created) && 20489 (backup_ipif_allones == NULL)) { 20490 backup_ipif_allones = ipif; 20491 } 20492 continue; 20493 } 20494 20495 /* 20496 * Found an ipif which has the same broadcast ire as the 20497 * ipif passed in and the ipif passed in "owns" the ire. 20498 * Create new broadcast ire's for this broadcast addr. 
20499 */ 20500 if (need_net_bcast_ire && !net_bcast_ire_created) { 20501 irep = ire_create_bcast(ipif, net_addr, irep); 20502 irep = ire_create_bcast(ipif, 20503 ~net_mask | net_addr, irep); 20504 net_bcast_ire_created = B_TRUE; 20505 } 20506 if (need_subnet_bcast_ire && !subnet_bcast_ire_created) { 20507 irep = ire_create_bcast(ipif, subnet_addr, irep); 20508 irep = ire_create_bcast(ipif, 20509 ~subnet_mask | subnet_addr, irep); 20510 subnet_bcast_ire_created = B_TRUE; 20511 } 20512 if (test_allzero_ire != NULL && !allzero_bcast_ire_created) { 20513 irep = ire_create_bcast(ipif, 0, irep); 20514 allzero_bcast_ire_created = B_TRUE; 20515 } 20516 if (test_allone_ire != NULL && !allone_bcast_ire_created) { 20517 irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep); 20518 allone_bcast_ire_created = B_TRUE; 20519 } 20520 /* 20521 * Once we have created all the appropriate ires, we 20522 * just break out of this loop to add what we have created. 20523 * This has been indented similar to ire_match_args for 20524 * readability. 20525 */ 20526 if (((test_net_ire == NULL) || 20527 (net_bcast_ire_created)) && 20528 ((test_subnet_ire == NULL) || 20529 (subnet_bcast_ire_created)) && 20530 ((test_allzero_ire == NULL) || 20531 (allzero_bcast_ire_created)) && 20532 ((test_allone_ire == NULL) || 20533 (allone_bcast_ire_created))) { 20534 break; 20535 } 20536 } 20537 20538 /* 20539 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs 20540 * exist. 6 pairs of bcast ires are needed. 20541 * Note - the old ires are deleted in ipif_down. 
20542 */ 20543 if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) { 20544 ipif = backup_ipif_net; 20545 irep = ire_create_bcast(ipif, net_addr, irep); 20546 irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep); 20547 net_bcast_ire_created = B_TRUE; 20548 } 20549 if (need_subnet_bcast_ire && !subnet_bcast_ire_created && 20550 backup_ipif_subnet) { 20551 ipif = backup_ipif_subnet; 20552 irep = ire_create_bcast(ipif, subnet_addr, irep); 20553 irep = ire_create_bcast(ipif, 20554 ~subnet_mask | subnet_addr, irep); 20555 subnet_bcast_ire_created = B_TRUE; 20556 } 20557 if (test_allzero_ire != NULL && !allzero_bcast_ire_created && 20558 backup_ipif_allzeros) { 20559 irep = ire_create_bcast(backup_ipif_allzeros, 0, irep); 20560 allzero_bcast_ire_created = B_TRUE; 20561 } 20562 if (test_allone_ire != NULL && !allone_bcast_ire_created && 20563 backup_ipif_allones) { 20564 irep = ire_create_bcast(backup_ipif_allones, 20565 INADDR_BROADCAST, irep); 20566 allone_bcast_ire_created = B_TRUE; 20567 } 20568 20569 /* 20570 * If we can't create all of them, don't add any of them. 20571 * Code in ip_wput_ire and ire_to_ill assumes that we 20572 * always have a non-loopback copy and loopback copy 20573 * for a given address. 
20574 */ 20575 for (irep1 = irep; irep1 > ire_array; ) { 20576 irep1--; 20577 if (*irep1 == NULL) { 20578 ip0dbg(("ipif_check_bcast_ires: can't create " 20579 "IRE_BROADCAST, memory allocation failure\n")); 20580 while (irep > ire_array) { 20581 irep--; 20582 if (*irep != NULL) 20583 ire_delete(*irep); 20584 } 20585 goto bad; 20586 } 20587 } 20588 for (irep1 = irep; irep1 > ire_array; ) { 20589 int error; 20590 20591 irep1--; 20592 error = ire_add(irep1, NULL, NULL, NULL, B_FALSE); 20593 if (error == 0) { 20594 ire_refrele(*irep1); /* Held in ire_add */ 20595 } 20596 } 20597 bad: 20598 if (test_allzero_ire != NULL) 20599 ire_refrele(test_allzero_ire); 20600 if (test_allone_ire != NULL) 20601 ire_refrele(test_allone_ire); 20602 if (test_net_ire != NULL) 20603 ire_refrele(test_net_ire); 20604 if (test_subnet_ire != NULL) 20605 ire_refrele(test_subnet_ire); 20606 } 20607 20608 /* 20609 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 20610 * from lifr_flags and the name from lifr_name. 20611 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 20612 * since ipif_lookup_on_name uses the _isv6 flags when matching. 20613 * Returns EINPROGRESS when mp has been consumed by queueing it on 20614 * ill_pending_mp and the ioctl will complete in ip_rput. 20615 */ 20616 /* ARGSUSED */ 20617 int 20618 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20619 ip_ioctl_cmd_t *ipip, void *if_req) 20620 { 20621 int err; 20622 ill_t *ill; 20623 struct lifreq *lifr = (struct lifreq *)if_req; 20624 20625 ASSERT(ipif != NULL); 20626 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 20627 ASSERT(q->q_next != NULL); 20628 20629 ill = (ill_t *)q->q_ptr; 20630 /* 20631 * If we are not writer on 'q' then this interface exists already 20632 * and previous lookups (ipif_extract_lifreq_cmn) found this ipif. 
20633 * So return EALREADY 20634 */ 20635 if (ill != ipif->ipif_ill) 20636 return (EALREADY); 20637 20638 if (ill->ill_name[0] != '\0') 20639 return (EALREADY); 20640 20641 /* 20642 * Set all the flags. Allows all kinds of override. Provide some 20643 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST 20644 * unless there is either multicast/broadcast support in the driver 20645 * or it is a pt-pt link. 20646 */ 20647 if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) { 20648 /* Meaningless to IP thus don't allow them to be set. */ 20649 ip1dbg(("ip_setname: EINVAL 1\n")); 20650 return (EINVAL); 20651 } 20652 /* 20653 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the 20654 * ill_bcast_addr_length info. 20655 */ 20656 if (!ill->ill_needs_attach && 20657 ((lifr->lifr_flags & IFF_MULTICAST) && 20658 !(lifr->lifr_flags & IFF_POINTOPOINT) && 20659 ill->ill_bcast_addr_length == 0)) { 20660 /* Link not broadcast/pt-pt capable i.e. no multicast */ 20661 ip1dbg(("ip_setname: EINVAL 2\n")); 20662 return (EINVAL); 20663 } 20664 if ((lifr->lifr_flags & IFF_BROADCAST) && 20665 ((lifr->lifr_flags & IFF_IPV6) || 20666 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 20667 /* Link not broadcast capable or IPv6 i.e. no broadcast */ 20668 ip1dbg(("ip_setname: EINVAL 3\n")); 20669 return (EINVAL); 20670 } 20671 if (lifr->lifr_flags & IFF_UP) { 20672 /* Can only be set with SIOCSLIFFLAGS */ 20673 ip1dbg(("ip_setname: EINVAL 4\n")); 20674 return (EINVAL); 20675 } 20676 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 && 20677 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) { 20678 ip1dbg(("ip_setname: EINVAL 5\n")); 20679 return (EINVAL); 20680 } 20681 /* 20682 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 
20683 */ 20684 if ((lifr->lifr_flags & IFF_XRESOLV) && 20685 !(lifr->lifr_flags & IFF_IPV6) && 20686 !(ipif->ipif_isv6)) { 20687 ip1dbg(("ip_setname: EINVAL 6\n")); 20688 return (EINVAL); 20689 } 20690 20691 /* 20692 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence 20693 * we have all the flags here. So, we assign rather than we OR. 20694 * We can't OR the flags here because we don't want to set 20695 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in 20696 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending 20697 * on lifr_flags value here. 20698 */ 20699 /* 20700 * This ill has not been inserted into the global list. 20701 * So we are still single threaded and don't need any lock 20702 */ 20703 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS; 20704 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; 20705 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; 20706 20707 /* We started off as V4. */ 20708 if (ill->ill_flags & ILLF_IPV6) { 20709 ill->ill_phyint->phyint_illv6 = ill; 20710 ill->ill_phyint->phyint_illv4 = NULL; 20711 } 20712 err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa); 20713 return (err); 20714 } 20715 20716 /* ARGSUSED */ 20717 int 20718 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20719 ip_ioctl_cmd_t *ipip, void *if_req) 20720 { 20721 /* 20722 * ill_phyint_reinit merged the v4 and v6 into a single 20723 * ipsq. Could also have become part of a ipmp group in the 20724 * process, and we might not have been able to complete the 20725 * slifname in ipif_set_values, if we could not become 20726 * exclusive. If so restart it here 20727 */ 20728 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 20729 } 20730 20731 /* 20732 * Return a pointer to the ipif which matches the index, IP version type and 20733 * zoneid. 
20734 */ 20735 ipif_t * 20736 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 20737 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 20738 { 20739 ill_t *ill; 20740 ipsq_t *ipsq; 20741 phyint_t *phyi; 20742 ipif_t *ipif; 20743 20744 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 20745 (q != NULL && mp != NULL && func != NULL && err != NULL)); 20746 20747 if (err != NULL) 20748 *err = 0; 20749 20750 /* 20751 * Indexes are stored in the phyint - a common structure 20752 * to both IPv4 and IPv6. 20753 */ 20754 20755 rw_enter(&ill_g_lock, RW_READER); 20756 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 20757 (void *) &index, NULL); 20758 if (phyi != NULL) { 20759 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4; 20760 if (ill == NULL) { 20761 rw_exit(&ill_g_lock); 20762 if (err != NULL) 20763 *err = ENXIO; 20764 return (NULL); 20765 } 20766 GRAB_CONN_LOCK(q); 20767 mutex_enter(&ill->ill_lock); 20768 if (ILL_CAN_LOOKUP(ill)) { 20769 for (ipif = ill->ill_ipif; ipif != NULL; 20770 ipif = ipif->ipif_next) { 20771 if (IPIF_CAN_LOOKUP(ipif) && 20772 (zoneid == ALL_ZONES || 20773 zoneid == ipif->ipif_zoneid || 20774 ipif->ipif_zoneid == ALL_ZONES)) { 20775 ipif_refhold_locked(ipif); 20776 mutex_exit(&ill->ill_lock); 20777 RELEASE_CONN_LOCK(q); 20778 rw_exit(&ill_g_lock); 20779 return (ipif); 20780 } 20781 } 20782 } else if (ILL_CAN_WAIT(ill, q)) { 20783 ipsq = ill->ill_phyint->phyint_ipsq; 20784 mutex_enter(&ipsq->ipsq_lock); 20785 rw_exit(&ill_g_lock); 20786 mutex_exit(&ill->ill_lock); 20787 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 20788 mutex_exit(&ipsq->ipsq_lock); 20789 RELEASE_CONN_LOCK(q); 20790 *err = EINPROGRESS; 20791 return (NULL); 20792 } 20793 mutex_exit(&ill->ill_lock); 20794 RELEASE_CONN_LOCK(q); 20795 } 20796 rw_exit(&ill_g_lock); 20797 if (err != NULL) 20798 *err = ENXIO; 20799 return (NULL); 20800 } 20801 20802 typedef struct conn_change_s { 20803 uint_t cc_old_ifindex; 20804 uint_t cc_new_ifindex; 
20805 } conn_change_t; 20806 20807 /* 20808 * ipcl_walk function for changing interface index. 20809 */ 20810 static void 20811 conn_change_ifindex(conn_t *connp, caddr_t arg) 20812 { 20813 conn_change_t *connc; 20814 uint_t old_ifindex; 20815 uint_t new_ifindex; 20816 int i; 20817 ilg_t *ilg; 20818 20819 connc = (conn_change_t *)arg; 20820 old_ifindex = connc->cc_old_ifindex; 20821 new_ifindex = connc->cc_new_ifindex; 20822 20823 if (connp->conn_orig_bound_ifindex == old_ifindex) 20824 connp->conn_orig_bound_ifindex = new_ifindex; 20825 20826 if (connp->conn_orig_multicast_ifindex == old_ifindex) 20827 connp->conn_orig_multicast_ifindex = new_ifindex; 20828 20829 if (connp->conn_orig_xmit_ifindex == old_ifindex) 20830 connp->conn_orig_xmit_ifindex = new_ifindex; 20831 20832 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 20833 ilg = &connp->conn_ilg[i]; 20834 if (ilg->ilg_orig_ifindex == old_ifindex) 20835 ilg->ilg_orig_ifindex = new_ifindex; 20836 } 20837 } 20838 20839 /* 20840 * Walk all the ipifs and ilms on this ill and change the orig_ifindex 20841 * to new_index if it matches the old_index. 20842 * 20843 * Failovers typically happen within a group of ills. But somebody 20844 * can remove an ill from the group after a failover happened. If 20845 * we are setting the ifindex after this, we potentially need to 20846 * look at all the ills rather than just the ones in the group. 20847 * We cut down the work by looking at matching ill_net_types 20848 * and ill_types as we could not possibly grouped them together. 
20849 */ 20850 static void 20851 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 20852 { 20853 ill_t *ill; 20854 ipif_t *ipif; 20855 uint_t old_ifindex; 20856 uint_t new_ifindex; 20857 ilm_t *ilm; 20858 ill_walk_context_t ctx; 20859 20860 old_ifindex = connc->cc_old_ifindex; 20861 new_ifindex = connc->cc_new_ifindex; 20862 20863 rw_enter(&ill_g_lock, RW_READER); 20864 ill = ILL_START_WALK_ALL(&ctx); 20865 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 20866 if ((ill_orig->ill_net_type != ill->ill_net_type) || 20867 (ill_orig->ill_type != ill->ill_type)) { 20868 continue; 20869 } 20870 for (ipif = ill->ill_ipif; ipif != NULL; 20871 ipif = ipif->ipif_next) { 20872 if (ipif->ipif_orig_ifindex == old_ifindex) 20873 ipif->ipif_orig_ifindex = new_ifindex; 20874 } 20875 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 20876 if (ilm->ilm_orig_ifindex == old_ifindex) 20877 ilm->ilm_orig_ifindex = new_ifindex; 20878 } 20879 } 20880 rw_exit(&ill_g_lock); 20881 } 20882 20883 /* 20884 * We first need to ensure that the new index is unique, and 20885 * then carry the change across both v4 and v6 ill representation 20886 * of the physical interface. 20887 */ 20888 /* ARGSUSED */ 20889 int 20890 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20891 ip_ioctl_cmd_t *ipip, void *ifreq) 20892 { 20893 ill_t *ill; 20894 ill_t *ill_other; 20895 phyint_t *phyi; 20896 int old_index; 20897 conn_change_t connc; 20898 struct ifreq *ifr = (struct ifreq *)ifreq; 20899 struct lifreq *lifr = (struct lifreq *)ifreq; 20900 uint_t index; 20901 ill_t *ill_v4; 20902 ill_t *ill_v6; 20903 20904 if (ipip->ipi_cmd_type == IF_CMD) 20905 index = ifr->ifr_index; 20906 else 20907 index = lifr->lifr_index; 20908 20909 /* 20910 * Only allow on physical interface. Also, index zero is illegal. 
20911 * 20912 * Need to check for PHYI_FAILED and PHYI_INACTIVE 20913 * 20914 * 1) If PHYI_FAILED is set, a failover could have happened which 20915 * implies a possible failback might have to happen. As failback 20916 * depends on the old index, we should fail setting the index. 20917 * 20918 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 20919 * any addresses or multicast memberships are failed over to 20920 * a non-STANDBY interface. As failback depends on the old 20921 * index, we should fail setting the index for this case also. 20922 * 20923 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 20924 * Be consistent with PHYI_FAILED and fail the ioctl. 20925 */ 20926 ill = ipif->ipif_ill; 20927 phyi = ill->ill_phyint; 20928 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 20929 ipif->ipif_id != 0 || index == 0) { 20930 return (EINVAL); 20931 } 20932 old_index = phyi->phyint_ifindex; 20933 20934 /* If the index is not changing, no work to do */ 20935 if (old_index == index) 20936 return (0); 20937 20938 /* 20939 * Use ill_lookup_on_ifindex to determine if the 20940 * new index is unused and if so allow the change. 20941 */ 20942 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL); 20943 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL); 20944 if (ill_v6 != NULL || ill_v4 != NULL) { 20945 if (ill_v4 != NULL) 20946 ill_refrele(ill_v4); 20947 if (ill_v6 != NULL) 20948 ill_refrele(ill_v6); 20949 return (EBUSY); 20950 } 20951 20952 /* 20953 * The new index is unused. Set it in the phyint. 20954 * Locate the other ill so that we can send a routing 20955 * sockets message. 
20956 */ 20957 if (ill->ill_isv6) { 20958 ill_other = phyi->phyint_illv4; 20959 } else { 20960 ill_other = phyi->phyint_illv6; 20961 } 20962 20963 phyi->phyint_ifindex = index; 20964 20965 connc.cc_old_ifindex = old_index; 20966 connc.cc_new_ifindex = index; 20967 ip_change_ifindex(ill, &connc); 20968 ipcl_walk(conn_change_ifindex, (caddr_t)&connc); 20969 20970 /* Send the routing sockets message */ 20971 ip_rts_ifmsg(ipif); 20972 if (ill_other != NULL) 20973 ip_rts_ifmsg(ill_other->ill_ipif); 20974 20975 return (0); 20976 } 20977 20978 /* ARGSUSED */ 20979 int 20980 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20981 ip_ioctl_cmd_t *ipip, void *ifreq) 20982 { 20983 struct ifreq *ifr = (struct ifreq *)ifreq; 20984 struct lifreq *lifr = (struct lifreq *)ifreq; 20985 20986 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 20987 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 20988 /* Get the interface index */ 20989 if (ipip->ipi_cmd_type == IF_CMD) { 20990 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 20991 } else { 20992 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 20993 } 20994 return (0); 20995 } 20996 20997 /* ARGSUSED */ 20998 int 20999 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21000 ip_ioctl_cmd_t *ipip, void *ifreq) 21001 { 21002 struct lifreq *lifr = (struct lifreq *)ifreq; 21003 21004 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 21005 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21006 /* Get the interface zone */ 21007 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21008 lifr->lifr_zoneid = ipif->ipif_zoneid; 21009 return (0); 21010 } 21011 21012 /* 21013 * Set the zoneid of an interface. 
21014 */ 21015 /* ARGSUSED */ 21016 int 21017 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21018 ip_ioctl_cmd_t *ipip, void *ifreq) 21019 { 21020 struct lifreq *lifr = (struct lifreq *)ifreq; 21021 int err = 0; 21022 boolean_t need_up = B_FALSE; 21023 zone_t *zptr; 21024 zone_status_t status; 21025 zoneid_t zoneid; 21026 21027 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21028 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 21029 if (!is_system_labeled()) 21030 return (ENOTSUP); 21031 zoneid = GLOBAL_ZONEID; 21032 } 21033 21034 /* cannot assign instance zero to a non-global zone */ 21035 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 21036 return (ENOTSUP); 21037 21038 /* 21039 * Cannot assign to a zone that doesn't exist or is shutting down. In 21040 * the event of a race with the zone shutdown processing, since IP 21041 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 21042 * interface will be cleaned up even if the zone is shut down 21043 * immediately after the status check. If the interface can't be brought 21044 * down right away, and the zone is shut down before the restart 21045 * function is called, we resolve the possible races by rechecking the 21046 * zone status in the restart function. 21047 */ 21048 if ((zptr = zone_find_by_id(zoneid)) == NULL) 21049 return (EINVAL); 21050 status = zone_status_get(zptr); 21051 zone_rele(zptr); 21052 21053 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 21054 return (EINVAL); 21055 21056 if (ipif->ipif_flags & IPIF_UP) { 21057 /* 21058 * If the interface is already marked up, 21059 * we call ipif_down which will take care 21060 * of ditching any IREs that have been set 21061 * up based on the old interface address. 
21062 */ 21063 err = ipif_logical_down(ipif, q, mp); 21064 if (err == EINPROGRESS) 21065 return (err); 21066 ipif_down_tail(ipif); 21067 need_up = B_TRUE; 21068 } 21069 21070 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 21071 return (err); 21072 } 21073 21074 static int 21075 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 21076 queue_t *q, mblk_t *mp, boolean_t need_up) 21077 { 21078 int err = 0; 21079 21080 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 21081 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21082 21083 /* Set the new zone id. */ 21084 ipif->ipif_zoneid = zoneid; 21085 21086 /* Update sctp list */ 21087 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 21088 21089 if (need_up) { 21090 /* 21091 * Now bring the interface back up. If this 21092 * is the only IPIF for the ILL, ipif_up 21093 * will have to re-bind to the device, so 21094 * we may get back EINPROGRESS, in which 21095 * case, this IOCTL will get completed in 21096 * ip_rput_dlpi when we see the DL_BIND_ACK. 
21097 */ 21098 err = ipif_up(ipif, q, mp); 21099 } 21100 return (err); 21101 } 21102 21103 /* ARGSUSED */ 21104 int 21105 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21106 ip_ioctl_cmd_t *ipip, void *if_req) 21107 { 21108 struct lifreq *lifr = (struct lifreq *)if_req; 21109 zoneid_t zoneid; 21110 zone_t *zptr; 21111 zone_status_t status; 21112 21113 ASSERT(ipif->ipif_id != 0); 21114 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21115 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 21116 zoneid = GLOBAL_ZONEID; 21117 21118 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 21119 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21120 21121 /* 21122 * We recheck the zone status to resolve the following race condition: 21123 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 21124 * 2) hme0:1 is up and can't be brought down right away; 21125 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 21126 * 3) zone "myzone" is halted; the zone status switches to 21127 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 21128 * the interfaces to remove - hme0:1 is not returned because it's not 21129 * yet in "myzone", so it won't be removed; 21130 * 4) the restart function for SIOCSLIFZONE is called; without the 21131 * status check here, we would have hme0:1 in "myzone" after it's been 21132 * destroyed. 21133 * Note that if the status check fails, we need to bring the interface 21134 * back to its state prior to ip_sioctl_slifzone(), hence the call to 21135 * ipif_up_done[_v6](). 
21136 */ 21137 status = ZONE_IS_UNINITIALIZED; 21138 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 21139 status = zone_status_get(zptr); 21140 zone_rele(zptr); 21141 } 21142 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 21143 if (ipif->ipif_isv6) { 21144 (void) ipif_up_done_v6(ipif); 21145 } else { 21146 (void) ipif_up_done(ipif); 21147 } 21148 return (EINVAL); 21149 } 21150 21151 ipif_down_tail(ipif); 21152 21153 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 21154 B_TRUE)); 21155 } 21156 21157 /* ARGSUSED */ 21158 int 21159 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21160 ip_ioctl_cmd_t *ipip, void *ifreq) 21161 { 21162 struct lifreq *lifr = ifreq; 21163 21164 ASSERT(q->q_next == NULL); 21165 ASSERT(CONN_Q(q)); 21166 21167 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 21168 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21169 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 21170 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 21171 21172 return (0); 21173 } 21174 21175 21176 /* Find the previous ILL in this usesrc group */ 21177 static ill_t * 21178 ill_prev_usesrc(ill_t *uill) 21179 { 21180 ill_t *ill; 21181 21182 for (ill = uill->ill_usesrc_grp_next; 21183 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 21184 ill = ill->ill_usesrc_grp_next) 21185 /* do nothing */; 21186 return (ill); 21187 } 21188 21189 /* 21190 * Release all members of the usesrc group. This routine is called 21191 * from ill_delete when the interface being unplumbed is the 21192 * group head. 
21193 */ 21194 static void 21195 ill_disband_usesrc_group(ill_t *uill) 21196 { 21197 ill_t *next_ill, *tmp_ill; 21198 ASSERT(RW_WRITE_HELD(&ill_g_usesrc_lock)); 21199 next_ill = uill->ill_usesrc_grp_next; 21200 21201 do { 21202 ASSERT(next_ill != NULL); 21203 tmp_ill = next_ill->ill_usesrc_grp_next; 21204 ASSERT(tmp_ill != NULL); 21205 next_ill->ill_usesrc_grp_next = NULL; 21206 next_ill->ill_usesrc_ifindex = 0; 21207 next_ill = tmp_ill; 21208 } while (next_ill->ill_usesrc_ifindex != 0); 21209 uill->ill_usesrc_grp_next = NULL; 21210 } 21211 21212 /* 21213 * Remove the client usesrc ILL from the list and relink to a new list 21214 */ 21215 int 21216 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 21217 { 21218 ill_t *ill, *tmp_ill; 21219 21220 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 21221 (uill != NULL) && RW_WRITE_HELD(&ill_g_usesrc_lock)); 21222 21223 /* 21224 * Check if the usesrc client ILL passed in is not already 21225 * in use as a usesrc ILL i.e one whose source address is 21226 * in use OR a usesrc ILL is not already in use as a usesrc 21227 * client ILL 21228 */ 21229 if ((ucill->ill_usesrc_ifindex == 0) || 21230 (uill->ill_usesrc_ifindex != 0)) { 21231 return (-1); 21232 } 21233 21234 ill = ill_prev_usesrc(ucill); 21235 ASSERT(ill->ill_usesrc_grp_next != NULL); 21236 21237 /* Remove from the current list */ 21238 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 21239 /* Only two elements in the list */ 21240 ASSERT(ill->ill_usesrc_ifindex == 0); 21241 ill->ill_usesrc_grp_next = NULL; 21242 } else { 21243 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 21244 } 21245 21246 if (ifindex == 0) { 21247 ucill->ill_usesrc_ifindex = 0; 21248 ucill->ill_usesrc_grp_next = NULL; 21249 return (0); 21250 } 21251 21252 ucill->ill_usesrc_ifindex = ifindex; 21253 tmp_ill = uill->ill_usesrc_grp_next; 21254 uill->ill_usesrc_grp_next = ucill; 21255 ucill->ill_usesrc_grp_next = 21256 (tmp_ill != NULL) ? 
tmp_ill : uill; 21257 return (0); 21258 } 21259 21260 /* 21261 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in 21262 * ip.c for locking details. 21263 */ 21264 /* ARGSUSED */ 21265 int 21266 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21267 ip_ioctl_cmd_t *ipip, void *ifreq) 21268 { 21269 struct lifreq *lifr = (struct lifreq *)ifreq; 21270 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, 21271 ill_flag_changed = B_FALSE; 21272 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 21273 int err = 0, ret; 21274 uint_t ifindex; 21275 phyint_t *us_phyint, *us_cli_phyint; 21276 ipsq_t *ipsq = NULL; 21277 21278 ASSERT(IAM_WRITER_IPIF(ipif)); 21279 ASSERT(q->q_next == NULL); 21280 ASSERT(CONN_Q(q)); 21281 21282 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 21283 us_cli_phyint = usesrc_cli_ill->ill_phyint; 21284 21285 ASSERT(us_cli_phyint != NULL); 21286 21287 /* 21288 * If the client ILL is being used for IPMP, abort. 21289 * Note, this can be done before ipsq_try_enter since we are already 21290 * exclusive on this ILL 21291 */ 21292 if ((us_cli_phyint->phyint_groupname != NULL) || 21293 (us_cli_phyint->phyint_flags & PHYI_STANDBY)) { 21294 return (EINVAL); 21295 } 21296 21297 ifindex = lifr->lifr_index; 21298 if (ifindex == 0) { 21299 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 21300 /* non usesrc group interface, nothing to reset */ 21301 return (0); 21302 } 21303 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 21304 /* valid reset request */ 21305 reset_flg = B_TRUE; 21306 } 21307 21308 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, 21309 ip_process_ioctl, &err); 21310 21311 if (usesrc_ill == NULL) { 21312 return (err); 21313 } 21314 21315 /* 21316 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP 21317 * group nor can either of the interfaces be used for standy. 
So 21318 * to guarantee mutual exclusion with ip_sioctl_flags (which sets 21319 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname) 21320 * we need to be exclusive on the ipsq belonging to the usesrc_ill. 21321 * We are already exlusive on this ipsq i.e ipsq corresponding to 21322 * the usesrc_cli_ill 21323 */ 21324 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 21325 NEW_OP, B_TRUE); 21326 if (ipsq == NULL) { 21327 err = EINPROGRESS; 21328 /* Operation enqueued on the ipsq of the usesrc ILL */ 21329 goto done; 21330 } 21331 21332 /* Check if the usesrc_ill is used for IPMP */ 21333 us_phyint = usesrc_ill->ill_phyint; 21334 if ((us_phyint->phyint_groupname != NULL) || 21335 (us_phyint->phyint_flags & PHYI_STANDBY)) { 21336 err = EINVAL; 21337 goto done; 21338 } 21339 21340 /* 21341 * If the client is already in use as a usesrc_ill or a usesrc_ill is 21342 * already a client then return EINVAL 21343 */ 21344 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 21345 err = EINVAL; 21346 goto done; 21347 } 21348 21349 /* 21350 * If the ill_usesrc_ifindex field is already set to what it needs to 21351 * be then this is a duplicate operation. 21352 */ 21353 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 21354 err = 0; 21355 goto done; 21356 } 21357 21358 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 21359 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 21360 usesrc_ill->ill_isv6)); 21361 21362 /* 21363 * The next step ensures that no new ires will be created referencing 21364 * the client ill, until the ILL_CHANGING flag is cleared. Then 21365 * we go through an ire walk deleting all ire caches that reference 21366 * the client ill. New ires referencing the client ill that are added 21367 * to the ire table before the ILL_CHANGING flag is set, will be 21368 * cleaned up by the ire walk below. 
Attempt to add new ires referencing 21369 * the client ill while the ILL_CHANGING flag is set will be failed 21370 * during the ire_add in ire_atomic_start. ire_atomic_start atomically 21371 * checks (under the ill_g_usesrc_lock) that the ire being added 21372 * is not stale, i.e the ire_stq and ire_ipif are consistent and 21373 * belong to the same usesrc group. 21374 */ 21375 mutex_enter(&usesrc_cli_ill->ill_lock); 21376 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; 21377 mutex_exit(&usesrc_cli_ill->ill_lock); 21378 ill_flag_changed = B_TRUE; 21379 21380 if (ipif->ipif_isv6) 21381 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 21382 ALL_ZONES); 21383 else 21384 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 21385 ALL_ZONES); 21386 21387 /* 21388 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 21389 * and the ill_usesrc_ifindex fields 21390 */ 21391 rw_enter(&ill_g_usesrc_lock, RW_WRITER); 21392 21393 if (reset_flg) { 21394 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 21395 if (ret != 0) { 21396 err = EINVAL; 21397 } 21398 rw_exit(&ill_g_usesrc_lock); 21399 goto done; 21400 } 21401 21402 /* 21403 * Four possibilities to consider: 21404 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 21405 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 21406 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 21407 * 4. 
Both are part of their respective usesrc groups 21408 */ 21409 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 21410 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 21411 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 21412 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 21413 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 21414 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 21415 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 21416 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 21417 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 21418 /* Insert at head of list */ 21419 usesrc_cli_ill->ill_usesrc_grp_next = 21420 usesrc_ill->ill_usesrc_grp_next; 21421 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 21422 } else { 21423 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 21424 ifindex); 21425 if (ret != 0) 21426 err = EINVAL; 21427 } 21428 rw_exit(&ill_g_usesrc_lock); 21429 21430 done: 21431 if (ill_flag_changed) { 21432 mutex_enter(&usesrc_cli_ill->ill_lock); 21433 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; 21434 mutex_exit(&usesrc_cli_ill->ill_lock); 21435 } 21436 if (ipsq != NULL) 21437 ipsq_exit(ipsq, B_TRUE, B_TRUE); 21438 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 21439 ill_refrele(usesrc_ill); 21440 return (err); 21441 } 21442 21443 /* 21444 * comparison function used by avl. 21445 */ 21446 static int 21447 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 21448 { 21449 21450 uint_t index; 21451 21452 ASSERT(phyip != NULL && index_ptr != NULL); 21453 21454 index = *((uint_t *)index_ptr); 21455 /* 21456 * let the phyint with the lowest index be on top. 21457 */ 21458 if (((phyint_t *)phyip)->phyint_ifindex < index) 21459 return (1); 21460 if (((phyint_t *)phyip)->phyint_ifindex > index) 21461 return (-1); 21462 return (0); 21463 } 21464 21465 /* 21466 * comparison function used by avl. 
21467 */ 21468 static int 21469 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 21470 { 21471 ill_t *ill; 21472 int res = 0; 21473 21474 ASSERT(phyip != NULL && name_ptr != NULL); 21475 21476 if (((phyint_t *)phyip)->phyint_illv4) 21477 ill = ((phyint_t *)phyip)->phyint_illv4; 21478 else 21479 ill = ((phyint_t *)phyip)->phyint_illv6; 21480 ASSERT(ill != NULL); 21481 21482 res = strcmp(ill->ill_name, (char *)name_ptr); 21483 if (res > 0) 21484 return (1); 21485 else if (res < 0) 21486 return (-1); 21487 return (0); 21488 } 21489 /* 21490 * This function is called from ill_delete when the ill is being 21491 * unplumbed. We remove the reference from the phyint and we also 21492 * free the phyint when there are no more references to it. 21493 */ 21494 static void 21495 ill_phyint_free(ill_t *ill) 21496 { 21497 phyint_t *phyi; 21498 phyint_t *next_phyint; 21499 ipsq_t *cur_ipsq; 21500 21501 ASSERT(ill->ill_phyint != NULL); 21502 21503 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 21504 phyi = ill->ill_phyint; 21505 ill->ill_phyint = NULL; 21506 /* 21507 * ill_init allocates a phyint always to store the copy 21508 * of flags relevant to phyint. At that point in time, we could 21509 * not assign the name and hence phyint_illv4/v6 could not be 21510 * initialized. Later in ipif_set_values, we assign the name to 21511 * the ill, at which point in time we assign phyint_illv4/v6. 21512 * Thus we don't rely on phyint_illv6 to be initialized always. 21513 */ 21514 if (ill->ill_flags & ILLF_IPV6) { 21515 phyi->phyint_illv6 = NULL; 21516 } else { 21517 phyi->phyint_illv4 = NULL; 21518 } 21519 /* 21520 * ipif_down removes it from the group when the last ipif goes 21521 * down. 21522 */ 21523 ASSERT(ill->ill_group == NULL); 21524 21525 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 21526 return; 21527 21528 /* 21529 * Make sure this phyint was put in the list. 
21530 */ 21531 if (phyi->phyint_ifindex > 0) { 21532 avl_remove(&phyint_g_list.phyint_list_avl_by_index, 21533 phyi); 21534 avl_remove(&phyint_g_list.phyint_list_avl_by_name, 21535 phyi); 21536 } 21537 /* 21538 * remove phyint from the ipsq list. 21539 */ 21540 cur_ipsq = phyi->phyint_ipsq; 21541 if (phyi == cur_ipsq->ipsq_phyint_list) { 21542 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 21543 } else { 21544 next_phyint = cur_ipsq->ipsq_phyint_list; 21545 while (next_phyint != NULL) { 21546 if (next_phyint->phyint_ipsq_next == phyi) { 21547 next_phyint->phyint_ipsq_next = 21548 phyi->phyint_ipsq_next; 21549 break; 21550 } 21551 next_phyint = next_phyint->phyint_ipsq_next; 21552 } 21553 ASSERT(next_phyint != NULL); 21554 } 21555 IPSQ_DEC_REF(cur_ipsq); 21556 21557 if (phyi->phyint_groupname_len != 0) { 21558 ASSERT(phyi->phyint_groupname != NULL); 21559 mi_free(phyi->phyint_groupname); 21560 } 21561 mi_free(phyi); 21562 } 21563 21564 /* 21565 * Attach the ill to the phyint structure which can be shared by both 21566 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 21567 * function is called from ipif_set_values and ill_lookup_on_name (for 21568 * loopback) where we know the name of the ill. We lookup the ill and if 21569 * there is one present already with the name use that phyint. Otherwise 21570 * reuse the one allocated by ill_init. 
21571 */ 21572 static void 21573 ill_phyint_reinit(ill_t *ill) 21574 { 21575 boolean_t isv6 = ill->ill_isv6; 21576 phyint_t *phyi_old; 21577 phyint_t *phyi; 21578 avl_index_t where = 0; 21579 ill_t *ill_other = NULL; 21580 ipsq_t *ipsq; 21581 21582 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 21583 21584 phyi_old = ill->ill_phyint; 21585 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 21586 phyi_old->phyint_illv6 == NULL)); 21587 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 21588 phyi_old->phyint_illv4 == NULL)); 21589 ASSERT(phyi_old->phyint_ifindex == 0); 21590 21591 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name, 21592 ill->ill_name, &where); 21593 21594 /* 21595 * 1. We grabbed the ill_g_lock before inserting this ill into 21596 * the global list of ills. So no other thread could have located 21597 * this ill and hence the ipsq of this ill is guaranteed to be empty. 21598 * 2. Now locate the other protocol instance of this ill. 21599 * 3. Now grab both ill locks in the right order, and the phyint lock of 21600 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 21601 * of neither ill can change. 21602 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 21603 * other ill. 21604 * 5. Release all locks. 21605 */ 21606 21607 /* 21608 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 21609 * we are initializing IPv4. 21610 */ 21611 if (phyi != NULL) { 21612 ill_other = (isv6) ? phyi->phyint_illv4 : 21613 phyi->phyint_illv6; 21614 ASSERT(ill_other->ill_phyint != NULL); 21615 ASSERT((isv6 && !ill_other->ill_isv6) || 21616 (!isv6 && ill_other->ill_isv6)); 21617 GRAB_ILL_LOCKS(ill, ill_other); 21618 /* 21619 * We are potentially throwing away phyint_flags which 21620 * could be different from the one that we obtain from 21621 * ill_other->ill_phyint. But it is okay as we are assuming 21622 * that the state maintained within IP is correct. 
21623 */ 21624 mutex_enter(&phyi->phyint_lock); 21625 if (isv6) { 21626 ASSERT(phyi->phyint_illv6 == NULL); 21627 phyi->phyint_illv6 = ill; 21628 } else { 21629 ASSERT(phyi->phyint_illv4 == NULL); 21630 phyi->phyint_illv4 = ill; 21631 } 21632 /* 21633 * This is a new ill, currently undergoing SLIFNAME 21634 * So we could not have joined an IPMP group until now. 21635 */ 21636 ASSERT(phyi_old->phyint_ipsq_next == NULL && 21637 phyi_old->phyint_groupname == NULL); 21638 21639 /* 21640 * This phyi_old is going away. Decref ipsq_refs and 21641 * assert it is zero. The ipsq itself will be freed in 21642 * ipsq_exit 21643 */ 21644 ipsq = phyi_old->phyint_ipsq; 21645 IPSQ_DEC_REF(ipsq); 21646 ASSERT(ipsq->ipsq_refs == 0); 21647 /* Get the singleton phyint out of the ipsq list */ 21648 ASSERT(phyi_old->phyint_ipsq_next == NULL); 21649 ipsq->ipsq_phyint_list = NULL; 21650 phyi_old->phyint_illv4 = NULL; 21651 phyi_old->phyint_illv6 = NULL; 21652 mi_free(phyi_old); 21653 } else { 21654 mutex_enter(&ill->ill_lock); 21655 /* 21656 * We don't need to acquire any lock, since 21657 * the ill is not yet visible globally and we 21658 * have not yet released the ill_g_lock. 21659 */ 21660 phyi = phyi_old; 21661 mutex_enter(&phyi->phyint_lock); 21662 /* XXX We need a recovery strategy here. */ 21663 if (!phyint_assign_ifindex(phyi)) 21664 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 21665 21666 avl_insert(&phyint_g_list.phyint_list_avl_by_name, 21667 (void *)phyi, where); 21668 21669 (void) avl_find(&phyint_g_list.phyint_list_avl_by_index, 21670 &phyi->phyint_ifindex, &where); 21671 avl_insert(&phyint_g_list.phyint_list_avl_by_index, 21672 (void *)phyi, where); 21673 } 21674 21675 /* 21676 * Reassigning ill_phyint automatically reassigns the ipsq also. 21677 * pending mp is not affected because that is per ill basis. 21678 */ 21679 ill->ill_phyint = phyi; 21680 21681 /* 21682 * Keep the index on ipif_orig_index to be used by FAILOVER. 
21683 * We do this here as when the first ipif was allocated, 21684 * ipif_allocate does not know the right interface index. 21685 */ 21686 21687 ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; 21688 /* 21689 * Now that the phyint's ifindex has been assigned, complete the 21690 * remaining 21691 */ 21692 if (ill->ill_isv6) { 21693 ill->ill_ip6_mib->ipv6IfIndex = 21694 ill->ill_phyint->phyint_ifindex; 21695 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 21696 ill->ill_phyint->phyint_ifindex; 21697 } 21698 21699 RELEASE_ILL_LOCKS(ill, ill_other); 21700 mutex_exit(&phyi->phyint_lock); 21701 } 21702 21703 /* 21704 * Notify any downstream modules of the name of this interface. 21705 * An M_IOCTL is used even though we don't expect a successful reply. 21706 * Any reply message from the driver (presumably an M_IOCNAK) will 21707 * eventually get discarded somewhere upstream. The message format is 21708 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 21709 * to IP. 
21710 */ 21711 static void 21712 ip_ifname_notify(ill_t *ill, queue_t *q) 21713 { 21714 mblk_t *mp1, *mp2; 21715 struct iocblk *iocp; 21716 struct lifreq *lifr; 21717 21718 mp1 = mkiocb(SIOCSLIFNAME); 21719 if (mp1 == NULL) 21720 return; 21721 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 21722 if (mp2 == NULL) { 21723 freeb(mp1); 21724 return; 21725 } 21726 21727 mp1->b_cont = mp2; 21728 iocp = (struct iocblk *)mp1->b_rptr; 21729 iocp->ioc_count = sizeof (struct lifreq); 21730 21731 lifr = (struct lifreq *)mp2->b_rptr; 21732 mp2->b_wptr += sizeof (struct lifreq); 21733 bzero(lifr, sizeof (struct lifreq)); 21734 21735 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 21736 lifr->lifr_ppa = ill->ill_ppa; 21737 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 21738 21739 putnext(q, mp1); 21740 } 21741 21742 static boolean_t ip_trash_timer_started = B_FALSE; 21743 21744 static int 21745 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 21746 { 21747 int err; 21748 21749 /* Set the obsolete NDD per-interface forwarding name. */ 21750 err = ill_set_ndd_name(ill); 21751 if (err != 0) { 21752 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 21753 err); 21754 } 21755 21756 /* Tell downstream modules where they are. */ 21757 ip_ifname_notify(ill, q); 21758 21759 /* 21760 * ill_dl_phys returns EINPROGRESS in the usual case. 21761 * Error cases are ENOMEM ... 21762 */ 21763 err = ill_dl_phys(ill, ipif, mp, q); 21764 21765 /* 21766 * If there is no IRE expiration timer running, get one started. 21767 * igmp and mld timers will be triggered by the first multicast 21768 */ 21769 if (!ip_trash_timer_started) { 21770 /* 21771 * acquire the lock and check again. 
21772 */ 21773 mutex_enter(&ip_trash_timer_lock); 21774 if (!ip_trash_timer_started) { 21775 ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL, 21776 MSEC_TO_TICK(ip_timer_interval)); 21777 ip_trash_timer_started = B_TRUE; 21778 } 21779 mutex_exit(&ip_trash_timer_lock); 21780 } 21781 21782 if (ill->ill_isv6) { 21783 mutex_enter(&mld_slowtimeout_lock); 21784 if (mld_slowtimeout_id == 0) { 21785 mld_slowtimeout_id = timeout(mld_slowtimo, NULL, 21786 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 21787 } 21788 mutex_exit(&mld_slowtimeout_lock); 21789 } else { 21790 mutex_enter(&igmp_slowtimeout_lock); 21791 if (igmp_slowtimeout_id == 0) { 21792 igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL, 21793 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 21794 } 21795 mutex_exit(&igmp_slowtimeout_lock); 21796 } 21797 21798 return (err); 21799 } 21800 21801 /* 21802 * Common routine for ppa and ifname setting. Should be called exclusive. 21803 * 21804 * Returns EINPROGRESS when mp has been consumed by queueing it on 21805 * ill_pending_mp and the ioctl will complete in ip_rput. 21806 * 21807 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 21808 * the new name and new ppa in lifr_name and lifr_ppa respectively. 21809 * For SLIFNAME, we pass these values back to the userland. 
21810 */ 21811 static int 21812 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 21813 { 21814 ill_t *ill; 21815 ipif_t *ipif; 21816 ipsq_t *ipsq; 21817 char *ppa_ptr; 21818 char *old_ptr; 21819 char old_char; 21820 int error; 21821 21822 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 21823 ASSERT(q->q_next != NULL); 21824 ASSERT(interf_name != NULL); 21825 21826 ill = (ill_t *)q->q_ptr; 21827 21828 ASSERT(ill->ill_name[0] == '\0'); 21829 ASSERT(IAM_WRITER_ILL(ill)); 21830 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 21831 ASSERT(ill->ill_ppa == UINT_MAX); 21832 21833 /* The ppa is sent down by ifconfig or is chosen */ 21834 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 21835 return (EINVAL); 21836 } 21837 21838 /* 21839 * make sure ppa passed in is same as ppa in the name. 21840 * This check is not made when ppa == UINT_MAX in that case ppa 21841 * in the name could be anything. System will choose a ppa and 21842 * update new_ppa_ptr and inter_name to contain the choosen ppa. 21843 */ 21844 if (*new_ppa_ptr != UINT_MAX) { 21845 /* stoi changes the pointer */ 21846 old_ptr = ppa_ptr; 21847 /* 21848 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 21849 * (they don't have an externally visible ppa). We assign one 21850 * here so that we can manage the interface. Note that in 21851 * the past this value was always 0 for DLPI 1 drivers. 21852 */ 21853 if (*new_ppa_ptr == 0) 21854 *new_ppa_ptr = stoi(&old_ptr); 21855 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 21856 return (EINVAL); 21857 } 21858 /* 21859 * terminate string before ppa 21860 * save char at that location. 21861 */ 21862 old_char = ppa_ptr[0]; 21863 ppa_ptr[0] = '\0'; 21864 21865 ill->ill_ppa = *new_ppa_ptr; 21866 /* 21867 * Finish as much work now as possible before calling ill_glist_insert 21868 * which makes the ill globally visible and also merges it with the 21869 * other protocol instance of this phyint. 
The remaining work is 21870 * done after entering the ipsq which may happen sometime later. 21871 * ill_set_ndd_name occurs after the ill has been made globally visible. 21872 */ 21873 ipif = ill->ill_ipif; 21874 21875 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 21876 ipif_assign_seqid(ipif); 21877 21878 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 21879 ill->ill_flags |= ILLF_IPV4; 21880 21881 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 21882 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 21883 21884 if (ill->ill_flags & ILLF_IPV6) { 21885 21886 ill->ill_isv6 = B_TRUE; 21887 if (ill->ill_rq != NULL) { 21888 ill->ill_rq->q_qinfo = &rinit_ipv6; 21889 ill->ill_wq->q_qinfo = &winit_ipv6; 21890 } 21891 21892 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 21893 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 21894 ipif->ipif_v6src_addr = ipv6_all_zeros; 21895 ipif->ipif_v6subnet = ipv6_all_zeros; 21896 ipif->ipif_v6net_mask = ipv6_all_zeros; 21897 ipif->ipif_v6brd_addr = ipv6_all_zeros; 21898 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 21899 /* 21900 * point-to-point or Non-mulicast capable 21901 * interfaces won't do NUD unless explicitly 21902 * configured to do so. 21903 */ 21904 if (ipif->ipif_flags & IPIF_POINTOPOINT || 21905 !(ill->ill_flags & ILLF_MULTICAST)) { 21906 ill->ill_flags |= ILLF_NONUD; 21907 } 21908 /* Make sure IPv4 specific flag is not set on IPv6 if */ 21909 if (ill->ill_flags & ILLF_NOARP) { 21910 /* 21911 * Note: xresolv interfaces will eventually need 21912 * NOARP set here as well, but that will require 21913 * those external resolvers to have some 21914 * knowledge of that flag and act appropriately. 21915 * Not to be changed at present. 21916 */ 21917 ill->ill_flags &= ~ILLF_NOARP; 21918 } 21919 /* 21920 * Set the ILLF_ROUTER flag according to the global 21921 * IPv6 forwarding policy. 
21922 */ 21923 if (ipv6_forward != 0) 21924 ill->ill_flags |= ILLF_ROUTER; 21925 } else if (ill->ill_flags & ILLF_IPV4) { 21926 ill->ill_isv6 = B_FALSE; 21927 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 21928 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 21929 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 21930 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 21931 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 21932 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 21933 /* 21934 * Set the ILLF_ROUTER flag according to the global 21935 * IPv4 forwarding policy. 21936 */ 21937 if (ip_g_forward != 0) 21938 ill->ill_flags |= ILLF_ROUTER; 21939 } 21940 21941 ASSERT(ill->ill_phyint != NULL); 21942 21943 /* 21944 * The ipv6Ifindex and ipv6IfIcmpIfIndex assignments will 21945 * be completed in ill_glist_insert -> ill_phyint_reinit 21946 */ 21947 if (ill->ill_isv6) { 21948 /* allocate v6 mib */ 21949 if (!ill_allocate_mibs(ill)) 21950 return (ENOMEM); 21951 } 21952 21953 /* 21954 * Pick a default sap until we get the DL_INFO_ACK back from 21955 * the driver. 21956 */ 21957 if (ill->ill_sap == 0) { 21958 if (ill->ill_isv6) 21959 ill->ill_sap = IP6_DL_SAP; 21960 else 21961 ill->ill_sap = IP_DL_SAP; 21962 } 21963 21964 ill->ill_ifname_pending = 1; 21965 ill->ill_ifname_pending_err = 0; 21966 21967 ill_refhold(ill); 21968 rw_enter(&ill_g_lock, RW_WRITER); 21969 if ((error = ill_glist_insert(ill, interf_name, 21970 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 21971 ill->ill_ppa = UINT_MAX; 21972 ill->ill_name[0] = '\0'; 21973 /* 21974 * undo null termination done above. 21975 */ 21976 ppa_ptr[0] = old_char; 21977 rw_exit(&ill_g_lock); 21978 ill_refrele(ill); 21979 return (error); 21980 } 21981 21982 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 21983 21984 /* 21985 * When we return the buffer pointed to by interf_name should contain 21986 * the same name as in ill_name. 
21987 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 21988 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 21989 * so copy full name and update the ppa ptr. 21990 * When ppa passed in != UINT_MAX all values are correct just undo 21991 * null termination, this saves a bcopy. 21992 */ 21993 if (*new_ppa_ptr == UINT_MAX) { 21994 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 21995 *new_ppa_ptr = ill->ill_ppa; 21996 } else { 21997 /* 21998 * undo null termination done above. 21999 */ 22000 ppa_ptr[0] = old_char; 22001 } 22002 22003 /* Let SCTP know about this ILL */ 22004 sctp_update_ill(ill, SCTP_ILL_INSERT); 22005 22006 /* and also about the first ipif */ 22007 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 22008 22009 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP, 22010 B_TRUE); 22011 22012 rw_exit(&ill_g_lock); 22013 ill_refrele(ill); 22014 if (ipsq == NULL) 22015 return (EINPROGRESS); 22016 22017 /* 22018 * Need to set the ipsq_current_ipif now, if we have changed ipsq 22019 * due to the phyint merge in ill_phyint_reinit. 22020 */ 22021 ASSERT(ipsq->ipsq_current_ipif == NULL || 22022 ipsq->ipsq_current_ipif == ipif); 22023 ipsq->ipsq_current_ipif = ipif; 22024 ipsq->ipsq_last_cmd = SIOCSLIFNAME; 22025 error = ipif_set_values_tail(ill, ipif, mp, q); 22026 ipsq_exit(ipsq, B_TRUE, B_TRUE); 22027 if (error != 0 && error != EINPROGRESS) { 22028 /* 22029 * restore previous values 22030 */ 22031 ill->ill_isv6 = B_FALSE; 22032 } 22033 return (error); 22034 } 22035 22036 22037 extern void (*ip_cleanup_func)(void); 22038 22039 void 22040 ipif_init(void) 22041 { 22042 hrtime_t hrt; 22043 int i; 22044 22045 /* 22046 * Can't call drv_getparm here as it is too early in the boot. 22047 * As we use ipif_src_random just for picking a different 22048 * source address everytime, this need not be really random. 
22049 */ 22050 hrt = gethrtime(); 22051 ipif_src_random = ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); 22052 22053 for (i = 0; i < MAX_G_HEADS; i++) { 22054 ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ill_g_heads[i]; 22055 ill_g_heads[i].ill_g_list_tail = (ill_if_t *)&ill_g_heads[i]; 22056 } 22057 22058 avl_create(&phyint_g_list.phyint_list_avl_by_index, 22059 ill_phyint_compare_index, 22060 sizeof (phyint_t), 22061 offsetof(struct phyint, phyint_avl_by_index)); 22062 avl_create(&phyint_g_list.phyint_list_avl_by_name, 22063 ill_phyint_compare_name, 22064 sizeof (phyint_t), 22065 offsetof(struct phyint, phyint_avl_by_name)); 22066 22067 ip_cleanup_func = ip_thread_exit; 22068 } 22069 22070 /* 22071 * This is called by ip_rt_add when src_addr value is other than zero. 22072 * src_addr signifies the source address of the incoming packet. For 22073 * reverse tunnel route we need to create a source addr based routing 22074 * table. This routine creates ip_mrtun_table if it's empty and then 22075 * it adds the route entry hashed by source address. It verifies that 22076 * the outgoing interface is always a non-resolver interface (tunnel). 22077 */ 22078 int 22079 ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg, 22080 ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func) 22081 { 22082 ire_t *ire; 22083 ire_t *save_ire; 22084 ipif_t *ipif; 22085 ill_t *in_ill = NULL; 22086 ill_t *out_ill; 22087 queue_t *stq; 22088 mblk_t *dlureq_mp; 22089 int error; 22090 22091 if (ire_arg != NULL) 22092 *ire_arg = NULL; 22093 ASSERT(in_src_addr != INADDR_ANY); 22094 22095 ipif = ipif_arg; 22096 if (ipif != NULL) { 22097 out_ill = ipif->ipif_ill; 22098 } else { 22099 ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n")); 22100 return (EINVAL); 22101 } 22102 22103 if (src_ipif == NULL) { 22104 ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n")); 22105 return (EINVAL); 22106 } 22107 in_ill = src_ipif->ipif_ill; 22108 22109 /* 22110 * Check for duplicates. 
We don't need to 22111 * match out_ill, because the uniqueness of 22112 * a route is only dependent on src_addr and 22113 * in_ill. 22114 */ 22115 ire = ire_mrtun_lookup(in_src_addr, in_ill); 22116 if (ire != NULL) { 22117 ire_refrele(ire); 22118 return (EEXIST); 22119 } 22120 if (ipif->ipif_net_type != IRE_IF_NORESOLVER) { 22121 ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n", 22122 ipif->ipif_net_type)); 22123 return (EINVAL); 22124 } 22125 22126 stq = ipif->ipif_wq; 22127 ASSERT(stq != NULL); 22128 22129 /* 22130 * The outgoing interface must be non-resolver 22131 * interface. 22132 */ 22133 dlureq_mp = ill_dlur_gen(NULL, 22134 out_ill->ill_phys_addr_length, out_ill->ill_sap, 22135 out_ill->ill_sap_length); 22136 22137 if (dlureq_mp == NULL) { 22138 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 22139 return (ENOMEM); 22140 } 22141 22142 /* Create the IRE. */ 22143 22144 ire = ire_create( 22145 NULL, /* Zero dst addr */ 22146 NULL, /* Zero mask */ 22147 NULL, /* Zero gateway addr */ 22148 NULL, /* Zero ipif_src addr */ 22149 (uint8_t *)&in_src_addr, /* in_src-addr */ 22150 &ipif->ipif_mtu, 22151 NULL, 22152 NULL, /* rfq */ 22153 stq, 22154 IRE_MIPRTUN, 22155 dlureq_mp, 22156 ipif, 22157 in_ill, 22158 0, 22159 0, 22160 0, 22161 flags, 22162 &ire_uinfo_null, 22163 NULL, 22164 NULL); 22165 22166 if (ire == NULL) 22167 return (ENOMEM); 22168 ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n", 22169 ire->ire_type)); 22170 save_ire = ire; 22171 ASSERT(save_ire != NULL); 22172 error = ire_add_mrtun(&ire, q, mp, func); 22173 /* 22174 * If ire_add_mrtun() failed, the ire passed in was freed 22175 * so there is no need to do so here. 22176 */ 22177 if (error != 0) { 22178 return (error); 22179 } 22180 22181 /* Duplicate check */ 22182 if (ire != save_ire) { 22183 /* route already exists by now */ 22184 ire_refrele(ire); 22185 return (EEXIST); 22186 } 22187 22188 if (ire_arg != NULL) { 22189 /* 22190 * Store the ire that was just added. 
the caller 22191 * ip_rts_request responsible for doing ire_refrele() 22192 * on it. 22193 */ 22194 *ire_arg = ire; 22195 } else { 22196 ire_refrele(ire); /* held in ire_add_mrtun */ 22197 } 22198 22199 return (0); 22200 } 22201 22202 /* 22203 * It is called by ip_rt_delete() only when mipagent requests to delete 22204 * a reverse tunnel route that was added by ip_mrtun_rt_add() before. 22205 */ 22206 22207 int 22208 ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif) 22209 { 22210 ire_t *ire = NULL; 22211 22212 if (in_src_addr == INADDR_ANY) 22213 return (EINVAL); 22214 if (src_ipif == NULL) 22215 return (EINVAL); 22216 22217 /* search if this route exists in the ip_mrtun_table */ 22218 ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill); 22219 if (ire == NULL) { 22220 ip2dbg(("ip_mrtun_rt_delete: ire not found\n")); 22221 return (ESRCH); 22222 } 22223 ire_delete(ire); 22224 ire_refrele(ire); 22225 return (0); 22226 } 22227 22228 /* 22229 * Lookup the ipif corresponding to the onlink destination address. For 22230 * point-to-point interfaces, it matches with remote endpoint destination 22231 * address. For point-to-multipoint interfaces it only tries to match the 22232 * destination with the interface's subnet address. The longest, most specific 22233 * match is found to take care of such rare network configurations like - 22234 * le0: 129.146.1.1/16 22235 * le1: 129.146.2.2/24 22236 * It is used only by SO_DONTROUTE at the moment. 
22237 */ 22238 ipif_t * 22239 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid) 22240 { 22241 ipif_t *ipif, *best_ipif; 22242 ill_t *ill; 22243 ill_walk_context_t ctx; 22244 22245 ASSERT(zoneid != ALL_ZONES); 22246 best_ipif = NULL; 22247 22248 rw_enter(&ill_g_lock, RW_READER); 22249 ill = ILL_START_WALK_V4(&ctx); 22250 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 22251 mutex_enter(&ill->ill_lock); 22252 for (ipif = ill->ill_ipif; ipif != NULL; 22253 ipif = ipif->ipif_next) { 22254 if (!IPIF_CAN_LOOKUP(ipif)) 22255 continue; 22256 if (ipif->ipif_zoneid != zoneid && 22257 ipif->ipif_zoneid != ALL_ZONES) 22258 continue; 22259 /* 22260 * Point-to-point case. Look for exact match with 22261 * destination address. 22262 */ 22263 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 22264 if (ipif->ipif_pp_dst_addr == addr) { 22265 ipif_refhold_locked(ipif); 22266 mutex_exit(&ill->ill_lock); 22267 rw_exit(&ill_g_lock); 22268 if (best_ipif != NULL) 22269 ipif_refrele(best_ipif); 22270 return (ipif); 22271 } 22272 } else if (ipif->ipif_subnet == (addr & 22273 ipif->ipif_net_mask)) { 22274 /* 22275 * Point-to-multipoint case. Looping through to 22276 * find the most specific match. If there are 22277 * multiple best match ipif's then prefer ipif's 22278 * that are UP. If there is only one best match 22279 * ipif and it is DOWN we must still return it. 
22280 */ 22281 if ((best_ipif == NULL) || 22282 (ipif->ipif_net_mask > 22283 best_ipif->ipif_net_mask) || 22284 ((ipif->ipif_net_mask == 22285 best_ipif->ipif_net_mask) && 22286 ((ipif->ipif_flags & IPIF_UP) && 22287 (!(best_ipif->ipif_flags & IPIF_UP))))) { 22288 ipif_refhold_locked(ipif); 22289 mutex_exit(&ill->ill_lock); 22290 rw_exit(&ill_g_lock); 22291 if (best_ipif != NULL) 22292 ipif_refrele(best_ipif); 22293 best_ipif = ipif; 22294 rw_enter(&ill_g_lock, RW_READER); 22295 mutex_enter(&ill->ill_lock); 22296 } 22297 } 22298 } 22299 mutex_exit(&ill->ill_lock); 22300 } 22301 rw_exit(&ill_g_lock); 22302 return (best_ipif); 22303 } 22304 22305 22306 /* 22307 * Save enough information so that we can recreate the IRE if 22308 * the interface goes down and then up. 22309 */ 22310 static void 22311 ipif_save_ire(ipif_t *ipif, ire_t *ire) 22312 { 22313 mblk_t *save_mp; 22314 22315 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 22316 if (save_mp != NULL) { 22317 ifrt_t *ifrt; 22318 22319 save_mp->b_wptr += sizeof (ifrt_t); 22320 ifrt = (ifrt_t *)save_mp->b_rptr; 22321 bzero(ifrt, sizeof (ifrt_t)); 22322 ifrt->ifrt_type = ire->ire_type; 22323 ifrt->ifrt_addr = ire->ire_addr; 22324 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 22325 ifrt->ifrt_src_addr = ire->ire_src_addr; 22326 ifrt->ifrt_mask = ire->ire_mask; 22327 ifrt->ifrt_flags = ire->ire_flags; 22328 ifrt->ifrt_max_frag = ire->ire_max_frag; 22329 mutex_enter(&ipif->ipif_saved_ire_lock); 22330 save_mp->b_cont = ipif->ipif_saved_ire_mp; 22331 ipif->ipif_saved_ire_mp = save_mp; 22332 ipif->ipif_saved_ire_cnt++; 22333 mutex_exit(&ipif->ipif_saved_ire_lock); 22334 } 22335 } 22336 22337 22338 static void 22339 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 22340 { 22341 mblk_t **mpp; 22342 mblk_t *mp; 22343 ifrt_t *ifrt; 22344 22345 /* Remove from ipif_saved_ire_mp list if it is there */ 22346 mutex_enter(&ipif->ipif_saved_ire_lock); 22347 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 22348 mpp = &(*mpp)->b_cont) { 
22349 /* 22350 * On a given ipif, the triple of address, gateway and 22351 * mask is unique for each saved IRE (in the case of 22352 * ordinary interface routes, the gateway address is 22353 * all-zeroes). 22354 */ 22355 mp = *mpp; 22356 ifrt = (ifrt_t *)mp->b_rptr; 22357 if (ifrt->ifrt_addr == ire->ire_addr && 22358 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 22359 ifrt->ifrt_mask == ire->ire_mask) { 22360 *mpp = mp->b_cont; 22361 ipif->ipif_saved_ire_cnt--; 22362 freeb(mp); 22363 break; 22364 } 22365 } 22366 mutex_exit(&ipif->ipif_saved_ire_lock); 22367 } 22368 22369 22370 /* 22371 * IP multirouting broadcast routes handling 22372 * Append CGTP broadcast IREs to regular ones created 22373 * at ifconfig time. 22374 */ 22375 static void 22376 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst) 22377 { 22378 ire_t *ire_prim; 22379 22380 ASSERT(ire != NULL); 22381 ASSERT(ire_dst != NULL); 22382 22383 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 22384 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 22385 if (ire_prim != NULL) { 22386 /* 22387 * We are in the special case of broadcasts for 22388 * CGTP. We add an IRE_BROADCAST that holds 22389 * the RTF_MULTIRT flag, the destination 22390 * address of ire_dst and the low level 22391 * info of ire_prim. In other words, CGTP 22392 * broadcast is added to the redundant ipif. 
22393 */ 22394 ipif_t *ipif_prim; 22395 ire_t *bcast_ire; 22396 22397 ipif_prim = ire_prim->ire_ipif; 22398 22399 ip2dbg(("ip_cgtp_filter_bcast_add: " 22400 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 22401 (void *)ire_dst, (void *)ire_prim, 22402 (void *)ipif_prim)); 22403 22404 bcast_ire = ire_create( 22405 (uchar_t *)&ire->ire_addr, 22406 (uchar_t *)&ip_g_all_ones, 22407 (uchar_t *)&ire_dst->ire_src_addr, 22408 (uchar_t *)&ire->ire_gateway_addr, 22409 NULL, 22410 &ipif_prim->ipif_mtu, 22411 NULL, 22412 ipif_prim->ipif_rq, 22413 ipif_prim->ipif_wq, 22414 IRE_BROADCAST, 22415 ipif_prim->ipif_bcast_mp, 22416 ipif_prim, 22417 NULL, 22418 0, 22419 0, 22420 0, 22421 ire->ire_flags, 22422 &ire_uinfo_null, 22423 NULL, 22424 NULL); 22425 22426 if (bcast_ire != NULL) { 22427 22428 if (ire_add(&bcast_ire, NULL, NULL, NULL, 22429 B_FALSE) == 0) { 22430 ip2dbg(("ip_cgtp_filter_bcast_add: " 22431 "added bcast_ire %p\n", 22432 (void *)bcast_ire)); 22433 22434 ipif_save_ire(bcast_ire->ire_ipif, 22435 bcast_ire); 22436 ire_refrele(bcast_ire); 22437 } 22438 } 22439 ire_refrele(ire_prim); 22440 } 22441 } 22442 22443 22444 /* 22445 * IP multirouting broadcast routes handling 22446 * Remove the broadcast ire 22447 */ 22448 static void 22449 ip_cgtp_bcast_delete(ire_t *ire) 22450 { 22451 ire_t *ire_dst; 22452 22453 ASSERT(ire != NULL); 22454 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 22455 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 22456 if (ire_dst != NULL) { 22457 ire_t *ire_prim; 22458 22459 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 22460 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 22461 if (ire_prim != NULL) { 22462 ipif_t *ipif_prim; 22463 ire_t *bcast_ire; 22464 22465 ipif_prim = ire_prim->ire_ipif; 22466 22467 ip2dbg(("ip_cgtp_filter_bcast_delete: " 22468 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 22469 (void *)ire_dst, (void *)ire_prim, 22470 (void *)ipif_prim)); 22471 22472 bcast_ire = ire_ctable_lookup(ire->ire_addr, 22473 
ire->ire_gateway_addr, 22474 IRE_BROADCAST, 22475 ipif_prim, ALL_ZONES, 22476 NULL, 22477 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 22478 MATCH_IRE_MASK); 22479 22480 if (bcast_ire != NULL) { 22481 ip2dbg(("ip_cgtp_filter_bcast_delete: " 22482 "looked up bcast_ire %p\n", 22483 (void *)bcast_ire)); 22484 ipif_remove_ire(bcast_ire->ire_ipif, 22485 bcast_ire); 22486 ire_delete(bcast_ire); 22487 } 22488 ire_refrele(ire_prim); 22489 } 22490 ire_refrele(ire_dst); 22491 } 22492 } 22493 22494 /* 22495 * IPsec hardware acceleration capabilities related functions. 22496 */ 22497 22498 /* 22499 * Free a per-ill IPsec capabilities structure. 22500 */ 22501 static void 22502 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 22503 { 22504 if (capab->auth_hw_algs != NULL) 22505 kmem_free(capab->auth_hw_algs, capab->algs_size); 22506 if (capab->encr_hw_algs != NULL) 22507 kmem_free(capab->encr_hw_algs, capab->algs_size); 22508 if (capab->encr_algparm != NULL) 22509 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 22510 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 22511 } 22512 22513 /* 22514 * Allocate a new per-ill IPsec capabilities structure. This structure 22515 * is specific to an IPsec protocol (AH or ESP). It is implemented as 22516 * an array which specifies, for each algorithm, whether this algorithm 22517 * is supported by the ill or not. 
22518 */ 22519 static ill_ipsec_capab_t * 22520 ill_ipsec_capab_alloc(void) 22521 { 22522 ill_ipsec_capab_t *capab; 22523 uint_t nelems; 22524 22525 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 22526 if (capab == NULL) 22527 return (NULL); 22528 22529 /* we need one bit per algorithm */ 22530 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 22531 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 22532 22533 /* allocate memory to store algorithm flags */ 22534 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 22535 if (capab->encr_hw_algs == NULL) 22536 goto nomem; 22537 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 22538 if (capab->auth_hw_algs == NULL) 22539 goto nomem; 22540 /* 22541 * Leave encr_algparm NULL for now since we won't need it half 22542 * the time 22543 */ 22544 return (capab); 22545 22546 nomem: 22547 ill_ipsec_capab_free(capab); 22548 return (NULL); 22549 } 22550 22551 /* 22552 * Resize capability array. Since we're exclusive, this is OK. 22553 */ 22554 static boolean_t 22555 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 22556 { 22557 ipsec_capab_algparm_t *nalp, *oalp; 22558 uint32_t olen, nlen; 22559 22560 oalp = capab->encr_algparm; 22561 olen = capab->encr_algparm_size; 22562 22563 if (oalp != NULL) { 22564 if (algid < capab->encr_algparm_end) 22565 return (B_TRUE); 22566 } 22567 22568 nlen = (algid + 1) * sizeof (*nalp); 22569 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 22570 if (nalp == NULL) 22571 return (B_FALSE); 22572 22573 if (oalp != NULL) { 22574 bcopy(oalp, nalp, olen); 22575 kmem_free(oalp, olen); 22576 } 22577 capab->encr_algparm = nalp; 22578 capab->encr_algparm_size = nlen; 22579 capab->encr_algparm_end = algid + 1; 22580 22581 return (B_TRUE); 22582 } 22583 22584 /* 22585 * Compare the capabilities of the specified ill with the protocol 22586 * and algorithms specified by the SA passed as argument. 
22587 * If they match, returns B_TRUE, B_FALSE if they do not match. 22588 * 22589 * The ill can be passed as a pointer to it, or by specifying its index 22590 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 22591 * 22592 * Called by ipsec_out_is_accelerated() do decide whether an outbound 22593 * packet is eligible for hardware acceleration, and by 22594 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down 22595 * to a particular ill. 22596 */ 22597 boolean_t 22598 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 22599 ipsa_t *sa) 22600 { 22601 boolean_t sa_isv6; 22602 uint_t algid; 22603 struct ill_ipsec_capab_s *cpp; 22604 boolean_t need_refrele = B_FALSE; 22605 22606 if (ill == NULL) { 22607 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 22608 NULL, NULL, NULL); 22609 if (ill == NULL) { 22610 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 22611 return (B_FALSE); 22612 } 22613 need_refrele = B_TRUE; 22614 } 22615 22616 /* 22617 * Use the address length specified by the SA to determine 22618 * if it corresponds to a IPv6 address, and fail the matching 22619 * if the isv6 flag passed as argument does not match. 22620 * Note: this check is used for SADB capability checking before 22621 * sending SA information to an ill. 22622 */ 22623 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 22624 if (sa_isv6 != ill_isv6) 22625 /* protocol mismatch */ 22626 goto done; 22627 22628 /* 22629 * Check if the ill supports the protocol, algorithm(s) and 22630 * key size(s) specified by the SA, and get the pointers to 22631 * the algorithms supported by the ill. 
22632 */ 22633 switch (sa->ipsa_type) { 22634 22635 case SADB_SATYPE_ESP: 22636 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 22637 /* ill does not support ESP acceleration */ 22638 goto done; 22639 cpp = ill->ill_ipsec_capab_esp; 22640 algid = sa->ipsa_auth_alg; 22641 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 22642 goto done; 22643 algid = sa->ipsa_encr_alg; 22644 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 22645 goto done; 22646 if (algid < cpp->encr_algparm_end) { 22647 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 22648 if (sa->ipsa_encrkeybits < alp->minkeylen) 22649 goto done; 22650 if (sa->ipsa_encrkeybits > alp->maxkeylen) 22651 goto done; 22652 } 22653 break; 22654 22655 case SADB_SATYPE_AH: 22656 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 22657 /* ill does not support AH acceleration */ 22658 goto done; 22659 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 22660 ill->ill_ipsec_capab_ah->auth_hw_algs)) 22661 goto done; 22662 break; 22663 } 22664 22665 if (need_refrele) 22666 ill_refrele(ill); 22667 return (B_TRUE); 22668 done: 22669 if (need_refrele) 22670 ill_refrele(ill); 22671 return (B_FALSE); 22672 } 22673 22674 22675 /* 22676 * Add a new ill to the list of IPsec capable ills. 22677 * Called from ill_capability_ipsec_ack() when an ACK was received 22678 * indicating that IPsec hardware processing was enabled for an ill. 22679 * 22680 * ill must point to the ill for which acceleration was enabled. 22681 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
22682 */ 22683 static void 22684 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 22685 { 22686 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 22687 uint_t sa_type; 22688 uint_t ipproto; 22689 22690 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 22691 (dl_cap == DL_CAPAB_IPSEC_ESP)); 22692 22693 switch (dl_cap) { 22694 case DL_CAPAB_IPSEC_AH: 22695 sa_type = SADB_SATYPE_AH; 22696 ills = &ipsec_capab_ills_ah; 22697 ipproto = IPPROTO_AH; 22698 break; 22699 case DL_CAPAB_IPSEC_ESP: 22700 sa_type = SADB_SATYPE_ESP; 22701 ills = &ipsec_capab_ills_esp; 22702 ipproto = IPPROTO_ESP; 22703 break; 22704 } 22705 22706 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 22707 22708 /* 22709 * Add ill index to list of hardware accelerators. If 22710 * already in list, do nothing. 22711 */ 22712 for (cur_ill = *ills; cur_ill != NULL && 22713 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 22714 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 22715 ; 22716 22717 if (cur_ill == NULL) { 22718 /* if this is a new entry for this ill */ 22719 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 22720 if (new_ill == NULL) { 22721 rw_exit(&ipsec_capab_ills_lock); 22722 return; 22723 } 22724 22725 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 22726 new_ill->ill_isv6 = ill->ill_isv6; 22727 new_ill->next = *ills; 22728 *ills = new_ill; 22729 } else if (!sadb_resync) { 22730 /* not resync'ing SADB and an entry exists for this ill */ 22731 rw_exit(&ipsec_capab_ills_lock); 22732 return; 22733 } 22734 22735 rw_exit(&ipsec_capab_ills_lock); 22736 22737 if (ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 22738 /* 22739 * IPsec module for protocol loaded, initiate dump 22740 * of the SADB to this ill. 22741 */ 22742 sadb_ill_download(ill, sa_type); 22743 } 22744 22745 /* 22746 * Remove an ill from the list of IPsec capable ills. 
22747 */ 22748 static void 22749 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 22750 { 22751 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 22752 22753 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 22754 dl_cap == DL_CAPAB_IPSEC_ESP); 22755 22756 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipsec_capab_ills_ah : 22757 &ipsec_capab_ills_esp; 22758 22759 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 22760 22761 prev_ill = NULL; 22762 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 22763 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 22764 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 22765 ; 22766 if (cur_ill == NULL) { 22767 /* entry not found */ 22768 rw_exit(&ipsec_capab_ills_lock); 22769 return; 22770 } 22771 if (prev_ill == NULL) { 22772 /* entry at front of list */ 22773 *ills = NULL; 22774 } else { 22775 prev_ill->next = cur_ill->next; 22776 } 22777 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 22778 rw_exit(&ipsec_capab_ills_lock); 22779 } 22780 22781 22782 /* 22783 * Handling of DL_CONTROL_REQ messages that must be sent down to 22784 * an ill while having exclusive access. 22785 */ 22786 /* ARGSUSED */ 22787 static void 22788 ill_ipsec_capab_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 22789 { 22790 ill_t *ill = (ill_t *)q->q_ptr; 22791 22792 ill_dlpi_send(ill, mp); 22793 } 22794 22795 22796 /* 22797 * Called by SADB to send a DL_CONTROL_REQ message to every ill 22798 * supporting the specified IPsec protocol acceleration. 22799 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 22800 * We free the mblk and, if sa is non-null, release the held referece. 22801 */ 22802 void 22803 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa) 22804 { 22805 ipsec_capab_ill_t *ici, *cur_ici; 22806 ill_t *ill; 22807 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 22808 22809 ici = (sa_type == SADB_SATYPE_AH) ? 
ipsec_capab_ills_ah : 22810 ipsec_capab_ills_esp; 22811 22812 rw_enter(&ipsec_capab_ills_lock, RW_READER); 22813 22814 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 22815 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 22816 cur_ici->ill_isv6, NULL, NULL, NULL, NULL); 22817 22818 /* 22819 * Handle the case where the ill goes away while the SADB is 22820 * attempting to send messages. If it's going away, it's 22821 * nuking its shadow SADB, so we don't care.. 22822 */ 22823 22824 if (ill == NULL) 22825 continue; 22826 22827 if (sa != NULL) { 22828 /* 22829 * Make sure capabilities match before 22830 * sending SA to ill. 22831 */ 22832 if (!ipsec_capab_match(ill, cur_ici->ill_index, 22833 cur_ici->ill_isv6, sa)) { 22834 ill_refrele(ill); 22835 continue; 22836 } 22837 22838 mutex_enter(&sa->ipsa_lock); 22839 sa->ipsa_flags |= IPSA_F_HW; 22840 mutex_exit(&sa->ipsa_lock); 22841 } 22842 22843 /* 22844 * Copy template message, and add it to the front 22845 * of the mblk ship list. We want to avoid holding 22846 * the ipsec_capab_ills_lock while sending the 22847 * message to the ills. 22848 * 22849 * The b_next and b_prev are temporarily used 22850 * to build a list of mblks to be sent down, and to 22851 * save the ill to which they must be sent. 22852 */ 22853 nmp = copymsg(mp); 22854 if (nmp == NULL) { 22855 ill_refrele(ill); 22856 continue; 22857 } 22858 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 22859 nmp->b_next = mp_ship_list; 22860 mp_ship_list = nmp; 22861 nmp->b_prev = (mblk_t *)ill; 22862 } 22863 22864 rw_exit(&ipsec_capab_ills_lock); 22865 22866 nmp = mp_ship_list; 22867 while (nmp != NULL) { 22868 /* restore the mblk to a sane state */ 22869 next_mp = nmp->b_next; 22870 nmp->b_next = NULL; 22871 ill = (ill_t *)nmp->b_prev; 22872 nmp->b_prev = NULL; 22873 22874 /* 22875 * Ship the mblk to the ill, must be exclusive. Keep the 22876 * reference to the ill as qwriter_ip() does a ill_referele(). 
22877 */ 22878 (void) qwriter_ip(NULL, ill, ill->ill_wq, nmp, 22879 ill_ipsec_capab_send_writer, NEW_OP, B_TRUE); 22880 22881 nmp = next_mp; 22882 } 22883 22884 if (sa != NULL) 22885 IPSA_REFRELE(sa); 22886 freemsg(mp); 22887 } 22888 22889 22890 /* 22891 * Derive an interface id from the link layer address. 22892 * Knows about IEEE 802 and IEEE EUI-64 mappings. 22893 */ 22894 static boolean_t 22895 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 22896 { 22897 char *addr; 22898 22899 if (phys_length != ETHERADDRL) 22900 return (B_FALSE); 22901 22902 /* Form EUI-64 like address */ 22903 addr = (char *)&v6addr->s6_addr32[2]; 22904 bcopy((char *)phys_addr, addr, 3); 22905 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 22906 addr[3] = (char)0xff; 22907 addr[4] = (char)0xfe; 22908 bcopy((char *)phys_addr + 3, addr + 5, 3); 22909 return (B_TRUE); 22910 } 22911 22912 /* ARGSUSED */ 22913 static boolean_t 22914 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 22915 { 22916 return (B_FALSE); 22917 } 22918 22919 /* ARGSUSED */ 22920 static boolean_t 22921 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 22922 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 22923 { 22924 /* 22925 * Multicast address mappings used over Ethernet/802.X. 22926 * This address is used as a base for mappings. 22927 */ 22928 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 22929 0x00, 0x00, 0x00}; 22930 22931 /* 22932 * Extract low order 32 bits from IPv6 multicast address. 22933 * Or that into the link layer address, starting from the 22934 * second byte. 
22935 */ 22936 *hw_start = 2; 22937 v6_extract_mask->s6_addr32[0] = 0; 22938 v6_extract_mask->s6_addr32[1] = 0; 22939 v6_extract_mask->s6_addr32[2] = 0; 22940 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 22941 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 22942 return (B_TRUE); 22943 } 22944 22945 /* 22946 * Indicate by return value whether multicast is supported. If not, 22947 * this code should not touch/change any parameters. 22948 */ 22949 /* ARGSUSED */ 22950 static boolean_t 22951 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 22952 uint32_t *hw_start, ipaddr_t *extract_mask) 22953 { 22954 /* 22955 * Multicast address mappings used over Ethernet/802.X. 22956 * This address is used as a base for mappings. 22957 */ 22958 static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, 22959 0x00, 0x00, 0x00 }; 22960 22961 if (phys_length != ETHERADDRL) 22962 return (B_FALSE); 22963 22964 *extract_mask = htonl(0x007fffff); 22965 *hw_start = 2; 22966 bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); 22967 return (B_TRUE); 22968 } 22969 22970 /* 22971 * Derive IPoIB interface id from the link layer address. 22972 */ 22973 static boolean_t 22974 ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 22975 { 22976 char *addr; 22977 22978 if (phys_length != 20) 22979 return (B_FALSE); 22980 addr = (char *)&v6addr->s6_addr32[2]; 22981 bcopy(phys_addr + 12, addr, 8); 22982 /* 22983 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 22984 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 22985 * rules. In these cases, the IBA considers these GUIDs to be in 22986 * "Modified EUI-64" format, and thus toggling the u/l bit is not 22987 * required; vendors are required not to assign global EUI-64's 22988 * that differ only in u/l bit values, thus guaranteeing uniqueness 22989 * of the interface identifier. 
Whether the GUID is in modified 22990 * or proper EUI-64 format, the ipv6 identifier must have the u/l 22991 * bit set to 1. 22992 */ 22993 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 22994 return (B_TRUE); 22995 } 22996 22997 /* 22998 * Note on mapping from multicast IP addresses to IPoIB multicast link 22999 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 23000 * The format of an IPoIB multicast address is: 23001 * 23002 * 4 byte QPN Scope Sign. Pkey 23003 * +--------------------------------------------+ 23004 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 23005 * +--------------------------------------------+ 23006 * 23007 * The Scope and Pkey components are properties of the IBA port and 23008 * network interface. They can be ascertained from the broadcast address. 23009 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 23010 */ 23011 23012 static boolean_t 23013 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 23014 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 23015 { 23016 /* 23017 * Base IPoIB IPv6 multicast address used for mappings. 23018 * Does not contain the IBA scope/Pkey values. 23019 */ 23020 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 23021 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 23022 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 23023 23024 /* 23025 * Extract low order 80 bits from IPv6 multicast address. 23026 * Or that into the link layer address, starting from the 23027 * sixth byte. 23028 */ 23029 *hw_start = 6; 23030 bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); 23031 23032 /* 23033 * Now fill in the IBA scope/Pkey values from the broadcast address. 
23034 */ 23035 *(maddr + 5) = *(bphys_addr + 5); 23036 *(maddr + 8) = *(bphys_addr + 8); 23037 *(maddr + 9) = *(bphys_addr + 9); 23038 23039 v6_extract_mask->s6_addr32[0] = 0; 23040 v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); 23041 v6_extract_mask->s6_addr32[2] = 0xffffffffU; 23042 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 23043 return (B_TRUE); 23044 } 23045 23046 static boolean_t 23047 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 23048 uint32_t *hw_start, ipaddr_t *extract_mask) 23049 { 23050 /* 23051 * Base IPoIB IPv4 multicast address used for mappings. 23052 * Does not contain the IBA scope/Pkey values. 23053 */ 23054 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 23055 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 23056 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 23057 23058 if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) 23059 return (B_FALSE); 23060 23061 /* 23062 * Extract low order 28 bits from IPv4 multicast address. 23063 * Or that into the link layer address, starting from the 23064 * sixteenth byte. 23065 */ 23066 *extract_mask = htonl(0x0fffffff); 23067 *hw_start = 16; 23068 bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); 23069 23070 /* 23071 * Now fill in the IBA scope/Pkey values from the broadcast address. 23072 */ 23073 *(maddr + 5) = *(bphys_addr + 5); 23074 *(maddr + 8) = *(bphys_addr + 8); 23075 *(maddr + 9) = *(bphys_addr + 9); 23076 return (B_TRUE); 23077 } 23078 23079 /* 23080 * Returns B_TRUE if an ipif is present in the given zone, matching some flags 23081 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. 23082 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with 23083 * the link-local address is preferred. 
23084 */ 23085 boolean_t 23086 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 23087 { 23088 ipif_t *ipif; 23089 ipif_t *maybe_ipif = NULL; 23090 23091 mutex_enter(&ill->ill_lock); 23092 if (ill->ill_state_flags & ILL_CONDEMNED) { 23093 mutex_exit(&ill->ill_lock); 23094 if (ipifp != NULL) 23095 *ipifp = NULL; 23096 return (B_FALSE); 23097 } 23098 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 23099 if (!IPIF_CAN_LOOKUP(ipif)) 23100 continue; 23101 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 23102 ipif->ipif_zoneid != ALL_ZONES) 23103 continue; 23104 if ((ipif->ipif_flags & flags) != flags) 23105 continue; 23106 23107 if (ipifp == NULL) { 23108 mutex_exit(&ill->ill_lock); 23109 ASSERT(maybe_ipif == NULL); 23110 return (B_TRUE); 23111 } 23112 if (!ill->ill_isv6 || 23113 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { 23114 ipif_refhold_locked(ipif); 23115 mutex_exit(&ill->ill_lock); 23116 *ipifp = ipif; 23117 return (B_TRUE); 23118 } 23119 if (maybe_ipif == NULL) 23120 maybe_ipif = ipif; 23121 } 23122 if (ipifp != NULL) { 23123 if (maybe_ipif != NULL) 23124 ipif_refhold_locked(maybe_ipif); 23125 *ipifp = maybe_ipif; 23126 } 23127 mutex_exit(&ill->ill_lock); 23128 return (maybe_ipif != NULL); 23129 } 23130 23131 /* 23132 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. 23133 */ 23134 boolean_t 23135 ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 23136 { 23137 ill_t *illg; 23138 23139 /* 23140 * We look at the passed-in ill first without grabbing ill_g_lock. 23141 */ 23142 if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { 23143 return (B_TRUE); 23144 } 23145 rw_enter(&ill_g_lock, RW_READER); 23146 if (ill->ill_group == NULL) { 23147 /* ill not in a group */ 23148 rw_exit(&ill_g_lock); 23149 return (B_FALSE); 23150 } 23151 23152 /* 23153 * There's no ipif in the zone on ill, however ill is part of an IPMP 23154 * group. 
We need to look for an ipif in the zone on all the ills in the 23155 * group. 23156 */ 23157 illg = ill->ill_group->illgrp_ill; 23158 do { 23159 /* 23160 * We don't call ipif_lookup_zoneid() on ill as we already know 23161 * that it's not there. 23162 */ 23163 if (illg != ill && 23164 ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { 23165 break; 23166 } 23167 } while ((illg = illg->ill_group_next) != NULL); 23168 rw_exit(&ill_g_lock); 23169 return (illg != NULL); 23170 } 23171 23172 /* 23173 * Check if this ill is only being used to send ICMP probes for IPMP 23174 */ 23175 boolean_t 23176 ill_is_probeonly(ill_t *ill) 23177 { 23178 /* 23179 * Check if the interface is FAILED, or INACTIVE 23180 */ 23181 if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) 23182 return (B_TRUE); 23183 23184 return (B_FALSE); 23185 } 23186