1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains the interface control functions for IP. 
31 */ 32 33 #include <sys/types.h> 34 #include <sys/stream.h> 35 #include <sys/dlpi.h> 36 #include <sys/stropts.h> 37 #include <sys/strsun.h> 38 #include <sys/sysmacros.h> 39 #include <sys/strlog.h> 40 #include <sys/ddi.h> 41 #include <sys/sunddi.h> 42 #include <sys/cmn_err.h> 43 #include <sys/kstat.h> 44 #include <sys/debug.h> 45 #include <sys/zone.h> 46 47 #include <sys/kmem.h> 48 #include <sys/systm.h> 49 #include <sys/param.h> 50 #include <sys/socket.h> 51 #define _SUN_TPI_VERSION 2 52 #include <sys/tihdr.h> 53 #include <sys/isa_defs.h> 54 #include <net/if.h> 55 #include <net/if_arp.h> 56 #include <net/if_types.h> 57 #include <net/if_dl.h> 58 #include <net/route.h> 59 #include <sys/sockio.h> 60 #include <netinet/in.h> 61 #include <netinet/ip6.h> 62 #include <netinet/icmp6.h> 63 #include <netinet/igmp_var.h> 64 #include <sys/strsun.h> 65 #include <sys/policy.h> 66 #include <sys/ethernet.h> 67 68 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 69 #include <inet/mi.h> 70 #include <inet/nd.h> 71 #include <inet/arp.h> 72 #include <inet/mib2.h> 73 #include <inet/ip.h> 74 #include <inet/ip6.h> 75 #include <inet/ip6_asp.h> 76 #include <inet/tcp.h> 77 #include <inet/ip_multi.h> 78 #include <inet/ip_ire.h> 79 #include <inet/ip_rts.h> 80 #include <inet/ip_ndp.h> 81 #include <inet/ip_if.h> 82 #include <inet/ip_impl.h> 83 #include <inet/tun.h> 84 #include <inet/sctp_ip.h> 85 86 #include <net/pfkeyv2.h> 87 #include <inet/ipsec_info.h> 88 #include <inet/sadb.h> 89 #include <inet/ipsec_impl.h> 90 #include <sys/iphada.h> 91 92 93 #include <netinet/igmp.h> 94 #include <inet/ip_listutils.h> 95 #include <netinet/ip_mroute.h> 96 #include <inet/ipclassifier.h> 97 #include <sys/mac.h> 98 99 #include <sys/systeminfo.h> 100 #include <sys/bootconf.h> 101 102 /* The character which tells where the ill_name ends */ 103 #define IPIF_SEPARATOR_CHAR ':' 104 105 /* IP ioctl function table entry */ 106 typedef struct ipft_s { 107 int ipft_cmd; 108 pfi_t ipft_pfi; 109 
int ipft_min_size; 110 int ipft_flags; 111 } ipft_t; 112 #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ 113 #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ 114 115 typedef struct ip_sock_ar_s { 116 union { 117 area_t ip_sock_area; 118 ared_t ip_sock_ared; 119 areq_t ip_sock_areq; 120 } ip_sock_ar_u; 121 queue_t *ip_sock_ar_q; 122 } ip_sock_ar_t; 123 124 static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); 125 static int nd_ill_forward_set(queue_t *q, mblk_t *mp, 126 char *value, caddr_t cp, cred_t *ioc_cr); 127 128 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); 129 static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); 130 static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 131 mblk_t *mp, boolean_t need_up); 132 static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 133 mblk_t *mp, boolean_t need_up); 134 static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 135 queue_t *q, mblk_t *mp, boolean_t need_up); 136 static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, 137 mblk_t *mp, boolean_t need_up); 138 static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 139 mblk_t *mp); 140 static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, 141 queue_t *q, mblk_t *mp, boolean_t need_up); 142 static int ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, 143 sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl); 144 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **); 145 static void ip_wput_ioctl(queue_t *q, mblk_t *mp); 146 static void ipsq_flush(ill_t *ill); 147 static void ipsq_clean_all(ill_t *ill); 148 static void ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring); 149 static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, 150 queue_t *q, mblk_t *mp, boolean_t need_up); 151 static void ipsq_delete(ipsq_t *); 152 153 static ipif_t *ipif_allocate(ill_t *ill, int 
id, uint_t ire_type, 154 boolean_t initialize); 155 static void ipif_check_bcast_ires(ipif_t *test_ipif); 156 static void ipif_down_delete_ire(ire_t *ire, char *ipif); 157 static void ipif_delete_cache_ire(ire_t *, char *); 158 static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); 159 static void ipif_down_tail(ipif_t *ipif); 160 static void ipif_free(ipif_t *ipif); 161 static void ipif_free_tail(ipif_t *ipif); 162 static void ipif_mask_reply(ipif_t *); 163 static void ipif_mtu_change(ire_t *ire, char *ipif_arg); 164 static void ipif_multicast_down(ipif_t *ipif); 165 static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); 166 static void ipif_set_default(ipif_t *ipif); 167 static int ipif_set_values(queue_t *q, mblk_t *mp, 168 char *interf_name, uint_t *ppa); 169 static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, 170 queue_t *q); 171 static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, 172 boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, 173 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error); 174 static int ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp); 175 static void ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp); 176 177 static int ill_alloc_ppa(ill_if_t *, ill_t *); 178 static int ill_arp_off(ill_t *ill); 179 static int ill_arp_on(ill_t *ill); 180 static void ill_delete_interface_type(ill_if_t *); 181 static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); 182 static void ill_down(ill_t *ill); 183 static void ill_downi(ire_t *ire, char *ill_arg); 184 static void ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg); 185 static void ill_down_tail(ill_t *ill); 186 static void ill_free_mib(ill_t *ill); 187 static void ill_glist_delete(ill_t *); 188 static boolean_t ill_has_usable_ipif(ill_t *); 189 static int ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int); 190 static void ill_nominate_bcast_rcv(ill_group_t *illgrp); 191 static void 
ill_phyint_free(ill_t *ill); 192 static void ill_phyint_reinit(ill_t *ill); 193 static void ill_set_nce_router_flags(ill_t *, boolean_t); 194 static void ill_signal_ipsq_ills(ipsq_t *, boolean_t); 195 static boolean_t ill_split_ipsq(ipsq_t *cur_sq); 196 static void ill_stq_cache_delete(ire_t *, char *); 197 198 static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *); 199 static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *); 200 static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 201 in6_addr_t *); 202 static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 203 ipaddr_t *); 204 static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *); 205 static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 206 in6_addr_t *); 207 static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *, 208 ipaddr_t *); 209 210 static void ipif_save_ire(ipif_t *, ire_t *); 211 static void ipif_remove_ire(ipif_t *, ire_t *); 212 static void ip_cgtp_bcast_add(ire_t *, ire_t *); 213 static void ip_cgtp_bcast_delete(ire_t *); 214 215 /* 216 * Per-ill IPsec capabilities management. 
217 */ 218 static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); 219 static void ill_ipsec_capab_free(ill_ipsec_capab_t *); 220 static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); 221 static void ill_ipsec_capab_delete(ill_t *, uint_t); 222 static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); 223 static void ill_capability_proto(ill_t *, int, mblk_t *); 224 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, 225 boolean_t); 226 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 227 static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 228 static void ill_capability_mdt_reset(ill_t *, mblk_t **); 229 static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 230 static void ill_capability_ipsec_reset(ill_t *, mblk_t **); 231 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 232 static void ill_capability_hcksum_reset(ill_t *, mblk_t **); 233 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, 234 dl_capability_sub_t *); 235 static void ill_capability_zerocopy_reset(ill_t *, mblk_t **); 236 237 static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 238 static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *); 239 static void ill_capability_dls_reset(ill_t *, mblk_t **); 240 static void ill_capability_dls_disable(ill_t *); 241 242 static void illgrp_cache_delete(ire_t *, char *); 243 static void illgrp_delete(ill_t *ill); 244 static void illgrp_reset_schednext(ill_t *ill); 245 246 static ill_t *ill_prev_usesrc(ill_t *); 247 static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); 248 static void ill_disband_usesrc_group(ill_t *); 249 250 static void conn_cleanup_stale_ire(conn_t *, caddr_t); 251 252 /* 253 * if we go over the memory footprint limit more than once in this msec 254 * interval, we'll start pruning aggressively. 
255 */ 256 int ip_min_frag_prune_time = 0; 257 258 /* 259 * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY 260 * and the IPsec DOI 261 */ 262 #define MAX_IPSEC_ALGS 256 263 264 #define BITSPERBYTE 8 265 #define BITS(type) (BITSPERBYTE * (long)sizeof (type)) 266 267 #define IPSEC_ALG_ENABLE(algs, algid) \ 268 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ 269 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 270 271 #define IPSEC_ALG_IS_ENABLED(algid, algs) \ 272 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ 273 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 274 275 typedef uint8_t ipsec_capab_elem_t; 276 277 /* 278 * Per-algorithm parameters. Note that at present, only encryption 279 * algorithms have variable keysize (IKE does not provide a way to negotiate 280 * auth algorithm keysize). 281 * 282 * All sizes here are in bits. 283 */ 284 typedef struct 285 { 286 uint16_t minkeylen; 287 uint16_t maxkeylen; 288 } ipsec_capab_algparm_t; 289 290 /* 291 * Per-ill capabilities. 292 */ 293 struct ill_ipsec_capab_s { 294 ipsec_capab_elem_t *encr_hw_algs; 295 ipsec_capab_elem_t *auth_hw_algs; 296 uint32_t algs_size; /* size of _hw_algs in bytes */ 297 /* algorithm key lengths */ 298 ipsec_capab_algparm_t *encr_algparm; 299 uint32_t encr_algparm_size; 300 uint32_t encr_algparm_end; 301 }; 302 303 /* 304 * List of AH and ESP IPsec acceleration capable ills 305 */ 306 typedef struct ipsec_capab_ill_s { 307 uint_t ill_index; 308 boolean_t ill_isv6; 309 struct ipsec_capab_ill_s *next; 310 } ipsec_capab_ill_t; 311 312 static ipsec_capab_ill_t *ipsec_capab_ills_ah; 313 static ipsec_capab_ill_t *ipsec_capab_ills_esp; 314 krwlock_t ipsec_capab_ills_lock; 315 316 /* 317 * The field values are larger than strictly necessary for simple 318 * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls. 
319 */ 320 static area_t ip_area_template = { 321 AR_ENTRY_ADD, /* area_cmd */ 322 sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), 323 /* area_name_offset */ 324 /* area_name_length temporarily holds this structure length */ 325 sizeof (area_t), /* area_name_length */ 326 IP_ARP_PROTO_TYPE, /* area_proto */ 327 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 328 IP_ADDR_LEN, /* area_proto_addr_length */ 329 sizeof (ip_sock_ar_t) + IP_ADDR_LEN, 330 /* area_proto_mask_offset */ 331 0, /* area_flags */ 332 sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, 333 /* area_hw_addr_offset */ 334 /* Zero length hw_addr_length means 'use your idea of the address' */ 335 0 /* area_hw_addr_length */ 336 }; 337 338 /* 339 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver 340 * support 341 */ 342 static area_t ip6_area_template = { 343 AR_ENTRY_ADD, /* area_cmd */ 344 sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), 345 /* area_name_offset */ 346 /* area_name_length temporarily holds this structure length */ 347 sizeof (area_t), /* area_name_length */ 348 IP_ARP_PROTO_TYPE, /* area_proto */ 349 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 350 IPV6_ADDR_LEN, /* area_proto_addr_length */ 351 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, 352 /* area_proto_mask_offset */ 353 0, /* area_flags */ 354 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, 355 /* area_hw_addr_offset */ 356 /* Zero length hw_addr_length means 'use your idea of the address' */ 357 0 /* area_hw_addr_length */ 358 }; 359 360 static ared_t ip_ared_template = { 361 AR_ENTRY_DELETE, 362 sizeof (ared_t) + IP_ADDR_LEN, 363 sizeof (ared_t), 364 IP_ARP_PROTO_TYPE, 365 sizeof (ared_t), 366 IP_ADDR_LEN 367 }; 368 369 static ared_t ip6_ared_template = { 370 AR_ENTRY_DELETE, 371 sizeof (ared_t) + IPV6_ADDR_LEN, 372 sizeof (ared_t), 373 IP_ARP_PROTO_TYPE, 374 sizeof (ared_t), 375 IPV6_ADDR_LEN 376 }; 377 378 /* 379 * A template for an IPv6 AR_ENTRY_QUERY 
template has not been created, as 380 * as the areq doesn't include an IP address in ill_dl_up() (the only place a 381 * areq is used). 382 */ 383 static areq_t ip_areq_template = { 384 AR_ENTRY_QUERY, /* cmd */ 385 sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ 386 sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ 387 IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ 388 sizeof (areq_t), /* target addr offset */ 389 IP_ADDR_LEN, /* target addr_length */ 390 0, /* flags */ 391 sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ 392 IP_ADDR_LEN, /* sender addr length */ 393 6, /* xmit_count */ 394 1000, /* (re)xmit_interval in milliseconds */ 395 4 /* max # of requests to buffer */ 396 /* anything else filled in by the code */ 397 }; 398 399 static arc_t ip_aru_template = { 400 AR_INTERFACE_UP, 401 sizeof (arc_t), /* Name offset */ 402 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 403 }; 404 405 static arc_t ip_ard_template = { 406 AR_INTERFACE_DOWN, 407 sizeof (arc_t), /* Name offset */ 408 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 409 }; 410 411 static arc_t ip_aron_template = { 412 AR_INTERFACE_ON, 413 sizeof (arc_t), /* Name offset */ 414 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 415 }; 416 417 static arc_t ip_aroff_template = { 418 AR_INTERFACE_OFF, 419 sizeof (arc_t), /* Name offset */ 420 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 421 }; 422 423 424 static arma_t ip_arma_multi_template = { 425 AR_MAPPING_ADD, 426 sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, 427 /* Name offset */ 428 sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ 429 IP_ARP_PROTO_TYPE, 430 sizeof (arma_t), /* proto_addr_offset */ 431 IP_ADDR_LEN, /* proto_addr_length */ 432 sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ 433 sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ 434 ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */ 435 sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ 436 
IP_MAX_HW_LEN, /* hw_addr_length */ 437 0, /* hw_mapping_start */ 438 }; 439 440 static ipft_t ip_ioctl_ftbl[] = { 441 { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, 442 { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), 443 IPFT_F_NO_REPLY }, 444 { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), 445 IPFT_F_NO_REPLY }, 446 { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, 447 { 0 } 448 }; 449 450 /* Simple ICMP IP Header Template */ 451 static ipha_t icmp_ipha = { 452 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 453 }; 454 455 /* Flag descriptors for ip_ipif_report */ 456 static nv_t ipif_nv_tbl[] = { 457 { IPIF_UP, "UP" }, 458 { IPIF_BROADCAST, "BROADCAST" }, 459 { ILLF_DEBUG, "DEBUG" }, 460 { PHYI_LOOPBACK, "LOOPBACK" }, 461 { IPIF_POINTOPOINT, "POINTOPOINT" }, 462 { ILLF_NOTRAILERS, "NOTRAILERS" }, 463 { PHYI_RUNNING, "RUNNING" }, 464 { ILLF_NOARP, "NOARP" }, 465 { PHYI_PROMISC, "PROMISC" }, 466 { PHYI_ALLMULTI, "ALLMULTI" }, 467 { PHYI_INTELLIGENT, "INTELLIGENT" }, 468 { ILLF_MULTICAST, "MULTICAST" }, 469 { PHYI_MULTI_BCAST, "MULTI_BCAST" }, 470 { IPIF_UNNUMBERED, "UNNUMBERED" }, 471 { IPIF_DHCPRUNNING, "DHCP" }, 472 { IPIF_PRIVATE, "PRIVATE" }, 473 { IPIF_NOXMIT, "NOXMIT" }, 474 { IPIF_NOLOCAL, "NOLOCAL" }, 475 { IPIF_DEPRECATED, "DEPRECATED" }, 476 { IPIF_PREFERRED, "PREFERRED" }, 477 { IPIF_TEMPORARY, "TEMPORARY" }, 478 { IPIF_ADDRCONF, "ADDRCONF" }, 479 { PHYI_VIRTUAL, "VIRTUAL" }, 480 { ILLF_ROUTER, "ROUTER" }, 481 { ILLF_NONUD, "NONUD" }, 482 { IPIF_ANYCAST, "ANYCAST" }, 483 { ILLF_NORTEXCH, "NORTEXCH" }, 484 { ILLF_IPV4, "IPV4" }, 485 { ILLF_IPV6, "IPV6" }, 486 { IPIF_MIPRUNNING, "MIP" }, 487 { IPIF_NOFAILOVER, "NOFAILOVER" }, 488 { PHYI_FAILED, "FAILED" }, 489 { PHYI_STANDBY, "STANDBY" }, 490 { PHYI_INACTIVE, "INACTIVE" }, 491 { PHYI_OFFLINE, "OFFLINE" }, 492 }; 493 494 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 495 496 static ip_m_t ip_m_tbl[] = { 497 { DL_ETHER, 
IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 498 ip_ether_v6intfid }, 499 { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 500 ip_nodef_v6intfid }, 501 { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 502 ip_nodef_v6intfid }, 503 { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 504 ip_nodef_v6intfid }, 505 { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 506 ip_ether_v6intfid }, 507 { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, 508 ip_ib_v6intfid }, 509 { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL}, 510 { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 511 ip_nodef_v6intfid } 512 }; 513 514 static ill_t ill_null; /* Empty ILL for init. */ 515 char ipif_loopback_name[] = "lo0"; 516 static char *ipv4_forward_suffix = ":ip_forwarding"; 517 static char *ipv6_forward_suffix = ":ip6_forwarding"; 518 static kstat_t *loopback_ksp = NULL; 519 static sin6_t sin6_null; /* Zero address for quick clears */ 520 static sin_t sin_null; /* Zero address for quick clears */ 521 static uint_t ill_index = 1; /* Used to assign interface indicies */ 522 /* When set search for unused index */ 523 static boolean_t ill_index_wrap = B_FALSE; 524 /* When set search for unused ipif_seqid */ 525 static ipif_t ipif_zero; 526 uint_t ipif_src_random; 527 528 /* 529 * For details on the protection offered by these locks please refer 530 * to the notes under the Synchronization section at the start of ip.c 531 */ 532 krwlock_t ill_g_lock; /* The global ill_g_lock */ 533 kmutex_t ip_addr_avail_lock; /* Address availability check lock */ 534 ipsq_t *ipsq_g_head; /* List of all ipsq's on the system */ 535 536 krwlock_t ill_g_usesrc_lock; /* Protects usesrc related fields */ 537 538 /* 539 * illgrp_head/ifgrp_head is protected by IP's perimeter. 
540 */ 541 static ill_group_t *illgrp_head_v4; /* Head of IPv4 ill groups */ 542 ill_group_t *illgrp_head_v6; /* Head of IPv6 ill groups */ 543 544 ill_g_head_t ill_g_heads[MAX_G_HEADS]; /* ILL List Head */ 545 546 /* 547 * ppa arena is created after these many 548 * interfaces have been plumbed. 549 */ 550 uint_t ill_no_arena = 12; 551 552 #pragma align CACHE_ALIGN_SIZE(phyint_g_list) 553 static phyint_list_t phyint_g_list; /* start of phyint list */ 554 555 /* 556 * Reflects value of FAILBACK variable in IPMP config file 557 * /etc/default/mpathd. Default value is B_TRUE. 558 * Set to B_FALSE if user disabled failback by configuring "FAILBACK=no" 559 * in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this information to kernel. 560 */ 561 static boolean_t ipmp_enable_failback = B_TRUE; 562 563 /* 564 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout 565 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is 566 * set through platform specific code (Niagara/Ontario). 567 */ 568 #define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ 569 (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) 570 571 #define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) 572 573 static uint_t 574 ipif_rand(void) 575 { 576 ipif_src_random = ipif_src_random * 1103515245 + 12345; 577 return ((ipif_src_random >> 16) & 0x7fff); 578 } 579 580 /* 581 * Allocate per-interface mibs. Only used for ipv6. 582 * Returns true if ok. False otherwise. 583 * ipsq may not yet be allocated (loopback case ). 584 */ 585 static boolean_t 586 ill_allocate_mibs(ill_t *ill) 587 { 588 ASSERT(ill->ill_isv6); 589 590 /* Already allocated? 
*/ 591 if (ill->ill_ip6_mib != NULL) { 592 ASSERT(ill->ill_icmp6_mib != NULL); 593 return (B_TRUE); 594 } 595 596 ill->ill_ip6_mib = kmem_zalloc(sizeof (*ill->ill_ip6_mib), 597 KM_NOSLEEP); 598 if (ill->ill_ip6_mib == NULL) { 599 return (B_FALSE); 600 } 601 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 602 KM_NOSLEEP); 603 if (ill->ill_icmp6_mib == NULL) { 604 kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib)); 605 ill->ill_ip6_mib = NULL; 606 return (B_FALSE); 607 } 608 /* 609 * The ipv6Ifindex and ipv6IfIcmpIndex will be assigned later 610 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 611 * -> ill_phyint_reinit 612 */ 613 return (B_TRUE); 614 } 615 616 /* 617 * Common code for preparation of ARP commands. Two points to remember: 618 * 1) The ill_name is tacked on at the end of the allocated space so 619 * the templates name_offset field must contain the total space 620 * to allocate less the name length. 621 * 622 * 2) The templates name_length field should contain the *template* 623 * length. We use it as a parameter to bcopy() and then write 624 * the real ill_name_length into the name_length field of the copy. 625 * (Always called as writer.) 
626 */ 627 mblk_t * 628 ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) 629 { 630 arc_t *arc = (arc_t *)template; 631 char *cp; 632 int len; 633 mblk_t *mp; 634 uint_t name_length = ill->ill_name_length; 635 uint_t template_len = arc->arc_name_length; 636 637 len = arc->arc_name_offset + name_length; 638 mp = allocb(len, BPRI_HI); 639 if (mp == NULL) 640 return (NULL); 641 cp = (char *)mp->b_rptr; 642 mp->b_wptr = (uchar_t *)&cp[len]; 643 if (template_len) 644 bcopy(template, cp, template_len); 645 if (len > template_len) 646 bzero(&cp[template_len], len - template_len); 647 mp->b_datap->db_type = M_PROTO; 648 649 arc = (arc_t *)cp; 650 arc->arc_name_length = name_length; 651 cp = (char *)arc + arc->arc_name_offset; 652 bcopy(ill->ill_name, cp, name_length); 653 654 if (addr) { 655 area_t *area = (area_t *)mp->b_rptr; 656 657 cp = (char *)area + area->area_proto_addr_offset; 658 bcopy(addr, cp, area->area_proto_addr_length); 659 if (area->area_cmd == AR_ENTRY_ADD) { 660 cp = (char *)area; 661 len = area->area_proto_addr_length; 662 if (area->area_proto_mask_offset) 663 cp += area->area_proto_mask_offset; 664 else 665 cp += area->area_proto_addr_offset + len; 666 while (len-- > 0) 667 *cp++ = (char)~0; 668 } 669 } 670 return (mp); 671 } 672 673 /* 674 * Completely vaporize a lower level tap and all associated interfaces. 675 * ill_delete is called only out of ip_close when the device control 676 * stream is being closed. 677 */ 678 void 679 ill_delete(ill_t *ill) 680 { 681 ipif_t *ipif; 682 ill_t *prev_ill; 683 684 /* 685 * ill_delete may be forcibly entering the ipsq. The previous 686 * ioctl may not have completed and may need to be aborted. 687 * ipsq_flush takes care of it. If we don't need to enter the 688 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 689 * ill_delete_tail is sufficient. 690 */ 691 ipsq_flush(ill); 692 693 /* 694 * Nuke all interfaces. 
ipif_free will take down the interface, 695 * remove it from the list, and free the data structure. 696 * Walk down the ipif list and remove the logical interfaces 697 * first before removing the main ipif. We can't unplumb 698 * zeroth interface first in the case of IPv6 as reset_conn_ill 699 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking 700 * POINTOPOINT. 701 * 702 * If ill_ipif was not properly initialized (i.e low on memory), 703 * then no interfaces to clean up. In this case just clean up the 704 * ill. 705 */ 706 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 707 ipif_free(ipif); 708 709 /* 710 * Used only by ill_arp_on and ill_arp_off, which are writers. 711 * So nobody can be using this mp now. Free the mp allocated for 712 * honoring ILLF_NOARP 713 */ 714 freemsg(ill->ill_arp_on_mp); 715 ill->ill_arp_on_mp = NULL; 716 717 /* Clean up msgs on pending upcalls for mrouted */ 718 reset_mrt_ill(ill); 719 720 /* 721 * ipif_free -> reset_conn_ipif will remove all multicast 722 * references for IPv4. For IPv6, we need to do it here as 723 * it points only at ills. 724 */ 725 reset_conn_ill(ill); 726 727 /* 728 * ill_down will arrange to blow off any IRE's dependent on this 729 * ILL, and shut down fragmentation reassembly. 730 */ 731 ill_down(ill); 732 733 /* Let SCTP know, so that it can remove this from its list. */ 734 sctp_update_ill(ill, SCTP_ILL_REMOVE); 735 736 /* 737 * If an address on this ILL is being used as a source address then 738 * clear out the pointers in other ILLs that point to this ILL. 739 */ 740 rw_enter(&ill_g_usesrc_lock, RW_WRITER); 741 if (ill->ill_usesrc_grp_next != NULL) { 742 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? 
*/ 743 ill_disband_usesrc_group(ill); 744 } else { /* consumer of the usesrc ILL */ 745 prev_ill = ill_prev_usesrc(ill); 746 prev_ill->ill_usesrc_grp_next = 747 ill->ill_usesrc_grp_next; 748 } 749 } 750 rw_exit(&ill_g_usesrc_lock); 751 } 752 753 /* 754 * ill_delete_tail is called from ip_modclose after all references 755 * to the closing ill are gone. The wait is done in ip_modclose 756 */ 757 void 758 ill_delete_tail(ill_t *ill) 759 { 760 mblk_t **mpp; 761 ipif_t *ipif; 762 763 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 764 ipif_down_tail(ipif); 765 766 /* 767 * Send the detach if there's one to send (i.e., if we're above a 768 * style 2 DLPI driver). 769 */ 770 if (ill->ill_detach_mp != NULL) { 771 ill_dlpi_send(ill, ill->ill_detach_mp); 772 ill->ill_detach_mp = NULL; 773 } 774 775 /* 776 * If polling capability is enabled (which signifies direct 777 * upcall into IP and driver has ill saved as a handle), 778 * we need to make sure that unbind has completed before we 779 * let the ill disappear and driver no longer has any reference 780 * to this ill. 781 */ 782 mutex_enter(&ill->ill_lock); 783 if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) { 784 while (!(ill->ill_state_flags & ILL_DL_UNBIND_DONE)) 785 cv_wait(&ill->ill_cv, &ill->ill_lock); 786 } 787 mutex_exit(&ill->ill_lock); 788 789 if (ill->ill_net_type != IRE_LOOPBACK) 790 qprocsoff(ill->ill_rq); 791 792 /* 793 * We do an ipsq_flush once again now. New messages could have 794 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls 795 * could also have landed up if an ioctl thread had looked up 796 * the ill before we set the ILL_CONDEMNED flag, but not yet 797 * enqueued the ioctl when we did the ipsq_flush last time. 798 */ 799 ipsq_flush(ill); 800 801 /* 802 * Free capabilities. 
803 */ 804 if (ill->ill_ipsec_capab_ah != NULL) { 805 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); 806 ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); 807 ill->ill_ipsec_capab_ah = NULL; 808 } 809 810 if (ill->ill_ipsec_capab_esp != NULL) { 811 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); 812 ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); 813 ill->ill_ipsec_capab_esp = NULL; 814 } 815 816 if (ill->ill_mdt_capab != NULL) { 817 kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); 818 ill->ill_mdt_capab = NULL; 819 } 820 821 if (ill->ill_hcksum_capab != NULL) { 822 kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 823 ill->ill_hcksum_capab = NULL; 824 } 825 826 if (ill->ill_zerocopy_capab != NULL) { 827 kmem_free(ill->ill_zerocopy_capab, 828 sizeof (ill_zerocopy_capab_t)); 829 ill->ill_zerocopy_capab = NULL; 830 } 831 832 /* 833 * Clean up polling and soft ring capabilities 834 */ 835 if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) 836 ill_capability_dls_disable(ill); 837 838 if (ill->ill_dls_capab != NULL) { 839 CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn); 840 ill->ill_dls_capab->ill_unbind_conn = NULL; 841 kmem_free(ill->ill_dls_capab, 842 sizeof (ill_dls_capab_t) + 843 (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS)); 844 ill->ill_dls_capab = NULL; 845 } 846 847 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); 848 849 while (ill->ill_ipif != NULL) 850 ipif_free_tail(ill->ill_ipif); 851 852 ill_down_tail(ill); 853 854 /* 855 * We have removed all references to ilm from conn and the ones joined 856 * within the kernel. 857 * 858 * We don't walk conns, mrts and ires because 859 * 860 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts. 861 * 2) ill_down ->ill_downi walks all the ires and cleans up 862 * ill references. 863 */ 864 ASSERT(ilm_walk_ill(ill) == 0); 865 /* 866 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free 867 * could free the phyint. 
No more reference to the phyint after this 868 * point. 869 */ 870 (void) ill_glist_delete(ill); 871 872 rw_enter(&ip_g_nd_lock, RW_WRITER); 873 if (ill->ill_ndd_name != NULL) 874 nd_unload(&ip_g_nd, ill->ill_ndd_name); 875 rw_exit(&ip_g_nd_lock); 876 877 878 if (ill->ill_frag_ptr != NULL) { 879 uint_t count; 880 881 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 882 mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 883 } 884 mi_free(ill->ill_frag_ptr); 885 ill->ill_frag_ptr = NULL; 886 ill->ill_frag_hash_tbl = NULL; 887 } 888 if (ill->ill_nd_lla_mp != NULL) 889 freemsg(ill->ill_nd_lla_mp); 890 /* Free all retained control messages. */ 891 mpp = &ill->ill_first_mp_to_free; 892 do { 893 while (mpp[0]) { 894 mblk_t *mp; 895 mblk_t *mp1; 896 897 mp = mpp[0]; 898 mpp[0] = mp->b_next; 899 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 900 mp1->b_next = NULL; 901 mp1->b_prev = NULL; 902 } 903 freemsg(mp); 904 } 905 } while (mpp++ != &ill->ill_last_mp_to_free); 906 907 ill_free_mib(ill); 908 ILL_TRACE_CLEANUP(ill); 909 } 910 911 static void 912 ill_free_mib(ill_t *ill) 913 { 914 if (ill->ill_ip6_mib != NULL) { 915 kmem_free(ill->ill_ip6_mib, sizeof (*ill->ill_ip6_mib)); 916 ill->ill_ip6_mib = NULL; 917 } 918 if (ill->ill_icmp6_mib != NULL) { 919 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 920 ill->ill_icmp6_mib = NULL; 921 } 922 } 923 924 /* 925 * Concatenate together a physical address and a sap. 
926 * 927 * Sap_lengths are interpreted as follows: 928 * sap_length == 0 ==> no sap 929 * sap_length > 0 ==> sap is at the head of the dlpi address 930 * sap_length < 0 ==> sap is at the tail of the dlpi address 931 */ 932 static void 933 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 934 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 935 { 936 uint16_t sap_addr = (uint16_t)sap_src; 937 938 if (sap_length == 0) { 939 if (phys_src == NULL) 940 bzero(dst, phys_length); 941 else 942 bcopy(phys_src, dst, phys_length); 943 } else if (sap_length < 0) { 944 if (phys_src == NULL) 945 bzero(dst, phys_length); 946 else 947 bcopy(phys_src, dst, phys_length); 948 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 949 } else { 950 bcopy(&sap_addr, dst, sizeof (sap_addr)); 951 if (phys_src == NULL) 952 bzero((char *)dst + sap_length, phys_length); 953 else 954 bcopy(phys_src, (char *)dst + sap_length, phys_length); 955 } 956 } 957 958 /* 959 * Generate a dl_unitdata_req mblk for the device and address given. 960 * addr_length is the length of the physical portion of the address. 961 * If addr is NULL include an all zero address of the specified length. 962 * TRUE? In any case, addr_length is taken to be the entire length of the 963 * dlpi address, including the absolute value of sap_length. 
964 */ 965 mblk_t * 966 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 967 t_scalar_t sap_length) 968 { 969 dl_unitdata_req_t *dlur; 970 mblk_t *mp; 971 t_scalar_t abs_sap_length; /* absolute value */ 972 973 abs_sap_length = ABS(sap_length); 974 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 975 DL_UNITDATA_REQ); 976 if (mp == NULL) 977 return (NULL); 978 dlur = (dl_unitdata_req_t *)mp->b_rptr; 979 /* HACK: accomodate incompatible DLPI drivers */ 980 if (addr_length == 8) 981 addr_length = 6; 982 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 983 dlur->dl_dest_addr_offset = sizeof (*dlur); 984 dlur->dl_priority.dl_min = 0; 985 dlur->dl_priority.dl_max = 0; 986 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 987 (uchar_t *)&dlur[1]); 988 return (mp); 989 } 990 991 /* 992 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp 993 * Return an error if we already have 1 or more ioctls in progress. 994 * This is used only for non-exclusive ioctls. Currently this is used 995 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive 996 * and thus need to use ipsq_pending_mp_add. 997 */ 998 boolean_t 999 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) 1000 { 1001 ASSERT(MUTEX_HELD(&ill->ill_lock)); 1002 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1003 /* 1004 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls. 1005 */ 1006 ASSERT((add_mp->b_datap->db_type == M_IOCDATA) || 1007 (add_mp->b_datap->db_type == M_IOCTL)); 1008 1009 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1010 /* 1011 * Return error if the conn has started closing. The conn 1012 * could have finished cleaning up the pending mp list, 1013 * If so we should not add another mp to the list negating 1014 * the cleanup. 1015 */ 1016 if (connp->conn_state_flags & CONN_CLOSING) 1017 return (B_FALSE); 1018 /* 1019 * Add the pending mp to the head of the list, chained by b_next. 
1020 * Note down the conn on which the ioctl request came, in b_prev. 1021 * This will be used to later get the conn, when we get a response 1022 * on the ill queue, from some other module (typically arp) 1023 */ 1024 add_mp->b_next = (void *)ill->ill_pending_mp; 1025 add_mp->b_queue = CONNP_TO_WQ(connp); 1026 ill->ill_pending_mp = add_mp; 1027 if (connp != NULL) 1028 connp->conn_oper_pending_ill = ill; 1029 return (B_TRUE); 1030 } 1031 1032 /* 1033 * Retrieve the ill_pending_mp and return it. We have to walk the list 1034 * of mblks starting at ill_pending_mp, and match based on the ioc_id. 1035 */ 1036 mblk_t * 1037 ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) 1038 { 1039 mblk_t *prev = NULL; 1040 mblk_t *curr = NULL; 1041 uint_t id; 1042 conn_t *connp; 1043 1044 /* 1045 * When the conn closes, conn_ioctl_cleanup needs to clean 1046 * up the pending mp, but it does not know the ioc_id and 1047 * passes in a zero for it. 1048 */ 1049 mutex_enter(&ill->ill_lock); 1050 if (ioc_id != 0) 1051 *connpp = NULL; 1052 1053 /* Search the list for the appropriate ioctl based on ioc_id */ 1054 for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; 1055 prev = curr, curr = curr->b_next) { 1056 id = ((struct iocblk *)curr->b_rptr)->ioc_id; 1057 connp = Q_TO_CONN(curr->b_queue); 1058 /* Match based on the ioc_id or based on the conn */ 1059 if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) 1060 break; 1061 } 1062 1063 if (curr != NULL) { 1064 /* Unlink the mblk from the pending mp list */ 1065 if (prev != NULL) { 1066 prev->b_next = curr->b_next; 1067 } else { 1068 ASSERT(ill->ill_pending_mp == curr); 1069 ill->ill_pending_mp = curr->b_next; 1070 } 1071 1072 /* 1073 * conn refcnt must have been bumped up at the start of 1074 * the ioctl. So we can safely access the conn. 
1075 */ 1076 ASSERT(CONN_Q(curr->b_queue)); 1077 *connpp = Q_TO_CONN(curr->b_queue); 1078 curr->b_next = NULL; 1079 curr->b_queue = NULL; 1080 } 1081 1082 mutex_exit(&ill->ill_lock); 1083 1084 return (curr); 1085 } 1086 1087 /* 1088 * Add the pending mp to the list. There can be only 1 pending mp 1089 * in the list. Any exclusive ioctl that needs to wait for a response 1090 * from another module or driver needs to use this function to set 1091 * the ipsq_pending_mp to the ioctl mblk and wait for the response from 1092 * the other module/driver. This is also used while waiting for the 1093 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 1094 */ 1095 boolean_t 1096 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 1097 int waitfor) 1098 { 1099 ipsq_t *ipsq; 1100 1101 ASSERT(IAM_WRITER_IPIF(ipif)); 1102 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 1103 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1104 /* 1105 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, 1106 * M_ERROR/M_HANGUP from driver 1107 */ 1108 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) || 1109 (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP)); 1110 1111 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 1112 if (connp != NULL) { 1113 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1114 /* 1115 * Return error if the conn has started closing. The conn 1116 * could have finished cleaning up the pending mp list, 1117 * If so we should not add another mp to the list negating 1118 * the cleanup. 1119 */ 1120 if (connp->conn_state_flags & CONN_CLOSING) 1121 return (B_FALSE); 1122 } 1123 mutex_enter(&ipsq->ipsq_lock); 1124 ipsq->ipsq_pending_ipif = ipif; 1125 /* 1126 * Note down the queue in b_queue. This will be returned by 1127 * ipsq_pending_mp_get. 
Caller will then use these values to restart 1128 * the processing 1129 */ 1130 add_mp->b_next = NULL; 1131 add_mp->b_queue = q; 1132 ipsq->ipsq_pending_mp = add_mp; 1133 ipsq->ipsq_waitfor = waitfor; 1134 /* 1135 * ipsq_current_ipif is needed to restart the operation from 1136 * ipif_ill_refrele_tail when the last reference to the ipi/ill 1137 * is gone. Since this is not an ioctl ipsq_current_ipif has not 1138 * been set until now. 1139 */ 1140 if (DB_TYPE(add_mp) == M_ERROR || DB_TYPE(add_mp) == M_HANGUP) { 1141 ASSERT(ipsq->ipsq_current_ipif == NULL); 1142 ipsq->ipsq_current_ipif = ipif; 1143 ipsq->ipsq_last_cmd = DB_TYPE(add_mp); 1144 } 1145 if (connp != NULL) 1146 connp->conn_oper_pending_ill = ipif->ipif_ill; 1147 mutex_exit(&ipsq->ipsq_lock); 1148 return (B_TRUE); 1149 } 1150 1151 /* 1152 * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp 1153 * queued in the list. 1154 */ 1155 mblk_t * 1156 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 1157 { 1158 mblk_t *curr = NULL; 1159 1160 mutex_enter(&ipsq->ipsq_lock); 1161 *connpp = NULL; 1162 if (ipsq->ipsq_pending_mp == NULL) { 1163 mutex_exit(&ipsq->ipsq_lock); 1164 return (NULL); 1165 } 1166 1167 /* There can be only 1 such excl message */ 1168 curr = ipsq->ipsq_pending_mp; 1169 ASSERT(curr != NULL && curr->b_next == NULL); 1170 ipsq->ipsq_pending_ipif = NULL; 1171 ipsq->ipsq_pending_mp = NULL; 1172 ipsq->ipsq_waitfor = 0; 1173 mutex_exit(&ipsq->ipsq_lock); 1174 1175 if (CONN_Q(curr->b_queue)) { 1176 /* 1177 * This mp did a refhold on the conn, at the start of the ioctl. 1178 * So we can safely return a pointer to the conn to the caller. 1179 */ 1180 *connpp = Q_TO_CONN(curr->b_queue); 1181 } else { 1182 *connpp = NULL; 1183 } 1184 curr->b_next = NULL; 1185 curr->b_prev = NULL; 1186 return (curr); 1187 } 1188 1189 /* 1190 * Cleanup the ioctl mp queued in ipsq_pending_mp 1191 * - Called in the ill_delete path 1192 * - Called in the M_ERROR or M_HANGUP path on the ill. 
1193 * - Called in the conn close path. 1194 */ 1195 boolean_t 1196 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 1197 { 1198 mblk_t *mp; 1199 ipsq_t *ipsq; 1200 queue_t *q; 1201 ipif_t *ipif; 1202 1203 ASSERT(IAM_WRITER_ILL(ill)); 1204 ipsq = ill->ill_phyint->phyint_ipsq; 1205 mutex_enter(&ipsq->ipsq_lock); 1206 /* 1207 * If connp is null, unconditionally clean up the ipsq_pending_mp. 1208 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 1209 * even if it is meant for another ill, since we have to enqueue 1210 * a new mp now in ipsq_pending_mp to complete the ipif_down. 1211 * If connp is non-null we are called from the conn close path. 1212 */ 1213 mp = ipsq->ipsq_pending_mp; 1214 if (mp == NULL || (connp != NULL && 1215 mp->b_queue != CONNP_TO_WQ(connp))) { 1216 mutex_exit(&ipsq->ipsq_lock); 1217 return (B_FALSE); 1218 } 1219 /* Now remove from the ipsq_pending_mp */ 1220 ipsq->ipsq_pending_mp = NULL; 1221 q = mp->b_queue; 1222 mp->b_next = NULL; 1223 mp->b_prev = NULL; 1224 mp->b_queue = NULL; 1225 1226 /* If MOVE was in progress, clear the move_in_progress fields also. */ 1227 ill = ipsq->ipsq_pending_ipif->ipif_ill; 1228 if (ill->ill_move_in_progress) { 1229 ILL_CLEAR_MOVE(ill); 1230 } else if (ill->ill_up_ipifs) { 1231 ill_group_cleanup(ill); 1232 } 1233 1234 ipif = ipsq->ipsq_pending_ipif; 1235 ipsq->ipsq_pending_ipif = NULL; 1236 ipsq->ipsq_waitfor = 0; 1237 ipsq->ipsq_current_ipif = NULL; 1238 mutex_exit(&ipsq->ipsq_lock); 1239 1240 if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 1241 ip_ioctl_finish(q, mp, ENXIO, connp != NULL ? CONN_CLOSE : 1242 NO_COPYOUT, connp != NULL ? ipif : NULL, NULL); 1243 } else { 1244 /* 1245 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 1246 * be just inet_freemsg. we have to restart it 1247 * otherwise the thread will be stuck. 1248 */ 1249 inet_freemsg(mp); 1250 } 1251 return (B_TRUE); 1252 } 1253 1254 /* 1255 * The ill is closing. Cleanup all the pending mps. 
Called exclusively 1256 * towards the end of ill_delete. The refcount has gone to 0. So nobody 1257 * knows this ill, and hence nobody can add an mp to this list 1258 */ 1259 static void 1260 ill_pending_mp_cleanup(ill_t *ill) 1261 { 1262 mblk_t *mp; 1263 queue_t *q; 1264 1265 ASSERT(IAM_WRITER_ILL(ill)); 1266 1267 mutex_enter(&ill->ill_lock); 1268 /* 1269 * Every mp on the pending mp list originating from an ioctl 1270 * added 1 to the conn refcnt, at the start of the ioctl. 1271 * So bump it down now. See comments in ip_wput_nondata() 1272 */ 1273 while (ill->ill_pending_mp != NULL) { 1274 mp = ill->ill_pending_mp; 1275 ill->ill_pending_mp = mp->b_next; 1276 mutex_exit(&ill->ill_lock); 1277 1278 q = mp->b_queue; 1279 ASSERT(CONN_Q(q)); 1280 mp->b_next = NULL; 1281 mp->b_prev = NULL; 1282 mp->b_queue = NULL; 1283 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL, NULL); 1284 mutex_enter(&ill->ill_lock); 1285 } 1286 ill->ill_pending_ipif = NULL; 1287 1288 mutex_exit(&ill->ill_lock); 1289 } 1290 1291 /* 1292 * Called in the conn close path and ill delete path 1293 */ 1294 static void 1295 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 1296 { 1297 ipsq_t *ipsq; 1298 mblk_t *prev; 1299 mblk_t *curr; 1300 mblk_t *next; 1301 queue_t *q; 1302 mblk_t *tmp_list = NULL; 1303 1304 ASSERT(IAM_WRITER_ILL(ill)); 1305 if (connp != NULL) 1306 q = CONNP_TO_WQ(connp); 1307 else 1308 q = ill->ill_wq; 1309 1310 ipsq = ill->ill_phyint->phyint_ipsq; 1311 /* 1312 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 1313 * In the case of ioctl from a conn, there can be only 1 mp 1314 * queued on the ipsq. If an ill is being unplumbed, only messages 1315 * related to this ill are flushed, like M_ERROR or M_HANGUP message. 1316 * ioctls meant for this ill form conn's are not flushed. They will 1317 * be processed during ipsq_exit and will not find the ill and will 1318 * return error. 
1319 */ 1320 mutex_enter(&ipsq->ipsq_lock); 1321 for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; 1322 curr = next) { 1323 next = curr->b_next; 1324 if (curr->b_queue == q || curr->b_queue == RD(q)) { 1325 /* Unlink the mblk from the pending mp list */ 1326 if (prev != NULL) { 1327 prev->b_next = curr->b_next; 1328 } else { 1329 ASSERT(ipsq->ipsq_xopq_mphead == curr); 1330 ipsq->ipsq_xopq_mphead = curr->b_next; 1331 } 1332 if (ipsq->ipsq_xopq_mptail == curr) 1333 ipsq->ipsq_xopq_mptail = prev; 1334 /* 1335 * Create a temporary list and release the ipsq lock 1336 * New elements are added to the head of the tmp_list 1337 */ 1338 curr->b_next = tmp_list; 1339 tmp_list = curr; 1340 } else { 1341 prev = curr; 1342 } 1343 } 1344 mutex_exit(&ipsq->ipsq_lock); 1345 1346 while (tmp_list != NULL) { 1347 curr = tmp_list; 1348 tmp_list = curr->b_next; 1349 curr->b_next = NULL; 1350 curr->b_prev = NULL; 1351 curr->b_queue = NULL; 1352 if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { 1353 ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? 1354 CONN_CLOSE : NO_COPYOUT, NULL, NULL); 1355 } else { 1356 /* 1357 * IP-MT XXX In the case of TLI/XTI bind / optmgmt 1358 * this can't be just inet_freemsg. we have to 1359 * restart it otherwise the thread will be stuck. 1360 */ 1361 inet_freemsg(curr); 1362 } 1363 } 1364 } 1365 1366 /* 1367 * This conn has started closing. Cleanup any pending ioctl from this conn. 1368 * STREAMS ensures that there can be at most 1 ioctl pending on a stream. 1369 */ 1370 void 1371 conn_ioctl_cleanup(conn_t *connp) 1372 { 1373 mblk_t *curr; 1374 ipsq_t *ipsq; 1375 ill_t *ill; 1376 boolean_t refheld; 1377 1378 /* 1379 * Is any exclusive ioctl pending ? If so clean it up. If the 1380 * ioctl has not yet started, the mp is pending in the list headed by 1381 * ipsq_xopq_head. If the ioctl has started the mp could be present in 1382 * ipsq_pending_mp. 
If the ioctl timed out in the streamhead but 1383 * is currently executing now the mp is not queued anywhere but 1384 * conn_oper_pending_ill is null. The conn close will wait 1385 * till the conn_ref drops to zero. 1386 */ 1387 mutex_enter(&connp->conn_lock); 1388 ill = connp->conn_oper_pending_ill; 1389 if (ill == NULL) { 1390 mutex_exit(&connp->conn_lock); 1391 return; 1392 } 1393 1394 curr = ill_pending_mp_get(ill, &connp, 0); 1395 if (curr != NULL) { 1396 mutex_exit(&connp->conn_lock); 1397 CONN_DEC_REF(connp); 1398 inet_freemsg(curr); 1399 return; 1400 } 1401 /* 1402 * We may not be able to refhold the ill if the ill/ipif 1403 * is changing. But we need to make sure that the ill will 1404 * not vanish. So we just bump up the ill_waiter count. 1405 */ 1406 refheld = ill_waiter_inc(ill); 1407 mutex_exit(&connp->conn_lock); 1408 if (refheld) { 1409 if (ipsq_enter(ill, B_TRUE)) { 1410 ill_waiter_dcr(ill); 1411 /* 1412 * Check whether this ioctl has started and is 1413 * pending now in ipsq_pending_mp. If it is not 1414 * found there then check whether this ioctl has 1415 * not even started and is in the ipsq_xopq list. 1416 */ 1417 if (!ipsq_pending_mp_cleanup(ill, connp)) 1418 ipsq_xopq_mp_cleanup(ill, connp); 1419 ipsq = ill->ill_phyint->phyint_ipsq; 1420 ipsq_exit(ipsq, B_TRUE, B_TRUE); 1421 return; 1422 } 1423 } 1424 1425 /* 1426 * The ill is also closing and we could not bump up the 1427 * ill_waiter_count or we could not enter the ipsq. Leave 1428 * the cleanup to ill_delete 1429 */ 1430 mutex_enter(&connp->conn_lock); 1431 while (connp->conn_oper_pending_ill != NULL) 1432 cv_wait(&connp->conn_refcv, &connp->conn_lock); 1433 mutex_exit(&connp->conn_lock); 1434 if (refheld) 1435 ill_waiter_dcr(ill); 1436 } 1437 1438 /* 1439 * ipcl_walk function for cleaning up conn_*_ill fields. 
1440 */ 1441 static void 1442 conn_cleanup_ill(conn_t *connp, caddr_t arg) 1443 { 1444 ill_t *ill = (ill_t *)arg; 1445 ire_t *ire; 1446 1447 mutex_enter(&connp->conn_lock); 1448 if (connp->conn_multicast_ill == ill) { 1449 /* Revert to late binding */ 1450 connp->conn_multicast_ill = NULL; 1451 connp->conn_orig_multicast_ifindex = 0; 1452 } 1453 if (connp->conn_incoming_ill == ill) 1454 connp->conn_incoming_ill = NULL; 1455 if (connp->conn_outgoing_ill == ill) 1456 connp->conn_outgoing_ill = NULL; 1457 if (connp->conn_outgoing_pill == ill) 1458 connp->conn_outgoing_pill = NULL; 1459 if (connp->conn_nofailover_ill == ill) 1460 connp->conn_nofailover_ill = NULL; 1461 if (connp->conn_xmit_if_ill == ill) 1462 connp->conn_xmit_if_ill = NULL; 1463 if (connp->conn_ire_cache != NULL) { 1464 ire = connp->conn_ire_cache; 1465 /* 1466 * ip_newroute creates IRE_CACHE with ire_stq coming from 1467 * interface X and ipif coming from interface Y, if interface 1468 * X and Y are part of the same IPMPgroup. Thus whenever 1469 * interface X goes down, remove all references to it by 1470 * checking both on ire_ipif and ire_stq. 
1471 */ 1472 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1473 (ire->ire_type == IRE_CACHE && 1474 ire->ire_stq == ill->ill_wq)) { 1475 connp->conn_ire_cache = NULL; 1476 mutex_exit(&connp->conn_lock); 1477 ire_refrele_notr(ire); 1478 return; 1479 } 1480 } 1481 mutex_exit(&connp->conn_lock); 1482 1483 } 1484 1485 /* ARGSUSED */ 1486 void 1487 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 1488 { 1489 ill_t *ill = q->q_ptr; 1490 ipif_t *ipif; 1491 1492 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1493 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1494 ipif_down_tail(ipif); 1495 ill_down_tail(ill); 1496 freemsg(mp); 1497 ipsq->ipsq_current_ipif = NULL; 1498 } 1499 1500 /* 1501 * ill_down_start is called when we want to down this ill and bring it up again 1502 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down 1503 * all interfaces, but don't tear down any plumbing. 1504 */ 1505 boolean_t 1506 ill_down_start(queue_t *q, mblk_t *mp) 1507 { 1508 ill_t *ill; 1509 ipif_t *ipif; 1510 1511 ill = q->q_ptr; 1512 1513 ASSERT(IAM_WRITER_ILL(ill)); 1514 1515 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1516 (void) ipif_down(ipif, NULL, NULL); 1517 1518 ill_down(ill); 1519 1520 (void) ipsq_pending_mp_cleanup(ill, NULL); 1521 mutex_enter(&ill->ill_lock); 1522 /* 1523 * Atomically test and add the pending mp if references are 1524 * still active. 1525 */ 1526 if (!ill_is_quiescent(ill)) { 1527 /* 1528 * Get rid of any pending mps and cleanup. Call will 1529 * not fail since we are passing a null connp. 1530 */ 1531 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 1532 mp, ILL_DOWN); 1533 mutex_exit(&ill->ill_lock); 1534 return (B_FALSE); 1535 } 1536 mutex_exit(&ill->ill_lock); 1537 return (B_TRUE); 1538 } 1539 1540 static void 1541 ill_down(ill_t *ill) 1542 { 1543 /* Blow off any IREs dependent on this ILL. 
*/ 1544 ire_walk(ill_downi, (char *)ill); 1545 1546 mutex_enter(&ire_mrtun_lock); 1547 if (ire_mrtun_count != 0) { 1548 mutex_exit(&ire_mrtun_lock); 1549 ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif, 1550 (char *)ill, NULL); 1551 } else { 1552 mutex_exit(&ire_mrtun_lock); 1553 } 1554 1555 /* 1556 * If any interface based forwarding table exists 1557 * Blow off the ires there dependent on this ill 1558 */ 1559 mutex_enter(&ire_srcif_table_lock); 1560 if (ire_srcif_table_count > 0) { 1561 mutex_exit(&ire_srcif_table_lock); 1562 ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill); 1563 } else { 1564 mutex_exit(&ire_srcif_table_lock); 1565 } 1566 1567 /* Remove any conn_*_ill depending on this ill */ 1568 ipcl_walk(conn_cleanup_ill, (caddr_t)ill); 1569 1570 if (ill->ill_group != NULL) { 1571 illgrp_delete(ill); 1572 } 1573 1574 } 1575 1576 static void 1577 ill_down_tail(ill_t *ill) 1578 { 1579 int i; 1580 1581 /* Destroy ill_srcif_table if it exists */ 1582 /* Lock not reqd really because nobody should be able to access */ 1583 mutex_enter(&ill->ill_lock); 1584 if (ill->ill_srcif_table != NULL) { 1585 ill->ill_srcif_refcnt = 0; 1586 for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) { 1587 rw_destroy(&ill->ill_srcif_table[i].irb_lock); 1588 } 1589 kmem_free(ill->ill_srcif_table, 1590 IP_SRCIF_TABLE_SIZE * sizeof (irb_t)); 1591 ill->ill_srcif_table = NULL; 1592 ill->ill_srcif_refcnt = 0; 1593 ill->ill_mrtun_refcnt = 0; 1594 } 1595 mutex_exit(&ill->ill_lock); 1596 } 1597 1598 /* 1599 * ire_walk routine used to delete every IRE that depends on queues 1600 * associated with 'ill'. (Always called as writer.) 1601 */ 1602 static void 1603 ill_downi(ire_t *ire, char *ill_arg) 1604 { 1605 ill_t *ill = (ill_t *)ill_arg; 1606 1607 /* 1608 * ip_newroute creates IRE_CACHE with ire_stq coming from 1609 * interface X and ipif coming from interface Y, if interface 1610 * X and Y are part of the same IPMP group. 
Thus whenever interface 1611 * X goes down, remove all references to it by checking both 1612 * on ire_ipif and ire_stq. 1613 */ 1614 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1615 (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { 1616 ire_delete(ire); 1617 } 1618 } 1619 1620 /* 1621 * A seperate routine for deleting revtun and srcif based routes 1622 * are needed because the ires only deleted when the interface 1623 * is unplumbed. Also these ires have ire_in_ill non-null as well. 1624 * we want to keep mobile IP specific code separate. 1625 */ 1626 static void 1627 ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg) 1628 { 1629 ill_t *ill = (ill_t *)ill_arg; 1630 1631 ASSERT(ire->ire_in_ill != NULL); 1632 1633 if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) || 1634 (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) { 1635 ire_delete(ire); 1636 } 1637 } 1638 1639 /* 1640 * Remove ire/nce from the fastpath list. 1641 */ 1642 void 1643 ill_fastpath_nack(ill_t *ill) 1644 { 1645 if (ill->ill_isv6) { 1646 nce_fastpath_list_dispatch(ill, NULL, NULL); 1647 } else { 1648 ire_fastpath_list_dispatch(ill, NULL, NULL); 1649 } 1650 } 1651 1652 /* Consume an M_IOCACK of the fastpath probe. */ 1653 void 1654 ill_fastpath_ack(ill_t *ill, mblk_t *mp) 1655 { 1656 mblk_t *mp1 = mp; 1657 1658 /* 1659 * If this was the first attempt turn on the fastpath probing. 1660 */ 1661 mutex_enter(&ill->ill_lock); 1662 if (ill->ill_dlpi_fastpath_state == IDMS_INPROGRESS) 1663 ill->ill_dlpi_fastpath_state = IDMS_OK; 1664 mutex_exit(&ill->ill_lock); 1665 1666 /* Free the M_IOCACK mblk, hold on to the data */ 1667 mp = mp->b_cont; 1668 freeb(mp1); 1669 if (mp == NULL) 1670 return; 1671 if (mp->b_cont != NULL) { 1672 /* 1673 * Update all IRE's or NCE's that are waiting for 1674 * fastpath update. 1675 */ 1676 if (ill->ill_isv6) { 1677 /* 1678 * update nce's in the fastpath list. 
1679 */ 1680 nce_fastpath_list_dispatch(ill, 1681 ndp_fastpath_update, mp); 1682 } else { 1683 1684 /* 1685 * update ire's in the fastpath list. 1686 */ 1687 ire_fastpath_list_dispatch(ill, 1688 ire_fastpath_update, mp); 1689 /* 1690 * Check if we need to traverse reverse tunnel table. 1691 * Since there is only single ire_type (IRE_MIPRTUN) 1692 * in the table, we don't need to match on ire_type. 1693 * We have to check ire_mrtun_count and not the 1694 * ill_mrtun_refcnt since ill_mrtun_refcnt is set 1695 * on the incoming ill and here we are dealing with 1696 * outgoing ill. 1697 */ 1698 mutex_enter(&ire_mrtun_lock); 1699 if (ire_mrtun_count != 0) { 1700 mutex_exit(&ire_mrtun_lock); 1701 ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN, 1702 (void (*)(ire_t *, void *)) 1703 ire_fastpath_update, mp, ill); 1704 } else { 1705 mutex_exit(&ire_mrtun_lock); 1706 } 1707 } 1708 mp1 = mp->b_cont; 1709 freeb(mp); 1710 mp = mp1; 1711 } else { 1712 ip0dbg(("ill_fastpath_ack: no b_cont\n")); 1713 } 1714 1715 freeb(mp); 1716 } 1717 1718 /* 1719 * Throw an M_IOCTL message downstream asking "do you know fastpath?" 1720 * The data portion of the request is a dl_unitdata_req_t template for 1721 * what we would send downstream in the absence of a fastpath confirmation. 1722 */ 1723 int 1724 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) 1725 { 1726 struct iocblk *ioc; 1727 mblk_t *mp; 1728 1729 if (dlur_mp == NULL) 1730 return (EINVAL); 1731 1732 mutex_enter(&ill->ill_lock); 1733 switch (ill->ill_dlpi_fastpath_state) { 1734 case IDMS_FAILED: 1735 /* 1736 * Driver NAKed the first fastpath ioctl - assume it doesn't 1737 * support it. 
1738 */ 1739 mutex_exit(&ill->ill_lock); 1740 return (ENOTSUP); 1741 case IDMS_UNKNOWN: 1742 /* This is the first probe */ 1743 ill->ill_dlpi_fastpath_state = IDMS_INPROGRESS; 1744 break; 1745 default: 1746 break; 1747 } 1748 mutex_exit(&ill->ill_lock); 1749 1750 if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) 1751 return (EAGAIN); 1752 1753 mp->b_cont = copyb(dlur_mp); 1754 if (mp->b_cont == NULL) { 1755 freeb(mp); 1756 return (EAGAIN); 1757 } 1758 1759 ioc = (struct iocblk *)mp->b_rptr; 1760 ioc->ioc_count = msgdsize(mp->b_cont); 1761 1762 putnext(ill->ill_wq, mp); 1763 return (0); 1764 } 1765 1766 void 1767 ill_capability_probe(ill_t *ill) 1768 { 1769 /* 1770 * Do so only if negotiation is enabled, capabilities are unknown, 1771 * and a capability negotiation is not already in progress. 1772 */ 1773 if (ill->ill_capab_state != IDMS_UNKNOWN && 1774 ill->ill_capab_state != IDMS_RENEG) 1775 return; 1776 1777 ill->ill_capab_state = IDMS_INPROGRESS; 1778 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1779 ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL); 1780 } 1781 1782 void 1783 ill_capability_reset(ill_t *ill) 1784 { 1785 mblk_t *sc_mp = NULL; 1786 mblk_t *tmp; 1787 1788 /* 1789 * Note here that we reset the state to UNKNOWN, and later send 1790 * down the DL_CAPABILITY_REQ without first setting the state to 1791 * INPROGRESS. We do this in order to distinguish the 1792 * DL_CAPABILITY_ACK response which may come back in response to 1793 * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would 1794 * also handle the case where the driver doesn't send us back 1795 * a DL_CAPABILITY_ACK in response, since the "probe" routine 1796 * requires the state to be in UNKNOWN anyway. In any case, all 1797 * features are turned off until the state reaches IDMS_OK. 1798 */ 1799 ill->ill_capab_state = IDMS_UNKNOWN; 1800 1801 /* 1802 * Disable sub-capabilities and request a list of sub-capability 1803 * messages which will be sent down to the driver. 
Each handler 1804 * allocates the corresponding dl_capability_sub_t inside an 1805 * mblk, and links it to the existing sc_mp mblk, or return it 1806 * as sc_mp if it's the first sub-capability (the passed in 1807 * sc_mp is NULL). Upon returning from all capability handlers, 1808 * sc_mp will be pulled-up, before passing it downstream. 1809 */ 1810 ill_capability_mdt_reset(ill, &sc_mp); 1811 ill_capability_hcksum_reset(ill, &sc_mp); 1812 ill_capability_zerocopy_reset(ill, &sc_mp); 1813 ill_capability_ipsec_reset(ill, &sc_mp); 1814 ill_capability_dls_reset(ill, &sc_mp); 1815 1816 /* Nothing to send down in order to disable the capabilities? */ 1817 if (sc_mp == NULL) 1818 return; 1819 1820 tmp = msgpullup(sc_mp, -1); 1821 freemsg(sc_mp); 1822 if ((sc_mp = tmp) == NULL) { 1823 cmn_err(CE_WARN, "ill_capability_reset: unable to send down " 1824 "DL_CAPABILITY_REQ (ENOMEM)\n"); 1825 return; 1826 } 1827 1828 ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); 1829 ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); 1830 } 1831 1832 /* 1833 * Request or set new-style hardware capabilities supported by DLS provider. 
1834 */ 1835 static void 1836 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) 1837 { 1838 mblk_t *mp; 1839 dl_capability_req_t *capb; 1840 size_t size = 0; 1841 uint8_t *ptr; 1842 1843 if (reqp != NULL) 1844 size = MBLKL(reqp); 1845 1846 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); 1847 if (mp == NULL) { 1848 freemsg(reqp); 1849 return; 1850 } 1851 ptr = mp->b_rptr; 1852 1853 capb = (dl_capability_req_t *)ptr; 1854 ptr += sizeof (dl_capability_req_t); 1855 1856 if (reqp != NULL) { 1857 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1858 capb->dl_sub_length = size; 1859 bcopy(reqp->b_rptr, ptr, size); 1860 ptr += size; 1861 mp->b_cont = reqp->b_cont; 1862 freeb(reqp); 1863 } 1864 ASSERT(ptr == mp->b_wptr); 1865 1866 ill_dlpi_send(ill, mp); 1867 } 1868 1869 static void 1870 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1871 { 1872 dl_capab_id_t *id_ic; 1873 uint_t sub_dl_cap = outers->dl_cap; 1874 dl_capability_sub_t *inners; 1875 uint8_t *capend; 1876 1877 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1878 1879 /* 1880 * Note: range checks here are not absolutely sufficient to 1881 * make us robust against malformed messages sent by drivers; 1882 * this is in keeping with the rest of IP's dlpi handling. 
1883 * (Remember, it's coming from something else in the kernel 1884 * address space) 1885 */ 1886 1887 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1888 if (capend > mp->b_wptr) { 1889 cmn_err(CE_WARN, "ill_capability_id_ack: " 1890 "malformed sub-capability too long for mblk"); 1891 return; 1892 } 1893 1894 id_ic = (dl_capab_id_t *)(outers + 1); 1895 1896 if (outers->dl_length < sizeof (*id_ic) || 1897 (inners = &id_ic->id_subcap, 1898 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1899 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1900 "encapsulated capab type %d too long for mblk", 1901 inners->dl_cap); 1902 return; 1903 } 1904 1905 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1906 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1907 "isn't as expected; pass-thru module(s) detected, " 1908 "discarding capability\n", inners->dl_cap)); 1909 return; 1910 } 1911 1912 /* Process the encapsulated sub-capability */ 1913 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1914 } 1915 1916 /* 1917 * Process Multidata Transmit capability negotiation ack received from a 1918 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1919 * DL_CAPABILITY_ACK message. 1920 */ 1921 static void 1922 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1923 { 1924 mblk_t *nmp = NULL; 1925 dl_capability_req_t *oc; 1926 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1927 ill_mdt_capab_t **ill_mdt_capab; 1928 uint_t sub_dl_cap = isub->dl_cap; 1929 uint8_t *capend; 1930 1931 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1932 1933 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1934 1935 /* 1936 * Note: range checks here are not absolutely sufficient to 1937 * make us robust against malformed messages sent by drivers; 1938 * this is in keeping with the rest of IP's dlpi handling. 
1939 * (Remember, it's coming from something else in the kernel 1940 * address space) 1941 */ 1942 1943 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1944 if (capend > mp->b_wptr) { 1945 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1946 "malformed sub-capability too long for mblk"); 1947 return; 1948 } 1949 1950 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1951 1952 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1953 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1954 "unsupported MDT sub-capability (version %d, expected %d)", 1955 mdt_ic->mdt_version, MDT_VERSION_2); 1956 return; 1957 } 1958 1959 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 1960 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 1961 "capability isn't as expected; pass-thru module(s) " 1962 "detected, discarding capability\n")); 1963 return; 1964 } 1965 1966 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 1967 1968 if (*ill_mdt_capab == NULL) { 1969 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 1970 KM_NOSLEEP); 1971 1972 if (*ill_mdt_capab == NULL) { 1973 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1974 "could not enable MDT version %d " 1975 "for %s (ENOMEM)\n", MDT_VERSION_2, 1976 ill->ill_name); 1977 return; 1978 } 1979 } 1980 1981 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 1982 "MDT version %d (%d bytes leading, %d bytes trailing " 1983 "header spaces, %d max pld bufs, %d span limit)\n", 1984 ill->ill_name, MDT_VERSION_2, 1985 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 1986 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 1987 1988 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 1989 (*ill_mdt_capab)->ill_mdt_on = 1; 1990 /* 1991 * Round the following values to the nearest 32-bit; ULP 1992 * may further adjust them to accomodate for additional 1993 * protocol headers. We pass these values to ULP during 1994 * bind time. 
1995 */ 1996 (*ill_mdt_capab)->ill_mdt_hdr_head = 1997 roundup(mdt_ic->mdt_hdr_head, 4); 1998 (*ill_mdt_capab)->ill_mdt_hdr_tail = 1999 roundup(mdt_ic->mdt_hdr_tail, 4); 2000 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 2001 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 2002 2003 ill->ill_capabilities |= ILL_CAPAB_MDT; 2004 } else { 2005 uint_t size; 2006 uchar_t *rptr; 2007 2008 size = sizeof (dl_capability_req_t) + 2009 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2010 2011 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2012 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2013 "could not enable MDT for %s (ENOMEM)\n", 2014 ill->ill_name); 2015 return; 2016 } 2017 2018 rptr = nmp->b_rptr; 2019 /* initialize dl_capability_req_t */ 2020 oc = (dl_capability_req_t *)nmp->b_rptr; 2021 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2022 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2023 sizeof (dl_capab_mdt_t); 2024 nmp->b_rptr += sizeof (dl_capability_req_t); 2025 2026 /* initialize dl_capability_sub_t */ 2027 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2028 nmp->b_rptr += sizeof (*isub); 2029 2030 /* initialize dl_capab_mdt_t */ 2031 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2032 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2033 2034 nmp->b_rptr = rptr; 2035 2036 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2037 "to enable MDT version %d\n", ill->ill_name, 2038 MDT_VERSION_2)); 2039 2040 /* set ENABLE flag */ 2041 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2042 2043 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2044 ill_dlpi_send(ill, nmp); 2045 } 2046 } 2047 2048 static void 2049 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2050 { 2051 mblk_t *mp; 2052 dl_capab_mdt_t *mdt_subcap; 2053 dl_capability_sub_t *dl_subcap; 2054 int size; 2055 2056 if (!ILL_MDT_CAPABLE(ill)) 2057 return; 2058 2059 ASSERT(ill->ill_mdt_capab != NULL); 2060 /* 2061 * Clear the capability flag for MDT but retain 
the ill_mdt_capab 2062 * structure since it's possible that another thread is still 2063 * referring to it. The structure only gets deallocated when 2064 * we destroy the ill. 2065 */ 2066 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2067 2068 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2069 2070 mp = allocb(size, BPRI_HI); 2071 if (mp == NULL) { 2072 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2073 "request to disable MDT\n")); 2074 return; 2075 } 2076 2077 mp->b_wptr = mp->b_rptr + size; 2078 2079 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2080 dl_subcap->dl_cap = DL_CAPAB_MDT; 2081 dl_subcap->dl_length = sizeof (*mdt_subcap); 2082 2083 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2084 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2085 mdt_subcap->mdt_flags = 0; 2086 mdt_subcap->mdt_hdr_head = 0; 2087 mdt_subcap->mdt_hdr_tail = 0; 2088 2089 if (*sc_mp != NULL) 2090 linkb(*sc_mp, mp); 2091 else 2092 *sc_mp = mp; 2093 } 2094 2095 /* 2096 * Send a DL_NOTIFY_REQ to the specified ill to enable 2097 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2098 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2099 * acceleration. 2100 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 2101 */ 2102 static boolean_t 2103 ill_enable_promisc_notify(ill_t *ill) 2104 { 2105 mblk_t *mp; 2106 dl_notify_req_t *req; 2107 2108 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2109 2110 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2111 if (mp == NULL) 2112 return (B_FALSE); 2113 2114 req = (dl_notify_req_t *)mp->b_rptr; 2115 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2116 DL_NOTE_PROMISC_OFF_PHYS; 2117 2118 ill_dlpi_send(ill, mp); 2119 2120 return (B_TRUE); 2121 } 2122 2123 2124 /* 2125 * Allocate an IPsec capability request which will be filled by our 2126 * caller to turn on support for one or more algorithms. 
2127 */ 2128 static mblk_t * 2129 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2130 { 2131 mblk_t *nmp; 2132 dl_capability_req_t *ocap; 2133 dl_capab_ipsec_t *ocip; 2134 dl_capab_ipsec_t *icip; 2135 uint8_t *ptr; 2136 icip = (dl_capab_ipsec_t *)(isub + 1); 2137 2138 /* 2139 * The first time around, we send a DL_NOTIFY_REQ to enable 2140 * PROMISC_ON/OFF notification from the provider. We need to 2141 * do this before enabling the algorithms to avoid leakage of 2142 * cleartext packets. 2143 */ 2144 2145 if (!ill_enable_promisc_notify(ill)) 2146 return (NULL); 2147 2148 /* 2149 * Allocate new mblk which will contain a new capability 2150 * request to enable the capabilities. 2151 */ 2152 2153 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2154 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2155 if (nmp == NULL) 2156 return (NULL); 2157 2158 ptr = nmp->b_rptr; 2159 2160 /* initialize dl_capability_req_t */ 2161 ocap = (dl_capability_req_t *)ptr; 2162 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2163 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2164 ptr += sizeof (dl_capability_req_t); 2165 2166 /* initialize dl_capability_sub_t */ 2167 bcopy(isub, ptr, sizeof (*isub)); 2168 ptr += sizeof (*isub); 2169 2170 /* initialize dl_capab_ipsec_t */ 2171 ocip = (dl_capab_ipsec_t *)ptr; 2172 bcopy(icip, ocip, sizeof (*icip)); 2173 2174 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2175 return (nmp); 2176 } 2177 2178 /* 2179 * Process an IPsec capability negotiation ack received from a DLS Provider. 2180 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2181 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2182 */ 2183 static void 2184 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2185 { 2186 dl_capab_ipsec_t *icip; 2187 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2188 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. 
*/ 2189 uint_t cipher, nciphers; 2190 mblk_t *nmp; 2191 uint_t alg_len; 2192 boolean_t need_sadb_dump; 2193 uint_t sub_dl_cap = isub->dl_cap; 2194 ill_ipsec_capab_t **ill_capab; 2195 uint64_t ill_capab_flag; 2196 uint8_t *capend, *ciphend; 2197 boolean_t sadb_resync; 2198 2199 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2200 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2201 2202 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2203 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2204 ill_capab_flag = ILL_CAPAB_AH; 2205 } else { 2206 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2207 ill_capab_flag = ILL_CAPAB_ESP; 2208 } 2209 2210 /* 2211 * If the ill capability structure exists, then this incoming 2212 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2213 * If this is so, then we'd need to resynchronize the SADB 2214 * after re-enabling the offloaded ciphers. 2215 */ 2216 sadb_resync = (*ill_capab != NULL); 2217 2218 /* 2219 * Note: range checks here are not absolutely sufficient to 2220 * make us robust against malformed messages sent by drivers; 2221 * this is in keeping with the rest of IP's dlpi handling. 2222 * (Remember, it's coming from something else in the kernel 2223 * address space) 2224 */ 2225 2226 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2227 if (capend > mp->b_wptr) { 2228 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2229 "malformed sub-capability too long for mblk"); 2230 return; 2231 } 2232 2233 /* 2234 * There are two types of acks we process here: 2235 * 1. acks in reply to a (first form) generic capability req 2236 * (no ENABLE flag set) 2237 * 2. acks in reply to a ENABLE capability req. 
2238 * (ENABLE flag set) 2239 * 2240 * We process the subcapability passed as argument as follows: 2241 * 1 do initializations 2242 * 1.1 initialize nmp = NULL 2243 * 1.2 set need_sadb_dump to B_FALSE 2244 * 2 for each cipher in subcapability: 2245 * 2.1 if ENABLE flag is set: 2246 * 2.1.1 update per-ill ipsec capabilities info 2247 * 2.1.2 set need_sadb_dump to B_TRUE 2248 * 2.2 if ENABLE flag is not set: 2249 * 2.2.1 if nmp is NULL: 2250 * 2.2.1.1 allocate and initialize nmp 2251 * 2.2.1.2 init current pos in nmp 2252 * 2.2.2 copy current cipher to current pos in nmp 2253 * 2.2.3 set ENABLE flag in nmp 2254 * 2.2.4 update current pos 2255 * 3 if nmp is not equal to NULL, send enable request 2256 * 3.1 send capability request 2257 * 4 if need_sadb_dump is B_TRUE 2258 * 4.1 enable promiscuous on/off notifications 2259 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2260 * AH or ESP SA's to interface. 2261 */ 2262 2263 nmp = NULL; 2264 oalg = NULL; 2265 need_sadb_dump = B_FALSE; 2266 icip = (dl_capab_ipsec_t *)(isub + 1); 2267 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2268 2269 nciphers = icip->cip_nciphers; 2270 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2271 2272 if (ciphend > capend) { 2273 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2274 "too many ciphers for sub-capability len"); 2275 return; 2276 } 2277 2278 for (cipher = 0; cipher < nciphers; cipher++) { 2279 alg_len = sizeof (dl_capab_ipsec_alg_t); 2280 2281 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2282 /* 2283 * TBD: when we provide a way to disable capabilities 2284 * from above, need to manage the request-pending state 2285 * and fail if we were not expecting this ACK. 
2286 */ 2287 IPSECHW_DEBUG(IPSECHW_CAPAB, 2288 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2289 2290 /* 2291 * Update IPsec capabilities for this ill 2292 */ 2293 2294 if (*ill_capab == NULL) { 2295 IPSECHW_DEBUG(IPSECHW_CAPAB, 2296 ("ill_capability_ipsec_ack: " 2297 "allocating ipsec_capab for ill\n")); 2298 *ill_capab = ill_ipsec_capab_alloc(); 2299 2300 if (*ill_capab == NULL) { 2301 cmn_err(CE_WARN, 2302 "ill_capability_ipsec_ack: " 2303 "could not enable IPsec Hardware " 2304 "acceleration for %s (ENOMEM)\n", 2305 ill->ill_name); 2306 return; 2307 } 2308 } 2309 2310 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2311 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2312 2313 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2314 cmn_err(CE_WARN, 2315 "ill_capability_ipsec_ack: " 2316 "malformed IPsec algorithm id %d", 2317 ialg->alg_prim); 2318 continue; 2319 } 2320 2321 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2322 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2323 ialg->alg_prim); 2324 } else { 2325 ipsec_capab_algparm_t *alp; 2326 2327 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2328 ialg->alg_prim); 2329 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2330 ialg->alg_prim)) { 2331 cmn_err(CE_WARN, 2332 "ill_capability_ipsec_ack: " 2333 "no space for IPsec alg id %d", 2334 ialg->alg_prim); 2335 continue; 2336 } 2337 alp = &((*ill_capab)->encr_algparm[ 2338 ialg->alg_prim]); 2339 alp->minkeylen = ialg->alg_minbits; 2340 alp->maxkeylen = ialg->alg_maxbits; 2341 } 2342 ill->ill_capabilities |= ill_capab_flag; 2343 /* 2344 * indicate that a capability was enabled, which 2345 * will be used below to kick off a SADB dump 2346 * to the ill. 
2347 */ 2348 need_sadb_dump = B_TRUE; 2349 } else { 2350 IPSECHW_DEBUG(IPSECHW_CAPAB, 2351 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2352 ialg->alg_prim)); 2353 2354 if (nmp == NULL) { 2355 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2356 if (nmp == NULL) { 2357 /* 2358 * Sending the PROMISC_ON/OFF 2359 * notification request failed. 2360 * We cannot enable the algorithms 2361 * since the Provider will not 2362 * notify IP of promiscous mode 2363 * changes, which could lead 2364 * to leakage of packets. 2365 */ 2366 cmn_err(CE_WARN, 2367 "ill_capability_ipsec_ack: " 2368 "could not enable IPsec Hardware " 2369 "acceleration for %s (ENOMEM)\n", 2370 ill->ill_name); 2371 return; 2372 } 2373 /* ptr to current output alg specifier */ 2374 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2375 } 2376 2377 /* 2378 * Copy current alg specifier, set ENABLE 2379 * flag, and advance to next output alg. 2380 * For now we enable all IPsec capabilities. 2381 */ 2382 ASSERT(oalg != NULL); 2383 bcopy(ialg, oalg, alg_len); 2384 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2385 nmp->b_wptr += alg_len; 2386 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2387 } 2388 2389 /* move to next input algorithm specifier */ 2390 ialg = (dl_capab_ipsec_alg_t *) 2391 ((char *)ialg + alg_len); 2392 } 2393 2394 if (nmp != NULL) 2395 /* 2396 * nmp points to a DL_CAPABILITY_REQ message to enable 2397 * IPsec hardware acceleration. 2398 */ 2399 ill_dlpi_send(ill, nmp); 2400 2401 if (need_sadb_dump) 2402 /* 2403 * An acknowledgement corresponding to a request to 2404 * enable acceleration was received, notify SADB. 2405 */ 2406 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2407 } 2408 2409 /* 2410 * Given an mblk with enough space in it, create sub-capability entries for 2411 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2412 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2413 * in preparation for the reset the DL_CAPABILITY_REQ message. 
2414 */ 2415 static void 2416 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2417 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2418 { 2419 dl_capab_ipsec_t *oipsec; 2420 dl_capab_ipsec_alg_t *oalg; 2421 dl_capability_sub_t *dl_subcap; 2422 int i, k; 2423 2424 ASSERT(nciphers > 0); 2425 ASSERT(ill_cap != NULL); 2426 ASSERT(mp != NULL); 2427 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2428 2429 /* dl_capability_sub_t for "stype" */ 2430 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2431 dl_subcap->dl_cap = stype; 2432 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2433 mp->b_wptr += sizeof (dl_capability_sub_t); 2434 2435 /* dl_capab_ipsec_t for "stype" */ 2436 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2437 oipsec->cip_version = 1; 2438 oipsec->cip_nciphers = nciphers; 2439 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2440 2441 /* create entries for "stype" AUTH ciphers */ 2442 for (i = 0; i < ill_cap->algs_size; i++) { 2443 for (k = 0; k < BITSPERBYTE; k++) { 2444 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2445 continue; 2446 2447 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2448 bzero((void *)oalg, sizeof (*oalg)); 2449 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2450 oalg->alg_prim = k + (BITSPERBYTE * i); 2451 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2452 } 2453 } 2454 /* create entries for "stype" ENCR ciphers */ 2455 for (i = 0; i < ill_cap->algs_size; i++) { 2456 for (k = 0; k < BITSPERBYTE; k++) { 2457 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2458 continue; 2459 2460 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2461 bzero((void *)oalg, sizeof (*oalg)); 2462 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2463 oalg->alg_prim = k + (BITSPERBYTE * i); 2464 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2465 } 2466 } 2467 } 2468 2469 /* 2470 * Macro to count number of 1s in a byte (8-bit word). The total count is 2471 * accumulated into the passed-in argument (sum). 
We could use SPARCv9's 2472 * POPC instruction, but our macro is more flexible for an arbitrary length 2473 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2474 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2475 * stays that way, we can reduce the number of iterations required. 2476 */ 2477 #define COUNT_1S(val, sum) { \ 2478 uint8_t x = val & 0xff; \ 2479 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2480 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2481 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2482 } 2483 2484 /* ARGSUSED */ 2485 static void 2486 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2487 { 2488 mblk_t *mp; 2489 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2490 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2491 uint64_t ill_capabilities = ill->ill_capabilities; 2492 int ah_cnt = 0, esp_cnt = 0; 2493 int ah_len = 0, esp_len = 0; 2494 int i, size = 0; 2495 2496 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2497 return; 2498 2499 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2500 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2501 2502 /* Find out the number of ciphers for AH */ 2503 if (cap_ah != NULL) { 2504 for (i = 0; i < cap_ah->algs_size; i++) { 2505 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2506 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2507 } 2508 if (ah_cnt > 0) { 2509 size += sizeof (dl_capability_sub_t) + 2510 sizeof (dl_capab_ipsec_t); 2511 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2512 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2513 size += ah_len; 2514 } 2515 } 2516 2517 /* Find out the number of ciphers for ESP */ 2518 if (cap_esp != NULL) { 2519 for (i = 0; i < cap_esp->algs_size; i++) { 2520 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2521 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2522 } 2523 if (esp_cnt > 0) { 2524 size += sizeof (dl_capability_sub_t) + 2525 sizeof (dl_capab_ipsec_t); 2526 /* 
dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2527 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2528 size += esp_len; 2529 } 2530 } 2531 2532 if (size == 0) { 2533 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2534 "there's nothing to reset\n")); 2535 return; 2536 } 2537 2538 mp = allocb(size, BPRI_HI); 2539 if (mp == NULL) { 2540 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2541 "request to disable IPSEC Hardware Acceleration\n")); 2542 return; 2543 } 2544 2545 /* 2546 * Clear the capability flags for IPSec HA but retain the ill 2547 * capability structures since it's possible that another thread 2548 * is still referring to them. The structures only get deallocated 2549 * when we destroy the ill. 2550 * 2551 * Various places check the flags to see if the ill is capable of 2552 * hardware acceleration, and by clearing them we ensure that new 2553 * outbound IPSec packets are sent down encrypted. 2554 */ 2555 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2556 2557 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2558 if (ah_cnt > 0) { 2559 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2560 cap_ah, mp); 2561 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2562 } 2563 2564 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2565 if (esp_cnt > 0) { 2566 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2567 cap_esp, mp); 2568 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2569 } 2570 2571 /* 2572 * At this point we've composed a bunch of sub-capabilities to be 2573 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2574 * by the caller. Upon receiving this reset message, the driver 2575 * must stop inbound decryption (by destroying all inbound SAs) 2576 * and let the corresponding packets come in encrypted. 
2577 */ 2578 2579 if (*sc_mp != NULL) 2580 linkb(*sc_mp, mp); 2581 else 2582 *sc_mp = mp; 2583 } 2584 2585 static void 2586 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2587 boolean_t encapsulated) 2588 { 2589 boolean_t legacy = B_FALSE; 2590 2591 /* 2592 * If this DL_CAPABILITY_ACK came in as a response to our "reset" 2593 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just 2594 * instructed the driver to disable its advertised capabilities, 2595 * so there's no point in accepting any response at this moment. 2596 */ 2597 if (ill->ill_capab_state == IDMS_UNKNOWN) 2598 return; 2599 2600 /* 2601 * Note that only the following two sub-capabilities may be 2602 * considered as "legacy", since their original definitions 2603 * do not incorporate the dl_mid_t module ID token, and hence 2604 * may require the use of the wrapper sub-capability. 2605 */ 2606 switch (subp->dl_cap) { 2607 case DL_CAPAB_IPSEC_AH: 2608 case DL_CAPAB_IPSEC_ESP: 2609 legacy = B_TRUE; 2610 break; 2611 } 2612 2613 /* 2614 * For legacy sub-capabilities which don't incorporate a queue_t 2615 * pointer in their structures, discard them if we detect that 2616 * there are intermediate modules in between IP and the driver. 
2617 */ 2618 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2619 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2620 "%d discarded; %d module(s) present below IP\n", 2621 subp->dl_cap, ill->ill_lmod_cnt)); 2622 return; 2623 } 2624 2625 switch (subp->dl_cap) { 2626 case DL_CAPAB_IPSEC_AH: 2627 case DL_CAPAB_IPSEC_ESP: 2628 ill_capability_ipsec_ack(ill, mp, subp); 2629 break; 2630 case DL_CAPAB_MDT: 2631 ill_capability_mdt_ack(ill, mp, subp); 2632 break; 2633 case DL_CAPAB_HCKSUM: 2634 ill_capability_hcksum_ack(ill, mp, subp); 2635 break; 2636 case DL_CAPAB_ZEROCOPY: 2637 ill_capability_zerocopy_ack(ill, mp, subp); 2638 break; 2639 case DL_CAPAB_POLL: 2640 if (!SOFT_RINGS_ENABLED()) 2641 ill_capability_dls_ack(ill, mp, subp); 2642 break; 2643 case DL_CAPAB_SOFT_RING: 2644 if (SOFT_RINGS_ENABLED()) 2645 ill_capability_dls_ack(ill, mp, subp); 2646 break; 2647 default: 2648 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2649 subp->dl_cap)); 2650 } 2651 } 2652 2653 /* 2654 * As part of negotiating polling capability, the driver tells us 2655 * the default (or normal) blanking interval and packet threshold 2656 * (the receive timer fires if blanking interval is reached or 2657 * the packet threshold is reached). 2658 * 2659 * As part of manipulating the polling interval, we always use our 2660 * estimated interval (avg service time * number of packets queued 2661 * on the squeue) but we try to blank for a minimum of 2662 * rr_normal_blank_time * rr_max_blank_ratio. We disable the 2663 * packet threshold during this time. When we are not in polling mode 2664 * we set the blank interval typically lower, rr_normal_pkt_cnt * 2665 * rr_min_blank_ratio but up the packet cnt by a ratio of 2666 * rr_min_pkt_cnt_ratio so that we are still getting chains if 2667 * possible although for a shorter interval. 
2668 */ 2669 #define RR_MAX_BLANK_RATIO 20 2670 #define RR_MIN_BLANK_RATIO 10 2671 #define RR_MAX_PKT_CNT_RATIO 3 2672 #define RR_MIN_PKT_CNT_RATIO 3 2673 2674 /* 2675 * These can be tuned via /etc/system. 2676 */ 2677 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; 2678 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; 2679 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; 2680 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; 2681 2682 static mac_resource_handle_t 2683 ill_ring_add(void *arg, mac_resource_t *mrp) 2684 { 2685 ill_t *ill = (ill_t *)arg; 2686 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2687 ill_rx_ring_t *rx_ring; 2688 int ip_rx_index; 2689 2690 ASSERT(mrp != NULL); 2691 if (mrp->mr_type != MAC_RX_FIFO) { 2692 return (NULL); 2693 } 2694 ASSERT(ill != NULL); 2695 ASSERT(ill->ill_dls_capab != NULL); 2696 2697 mutex_enter(&ill->ill_lock); 2698 for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { 2699 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; 2700 ASSERT(rx_ring != NULL); 2701 2702 if (rx_ring->rr_ring_state == ILL_RING_FREE) { 2703 time_t normal_blank_time = 2704 mrfp->mrf_normal_blank_time; 2705 uint_t normal_pkt_cnt = 2706 mrfp->mrf_normal_pkt_count; 2707 2708 bzero(rx_ring, sizeof (ill_rx_ring_t)); 2709 2710 rx_ring->rr_blank = mrfp->mrf_blank; 2711 rx_ring->rr_handle = mrfp->mrf_arg; 2712 rx_ring->rr_ill = ill; 2713 rx_ring->rr_normal_blank_time = normal_blank_time; 2714 rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; 2715 2716 rx_ring->rr_max_blank_time = 2717 normal_blank_time * rr_max_blank_ratio; 2718 rx_ring->rr_min_blank_time = 2719 normal_blank_time * rr_min_blank_ratio; 2720 rx_ring->rr_max_pkt_cnt = 2721 normal_pkt_cnt * rr_max_pkt_cnt_ratio; 2722 rx_ring->rr_min_pkt_cnt = 2723 normal_pkt_cnt * rr_min_pkt_cnt_ratio; 2724 2725 rx_ring->rr_ring_state = ILL_RING_INUSE; 2726 mutex_exit(&ill->ill_lock); 2727 2728 DTRACE_PROBE2(ill__ring__add, (void *), ill, 2729 (int), ip_rx_index); 2730 return 
((mac_resource_handle_t)rx_ring); 2731 } 2732 } 2733 2734 /* 2735 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If 2736 * we have devices which can overwhelm this limit, ILL_MAX_RING 2737 * should be made configurable. Meanwhile it cause no panic because 2738 * driver will pass ip_input a NULL handle which will make 2739 * IP allocate the default squeue and Polling mode will not 2740 * be used for this ring. 2741 */ 2742 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2743 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2744 2745 mutex_exit(&ill->ill_lock); 2746 return (NULL); 2747 } 2748 2749 static boolean_t 2750 ill_capability_dls_init(ill_t *ill) 2751 { 2752 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2753 conn_t *connp; 2754 size_t sz; 2755 2756 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2757 if (ill_dls == NULL) { 2758 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2759 "soft_ring enabled for ill=%s (%p) but data " 2760 "structs uninitialized\n", ill->ill_name, 2761 (void *)ill); 2762 } 2763 return (B_TRUE); 2764 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2765 if (ill_dls == NULL) { 2766 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2767 "polling enabled for ill=%s (%p) but data " 2768 "structs uninitialized\n", ill->ill_name, 2769 (void *)ill); 2770 } 2771 return (B_TRUE); 2772 } 2773 2774 if (ill_dls != NULL) { 2775 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2776 /* Soft_Ring or polling is being re-enabled */ 2777 2778 connp = ill_dls->ill_unbind_conn; 2779 ASSERT(rx_ring != NULL); 2780 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2781 bzero((void *)rx_ring, 2782 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2783 ill_dls->ill_ring_tbl = rx_ring; 2784 ill_dls->ill_unbind_conn = connp; 2785 return (B_TRUE); 2786 } 2787 2788 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 2789 return (B_FALSE); 2790 2791 sz = sizeof (ill_dls_capab_t); 2792 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2793 2794 
ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2795 if (ill_dls == NULL) { 2796 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2797 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2798 (void *)ill); 2799 CONN_DEC_REF(connp); 2800 return (B_FALSE); 2801 } 2802 2803 /* Allocate space to hold ring table */ 2804 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2805 ill->ill_dls_capab = ill_dls; 2806 ill_dls->ill_unbind_conn = connp; 2807 return (B_TRUE); 2808 } 2809 2810 /* 2811 * ill_capability_dls_disable: disable soft_ring and/or polling 2812 * capability. Since any of the rings might already be in use, need 2813 * to call ipsq_clean_all() which gets behind the squeue to disable 2814 * direct calls if necessary. 2815 */ 2816 static void 2817 ill_capability_dls_disable(ill_t *ill) 2818 { 2819 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2820 2821 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2822 ipsq_clean_all(ill); 2823 ill_dls->ill_tx = NULL; 2824 ill_dls->ill_tx_handle = NULL; 2825 ill_dls->ill_dls_change_status = NULL; 2826 ill_dls->ill_dls_bind = NULL; 2827 ill_dls->ill_dls_unbind = NULL; 2828 } 2829 2830 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2831 } 2832 2833 static void 2834 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2835 dl_capability_sub_t *isub) 2836 { 2837 uint_t size; 2838 uchar_t *rptr; 2839 dl_capab_dls_t dls, *odls; 2840 ill_dls_capab_t *ill_dls; 2841 mblk_t *nmp = NULL; 2842 dl_capability_req_t *ocap; 2843 uint_t sub_dl_cap = isub->dl_cap; 2844 2845 if (!ill_capability_dls_init(ill)) 2846 return; 2847 ill_dls = ill->ill_dls_capab; 2848 2849 /* Copy locally to get the members aligned */ 2850 bcopy((void *)idls, (void *)&dls, 2851 sizeof (dl_capab_dls_t)); 2852 2853 /* Get the tx function and handle from dld */ 2854 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2855 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2856 2857 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2858 ill_dls->ill_dls_change_status = 2859 
(ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2860 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2861 ill_dls->ill_dls_unbind = 2862 (ip_dls_unbind_t)dls.dls_ring_unbind; 2863 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2864 } 2865 2866 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2867 isub->dl_length; 2868 2869 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2870 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2871 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2872 ill->ill_name, (void *)ill); 2873 return; 2874 } 2875 2876 /* initialize dl_capability_req_t */ 2877 rptr = nmp->b_rptr; 2878 ocap = (dl_capability_req_t *)rptr; 2879 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2880 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2881 rptr += sizeof (dl_capability_req_t); 2882 2883 /* initialize dl_capability_sub_t */ 2884 bcopy(isub, rptr, sizeof (*isub)); 2885 rptr += sizeof (*isub); 2886 2887 odls = (dl_capab_dls_t *)rptr; 2888 rptr += sizeof (dl_capab_dls_t); 2889 2890 /* initialize dl_capab_dls_t to be sent down */ 2891 dls.dls_rx_handle = (uintptr_t)ill; 2892 dls.dls_rx = (uintptr_t)ip_input; 2893 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2894 2895 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2896 dls.dls_ring_cnt = ip_soft_rings_cnt; 2897 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2898 dls.dls_flags = SOFT_RING_ENABLE; 2899 } else { 2900 dls.dls_flags = POLL_ENABLE; 2901 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2902 "to enable polling\n", ill->ill_name)); 2903 } 2904 bcopy((void *)&dls, (void *)odls, 2905 sizeof (dl_capab_dls_t)); 2906 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2907 /* 2908 * nmp points to a DL_CAPABILITY_REQ message to 2909 * enable either soft_ring or polling 2910 */ 2911 ill_dlpi_send(ill, nmp); 2912 } 2913 2914 static void 2915 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2916 { 2917 mblk_t *mp; 2918 
dl_capab_dls_t *idls; 2919 dl_capability_sub_t *dl_subcap; 2920 int size; 2921 2922 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2923 return; 2924 2925 ASSERT(ill->ill_dls_capab != NULL); 2926 2927 size = sizeof (*dl_subcap) + sizeof (*idls); 2928 2929 mp = allocb(size, BPRI_HI); 2930 if (mp == NULL) { 2931 ip1dbg(("ill_capability_dls_reset: unable to allocate " 2932 "request to disable soft_ring\n")); 2933 return; 2934 } 2935 2936 mp->b_wptr = mp->b_rptr + size; 2937 2938 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2939 dl_subcap->dl_length = sizeof (*idls); 2940 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2941 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; 2942 else 2943 dl_subcap->dl_cap = DL_CAPAB_POLL; 2944 2945 idls = (dl_capab_dls_t *)(dl_subcap + 1); 2946 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 2947 idls->dls_flags = SOFT_RING_DISABLE; 2948 else 2949 idls->dls_flags = POLL_DISABLE; 2950 2951 if (*sc_mp != NULL) 2952 linkb(*sc_mp, mp); 2953 else 2954 *sc_mp = mp; 2955 } 2956 2957 /* 2958 * Process a soft_ring/poll capability negotiation ack received 2959 * from a DLS Provider.isub must point to the sub-capability 2960 * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. 2961 */ 2962 static void 2963 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2964 { 2965 dl_capab_dls_t *idls; 2966 uint_t sub_dl_cap = isub->dl_cap; 2967 uint8_t *capend; 2968 2969 ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || 2970 sub_dl_cap == DL_CAPAB_POLL); 2971 2972 if (ill->ill_isv6) 2973 return; 2974 2975 /* 2976 * Note: range checks here are not absolutely sufficient to 2977 * make us robust against malformed messages sent by drivers; 2978 * this is in keeping with the rest of IP's dlpi handling. 
2979 * (Remember, it's coming from something else in the kernel 2980 * address space) 2981 */ 2982 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2983 if (capend > mp->b_wptr) { 2984 cmn_err(CE_WARN, "ill_capability_dls_ack: " 2985 "malformed sub-capability too long for mblk"); 2986 return; 2987 } 2988 2989 /* 2990 * There are two types of acks we process here: 2991 * 1. acks in reply to a (first form) generic capability req 2992 * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) 2993 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE 2994 * capability req. 2995 */ 2996 idls = (dl_capab_dls_t *)(isub + 1); 2997 2998 if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { 2999 ip1dbg(("ill_capability_dls_ack: mid token for dls " 3000 "capability isn't as expected; pass-thru " 3001 "module(s) detected, discarding capability\n")); 3002 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 3003 /* 3004 * This is a capability renegotitation case. 3005 * The interface better be unusable at this 3006 * point other wise bad things will happen 3007 * if we disable direct calls on a running 3008 * and up interface. 3009 */ 3010 ill_capability_dls_disable(ill); 3011 } 3012 return; 3013 } 3014 3015 switch (idls->dls_flags) { 3016 default: 3017 /* Disable if unknown flag */ 3018 case SOFT_RING_DISABLE: 3019 case POLL_DISABLE: 3020 ill_capability_dls_disable(ill); 3021 break; 3022 case SOFT_RING_CAPABLE: 3023 case POLL_CAPABLE: 3024 /* 3025 * If the capability was already enabled, its safe 3026 * to disable it first to get rid of stale information 3027 * and then start enabling it again. 
3028 */ 3029 ill_capability_dls_disable(ill); 3030 ill_capability_dls_capable(ill, idls, isub); 3031 break; 3032 case SOFT_RING_ENABLE: 3033 case POLL_ENABLE: 3034 mutex_enter(&ill->ill_lock); 3035 if (sub_dl_cap == DL_CAPAB_SOFT_RING && 3036 !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { 3037 ASSERT(ill->ill_dls_capab != NULL); 3038 ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; 3039 } 3040 if (sub_dl_cap == DL_CAPAB_POLL && 3041 !(ill->ill_capabilities & ILL_CAPAB_POLL)) { 3042 ASSERT(ill->ill_dls_capab != NULL); 3043 ill->ill_capabilities |= ILL_CAPAB_POLL; 3044 ip1dbg(("ill_capability_dls_ack: interface %s " 3045 "has enabled polling\n", ill->ill_name)); 3046 } 3047 mutex_exit(&ill->ill_lock); 3048 break; 3049 } 3050 } 3051 3052 /* 3053 * Process a hardware checksum offload capability negotiation ack received 3054 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 3055 * of a DL_CAPABILITY_ACK message. 3056 */ 3057 static void 3058 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3059 { 3060 dl_capability_req_t *ocap; 3061 dl_capab_hcksum_t *ihck, *ohck; 3062 ill_hcksum_capab_t **ill_hcksum; 3063 mblk_t *nmp = NULL; 3064 uint_t sub_dl_cap = isub->dl_cap; 3065 uint8_t *capend; 3066 3067 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 3068 3069 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 3070 3071 /* 3072 * Note: range checks here are not absolutely sufficient to 3073 * make us robust against malformed messages sent by drivers; 3074 * this is in keeping with the rest of IP's dlpi handling. 3075 * (Remember, it's coming from something else in the kernel 3076 * address space) 3077 */ 3078 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3079 if (capend > mp->b_wptr) { 3080 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3081 "malformed sub-capability too long for mblk"); 3082 return; 3083 } 3084 3085 /* 3086 * There are two types of acks we process here: 3087 * 1. 
acks in reply to a (first form) generic capability req 3088 * (no ENABLE flag set) 3089 * 2. acks in reply to a ENABLE capability req. 3090 * (ENABLE flag set) 3091 */ 3092 ihck = (dl_capab_hcksum_t *)(isub + 1); 3093 3094 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3095 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3096 "unsupported hardware checksum " 3097 "sub-capability (version %d, expected %d)", 3098 ihck->hcksum_version, HCKSUM_VERSION_1); 3099 return; 3100 } 3101 3102 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3103 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3104 "checksum capability isn't as expected; pass-thru " 3105 "module(s) detected, discarding capability\n")); 3106 return; 3107 } 3108 3109 #define CURR_HCKSUM_CAPAB \ 3110 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3111 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3112 3113 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3114 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3115 /* do ENABLE processing */ 3116 if (*ill_hcksum == NULL) { 3117 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3118 KM_NOSLEEP); 3119 3120 if (*ill_hcksum == NULL) { 3121 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3122 "could not enable hcksum version %d " 3123 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3124 ill->ill_name); 3125 return; 3126 } 3127 } 3128 3129 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3130 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3131 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3132 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3133 "has enabled hardware checksumming\n ", 3134 ill->ill_name)); 3135 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3136 /* 3137 * Enabling hardware checksum offload 3138 * Currently IP supports {TCP,UDP}/IPv4 3139 * partial and full cksum offload and 3140 * IPv4 header checksum offload. 
3141 * Allocate new mblk which will 3142 * contain a new capability request 3143 * to enable hardware checksum offload. 3144 */ 3145 uint_t size; 3146 uchar_t *rptr; 3147 3148 size = sizeof (dl_capability_req_t) + 3149 sizeof (dl_capability_sub_t) + isub->dl_length; 3150 3151 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3152 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3153 "could not enable hardware cksum for %s (ENOMEM)\n", 3154 ill->ill_name); 3155 return; 3156 } 3157 3158 rptr = nmp->b_rptr; 3159 /* initialize dl_capability_req_t */ 3160 ocap = (dl_capability_req_t *)nmp->b_rptr; 3161 ocap->dl_sub_offset = 3162 sizeof (dl_capability_req_t); 3163 ocap->dl_sub_length = 3164 sizeof (dl_capability_sub_t) + 3165 isub->dl_length; 3166 nmp->b_rptr += sizeof (dl_capability_req_t); 3167 3168 /* initialize dl_capability_sub_t */ 3169 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3170 nmp->b_rptr += sizeof (*isub); 3171 3172 /* initialize dl_capab_hcksum_t */ 3173 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3174 bcopy(ihck, ohck, sizeof (*ihck)); 3175 3176 nmp->b_rptr = rptr; 3177 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3178 3179 /* Set ENABLE flag */ 3180 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3181 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3182 3183 /* 3184 * nmp points to a DL_CAPABILITY_REQ message to enable 3185 * hardware checksum acceleration. 
3186 */ 3187 ill_dlpi_send(ill, nmp); 3188 } else { 3189 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3190 "advertised %x hardware checksum capability flags\n", 3191 ill->ill_name, ihck->hcksum_txflags)); 3192 } 3193 } 3194 3195 static void 3196 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3197 { 3198 mblk_t *mp; 3199 dl_capab_hcksum_t *hck_subcap; 3200 dl_capability_sub_t *dl_subcap; 3201 int size; 3202 3203 if (!ILL_HCKSUM_CAPABLE(ill)) 3204 return; 3205 3206 ASSERT(ill->ill_hcksum_capab != NULL); 3207 /* 3208 * Clear the capability flag for hardware checksum offload but 3209 * retain the ill_hcksum_capab structure since it's possible that 3210 * another thread is still referring to it. The structure only 3211 * gets deallocated when we destroy the ill. 3212 */ 3213 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3214 3215 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3216 3217 mp = allocb(size, BPRI_HI); 3218 if (mp == NULL) { 3219 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3220 "request to disable hardware checksum offload\n")); 3221 return; 3222 } 3223 3224 mp->b_wptr = mp->b_rptr + size; 3225 3226 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3227 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3228 dl_subcap->dl_length = sizeof (*hck_subcap); 3229 3230 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3231 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3232 hck_subcap->hcksum_txflags = 0; 3233 3234 if (*sc_mp != NULL) 3235 linkb(*sc_mp, mp); 3236 else 3237 *sc_mp = mp; 3238 } 3239 3240 static void 3241 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3242 { 3243 mblk_t *nmp = NULL; 3244 dl_capability_req_t *oc; 3245 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3246 ill_zerocopy_capab_t **ill_zerocopy_capab; 3247 uint_t sub_dl_cap = isub->dl_cap; 3248 uint8_t *capend; 3249 3250 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3251 3252 ill_zerocopy_capab = (ill_zerocopy_capab_t 
**)&ill->ill_zerocopy_capab; 3253 3254 /* 3255 * Note: range checks here are not absolutely sufficient to 3256 * make us robust against malformed messages sent by drivers; 3257 * this is in keeping with the rest of IP's dlpi handling. 3258 * (Remember, it's coming from something else in the kernel 3259 * address space) 3260 */ 3261 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3262 if (capend > mp->b_wptr) { 3263 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3264 "malformed sub-capability too long for mblk"); 3265 return; 3266 } 3267 3268 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3269 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3270 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3271 "unsupported ZEROCOPY sub-capability (version %d, " 3272 "expected %d)", zc_ic->zerocopy_version, 3273 ZEROCOPY_VERSION_1); 3274 return; 3275 } 3276 3277 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3278 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3279 "capability isn't as expected; pass-thru module(s) " 3280 "detected, discarding capability\n")); 3281 return; 3282 } 3283 3284 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3285 if (*ill_zerocopy_capab == NULL) { 3286 *ill_zerocopy_capab = 3287 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3288 KM_NOSLEEP); 3289 3290 if (*ill_zerocopy_capab == NULL) { 3291 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3292 "could not enable Zero-copy version %d " 3293 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3294 ill->ill_name); 3295 return; 3296 } 3297 } 3298 3299 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3300 "supports Zero-copy version %d\n", ill->ill_name, 3301 ZEROCOPY_VERSION_1)); 3302 3303 (*ill_zerocopy_capab)->ill_zerocopy_version = 3304 zc_ic->zerocopy_version; 3305 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3306 zc_ic->zerocopy_flags; 3307 3308 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3309 } else { 3310 uint_t size; 3311 uchar_t *rptr; 3312 3313 size = sizeof 
(dl_capability_req_t) + 3314 sizeof (dl_capability_sub_t) + 3315 sizeof (dl_capab_zerocopy_t); 3316 3317 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3318 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3319 "could not enable zerocopy for %s (ENOMEM)\n", 3320 ill->ill_name); 3321 return; 3322 } 3323 3324 rptr = nmp->b_rptr; 3325 /* initialize dl_capability_req_t */ 3326 oc = (dl_capability_req_t *)rptr; 3327 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3328 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3329 sizeof (dl_capab_zerocopy_t); 3330 rptr += sizeof (dl_capability_req_t); 3331 3332 /* initialize dl_capability_sub_t */ 3333 bcopy(isub, rptr, sizeof (*isub)); 3334 rptr += sizeof (*isub); 3335 3336 /* initialize dl_capab_zerocopy_t */ 3337 zc_oc = (dl_capab_zerocopy_t *)rptr; 3338 *zc_oc = *zc_ic; 3339 3340 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3341 "to enable zero-copy version %d\n", ill->ill_name, 3342 ZEROCOPY_VERSION_1)); 3343 3344 /* set VMSAFE_MEM flag */ 3345 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3346 3347 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3348 ill_dlpi_send(ill, nmp); 3349 } 3350 } 3351 3352 static void 3353 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3354 { 3355 mblk_t *mp; 3356 dl_capab_zerocopy_t *zerocopy_subcap; 3357 dl_capability_sub_t *dl_subcap; 3358 int size; 3359 3360 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3361 return; 3362 3363 ASSERT(ill->ill_zerocopy_capab != NULL); 3364 /* 3365 * Clear the capability flag for Zero-copy but retain the 3366 * ill_zerocopy_capab structure since it's possible that another 3367 * thread is still referring to it. The structure only gets 3368 * deallocated when we destroy the ill. 
3369 */ 3370 ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; 3371 3372 size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 3373 3374 mp = allocb(size, BPRI_HI); 3375 if (mp == NULL) { 3376 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " 3377 "request to disable Zero-copy\n")); 3378 return; 3379 } 3380 3381 mp->b_wptr = mp->b_rptr + size; 3382 3383 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3384 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 3385 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 3386 3387 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 3388 zerocopy_subcap->zerocopy_version = 3389 ill->ill_zerocopy_capab->ill_zerocopy_version; 3390 zerocopy_subcap->zerocopy_flags = 0; 3391 3392 if (*sc_mp != NULL) 3393 linkb(*sc_mp, mp); 3394 else 3395 *sc_mp = mp; 3396 } 3397 3398 /* 3399 * Consume a new-style hardware capabilities negotiation ack. 3400 * Called from ip_rput_dlpi_writer(). 3401 */ 3402 void 3403 ill_capability_ack(ill_t *ill, mblk_t *mp) 3404 { 3405 dl_capability_ack_t *capp; 3406 dl_capability_sub_t *subp, *endp; 3407 3408 if (ill->ill_capab_state == IDMS_INPROGRESS) 3409 ill->ill_capab_state = IDMS_OK; 3410 3411 capp = (dl_capability_ack_t *)mp->b_rptr; 3412 3413 if (capp->dl_sub_length == 0) 3414 /* no new-style capabilities */ 3415 return; 3416 3417 /* make sure the driver supplied correct dl_sub_length */ 3418 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3419 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3420 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3421 return; 3422 } 3423 3424 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3425 /* 3426 * There are sub-capabilities. Process the ones we know about. 3427 * Loop until we don't have room for another sub-cap header.. 
3428 */ 3429 for (subp = SC(capp, capp->dl_sub_offset), 3430 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3431 subp <= endp; 3432 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3433 3434 switch (subp->dl_cap) { 3435 case DL_CAPAB_ID_WRAPPER: 3436 ill_capability_id_ack(ill, mp, subp); 3437 break; 3438 default: 3439 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3440 break; 3441 } 3442 } 3443 #undef SC 3444 } 3445 3446 /* 3447 * This routine is called to scan the fragmentation reassembly table for 3448 * the specified ILL for any packets that are starting to smell. 3449 * dead_interval is the maximum time in seconds that will be tolerated. It 3450 * will either be the value specified in ip_g_frag_timeout, or zero if the 3451 * ILL is shutting down and it is time to blow everything off. 3452 * 3453 * It returns the number of seconds (as a time_t) that the next frag timer 3454 * should be scheduled for, 0 meaning that the timer doesn't need to be 3455 * re-started. Note that the method of calculating next_timeout isn't 3456 * entirely accurate since time will flow between the time we grab 3457 * current_time and the time we schedule the next timeout. This isn't a 3458 * big problem since this is the timer for sending an ICMP reassembly time 3459 * exceeded messages, and it doesn't have to be exactly accurate. 3460 * 3461 * This function is 3462 * sometimes called as writer, although this is not required. 3463 */ 3464 time_t 3465 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3466 { 3467 ipfb_t *ipfb; 3468 ipfb_t *endp; 3469 ipf_t *ipf; 3470 ipf_t *ipfnext; 3471 mblk_t *mp; 3472 time_t current_time = gethrestime_sec(); 3473 time_t next_timeout = 0; 3474 uint32_t hdr_length; 3475 mblk_t *send_icmp_head; 3476 mblk_t *send_icmp_head_v6; 3477 3478 ipfb = ill->ill_frag_hash_tbl; 3479 if (ipfb == NULL) 3480 return (B_FALSE); 3481 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3482 /* Walk the frag hash table. 
*/ 3483 for (; ipfb < endp; ipfb++) { 3484 send_icmp_head = NULL; 3485 send_icmp_head_v6 = NULL; 3486 mutex_enter(&ipfb->ipfb_lock); 3487 while ((ipf = ipfb->ipfb_ipf) != 0) { 3488 time_t frag_time = current_time - ipf->ipf_timestamp; 3489 time_t frag_timeout; 3490 3491 if (frag_time < dead_interval) { 3492 /* 3493 * There are some outstanding fragments 3494 * that will timeout later. Make note of 3495 * the time so that we can reschedule the 3496 * next timeout appropriately. 3497 */ 3498 frag_timeout = dead_interval - frag_time; 3499 if (next_timeout == 0 || 3500 frag_timeout < next_timeout) { 3501 next_timeout = frag_timeout; 3502 } 3503 break; 3504 } 3505 /* Time's up. Get it out of here. */ 3506 hdr_length = ipf->ipf_nf_hdr_len; 3507 ipfnext = ipf->ipf_hash_next; 3508 if (ipfnext) 3509 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3510 *ipf->ipf_ptphn = ipfnext; 3511 mp = ipf->ipf_mp->b_cont; 3512 for (; mp; mp = mp->b_cont) { 3513 /* Extra points for neatness. */ 3514 IP_REASS_SET_START(mp, 0); 3515 IP_REASS_SET_END(mp, 0); 3516 } 3517 mp = ipf->ipf_mp->b_cont; 3518 ill->ill_frag_count -= ipf->ipf_count; 3519 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3520 ipfb->ipfb_count -= ipf->ipf_count; 3521 ASSERT(ipfb->ipfb_frag_pkts > 0); 3522 ipfb->ipfb_frag_pkts--; 3523 /* 3524 * We do not send any icmp message from here because 3525 * we currently are holding the ipfb_lock for this 3526 * hash chain. If we try and send any icmp messages 3527 * from here we may end up via a put back into ip 3528 * trying to get the same lock, causing a recursive 3529 * mutex panic. Instead we build a list and send all 3530 * the icmp messages after we have dropped the lock. 
3531 */ 3532 if (ill->ill_isv6) { 3533 BUMP_MIB(ill->ill_ip6_mib, ipv6ReasmFails); 3534 if (hdr_length != 0) { 3535 mp->b_next = send_icmp_head_v6; 3536 send_icmp_head_v6 = mp; 3537 } else { 3538 freemsg(mp); 3539 } 3540 } else { 3541 BUMP_MIB(&ip_mib, ipReasmFails); 3542 if (hdr_length != 0) { 3543 mp->b_next = send_icmp_head; 3544 send_icmp_head = mp; 3545 } else { 3546 freemsg(mp); 3547 } 3548 } 3549 freeb(ipf->ipf_mp); 3550 } 3551 mutex_exit(&ipfb->ipfb_lock); 3552 /* 3553 * Now need to send any icmp messages that we delayed from 3554 * above. 3555 */ 3556 while (send_icmp_head_v6 != NULL) { 3557 mp = send_icmp_head_v6; 3558 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3559 mp->b_next = NULL; 3560 icmp_time_exceeded_v6(ill->ill_wq, mp, 3561 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, B_FALSE); 3562 } 3563 while (send_icmp_head != NULL) { 3564 mp = send_icmp_head; 3565 send_icmp_head = send_icmp_head->b_next; 3566 mp->b_next = NULL; 3567 icmp_time_exceeded(ill->ill_wq, mp, 3568 ICMP_REASSEMBLY_TIME_EXCEEDED); 3569 } 3570 } 3571 /* 3572 * A non-dying ILL will use the return value to decide whether to 3573 * restart the frag timer, and for how long. 3574 */ 3575 return (next_timeout); 3576 } 3577 3578 /* 3579 * This routine is called when the approximate count of mblk memory used 3580 * for the specified ILL has exceeded max_count. 3581 */ 3582 void 3583 ill_frag_prune(ill_t *ill, uint_t max_count) 3584 { 3585 ipfb_t *ipfb; 3586 ipf_t *ipf; 3587 size_t count; 3588 3589 /* 3590 * If we are here within ip_min_frag_prune_time msecs remove 3591 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3592 * ill_frag_free_num_pkts. 3593 */ 3594 mutex_enter(&ill->ill_lock); 3595 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3596 (ip_min_frag_prune_time != 0 ? 
3597 ip_min_frag_prune_time : msec_per_tick)) { 3598 3599 ill->ill_frag_free_num_pkts++; 3600 3601 } else { 3602 ill->ill_frag_free_num_pkts = 0; 3603 } 3604 ill->ill_last_frag_clean_time = lbolt; 3605 mutex_exit(&ill->ill_lock); 3606 3607 /* 3608 * free ill_frag_free_num_pkts oldest packets from each bucket. 3609 */ 3610 if (ill->ill_frag_free_num_pkts != 0) { 3611 int ix; 3612 3613 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3614 ipfb = &ill->ill_frag_hash_tbl[ix]; 3615 mutex_enter(&ipfb->ipfb_lock); 3616 if (ipfb->ipfb_ipf != NULL) { 3617 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3618 ill->ill_frag_free_num_pkts); 3619 } 3620 mutex_exit(&ipfb->ipfb_lock); 3621 } 3622 } 3623 /* 3624 * While the reassembly list for this ILL is too big, prune a fragment 3625 * queue by age, oldest first. Note that the per ILL count is 3626 * approximate, while the per frag hash bucket counts are accurate. 3627 */ 3628 while (ill->ill_frag_count > max_count) { 3629 int ix; 3630 ipfb_t *oipfb = NULL; 3631 uint_t oldest = UINT_MAX; 3632 3633 count = 0; 3634 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3635 ipfb = &ill->ill_frag_hash_tbl[ix]; 3636 mutex_enter(&ipfb->ipfb_lock); 3637 ipf = ipfb->ipfb_ipf; 3638 if (ipf != NULL && ipf->ipf_gen < oldest) { 3639 oldest = ipf->ipf_gen; 3640 oipfb = ipfb; 3641 } 3642 count += ipfb->ipfb_count; 3643 mutex_exit(&ipfb->ipfb_lock); 3644 } 3645 /* Refresh the per ILL count */ 3646 ill->ill_frag_count = count; 3647 if (oipfb == NULL) { 3648 ill->ill_frag_count = 0; 3649 break; 3650 } 3651 if (count <= max_count) 3652 return; /* Somebody beat us to it, nothing to do */ 3653 mutex_enter(&oipfb->ipfb_lock); 3654 ipf = oipfb->ipfb_ipf; 3655 if (ipf != NULL) { 3656 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3657 } 3658 mutex_exit(&oipfb->ipfb_lock); 3659 } 3660 } 3661 3662 /* 3663 * free 'free_cnt' fragmented packets starting at ipf. 
3664 */ 3665 void 3666 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3667 { 3668 size_t count; 3669 mblk_t *mp; 3670 mblk_t *tmp; 3671 ipf_t **ipfp = ipf->ipf_ptphn; 3672 3673 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3674 ASSERT(ipfp != NULL); 3675 ASSERT(ipf != NULL); 3676 3677 while (ipf != NULL && free_cnt-- > 0) { 3678 count = ipf->ipf_count; 3679 mp = ipf->ipf_mp; 3680 ipf = ipf->ipf_hash_next; 3681 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3682 IP_REASS_SET_START(tmp, 0); 3683 IP_REASS_SET_END(tmp, 0); 3684 } 3685 ill->ill_frag_count -= count; 3686 ASSERT(ipfb->ipfb_count >= count); 3687 ipfb->ipfb_count -= count; 3688 ASSERT(ipfb->ipfb_frag_pkts > 0); 3689 ipfb->ipfb_frag_pkts--; 3690 freemsg(mp); 3691 BUMP_MIB(&ip_mib, ipReasmFails); 3692 } 3693 3694 if (ipf) 3695 ipf->ipf_ptphn = ipfp; 3696 ipfp[0] = ipf; 3697 } 3698 3699 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3700 "obsolete and may be removed in a future release of Solaris. Use " \ 3701 "ifconfig(1M) to manipulate the forwarding status of an interface." 3702 3703 /* 3704 * For obsolete per-interface forwarding configuration; 3705 * called in response to ND_GET. 3706 */ 3707 /* ARGSUSED */ 3708 static int 3709 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3710 { 3711 ill_t *ill = (ill_t *)cp; 3712 3713 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3714 3715 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3716 return (0); 3717 } 3718 3719 /* 3720 * For obsolete per-interface forwarding configuration; 3721 * called in response to ND_SET. 
3722 */ 3723 /* ARGSUSED */ 3724 static int 3725 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3726 cred_t *ioc_cr) 3727 { 3728 long value; 3729 int retval; 3730 3731 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3732 3733 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3734 value < 0 || value > 1) { 3735 return (EINVAL); 3736 } 3737 3738 rw_enter(&ill_g_lock, RW_READER); 3739 retval = ill_forward_set(q, mp, (value != 0), cp); 3740 rw_exit(&ill_g_lock); 3741 return (retval); 3742 } 3743 3744 /* 3745 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3746 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3747 * up RTS_IFINFO routing socket messages for each interface whose flags we 3748 * change. 3749 */ 3750 /* ARGSUSED */ 3751 int 3752 ill_forward_set(queue_t *q, mblk_t *mp, boolean_t enable, caddr_t cp) 3753 { 3754 ill_t *ill = (ill_t *)cp; 3755 ill_group_t *illgrp; 3756 3757 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ill_g_lock)); 3758 3759 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3760 (!enable && !(ill->ill_flags & ILLF_ROUTER)) || 3761 (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) 3762 return (EINVAL); 3763 3764 /* 3765 * If the ill is in an IPMP group, set the forwarding policy on all 3766 * members of the group to the same value. 3767 */ 3768 illgrp = ill->ill_group; 3769 if (illgrp != NULL) { 3770 ill_t *tmp_ill; 3771 3772 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 3773 tmp_ill = tmp_ill->ill_group_next) { 3774 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3775 (enable ? "Enabling" : "Disabling"), 3776 (tmp_ill->ill_isv6 ? 
"IPv6" : "IPv4"), 3777 tmp_ill->ill_name)); 3778 mutex_enter(&tmp_ill->ill_lock); 3779 if (enable) 3780 tmp_ill->ill_flags |= ILLF_ROUTER; 3781 else 3782 tmp_ill->ill_flags &= ~ILLF_ROUTER; 3783 mutex_exit(&tmp_ill->ill_lock); 3784 if (tmp_ill->ill_isv6) 3785 ill_set_nce_router_flags(tmp_ill, enable); 3786 /* Notify routing socket listeners of this change. */ 3787 ip_rts_ifmsg(tmp_ill->ill_ipif); 3788 } 3789 } else { 3790 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3791 (enable ? "Enabling" : "Disabling"), 3792 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3793 mutex_enter(&ill->ill_lock); 3794 if (enable) 3795 ill->ill_flags |= ILLF_ROUTER; 3796 else 3797 ill->ill_flags &= ~ILLF_ROUTER; 3798 mutex_exit(&ill->ill_lock); 3799 if (ill->ill_isv6) 3800 ill_set_nce_router_flags(ill, enable); 3801 /* Notify routing socket listeners of this change. */ 3802 ip_rts_ifmsg(ill->ill_ipif); 3803 } 3804 3805 return (0); 3806 } 3807 3808 /* 3809 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3810 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3811 * set or clear. 3812 */ 3813 static void 3814 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3815 { 3816 ipif_t *ipif; 3817 nce_t *nce; 3818 3819 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3820 nce = ndp_lookup(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 3821 if (nce != NULL) { 3822 mutex_enter(&nce->nce_lock); 3823 if (enable) 3824 nce->nce_flags |= NCE_F_ISROUTER; 3825 else 3826 nce->nce_flags &= ~NCE_F_ISROUTER; 3827 mutex_exit(&nce->nce_lock); 3828 NCE_REFRELE(nce); 3829 } 3830 } 3831 } 3832 3833 /* 3834 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3835 * for this ill. Make sure the v6/v4 question has been answered about this 3836 * ill. The creation of this ndd variable is only for backwards compatibility. 
3837 * The preferred way to control per-interface IP forwarding is through the 3838 * ILLF_ROUTER interface flag. 3839 */ 3840 static int 3841 ill_set_ndd_name(ill_t *ill) 3842 { 3843 char *suffix; 3844 3845 ASSERT(IAM_WRITER_ILL(ill)); 3846 3847 if (ill->ill_isv6) 3848 suffix = ipv6_forward_suffix; 3849 else 3850 suffix = ipv4_forward_suffix; 3851 3852 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 3853 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 3854 /* 3855 * Copies over the '\0'. 3856 * Note that strlen(suffix) is always bounded. 3857 */ 3858 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 3859 strlen(suffix) + 1); 3860 3861 /* 3862 * Use of the nd table requires holding the reader lock. 3863 * Modifying the nd table thru nd_load/nd_unload requires 3864 * the writer lock. 3865 */ 3866 rw_enter(&ip_g_nd_lock, RW_WRITER); 3867 if (!nd_load(&ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 3868 nd_ill_forward_set, (caddr_t)ill)) { 3869 /* 3870 * If the nd_load failed, it only meant that it could not 3871 * allocate a new bunch of room for further NDD expansion. 3872 * Because of that, the ill_ndd_name will be set to 0, and 3873 * this interface is at the mercy of the global ip_forwarding 3874 * variable. 3875 */ 3876 rw_exit(&ip_g_nd_lock); 3877 ill->ill_ndd_name = NULL; 3878 return (ENOMEM); 3879 } 3880 rw_exit(&ip_g_nd_lock); 3881 return (0); 3882 } 3883 3884 /* 3885 * Intializes the context structure and returns the first ill in the list 3886 * cuurently start_list and end_list can have values: 3887 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 3888 * IP_V4_G_HEAD Traverse IPV4 list only. 3889 * IP_V6_G_HEAD Traverse IPV6 list only. 3890 */ 3891 3892 /* 3893 * We don't check for CONDEMNED ills here. Caller must do that if 3894 * necessary under the ill lock. 
3895 */ 3896 ill_t * 3897 ill_first(int start_list, int end_list, ill_walk_context_t *ctx) 3898 { 3899 ill_if_t *ifp; 3900 ill_t *ill; 3901 avl_tree_t *avl_tree; 3902 3903 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3904 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3905 3906 /* 3907 * setup the lists to search 3908 */ 3909 if (end_list != MAX_G_HEADS) { 3910 ctx->ctx_current_list = start_list; 3911 ctx->ctx_last_list = end_list; 3912 } else { 3913 ctx->ctx_last_list = MAX_G_HEADS - 1; 3914 ctx->ctx_current_list = 0; 3915 } 3916 3917 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3918 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 3919 if (ifp != (ill_if_t *) 3920 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 3921 avl_tree = &ifp->illif_avl_by_ppa; 3922 ill = avl_first(avl_tree); 3923 /* 3924 * ill is guaranteed to be non NULL or ifp should have 3925 * not existed. 3926 */ 3927 ASSERT(ill != NULL); 3928 return (ill); 3929 } 3930 ctx->ctx_current_list++; 3931 } 3932 3933 return (NULL); 3934 } 3935 3936 /* 3937 * returns the next ill in the list. ill_first() must have been called 3938 * before calling ill_next() or bad things will happen. 3939 */ 3940 3941 /* 3942 * We don't check for CONDEMNED ills here. Caller must do that if 3943 * necessary under the ill lock. 3944 */ 3945 ill_t * 3946 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 3947 { 3948 ill_if_t *ifp; 3949 ill_t *ill; 3950 3951 3952 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 3953 ASSERT(lastill->ill_ifptr != (ill_if_t *) 3954 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)); 3955 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 3956 AVL_AFTER)) != NULL) { 3957 return (ill); 3958 } 3959 3960 /* goto next ill_ifp in the list. 
*/ 3961 ifp = lastill->ill_ifptr->illif_next; 3962 3963 /* make sure not at end of circular list */ 3964 while (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 3965 if (++ctx->ctx_current_list > ctx->ctx_last_list) 3966 return (NULL); 3967 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 3968 } 3969 3970 return (avl_first(&ifp->illif_avl_by_ppa)); 3971 } 3972 3973 /* 3974 * Check interface name for correct format which is name+ppa. 3975 * name can contain characters and digits, the right most digits 3976 * make up the ppa number. use of octal is not allowed, name must contain 3977 * a ppa, return pointer to the start of ppa. 3978 * In case of error return NULL. 3979 */ 3980 static char * 3981 ill_get_ppa_ptr(char *name) 3982 { 3983 int namelen = mi_strlen(name); 3984 3985 int len = namelen; 3986 3987 name += len; 3988 while (len > 0) { 3989 name--; 3990 if (*name < '0' || *name > '9') 3991 break; 3992 len--; 3993 } 3994 3995 /* empty string, all digits, or no trailing digits */ 3996 if (len == 0 || len == (int)namelen) 3997 return (NULL); 3998 3999 name++; 4000 /* check for attempted use of octal */ 4001 if (*name == '0' && len != (int)namelen - 1) 4002 return (NULL); 4003 return (name); 4004 } 4005 4006 /* 4007 * use avl tree to locate the ill. 
4008 */ 4009 static ill_t * 4010 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4011 ipsq_func_t func, int *error) 4012 { 4013 char *ppa_ptr = NULL; 4014 int len; 4015 uint_t ppa; 4016 ill_t *ill = NULL; 4017 ill_if_t *ifp; 4018 int list; 4019 ipsq_t *ipsq; 4020 4021 if (error != NULL) 4022 *error = 0; 4023 4024 /* 4025 * get ppa ptr 4026 */ 4027 if (isv6) 4028 list = IP_V6_G_HEAD; 4029 else 4030 list = IP_V4_G_HEAD; 4031 4032 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4033 if (error != NULL) 4034 *error = ENXIO; 4035 return (NULL); 4036 } 4037 4038 len = ppa_ptr - name + 1; 4039 4040 ppa = stoi(&ppa_ptr); 4041 4042 ifp = IP_VX_ILL_G_LIST(list); 4043 4044 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4045 /* 4046 * match is done on len - 1 as the name is not null 4047 * terminated it contains ppa in addition to the interface 4048 * name. 4049 */ 4050 if ((ifp->illif_name_len == len) && 4051 bcmp(ifp->illif_name, name, len - 1) == 0) { 4052 break; 4053 } else { 4054 ifp = ifp->illif_next; 4055 } 4056 } 4057 4058 4059 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4060 /* 4061 * Even the interface type does not exist. 
4062 */ 4063 if (error != NULL) 4064 *error = ENXIO; 4065 return (NULL); 4066 } 4067 4068 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4069 if (ill != NULL) { 4070 /* 4071 * The block comment at the start of ipif_down 4072 * explains the use of the macros used below 4073 */ 4074 GRAB_CONN_LOCK(q); 4075 mutex_enter(&ill->ill_lock); 4076 if (ILL_CAN_LOOKUP(ill)) { 4077 ill_refhold_locked(ill); 4078 mutex_exit(&ill->ill_lock); 4079 RELEASE_CONN_LOCK(q); 4080 return (ill); 4081 } else if (ILL_CAN_WAIT(ill, q)) { 4082 ipsq = ill->ill_phyint->phyint_ipsq; 4083 mutex_enter(&ipsq->ipsq_lock); 4084 mutex_exit(&ill->ill_lock); 4085 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4086 mutex_exit(&ipsq->ipsq_lock); 4087 RELEASE_CONN_LOCK(q); 4088 *error = EINPROGRESS; 4089 return (NULL); 4090 } 4091 mutex_exit(&ill->ill_lock); 4092 RELEASE_CONN_LOCK(q); 4093 } 4094 if (error != NULL) 4095 *error = ENXIO; 4096 return (NULL); 4097 } 4098 4099 /* 4100 * comparison function for use with avl. 4101 */ 4102 static int 4103 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4104 { 4105 uint_t ppa; 4106 uint_t ill_ppa; 4107 4108 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4109 4110 ppa = *((uint_t *)ppa_ptr); 4111 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4112 /* 4113 * We want the ill with the lowest ppa to be on the 4114 * top. 4115 */ 4116 if (ill_ppa < ppa) 4117 return (1); 4118 if (ill_ppa > ppa) 4119 return (-1); 4120 return (0); 4121 } 4122 4123 /* 4124 * remove an interface type from the global list. 
4125 */ 4126 static void 4127 ill_delete_interface_type(ill_if_t *interface) 4128 { 4129 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4130 4131 ASSERT(interface != NULL); 4132 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4133 4134 avl_destroy(&interface->illif_avl_by_ppa); 4135 if (interface->illif_ppa_arena != NULL) 4136 vmem_destroy(interface->illif_ppa_arena); 4137 4138 remque(interface); 4139 4140 mi_free(interface); 4141 } 4142 4143 /* 4144 * remove ill from the global list. 4145 */ 4146 static void 4147 ill_glist_delete(ill_t *ill) 4148 { 4149 if (ill == NULL) 4150 return; 4151 4152 rw_enter(&ill_g_lock, RW_WRITER); 4153 /* 4154 * If the ill was never inserted into the AVL tree 4155 * we skip the if branch. 4156 */ 4157 if (ill->ill_ifptr != NULL) { 4158 /* 4159 * remove from AVL tree and free ppa number 4160 */ 4161 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4162 4163 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4164 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4165 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4166 } 4167 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4168 ill_delete_interface_type(ill->ill_ifptr); 4169 } 4170 4171 /* 4172 * Indicate ill is no longer in the list. 4173 */ 4174 ill->ill_ifptr = NULL; 4175 ill->ill_name_length = 0; 4176 ill->ill_name[0] = '\0'; 4177 ill->ill_ppa = UINT_MAX; 4178 } 4179 ill_phyint_free(ill); 4180 rw_exit(&ill_g_lock); 4181 } 4182 4183 /* 4184 * allocate a ppa, if the number of plumbed interfaces of this type are 4185 * less than ill_no_arena do a linear search to find a unused ppa. 4186 * When the number goes beyond ill_no_arena switch to using an arena. 4187 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4188 * is the return value for an error condition, so allocation starts at one 4189 * and is decremented by one. 
4190 */ 4191 static int 4192 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4193 { 4194 ill_t *tmp_ill; 4195 uint_t start, end; 4196 int ppa; 4197 4198 if (ifp->illif_ppa_arena == NULL && 4199 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4200 /* 4201 * Create an arena. 4202 */ 4203 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4204 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4205 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4206 /* allocate what has already been assigned */ 4207 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4208 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4209 tmp_ill, AVL_AFTER)) { 4210 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4211 1, /* size */ 4212 1, /* align/quantum */ 4213 0, /* phase */ 4214 0, /* nocross */ 4215 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), /* minaddr */ 4216 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), /* maxaddr */ 4217 VM_NOSLEEP|VM_FIRSTFIT); 4218 if (ppa == 0) { 4219 ip1dbg(("ill_alloc_ppa: ppa allocation" 4220 " failed while switching")); 4221 vmem_destroy(ifp->illif_ppa_arena); 4222 ifp->illif_ppa_arena = NULL; 4223 break; 4224 } 4225 } 4226 } 4227 4228 if (ifp->illif_ppa_arena != NULL) { 4229 if (ill->ill_ppa == UINT_MAX) { 4230 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4231 1, VM_NOSLEEP|VM_FIRSTFIT); 4232 if (ppa == 0) 4233 return (EAGAIN); 4234 ill->ill_ppa = --ppa; 4235 } else { 4236 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4237 1, /* size */ 4238 1, /* align/quantum */ 4239 0, /* phase */ 4240 0, /* nocross */ 4241 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4242 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4243 VM_NOSLEEP|VM_FIRSTFIT); 4244 /* 4245 * Most likely the allocation failed because 4246 * the requested ppa was in use. 4247 */ 4248 if (ppa == 0) 4249 return (EEXIST); 4250 } 4251 return (0); 4252 } 4253 4254 /* 4255 * No arena is in use and not enough (>ill_no_arena) interfaces have 4256 * been plumbed to create one. 
Do a linear search to get a unused ppa. 4257 */ 4258 if (ill->ill_ppa == UINT_MAX) { 4259 end = UINT_MAX - 1; 4260 start = 0; 4261 } else { 4262 end = start = ill->ill_ppa; 4263 } 4264 4265 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4266 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4267 if (start++ >= end) { 4268 if (ill->ill_ppa == UINT_MAX) 4269 return (EAGAIN); 4270 else 4271 return (EEXIST); 4272 } 4273 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4274 } 4275 ill->ill_ppa = start; 4276 return (0); 4277 } 4278 4279 /* 4280 * Insert ill into the list of configured ill's. Once this function completes, 4281 * the ill is globally visible and is available through lookups. More precisely 4282 * this happens after the caller drops the ill_g_lock. 4283 */ 4284 static int 4285 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4286 { 4287 ill_if_t *ill_interface; 4288 avl_index_t where = 0; 4289 int error; 4290 int name_length; 4291 int index; 4292 boolean_t check_length = B_FALSE; 4293 4294 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4295 4296 name_length = mi_strlen(name) + 1; 4297 4298 if (isv6) 4299 index = IP_V6_G_HEAD; 4300 else 4301 index = IP_V4_G_HEAD; 4302 4303 ill_interface = IP_VX_ILL_G_LIST(index); 4304 /* 4305 * Search for interface type based on name 4306 */ 4307 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4308 if ((ill_interface->illif_name_len == name_length) && 4309 (strcmp(ill_interface->illif_name, name) == 0)) { 4310 break; 4311 } 4312 ill_interface = ill_interface->illif_next; 4313 } 4314 4315 /* 4316 * Interface type not found, create one. 
4317 */ 4318 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4319 4320 ill_g_head_t ghead; 4321 4322 /* 4323 * allocate ill_if_t structure 4324 */ 4325 4326 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4327 if (ill_interface == NULL) { 4328 return (ENOMEM); 4329 } 4330 4331 4332 4333 (void) strcpy(ill_interface->illif_name, name); 4334 ill_interface->illif_name_len = name_length; 4335 4336 avl_create(&ill_interface->illif_avl_by_ppa, 4337 ill_compare_ppa, sizeof (ill_t), 4338 offsetof(struct ill_s, ill_avl_byppa)); 4339 4340 /* 4341 * link the structure in the back to maintain order 4342 * of configuration for ifconfig output. 4343 */ 4344 ghead = ill_g_heads[index]; 4345 insque(ill_interface, ghead.ill_g_list_tail); 4346 4347 } 4348 4349 if (ill->ill_ppa == UINT_MAX) 4350 check_length = B_TRUE; 4351 4352 error = ill_alloc_ppa(ill_interface, ill); 4353 if (error != 0) { 4354 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4355 ill_delete_interface_type(ill->ill_ifptr); 4356 return (error); 4357 } 4358 4359 /* 4360 * When the ppa is choosen by the system, check that there is 4361 * enough space to insert ppa. if a specific ppa was passed in this 4362 * check is not required as the interface name passed in will have 4363 * the right ppa in it. 4364 */ 4365 if (check_length) { 4366 /* 4367 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4368 */ 4369 char buf[sizeof (uint_t) * 3]; 4370 4371 /* 4372 * convert ppa to string to calculate the amount of space 4373 * required for it in the name. 4374 */ 4375 numtos(ill->ill_ppa, buf); 4376 4377 /* Do we have enough space to insert ppa ? 
*/ 4378 4379 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4380 /* Free ppa and interface type struct */ 4381 if (ill_interface->illif_ppa_arena != NULL) { 4382 vmem_free(ill_interface->illif_ppa_arena, 4383 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4384 } 4385 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4386 0) { 4387 ill_delete_interface_type(ill->ill_ifptr); 4388 } 4389 4390 return (EINVAL); 4391 } 4392 } 4393 4394 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4395 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4396 4397 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4398 &where); 4399 ill->ill_ifptr = ill_interface; 4400 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4401 4402 ill_phyint_reinit(ill); 4403 return (0); 4404 } 4405 4406 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4407 static boolean_t 4408 ipsq_init(ill_t *ill) 4409 { 4410 ipsq_t *ipsq; 4411 4412 /* Init the ipsq and impicitly enter as writer */ 4413 ill->ill_phyint->phyint_ipsq = 4414 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4415 if (ill->ill_phyint->phyint_ipsq == NULL) 4416 return (B_FALSE); 4417 ipsq = ill->ill_phyint->phyint_ipsq; 4418 ipsq->ipsq_phyint_list = ill->ill_phyint; 4419 ill->ill_phyint->phyint_ipsq_next = NULL; 4420 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4421 ipsq->ipsq_refs = 1; 4422 ipsq->ipsq_writer = curthread; 4423 ipsq->ipsq_reentry_cnt = 1; 4424 #ifdef ILL_DEBUG 4425 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH); 4426 #endif 4427 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4428 return (B_TRUE); 4429 } 4430 4431 /* 4432 * ill_init is called by ip_open when a device control stream is opened. 4433 * It does a few initializations, and shoots a DL_INFO_REQ message down 4434 * to the driver. The response is later picked up in ip_rput_dlpi and 4435 * used to set up default mechanisms for talking to the driver. 
(Always 4436 * called as writer.) 4437 * 4438 * If this function returns error, ip_open will call ip_close which in 4439 * turn will call ill_delete to clean up any memory allocated here that 4440 * is not yet freed. 4441 */ 4442 int 4443 ill_init(queue_t *q, ill_t *ill) 4444 { 4445 int count; 4446 dl_info_req_t *dlir; 4447 mblk_t *info_mp; 4448 uchar_t *frag_ptr; 4449 4450 /* 4451 * The ill is initialized to zero by mi_alloc*(). In addition 4452 * some fields already contain valid values, initialized in 4453 * ip_open(), before we reach here. 4454 */ 4455 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4456 4457 ill->ill_rq = q; 4458 ill->ill_wq = WR(q); 4459 4460 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4461 BPRI_HI); 4462 if (info_mp == NULL) 4463 return (ENOMEM); 4464 4465 /* 4466 * Allocate sufficient space to contain our fragment hash table and 4467 * the device name. 4468 */ 4469 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4470 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4471 if (frag_ptr == NULL) { 4472 freemsg(info_mp); 4473 return (ENOMEM); 4474 } 4475 ill->ill_frag_ptr = frag_ptr; 4476 ill->ill_frag_free_num_pkts = 0; 4477 ill->ill_last_frag_clean_time = 0; 4478 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4479 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4480 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4481 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4482 NULL, MUTEX_DEFAULT, NULL); 4483 } 4484 4485 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4486 if (ill->ill_phyint == NULL) { 4487 freemsg(info_mp); 4488 mi_free(frag_ptr); 4489 return (ENOMEM); 4490 } 4491 4492 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4493 /* 4494 * For now pretend this is a v4 ill. We need to set phyint_ill* 4495 * at this point because of the following reason. 
If we can't 4496 * enter the ipsq at some point and cv_wait, the writer that 4497 * wakes us up tries to locate us using the list of all phyints 4498 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4499 * If we don't set it now, we risk a missed wakeup. 4500 */ 4501 ill->ill_phyint->phyint_illv4 = ill; 4502 ill->ill_ppa = UINT_MAX; 4503 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4504 4505 if (!ipsq_init(ill)) { 4506 freemsg(info_mp); 4507 mi_free(frag_ptr); 4508 mi_free(ill->ill_phyint); 4509 return (ENOMEM); 4510 } 4511 4512 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4513 4514 4515 /* Frag queue limit stuff */ 4516 ill->ill_frag_count = 0; 4517 ill->ill_ipf_gen = 0; 4518 4519 ill->ill_global_timer = INFINITY; 4520 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4521 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4522 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4523 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4524 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4525 4526 /* 4527 * Initialize IPv6 configuration variables. The IP module is always 4528 * opened as an IPv4 module. Instead tracking down the cases where 4529 * it switches to do ipv6, we'll just initialize the IPv6 configuration 4530 * here for convenience, this has no effect until the ill is set to do 4531 * IPv6. 4532 */ 4533 ill->ill_reachable_time = ND_REACHABLE_TIME; 4534 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4535 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4536 ill->ill_max_buf = ND_MAX_Q; 4537 ill->ill_refcnt = 0; 4538 4539 /* Send down the Info Request to the driver. 
*/ 4540 info_mp->b_datap->db_type = M_PCPROTO; 4541 dlir = (dl_info_req_t *)info_mp->b_rptr; 4542 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4543 dlir->dl_primitive = DL_INFO_REQ; 4544 4545 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4546 4547 qprocson(q); 4548 ill_dlpi_send(ill, info_mp); 4549 4550 return (0); 4551 } 4552 4553 /* 4554 * ill_dls_info 4555 * creates datalink socket info from the device. 4556 */ 4557 int 4558 ill_dls_info(struct sockaddr_dl *sdl, ipif_t *ipif) 4559 { 4560 size_t length; 4561 ill_t *ill = ipif->ipif_ill; 4562 4563 sdl->sdl_family = AF_LINK; 4564 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4565 sdl->sdl_type = ipif->ipif_type; 4566 (void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4567 length = mi_strlen(sdl->sdl_data); 4568 ASSERT(length < 256); 4569 sdl->sdl_nlen = (uchar_t)length; 4570 sdl->sdl_alen = ill->ill_phys_addr_length; 4571 mutex_enter(&ill->ill_lock); 4572 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) { 4573 bcopy(ill->ill_phys_addr, &sdl->sdl_data[length], 4574 ill->ill_phys_addr_length); 4575 } 4576 mutex_exit(&ill->ill_lock); 4577 sdl->sdl_slen = 0; 4578 return (sizeof (struct sockaddr_dl)); 4579 } 4580 4581 /* 4582 * ill_xarp_info 4583 * creates xarp info from the device. 
4584 */ 4585 static int 4586 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4587 { 4588 sdl->sdl_family = AF_LINK; 4589 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4590 sdl->sdl_type = ill->ill_type; 4591 (void) ipif_get_name(ill->ill_ipif, sdl->sdl_data, 4592 sizeof (sdl->sdl_data)); 4593 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4594 sdl->sdl_alen = ill->ill_phys_addr_length; 4595 sdl->sdl_slen = 0; 4596 return (sdl->sdl_nlen); 4597 } 4598 4599 static int 4600 loopback_kstat_update(kstat_t *ksp, int rw) 4601 { 4602 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 4603 4604 if (rw == KSTAT_WRITE) 4605 return (EACCES); 4606 kn[0].value.ui32 = loopback_packets; 4607 kn[1].value.ui32 = loopback_packets; 4608 return (0); 4609 } 4610 4611 4612 /* 4613 * Has ifindex been plumbed already. 4614 */ 4615 static boolean_t 4616 phyint_exists(uint_t index) 4617 { 4618 phyint_t *phyi; 4619 4620 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4621 /* 4622 * Indexes are stored in the phyint - a common structure 4623 * to both IPv4 and IPv6. 4624 */ 4625 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4626 (void *) &index, NULL); 4627 return (phyi != NULL); 4628 } 4629 4630 /* 4631 * Assign a unique interface index for the phyint. 4632 */ 4633 static boolean_t 4634 phyint_assign_ifindex(phyint_t *phyi) 4635 { 4636 uint_t starting_index; 4637 4638 ASSERT(phyi->phyint_ifindex == 0); 4639 if (!ill_index_wrap) { 4640 phyi->phyint_ifindex = ill_index++; 4641 if (ill_index == 0) { 4642 /* Reached the uint_t limit Next time wrap */ 4643 ill_index_wrap = B_TRUE; 4644 } 4645 return (B_TRUE); 4646 } 4647 4648 /* 4649 * Start reusing unused indexes. Note that we hold the ill_g_lock 4650 * at this point and don't want to call any function that attempts 4651 * to get the lock again. 
4652 */ 4653 starting_index = ill_index++; 4654 for (; ill_index != starting_index; ill_index++) { 4655 if (ill_index != 0 && !phyint_exists(ill_index)) { 4656 /* found unused index - use it */ 4657 phyi->phyint_ifindex = ill_index; 4658 return (B_TRUE); 4659 } 4660 } 4661 4662 /* 4663 * all interface indicies are inuse. 4664 */ 4665 return (B_FALSE); 4666 } 4667 4668 /* 4669 * Return a pointer to the ill which matches the supplied name. Note that 4670 * the ill name length includes the null termination character. (May be 4671 * called as writer.) 4672 * If do_alloc and the interface is "lo0" it will be automatically created. 4673 * Cannot bump up reference on condemned ills. So dup detect can't be done 4674 * using this func. 4675 */ 4676 ill_t * 4677 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4678 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc) 4679 { 4680 ill_t *ill; 4681 ipif_t *ipif; 4682 kstat_named_t *kn; 4683 boolean_t isloopback; 4684 ipsq_t *old_ipsq; 4685 4686 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4687 4688 rw_enter(&ill_g_lock, RW_READER); 4689 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4690 rw_exit(&ill_g_lock); 4691 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4692 return (ill); 4693 4694 /* 4695 * Couldn't find it. Does this happen to be a lookup for the 4696 * loopback device and are we allowed to allocate it? 
4697 */ 4698 if (!isloopback || !do_alloc) 4699 return (NULL); 4700 4701 rw_enter(&ill_g_lock, RW_WRITER); 4702 4703 ill = ill_find_by_name(name, isv6, q, mp, func, error); 4704 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4705 rw_exit(&ill_g_lock); 4706 return (ill); 4707 } 4708 4709 /* Create the loopback device on demand */ 4710 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4711 sizeof (ipif_loopback_name), BPRI_MED)); 4712 if (ill == NULL) 4713 goto done; 4714 4715 *ill = ill_null; 4716 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4717 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4718 if (ill->ill_phyint == NULL) 4719 goto done; 4720 4721 if (isv6) 4722 ill->ill_phyint->phyint_illv6 = ill; 4723 else 4724 ill->ill_phyint->phyint_illv4 = ill; 4725 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4726 ill->ill_max_frag = IP_LOOPBACK_MTU; 4727 /* Add room for tcp+ip headers */ 4728 if (isv6) { 4729 ill->ill_isv6 = B_TRUE; 4730 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4731 if (!ill_allocate_mibs(ill)) 4732 goto done; 4733 } else { 4734 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4735 } 4736 ill->ill_max_mtu = ill->ill_max_frag; 4737 /* 4738 * ipif_loopback_name can't be pointed at directly because its used 4739 * by both the ipv4 and ipv6 interfaces. When the ill is removed 4740 * from the glist, ill_glist_delete() sets the first character of 4741 * ill_name to '\0'. 
4742 */ 4743 ill->ill_name = (char *)ill + sizeof (*ill); 4744 (void) strcpy(ill->ill_name, ipif_loopback_name); 4745 ill->ill_name_length = sizeof (ipif_loopback_name); 4746 /* Set ill_name_set for ill_phyint_reinit to work properly */ 4747 4748 ill->ill_global_timer = INFINITY; 4749 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4750 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4751 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4752 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4753 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4754 4755 /* No resolver here. */ 4756 ill->ill_net_type = IRE_LOOPBACK; 4757 4758 /* Initialize the ipsq */ 4759 if (!ipsq_init(ill)) 4760 goto done; 4761 4762 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 4763 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 4764 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 4765 #ifdef ILL_DEBUG 4766 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 4767 #endif 4768 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 4769 if (ipif == NULL) 4770 goto done; 4771 4772 ill->ill_flags = ILLF_MULTICAST; 4773 4774 /* Set up default loopback address and mask. */ 4775 if (!isv6) { 4776 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4777 4778 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4779 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4780 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4781 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4782 ipif->ipif_v6subnet); 4783 ill->ill_flags |= ILLF_IPV4; 4784 } else { 4785 ipif->ipif_v6lcl_addr = ipv6_loopback; 4786 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4787 ipif->ipif_v6net_mask = ipv6_all_ones; 4788 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4789 ipif->ipif_v6subnet); 4790 ill->ill_flags |= ILLF_IPV6; 4791 } 4792 4793 /* 4794 * Chain us in at the end of the ill list. hold the ill 4795 * before we make it globally visible. 1 for the lookup. 
4796 */ 4797 ill->ill_refcnt = 0; 4798 ill_refhold(ill); 4799 4800 ill->ill_frag_count = 0; 4801 ill->ill_frag_free_num_pkts = 0; 4802 ill->ill_last_frag_clean_time = 0; 4803 4804 old_ipsq = ill->ill_phyint->phyint_ipsq; 4805 4806 if (ill_glist_insert(ill, "lo", isv6) != 0) 4807 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4808 4809 /* Let SCTP know so that it can add this to its list */ 4810 sctp_update_ill(ill, SCTP_ILL_INSERT); 4811 4812 /* Let SCTP know about this IPIF, so that it can add it to its list */ 4813 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 4814 4815 /* 4816 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 4817 */ 4818 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 4819 /* Loopback ills aren't in any IPMP group */ 4820 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 4821 ipsq_delete(old_ipsq); 4822 } 4823 4824 /* 4825 * Delay this till the ipif is allocated as ipif_allocate 4826 * de-references ill_phyint for getting the ifindex. We 4827 * can't do this before ipif_allocate because ill_phyint_reinit 4828 * -> phyint_assign_ifindex expects ipif to be present. 
4829 */ 4830 mutex_enter(&ill->ill_phyint->phyint_lock); 4831 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 4832 mutex_exit(&ill->ill_phyint->phyint_lock); 4833 4834 if (loopback_ksp == NULL) { 4835 /* Export loopback interface statistics */ 4836 loopback_ksp = kstat_create("lo", 0, ipif_loopback_name, "net", 4837 KSTAT_TYPE_NAMED, 2, 0); 4838 if (loopback_ksp != NULL) { 4839 loopback_ksp->ks_update = loopback_kstat_update; 4840 kn = KSTAT_NAMED_PTR(loopback_ksp); 4841 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4842 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4843 kstat_install(loopback_ksp); 4844 } 4845 } 4846 4847 if (error != NULL) 4848 *error = 0; 4849 *did_alloc = B_TRUE; 4850 rw_exit(&ill_g_lock); 4851 return (ill); 4852 done: 4853 if (ill != NULL) { 4854 if (ill->ill_phyint != NULL) { 4855 ipsq_t *ipsq; 4856 4857 ipsq = ill->ill_phyint->phyint_ipsq; 4858 if (ipsq != NULL) 4859 kmem_free(ipsq, sizeof (ipsq_t)); 4860 mi_free(ill->ill_phyint); 4861 } 4862 ill_free_mib(ill); 4863 mi_free(ill); 4864 } 4865 rw_exit(&ill_g_lock); 4866 if (error != NULL) 4867 *error = ENOMEM; 4868 return (NULL); 4869 } 4870 4871 /* 4872 * Return a pointer to the ill which matches the index and IP version type. 4873 */ 4874 ill_t * 4875 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 4876 ipsq_func_t func, int *err) 4877 { 4878 ill_t *ill; 4879 ipsq_t *ipsq; 4880 phyint_t *phyi; 4881 4882 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 4883 (q != NULL && mp != NULL && func != NULL && err != NULL)); 4884 4885 if (err != NULL) 4886 *err = 0; 4887 4888 /* 4889 * Indexes are stored in the phyint - a common structure 4890 * to both IPv4 and IPv6. 4891 */ 4892 rw_enter(&ill_g_lock, RW_READER); 4893 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 4894 (void *) &index, NULL); 4895 if (phyi != NULL) { 4896 ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; 4897 if (ill != NULL) { 4898 /* 4899 * The block comment at the start of ipif_down 4900 * explains the use of the macros used below 4901 */ 4902 GRAB_CONN_LOCK(q); 4903 mutex_enter(&ill->ill_lock); 4904 if (ILL_CAN_LOOKUP(ill)) { 4905 ill_refhold_locked(ill); 4906 mutex_exit(&ill->ill_lock); 4907 RELEASE_CONN_LOCK(q); 4908 rw_exit(&ill_g_lock); 4909 return (ill); 4910 } else if (ILL_CAN_WAIT(ill, q)) { 4911 ipsq = ill->ill_phyint->phyint_ipsq; 4912 mutex_enter(&ipsq->ipsq_lock); 4913 rw_exit(&ill_g_lock); 4914 mutex_exit(&ill->ill_lock); 4915 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4916 mutex_exit(&ipsq->ipsq_lock); 4917 RELEASE_CONN_LOCK(q); 4918 *err = EINPROGRESS; 4919 return (NULL); 4920 } 4921 RELEASE_CONN_LOCK(q); 4922 mutex_exit(&ill->ill_lock); 4923 } 4924 } 4925 rw_exit(&ill_g_lock); 4926 if (err != NULL) 4927 *err = ENXIO; 4928 return (NULL); 4929 } 4930 4931 /* 4932 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4933 * that gives a running thread a reference to the ill. This reference must be 4934 * released by the thread when it is done accessing the ill and related 4935 * objects. ill_refcnt can not be used to account for static references 4936 * such as other structures pointing to an ill. Callers must generally 4937 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4938 * or be sure that the ill is not being deleted or changing state before 4939 * calling the refhold functions. A non-zero ill_refcnt ensures that the 4940 * ill won't change any of its critical state such as address, netmask etc. 
4941 */ 4942 void 4943 ill_refhold(ill_t *ill) 4944 { 4945 mutex_enter(&ill->ill_lock); 4946 ill->ill_refcnt++; 4947 ILL_TRACE_REF(ill); 4948 mutex_exit(&ill->ill_lock); 4949 } 4950 4951 void 4952 ill_refhold_locked(ill_t *ill) 4953 { 4954 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4955 ill->ill_refcnt++; 4956 ILL_TRACE_REF(ill); 4957 } 4958 4959 int 4960 ill_check_and_refhold(ill_t *ill) 4961 { 4962 mutex_enter(&ill->ill_lock); 4963 if (ILL_CAN_LOOKUP(ill)) { 4964 ill_refhold_locked(ill); 4965 mutex_exit(&ill->ill_lock); 4966 return (0); 4967 } 4968 mutex_exit(&ill->ill_lock); 4969 return (ILL_LOOKUP_FAILED); 4970 } 4971 4972 /* 4973 * Must not be called while holding any locks. Otherwise if this is 4974 * the last reference to be released, there is a chance of recursive mutex 4975 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4976 * to restart an ioctl. 4977 */ 4978 void 4979 ill_refrele(ill_t *ill) 4980 { 4981 mutex_enter(&ill->ill_lock); 4982 ASSERT(ill->ill_refcnt != 0); 4983 ill->ill_refcnt--; 4984 ILL_UNTRACE_REF(ill); 4985 if (ill->ill_refcnt != 0) { 4986 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4987 mutex_exit(&ill->ill_lock); 4988 return; 4989 } 4990 4991 /* Drops the ill_lock */ 4992 ipif_ill_refrele_tail(ill); 4993 } 4994 4995 /* 4996 * Obtain a weak reference count on the ill. This reference ensures the 4997 * ill won't be freed, but the ill may change any of its critical state 4998 * such as netmask, address etc. Returns an error if the ill has started 4999 * closing. 
5000 */ 5001 boolean_t 5002 ill_waiter_inc(ill_t *ill) 5003 { 5004 mutex_enter(&ill->ill_lock); 5005 if (ill->ill_state_flags & ILL_CONDEMNED) { 5006 mutex_exit(&ill->ill_lock); 5007 return (B_FALSE); 5008 } 5009 ill->ill_waiters++; 5010 mutex_exit(&ill->ill_lock); 5011 return (B_TRUE); 5012 } 5013 5014 void 5015 ill_waiter_dcr(ill_t *ill) 5016 { 5017 mutex_enter(&ill->ill_lock); 5018 ill->ill_waiters--; 5019 if (ill->ill_waiters == 0) 5020 cv_broadcast(&ill->ill_cv); 5021 mutex_exit(&ill->ill_lock); 5022 } 5023 5024 /* 5025 * Named Dispatch routine to produce a formatted report on all ILLs. 5026 * This report is accessed by using the ndd utility to "get" ND variable 5027 * "ip_ill_status". 5028 */ 5029 /* ARGSUSED */ 5030 int 5031 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5032 { 5033 ill_t *ill; 5034 ill_walk_context_t ctx; 5035 5036 (void) mi_mpprintf(mp, 5037 "ILL " MI_COL_HDRPAD_STR 5038 /* 01234567[89ABCDEF] */ 5039 "rq " MI_COL_HDRPAD_STR 5040 /* 01234567[89ABCDEF] */ 5041 "wq " MI_COL_HDRPAD_STR 5042 /* 01234567[89ABCDEF] */ 5043 "upcnt mxfrg err name"); 5044 /* 12345 12345 123 xxxxxxxx */ 5045 5046 rw_enter(&ill_g_lock, RW_READER); 5047 ill = ILL_START_WALK_ALL(&ctx); 5048 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5049 (void) mi_mpprintf(mp, 5050 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5051 "%05u %05u %03d %s", 5052 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5053 ill->ill_ipif_up_count, 5054 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5055 } 5056 rw_exit(&ill_g_lock); 5057 5058 return (0); 5059 } 5060 5061 /* 5062 * Named Dispatch routine to produce a formatted report on all IPIFs. 5063 * This report is accessed by using the ndd utility to "get" ND variable 5064 * "ip_ipif_status". 
5065 */ 5066 /* ARGSUSED */ 5067 int 5068 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5069 { 5070 char buf1[INET6_ADDRSTRLEN]; 5071 char buf2[INET6_ADDRSTRLEN]; 5072 char buf3[INET6_ADDRSTRLEN]; 5073 char buf4[INET6_ADDRSTRLEN]; 5074 char buf5[INET6_ADDRSTRLEN]; 5075 char buf6[INET6_ADDRSTRLEN]; 5076 char buf[LIFNAMSIZ]; 5077 ill_t *ill; 5078 ipif_t *ipif; 5079 nv_t *nvp; 5080 uint64_t flags; 5081 zoneid_t zoneid; 5082 ill_walk_context_t ctx; 5083 5084 (void) mi_mpprintf(mp, 5085 "IPIF metric mtu in/out/forward name zone flags...\n" 5086 "\tlocal address\n" 5087 "\tsrc address\n" 5088 "\tsubnet\n" 5089 "\tmask\n" 5090 "\tbroadcast\n" 5091 "\tp-p-dst"); 5092 5093 ASSERT(q->q_next == NULL); 5094 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5095 5096 rw_enter(&ill_g_lock, RW_READER); 5097 ill = ILL_START_WALK_ALL(&ctx); 5098 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5099 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 5100 if (zoneid != GLOBAL_ZONEID && 5101 zoneid != ipif->ipif_zoneid) 5102 continue; 5103 (void) mi_mpprintf(mp, 5104 MI_COL_PTRFMT_STR 5105 "%04u %05u %u/%u/%u %s %d", 5106 (void *)ipif, 5107 ipif->ipif_metric, ipif->ipif_mtu, 5108 ipif->ipif_ib_pkt_count, 5109 ipif->ipif_ob_pkt_count, 5110 ipif->ipif_fo_pkt_count, 5111 ipif_get_name(ipif, buf, sizeof (buf)), 5112 ipif->ipif_zoneid); 5113 5114 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5115 ipif->ipif_ill->ill_phyint->phyint_flags; 5116 5117 /* Tack on text strings for any flags. 
*/ 5118 nvp = ipif_nv_tbl; 5119 for (; nvp < A_END(ipif_nv_tbl); nvp++) { 5120 if (nvp->nv_value & flags) 5121 (void) mi_mpprintf_nr(mp, " %s", 5122 nvp->nv_name); 5123 } 5124 (void) mi_mpprintf(mp, 5125 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s", 5126 inet_ntop(AF_INET6, 5127 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)), 5128 inet_ntop(AF_INET6, 5129 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)), 5130 inet_ntop(AF_INET6, 5131 &ipif->ipif_v6subnet, buf3, sizeof (buf3)), 5132 inet_ntop(AF_INET6, 5133 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)), 5134 inet_ntop(AF_INET6, 5135 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)), 5136 inet_ntop(AF_INET6, 5137 &ipif->ipif_v6pp_dst_addr, 5138 buf6, sizeof (buf6))); 5139 } 5140 } 5141 rw_exit(&ill_g_lock); 5142 return (0); 5143 } 5144 5145 /* 5146 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5147 * driver. We construct best guess defaults for lower level information that 5148 * we need. If an interface is brought up without injection of any overriding 5149 * information from outside, we have to be ready to go with these defaults. 5150 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5151 * we primarely want the dl_provider_style. 5152 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5153 * at which point we assume the other part of the information is valid. 5154 */ 5155 void 5156 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5157 { 5158 uchar_t *brdcst_addr; 5159 uint_t brdcst_addr_length, phys_addr_length; 5160 t_scalar_t sap_length; 5161 dl_info_ack_t *dlia; 5162 ip_m_t *ipm; 5163 dl_qos_cl_sel1_t *sel1; 5164 5165 ASSERT(IAM_WRITER_ILL(ill)); 5166 5167 /* 5168 * Till the ill is fully up ILL_CHANGING will be set and 5169 * the ill is not globally visible. So no need for a lock. 
5170 */ 5171 dlia = (dl_info_ack_t *)mp->b_rptr; 5172 ill->ill_mactype = dlia->dl_mac_type; 5173 5174 ipm = ip_m_lookup(dlia->dl_mac_type); 5175 if (ipm == NULL) { 5176 ipm = ip_m_lookup(DL_OTHER); 5177 ASSERT(ipm != NULL); 5178 } 5179 ill->ill_media = ipm; 5180 5181 /* 5182 * When the new DLPI stuff is ready we'll pull lengths 5183 * from dlia. 5184 */ 5185 if (dlia->dl_version == DL_VERSION_2) { 5186 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5187 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5188 brdcst_addr_length); 5189 if (brdcst_addr == NULL) { 5190 brdcst_addr_length = 0; 5191 } 5192 sap_length = dlia->dl_sap_length; 5193 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5194 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5195 brdcst_addr_length, sap_length, phys_addr_length)); 5196 } else { 5197 brdcst_addr_length = 6; 5198 brdcst_addr = ip_six_byte_all_ones; 5199 sap_length = -2; 5200 phys_addr_length = brdcst_addr_length; 5201 } 5202 5203 ill->ill_bcast_addr_length = brdcst_addr_length; 5204 ill->ill_phys_addr_length = phys_addr_length; 5205 ill->ill_sap_length = sap_length; 5206 ill->ill_max_frag = dlia->dl_max_sdu; 5207 ill->ill_max_mtu = ill->ill_max_frag; 5208 5209 ill->ill_type = ipm->ip_m_type; 5210 5211 if (!ill->ill_dlpi_style_set) { 5212 if (dlia->dl_provider_style == DL_STYLE2) 5213 ill->ill_needs_attach = 1; 5214 5215 /* 5216 * Allocate the first ipif on this ill. We don't delay it 5217 * further as ioctl handling assumes atleast one ipif to 5218 * be present. 5219 * 5220 * At this point we don't know whether the ill is v4 or v6. 5221 * We will know this whan the SIOCSLIFNAME happens and 5222 * the correct value for ill_isv6 will be assigned in 5223 * ipif_set_values(). We need to hold the ill lock and 5224 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5225 * the wakeup. 5226 */ 5227 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5228 dlia->dl_provider_style == DL_STYLE2 ? 
B_FALSE : B_TRUE); 5229 mutex_enter(&ill->ill_lock); 5230 ASSERT(ill->ill_dlpi_style_set == 0); 5231 ill->ill_dlpi_style_set = 1; 5232 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5233 cv_broadcast(&ill->ill_cv); 5234 mutex_exit(&ill->ill_lock); 5235 freemsg(mp); 5236 return; 5237 } 5238 ASSERT(ill->ill_ipif != NULL); 5239 /* 5240 * We know whether it is IPv4 or IPv6 now, as this is the 5241 * second DL_INFO_ACK we are recieving in response to the 5242 * DL_INFO_REQ sent in ipif_set_values. 5243 */ 5244 if (ill->ill_isv6) 5245 ill->ill_sap = IP6_DL_SAP; 5246 else 5247 ill->ill_sap = IP_DL_SAP; 5248 /* 5249 * Set ipif_mtu which is used to set the IRE's 5250 * ire_max_frag value. The driver could have sent 5251 * a different mtu from what it sent last time. No 5252 * need to call ipif_mtu_change because IREs have 5253 * not yet been created. 5254 */ 5255 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5256 /* 5257 * Clear all the flags that were set based on ill_bcast_addr_length 5258 * and ill_phys_addr_length (in ipif_set_values) as these could have 5259 * changed now and we need to re-evaluate. 5260 */ 5261 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5262 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5263 5264 /* 5265 * Free ill_resolver_mp and ill_bcast_mp as things could have 5266 * changed now. 
5267 */ 5268 if (ill->ill_bcast_addr_length == 0) { 5269 if (ill->ill_resolver_mp != NULL) 5270 freemsg(ill->ill_resolver_mp); 5271 if (ill->ill_bcast_mp != NULL) 5272 freemsg(ill->ill_bcast_mp); 5273 if (ill->ill_flags & ILLF_XRESOLV) 5274 ill->ill_net_type = IRE_IF_RESOLVER; 5275 else 5276 ill->ill_net_type = IRE_IF_NORESOLVER; 5277 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5278 ill->ill_phys_addr_length, 5279 ill->ill_sap, 5280 ill->ill_sap_length); 5281 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5282 5283 if (ill->ill_isv6) 5284 /* 5285 * Note: xresolv interfaces will eventually need NOARP 5286 * set here as well, but that will require those 5287 * external resolvers to have some knowledge of 5288 * that flag and act appropriately. Not to be changed 5289 * at present. 5290 */ 5291 ill->ill_flags |= ILLF_NONUD; 5292 else 5293 ill->ill_flags |= ILLF_NOARP; 5294 5295 if (ill->ill_phys_addr_length == 0) { 5296 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 5297 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5298 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL; 5299 } else { 5300 /* pt-pt supports multicast. */ 5301 ill->ill_flags |= ILLF_MULTICAST; 5302 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5303 } 5304 } 5305 } else { 5306 ill->ill_net_type = IRE_IF_RESOLVER; 5307 if (ill->ill_bcast_mp != NULL) 5308 freemsg(ill->ill_bcast_mp); 5309 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5310 ill->ill_bcast_addr_length, ill->ill_sap, 5311 ill->ill_sap_length); 5312 /* 5313 * Later detect lack of DLPI driver multicast 5314 * capability by catching DL_ENABMULTI errors in 5315 * ip_rput_dlpi. 5316 */ 5317 ill->ill_flags |= ILLF_MULTICAST; 5318 if (!ill->ill_isv6) 5319 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5320 } 5321 /* By default an interface does not support any CoS marking */ 5322 ill->ill_flags &= ~ILLF_COS_ENABLED; 5323 5324 /* 5325 * If we get QoS information in DL_INFO_ACK, the device supports 5326 * some form of CoS marking, set ILLF_COS_ENABLED. 
5327 */ 5328 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5329 dlia->dl_qos_length); 5330 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5331 ill->ill_flags |= ILLF_COS_ENABLED; 5332 } 5333 5334 /* Clear any previous error indication. */ 5335 ill->ill_error = 0; 5336 freemsg(mp); 5337 } 5338 5339 /* 5340 * Perform various checks to verify that an address would make sense as a 5341 * local, remote, or subnet interface address. 5342 */ 5343 static boolean_t 5344 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5345 { 5346 ipaddr_t net_mask; 5347 5348 /* 5349 * Don't allow all zeroes, all ones or experimental address, but allow 5350 * all ones netmask. 5351 */ 5352 if ((net_mask = ip_net_mask(addr)) == 0) 5353 return (B_FALSE); 5354 /* A given netmask overrides the "guess" netmask */ 5355 if (subnet_mask != 0) 5356 net_mask = subnet_mask; 5357 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5358 (addr == (addr | ~net_mask)))) { 5359 return (B_FALSE); 5360 } 5361 if (CLASSD(addr)) 5362 return (B_FALSE); 5363 5364 return (B_TRUE); 5365 } 5366 5367 /* 5368 * ipif_lookup_group 5369 * Returns held ipif 5370 */ 5371 ipif_t * 5372 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid) 5373 { 5374 ire_t *ire; 5375 ipif_t *ipif; 5376 5377 ire = ire_lookup_multi(group, zoneid); 5378 if (ire == NULL) 5379 return (NULL); 5380 ipif = ire->ire_ipif; 5381 ipif_refhold(ipif); 5382 ire_refrele(ire); 5383 return (ipif); 5384 } 5385 5386 /* 5387 * Look for an ipif with the specified interface address and destination. 5388 * The destination address is used only for matching point-to-point interfaces. 
5389 */ 5390 ipif_t * 5391 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5392 ipsq_func_t func, int *error) 5393 { 5394 ipif_t *ipif; 5395 ill_t *ill; 5396 ill_walk_context_t ctx; 5397 ipsq_t *ipsq; 5398 5399 if (error != NULL) 5400 *error = 0; 5401 5402 /* 5403 * First match all the point-to-point interfaces 5404 * before looking at non-point-to-point interfaces. 5405 * This is done to avoid returning non-point-to-point 5406 * ipif instead of unnumbered point-to-point ipif. 5407 */ 5408 rw_enter(&ill_g_lock, RW_READER); 5409 ill = ILL_START_WALK_V4(&ctx); 5410 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5411 GRAB_CONN_LOCK(q); 5412 mutex_enter(&ill->ill_lock); 5413 for (ipif = ill->ill_ipif; ipif != NULL; 5414 ipif = ipif->ipif_next) { 5415 /* Allow the ipif to be down */ 5416 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5417 (ipif->ipif_lcl_addr == if_addr) && 5418 (ipif->ipif_pp_dst_addr == dst)) { 5419 /* 5420 * The block comment at the start of ipif_down 5421 * explains the use of the macros used below 5422 */ 5423 if (IPIF_CAN_LOOKUP(ipif)) { 5424 ipif_refhold_locked(ipif); 5425 mutex_exit(&ill->ill_lock); 5426 RELEASE_CONN_LOCK(q); 5427 rw_exit(&ill_g_lock); 5428 return (ipif); 5429 } else if (IPIF_CAN_WAIT(ipif, q)) { 5430 ipsq = ill->ill_phyint->phyint_ipsq; 5431 mutex_enter(&ipsq->ipsq_lock); 5432 mutex_exit(&ill->ill_lock); 5433 rw_exit(&ill_g_lock); 5434 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5435 ill); 5436 mutex_exit(&ipsq->ipsq_lock); 5437 RELEASE_CONN_LOCK(q); 5438 *error = EINPROGRESS; 5439 return (NULL); 5440 } 5441 } 5442 } 5443 mutex_exit(&ill->ill_lock); 5444 RELEASE_CONN_LOCK(q); 5445 } 5446 rw_exit(&ill_g_lock); 5447 5448 /* lookup the ipif based on interface address */ 5449 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error); 5450 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5451 return (ipif); 5452 } 5453 5454 /* 5455 * Look for an ipif with the specified address. 
For point-point links 5456 * we look for matches on either the destination address and the local 5457 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5458 * is set. 5459 * Matches on a specific ill if match_ill is set. 5460 */ 5461 ipif_t * 5462 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 5463 mblk_t *mp, ipsq_func_t func, int *error) 5464 { 5465 ipif_t *ipif; 5466 ill_t *ill; 5467 boolean_t ptp = B_FALSE; 5468 ipsq_t *ipsq; 5469 ill_walk_context_t ctx; 5470 5471 if (error != NULL) 5472 *error = 0; 5473 5474 rw_enter(&ill_g_lock, RW_READER); 5475 /* 5476 * Repeat twice, first based on local addresses and 5477 * next time for pointopoint. 5478 */ 5479 repeat: 5480 ill = ILL_START_WALK_V4(&ctx); 5481 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5482 if (match_ill != NULL && ill != match_ill) { 5483 continue; 5484 } 5485 GRAB_CONN_LOCK(q); 5486 mutex_enter(&ill->ill_lock); 5487 for (ipif = ill->ill_ipif; ipif != NULL; 5488 ipif = ipif->ipif_next) { 5489 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid) 5490 continue; 5491 /* Allow the ipif to be down */ 5492 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5493 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5494 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5495 (ipif->ipif_pp_dst_addr == addr))) { 5496 /* 5497 * The block comment at the start of ipif_down 5498 * explains the use of the macros used below 5499 */ 5500 if (IPIF_CAN_LOOKUP(ipif)) { 5501 ipif_refhold_locked(ipif); 5502 mutex_exit(&ill->ill_lock); 5503 RELEASE_CONN_LOCK(q); 5504 rw_exit(&ill_g_lock); 5505 return (ipif); 5506 } else if (IPIF_CAN_WAIT(ipif, q)) { 5507 ipsq = ill->ill_phyint->phyint_ipsq; 5508 mutex_enter(&ipsq->ipsq_lock); 5509 mutex_exit(&ill->ill_lock); 5510 rw_exit(&ill_g_lock); 5511 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5512 ill); 5513 mutex_exit(&ipsq->ipsq_lock); 5514 RELEASE_CONN_LOCK(q); 5515 *error = EINPROGRESS; 5516 return (NULL); 5517 } 5518 } 5519 } 5520 
mutex_exit(&ill->ill_lock); 5521 RELEASE_CONN_LOCK(q); 5522 } 5523 5524 /* Now try the ptp case */ 5525 if (ptp) { 5526 rw_exit(&ill_g_lock); 5527 if (error != NULL) 5528 *error = ENXIO; 5529 return (NULL); 5530 } 5531 ptp = B_TRUE; 5532 goto repeat; 5533 } 5534 5535 /* 5536 * Look for an ipif that matches the specified remote address i.e. the 5537 * ipif that would receive the specified packet. 5538 * First look for directly connected interfaces and then do a recursive 5539 * IRE lookup and pick the first ipif corresponding to the source address in the 5540 * ire. 5541 * Returns: held ipif 5542 */ 5543 ipif_t * 5544 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 5545 { 5546 ipif_t *ipif; 5547 ire_t *ire; 5548 5549 ASSERT(!ill->ill_isv6); 5550 5551 /* 5552 * Someone could be changing this ipif currently or change it 5553 * after we return this. Thus a few packets could use the old 5554 * old values. However structure updates/creates (ire, ilg, ilm etc) 5555 * will atomically be updated or cleaned up with the new value 5556 * Thus we don't need a lock to check the flags or other attrs below. 
5557 */ 5558 mutex_enter(&ill->ill_lock); 5559 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5560 if (!IPIF_CAN_LOOKUP(ipif)) 5561 continue; 5562 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid) 5563 continue; 5564 /* Allow the ipif to be down */ 5565 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 5566 if ((ipif->ipif_pp_dst_addr == addr) || 5567 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 5568 ipif->ipif_lcl_addr == addr)) { 5569 ipif_refhold_locked(ipif); 5570 mutex_exit(&ill->ill_lock); 5571 return (ipif); 5572 } 5573 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 5574 ipif_refhold_locked(ipif); 5575 mutex_exit(&ill->ill_lock); 5576 return (ipif); 5577 } 5578 } 5579 mutex_exit(&ill->ill_lock); 5580 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 5581 MATCH_IRE_RECURSIVE); 5582 if (ire != NULL) { 5583 /* 5584 * The callers of this function wants to know the 5585 * interface on which they have to send the replies 5586 * back. For IRE_CACHES that have ire_stq and ire_ipif 5587 * derived from different ills, we really don't care 5588 * what we return here. 5589 */ 5590 ipif = ire->ire_ipif; 5591 if (ipif != NULL) { 5592 ipif_refhold(ipif); 5593 ire_refrele(ire); 5594 return (ipif); 5595 } 5596 ire_refrele(ire); 5597 } 5598 /* Pick the first interface */ 5599 ipif = ipif_get_next_ipif(NULL, ill); 5600 return (ipif); 5601 } 5602 5603 /* 5604 * This func does not prevent refcnt from increasing. 
But if 5605 * the caller has taken steps to that effect, then this func 5606 * can be used to determine whether the ill has become quiescent 5607 */ 5608 boolean_t 5609 ill_is_quiescent(ill_t *ill) 5610 { 5611 ipif_t *ipif; 5612 5613 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5614 5615 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5616 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) 5617 return (B_FALSE); 5618 } 5619 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 5620 ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 || 5621 ill->ill_mrtun_refcnt != 0) 5622 return (B_FALSE); 5623 return (B_TRUE); 5624 } 5625 5626 /* 5627 * This func does not prevent refcnt from increasing. But if 5628 * the caller has taken steps to that effect, then this func 5629 * can be used to determine whether the ipif has become quiescent 5630 */ 5631 static boolean_t 5632 ipif_is_quiescent(ipif_t *ipif) 5633 { 5634 ill_t *ill; 5635 5636 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5637 5638 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) 5639 return (B_FALSE); 5640 5641 ill = ipif->ipif_ill; 5642 if (ill->ill_ipif_up_count != 0 || ill->ill_logical_down) 5643 return (B_TRUE); 5644 5645 /* This is the last ipif going down or being deleted on this ill */ 5646 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) 5647 return (B_FALSE); 5648 5649 return (B_TRUE); 5650 } 5651 5652 /* 5653 * This func does not prevent refcnt from increasing. But if 5654 * the caller has taken steps to that effect, then this func 5655 * can be used to determine whether the ipifs marked with IPIF_MOVING 5656 * have become quiescent and can be moved in a failover/failback. 
5657 */ 5658 static ipif_t * 5659 ill_quiescent_to_move(ill_t *ill) 5660 { 5661 ipif_t *ipif; 5662 5663 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5664 5665 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5666 if (ipif->ipif_state_flags & IPIF_MOVING) { 5667 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 5668 return (ipif); 5669 } 5670 } 5671 } 5672 return (NULL); 5673 } 5674 5675 /* 5676 * The ipif/ill/ire has been refreled. Do the tail processing. 5677 * Determine if the ipif or ill in question has become quiescent and if so 5678 * wakeup close and/or restart any queued pending ioctl that is waiting 5679 * for the ipif_down (or ill_down) 5680 */ 5681 void 5682 ipif_ill_refrele_tail(ill_t *ill) 5683 { 5684 mblk_t *mp; 5685 conn_t *connp; 5686 ipsq_t *ipsq; 5687 ipif_t *ipif; 5688 5689 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5690 5691 if ((ill->ill_state_flags & ILL_CONDEMNED) && 5692 ill_is_quiescent(ill)) { 5693 /* ill_close may be waiting */ 5694 cv_broadcast(&ill->ill_cv); 5695 } 5696 5697 /* ipsq can't change because ill_lock is held */ 5698 ipsq = ill->ill_phyint->phyint_ipsq; 5699 if (ipsq->ipsq_waitfor == 0) { 5700 /* Not waiting for anything, just return. */ 5701 mutex_exit(&ill->ill_lock); 5702 return; 5703 } 5704 ASSERT(ipsq->ipsq_pending_mp != NULL && 5705 ipsq->ipsq_pending_ipif != NULL); 5706 /* 5707 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 5708 * Last ipif going down needs to down the ill, so ill_ire_cnt must 5709 * be zero for restarting an ioctl that ends up downing the ill. 5710 */ 5711 ipif = ipsq->ipsq_pending_ipif; 5712 if (ipif->ipif_ill != ill) { 5713 /* The ioctl is pending on some other ill. 
*/ 5714 mutex_exit(&ill->ill_lock); 5715 return; 5716 } 5717 5718 switch (ipsq->ipsq_waitfor) { 5719 case IPIF_DOWN: 5720 case IPIF_FREE: 5721 if (!ipif_is_quiescent(ipif)) { 5722 mutex_exit(&ill->ill_lock); 5723 return; 5724 } 5725 break; 5726 5727 case ILL_DOWN: 5728 case ILL_FREE: 5729 /* 5730 * case ILL_FREE arises only for loopback. otherwise ill_delete 5731 * waits synchronously in ip_close, and no message is queued in 5732 * ipsq_pending_mp at all in this case 5733 */ 5734 if (!ill_is_quiescent(ill)) { 5735 mutex_exit(&ill->ill_lock); 5736 return; 5737 } 5738 5739 break; 5740 5741 case ILL_MOVE_OK: 5742 if (ill_quiescent_to_move(ill) != NULL) { 5743 mutex_exit(&ill->ill_lock); 5744 return; 5745 } 5746 5747 break; 5748 default: 5749 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", 5750 (void *)ipsq, ipsq->ipsq_waitfor); 5751 } 5752 5753 /* 5754 * Incr refcnt for the qwriter_ip call below which 5755 * does a refrele 5756 */ 5757 ill_refhold_locked(ill); 5758 mutex_exit(&ill->ill_lock); 5759 5760 mp = ipsq_pending_mp_get(ipsq, &connp); 5761 ASSERT(mp != NULL); 5762 5763 switch (mp->b_datap->db_type) { 5764 case M_ERROR: 5765 case M_HANGUP: 5766 (void) qwriter_ip(NULL, ill, ill->ill_rq, mp, 5767 ipif_all_down_tail, CUR_OP, B_TRUE); 5768 return; 5769 5770 case M_IOCTL: 5771 case M_IOCDATA: 5772 (void) qwriter_ip(NULL, ill, 5773 (connp != NULL ? 
CONNP_TO_WQ(connp) : ill->ill_wq), mp, 5774 ip_reprocess_ioctl, CUR_OP, B_TRUE); 5775 return; 5776 5777 default: 5778 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5779 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5780 } 5781 } 5782 5783 #ifdef ILL_DEBUG 5784 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5785 void 5786 th_trace_rrecord(th_trace_t *th_trace) 5787 { 5788 tr_buf_t *tr_buf; 5789 uint_t lastref; 5790 5791 lastref = th_trace->th_trace_lastref; 5792 lastref++; 5793 if (lastref == TR_BUF_MAX) 5794 lastref = 0; 5795 th_trace->th_trace_lastref = lastref; 5796 tr_buf = &th_trace->th_trbuf[lastref]; 5797 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH); 5798 } 5799 5800 th_trace_t * 5801 th_trace_ipif_lookup(ipif_t *ipif) 5802 { 5803 int bucket_id; 5804 th_trace_t *th_trace; 5805 5806 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5807 5808 bucket_id = IP_TR_HASH(curthread); 5809 ASSERT(bucket_id < IP_TR_HASH_MAX); 5810 5811 for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL; 5812 th_trace = th_trace->th_next) { 5813 if (th_trace->th_id == curthread) 5814 return (th_trace); 5815 } 5816 return (NULL); 5817 } 5818 5819 void 5820 ipif_trace_ref(ipif_t *ipif) 5821 { 5822 int bucket_id; 5823 th_trace_t *th_trace; 5824 5825 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5826 5827 if (ipif->ipif_trace_disable) 5828 return; 5829 5830 /* 5831 * Attempt to locate the trace buffer for the curthread. 
5832 * If it does not exist, then allocate a new trace buffer 5833 * and link it in list of trace bufs for this ipif, at the head 5834 */ 5835 th_trace = th_trace_ipif_lookup(ipif); 5836 if (th_trace == NULL) { 5837 bucket_id = IP_TR_HASH(curthread); 5838 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 5839 KM_NOSLEEP); 5840 if (th_trace == NULL) { 5841 ipif->ipif_trace_disable = B_TRUE; 5842 ipif_trace_cleanup(ipif); 5843 return; 5844 } 5845 th_trace->th_id = curthread; 5846 th_trace->th_next = ipif->ipif_trace[bucket_id]; 5847 th_trace->th_prev = &ipif->ipif_trace[bucket_id]; 5848 if (th_trace->th_next != NULL) 5849 th_trace->th_next->th_prev = &th_trace->th_next; 5850 ipif->ipif_trace[bucket_id] = th_trace; 5851 } 5852 ASSERT(th_trace->th_refcnt >= 0 && 5853 th_trace->th_refcnt < TR_BUF_MAX -1); 5854 th_trace->th_refcnt++; 5855 th_trace_rrecord(th_trace); 5856 } 5857 5858 void 5859 ipif_untrace_ref(ipif_t *ipif) 5860 { 5861 th_trace_t *th_trace; 5862 5863 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5864 5865 if (ipif->ipif_trace_disable) 5866 return; 5867 th_trace = th_trace_ipif_lookup(ipif); 5868 ASSERT(th_trace != NULL); 5869 ASSERT(th_trace->th_refcnt > 0); 5870 5871 th_trace->th_refcnt--; 5872 th_trace_rrecord(th_trace); 5873 } 5874 5875 th_trace_t * 5876 th_trace_ill_lookup(ill_t *ill) 5877 { 5878 th_trace_t *th_trace; 5879 int bucket_id; 5880 5881 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5882 5883 bucket_id = IP_TR_HASH(curthread); 5884 ASSERT(bucket_id < IP_TR_HASH_MAX); 5885 5886 for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL; 5887 th_trace = th_trace->th_next) { 5888 if (th_trace->th_id == curthread) 5889 return (th_trace); 5890 } 5891 return (NULL); 5892 } 5893 5894 void 5895 ill_trace_ref(ill_t *ill) 5896 { 5897 int bucket_id; 5898 th_trace_t *th_trace; 5899 5900 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5901 if (ill->ill_trace_disable) 5902 return; 5903 /* 5904 * Attempt to locate the trace buffer for the curthread. 
5905 * If it does not exist, then allocate a new trace buffer 5906 * and link it in list of trace bufs for this ill, at the head 5907 */ 5908 th_trace = th_trace_ill_lookup(ill); 5909 if (th_trace == NULL) { 5910 bucket_id = IP_TR_HASH(curthread); 5911 th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t), 5912 KM_NOSLEEP); 5913 if (th_trace == NULL) { 5914 ill->ill_trace_disable = B_TRUE; 5915 ill_trace_cleanup(ill); 5916 return; 5917 } 5918 th_trace->th_id = curthread; 5919 th_trace->th_next = ill->ill_trace[bucket_id]; 5920 th_trace->th_prev = &ill->ill_trace[bucket_id]; 5921 if (th_trace->th_next != NULL) 5922 th_trace->th_next->th_prev = &th_trace->th_next; 5923 ill->ill_trace[bucket_id] = th_trace; 5924 } 5925 ASSERT(th_trace->th_refcnt >= 0 && 5926 th_trace->th_refcnt < TR_BUF_MAX - 1); 5927 5928 th_trace->th_refcnt++; 5929 th_trace_rrecord(th_trace); 5930 } 5931 5932 void 5933 ill_untrace_ref(ill_t *ill) 5934 { 5935 th_trace_t *th_trace; 5936 5937 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5938 5939 if (ill->ill_trace_disable) 5940 return; 5941 th_trace = th_trace_ill_lookup(ill); 5942 ASSERT(th_trace != NULL); 5943 ASSERT(th_trace->th_refcnt > 0); 5944 5945 th_trace->th_refcnt--; 5946 th_trace_rrecord(th_trace); 5947 } 5948 5949 /* 5950 * Verify that this thread has no refs to the ipif and free 5951 * the trace buffers 5952 */ 5953 /* ARGSUSED */ 5954 void 5955 ipif_thread_exit(ipif_t *ipif, void *dummy) 5956 { 5957 th_trace_t *th_trace; 5958 5959 mutex_enter(&ipif->ipif_ill->ill_lock); 5960 5961 th_trace = th_trace_ipif_lookup(ipif); 5962 if (th_trace == NULL) { 5963 mutex_exit(&ipif->ipif_ill->ill_lock); 5964 return; 5965 } 5966 ASSERT(th_trace->th_refcnt == 0); 5967 /* unlink th_trace and free it */ 5968 *th_trace->th_prev = th_trace->th_next; 5969 if (th_trace->th_next != NULL) 5970 th_trace->th_next->th_prev = th_trace->th_prev; 5971 th_trace->th_next = NULL; 5972 th_trace->th_prev = NULL; 5973 kmem_free(th_trace, sizeof (th_trace_t)); 5974 5975 
mutex_exit(&ipif->ipif_ill->ill_lock); 5976 } 5977 5978 /* 5979 * Verify that this thread has no refs to the ill and free 5980 * the trace buffers 5981 */ 5982 /* ARGSUSED */ 5983 void 5984 ill_thread_exit(ill_t *ill, void *dummy) 5985 { 5986 th_trace_t *th_trace; 5987 5988 mutex_enter(&ill->ill_lock); 5989 5990 th_trace = th_trace_ill_lookup(ill); 5991 if (th_trace == NULL) { 5992 mutex_exit(&ill->ill_lock); 5993 return; 5994 } 5995 ASSERT(th_trace->th_refcnt == 0); 5996 /* unlink th_trace and free it */ 5997 *th_trace->th_prev = th_trace->th_next; 5998 if (th_trace->th_next != NULL) 5999 th_trace->th_next->th_prev = th_trace->th_prev; 6000 th_trace->th_next = NULL; 6001 th_trace->th_prev = NULL; 6002 kmem_free(th_trace, sizeof (th_trace_t)); 6003 6004 mutex_exit(&ill->ill_lock); 6005 } 6006 #endif 6007 6008 #ifdef ILL_DEBUG 6009 void 6010 ip_thread_exit(void) 6011 { 6012 ill_t *ill; 6013 ipif_t *ipif; 6014 ill_walk_context_t ctx; 6015 6016 rw_enter(&ill_g_lock, RW_READER); 6017 ill = ILL_START_WALK_ALL(&ctx); 6018 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6019 for (ipif = ill->ill_ipif; ipif != NULL; 6020 ipif = ipif->ipif_next) { 6021 ipif_thread_exit(ipif, NULL); 6022 } 6023 ill_thread_exit(ill, NULL); 6024 } 6025 rw_exit(&ill_g_lock); 6026 6027 ire_walk(ire_thread_exit, NULL); 6028 ndp_walk_impl(NULL, nce_thread_exit, NULL, B_FALSE); 6029 } 6030 6031 /* 6032 * Called when ipif is unplumbed or when memory alloc fails 6033 */ 6034 void 6035 ipif_trace_cleanup(ipif_t *ipif) 6036 { 6037 int i; 6038 th_trace_t *th_trace; 6039 th_trace_t *th_trace_next; 6040 6041 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6042 for (th_trace = ipif->ipif_trace[i]; th_trace != NULL; 6043 th_trace = th_trace_next) { 6044 th_trace_next = th_trace->th_next; 6045 kmem_free(th_trace, sizeof (th_trace_t)); 6046 } 6047 ipif->ipif_trace[i] = NULL; 6048 } 6049 } 6050 6051 /* 6052 * Called when ill is unplumbed or when memory alloc fails 6053 */ 6054 void 6055 ill_trace_cleanup(ill_t *ill) 
6056 { 6057 int i; 6058 th_trace_t *th_trace; 6059 th_trace_t *th_trace_next; 6060 6061 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6062 for (th_trace = ill->ill_trace[i]; th_trace != NULL; 6063 th_trace = th_trace_next) { 6064 th_trace_next = th_trace->th_next; 6065 kmem_free(th_trace, sizeof (th_trace_t)); 6066 } 6067 ill->ill_trace[i] = NULL; 6068 } 6069 } 6070 6071 #else 6072 void ip_thread_exit(void) {} 6073 #endif 6074 6075 void 6076 ipif_refhold_locked(ipif_t *ipif) 6077 { 6078 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6079 ipif->ipif_refcnt++; 6080 IPIF_TRACE_REF(ipif); 6081 } 6082 6083 void 6084 ipif_refhold(ipif_t *ipif) 6085 { 6086 ill_t *ill; 6087 6088 ill = ipif->ipif_ill; 6089 mutex_enter(&ill->ill_lock); 6090 ipif->ipif_refcnt++; 6091 IPIF_TRACE_REF(ipif); 6092 mutex_exit(&ill->ill_lock); 6093 } 6094 6095 /* 6096 * Must not be called while holding any locks. Otherwise if this is 6097 * the last reference to be released there is a chance of recursive mutex 6098 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6099 * to restart an ioctl. 6100 */ 6101 void 6102 ipif_refrele(ipif_t *ipif) 6103 { 6104 ill_t *ill; 6105 6106 ill = ipif->ipif_ill; 6107 6108 mutex_enter(&ill->ill_lock); 6109 ASSERT(ipif->ipif_refcnt != 0); 6110 ipif->ipif_refcnt--; 6111 IPIF_UNTRACE_REF(ipif); 6112 if (ipif->ipif_refcnt != 0) { 6113 mutex_exit(&ill->ill_lock); 6114 return; 6115 } 6116 6117 /* Drops the ill_lock */ 6118 ipif_ill_refrele_tail(ill); 6119 } 6120 6121 ipif_t * 6122 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6123 { 6124 ipif_t *ipif; 6125 6126 mutex_enter(&ill->ill_lock); 6127 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 6128 ipif != NULL; ipif = ipif->ipif_next) { 6129 if (!IPIF_CAN_LOOKUP(ipif)) 6130 continue; 6131 ipif_refhold_locked(ipif); 6132 mutex_exit(&ill->ill_lock); 6133 return (ipif); 6134 } 6135 mutex_exit(&ill->ill_lock); 6136 return (NULL); 6137 } 6138 6139 /* 6140 * TODO: make this table extendible at run time 6141 * Return a pointer to the mac type info for 'mac_type' 6142 */ 6143 static ip_m_t * 6144 ip_m_lookup(t_uscalar_t mac_type) 6145 { 6146 ip_m_t *ipm; 6147 6148 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6149 if (ipm->ip_m_mac_type == mac_type) 6150 return (ipm); 6151 return (NULL); 6152 } 6153 6154 /* 6155 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6156 * ipif_arg is passed in to associate it with the correct interface. 6157 * We may need to restart this operation if the ipif cannot be looked up 6158 * due to an exclusive operation that is currently in progress. The restart 6159 * entry point is specified by 'func' 6160 */ 6161 int 6162 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6163 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6164 ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp, 6165 ipsq_func_t func) 6166 { 6167 ire_t *ire; 6168 ire_t *gw_ire = NULL; 6169 ipif_t *ipif = NULL; 6170 boolean_t ipif_refheld = B_FALSE; 6171 uint_t type; 6172 int match_flags = MATCH_IRE_TYPE; 6173 int error; 6174 6175 ip1dbg(("ip_rt_add:")); 6176 6177 if (ire_arg != NULL) 6178 *ire_arg = NULL; 6179 6180 /* 6181 * If this is the case of RTF_HOST being set, then we set the netmask 6182 * to all ones (regardless if one was supplied). 6183 */ 6184 if (flags & RTF_HOST) 6185 mask = IP_HOST_MASK; 6186 6187 /* 6188 * Prevent routes with a zero gateway from being created (since 6189 * interfaces can currently be plumbed and brought up no assigned 6190 * address). 6191 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0. 
6192 */ 6193 if (gw_addr == 0 && src_ipif == NULL) 6194 return (ENETUNREACH); 6195 /* 6196 * Get the ipif, if any, corresponding to the gw_addr 6197 */ 6198 if (gw_addr != 0) { 6199 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, 6200 &error); 6201 if (ipif != NULL) { 6202 if (IS_VNI(ipif->ipif_ill)) { 6203 ipif_refrele(ipif); 6204 return (EINVAL); 6205 } 6206 ipif_refheld = B_TRUE; 6207 } else if (error == EINPROGRESS) { 6208 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6209 return (EINPROGRESS); 6210 } else { 6211 error = 0; 6212 } 6213 } 6214 6215 if (ipif != NULL) { 6216 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6217 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6218 } else { 6219 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6220 } 6221 6222 /* 6223 * GateD will attempt to create routes with a loopback interface 6224 * address as the gateway and with RTF_GATEWAY set. We allow 6225 * these routes to be added, but create them as interface routes 6226 * since the gateway is an interface address. 6227 */ 6228 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) 6229 flags &= ~RTF_GATEWAY; 6230 6231 /* 6232 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6233 * and the gateway address provided is one of the system's interface 6234 * addresses. By using the routing socket interface and supplying an 6235 * RTA_IFP sockaddr with an interface index, an alternate method of 6236 * specifying an interface route to be created is available which uses 6237 * the interface index that specifies the outgoing interface rather than 6238 * the address of an outgoing interface (which may not be able to 6239 * uniquely identify an interface). When coupled with the RTF_GATEWAY 6240 * flag, routes can be specified which not only specify the next-hop to 6241 * be used when routing to a certain prefix, but also which outgoing 6242 * interface should be used. 
6243 * 6244 * Previously, interfaces would have unique addresses assigned to them 6245 * and so the address assigned to a particular interface could be used 6246 * to identify a particular interface. One exception to this was the 6247 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6248 * 6249 * With the advent of IPv6 and its link-local addresses, this 6250 * restriction was relaxed and interfaces could share addresses between 6251 * themselves. In fact, typically all of the link-local interfaces on 6252 * an IPv6 node or router will have the same link-local address. In 6253 * order to differentiate between these interfaces, the use of an 6254 * interface index is necessary and this index can be carried inside a 6255 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6256 * of using the interface index, however, is that all of the ipif's that 6257 * are part of an ill have the same index and so the RTA_IFP sockaddr 6258 * cannot be used to differentiate between ipif's (or logical 6259 * interfaces) that belong to the same ill (physical interface). 6260 * 6261 * For example, in the following case involving IPv4 interfaces and 6262 * logical interfaces 6263 * 6264 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6265 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6266 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6267 * 6268 * the ipif's corresponding to each of these interface routes can be 6269 * uniquely identified by the "gateway" (actually interface address). 
6270 * 6271 * In this case involving multiple IPv6 default routes to a particular 6272 * link-local gateway, the use of RTA_IFP is necessary to specify which 6273 * default route is of interest: 6274 * 6275 * default fe80::123:4567:89ab:cdef U if0 6276 * default fe80::123:4567:89ab:cdef U if1 6277 */ 6278 6279 /* RTF_GATEWAY not set */ 6280 if (!(flags & RTF_GATEWAY)) { 6281 queue_t *stq; 6282 queue_t *rfq = NULL; 6283 ill_t *in_ill = NULL; 6284 6285 /* 6286 * As the interface index specified with the RTA_IFP sockaddr is 6287 * the same for all ipif's off of an ill, the matching logic 6288 * below uses MATCH_IRE_ILL if such an index was specified. 6289 * This means that routes sharing the same prefix when added 6290 * using a RTA_IFP sockaddr must have distinct interface 6291 * indices (namely, they must be on distinct ill's). 6292 * 6293 * On the other hand, since the gateway address will usually be 6294 * different for each ipif on the system, the matching logic 6295 * uses MATCH_IRE_IPIF in the case of a traditional interface 6296 * route. This means that interface routes for the same prefix 6297 * can be created if they belong to distinct ipif's and if a 6298 * RTA_IFP sockaddr is not present. 6299 */ 6300 if (ipif_arg != NULL) { 6301 if (ipif_refheld) { 6302 ipif_refrele(ipif); 6303 ipif_refheld = B_FALSE; 6304 } 6305 ipif = ipif_arg; 6306 match_flags |= MATCH_IRE_ILL; 6307 } else { 6308 /* 6309 * Check the ipif corresponding to the gw_addr 6310 */ 6311 if (ipif == NULL) 6312 return (ENETUNREACH); 6313 match_flags |= MATCH_IRE_IPIF; 6314 } 6315 ASSERT(ipif != NULL); 6316 /* 6317 * If src_ipif is not NULL, we have to create 6318 * an ire with non-null ire_in_ill value 6319 */ 6320 if (src_ipif != NULL) { 6321 in_ill = src_ipif->ipif_ill; 6322 } 6323 6324 /* 6325 * We check for an existing entry at this point. 6326 * 6327 * Since a netmask isn't passed in via the ioctl interface 6328 * (SIOCADDRT), we don't check for a matching netmask in that 6329 * case. 
6330 */ 6331 if (!ioctl_msg) 6332 match_flags |= MATCH_IRE_MASK; 6333 if (src_ipif != NULL) { 6334 /* Look up in the special table */ 6335 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6336 ipif, src_ipif->ipif_ill, match_flags); 6337 } else { 6338 ire = ire_ftable_lookup(dst_addr, mask, 0, 6339 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6340 match_flags); 6341 } 6342 if (ire != NULL) { 6343 ire_refrele(ire); 6344 if (ipif_refheld) 6345 ipif_refrele(ipif); 6346 return (EEXIST); 6347 } 6348 6349 if (src_ipif != NULL) { 6350 /* 6351 * Create the special ire for the IRE table 6352 * which hangs out of ire_in_ill. This ire 6353 * is in-between IRE_CACHE and IRE_INTERFACE. 6354 * Thus rfq is non-NULL. 6355 */ 6356 rfq = ipif->ipif_rq; 6357 } 6358 /* Create the usual interface ires */ 6359 6360 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6361 ? ipif->ipif_rq : ipif->ipif_wq; 6362 6363 /* 6364 * Create a copy of the IRE_LOOPBACK, 6365 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6366 * the modified address and netmask. 6367 */ 6368 ire = ire_create( 6369 (uchar_t *)&dst_addr, 6370 (uint8_t *)&mask, 6371 (uint8_t *)&ipif->ipif_src_addr, 6372 NULL, 6373 NULL, 6374 &ipif->ipif_mtu, 6375 NULL, 6376 rfq, 6377 stq, 6378 ipif->ipif_net_type, 6379 ipif->ipif_resolver_mp, 6380 ipif, 6381 in_ill, 6382 0, 6383 0, 6384 0, 6385 flags, 6386 &ire_uinfo_null); 6387 if (ire == NULL) { 6388 if (ipif_refheld) 6389 ipif_refrele(ipif); 6390 return (ENOMEM); 6391 } 6392 6393 /* 6394 * Some software (for example, GateD and Sun Cluster) attempts 6395 * to create (what amount to) IRE_PREFIX routes with the 6396 * loopback address as the gateway. This is primarily done to 6397 * set up prefixes with the RTF_REJECT flag set (for example, 6398 * when generating aggregate routes.) 6399 * 6400 * If the IRE type (as defined by ipif->ipif_net_type) is 6401 * IRE_LOOPBACK, then we map the request into a 6402 * IRE_IF_NORESOLVER. 
6403 * 6404 * Needless to say, the real IRE_LOOPBACK is NOT created by this 6405 * routine, but rather using ire_create() directly. 6406 */ 6407 if (ipif->ipif_net_type == IRE_LOOPBACK) 6408 ire->ire_type = IRE_IF_NORESOLVER; 6409 error = ire_add(&ire, q, mp, func); 6410 if (error == 0) 6411 goto save_ire; 6412 6413 /* 6414 * In the result of failure, ire_add() will have already 6415 * deleted the ire in question, so there is no need to 6416 * do that here. 6417 */ 6418 if (ipif_refheld) 6419 ipif_refrele(ipif); 6420 return (error); 6421 } 6422 if (ipif_refheld) { 6423 ipif_refrele(ipif); 6424 ipif_refheld = B_FALSE; 6425 } 6426 6427 if (src_ipif != NULL) { 6428 /* RTA_SRCIFP is not supported on RTF_GATEWAY */ 6429 ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n")); 6430 return (EINVAL); 6431 } 6432 /* 6433 * Get an interface IRE for the specified gateway. 6434 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 6435 * gateway, it is currently unreachable and we fail the request 6436 * accordingly. 6437 */ 6438 ipif = ipif_arg; 6439 if (ipif_arg != NULL) 6440 match_flags |= MATCH_IRE_ILL; 6441 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 6442 ALL_ZONES, 0, match_flags); 6443 if (gw_ire == NULL) 6444 return (ENETUNREACH); 6445 6446 /* 6447 * We create one of three types of IREs as a result of this request 6448 * based on the netmask. A netmask of all ones (which is automatically 6449 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 6450 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 6451 * created. Otherwise, an IRE_PREFIX route is created for the 6452 * destination prefix. 
6453 */ 6454 if (mask == IP_HOST_MASK) 6455 type = IRE_HOST; 6456 else if (mask == 0) 6457 type = IRE_DEFAULT; 6458 else 6459 type = IRE_PREFIX; 6460 6461 /* check for a duplicate entry */ 6462 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 6463 NULL, ALL_ZONES, 0, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW); 6464 if (ire != NULL) { 6465 ire_refrele(gw_ire); 6466 ire_refrele(ire); 6467 return (EEXIST); 6468 } 6469 6470 /* Create the IRE. */ 6471 ire = ire_create( 6472 (uchar_t *)&dst_addr, /* dest address */ 6473 (uchar_t *)&mask, /* mask */ 6474 /* src address assigned by the caller? */ 6475 (uchar_t *)(((src_addr != INADDR_ANY) && 6476 (flags & RTF_SETSRC)) ? &src_addr : NULL), 6477 (uchar_t *)&gw_addr, /* gateway address */ 6478 NULL, /* no in-srcaddress */ 6479 &gw_ire->ire_max_frag, 6480 NULL, /* no Fast Path header */ 6481 NULL, /* no recv-from queue */ 6482 NULL, /* no send-to queue */ 6483 (ushort_t)type, /* IRE type */ 6484 NULL, 6485 ipif_arg, 6486 NULL, 6487 0, 6488 0, 6489 0, 6490 flags, 6491 &gw_ire->ire_uinfo); /* Inherit ULP info from gw */ 6492 if (ire == NULL) { 6493 ire_refrele(gw_ire); 6494 return (ENOMEM); 6495 } 6496 6497 /* 6498 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 6499 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 6500 */ 6501 6502 /* Add the new IRE. */ 6503 error = ire_add(&ire, q, mp, func); 6504 if (error != 0) { 6505 /* 6506 * In the result of failure, ire_add() will have already 6507 * deleted the ire in question, so there is no need to 6508 * do that here. 6509 */ 6510 ire_refrele(gw_ire); 6511 return (error); 6512 } 6513 6514 if (flags & RTF_MULTIRT) { 6515 /* 6516 * Invoke the CGTP (multirouting) filtering module 6517 * to add the dst address in the filtering database. 6518 * Replicated inbound packets coming from that address 6519 * will be filtered to discard the duplicates. 
6520 * It is not necessary to call the CGTP filter hook 6521 * when the dst address is a broadcast or multicast, 6522 * because an IP source address cannot be a broadcast 6523 * or a multicast. 6524 */ 6525 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 6526 IRE_BROADCAST, NULL, NULL, MATCH_IRE_TYPE); 6527 if (ire_dst != NULL) { 6528 ip_cgtp_bcast_add(ire, ire_dst); 6529 ire_refrele(ire_dst); 6530 goto save_ire; 6531 } 6532 if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr)) { 6533 int res = ip_cgtp_filter_ops->cfo_add_dest_v4( 6534 ire->ire_addr, 6535 ire->ire_gateway_addr, 6536 ire->ire_src_addr, 6537 gw_ire->ire_src_addr); 6538 if (res != 0) { 6539 ire_refrele(gw_ire); 6540 ire_delete(ire); 6541 return (res); 6542 } 6543 } 6544 } 6545 6546 save_ire: 6547 if (gw_ire != NULL) { 6548 ire_refrele(gw_ire); 6549 } 6550 /* 6551 * We do not do save_ire for the routes added with RTA_SRCIFP 6552 * flag. This route is only added and deleted by mipagent. 6553 * So, for simplicity of design, we refrain from saving 6554 * ires that are created with srcif value. This may change 6555 * in future if we find more usage of srcifp feature. 6556 */ 6557 if (ipif != NULL && src_ipif == NULL) { 6558 /* 6559 * Save enough information so that we can recreate the IRE if 6560 * the interface goes down and then up. The metrics associated 6561 * with the route will be saved as well when rts_setmetrics() is 6562 * called after the IRE has been created. In the case where 6563 * memory cannot be allocated, none of this information will be 6564 * saved. 6565 */ 6566 ipif_save_ire(ipif, ire); 6567 } 6568 if (ioctl_msg) 6569 ip_rts_rtmsg(RTM_OLDADD, ire, 0); 6570 if (ire_arg != NULL) { 6571 /* 6572 * Store the ire that was successfully added into where ire_arg 6573 * points to so that callers don't have to look it up 6574 * themselves (but they are responsible for ire_refrele()ing 6575 * the ire when they are finished with it). 
6576 */ 6577 *ire_arg = ire; 6578 } else { 6579 ire_refrele(ire); /* Held in ire_add */ 6580 } 6581 if (ipif_refheld) 6582 ipif_refrele(ipif); 6583 return (0); 6584 } 6585 6586 /* 6587 * ip_rt_delete is called to delete an IPv4 route. 6588 * ipif_arg is passed in to associate it with the correct interface. 6589 * src_ipif is passed to associate the incoming interface of the packet. 6590 * We may need to restart this operation if the ipif cannot be looked up 6591 * due to an exclusive operation that is currently in progress. The restart 6592 * entry point is specified by 'func' 6593 */ 6594 /* ARGSUSED4 */ 6595 int 6596 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6597 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6598 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func) 6599 { 6600 ire_t *ire = NULL; 6601 ipif_t *ipif; 6602 boolean_t ipif_refheld = B_FALSE; 6603 uint_t type; 6604 uint_t match_flags = MATCH_IRE_TYPE; 6605 int err = 0; 6606 6607 ip1dbg(("ip_rt_delete:")); 6608 /* 6609 * If this is the case of RTF_HOST being set, then we set the netmask 6610 * to all ones. Otherwise, we use the netmask if one was supplied. 6611 */ 6612 if (flags & RTF_HOST) { 6613 mask = IP_HOST_MASK; 6614 match_flags |= MATCH_IRE_MASK; 6615 } else if (rtm_addrs & RTA_NETMASK) { 6616 match_flags |= MATCH_IRE_MASK; 6617 } 6618 6619 /* 6620 * Note that RTF_GATEWAY is never set on a delete, therefore 6621 * we check if the gateway address is one of our interfaces first, 6622 * and fall back on RTF_GATEWAY routes. 6623 * 6624 * This makes it possible to delete an original 6625 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 6626 * 6627 * As the interface index specified with the RTA_IFP sockaddr is the 6628 * same for all ipif's off of an ill, the matching logic below uses 6629 * MATCH_IRE_ILL if such an index was specified. 
This means a route 6630 * sharing the same prefix and interface index as the the route 6631 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 6632 * is specified in the request. 6633 * 6634 * On the other hand, since the gateway address will usually be 6635 * different for each ipif on the system, the matching logic 6636 * uses MATCH_IRE_IPIF in the case of a traditional interface 6637 * route. This means that interface routes for the same prefix can be 6638 * uniquely identified if they belong to distinct ipif's and if a 6639 * RTA_IFP sockaddr is not present. 6640 * 6641 * For more detail on specifying routes by gateway address and by 6642 * interface index, see the comments in ip_rt_add(). 6643 * gw_addr could be zero in some cases when both RTA_SRCIFP and 6644 * RTA_IFP are specified. If RTA_SRCIFP is specified and both 6645 * RTA_IFP and gateway_addr are NULL/zero, then delete will not 6646 * succeed. 6647 */ 6648 if (src_ipif != NULL) { 6649 if (ipif_arg == NULL && gw_addr != 0) { 6650 ipif_arg = ipif_lookup_interface(gw_addr, dst_addr, 6651 q, mp, func, &err); 6652 if (ipif_arg != NULL) 6653 ipif_refheld = B_TRUE; 6654 } 6655 if (ipif_arg == NULL) { 6656 err = (err == EINPROGRESS) ? 
err : ESRCH; 6657 return (err); 6658 } 6659 ipif = ipif_arg; 6660 } else { 6661 ipif = ipif_lookup_interface(gw_addr, dst_addr, 6662 q, mp, func, &err); 6663 if (ipif != NULL) 6664 ipif_refheld = B_TRUE; 6665 else if (err == EINPROGRESS) 6666 return (err); 6667 else 6668 err = 0; 6669 } 6670 if (ipif != NULL) { 6671 if (ipif_arg != NULL) { 6672 if (ipif_refheld) { 6673 ipif_refrele(ipif); 6674 ipif_refheld = B_FALSE; 6675 } 6676 ipif = ipif_arg; 6677 match_flags |= MATCH_IRE_ILL; 6678 } else { 6679 match_flags |= MATCH_IRE_IPIF; 6680 } 6681 if (src_ipif != NULL) { 6682 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6683 ipif, src_ipif->ipif_ill, match_flags); 6684 } else { 6685 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 6686 ire = ire_ctable_lookup(dst_addr, 0, 6687 IRE_LOOPBACK, ipif, ALL_ZONES, match_flags); 6688 } 6689 if (ire == NULL) { 6690 ire = ire_ftable_lookup(dst_addr, mask, 0, 6691 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6692 match_flags); 6693 } 6694 } 6695 } 6696 6697 if (ire == NULL) { 6698 /* 6699 * At this point, the gateway address is not one of our own 6700 * addresses or a matching interface route was not found. We 6701 * set the IRE type to lookup based on whether 6702 * this is a host route, a default route or just a prefix. 6703 * 6704 * If an ipif_arg was passed in, then the lookup is based on an 6705 * interface index so MATCH_IRE_ILL is added to match_flags. 6706 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 6707 * set as the route being looked up is not a traditional 6708 * interface route. 6709 * Since we do not add gateway route with srcipif, we don't 6710 * expect to find it either. 
6711 */ 6712 if (src_ipif != NULL) { 6713 if (ipif_refheld) 6714 ipif_refrele(ipif); 6715 return (ESRCH); 6716 } else { 6717 match_flags &= ~MATCH_IRE_IPIF; 6718 match_flags |= MATCH_IRE_GW; 6719 if (ipif_arg != NULL) 6720 match_flags |= MATCH_IRE_ILL; 6721 if (mask == IP_HOST_MASK) 6722 type = IRE_HOST; 6723 else if (mask == 0) 6724 type = IRE_DEFAULT; 6725 else 6726 type = IRE_PREFIX; 6727 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, 6728 ipif_arg, NULL, ALL_ZONES, 0, match_flags); 6729 if (ire == NULL && type == IRE_HOST) { 6730 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, 6731 IRE_HOST_REDIRECT, ipif_arg, NULL, 6732 ALL_ZONES, 0, match_flags); 6733 } 6734 } 6735 } 6736 6737 if (ipif_refheld) 6738 ipif_refrele(ipif); 6739 6740 /* ipif is not refheld anymore */ 6741 if (ire == NULL) 6742 return (ESRCH); 6743 6744 if (ire->ire_flags & RTF_MULTIRT) { 6745 /* 6746 * Invoke the CGTP (multirouting) filtering module 6747 * to remove the dst address from the filtering database. 6748 * Packets coming from that address will no longer be 6749 * filtered to remove duplicates. 6750 */ 6751 if (ip_cgtp_filter_ops != NULL) { 6752 err = ip_cgtp_filter_ops->cfo_del_dest_v4(ire->ire_addr, 6753 ire->ire_gateway_addr); 6754 } 6755 ip_cgtp_bcast_delete(ire); 6756 } 6757 6758 ipif = ire->ire_ipif; 6759 /* 6760 * Removing from ipif_saved_ire_mp is not necessary 6761 * when src_ipif being non-NULL. ip_rt_add does not 6762 * save the ires which src_ipif being non-NULL. 6763 */ 6764 if (ipif != NULL && src_ipif == NULL) { 6765 ipif_remove_ire(ipif, ire); 6766 } 6767 if (ioctl_msg) 6768 ip_rts_rtmsg(RTM_OLDDEL, ire, 0); 6769 ire_delete(ire); 6770 ire_refrele(ire); 6771 return (err); 6772 } 6773 6774 /* 6775 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
6776 */ 6777 /* ARGSUSED */ 6778 int 6779 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6780 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6781 { 6782 ipaddr_t dst_addr; 6783 ipaddr_t gw_addr; 6784 ipaddr_t mask; 6785 int error = 0; 6786 mblk_t *mp1; 6787 struct rtentry *rt; 6788 ipif_t *ipif = NULL; 6789 6790 ip1dbg(("ip_siocaddrt:")); 6791 /* Existence of mp1 verified in ip_wput_nondata */ 6792 mp1 = mp->b_cont->b_cont; 6793 rt = (struct rtentry *)mp1->b_rptr; 6794 6795 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6796 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6797 6798 /* 6799 * If the RTF_HOST flag is on, this is a request to assign a gateway 6800 * to a particular host address. In this case, we set the netmask to 6801 * all ones for the particular destination address. Otherwise, 6802 * determine the netmask to be used based on dst_addr and the interfaces 6803 * in use. 6804 */ 6805 if (rt->rt_flags & RTF_HOST) { 6806 mask = IP_HOST_MASK; 6807 } else { 6808 /* 6809 * Note that ip_subnet_mask returns a zero mask in the case of 6810 * default (an all-zeroes address). 6811 */ 6812 mask = ip_subnet_mask(dst_addr, &ipif); 6813 } 6814 6815 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, 6816 NULL, NULL, NULL, B_TRUE, q, mp, ip_process_ioctl); 6817 if (ipif != NULL) 6818 ipif_refrele(ipif); 6819 return (error); 6820 } 6821 6822 /* 6823 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 
6824 */ 6825 /* ARGSUSED */ 6826 int 6827 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6828 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6829 { 6830 ipaddr_t dst_addr; 6831 ipaddr_t gw_addr; 6832 ipaddr_t mask; 6833 int error; 6834 mblk_t *mp1; 6835 struct rtentry *rt; 6836 ipif_t *ipif = NULL; 6837 6838 ip1dbg(("ip_siocdelrt:")); 6839 /* Existence of mp1 verified in ip_wput_nondata */ 6840 mp1 = mp->b_cont->b_cont; 6841 rt = (struct rtentry *)mp1->b_rptr; 6842 6843 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6844 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6845 6846 /* 6847 * If the RTF_HOST flag is on, this is a request to delete a gateway 6848 * to a particular host address. In this case, we set the netmask to 6849 * all ones for the particular destination address. Otherwise, 6850 * determine the netmask to be used based on dst_addr and the interfaces 6851 * in use. 6852 */ 6853 if (rt->rt_flags & RTF_HOST) { 6854 mask = IP_HOST_MASK; 6855 } else { 6856 /* 6857 * Note that ip_subnet_mask returns a zero mask in the case of 6858 * default (an all-zeroes address). 6859 */ 6860 mask = ip_subnet_mask(dst_addr, &ipif); 6861 } 6862 6863 error = ip_rt_delete(dst_addr, mask, gw_addr, 6864 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL, 6865 B_TRUE, q, mp, ip_process_ioctl); 6866 if (ipif != NULL) 6867 ipif_refrele(ipif); 6868 return (error); 6869 } 6870 6871 /* 6872 * Enqueue the mp onto the ipsq, chained by b_next. 6873 * b_prev stores the function to be executed later, and b_queue the queue 6874 * where this mp originated. 
6875 */ 6876 void 6877 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6878 ill_t *pending_ill) 6879 { 6880 conn_t *connp = NULL; 6881 6882 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6883 ASSERT(func != NULL); 6884 6885 mp->b_queue = q; 6886 mp->b_prev = (void *)func; 6887 mp->b_next = NULL; 6888 6889 switch (type) { 6890 case CUR_OP: 6891 if (ipsq->ipsq_mptail != NULL) { 6892 ASSERT(ipsq->ipsq_mphead != NULL); 6893 ipsq->ipsq_mptail->b_next = mp; 6894 } else { 6895 ASSERT(ipsq->ipsq_mphead == NULL); 6896 ipsq->ipsq_mphead = mp; 6897 } 6898 ipsq->ipsq_mptail = mp; 6899 break; 6900 6901 case NEW_OP: 6902 if (ipsq->ipsq_xopq_mptail != NULL) { 6903 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6904 ipsq->ipsq_xopq_mptail->b_next = mp; 6905 } else { 6906 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6907 ipsq->ipsq_xopq_mphead = mp; 6908 } 6909 ipsq->ipsq_xopq_mptail = mp; 6910 break; 6911 default: 6912 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6913 } 6914 6915 if (CONN_Q(q) && pending_ill != NULL) { 6916 connp = Q_TO_CONN(q); 6917 6918 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6919 connp->conn_oper_pending_ill = pending_ill; 6920 } 6921 } 6922 6923 /* 6924 * Return the mp at the head of the ipsq. After emptying the ipsq 6925 * look at the next ioctl, if this ioctl is complete. Otherwise 6926 * return, we will resume when we complete the current ioctl. 6927 * The current ioctl will wait till it gets a response from the 6928 * driver below. 
6929 */ 6930 static mblk_t * 6931 ipsq_dq(ipsq_t *ipsq) 6932 { 6933 mblk_t *mp; 6934 6935 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6936 6937 mp = ipsq->ipsq_mphead; 6938 if (mp != NULL) { 6939 ipsq->ipsq_mphead = mp->b_next; 6940 if (ipsq->ipsq_mphead == NULL) 6941 ipsq->ipsq_mptail = NULL; 6942 mp->b_next = NULL; 6943 return (mp); 6944 } 6945 if (ipsq->ipsq_current_ipif != NULL) 6946 return (NULL); 6947 mp = ipsq->ipsq_xopq_mphead; 6948 if (mp != NULL) { 6949 ipsq->ipsq_xopq_mphead = mp->b_next; 6950 if (ipsq->ipsq_xopq_mphead == NULL) 6951 ipsq->ipsq_xopq_mptail = NULL; 6952 mp->b_next = NULL; 6953 return (mp); 6954 } 6955 return (NULL); 6956 } 6957 6958 /* 6959 * Enter the ipsq corresponding to ill, by waiting synchronously till 6960 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 6961 * will have to drain completely before ipsq_enter returns success. 6962 * ipsq_current_ipif will be set if some exclusive ioctl is in progress, 6963 * and the ipsq_exit logic will start the next enqueued ioctl after 6964 * completion of the current ioctl. If 'force' is used, we don't wait 6965 * for the enqueued ioctls. This is needed when a conn_close wants to 6966 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 6967 * of an ill can also use this option. But we dont' use it currently. 6968 */ 6969 #define ENTER_SQ_WAIT_TICKS 100 6970 boolean_t 6971 ipsq_enter(ill_t *ill, boolean_t force) 6972 { 6973 ipsq_t *ipsq; 6974 boolean_t waited_enough = B_FALSE; 6975 6976 /* 6977 * Holding the ill_lock prevents <ill-ipsq> assocs from changing. 6978 * Since the <ill-ipsq> assocs could change while we wait for the 6979 * writer, it is easier to wait on a fixed global rather than try to 6980 * cv_wait on a changing ipsq. 
6981 */ 6982 mutex_enter(&ill->ill_lock); 6983 for (;;) { 6984 if (ill->ill_state_flags & ILL_CONDEMNED) { 6985 mutex_exit(&ill->ill_lock); 6986 return (B_FALSE); 6987 } 6988 6989 ipsq = ill->ill_phyint->phyint_ipsq; 6990 mutex_enter(&ipsq->ipsq_lock); 6991 if (ipsq->ipsq_writer == NULL && 6992 (ipsq->ipsq_current_ipif == NULL || waited_enough)) { 6993 break; 6994 } else if (ipsq->ipsq_writer != NULL) { 6995 mutex_exit(&ipsq->ipsq_lock); 6996 cv_wait(&ill->ill_cv, &ill->ill_lock); 6997 } else { 6998 mutex_exit(&ipsq->ipsq_lock); 6999 if (force) { 7000 (void) cv_timedwait(&ill->ill_cv, 7001 &ill->ill_lock, 7002 lbolt + ENTER_SQ_WAIT_TICKS); 7003 waited_enough = B_TRUE; 7004 continue; 7005 } else { 7006 cv_wait(&ill->ill_cv, &ill->ill_lock); 7007 } 7008 } 7009 } 7010 7011 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); 7012 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7013 ipsq->ipsq_writer = curthread; 7014 ipsq->ipsq_reentry_cnt++; 7015 #ifdef ILL_DEBUG 7016 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7017 #endif 7018 mutex_exit(&ipsq->ipsq_lock); 7019 mutex_exit(&ill->ill_lock); 7020 return (B_TRUE); 7021 } 7022 7023 /* 7024 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7025 * certain critical operations like plumbing (i.e. most set ioctls), 7026 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP 7027 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per 7028 * IPMP group. The ipsq serializes exclusive ioctls issued by applications 7029 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple 7030 * threads executing in the ipsq. Responses from the driver pertain to the 7031 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated 7032 * as part of bringing up the interface) and are enqueued in ipsq_mphead. 
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point (to be called
 * later when the ipsq is empty) nor any code path starting from that reentry
 * point ever tries to enter the ipsq again. Otherwise it can lead
 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (arp or the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
 * ipsq_current_ipif is clear, which happens only on ioctl completion.
 */

/*
 * Try to enter the ipsq exclusively, corresponding to ipif or ill (only 1 of
 * ipif or ill can be specified). The caller ensures ipif or ill is valid by
 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
 * for completion.
7057 */ 7058 ipsq_t * 7059 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7060 ipsq_func_t func, int type, boolean_t reentry_ok) 7061 { 7062 ipsq_t *ipsq; 7063 7064 /* Only 1 of ipif or ill can be specified */ 7065 ASSERT((ipif != NULL) ^ (ill != NULL)); 7066 if (ipif != NULL) 7067 ill = ipif->ipif_ill; 7068 7069 /* 7070 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 7071 * ipsq of an ill can't change when ill_lock is held. 7072 */ 7073 GRAB_CONN_LOCK(q); 7074 mutex_enter(&ill->ill_lock); 7075 ipsq = ill->ill_phyint->phyint_ipsq; 7076 mutex_enter(&ipsq->ipsq_lock); 7077 7078 /* 7079 * 1. Enter the ipsq if we are already writer and reentry is ok. 7080 * (Note: If the caller does not specify reentry_ok then neither 7081 * 'func' nor any of its callees must ever attempt to enter the ipsq 7082 * again. Otherwise it can lead to an infinite loop 7083 * 2. Enter the ipsq if there is no current writer and this attempted 7084 * entry is part of the current ioctl or operation 7085 * 3. Enter the ipsq if there is no current writer and this is a new 7086 * ioctl (or operation) and the ioctl (or operation) queue is 7087 * empty and there is no ioctl (or operation) currently in progress 7088 */ 7089 if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || 7090 (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && 7091 ipsq->ipsq_current_ipif == NULL))) || 7092 (ipsq->ipsq_writer == curthread && reentry_ok)) { 7093 /* Success. 
*/ 7094 ipsq->ipsq_reentry_cnt++; 7095 ipsq->ipsq_writer = curthread; 7096 mutex_exit(&ipsq->ipsq_lock); 7097 mutex_exit(&ill->ill_lock); 7098 RELEASE_CONN_LOCK(q); 7099 #ifdef ILL_DEBUG 7100 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7101 #endif 7102 return (ipsq); 7103 } 7104 7105 ipsq_enq(ipsq, q, mp, func, type, ill); 7106 7107 mutex_exit(&ipsq->ipsq_lock); 7108 mutex_exit(&ill->ill_lock); 7109 RELEASE_CONN_LOCK(q); 7110 return (NULL); 7111 } 7112 7113 /* 7114 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of 7115 * ipif or ill can be specified). The caller ensures ipif or ill is valid by 7116 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7117 * completion. 7118 * 7119 * This function does a refrele on the ipif/ill. 7120 */ 7121 void 7122 qwriter_ip(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7123 ipsq_func_t func, int type, boolean_t reentry_ok) 7124 { 7125 ipsq_t *ipsq; 7126 7127 ipsq = ipsq_try_enter(ipif, ill, q, mp, func, type, reentry_ok); 7128 /* 7129 * Caller must have done a refhold on the ipif. ipif_refrele 7130 * happens on the passed ipif. We can do this since we are 7131 * already exclusive, or we won't access ipif henceforth, Both 7132 * this func and caller will just return if we ipsq_try_enter 7133 * fails above. This is needed because func needs to 7134 * see the correct refcount. Eg. removeif can work only then. 7135 */ 7136 if (ipif != NULL) 7137 ipif_refrele(ipif); 7138 else 7139 ill_refrele(ill); 7140 if (ipsq != NULL) { 7141 (*func)(ipsq, q, mp, NULL); 7142 ipsq_exit(ipsq, B_TRUE, B_TRUE); 7143 } 7144 } 7145 7146 /* 7147 * If there are more than ILL_GRP_CNT ills in a group, 7148 * we use kmem alloc'd buffers, else use the stack 7149 */ 7150 #define ILL_GRP_CNT 14 7151 /* 7152 * Drain the ipsq, if there are messages on it, and then leave the ipsq. 7153 * Called by a thread that is currently exclusive on this ipsq. 
7154 */ 7155 void 7156 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer) 7157 { 7158 queue_t *q; 7159 mblk_t *mp; 7160 ipsq_func_t func; 7161 int next; 7162 ill_t **ill_list = NULL; 7163 size_t ill_list_size = 0; 7164 int cnt = 0; 7165 boolean_t need_ipsq_free = B_FALSE; 7166 7167 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7168 mutex_enter(&ipsq->ipsq_lock); 7169 ASSERT(ipsq->ipsq_reentry_cnt >= 1); 7170 if (ipsq->ipsq_reentry_cnt != 1) { 7171 ipsq->ipsq_reentry_cnt--; 7172 mutex_exit(&ipsq->ipsq_lock); 7173 return; 7174 } 7175 7176 mp = ipsq_dq(ipsq); 7177 while (mp != NULL) { 7178 again: 7179 mutex_exit(&ipsq->ipsq_lock); 7180 func = (ipsq_func_t)mp->b_prev; 7181 q = (queue_t *)mp->b_queue; 7182 mp->b_prev = NULL; 7183 mp->b_queue = NULL; 7184 7185 /* 7186 * If 'q' is an conn queue, it is valid, since we did a 7187 * a refhold on the connp, at the start of the ioctl. 7188 * If 'q' is an ill queue, it is valid, since close of an 7189 * ill will clean up the 'ipsq'. 7190 */ 7191 (*func)(ipsq, q, mp, NULL); 7192 7193 mutex_enter(&ipsq->ipsq_lock); 7194 mp = ipsq_dq(ipsq); 7195 } 7196 7197 mutex_exit(&ipsq->ipsq_lock); 7198 7199 /* 7200 * Need to grab the locks in the right order. Need to 7201 * atomically check (under ipsq_lock) that there are no 7202 * messages before relinquishing the ipsq. Also need to 7203 * atomically wakeup waiters on ill_cv while holding ill_lock. 7204 * Holding ill_g_lock ensures that ipsq list of ills is stable. 7205 * If we need to call ill_split_ipsq and change <ill-ipsq> we need 7206 * to grab ill_g_lock as writer. 7207 */ 7208 rw_enter(&ill_g_lock, ipsq->ipsq_split ? 
RW_WRITER : RW_READER); 7209 7210 /* ipsq_refs can't change while ill_g_lock is held as reader */ 7211 if (ipsq->ipsq_refs != 0) { 7212 /* At most 2 ills v4/v6 per phyint */ 7213 cnt = ipsq->ipsq_refs << 1; 7214 ill_list_size = cnt * sizeof (ill_t *); 7215 /* 7216 * If memory allocation fails, we will do the split 7217 * the next time ipsq_exit is called for whatever reason. 7218 * As long as the ipsq_split flag is set the need to 7219 * split is remembered. 7220 */ 7221 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 7222 if (ill_list != NULL) 7223 cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); 7224 } 7225 mutex_enter(&ipsq->ipsq_lock); 7226 mp = ipsq_dq(ipsq); 7227 if (mp != NULL) { 7228 /* oops, some message has landed up, we can't get out */ 7229 if (ill_list != NULL) 7230 ill_unlock_ills(ill_list, cnt); 7231 rw_exit(&ill_g_lock); 7232 if (ill_list != NULL) 7233 kmem_free(ill_list, ill_list_size); 7234 ill_list = NULL; 7235 ill_list_size = 0; 7236 cnt = 0; 7237 goto again; 7238 } 7239 7240 /* 7241 * Split only if no ioctl is pending and if memory alloc succeeded 7242 * above. 7243 */ 7244 if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && 7245 ill_list != NULL) { 7246 /* 7247 * No new ill can join this ipsq since we are holding the 7248 * ill_g_lock. Hence ill_split_ipsq can safely traverse the 7249 * ipsq. ill_split_ipsq may fail due to memory shortage. 7250 * If so we will retry on the next ipsq_exit. 7251 */ 7252 ipsq->ipsq_split = ill_split_ipsq(ipsq); 7253 } 7254 7255 /* 7256 * We are holding the ipsq lock, hence no new messages can 7257 * land up on the ipsq, and there are no messages currently. 7258 * Now safe to get out. Wake up waiters and relinquish ipsq 7259 * atomically while holding ill locks. 
7260 */ 7261 ipsq->ipsq_writer = NULL; 7262 ipsq->ipsq_reentry_cnt--; 7263 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7264 #ifdef ILL_DEBUG 7265 ipsq->ipsq_depth = 0; 7266 #endif 7267 mutex_exit(&ipsq->ipsq_lock); 7268 /* 7269 * For IPMP this should wake up all ills in this ipsq. 7270 * We need to hold the ill_lock while waking up waiters to 7271 * avoid missed wakeups. But there is no need to acquire all 7272 * the ill locks and then wakeup. If we have not acquired all 7273 * the locks (due to memory failure above) ill_signal_ipsq_ills 7274 * wakes up ills one at a time after getting the right ill_lock 7275 */ 7276 ill_signal_ipsq_ills(ipsq, ill_list != NULL); 7277 if (ill_list != NULL) 7278 ill_unlock_ills(ill_list, cnt); 7279 if (ipsq->ipsq_refs == 0) 7280 need_ipsq_free = B_TRUE; 7281 rw_exit(&ill_g_lock); 7282 if (ill_list != 0) 7283 kmem_free(ill_list, ill_list_size); 7284 7285 if (need_ipsq_free) { 7286 /* 7287 * Free the ipsq. ipsq_refs can't increase because ipsq can't be 7288 * looked up. ipsq can be looked up only thru ill or phyint 7289 * and there are no ills/phyint on this ipsq. 7290 */ 7291 ipsq_delete(ipsq); 7292 } 7293 /* 7294 * Now start any igmp or mld timers that could not be started 7295 * while inside the ipsq. The timers can't be started while inside 7296 * the ipsq, since igmp_start_timers may need to call untimeout() 7297 * which can't be done while holding a lock i.e. the ipsq. Otherwise 7298 * there could be a deadlock since the timeout handlers 7299 * mld_timeout_handler / igmp_timeout_handler also synchronously 7300 * wait in ipsq_enter() trying to get the ipsq. 7301 * 7302 * However there is one exception to the above. If this thread is 7303 * itself the igmp/mld timeout handler thread, then we don't want 7304 * to start any new timer until the current handler is done. The 7305 * handler thread passes in B_FALSE for start_igmp/mld_timers, while 7306 * all others pass B_TRUE. 
7307 */ 7308 if (start_igmp_timer) { 7309 mutex_enter(&igmp_timer_lock); 7310 next = igmp_deferred_next; 7311 igmp_deferred_next = INFINITY; 7312 mutex_exit(&igmp_timer_lock); 7313 7314 if (next != INFINITY) 7315 igmp_start_timers(next); 7316 } 7317 7318 if (start_mld_timer) { 7319 mutex_enter(&mld_timer_lock); 7320 next = mld_deferred_next; 7321 mld_deferred_next = INFINITY; 7322 mutex_exit(&mld_timer_lock); 7323 7324 if (next != INFINITY) 7325 mld_start_timers(next); 7326 } 7327 } 7328 7329 /* 7330 * The ill is closing. Flush all messages on the ipsq that originated 7331 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 7332 * for this ill since ipsq_enter could not have entered until then. 7333 * New messages can't be queued since the CONDEMNED flag is set. 7334 */ 7335 static void 7336 ipsq_flush(ill_t *ill) 7337 { 7338 queue_t *q; 7339 mblk_t *prev; 7340 mblk_t *mp; 7341 mblk_t *mp_next; 7342 ipsq_t *ipsq; 7343 7344 ASSERT(IAM_WRITER_ILL(ill)); 7345 ipsq = ill->ill_phyint->phyint_ipsq; 7346 /* 7347 * Flush any messages sent up by the driver. 7348 */ 7349 mutex_enter(&ipsq->ipsq_lock); 7350 for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { 7351 mp_next = mp->b_next; 7352 q = mp->b_queue; 7353 if (q == ill->ill_rq || q == ill->ill_wq) { 7354 /* Remove the mp from the ipsq */ 7355 if (prev == NULL) 7356 ipsq->ipsq_mphead = mp->b_next; 7357 else 7358 prev->b_next = mp->b_next; 7359 if (ipsq->ipsq_mptail == mp) { 7360 ASSERT(mp_next == NULL); 7361 ipsq->ipsq_mptail = prev; 7362 } 7363 inet_freemsg(mp); 7364 } else { 7365 prev = mp; 7366 } 7367 } 7368 mutex_exit(&ipsq->ipsq_lock); 7369 (void) ipsq_pending_mp_cleanup(ill, NULL); 7370 ipsq_xopq_mp_cleanup(ill, NULL); 7371 ill_pending_mp_cleanup(ill); 7372 } 7373 7374 /* 7375 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock. 
7376 * The real cleanup happens behind the squeue via ip_squeue_clean function but 7377 * we need to protect ourselfs from 2 threads trying to cleanup at the same 7378 * time (possible with one port going down for aggr and someone tearing down the 7379 * entire aggr simultaneously. So we use ill_inuse_ref protected by ill_lock 7380 * to indicate when the cleanup has started (1 ref) and when the cleanup 7381 * is done (0 ref). When a new ring gets assigned to squeue, we start by 7382 * putting 2 ref on ill_inuse_ref. 7383 */ 7384 static void 7385 ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) 7386 { 7387 conn_t *connp; 7388 squeue_t *sqp; 7389 mblk_t *mp; 7390 7391 ASSERT(rx_ring != NULL); 7392 7393 /* Just clean one squeue */ 7394 mutex_enter(&ill->ill_lock); 7395 /* 7396 * Reset the ILL_SOFT_RING_ASSIGN bit so that 7397 * ip_squeue_soft_ring_affinty() will not go 7398 * ahead with assigning rings. 7399 */ 7400 ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; 7401 while (rx_ring->rr_ring_state == ILL_RING_INPROC) 7402 /* Some operations pending on the ring. Wait */ 7403 cv_wait(&ill->ill_cv, &ill->ill_lock); 7404 7405 if (rx_ring->rr_ring_state != ILL_RING_INUSE) { 7406 /* 7407 * Someone already trying to clean 7408 * this squeue or its already been cleaned. 7409 */ 7410 mutex_exit(&ill->ill_lock); 7411 return; 7412 } 7413 sqp = rx_ring->rr_sqp; 7414 7415 if (sqp == NULL) { 7416 /* 7417 * The rx_ring never had a squeue assigned to it. 7418 * We are under ill_lock so we can clean it up 7419 * here itself since no one can get to it. 
7420 */ 7421 rx_ring->rr_blank = NULL; 7422 rx_ring->rr_handle = NULL; 7423 rx_ring->rr_sqp = NULL; 7424 rx_ring->rr_ring_state = ILL_RING_FREE; 7425 mutex_exit(&ill->ill_lock); 7426 return; 7427 } 7428 7429 /* Set the state that its being cleaned */ 7430 rx_ring->rr_ring_state = ILL_RING_BEING_FREED; 7431 ASSERT(sqp != NULL); 7432 mutex_exit(&ill->ill_lock); 7433 7434 /* 7435 * Use the preallocated ill_unbind_conn for this purpose 7436 */ 7437 connp = ill->ill_dls_capab->ill_unbind_conn; 7438 mp = &connp->conn_tcp->tcp_closemp; 7439 CONN_INC_REF(connp); 7440 squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); 7441 7442 mutex_enter(&ill->ill_lock); 7443 while (rx_ring->rr_ring_state != ILL_RING_FREE) 7444 cv_wait(&ill->ill_cv, &ill->ill_lock); 7445 7446 mutex_exit(&ill->ill_lock); 7447 } 7448 7449 static void 7450 ipsq_clean_all(ill_t *ill) 7451 { 7452 int idx; 7453 7454 /* 7455 * No need to clean if poll_capab isn't set for this ill 7456 */ 7457 if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) 7458 return; 7459 7460 for (idx = 0; idx < ILL_MAX_RINGS; idx++) { 7461 ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; 7462 ipsq_clean_ring(ill, ipr); 7463 } 7464 7465 ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); 7466 } 7467 7468 /* ARGSUSED */ 7469 int 7470 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7471 ip_ioctl_cmd_t *ipip, void *ifreq) 7472 { 7473 ill_t *ill; 7474 struct lifreq *lifr = (struct lifreq *)ifreq; 7475 boolean_t isv6; 7476 conn_t *connp; 7477 7478 connp = Q_TO_CONN(q); 7479 isv6 = connp->conn_af_isv6; 7480 /* 7481 * Set original index. 7482 * Failover and failback move logical interfaces 7483 * from one physical interface to another. The 7484 * original index indicates the parent of a logical 7485 * interface, in other words, the physical interface 7486 * the logical interface will be moved back to on 7487 * failback. 
7488 */ 7489 7490 /* 7491 * Don't allow the original index to be changed 7492 * for non-failover addresses, autoconfigured 7493 * addresses, or IPv6 link local addresses. 7494 */ 7495 if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || 7496 (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { 7497 return (EINVAL); 7498 } 7499 /* 7500 * The new original index must be in use by some 7501 * physical interface. 7502 */ 7503 ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, 7504 NULL, NULL); 7505 if (ill == NULL) 7506 return (ENXIO); 7507 ill_refrele(ill); 7508 7509 ipif->ipif_orig_ifindex = lifr->lifr_index; 7510 /* 7511 * When this ipif gets failed back, don't 7512 * preserve the original id, as it is no 7513 * longer applicable. 7514 */ 7515 ipif->ipif_orig_ipifid = 0; 7516 /* 7517 * For IPv4, change the original index of any 7518 * multicast addresses associated with the 7519 * ipif to the new value. 7520 */ 7521 if (!isv6) { 7522 ilm_t *ilm; 7523 7524 mutex_enter(&ipif->ipif_ill->ill_lock); 7525 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 7526 ilm = ilm->ilm_next) { 7527 if (ilm->ilm_ipif == ipif) { 7528 ilm->ilm_orig_ifindex = lifr->lifr_index; 7529 } 7530 } 7531 mutex_exit(&ipif->ipif_ill->ill_lock); 7532 } 7533 return (0); 7534 } 7535 7536 /* ARGSUSED */ 7537 int 7538 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 7539 ip_ioctl_cmd_t *ipip, void *ifreq) 7540 { 7541 struct lifreq *lifr = (struct lifreq *)ifreq; 7542 7543 /* 7544 * Get the original interface index i.e the one 7545 * before FAILOVER if it ever happened. 
7546 */ 7547 lifr->lifr_index = ipif->ipif_orig_ifindex; 7548 return (0); 7549 } 7550 7551 /* 7552 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, 7553 * refhold and return the associated ipif 7554 */ 7555 int 7556 ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func) 7557 { 7558 boolean_t exists; 7559 struct iftun_req *ta; 7560 ipif_t *ipif; 7561 ill_t *ill; 7562 boolean_t isv6; 7563 mblk_t *mp1; 7564 int error; 7565 conn_t *connp; 7566 7567 /* Existence verified in ip_wput_nondata */ 7568 mp1 = mp->b_cont->b_cont; 7569 ta = (struct iftun_req *)mp1->b_rptr; 7570 /* 7571 * Null terminate the string to protect against buffer 7572 * overrun. String was generated by user code and may not 7573 * be trusted. 7574 */ 7575 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 7576 7577 connp = Q_TO_CONN(q); 7578 isv6 = connp->conn_af_isv6; 7579 7580 /* Disallows implicit create */ 7581 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 7582 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 7583 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error); 7584 if (ipif == NULL) 7585 return (error); 7586 7587 if (ipif->ipif_id != 0) { 7588 /* 7589 * We really don't want to set/get tunnel parameters 7590 * on virtual tunnel interfaces. Only allow the 7591 * base tunnel to do these. 7592 */ 7593 ipif_refrele(ipif); 7594 return (EINVAL); 7595 } 7596 7597 /* 7598 * Send down to tunnel mod for ioctl processing. 7599 * Will finish ioctl in ip_rput_other(). 7600 */ 7601 ill = ipif->ipif_ill; 7602 if (ill->ill_net_type == IRE_LOOPBACK) { 7603 ipif_refrele(ipif); 7604 return (EOPNOTSUPP); 7605 } 7606 7607 if (ill->ill_wq == NULL) { 7608 ipif_refrele(ipif); 7609 return (ENXIO); 7610 } 7611 /* 7612 * Mark the ioctl as coming from an IPv6 interface for 7613 * tun's convenience. 
7614 */ 7615 if (ill->ill_isv6) 7616 ta->ifta_flags |= 0x80000000; 7617 *ipifp = ipif; 7618 return (0); 7619 } 7620 7621 /* 7622 * Parse an ifreq or lifreq struct coming down ioctls and refhold 7623 * and return the associated ipif. 7624 * Return value: 7625 * Non zero: An error has occurred. ci may not be filled out. 7626 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7627 * a held ipif in ci.ci_ipif. 7628 */ 7629 int 7630 ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags, 7631 cmd_info_t *ci, ipsq_func_t func) 7632 { 7633 sin_t *sin; 7634 sin6_t *sin6; 7635 char *name; 7636 struct ifreq *ifr; 7637 struct lifreq *lifr; 7638 ipif_t *ipif = NULL; 7639 ill_t *ill; 7640 conn_t *connp; 7641 boolean_t isv6; 7642 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7643 boolean_t exists; 7644 int err; 7645 mblk_t *mp1; 7646 zoneid_t zoneid; 7647 7648 if (q->q_next != NULL) { 7649 ill = (ill_t *)q->q_ptr; 7650 isv6 = ill->ill_isv6; 7651 connp = NULL; 7652 zoneid = ALL_ZONES; 7653 } else { 7654 ill = NULL; 7655 connp = Q_TO_CONN(q); 7656 isv6 = connp->conn_af_isv6; 7657 zoneid = connp->conn_zoneid; 7658 if (zoneid == GLOBAL_ZONEID) { 7659 /* global zone can access ipifs in all zones */ 7660 zoneid = ALL_ZONES; 7661 } 7662 } 7663 7664 /* Has been checked in ip_wput_nondata */ 7665 mp1 = mp->b_cont->b_cont; 7666 7667 7668 if (cmd_type == IF_CMD) { 7669 /* This a old style SIOC[GS]IF* command */ 7670 ifr = (struct ifreq *)mp1->b_rptr; 7671 /* 7672 * Null terminate the string to protect against buffer 7673 * overrun. String was generated by user code and may not 7674 * be trusted. 
7675 */ 7676 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7677 sin = (sin_t *)&ifr->ifr_addr; 7678 name = ifr->ifr_name; 7679 ci->ci_sin = sin; 7680 ci->ci_sin6 = NULL; 7681 ci->ci_lifr = (struct lifreq *)ifr; 7682 } else { 7683 /* This a new style SIOC[GS]LIF* command */ 7684 ASSERT(cmd_type == LIF_CMD); 7685 lifr = (struct lifreq *)mp1->b_rptr; 7686 /* 7687 * Null terminate the string to protect against buffer 7688 * overrun. String was generated by user code and may not 7689 * be trusted. 7690 */ 7691 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7692 name = lifr->lifr_name; 7693 sin = (sin_t *)&lifr->lifr_addr; 7694 sin6 = (sin6_t *)&lifr->lifr_addr; 7695 if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) { 7696 (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, 7697 LIFNAMSIZ); 7698 } 7699 ci->ci_sin = sin; 7700 ci->ci_sin6 = sin6; 7701 ci->ci_lifr = lifr; 7702 } 7703 7704 7705 if (iocp->ioc_cmd == SIOCSLIFNAME) { 7706 /* 7707 * The ioctl will be failed if the ioctl comes down 7708 * an conn stream 7709 */ 7710 if (ill == NULL) { 7711 /* 7712 * Not an ill queue, return EINVAL same as the 7713 * old error code. 7714 */ 7715 return (ENXIO); 7716 } 7717 ipif = ill->ill_ipif; 7718 ipif_refhold(ipif); 7719 } else { 7720 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 7721 &exists, isv6, zoneid, 7722 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err); 7723 if (ipif == NULL) { 7724 if (err == EINPROGRESS) 7725 return (err); 7726 if (iocp->ioc_cmd == SIOCLIFFAILOVER || 7727 iocp->ioc_cmd == SIOCLIFFAILBACK) { 7728 /* 7729 * Need to try both v4 and v6 since this 7730 * ioctl can come down either v4 or v6 7731 * socket. The lifreq.lifr_family passed 7732 * down by this ioctl is AF_UNSPEC. 7733 */ 7734 ipif = ipif_lookup_on_name(name, 7735 mi_strlen(name), B_FALSE, &exists, !isv6, 7736 zoneid, (connp == NULL) ? 
q : 7737 CONNP_TO_WQ(connp), mp, func, &err); 7738 if (err == EINPROGRESS) 7739 return (err); 7740 } 7741 err = 0; /* Ensure we don't use it below */ 7742 } 7743 } 7744 7745 /* 7746 * Old style [GS]IFCMD does not admit IPv6 ipif 7747 */ 7748 if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) { 7749 ipif_refrele(ipif); 7750 return (ENXIO); 7751 } 7752 7753 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7754 name[0] == '\0') { 7755 /* 7756 * Handle a or a SIOC?IF* with a null name 7757 * during plumb (on the ill queue before the I_PLINK). 7758 */ 7759 ipif = ill->ill_ipif; 7760 ipif_refhold(ipif); 7761 } 7762 7763 if (ipif == NULL) 7764 return (ENXIO); 7765 7766 /* 7767 * Allow only GET operations if this ipif has been created 7768 * temporarily due to a MOVE operation. 7769 */ 7770 if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) { 7771 ipif_refrele(ipif); 7772 return (EINVAL); 7773 } 7774 7775 ci->ci_ipif = ipif; 7776 return (0); 7777 } 7778 7779 /* 7780 * Return the total number of ipifs. 7781 */ 7782 static uint_t 7783 ip_get_numifs(zoneid_t zoneid) 7784 { 7785 uint_t numifs = 0; 7786 ill_t *ill; 7787 ill_walk_context_t ctx; 7788 ipif_t *ipif; 7789 7790 rw_enter(&ill_g_lock, RW_READER); 7791 ill = ILL_START_WALK_V4(&ctx); 7792 7793 while (ill != NULL) { 7794 for (ipif = ill->ill_ipif; ipif != NULL; 7795 ipif = ipif->ipif_next) { 7796 if (ipif->ipif_zoneid == zoneid) 7797 numifs++; 7798 } 7799 ill = ill_next(&ctx, ill); 7800 } 7801 rw_exit(&ill_g_lock); 7802 return (numifs); 7803 } 7804 7805 /* 7806 * Return the total number of ipifs. 
7807 */ 7808 static uint_t 7809 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid) 7810 { 7811 uint_t numifs = 0; 7812 ill_t *ill; 7813 ipif_t *ipif; 7814 ill_walk_context_t ctx; 7815 7816 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7817 7818 rw_enter(&ill_g_lock, RW_READER); 7819 if (family == AF_INET) 7820 ill = ILL_START_WALK_V4(&ctx); 7821 else if (family == AF_INET6) 7822 ill = ILL_START_WALK_V6(&ctx); 7823 else 7824 ill = ILL_START_WALK_ALL(&ctx); 7825 7826 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7827 for (ipif = ill->ill_ipif; ipif != NULL; 7828 ipif = ipif->ipif_next) { 7829 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7830 !(lifn_flags & LIFC_NOXMIT)) 7831 continue; 7832 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7833 !(lifn_flags & LIFC_TEMPORARY)) 7834 continue; 7835 if (((ipif->ipif_flags & 7836 (IPIF_NOXMIT|IPIF_NOLOCAL| 7837 IPIF_DEPRECATED)) || 7838 (ill->ill_phyint->phyint_flags & 7839 PHYI_LOOPBACK) || 7840 !(ipif->ipif_flags & IPIF_UP)) && 7841 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7842 continue; 7843 7844 if (zoneid != ipif->ipif_zoneid && 7845 (zoneid != GLOBAL_ZONEID || 7846 !(lifn_flags & LIFC_ALLZONES))) 7847 continue; 7848 7849 numifs++; 7850 } 7851 } 7852 rw_exit(&ill_g_lock); 7853 return (numifs); 7854 } 7855 7856 uint_t 7857 ip_get_lifsrcofnum(ill_t *ill) 7858 { 7859 uint_t numifs = 0; 7860 ill_t *ill_head = ill; 7861 7862 /* 7863 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7864 * other thread may be trying to relink the ILLs in this usesrc group 7865 * and adjusting the ill_usesrc_grp_next pointers 7866 */ 7867 rw_enter(&ill_g_usesrc_lock, RW_READER); 7868 if ((ill->ill_usesrc_ifindex == 0) && 7869 (ill->ill_usesrc_grp_next != NULL)) { 7870 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 7871 ill = ill->ill_usesrc_grp_next) 7872 numifs++; 7873 } 7874 rw_exit(&ill_g_usesrc_lock); 7875 7876 return (numifs); 7877 } 7878 7879 /* Null values are passed in for 
ipif, sin, and ifreq */ 7880 /* ARGSUSED */ 7881 int 7882 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7883 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7884 { 7885 int *nump; 7886 7887 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7888 7889 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7890 nump = (int *)mp->b_cont->b_cont->b_rptr; 7891 7892 *nump = ip_get_numifs(Q_TO_CONN(q)->conn_zoneid); 7893 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7894 return (0); 7895 } 7896 7897 /* Null values are passed in for ipif, sin, and ifreq */ 7898 /* ARGSUSED */ 7899 int 7900 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7901 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7902 { 7903 struct lifnum *lifn; 7904 mblk_t *mp1; 7905 7906 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7907 7908 /* Existence checked in ip_wput_nondata */ 7909 mp1 = mp->b_cont->b_cont; 7910 7911 lifn = (struct lifnum *)mp1->b_rptr; 7912 switch (lifn->lifn_family) { 7913 case AF_UNSPEC: 7914 case AF_INET: 7915 case AF_INET6: 7916 break; 7917 default: 7918 return (EAFNOSUPPORT); 7919 } 7920 7921 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 7922 Q_TO_CONN(q)->conn_zoneid); 7923 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 7924 return (0); 7925 } 7926 7927 /* ARGSUSED */ 7928 int 7929 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7930 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7931 { 7932 STRUCT_HANDLE(ifconf, ifc); 7933 mblk_t *mp1; 7934 struct iocblk *iocp; 7935 struct ifreq *ifr; 7936 ill_walk_context_t ctx; 7937 ill_t *ill; 7938 ipif_t *ipif; 7939 struct sockaddr_in *sin; 7940 int32_t ifclen; 7941 zoneid_t zoneid; 7942 7943 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 7944 7945 ip1dbg(("ip_sioctl_get_ifconf")); 7946 /* Existence verified in ip_wput_nondata */ 7947 mp1 = mp->b_cont->b_cont; 
7948 iocp = (struct iocblk *)mp->b_rptr; 7949 zoneid = Q_TO_CONN(q)->conn_zoneid; 7950 7951 /* 7952 * The original SIOCGIFCONF passed in a struct ifconf which specified 7953 * the user buffer address and length into which the list of struct 7954 * ifreqs was to be copied. Since AT&T Streams does not seem to 7955 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 7956 * the SIOCGIFCONF operation was redefined to simply provide 7957 * a large output buffer into which we are supposed to jam the ifreq 7958 * array. The same ioctl command code was used, despite the fact that 7959 * both the applications and the kernel code had to change, thus making 7960 * it impossible to support both interfaces. 7961 * 7962 * For reasons not good enough to try to explain, the following 7963 * algorithm is used for deciding what to do with one of these: 7964 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 7965 * form with the output buffer coming down as the continuation message. 7966 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 7967 * and we have to copy in the ifconf structure to find out how big the 7968 * output buffer is and where to copy out to. Sure no problem... 7969 * 7970 */ 7971 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 7972 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 7973 int numifs = 0; 7974 size_t ifc_bufsize; 7975 7976 /* 7977 * Must be (better be!) continuation of a TRANSPARENT 7978 * IOCTL. We just copied in the ifconf structure. 7979 */ 7980 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 7981 (struct ifconf *)mp1->b_rptr); 7982 7983 /* 7984 * Allocate a buffer to hold requested information. 7985 * 7986 * If ifc_len is larger than what is needed, we only 7987 * allocate what we will use. 7988 * 7989 * If ifc_len is smaller than what is needed, return 7990 * EINVAL. 
7991 * 7992 * XXX: the ill_t structure can hava 2 counters, for 7993 * v4 and v6 (not just ill_ipif_up_count) to store the 7994 * number of interfaces for a device, so we don't need 7995 * to count them here... 7996 */ 7997 numifs = ip_get_numifs(zoneid); 7998 7999 ifclen = STRUCT_FGET(ifc, ifc_len); 8000 ifc_bufsize = numifs * sizeof (struct ifreq); 8001 if (ifc_bufsize > ifclen) { 8002 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8003 /* old behaviour */ 8004 return (EINVAL); 8005 } else { 8006 ifc_bufsize = ifclen; 8007 } 8008 } 8009 8010 mp1 = mi_copyout_alloc(q, mp, 8011 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8012 if (mp1 == NULL) 8013 return (ENOMEM); 8014 8015 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8016 } 8017 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8018 /* 8019 * the SIOCGIFCONF ioctl only knows about 8020 * IPv4 addresses, so don't try to tell 8021 * it about interfaces with IPv6-only 8022 * addresses. (Last parm 'isv6' is B_FALSE) 8023 */ 8024 8025 ifr = (struct ifreq *)mp1->b_rptr; 8026 8027 rw_enter(&ill_g_lock, RW_READER); 8028 ill = ILL_START_WALK_V4(&ctx); 8029 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8030 for (ipif = ill->ill_ipif; ipif; 8031 ipif = ipif->ipif_next) { 8032 if (zoneid != ipif->ipif_zoneid) 8033 continue; 8034 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8035 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8036 /* old behaviour */ 8037 rw_exit(&ill_g_lock); 8038 return (EINVAL); 8039 } else { 8040 goto if_copydone; 8041 } 8042 } 8043 (void) ipif_get_name(ipif, 8044 ifr->ifr_name, 8045 sizeof (ifr->ifr_name)); 8046 sin = (sin_t *)&ifr->ifr_addr; 8047 *sin = sin_null; 8048 sin->sin_family = AF_INET; 8049 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8050 ifr++; 8051 } 8052 } 8053 if_copydone: 8054 rw_exit(&ill_g_lock); 8055 mp1->b_wptr = (uchar_t *)ifr; 8056 8057 if (STRUCT_BUF(ifc) != NULL) { 8058 STRUCT_FSET(ifc, ifc_len, 8059 (int)((uchar_t *)ifr - mp1->b_rptr)); 8060 } 8061 return (0); 8062 } 8063 8064 /* 8065 * Get the 
interfaces using the address hosted on the interface passed in, 8066 * as a source adddress 8067 */ 8068 /* ARGSUSED */ 8069 int 8070 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8071 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8072 { 8073 mblk_t *mp1; 8074 ill_t *ill, *ill_head; 8075 ipif_t *ipif, *orig_ipif; 8076 int numlifs = 0; 8077 size_t lifs_bufsize, lifsmaxlen; 8078 struct lifreq *lifr; 8079 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8080 uint_t ifindex; 8081 zoneid_t zoneid; 8082 int err = 0; 8083 boolean_t isv6 = B_FALSE; 8084 struct sockaddr_in *sin; 8085 struct sockaddr_in6 *sin6; 8086 8087 STRUCT_HANDLE(lifsrcof, lifs); 8088 8089 ASSERT(q->q_next == NULL); 8090 8091 zoneid = Q_TO_CONN(q)->conn_zoneid; 8092 8093 /* Existence verified in ip_wput_nondata */ 8094 mp1 = mp->b_cont->b_cont; 8095 8096 /* 8097 * Must be (better be!) continuation of a TRANSPARENT 8098 * IOCTL. We just copied in the lifsrcof structure. 8099 */ 8100 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8101 (struct lifsrcof *)mp1->b_rptr); 8102 8103 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8104 return (EINVAL); 8105 8106 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8107 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8108 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8109 ip_process_ioctl, &err); 8110 if (ipif == NULL) { 8111 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8112 ifindex)); 8113 return (err); 8114 } 8115 8116 8117 /* Allocate a buffer to hold requested information */ 8118 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8119 lifs_bufsize = numlifs * sizeof (struct lifreq); 8120 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8121 /* The actual size needed is always returned in lifs_len */ 8122 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8123 8124 /* If the amount we need is more than what is passed in, abort */ 8125 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8126 ipif_refrele(ipif); 8127 return (0); 8128 } 8129 8130 mp1 = 
mi_copyout_alloc(q, mp, 8131 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8132 if (mp1 == NULL) { 8133 ipif_refrele(ipif); 8134 return (ENOMEM); 8135 } 8136 8137 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8138 bzero(mp1->b_rptr, lifs_bufsize); 8139 8140 lifr = (struct lifreq *)mp1->b_rptr; 8141 8142 ill = ill_head = ipif->ipif_ill; 8143 orig_ipif = ipif; 8144 8145 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8146 rw_enter(&ill_g_usesrc_lock, RW_READER); 8147 rw_enter(&ill_g_lock, RW_READER); 8148 8149 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8150 for (; (ill != NULL) && (ill != ill_head); 8151 ill = ill->ill_usesrc_grp_next) { 8152 8153 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8154 break; 8155 8156 ipif = ill->ill_ipif; 8157 (void) ipif_get_name(ipif, 8158 lifr->lifr_name, sizeof (lifr->lifr_name)); 8159 if (ipif->ipif_isv6) { 8160 sin6 = (sin6_t *)&lifr->lifr_addr; 8161 *sin6 = sin6_null; 8162 sin6->sin6_family = AF_INET6; 8163 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8164 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8165 &ipif->ipif_v6net_mask); 8166 } else { 8167 sin = (sin_t *)&lifr->lifr_addr; 8168 *sin = sin_null; 8169 sin->sin_family = AF_INET; 8170 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8171 lifr->lifr_addrlen = ip_mask_to_plen( 8172 ipif->ipif_net_mask); 8173 } 8174 lifr++; 8175 } 8176 rw_exit(&ill_g_usesrc_lock); 8177 rw_exit(&ill_g_lock); 8178 ipif_refrele(orig_ipif); 8179 mp1->b_wptr = (uchar_t *)lifr; 8180 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8181 8182 return (0); 8183 } 8184 8185 /* ARGSUSED */ 8186 int 8187 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8188 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8189 { 8190 mblk_t *mp1; 8191 int list; 8192 ill_t *ill; 8193 ipif_t *ipif; 8194 int flags; 8195 int numlifs = 0; 8196 size_t lifc_bufsize; 8197 struct lifreq *lifr; 8198 sa_family_t family; 8199 struct sockaddr_in *sin; 8200 struct sockaddr_in6 *sin6; 8201 
ill_walk_context_t ctx; 8202 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8203 int32_t lifclen; 8204 zoneid_t zoneid; 8205 STRUCT_HANDLE(lifconf, lifc); 8206 8207 ip1dbg(("ip_sioctl_get_lifconf")); 8208 8209 ASSERT(q->q_next == NULL); 8210 8211 zoneid = Q_TO_CONN(q)->conn_zoneid; 8212 8213 /* Existence verified in ip_wput_nondata */ 8214 mp1 = mp->b_cont->b_cont; 8215 8216 /* 8217 * An extended version of SIOCGIFCONF that takes an 8218 * additional address family and flags field. 8219 * AF_UNSPEC retrieve both IPv4 and IPv6. 8220 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8221 * interfaces are omitted. 8222 * Similarly, IPIF_TEMPORARY interfaces are omitted 8223 * unless LIFC_TEMPORARY is specified. 8224 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8225 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8226 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8227 * has priority over LIFC_NOXMIT. 8228 */ 8229 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8230 8231 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8232 return (EINVAL); 8233 8234 /* 8235 * Must be (better be!) continuation of a TRANSPARENT 8236 * IOCTL. We just copied in the lifconf structure. 8237 */ 8238 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8239 8240 family = STRUCT_FGET(lifc, lifc_family); 8241 flags = STRUCT_FGET(lifc, lifc_flags); 8242 8243 switch (family) { 8244 case AF_UNSPEC: 8245 /* 8246 * walk all ILL's. 8247 */ 8248 list = MAX_G_HEADS; 8249 break; 8250 case AF_INET: 8251 /* 8252 * walk only IPV4 ILL's. 8253 */ 8254 list = IP_V4_G_HEAD; 8255 break; 8256 case AF_INET6: 8257 /* 8258 * walk only IPV6 ILL's. 8259 */ 8260 list = IP_V6_G_HEAD; 8261 break; 8262 default: 8263 return (EAFNOSUPPORT); 8264 } 8265 8266 /* 8267 * Allocate a buffer to hold requested information. 8268 * 8269 * If lifc_len is larger than what is needed, we only 8270 * allocate what we will use. 
8271 * 8272 * If lifc_len is smaller than what is needed, return 8273 * EINVAL. 8274 */ 8275 numlifs = ip_get_numlifs(family, flags, zoneid); 8276 lifc_bufsize = numlifs * sizeof (struct lifreq); 8277 lifclen = STRUCT_FGET(lifc, lifc_len); 8278 if (lifc_bufsize > lifclen) { 8279 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8280 return (EINVAL); 8281 else 8282 lifc_bufsize = lifclen; 8283 } 8284 8285 mp1 = mi_copyout_alloc(q, mp, 8286 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8287 if (mp1 == NULL) 8288 return (ENOMEM); 8289 8290 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8291 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8292 8293 lifr = (struct lifreq *)mp1->b_rptr; 8294 8295 rw_enter(&ill_g_lock, RW_READER); 8296 ill = ill_first(list, list, &ctx); 8297 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8298 for (ipif = ill->ill_ipif; ipif != NULL; 8299 ipif = ipif->ipif_next) { 8300 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8301 !(flags & LIFC_NOXMIT)) 8302 continue; 8303 8304 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8305 !(flags & LIFC_TEMPORARY)) 8306 continue; 8307 8308 if (((ipif->ipif_flags & 8309 (IPIF_NOXMIT|IPIF_NOLOCAL| 8310 IPIF_DEPRECATED)) || 8311 (ill->ill_phyint->phyint_flags & 8312 PHYI_LOOPBACK) || 8313 !(ipif->ipif_flags & IPIF_UP)) && 8314 (flags & LIFC_EXTERNAL_SOURCE)) 8315 continue; 8316 8317 if (zoneid != ipif->ipif_zoneid && 8318 (zoneid != GLOBAL_ZONEID || 8319 !(flags & LIFC_ALLZONES))) 8320 continue; 8321 8322 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8323 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8324 rw_exit(&ill_g_lock); 8325 return (EINVAL); 8326 } else { 8327 goto lif_copydone; 8328 } 8329 } 8330 8331 (void) ipif_get_name(ipif, 8332 lifr->lifr_name, 8333 sizeof (lifr->lifr_name)); 8334 if (ipif->ipif_isv6) { 8335 sin6 = (sin6_t *)&lifr->lifr_addr; 8336 *sin6 = sin6_null; 8337 sin6->sin6_family = AF_INET6; 8338 sin6->sin6_addr = 8339 ipif->ipif_v6lcl_addr; 8340 lifr->lifr_addrlen = 8341 ip_mask_to_plen_v6( 8342 &ipif->ipif_v6net_mask); 
8343 } else { 8344 sin = (sin_t *)&lifr->lifr_addr; 8345 *sin = sin_null; 8346 sin->sin_family = AF_INET; 8347 sin->sin_addr.s_addr = 8348 ipif->ipif_lcl_addr; 8349 lifr->lifr_addrlen = 8350 ip_mask_to_plen( 8351 ipif->ipif_net_mask); 8352 } 8353 lifr++; 8354 } 8355 } 8356 lif_copydone: 8357 rw_exit(&ill_g_lock); 8358 8359 mp1->b_wptr = (uchar_t *)lifr; 8360 if (STRUCT_BUF(lifc) != NULL) { 8361 STRUCT_FSET(lifc, lifc_len, 8362 (int)((uchar_t *)lifr - mp1->b_rptr)); 8363 } 8364 return (0); 8365 } 8366 8367 /* ARGSUSED */ 8368 int 8369 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 8370 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8371 { 8372 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8373 ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 8374 return (0); 8375 } 8376 8377 static void 8378 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8379 { 8380 ip6_asp_t *table; 8381 size_t table_size; 8382 mblk_t *data_mp; 8383 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8384 8385 /* These two ioctls are I_STR only */ 8386 if (iocp->ioc_count == TRANSPARENT) { 8387 miocnak(q, mp, 0, EINVAL); 8388 return; 8389 } 8390 8391 data_mp = mp->b_cont; 8392 if (data_mp == NULL) { 8393 /* The user passed us a NULL argument */ 8394 table = NULL; 8395 table_size = iocp->ioc_count; 8396 } else { 8397 /* 8398 * The user provided a table. The stream head 8399 * may have copied in the user data in chunks, 8400 * so make sure everything is pulled up 8401 * properly. 
8402 */ 8403 if (MBLKL(data_mp) < iocp->ioc_count) { 8404 mblk_t *new_data_mp; 8405 if ((new_data_mp = msgpullup(data_mp, -1)) == 8406 NULL) { 8407 miocnak(q, mp, 0, ENOMEM); 8408 return; 8409 } 8410 freemsg(data_mp); 8411 data_mp = new_data_mp; 8412 mp->b_cont = data_mp; 8413 } 8414 table = (ip6_asp_t *)data_mp->b_rptr; 8415 table_size = iocp->ioc_count; 8416 } 8417 8418 switch (iocp->ioc_cmd) { 8419 case SIOCGIP6ADDRPOLICY: 8420 iocp->ioc_rval = ip6_asp_get(table, table_size); 8421 if (iocp->ioc_rval == -1) 8422 iocp->ioc_error = EINVAL; 8423 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8424 else if (table != NULL && 8425 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8426 ip6_asp_t *src = table; 8427 ip6_asp32_t *dst = (void *)table; 8428 int count = table_size / sizeof (ip6_asp_t); 8429 int i; 8430 8431 /* 8432 * We need to do an in-place shrink of the array 8433 * to match the alignment attributes of the 8434 * 32-bit ABI looking at it. 8435 */ 8436 /* LINTED: logical expression always true: op "||" */ 8437 ASSERT(sizeof (*src) > sizeof (*dst)); 8438 for (i = 1; i < count; i++) 8439 bcopy(src + i, dst + i, sizeof (*dst)); 8440 } 8441 #endif 8442 break; 8443 8444 case SIOCSIP6ADDRPOLICY: 8445 ASSERT(mp->b_prev == NULL); 8446 mp->b_prev = (void *)q; 8447 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8448 /* 8449 * We pass in the datamodel here so that the ip6_asp_replace() 8450 * routine can handle converting from 32-bit to native formats 8451 * where necessary. 8452 * 8453 * A better way to handle this might be to convert the inbound 8454 * data structure here, and hang it off a new 'mp'; thus the 8455 * ip6_asp_replace() logic would always be dealing with native 8456 * format data structures.. 8457 * 8458 * (An even simpler way to handle these ioctls is to just 8459 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8460 * and just recompile everything that depends on it.) 
8461 */ 8462 #endif 8463 ip6_asp_replace(mp, table, table_size, B_FALSE, 8464 iocp->ioc_flag & IOC_MODELS); 8465 return; 8466 } 8467 8468 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 8469 qreply(q, mp); 8470 } 8471 8472 static void 8473 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8474 { 8475 mblk_t *data_mp; 8476 struct dstinforeq *dir; 8477 uint8_t *end, *cur; 8478 in6_addr_t *daddr, *saddr; 8479 ipaddr_t v4daddr; 8480 ire_t *ire; 8481 char *slabel, *dlabel; 8482 boolean_t isipv4; 8483 int match_ire; 8484 ill_t *dst_ill; 8485 ipif_t *src_ipif, *ire_ipif; 8486 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8487 zoneid_t zoneid; 8488 8489 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8490 zoneid = Q_TO_CONN(q)->conn_zoneid; 8491 8492 /* 8493 * This ioctl is I_STR only, and must have a 8494 * data mblk following the M_IOCTL mblk. 8495 */ 8496 data_mp = mp->b_cont; 8497 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8498 miocnak(q, mp, 0, EINVAL); 8499 return; 8500 } 8501 8502 if (MBLKL(data_mp) < iocp->ioc_count) { 8503 mblk_t *new_data_mp; 8504 8505 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8506 miocnak(q, mp, 0, ENOMEM); 8507 return; 8508 } 8509 freemsg(data_mp); 8510 data_mp = new_data_mp; 8511 mp->b_cont = data_mp; 8512 } 8513 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8514 8515 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8516 end - cur >= sizeof (struct dstinforeq); 8517 cur += sizeof (struct dstinforeq)) { 8518 dir = (struct dstinforeq *)cur; 8519 daddr = &dir->dir_daddr; 8520 saddr = &dir->dir_saddr; 8521 8522 /* 8523 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8524 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8525 * and ipif_select_source[_v6]() do not. 
8526 */ 8527 dir->dir_dscope = ip_addr_scope_v6(daddr); 8528 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence); 8529 8530 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 8531 if (isipv4) { 8532 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 8533 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 8534 0, NULL, NULL, zoneid, 0, match_ire); 8535 } else { 8536 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 8537 0, NULL, NULL, zoneid, 0, match_ire); 8538 } 8539 if (ire == NULL) { 8540 dir->dir_dreachable = 0; 8541 8542 /* move on to next dst addr */ 8543 continue; 8544 } 8545 dir->dir_dreachable = 1; 8546 8547 ire_ipif = ire->ire_ipif; 8548 if (ire_ipif == NULL) 8549 goto next_dst; 8550 8551 /* 8552 * We expect to get back an interface ire or a 8553 * gateway ire cache entry. For both types, the 8554 * output interface is ire_ipif->ipif_ill. 8555 */ 8556 dst_ill = ire_ipif->ipif_ill; 8557 dir->dir_dmactype = dst_ill->ill_mactype; 8558 8559 if (isipv4) { 8560 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 8561 } else { 8562 src_ipif = ipif_select_source_v6(dst_ill, 8563 daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 8564 zoneid); 8565 } 8566 if (src_ipif == NULL) 8567 goto next_dst; 8568 8569 *saddr = src_ipif->ipif_v6lcl_addr; 8570 dir->dir_sscope = ip_addr_scope_v6(saddr); 8571 slabel = ip6_asp_lookup(saddr, NULL); 8572 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 8573 dir->dir_sdeprecated = 8574 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 8575 ipif_refrele(src_ipif); 8576 next_dst: 8577 ire_refrele(ire); 8578 } 8579 miocack(q, mp, iocp->ioc_count, 0); 8580 } 8581 8582 8583 /* 8584 * Check if this is an address assigned to this machine. 8585 * Skips interfaces that are down by using ire checks. 8586 * Translates mapped addresses to v4 addresses and then 8587 * treats them as such, returning true if the v4 address 8588 * associated with this mapped address is configured. 
8589 * Note: Applications will have to be careful what they do 8590 * with the response; use of mapped addresses limits 8591 * what can be done with the socket, especially with 8592 * respect to socket options and ioctls - neither IPv4 8593 * options nor IPv6 sticky options/ancillary data options 8594 * may be used. 8595 */ 8596 /* ARGSUSED */ 8597 int 8598 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8599 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8600 { 8601 struct sioc_addrreq *sia; 8602 sin_t *sin; 8603 ire_t *ire; 8604 mblk_t *mp1; 8605 zoneid_t zoneid; 8606 8607 ip1dbg(("ip_sioctl_tmyaddr")); 8608 8609 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8610 zoneid = Q_TO_CONN(q)->conn_zoneid; 8611 8612 /* Existence verified in ip_wput_nondata */ 8613 mp1 = mp->b_cont->b_cont; 8614 sia = (struct sioc_addrreq *)mp1->b_rptr; 8615 sin = (sin_t *)&sia->sa_addr; 8616 switch (sin->sin_family) { 8617 case AF_INET6: { 8618 sin6_t *sin6 = (sin6_t *)sin; 8619 8620 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8621 ipaddr_t v4_addr; 8622 8623 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8624 v4_addr); 8625 ire = ire_ctable_lookup(v4_addr, 0, 8626 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8627 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8628 } else { 8629 in6_addr_t v6addr; 8630 8631 v6addr = sin6->sin6_addr; 8632 ire = ire_ctable_lookup_v6(&v6addr, 0, 8633 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8634 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8635 } 8636 break; 8637 } 8638 case AF_INET: { 8639 ipaddr_t v4addr; 8640 8641 v4addr = sin->sin_addr.s_addr; 8642 ire = ire_ctable_lookup(v4addr, 0, 8643 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8644 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 8645 break; 8646 } 8647 default: 8648 return (EAFNOSUPPORT); 8649 } 8650 if (ire != NULL) { 8651 sia->sa_res = 1; 8652 ire_refrele(ire); 8653 } else { 8654 sia->sa_res = 0; 8655 } 8656 return (0); 8657 } 8658 8659 /* 8660 * Check if this is an address assigned on-link 
i.e. neighbor, 8661 * and makes sure it's reachable from the current zone. 8662 * Returns true for my addresses as well. 8663 * Translates mapped addresses to v4 addresses and then 8664 * treats them as such, returning true if the v4 address 8665 * associated with this mapped address is configured. 8666 * Note: Applications will have to be careful what they do 8667 * with the response; use of mapped addresses limits 8668 * what can be done with the socket, especially with 8669 * respect to socket options and ioctls - neither IPv4 8670 * options nor IPv6 sticky options/ancillary data options 8671 * may be used. 8672 */ 8673 /* ARGSUSED */ 8674 int 8675 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8676 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq) 8677 { 8678 struct sioc_addrreq *sia; 8679 sin_t *sin; 8680 mblk_t *mp1; 8681 ire_t *ire = NULL; 8682 zoneid_t zoneid; 8683 8684 ip1dbg(("ip_sioctl_tonlink")); 8685 8686 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8687 zoneid = Q_TO_CONN(q)->conn_zoneid; 8688 8689 /* Existence verified in ip_wput_nondata */ 8690 mp1 = mp->b_cont->b_cont; 8691 sia = (struct sioc_addrreq *)mp1->b_rptr; 8692 sin = (sin_t *)&sia->sa_addr; 8693 8694 /* 8695 * Match addresses with a zero gateway field to avoid 8696 * routes going through a router. 8697 * Exclude broadcast and multicast addresses. 
8698 */ 8699 switch (sin->sin_family) { 8700 case AF_INET6: { 8701 sin6_t *sin6 = (sin6_t *)sin; 8702 8703 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8704 ipaddr_t v4_addr; 8705 8706 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8707 v4_addr); 8708 if (!CLASSD(v4_addr)) { 8709 ire = ire_route_lookup(v4_addr, 0, 0, 0, 8710 NULL, NULL, zoneid, MATCH_IRE_GW); 8711 } 8712 } else { 8713 in6_addr_t v6addr; 8714 in6_addr_t v6gw; 8715 8716 v6addr = sin6->sin6_addr; 8717 v6gw = ipv6_all_zeros; 8718 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 8719 ire = ire_route_lookup_v6(&v6addr, 0, 8720 &v6gw, 0, NULL, NULL, zoneid, 8721 MATCH_IRE_GW); 8722 } 8723 } 8724 break; 8725 } 8726 case AF_INET: { 8727 ipaddr_t v4addr; 8728 8729 v4addr = sin->sin_addr.s_addr; 8730 if (!CLASSD(v4addr)) { 8731 ire = ire_route_lookup(v4addr, 0, 0, 0, 8732 NULL, NULL, zoneid, MATCH_IRE_GW); 8733 } 8734 break; 8735 } 8736 default: 8737 return (EAFNOSUPPORT); 8738 } 8739 sia->sa_res = 0; 8740 if (ire != NULL) { 8741 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 8742 IRE_LOCAL|IRE_LOOPBACK)) { 8743 sia->sa_res = 1; 8744 } 8745 ire_refrele(ire); 8746 } 8747 return (0); 8748 } 8749 8750 /* 8751 * TBD: implement when kernel maintaines a list of site prefixes. 8752 */ 8753 /* ARGSUSED */ 8754 int 8755 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8756 ip_ioctl_cmd_t *ipip, void *ifreq) 8757 { 8758 return (ENXIO); 8759 } 8760 8761 /* ARGSUSED */ 8762 int 8763 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8764 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8765 { 8766 ill_t *ill; 8767 mblk_t *mp1; 8768 conn_t *connp; 8769 boolean_t success; 8770 8771 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 8772 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 8773 /* ioctl comes down on an conn */ 8774 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8775 connp = Q_TO_CONN(q); 8776 8777 mp->b_datap->db_type = M_IOCTL; 8778 8779 /* 8780 * Send down a copy. 
(copymsg does not copy b_next/b_prev). 8781 * The original mp contains contaminated b_next values due to 'mi', 8782 * which is needed to do the mi_copy_done. Unfortunately if we 8783 * send down the original mblk itself and if we are popped due to an 8784 * an unplumb before the response comes back from tunnel, 8785 * the streamhead (which does a freemsg) will see this contaminated 8786 * message and the assertion in freemsg about non-null b_next/b_prev 8787 * will panic a DEBUG kernel. 8788 */ 8789 mp1 = copymsg(mp); 8790 if (mp1 == NULL) 8791 return (ENOMEM); 8792 8793 ill = ipif->ipif_ill; 8794 mutex_enter(&connp->conn_lock); 8795 mutex_enter(&ill->ill_lock); 8796 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 8797 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 8798 mp, 0); 8799 } else { 8800 success = ill_pending_mp_add(ill, connp, mp); 8801 } 8802 mutex_exit(&ill->ill_lock); 8803 mutex_exit(&connp->conn_lock); 8804 8805 if (success) { 8806 ip1dbg(("sending down tunparam request ")); 8807 putnext(ill->ill_wq, mp1); 8808 return (EINPROGRESS); 8809 } else { 8810 /* The conn has started closing */ 8811 freemsg(mp1); 8812 return (EINTR); 8813 } 8814 } 8815 8816 static int 8817 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, 8818 boolean_t x_arp_ioctl, boolean_t if_arp_ioctl) 8819 { 8820 mblk_t *mp1; 8821 mblk_t *mp2; 8822 mblk_t *pending_mp; 8823 ipaddr_t ipaddr; 8824 area_t *area; 8825 struct iocblk *iocp; 8826 conn_t *connp; 8827 struct arpreq *ar; 8828 struct xarpreq *xar; 8829 boolean_t success; 8830 int flags, alength; 8831 char *lladdr; 8832 8833 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 8834 connp = Q_TO_CONN(q); 8835 8836 iocp = (struct iocblk *)mp->b_rptr; 8837 /* 8838 * ill has already been set depending on whether 8839 * bsd style or interface style ioctl. 8840 */ 8841 ASSERT(ill != NULL); 8842 8843 /* 8844 * Is this one of the new SIOC*XARP ioctls? 
8845 */ 8846 if (x_arp_ioctl) { 8847 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 8848 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 8849 ar = NULL; 8850 8851 flags = xar->xarp_flags; 8852 lladdr = LLADDR(&xar->xarp_ha); 8853 /* 8854 * Validate against user's link layer address length 8855 * input and name and addr length limits. 8856 */ 8857 alength = ill->ill_phys_addr_length; 8858 if (iocp->ioc_cmd == SIOCSXARP) { 8859 if (alength != xar->xarp_ha.sdl_alen || 8860 (alength + xar->xarp_ha.sdl_nlen > 8861 sizeof (xar->xarp_ha.sdl_data))) 8862 return (EINVAL); 8863 } 8864 } else { 8865 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 8866 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 8867 xar = NULL; 8868 8869 flags = ar->arp_flags; 8870 lladdr = ar->arp_ha.sa_data; 8871 /* 8872 * Theoretically, the sa_family could tell us what link 8873 * layer type this operation is trying to deal with. By 8874 * common usage AF_UNSPEC means ethernet. We'll assume 8875 * any attempt to use the SIOC?ARP ioctls is for ethernet, 8876 * for now. Our new SIOC*XARP ioctls can be used more 8877 * generally. 8878 * 8879 * If the underlying media happens to have a non 6 byte 8880 * address, arp module will fail set/get, but the del 8881 * operation will succeed. 8882 */ 8883 alength = 6; 8884 if ((iocp->ioc_cmd != SIOCDARP) && 8885 (alength != ill->ill_phys_addr_length)) { 8886 return (EINVAL); 8887 } 8888 } 8889 8890 /* 8891 * We are going to pass up to ARP a packet chain that looks 8892 * like: 8893 * 8894 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 8895 * 8896 * Get a copy of the original IOCTL mblk to head the chain, 8897 * to be sent up (in mp1). Also get another copy to store 8898 * in the ill_pending_mp list, for matching the response 8899 * when it comes back from ARP. 
8900 */ 8901 mp1 = copyb(mp); 8902 pending_mp = copymsg(mp); 8903 if (mp1 == NULL || pending_mp == NULL) { 8904 if (mp1 != NULL) 8905 freeb(mp1); 8906 if (pending_mp != NULL) 8907 inet_freemsg(pending_mp); 8908 return (ENOMEM); 8909 } 8910 8911 ipaddr = sin->sin_addr.s_addr; 8912 8913 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 8914 (caddr_t)&ipaddr); 8915 if (mp2 == NULL) { 8916 freeb(mp1); 8917 inet_freemsg(pending_mp); 8918 return (ENOMEM); 8919 } 8920 /* Put together the chain. */ 8921 mp1->b_cont = mp2; 8922 mp1->b_datap->db_type = M_IOCTL; 8923 mp2->b_cont = mp; 8924 mp2->b_datap->db_type = M_DATA; 8925 8926 iocp = (struct iocblk *)mp1->b_rptr; 8927 8928 /* 8929 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 8930 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 8931 * cp_private field (or cp_rval on 32-bit systems) in place of the 8932 * ioc_count field; set ioc_count to be correct. 8933 */ 8934 iocp->ioc_count = MBLKL(mp1->b_cont); 8935 8936 /* 8937 * Set the proper command in the ARP message. 8938 * Convert the SIOC{G|S|D}ARP calls into our 8939 * AR_ENTRY_xxx calls. 8940 */ 8941 area = (area_t *)mp2->b_rptr; 8942 switch (iocp->ioc_cmd) { 8943 case SIOCDARP: 8944 case SIOCDXARP: 8945 /* 8946 * We defer deleting the corresponding IRE until 8947 * we return from arp. 8948 */ 8949 area->area_cmd = AR_ENTRY_DELETE; 8950 area->area_proto_mask_offset = 0; 8951 break; 8952 case SIOCGARP: 8953 case SIOCGXARP: 8954 area->area_cmd = AR_ENTRY_SQUERY; 8955 area->area_proto_mask_offset = 0; 8956 break; 8957 case SIOCSARP: 8958 case SIOCSXARP: { 8959 /* 8960 * Delete the corresponding ire to make sure IP will 8961 * pick up any change from arp. 
8962 */ 8963 if (!if_arp_ioctl) { 8964 (void) ip_ire_clookup_and_delete(ipaddr, NULL); 8965 break; 8966 } else { 8967 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 8968 if (ipif != NULL) { 8969 (void) ip_ire_clookup_and_delete(ipaddr, ipif); 8970 ipif_refrele(ipif); 8971 } 8972 break; 8973 } 8974 } 8975 } 8976 iocp->ioc_cmd = area->area_cmd; 8977 8978 /* 8979 * Before sending 'mp' to ARP, we have to clear the b_next 8980 * and b_prev. Otherwise if STREAMS encounters such a message 8981 * in freemsg(), (because ARP can close any time) it can cause 8982 * a panic. But mi code needs the b_next and b_prev values of 8983 * mp->b_cont, to complete the ioctl. So we store it here 8984 * in pending_mp->bcont, and restore it in ip_sioctl_iocack() 8985 * when the response comes down from ARP. 8986 */ 8987 pending_mp->b_cont->b_next = mp->b_cont->b_next; 8988 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 8989 mp->b_cont->b_next = NULL; 8990 mp->b_cont->b_prev = NULL; 8991 8992 mutex_enter(&connp->conn_lock); 8993 mutex_enter(&ill->ill_lock); 8994 /* conn has not yet started closing, hence this can't fail */ 8995 success = ill_pending_mp_add(ill, connp, pending_mp); 8996 ASSERT(success); 8997 mutex_exit(&ill->ill_lock); 8998 mutex_exit(&connp->conn_lock); 8999 9000 /* 9001 * Fill in the rest of the ARP operation fields. 9002 */ 9003 area->area_hw_addr_length = alength; 9004 bcopy(lladdr, 9005 (char *)area + area->area_hw_addr_offset, 9006 area->area_hw_addr_length); 9007 /* Translate the flags. */ 9008 if (flags & ATF_PERM) 9009 area->area_flags |= ACE_F_PERMANENT; 9010 if (flags & ATF_PUBL) 9011 area->area_flags |= ACE_F_PUBLISH; 9012 9013 /* 9014 * Up to ARP it goes. The response will come 9015 * back in ip_wput as an M_IOCACK message, and 9016 * will be handed to ip_sioctl_iocack for 9017 * completion. 
9018 */ 9019 putnext(ill->ill_rq, mp1); 9020 return (EINPROGRESS); 9021 } 9022 9023 /* ARGSUSED */ 9024 int 9025 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9026 ip_ioctl_cmd_t *ipip, void *ifreq) 9027 { 9028 struct xarpreq *xar; 9029 boolean_t isv6; 9030 mblk_t *mp1; 9031 int err; 9032 conn_t *connp; 9033 int ifnamelen; 9034 ire_t *ire = NULL; 9035 ill_t *ill = NULL; 9036 struct sockaddr_in *sin; 9037 boolean_t if_arp_ioctl = B_FALSE; 9038 9039 /* ioctl comes down on an conn */ 9040 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9041 connp = Q_TO_CONN(q); 9042 isv6 = connp->conn_af_isv6; 9043 9044 /* Existance verified in ip_wput_nondata */ 9045 mp1 = mp->b_cont->b_cont; 9046 9047 ASSERT(MBLKL(mp1) >= sizeof (*xar)); 9048 xar = (struct xarpreq *)mp1->b_rptr; 9049 sin = (sin_t *)&xar->xarp_pa; 9050 9051 if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) || 9052 (xar->xarp_pa.ss_family != AF_INET)) 9053 return (ENXIO); 9054 9055 ifnamelen = xar->xarp_ha.sdl_nlen; 9056 if (ifnamelen != 0) { 9057 char *cptr, cval; 9058 9059 if (ifnamelen >= LIFNAMSIZ) 9060 return (EINVAL); 9061 9062 /* 9063 * Instead of bcopying a bunch of bytes, 9064 * null-terminate the string in-situ. 9065 */ 9066 cptr = xar->xarp_ha.sdl_data + ifnamelen; 9067 cval = *cptr; 9068 *cptr = '\0'; 9069 ill = ill_lookup_on_name(xar->xarp_ha.sdl_data, 9070 B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl, 9071 &err, NULL); 9072 *cptr = cval; 9073 if (ill == NULL) 9074 return (err); 9075 if (ill->ill_net_type != IRE_IF_RESOLVER) { 9076 ill_refrele(ill); 9077 return (ENXIO); 9078 } 9079 9080 if_arp_ioctl = B_TRUE; 9081 } else { 9082 /* 9083 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves 9084 * as an extended BSD ioctl. The kernel uses the IP address 9085 * to figure out the network interface. 
9086 */ 9087 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES); 9088 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9089 ((ill = ire_to_ill(ire)) == NULL)) { 9090 if (ire != NULL) 9091 ire_refrele(ire); 9092 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9093 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9094 MATCH_IRE_TYPE); 9095 if ((ire == NULL) || 9096 ((ill = ire_to_ill(ire)) == NULL)) { 9097 if (ire != NULL) 9098 ire_refrele(ire); 9099 return (ENXIO); 9100 } 9101 } 9102 ASSERT(ire != NULL && ill != NULL); 9103 } 9104 9105 err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl); 9106 if (if_arp_ioctl) 9107 ill_refrele(ill); 9108 if (ire != NULL) 9109 ire_refrele(ire); 9110 9111 return (err); 9112 } 9113 9114 /* 9115 * ARP IOCTLs. 9116 * How does IP get in the business of fronting ARP configuration/queries? 9117 * Well its like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9118 * are by tradition passed in through a datagram socket. That lands in IP. 9119 * As it happens, this is just as well since the interface is quite crude in 9120 * that it passes in no information about protocol or hardware types, or 9121 * interface association. After making the protocol assumption, IP is in 9122 * the position to look up the name of the ILL, which ARP will need, and 9123 * format a request that can be handled by ARP. The request is passed up 9124 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9125 * back a response. ARP supports its own set of more general IOCTLs, in 9126 * case anyone is interested. 
9127 */ 9128 /* ARGSUSED */ 9129 int 9130 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9131 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9132 { 9133 struct arpreq *ar; 9134 struct sockaddr_in *sin; 9135 ire_t *ire; 9136 boolean_t isv6; 9137 mblk_t *mp1; 9138 int err; 9139 conn_t *connp; 9140 ill_t *ill; 9141 9142 /* ioctl comes down on an conn */ 9143 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9144 connp = Q_TO_CONN(q); 9145 isv6 = connp->conn_af_isv6; 9146 if (isv6) 9147 return (ENXIO); 9148 9149 /* Existance verified in ip_wput_nondata */ 9150 mp1 = mp->b_cont->b_cont; 9151 9152 ar = (struct arpreq *)mp1->b_rptr; 9153 sin = (sin_t *)&ar->arp_pa; 9154 9155 /* 9156 * We need to let ARP know on which interface the IP 9157 * address has an ARP mapping. In the IPMP case, a 9158 * simple forwarding table lookup will return the 9159 * IRE_IF_RESOLVER for the first interface in the group, 9160 * which might not be the interface on which the 9161 * requested IP address was resolved due to the ill 9162 * selection algorithm (see ip_newroute_get_dst_ill()). 9163 * So we do a cache table lookup first: if the IRE cache 9164 * entry for the IP address is still there, it will 9165 * contain the ill pointer for the right interface, so 9166 * we use that. If the cache entry has been flushed, we 9167 * fall back to the forwarding table lookup. This should 9168 * be rare enough since IRE cache entries have a longer 9169 * life expectancy than ARP cache entries. 
9170 */ 9171 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES); 9172 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9173 ((ill = ire_to_ill(ire)) == NULL)) { 9174 if (ire != NULL) 9175 ire_refrele(ire); 9176 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9177 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9178 MATCH_IRE_TYPE); 9179 if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) { 9180 if (ire != NULL) 9181 ire_refrele(ire); 9182 return (ENXIO); 9183 } 9184 } 9185 ASSERT(ire != NULL && ill != NULL); 9186 9187 err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE); 9188 ire_refrele(ire); 9189 return (err); 9190 } 9191 9192 /* 9193 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9194 * atomically set/clear the muxids. Also complete the ioctl by acking or 9195 * naking it. Note that the code is structured such that the link type, 9196 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9197 * its clones use the persistent link, while pppd(1M) and perhaps many 9198 * other daemons may use non-persistent link. When combined with some 9199 * ill_t states, linking and unlinking lower streams may be used as 9200 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9201 */ 9202 /* ARGSUSED */ 9203 void 9204 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9205 { 9206 mblk_t *mp1; 9207 mblk_t *mp2; 9208 struct linkblk *li; 9209 queue_t *ipwq; 9210 char *name; 9211 struct qinit *qinfo; 9212 struct ipmx_s *ipmxp; 9213 ill_t *ill = NULL; 9214 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9215 int err = 0; 9216 boolean_t entered_ipsq = B_FALSE; 9217 boolean_t islink; 9218 queue_t *dwq = NULL; 9219 9220 ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK || 9221 iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK); 9222 9223 islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ? 
9224 B_TRUE : B_FALSE; 9225 9226 mp1 = mp->b_cont; /* This is the linkblk info */ 9227 li = (struct linkblk *)mp1->b_rptr; 9228 9229 /* 9230 * ARP has added this special mblk, and the utility is asking us 9231 * to perform consistency checks, and also atomically set the 9232 * muxid. Ifconfig is an example. It achieves this by using 9233 * /dev/arp as the mux to plink the arp stream, and pushes arp on 9234 * to /dev/udp[6] stream for use as the mux when plinking the IP 9235 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9236 * and other comments in this routine for more details. 9237 */ 9238 mp2 = mp1->b_cont; /* This is added by ARP */ 9239 9240 /* 9241 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9242 * ifconfig which didn't push ARP on top of the dummy mux, we won't 9243 * get the special mblk above. For backward compatibility, we just 9244 * return success. The utility will use SIOCSLIFMUXID to store 9245 * the muxids. This is not atomic, and can leave the streams 9246 * unplumbable if the utility is interrrupted, before it does the 9247 * SIOCSLIFMUXID. 9248 */ 9249 if (mp2 == NULL) { 9250 /* 9251 * At this point we don't know whether or not this is the 9252 * IP module stream or the ARP device stream. We need to 9253 * walk the lower stream in order to find this out, since 9254 * the capability negotiation is done only on the IP module 9255 * stream. IP module instance is identified by the module 9256 * name IP, non-null q_next, and it's wput not being ip_lwput. 9257 * STREAMS ensures that the lower stream (l_qbot) will not 9258 * vanish until this ioctl completes. So we can safely walk 9259 * the stream or refer to the q_ptr. 
9260 */ 9261 ipwq = li->l_qbot; 9262 while (ipwq != NULL) { 9263 qinfo = ipwq->q_qinfo; 9264 name = qinfo->qi_minfo->mi_idname; 9265 if (name != NULL && name[0] != NULL && 9266 (strcmp(name, ip_mod_info.mi_idname) == 0) && 9267 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 9268 (ipwq->q_next != NULL)) { 9269 break; 9270 } 9271 ipwq = ipwq->q_next; 9272 } 9273 /* 9274 * This looks like an IP module stream, so trigger 9275 * the capability reset or re-negotiation if necessary. 9276 */ 9277 if (ipwq != NULL) { 9278 ill = ipwq->q_ptr; 9279 ASSERT(ill != NULL); 9280 9281 if (ipsq == NULL) { 9282 ipsq = ipsq_try_enter(NULL, ill, q, mp, 9283 ip_sioctl_plink, NEW_OP, B_TRUE); 9284 if (ipsq == NULL) 9285 return; 9286 entered_ipsq = B_TRUE; 9287 } 9288 ASSERT(IAM_WRITER_ILL(ill)); 9289 /* 9290 * Store the upper read queue of the module 9291 * immediately below IP, and count the total 9292 * number of lower modules. Do this only 9293 * for I_PLINK or I_LINK event. 9294 */ 9295 ill->ill_lmod_rq = NULL; 9296 ill->ill_lmod_cnt = 0; 9297 if (islink && (dwq = ipwq->q_next) != NULL) { 9298 ill->ill_lmod_rq = RD(dwq); 9299 9300 while (dwq != NULL) { 9301 ill->ill_lmod_cnt++; 9302 dwq = dwq->q_next; 9303 } 9304 } 9305 /* 9306 * There's no point in resetting or re-negotiating if 9307 * we are not bound to the driver, so only do this if 9308 * the DLPI state is idle (up); we assume such state 9309 * since ill_ipif_up_count gets incremented in 9310 * ipif_up_done(), which is after we are bound to the 9311 * driver. Note that in the case of logical 9312 * interfaces, IP won't rebind to the driver unless 9313 * the ill_ipif_up_count is 0, meaning that all other 9314 * IP interfaces (including the main ipif) are in the 9315 * down state. Because of this, we use such counter 9316 * as an indicator, instead of relying on the IPIF_UP 9317 * flag, which is per ipif instance. 
9318 */ 9319 if (ill->ill_ipif_up_count > 0) { 9320 if (islink) 9321 ill_capability_probe(ill); 9322 else 9323 ill_capability_reset(ill); 9324 } 9325 } 9326 goto done; 9327 } 9328 9329 /* 9330 * This is an I_{P}LINK sent down by ifconfig on 9331 * /dev/arp. ARP has appended this last (3rd) mblk, 9332 * giving more info. STREAMS ensures that the lower 9333 * stream (l_qbot) will not vanish until this ioctl 9334 * completes. So we can safely walk the stream or refer 9335 * to the q_ptr. 9336 */ 9337 ipmxp = (struct ipmx_s *)mp2->b_rptr; 9338 if (ipmxp->ipmx_arpdev_stream) { 9339 /* 9340 * The operation is occuring on the arp-device 9341 * stream. 9342 */ 9343 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 9344 q, mp, ip_sioctl_plink, &err, NULL); 9345 if (ill == NULL) { 9346 if (err == EINPROGRESS) { 9347 return; 9348 } else { 9349 err = EINVAL; 9350 goto done; 9351 } 9352 } 9353 9354 if (ipsq == NULL) { 9355 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9356 NEW_OP, B_TRUE); 9357 if (ipsq == NULL) { 9358 ill_refrele(ill); 9359 return; 9360 } 9361 entered_ipsq = B_TRUE; 9362 } 9363 ASSERT(IAM_WRITER_ILL(ill)); 9364 ill_refrele(ill); 9365 /* 9366 * To ensure consistency between IP and ARP, 9367 * the following LIFO scheme is used in 9368 * plink/punlink. (IP first, ARP last). 9369 * This is because the muxid's are stored 9370 * in the IP stream on the ill. 9371 * 9372 * I_{P}LINK: ifconfig plinks the IP stream before 9373 * plinking the ARP stream. On an arp-dev 9374 * stream, IP checks that it is not yet 9375 * plinked, and it also checks that the 9376 * corresponding IP stream is already plinked. 9377 * 9378 * I_{P}UNLINK: ifconfig punlinks the ARP stream 9379 * before punlinking the IP stream. IP does 9380 * not allow punlink of the IP stream unless 9381 * the arp stream has been punlinked. 
9382 * 9383 */ 9384 if ((islink && 9385 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 9386 (!islink && 9387 ill->ill_arp_muxid != li->l_index)) { 9388 err = EINVAL; 9389 goto done; 9390 } 9391 if (islink) { 9392 ill->ill_arp_muxid = li->l_index; 9393 } else { 9394 ill->ill_arp_muxid = 0; 9395 } 9396 } else { 9397 /* 9398 * This must be the IP module stream with or 9399 * without arp. Walk the stream and locate the 9400 * IP module. An IP module instance is 9401 * identified by the module name IP, non-null 9402 * q_next, and it's wput not being ip_lwput. 9403 */ 9404 ipwq = li->l_qbot; 9405 while (ipwq != NULL) { 9406 qinfo = ipwq->q_qinfo; 9407 name = qinfo->qi_minfo->mi_idname; 9408 if (name != NULL && name[0] != NULL && 9409 (strcmp(name, ip_mod_info.mi_idname) == 0) && 9410 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 9411 (ipwq->q_next != NULL)) { 9412 break; 9413 } 9414 ipwq = ipwq->q_next; 9415 } 9416 if (ipwq != NULL) { 9417 ill = ipwq->q_ptr; 9418 ASSERT(ill != NULL); 9419 9420 if (ipsq == NULL) { 9421 ipsq = ipsq_try_enter(NULL, ill, q, mp, 9422 ip_sioctl_plink, NEW_OP, B_TRUE); 9423 if (ipsq == NULL) 9424 return; 9425 entered_ipsq = B_TRUE; 9426 } 9427 ASSERT(IAM_WRITER_ILL(ill)); 9428 /* 9429 * Return error if the ip_mux_id is 9430 * non-zero and command is I_{P}LINK. 9431 * If command is I_{P}UNLINK, return 9432 * error if the arp-devstr is not 9433 * yet punlinked. 9434 */ 9435 if ((islink && ill->ill_ip_muxid != 0) || 9436 (!islink && ill->ill_arp_muxid != 0)) { 9437 err = EINVAL; 9438 goto done; 9439 } 9440 ill->ill_lmod_rq = NULL; 9441 ill->ill_lmod_cnt = 0; 9442 if (islink) { 9443 /* 9444 * Store the upper read queue of the module 9445 * immediately below IP, and count the total 9446 * number of lower modules. 
9447 */ 9448 if ((dwq = ipwq->q_next) != NULL) { 9449 ill->ill_lmod_rq = RD(dwq); 9450 9451 while (dwq != NULL) { 9452 ill->ill_lmod_cnt++; 9453 dwq = dwq->q_next; 9454 } 9455 } 9456 ill->ill_ip_muxid = li->l_index; 9457 } else { 9458 ill->ill_ip_muxid = 0; 9459 } 9460 9461 /* 9462 * See comments above about resetting/re- 9463 * negotiating driver sub-capabilities. 9464 */ 9465 if (ill->ill_ipif_up_count > 0) { 9466 if (islink) 9467 ill_capability_probe(ill); 9468 else 9469 ill_capability_reset(ill); 9470 } 9471 } 9472 } 9473 done: 9474 iocp->ioc_count = 0; 9475 iocp->ioc_error = err; 9476 if (err == 0) 9477 mp->b_datap->db_type = M_IOCACK; 9478 else 9479 mp->b_datap->db_type = M_IOCNAK; 9480 qreply(q, mp); 9481 9482 /* Conn was refheld in ip_sioctl_copyin_setup */ 9483 if (CONN_Q(q)) 9484 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 9485 if (entered_ipsq) 9486 ipsq_exit(ipsq, B_TRUE, B_TRUE); 9487 } 9488 9489 /* 9490 * Search the ioctl command in the ioctl tables and return a pointer 9491 * to the ioctl command information. The ioctl command tables are 9492 * static and fully populated at compile time. 9493 */ 9494 ip_ioctl_cmd_t * 9495 ip_sioctl_lookup(int ioc_cmd) 9496 { 9497 int index; 9498 ip_ioctl_cmd_t *ipip; 9499 ip_ioctl_cmd_t *ipip_end; 9500 9501 if (ioc_cmd == IPI_DONTCARE) 9502 return (NULL); 9503 9504 /* 9505 * Do a 2 step search. First search the indexed table 9506 * based on the least significant byte of the ioctl cmd. 9507 * If we don't find a match, then search the misc table 9508 * serially. 
9509 */ 9510 index = ioc_cmd & 0xFF; 9511 if (index < ip_ndx_ioctl_count) { 9512 ipip = &ip_ndx_ioctl_table[index]; 9513 if (ipip->ipi_cmd == ioc_cmd) { 9514 /* Found a match in the ndx table */ 9515 return (ipip); 9516 } 9517 } 9518 9519 /* Search the misc table */ 9520 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 9521 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 9522 if (ipip->ipi_cmd == ioc_cmd) 9523 /* Found a match in the misc table */ 9524 return (ipip); 9525 } 9526 9527 return (NULL); 9528 } 9529 9530 /* 9531 * Wrapper function for resuming deferred ioctl processing 9532 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 9533 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 9534 */ 9535 /* ARGSUSED */ 9536 void 9537 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 9538 void *dummy_arg) 9539 { 9540 ip_sioctl_copyin_setup(q, mp); 9541 } 9542 9543 /* 9544 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 9545 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 9546 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 9547 * We establish here the size of the block to be copied in. mi_copyin 9548 * arranges for this to happen, an processing continues in ip_wput with 9549 * an M_IOCDATA message. 9550 */ 9551 void 9552 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 9553 { 9554 int copyin_size; 9555 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9556 ip_ioctl_cmd_t *ipip; 9557 cred_t *cr; 9558 9559 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 9560 if (ipip == NULL) { 9561 /* 9562 * The ioctl is not one we understand or own. 9563 * Pass it along to be processed down stream, 9564 * if this is a module instance of IP, else nak 9565 * the ioctl. 
9566 */ 9567 if (q->q_next == NULL) { 9568 goto nak; 9569 } else { 9570 putnext(q, mp); 9571 return; 9572 } 9573 } 9574 9575 /* 9576 * If this is deferred, then we will do all the checks when we 9577 * come back. 9578 */ 9579 if ((iocp->ioc_cmd == SIOCGDSTINFO || 9580 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup()) { 9581 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 9582 return; 9583 } 9584 9585 /* 9586 * Only allow a very small subset of IP ioctls on this stream if 9587 * IP is a module and not a driver. Allowing ioctls to be processed 9588 * in this case may cause assert failures or data corruption. 9589 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 9590 * ioctls allowed on an IP module stream, after which this stream 9591 * normally becomes a multiplexor (at which time the stream head 9592 * will fail all ioctls). 9593 */ 9594 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 9595 if (ipip->ipi_flags & IPI_PASS_DOWN) { 9596 /* 9597 * Pass common Streams ioctls which the IP 9598 * module does not own or consume along to 9599 * be processed down stream. 9600 */ 9601 putnext(q, mp); 9602 return; 9603 } else { 9604 goto nak; 9605 } 9606 } 9607 9608 /* Make sure we have ioctl data to process. */ 9609 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 9610 goto nak; 9611 9612 /* 9613 * Prefer dblk credential over ioctl credential; some synthesized 9614 * ioctls have kcred set because there's no way to crhold() 9615 * a credential in some contexts. (ioc_cr is not crfree() by 9616 * the framework; the caller of ioctl needs to hold the reference 9617 * for the duration of the call). 
9618 */ 9619 cr = DB_CREDDEF(mp, iocp->ioc_cr); 9620 9621 /* Make sure normal users don't send down privileged ioctls */ 9622 if ((ipip->ipi_flags & IPI_PRIV) && 9623 (cr != NULL) && secpolicy_net_config(cr, B_TRUE) != 0) { 9624 /* We checked the privilege earlier but log it here */ 9625 miocnak(q, mp, 0, secpolicy_net_config(cr, B_FALSE)); 9626 return; 9627 } 9628 9629 /* 9630 * The ioctl command tables can only encode fixed length 9631 * ioctl data. If the length is variable, the table will 9632 * encode the length as zero. Such special cases are handled 9633 * below in the switch. 9634 */ 9635 if (ipip->ipi_copyin_size != 0) { 9636 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 9637 return; 9638 } 9639 9640 switch (iocp->ioc_cmd) { 9641 case O_SIOCGIFCONF: 9642 case SIOCGIFCONF: 9643 /* 9644 * This IOCTL is hilarious. See comments in 9645 * ip_sioctl_get_ifconf for the story. 9646 */ 9647 if (iocp->ioc_count == TRANSPARENT) 9648 copyin_size = SIZEOF_STRUCT(ifconf, 9649 iocp->ioc_flag); 9650 else 9651 copyin_size = iocp->ioc_count; 9652 mi_copyin(q, mp, NULL, copyin_size); 9653 return; 9654 9655 case O_SIOCGLIFCONF: 9656 case SIOCGLIFCONF: 9657 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 9658 mi_copyin(q, mp, NULL, copyin_size); 9659 return; 9660 9661 case SIOCGLIFSRCOF: 9662 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 9663 mi_copyin(q, mp, NULL, copyin_size); 9664 return; 9665 case SIOCGIP6ADDRPOLICY: 9666 ip_sioctl_ip6addrpolicy(q, mp); 9667 ip6_asp_table_refrele(); 9668 return; 9669 9670 case SIOCSIP6ADDRPOLICY: 9671 ip_sioctl_ip6addrpolicy(q, mp); 9672 return; 9673 9674 case SIOCGDSTINFO: 9675 ip_sioctl_dstinfo(q, mp); 9676 ip6_asp_table_refrele(); 9677 return; 9678 9679 case I_PLINK: 9680 case I_PUNLINK: 9681 case I_LINK: 9682 case I_UNLINK: 9683 /* 9684 * We treat non-persistent link similarly as the persistent 9685 * link case, in terms of plumbing/unplumbing, as well as 9686 * dynamic re-plumbing events indicator. 
See comments 9687 * in ip_sioctl_plink() for more. 9688 * 9689 * Request can be enqueued in the 'ipsq' while waiting 9690 * to become exclusive. So bump up the conn ref. 9691 */ 9692 if (CONN_Q(q)) 9693 CONN_INC_REF(Q_TO_CONN(q)); 9694 ip_sioctl_plink(NULL, q, mp, NULL); 9695 return; 9696 9697 case ND_GET: 9698 case ND_SET: 9699 /* 9700 * Use of the nd table requires holding the reader lock. 9701 * Modifying the nd table thru nd_load/nd_unload requires 9702 * the writer lock. 9703 */ 9704 rw_enter(&ip_g_nd_lock, RW_READER); 9705 if (nd_getset(q, ip_g_nd, mp)) { 9706 rw_exit(&ip_g_nd_lock); 9707 9708 if (iocp->ioc_error) 9709 iocp->ioc_count = 0; 9710 mp->b_datap->db_type = M_IOCACK; 9711 qreply(q, mp); 9712 return; 9713 } 9714 rw_exit(&ip_g_nd_lock); 9715 /* 9716 * We don't understand this subioctl of ND_GET / ND_SET. 9717 * Maybe intended for some driver / module below us 9718 */ 9719 if (q->q_next) { 9720 putnext(q, mp); 9721 } else { 9722 iocp->ioc_error = ENOENT; 9723 mp->b_datap->db_type = M_IOCNAK; 9724 iocp->ioc_count = 0; 9725 qreply(q, mp); 9726 } 9727 return; 9728 9729 case IP_IOCTL: 9730 ip_wput_ioctl(q, mp); 9731 return; 9732 default: 9733 cmn_err(CE_PANIC, "should not happen "); 9734 } 9735 nak: 9736 if (mp->b_cont != NULL) { 9737 freemsg(mp->b_cont); 9738 mp->b_cont = NULL; 9739 } 9740 iocp->ioc_error = EINVAL; 9741 mp->b_datap->db_type = M_IOCNAK; 9742 iocp->ioc_count = 0; 9743 qreply(q, mp); 9744 } 9745 9746 /* ip_wput hands off ARP IOCTL responses to us */ 9747 void 9748 ip_sioctl_iocack(queue_t *q, mblk_t *mp) 9749 { 9750 struct arpreq *ar; 9751 struct xarpreq *xar; 9752 area_t *area; 9753 mblk_t *area_mp; 9754 struct iocblk *iocp; 9755 mblk_t *orig_ioc_mp, *tmp; 9756 struct iocblk *orig_iocp; 9757 ill_t *ill; 9758 conn_t *connp = NULL; 9759 uint_t ioc_id; 9760 mblk_t *pending_mp; 9761 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 9762 int *flagsp; 9763 char *storage = NULL; 9764 sin_t *sin; 9765 ipaddr_t addr; 9766 int err; 9767 9768 ill = 
q->q_ptr; 9769 ASSERT(ill != NULL); 9770 9771 /* 9772 * We should get back from ARP a packet chain that looks like: 9773 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9774 */ 9775 if (!(area_mp = mp->b_cont) || 9776 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 9777 !(orig_ioc_mp = area_mp->b_cont) || 9778 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 9779 freemsg(mp); 9780 return; 9781 } 9782 9783 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 9784 9785 tmp = (orig_ioc_mp->b_cont)->b_cont; 9786 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 9787 (orig_iocp->ioc_cmd == SIOCSXARP) || 9788 (orig_iocp->ioc_cmd == SIOCDXARP)) { 9789 x_arp_ioctl = B_TRUE; 9790 xar = (struct xarpreq *)tmp->b_rptr; 9791 sin = (sin_t *)&xar->xarp_pa; 9792 flagsp = &xar->xarp_flags; 9793 storage = xar->xarp_ha.sdl_data; 9794 if (xar->xarp_ha.sdl_nlen != 0) 9795 ifx_arp_ioctl = B_TRUE; 9796 } else { 9797 ar = (struct arpreq *)tmp->b_rptr; 9798 sin = (sin_t *)&ar->arp_pa; 9799 flagsp = &ar->arp_flags; 9800 storage = ar->arp_ha.sa_data; 9801 } 9802 9803 iocp = (struct iocblk *)mp->b_rptr; 9804 9805 /* 9806 * Pick out the originating queue based on the ioc_id. 9807 */ 9808 ioc_id = iocp->ioc_id; 9809 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); 9810 if (pending_mp == NULL) { 9811 ASSERT(connp == NULL); 9812 inet_freemsg(mp); 9813 return; 9814 } 9815 ASSERT(connp != NULL); 9816 q = CONNP_TO_WQ(connp); 9817 9818 /* Uncouple the internally generated IOCTL from the original one */ 9819 area = (area_t *)area_mp->b_rptr; 9820 area_mp->b_cont = NULL; 9821 9822 /* 9823 * Restore the b_next and b_prev used by mi code. This is needed 9824 * to complete the ioctl using mi* functions. We stored them in 9825 * the pending mp prior to sending the request to ARP. 
9826 */ 9827 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 9828 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 9829 inet_freemsg(pending_mp); 9830 9831 /* 9832 * We're done if there was an error or if this is not an SIOCG{X}ARP 9833 * Catch the case where there is an IRE_CACHE by no entry in the 9834 * arp table. 9835 */ 9836 addr = sin->sin_addr.s_addr; 9837 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 9838 ire_t *ire; 9839 dl_unitdata_req_t *dlup; 9840 mblk_t *llmp; 9841 int addr_len; 9842 ill_t *ipsqill = NULL; 9843 9844 if (ifx_arp_ioctl) { 9845 /* 9846 * There's no need to lookup the ill, since 9847 * we've already done that when we started 9848 * processing the ioctl and sent the message 9849 * to ARP on that ill. So use the ill that 9850 * is stored in q->q_ptr. 9851 */ 9852 ipsqill = ill; 9853 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 9854 ipsqill->ill_ipif, ALL_ZONES, 9855 MATCH_IRE_TYPE | MATCH_IRE_ILL); 9856 } else { 9857 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 9858 NULL, ALL_ZONES, MATCH_IRE_TYPE); 9859 if (ire != NULL) 9860 ipsqill = ire_to_ill(ire); 9861 } 9862 9863 if ((x_arp_ioctl) && (ipsqill != NULL)) 9864 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 9865 9866 if (ire != NULL) { 9867 *flagsp = ATF_INUSE; 9868 llmp = ire->ire_dlureq_mp; 9869 if (llmp != NULL && ipsqill != NULL) { 9870 uchar_t *macaddr; 9871 9872 addr_len = ipsqill->ill_phys_addr_length; 9873 if (x_arp_ioctl && ((addr_len + 9874 ipsqill->ill_name_length) > 9875 sizeof (xar->xarp_ha.sdl_data))) { 9876 ire_refrele(ire); 9877 freemsg(mp); 9878 ip_ioctl_finish(q, orig_ioc_mp, 9879 EINVAL, NO_COPYOUT, NULL, NULL); 9880 return; 9881 } 9882 *flagsp |= ATF_COM; 9883 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 9884 if (ipsqill->ill_sap_length < 0) 9885 macaddr = llmp->b_rptr + 9886 dlup->dl_dest_addr_offset; 9887 else 9888 macaddr = llmp->b_rptr + 9889 dlup->dl_dest_addr_offset + 9890 ipsqill->ill_sap_length; 9891 /* 9892 * For SIOCGARP, MAC 
address length 9893 * validation has already been done 9894 * before the ioctl was issued to ARP to 9895 * allow it to progress only on 6 byte 9896 * addressable (ethernet like) media. Thus 9897 * the mac address copying can not overwrite 9898 * the sa_data area below. 9899 */ 9900 bcopy(macaddr, storage, addr_len); 9901 } 9902 /* Ditch the internal IOCTL. */ 9903 freemsg(mp); 9904 ire_refrele(ire); 9905 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL); 9906 return; 9907 } 9908 } 9909 9910 /* 9911 * Delete the coresponding IRE_CACHE if any. 9912 * Reset the error if there was one (in case there was no entry 9913 * in arp.) 9914 */ 9915 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 9916 ipif_t *ipintf = NULL; 9917 9918 if (ifx_arp_ioctl) { 9919 /* 9920 * There's no need to lookup the ill, since 9921 * we've already done that when we started 9922 * processing the ioctl and sent the message 9923 * to ARP on that ill. So use the ill that 9924 * is stored in q->q_ptr. 9925 */ 9926 ipintf = ill->ill_ipif; 9927 } 9928 if (ip_ire_clookup_and_delete(addr, ipintf)) { 9929 /* 9930 * The address in "addr" may be an entry for a 9931 * router. If that's true, then any off-net 9932 * IRE_CACHE entries that go through the router 9933 * with address "addr" must be clobbered. Use 9934 * ire_walk to achieve this goal. 9935 */ 9936 if (ifx_arp_ioctl) 9937 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 9938 ire_delete_cache_gw, (char *)&addr, ill); 9939 else 9940 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 9941 ALL_ZONES); 9942 iocp->ioc_error = 0; 9943 } 9944 } 9945 9946 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 9947 err = iocp->ioc_error; 9948 freemsg(mp); 9949 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL, NULL); 9950 return; 9951 } 9952 9953 /* 9954 * Completion of an SIOCG{X}ARP. Translate the information from 9955 * the area_t into the struct {x}arpreq. 
9956 */ 9957 if (x_arp_ioctl) { 9958 storage += ill_xarp_info(&xar->xarp_ha, ill); 9959 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9960 sizeof (xar->xarp_ha.sdl_data)) { 9961 freemsg(mp); 9962 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, 9963 NO_COPYOUT, NULL, NULL); 9964 return; 9965 } 9966 } 9967 *flagsp = ATF_INUSE; 9968 if (area->area_flags & ACE_F_PERMANENT) 9969 *flagsp |= ATF_PERM; 9970 if (area->area_flags & ACE_F_PUBLISH) 9971 *flagsp |= ATF_PUBL; 9972 if (area->area_hw_addr_length != 0) { 9973 *flagsp |= ATF_COM; 9974 /* 9975 * For SIOCGARP, MAC address length validation has 9976 * already been done before the ioctl was issued to ARP 9977 * to allow it to progress only on 6 byte addressable 9978 * (ethernet like) media. Thus the mac address copying 9979 * can not overwrite the sa_data area below. 9980 */ 9981 bcopy((char *)area + area->area_hw_addr_offset, 9982 storage, area->area_hw_addr_length); 9983 } 9984 9985 /* Ditch the internal IOCTL. */ 9986 freemsg(mp); 9987 /* Complete the original. */ 9988 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL, NULL); 9989 } 9990 9991 /* 9992 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9993 * interface) create the next available logical interface for this 9994 * physical interface. 9995 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9996 * ipif with the specified name. 9997 * 9998 * If the address family is not AF_UNSPEC then set the address as well. 9999 * 10000 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10001 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 10002 * 10003 * Executed as a writer on the ill or ill group. 10004 * So no lock is needed to traverse the ipif chain, or examine the 10005 * phyint flags. 
10006 */ 10007 /* ARGSUSED */ 10008 int 10009 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10010 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10011 { 10012 mblk_t *mp1; 10013 struct lifreq *lifr; 10014 boolean_t isv6; 10015 boolean_t exists; 10016 char *name; 10017 char *endp; 10018 char *cp; 10019 int namelen; 10020 ipif_t *ipif; 10021 long id; 10022 ipsq_t *ipsq; 10023 ill_t *ill; 10024 sin_t *sin; 10025 int err = 0; 10026 boolean_t found_sep = B_FALSE; 10027 conn_t *connp; 10028 zoneid_t zoneid; 10029 int orig_ifindex = 0; 10030 10031 ip1dbg(("ip_sioctl_addif\n")); 10032 /* Existence of mp1 has been checked in ip_wput_nondata */ 10033 mp1 = mp->b_cont->b_cont; 10034 /* 10035 * Null terminate the string to protect against buffer 10036 * overrun. String was generated by user code and may not 10037 * be trusted. 10038 */ 10039 lifr = (struct lifreq *)mp1->b_rptr; 10040 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10041 name = lifr->lifr_name; 10042 ASSERT(CONN_Q(q)); 10043 connp = Q_TO_CONN(q); 10044 isv6 = connp->conn_af_isv6; 10045 zoneid = connp->conn_zoneid; 10046 namelen = mi_strlen(name); 10047 if (namelen == 0) 10048 return (EINVAL); 10049 10050 exists = B_FALSE; 10051 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10052 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10053 /* 10054 * Allow creating lo0 using SIOCLIFADDIF. 10055 * can't be any other writer thread. So can pass null below 10056 * for the last 4 args to ipif_lookup_name. 10057 */ 10058 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, 10059 B_TRUE, &exists, isv6, zoneid, NULL, NULL, NULL, NULL); 10060 /* Prevent any further action */ 10061 if (ipif == NULL) { 10062 return (ENOBUFS); 10063 } else if (!exists) { 10064 /* We created the ipif now and as writer */ 10065 ipif_refrele(ipif); 10066 return (0); 10067 } else { 10068 ill = ipif->ipif_ill; 10069 ill_refhold(ill); 10070 ipif_refrele(ipif); 10071 } 10072 } else { 10073 /* Look for a colon in the name. 
*/ 10074 endp = &name[namelen]; 10075 for (cp = endp; --cp > name; ) { 10076 if (*cp == IPIF_SEPARATOR_CHAR) { 10077 found_sep = B_TRUE; 10078 /* 10079 * Reject any non-decimal aliases for plumbing 10080 * of logical interfaces. Aliases with leading 10081 * zeroes are also rejected as they introduce 10082 * ambiguity in the naming of the interfaces. 10083 * Comparing with "0" takes care of all such 10084 * cases. 10085 */ 10086 if ((strncmp("0", cp+1, 1)) == 0) 10087 return (EINVAL); 10088 10089 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10090 id <= 0 || *endp != '\0') { 10091 return (EINVAL); 10092 } 10093 *cp = '\0'; 10094 break; 10095 } 10096 } 10097 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10098 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL); 10099 if (found_sep) 10100 *cp = IPIF_SEPARATOR_CHAR; 10101 if (ill == NULL) 10102 return (err); 10103 } 10104 10105 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10106 B_TRUE); 10107 10108 /* 10109 * Release the refhold due to the lookup, now that we are excl 10110 * or we are just returning 10111 */ 10112 ill_refrele(ill); 10113 10114 if (ipsq == NULL) 10115 return (EINPROGRESS); 10116 10117 /* 10118 * If the interface is failed, inactive or offlined, look for a working 10119 * interface in the ill group and create the ipif there. If we can't 10120 * find a good interface, create the ipif anyway so that in.mpathd can 10121 * move it to the first repaired interface. 10122 */ 10123 if ((ill->ill_phyint->phyint_flags & 10124 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10125 ill->ill_phyint->phyint_groupname_len != 0) { 10126 phyint_t *phyi; 10127 char *groupname = ill->ill_phyint->phyint_groupname; 10128 10129 /* 10130 * We're looking for a working interface, but it doesn't matter 10131 * if it's up or down; so instead of following the group lists, 10132 * we look at each physical interface and compare the groupname. 10133 * We're only interested in interfaces with IPv4 (resp. 
IPv6) 10134 * plumbed when we're adding an IPv4 (resp. IPv6) ipif. 10135 * Otherwise we create the ipif on the failed interface. 10136 */ 10137 rw_enter(&ill_g_lock, RW_READER); 10138 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 10139 for (; phyi != NULL; 10140 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 10141 phyi, AVL_AFTER)) { 10142 if (phyi->phyint_groupname_len == 0) 10143 continue; 10144 ASSERT(phyi->phyint_groupname != NULL); 10145 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10146 !(phyi->phyint_flags & 10147 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10148 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10149 (phyi->phyint_illv4 != NULL))) { 10150 break; 10151 } 10152 } 10153 rw_exit(&ill_g_lock); 10154 10155 if (phyi != NULL) { 10156 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10157 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10158 phyi->phyint_illv4); 10159 } 10160 } 10161 10162 /* 10163 * We are now exclusive on the ipsq, so an ill move will be serialized 10164 * before or after us. 10165 */ 10166 ASSERT(IAM_WRITER_ILL(ill)); 10167 ASSERT(ill->ill_move_in_progress == B_FALSE); 10168 10169 if (found_sep && orig_ifindex == 0) { 10170 /* Now see if there is an IPIF with this unit number. */ 10171 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 10172 if (ipif->ipif_id == id) { 10173 err = EEXIST; 10174 goto done; 10175 } 10176 } 10177 } 10178 10179 /* 10180 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10181 * of lo0. We never come here when we plumb lo0:0. It 10182 * happens in ipif_lookup_on_name. 10183 * The specified unit number is ignored when we create the ipif on a 10184 * different interface. However, we save it in ipif_orig_ipifid below so 10185 * that the ipif fails back to the right position. 10186 */ 10187 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 
10188 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10189 err = ENOBUFS; 10190 goto done; 10191 } 10192 10193 /* Return created name with ioctl */ 10194 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10195 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10196 ip1dbg(("created %s\n", lifr->lifr_name)); 10197 10198 /* Set address */ 10199 sin = (sin_t *)&lifr->lifr_addr; 10200 if (sin->sin_family != AF_UNSPEC) { 10201 err = ip_sioctl_addr(ipif, sin, q, mp, 10202 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10203 } 10204 10205 /* Set ifindex and unit number for failback */ 10206 if (err == 0 && orig_ifindex != 0) { 10207 ipif->ipif_orig_ifindex = orig_ifindex; 10208 if (found_sep) { 10209 ipif->ipif_orig_ipifid = id; 10210 } 10211 } 10212 10213 done: 10214 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10215 return (err); 10216 } 10217 10218 /* 10219 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10220 * interface) delete it based on the IP address (on this physical interface). 10221 * Otherwise delete it based on the ipif_id. 10222 * Also, special handling to allow a removeif of lo0. 10223 */ 10224 /* ARGSUSED */ 10225 int 10226 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10227 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10228 { 10229 conn_t *connp; 10230 ill_t *ill = ipif->ipif_ill; 10231 boolean_t success; 10232 10233 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10234 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10235 ASSERT(IAM_WRITER_IPIF(ipif)); 10236 10237 connp = Q_TO_CONN(q); 10238 /* 10239 * Special case for unplumbing lo0 (the loopback physical interface). 10240 * If unplumbing lo0, the incoming address structure has been 10241 * initialized to all zeros. When unplumbing lo0, all its logical 10242 * interfaces must be removed too. 10243 * 10244 * Note that this interface may be called to remove a specific 10245 * loopback logical interface (eg, lo0:1). 
But in that case 10246 * ipif->ipif_id != 0 so that the code path for that case is the 10247 * same as any other interface (meaning it skips the code directly 10248 * below). 10249 */ 10250 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10251 if (sin->sin_family == AF_UNSPEC && 10252 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10253 /* 10254 * Mark it condemned. No new ref. will be made to ill. 10255 */ 10256 mutex_enter(&ill->ill_lock); 10257 ill->ill_state_flags |= ILL_CONDEMNED; 10258 for (ipif = ill->ill_ipif; ipif != NULL; 10259 ipif = ipif->ipif_next) { 10260 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10261 } 10262 mutex_exit(&ill->ill_lock); 10263 10264 ipif = ill->ill_ipif; 10265 /* unplumb the loopback interface */ 10266 ill_delete(ill); 10267 mutex_enter(&connp->conn_lock); 10268 mutex_enter(&ill->ill_lock); 10269 ASSERT(ill->ill_group == NULL); 10270 10271 /* Are any references to this ill active */ 10272 if (ill_is_quiescent(ill)) { 10273 mutex_exit(&ill->ill_lock); 10274 mutex_exit(&connp->conn_lock); 10275 ill_delete_tail(ill); 10276 mi_free(ill); 10277 return (0); 10278 } 10279 success = ipsq_pending_mp_add(connp, ipif, 10280 CONNP_TO_WQ(connp), mp, ILL_FREE); 10281 mutex_exit(&connp->conn_lock); 10282 mutex_exit(&ill->ill_lock); 10283 if (success) 10284 return (EINPROGRESS); 10285 else 10286 return (EINTR); 10287 } 10288 } 10289 10290 /* 10291 * We are exclusive on the ipsq, so an ill move will be serialized 10292 * before or after us. 
10293 */ 10294 ASSERT(ill->ill_move_in_progress == B_FALSE); 10295 10296 if (ipif->ipif_id == 0) { 10297 /* Find based on address */ 10298 if (ipif->ipif_isv6) { 10299 sin6_t *sin6; 10300 10301 if (sin->sin_family != AF_INET6) 10302 return (EAFNOSUPPORT); 10303 10304 sin6 = (sin6_t *)sin; 10305 /* We are a writer, so we should be able to lookup */ 10306 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10307 ill, ALL_ZONES, NULL, NULL, NULL, NULL); 10308 if (ipif == NULL) { 10309 /* 10310 * Maybe the address in on another interface in 10311 * the same IPMP group? We check this below. 10312 */ 10313 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10314 NULL, ALL_ZONES, NULL, NULL, NULL, NULL); 10315 } 10316 } else { 10317 ipaddr_t addr; 10318 10319 if (sin->sin_family != AF_INET) 10320 return (EAFNOSUPPORT); 10321 10322 addr = sin->sin_addr.s_addr; 10323 /* We are a writer, so we should be able to lookup */ 10324 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, 10325 NULL, NULL, NULL); 10326 if (ipif == NULL) { 10327 /* 10328 * Maybe the address in on another interface in 10329 * the same IPMP group? We check this below. 10330 */ 10331 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, 10332 NULL, NULL, NULL, NULL); 10333 } 10334 } 10335 if (ipif == NULL) { 10336 return (EADDRNOTAVAIL); 10337 } 10338 /* 10339 * When the address to be removed is hosted on a different 10340 * interface, we check if the interface is in the same IPMP 10341 * group as the specified one; if so we proceed with the 10342 * removal. 10343 * ill->ill_group is NULL when the ill is down, so we have to 10344 * compare the group names instead. 
10345 */ 10346 if (ipif->ipif_ill != ill && 10347 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || 10348 ill->ill_phyint->phyint_groupname_len == 0 || 10349 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, 10350 ill->ill_phyint->phyint_groupname) != 0)) { 10351 ipif_refrele(ipif); 10352 return (EADDRNOTAVAIL); 10353 } 10354 10355 /* This is a writer */ 10356 ipif_refrele(ipif); 10357 } 10358 10359 /* 10360 * Can not delete instance zero since it is tied to the ill. 10361 */ 10362 if (ipif->ipif_id == 0) 10363 return (EBUSY); 10364 10365 mutex_enter(&ill->ill_lock); 10366 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10367 mutex_exit(&ill->ill_lock); 10368 10369 ipif_free(ipif); 10370 10371 mutex_enter(&connp->conn_lock); 10372 mutex_enter(&ill->ill_lock); 10373 10374 /* Are any references to this ipif active */ 10375 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { 10376 mutex_exit(&ill->ill_lock); 10377 mutex_exit(&connp->conn_lock); 10378 ipif_down_tail(ipif); 10379 ipif_free_tail(ipif); 10380 return (0); 10381 } 10382 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10383 IPIF_FREE); 10384 mutex_exit(&ill->ill_lock); 10385 mutex_exit(&connp->conn_lock); 10386 if (success) 10387 return (EINPROGRESS); 10388 else 10389 return (EINTR); 10390 } 10391 10392 /* 10393 * Restart the removeif ioctl. The refcnt has gone down to 0. 10394 * The ipif is already condemned. So can't find it thru lookups. 
10395 */ 10396 /* ARGSUSED */ 10397 int 10398 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10399 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10400 { 10401 ill_t *ill; 10402 10403 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10404 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10405 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10406 ill = ipif->ipif_ill; 10407 ASSERT(IAM_WRITER_ILL(ill)); 10408 ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) && 10409 (ill->ill_state_flags & IPIF_CONDEMNED)); 10410 ill_delete_tail(ill); 10411 mi_free(ill); 10412 return (0); 10413 } 10414 10415 ill = ipif->ipif_ill; 10416 ASSERT(IAM_WRITER_IPIF(ipif)); 10417 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10418 10419 ipif_down_tail(ipif); 10420 ipif_free_tail(ipif); 10421 10422 ILL_UNMARK_CHANGING(ill); 10423 return (0); 10424 } 10425 10426 /* 10427 * Set the local interface address. 10428 * Allow an address of all zero when the interface is down. 10429 */ 10430 /* ARGSUSED */ 10431 int 10432 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10433 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10434 { 10435 int err = 0; 10436 in6_addr_t v6addr; 10437 boolean_t need_up = B_FALSE; 10438 10439 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 10440 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10441 10442 ASSERT(IAM_WRITER_IPIF(ipif)); 10443 10444 if (ipif->ipif_isv6) { 10445 sin6_t *sin6; 10446 ill_t *ill; 10447 phyint_t *phyi; 10448 10449 if (sin->sin_family != AF_INET6) 10450 return (EAFNOSUPPORT); 10451 10452 sin6 = (sin6_t *)sin; 10453 v6addr = sin6->sin6_addr; 10454 ill = ipif->ipif_ill; 10455 phyi = ill->ill_phyint; 10456 10457 /* 10458 * Enforce that true multicast interfaces have a link-local 10459 * address for logical unit 0. 
10460 */ 10461 if (ipif->ipif_id == 0 && 10462 (ill->ill_flags & ILLF_MULTICAST) && 10463 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 10464 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 10465 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 10466 return (EADDRNOTAVAIL); 10467 } 10468 10469 /* 10470 * up interfaces shouldn't have the unspecified address 10471 * unless they also have the IPIF_NOLOCAL flags set and 10472 * have a subnet assigned. 10473 */ 10474 if ((ipif->ipif_flags & IPIF_UP) && 10475 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 10476 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 10477 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 10478 return (EADDRNOTAVAIL); 10479 } 10480 10481 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10482 return (EADDRNOTAVAIL); 10483 } else { 10484 ipaddr_t addr; 10485 10486 if (sin->sin_family != AF_INET) 10487 return (EAFNOSUPPORT); 10488 10489 addr = sin->sin_addr.s_addr; 10490 10491 /* Allow 0 as the local address. */ 10492 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10493 return (EADDRNOTAVAIL); 10494 10495 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10496 } 10497 10498 10499 /* 10500 * Even if there is no change we redo things just to rerun 10501 * ipif_set_default. 10502 */ 10503 if (ipif->ipif_flags & IPIF_UP) { 10504 /* 10505 * Setting a new local address, make sure 10506 * we have net and subnet bcast ire's for 10507 * the old address if we need them. 10508 */ 10509 if (!ipif->ipif_isv6) 10510 ipif_check_bcast_ires(ipif); 10511 /* 10512 * If the interface is already marked up, 10513 * we call ipif_down which will take care 10514 * of ditching any IREs that have been set 10515 * up based on the old interface address. 
10516 */ 10517 err = ipif_logical_down(ipif, q, mp); 10518 if (err == EINPROGRESS) 10519 return (err); 10520 ipif_down_tail(ipif); 10521 need_up = 1; 10522 } 10523 10524 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 10525 return (err); 10526 } 10527 10528 int 10529 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10530 boolean_t need_up) 10531 { 10532 in6_addr_t v6addr; 10533 ipaddr_t addr; 10534 sin6_t *sin6; 10535 int err = 0; 10536 10537 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 10538 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10539 ASSERT(IAM_WRITER_IPIF(ipif)); 10540 if (ipif->ipif_isv6) { 10541 sin6 = (sin6_t *)sin; 10542 v6addr = sin6->sin6_addr; 10543 } else { 10544 addr = sin->sin_addr.s_addr; 10545 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10546 } 10547 mutex_enter(&ipif->ipif_ill->ill_lock); 10548 ipif->ipif_v6lcl_addr = v6addr; 10549 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 10550 ipif->ipif_v6src_addr = ipv6_all_zeros; 10551 } else { 10552 ipif->ipif_v6src_addr = v6addr; 10553 } 10554 10555 if ((ipif->ipif_isv6) && IN6_IS_ADDR_6TO4(&v6addr) && 10556 (!ipif->ipif_ill->ill_is_6to4tun)) { 10557 queue_t *wqp = ipif->ipif_ill->ill_wq; 10558 10559 /* 10560 * The local address of this interface is a 6to4 address, 10561 * check if this interface is in fact a 6to4 tunnel or just 10562 * an interface configured with a 6to4 address. We are only 10563 * interested in the former. 
10564 */ 10565 if (wqp != NULL) { 10566 while ((wqp->q_next != NULL) && 10567 (wqp->q_next->q_qinfo != NULL) && 10568 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 10569 10570 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 10571 == TUN6TO4_MODID) { 10572 /* set for use in IP */ 10573 ipif->ipif_ill->ill_is_6to4tun = 1; 10574 break; 10575 } 10576 wqp = wqp->q_next; 10577 } 10578 } 10579 } 10580 10581 ipif_set_default(ipif); 10582 mutex_exit(&ipif->ipif_ill->ill_lock); 10583 10584 if (need_up) { 10585 /* 10586 * Now bring the interface back up. If this 10587 * is the only IPIF for the ILL, ipif_up 10588 * will have to re-bind to the device, so 10589 * we may get back EINPROGRESS, in which 10590 * case, this IOCTL will get completed in 10591 * ip_rput_dlpi when we see the DL_BIND_ACK. 10592 */ 10593 err = ipif_up(ipif, q, mp); 10594 } else { 10595 /* 10596 * Update the IPIF list in SCTP, ipif_up_done() will do it 10597 * if need_up is true. 10598 */ 10599 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10600 } 10601 10602 return (err); 10603 } 10604 10605 10606 /* 10607 * Restart entry point to restart the address set operation after the 10608 * refcounts have dropped to zero. 
10609 */ 10610 /* ARGSUSED */ 10611 int 10612 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10613 ip_ioctl_cmd_t *ipip, void *ifreq) 10614 { 10615 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 10616 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10617 ASSERT(IAM_WRITER_IPIF(ipif)); 10618 ipif_down_tail(ipif); 10619 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 10620 } 10621 10622 /* ARGSUSED */ 10623 int 10624 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10625 ip_ioctl_cmd_t *ipip, void *if_req) 10626 { 10627 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 10628 struct lifreq *lifr = (struct lifreq *)if_req; 10629 10630 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 10631 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10632 /* 10633 * The net mask and address can't change since we have a 10634 * reference to the ipif. So no lock is necessary. 10635 */ 10636 if (ipif->ipif_isv6) { 10637 *sin6 = sin6_null; 10638 sin6->sin6_family = AF_INET6; 10639 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 10640 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10641 lifr->lifr_addrlen = 10642 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 10643 } else { 10644 *sin = sin_null; 10645 sin->sin_family = AF_INET; 10646 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 10647 if (ipip->ipi_cmd_type == LIF_CMD) { 10648 lifr->lifr_addrlen = 10649 ip_mask_to_plen(ipif->ipif_net_mask); 10650 } 10651 } 10652 return (0); 10653 } 10654 10655 /* 10656 * Set the destination address for a pt-pt interface. 
10657 */ 10658 /* ARGSUSED */ 10659 int 10660 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10661 ip_ioctl_cmd_t *ipip, void *if_req) 10662 { 10663 int err = 0; 10664 in6_addr_t v6addr; 10665 boolean_t need_up = B_FALSE; 10666 10667 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 10668 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10669 ASSERT(IAM_WRITER_IPIF(ipif)); 10670 10671 if (ipif->ipif_isv6) { 10672 sin6_t *sin6; 10673 10674 if (sin->sin_family != AF_INET6) 10675 return (EAFNOSUPPORT); 10676 10677 sin6 = (sin6_t *)sin; 10678 v6addr = sin6->sin6_addr; 10679 10680 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10681 return (EADDRNOTAVAIL); 10682 } else { 10683 ipaddr_t addr; 10684 10685 if (sin->sin_family != AF_INET) 10686 return (EAFNOSUPPORT); 10687 10688 addr = sin->sin_addr.s_addr; 10689 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10690 return (EADDRNOTAVAIL); 10691 10692 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10693 } 10694 10695 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 10696 return (0); /* No change */ 10697 10698 if (ipif->ipif_flags & IPIF_UP) { 10699 /* 10700 * If the interface is already marked up, 10701 * we call ipif_down which will take care 10702 * of ditching any IREs that have been set 10703 * up based on the old pp dst address. 10704 */ 10705 err = ipif_logical_down(ipif, q, mp); 10706 if (err == EINPROGRESS) 10707 return (err); 10708 ipif_down_tail(ipif); 10709 need_up = B_TRUE; 10710 } 10711 /* 10712 * could return EINPROGRESS. 
If so ioctl will complete in 10713 * ip_rput_dlpi_writer 10714 */ 10715 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 10716 return (err); 10717 } 10718 10719 static int 10720 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10721 boolean_t need_up) 10722 { 10723 in6_addr_t v6addr; 10724 ill_t *ill = ipif->ipif_ill; 10725 int err = 0; 10726 10727 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", 10728 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10729 if (ipif->ipif_isv6) { 10730 sin6_t *sin6; 10731 10732 sin6 = (sin6_t *)sin; 10733 v6addr = sin6->sin6_addr; 10734 } else { 10735 ipaddr_t addr; 10736 10737 addr = sin->sin_addr.s_addr; 10738 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10739 } 10740 mutex_enter(&ill->ill_lock); 10741 /* Set point to point destination address. */ 10742 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10743 /* 10744 * Allow this as a means of creating logical 10745 * pt-pt interfaces on top of e.g. an Ethernet. 10746 * XXX Undocumented HACK for testing. 10747 * pt-pt interfaces are created with NUD disabled. 10748 */ 10749 ipif->ipif_flags |= IPIF_POINTOPOINT; 10750 ipif->ipif_flags &= ~IPIF_BROADCAST; 10751 if (ipif->ipif_isv6) 10752 ipif->ipif_ill->ill_flags |= ILLF_NONUD; 10753 } 10754 10755 /* Set the new address. */ 10756 ipif->ipif_v6pp_dst_addr = v6addr; 10757 /* Make sure subnet tracks pp_dst */ 10758 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 10759 mutex_exit(&ill->ill_lock); 10760 10761 if (need_up) { 10762 /* 10763 * Now bring the interface back up. If this 10764 * is the only IPIF for the ILL, ipif_up 10765 * will have to re-bind to the device, so 10766 * we may get back EINPROGRESS, in which 10767 * case, this IOCTL will get completed in 10768 * ip_rput_dlpi when we see the DL_BIND_ACK. 
10769 */ 10770 err = ipif_up(ipif, q, mp); 10771 } 10772 return (err); 10773 } 10774 10775 /* 10776 * Restart entry point to restart the dstaddress set operation after the 10777 * refcounts have dropped to zero. 10778 */ 10779 /* ARGSUSED */ 10780 int 10781 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10782 ip_ioctl_cmd_t *ipip, void *ifreq) 10783 { 10784 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 10785 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10786 ipif_down_tail(ipif); 10787 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 10788 } 10789 10790 /* ARGSUSED */ 10791 int 10792 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10793 ip_ioctl_cmd_t *ipip, void *if_req) 10794 { 10795 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 10796 10797 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 10798 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10799 /* 10800 * Get point to point destination address. The addresses can't 10801 * change since we hold a reference to the ipif. 10802 */ 10803 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 10804 return (EADDRNOTAVAIL); 10805 10806 if (ipif->ipif_isv6) { 10807 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 10808 *sin6 = sin6_null; 10809 sin6->sin6_family = AF_INET6; 10810 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 10811 } else { 10812 *sin = sin_null; 10813 sin->sin_family = AF_INET; 10814 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 10815 } 10816 return (0); 10817 } 10818 10819 /* 10820 * part of ipmp, make this func return the active/inactive state and 10821 * caller can set once atomically instead of multiple mutex_enter/mutex_exit 10822 */ 10823 /* 10824 * This function either sets or clears the IFF_INACTIVE flag. 
10825 * 10826 * As long as there are some addresses or multicast memberships on the 10827 * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we 10828 * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface 10829 * will be used for outbound packets. 10830 * 10831 * Caller needs to verify the validity of setting IFF_INACTIVE. 10832 */ 10833 static void 10834 phyint_inactive(phyint_t *phyi) 10835 { 10836 ill_t *ill_v4; 10837 ill_t *ill_v6; 10838 ipif_t *ipif; 10839 ilm_t *ilm; 10840 10841 ill_v4 = phyi->phyint_illv4; 10842 ill_v6 = phyi->phyint_illv6; 10843 10844 /* 10845 * No need for a lock while traversing the list since iam 10846 * a writer 10847 */ 10848 if (ill_v4 != NULL) { 10849 ASSERT(IAM_WRITER_ILL(ill_v4)); 10850 for (ipif = ill_v4->ill_ipif; ipif != NULL; 10851 ipif = ipif->ipif_next) { 10852 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 10853 mutex_enter(&phyi->phyint_lock); 10854 phyi->phyint_flags &= ~PHYI_INACTIVE; 10855 mutex_exit(&phyi->phyint_lock); 10856 return; 10857 } 10858 } 10859 for (ilm = ill_v4->ill_ilm; ilm != NULL; 10860 ilm = ilm->ilm_next) { 10861 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 10862 mutex_enter(&phyi->phyint_lock); 10863 phyi->phyint_flags &= ~PHYI_INACTIVE; 10864 mutex_exit(&phyi->phyint_lock); 10865 return; 10866 } 10867 } 10868 } 10869 if (ill_v6 != NULL) { 10870 ill_v6 = phyi->phyint_illv6; 10871 for (ipif = ill_v6->ill_ipif; ipif != NULL; 10872 ipif = ipif->ipif_next) { 10873 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 10874 mutex_enter(&phyi->phyint_lock); 10875 phyi->phyint_flags &= ~PHYI_INACTIVE; 10876 mutex_exit(&phyi->phyint_lock); 10877 return; 10878 } 10879 } 10880 for (ilm = ill_v6->ill_ilm; ilm != NULL; 10881 ilm = ilm->ilm_next) { 10882 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 10883 mutex_enter(&phyi->phyint_lock); 10884 phyi->phyint_flags &= ~PHYI_INACTIVE; 10885 mutex_exit(&phyi->phyint_lock); 10886 return; 10887 } 10888 } 10889 } 
10890 mutex_enter(&phyi->phyint_lock); 10891 phyi->phyint_flags |= PHYI_INACTIVE; 10892 mutex_exit(&phyi->phyint_lock); 10893 } 10894 10895 /* 10896 * This function is called only when the phyint flags change. Currently 10897 * called from ip_sioctl_flags. We re-do the broadcast nomination so 10898 * that we can select a good ill. 10899 */ 10900 static void 10901 ip_redo_nomination(phyint_t *phyi) 10902 { 10903 ill_t *ill_v4; 10904 10905 ill_v4 = phyi->phyint_illv4; 10906 10907 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 10908 ASSERT(IAM_WRITER_ILL(ill_v4)); 10909 if (ill_v4->ill_group->illgrp_ill_count > 1) 10910 ill_nominate_bcast_rcv(ill_v4->ill_group); 10911 } 10912 } 10913 10914 /* 10915 * Heuristic to check if ill is INACTIVE. 10916 * Checks if ill has an ipif with an usable ip address. 10917 * 10918 * Return values: 10919 * B_TRUE - ill is INACTIVE; has no usable ipif 10920 * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif 10921 */ 10922 static boolean_t 10923 ill_is_inactive(ill_t *ill) 10924 { 10925 ipif_t *ipif; 10926 10927 /* Check whether it is in an IPMP group */ 10928 if (ill->ill_phyint->phyint_groupname == NULL) 10929 return (B_FALSE); 10930 10931 if (ill->ill_ipif_up_count == 0) 10932 return (B_TRUE); 10933 10934 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 10935 uint64_t flags = ipif->ipif_flags; 10936 10937 /* 10938 * This ipif is usable if it is IPIF_UP and not a 10939 * dedicated test address. A dedicated test address 10940 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED 10941 * (note in particular that V6 test addresses are 10942 * link-local data addresses and thus are marked 10943 * IPIF_NOFAILOVER but not IPIF_DEPRECATED). 10944 */ 10945 if ((flags & IPIF_UP) && 10946 ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != 10947 (IPIF_DEPRECATED|IPIF_NOFAILOVER))) 10948 return (B_FALSE); 10949 } 10950 return (B_TRUE); 10951 } 10952 10953 /* 10954 * Set interface flags. 
10955 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, 10956 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, 10957 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. 10958 * 10959 * NOTE : We really don't enforce that ipif_id zero should be used 10960 * for setting any flags other than IFF_LOGINT_FLAGS. This 10961 * is because applications generally does SICGLIFFLAGS and 10962 * ORs in the new flags (that affects the logical) and does a 10963 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 10964 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 10965 * flags that will be turned on is correct with respect to 10966 * ipif_id 0. For backward compatibility reasons, it is not done. 10967 */ 10968 /* ARGSUSED */ 10969 int 10970 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10971 ip_ioctl_cmd_t *ipip, void *if_req) 10972 { 10973 uint64_t turn_on; 10974 uint64_t turn_off; 10975 int err; 10976 boolean_t need_up = B_FALSE; 10977 phyint_t *phyi; 10978 ill_t *ill; 10979 uint64_t intf_flags; 10980 boolean_t phyint_flags_modified = B_FALSE; 10981 uint64_t flags; 10982 struct ifreq *ifr; 10983 struct lifreq *lifr; 10984 boolean_t set_linklocal = B_FALSE; 10985 boolean_t zero_source = B_FALSE; 10986 10987 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 10988 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10989 10990 ASSERT(IAM_WRITER_IPIF(ipif)); 10991 10992 ill = ipif->ipif_ill; 10993 phyi = ill->ill_phyint; 10994 10995 if (ipip->ipi_cmd_type == IF_CMD) { 10996 ifr = (struct ifreq *)if_req; 10997 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 10998 } else { 10999 lifr = (struct lifreq *)if_req; 11000 flags = lifr->lifr_flags; 11001 } 11002 11003 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11004 11005 /* 11006 * Has the flags been set correctly till now ? 
11007 */ 11008 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11009 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11010 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11011 /* 11012 * Compare the new flags to the old, and partition 11013 * into those coming on and those going off. 11014 * For the 16 bit command keep the bits above bit 16 unchanged. 11015 */ 11016 if (ipip->ipi_cmd == SIOCSIFFLAGS) 11017 flags |= intf_flags & ~0xFFFF; 11018 11019 /* 11020 * First check which bits will change and then which will 11021 * go on and off 11022 */ 11023 turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; 11024 if (!turn_on) 11025 return (0); /* No change */ 11026 11027 turn_off = intf_flags & turn_on; 11028 turn_on ^= turn_off; 11029 err = 0; 11030 11031 /* 11032 * Don't allow any bits belonging to the logical interface 11033 * to be set or cleared on the replacement ipif that was 11034 * created temporarily during a MOVE. 11035 */ 11036 if (ipif->ipif_replace_zero && 11037 ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { 11038 return (EINVAL); 11039 } 11040 11041 /* 11042 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 11043 * IPv6 interfaces. 11044 */ 11045 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 11046 return (EINVAL); 11047 11048 /* 11049 * Don't allow the IFF_ROUTER flag to be turned on on loopback 11050 * interfaces. It makes no sense in that context. 11051 */ 11052 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11053 return (EINVAL); 11054 11055 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11056 zero_source = B_TRUE; 11057 11058 /* 11059 * For IPv6 ipif_id 0, don't allow the interface to be up without 11060 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11061 * If the link local address isn't set, and can be set, it will get 11062 * set later on in this function. 
11063 */ 11064 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11065 (flags & IFF_UP) && !zero_source && 11066 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11067 if (ipif_cant_setlinklocal(ipif)) 11068 return (EINVAL); 11069 set_linklocal = B_TRUE; 11070 } 11071 11072 /* 11073 * ILL cannot be part of a usesrc group and and IPMP group at the 11074 * same time. No need to grab ill_g_usesrc_lock here, see 11075 * synchronization notes in ip.c 11076 */ 11077 if (turn_on & PHYI_STANDBY && 11078 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 11079 return (EINVAL); 11080 } 11081 11082 /* 11083 * If we modify physical interface flags, we'll potentially need to 11084 * send up two routing socket messages for the changes (one for the 11085 * IPv4 ill, and another for the IPv6 ill). Note that here. 11086 */ 11087 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11088 phyint_flags_modified = B_TRUE; 11089 11090 /* 11091 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 11092 * we need to flush the IRE_CACHES belonging to this ill. 11093 * We handle this case here without doing the DOWN/UP dance 11094 * like it is done for other flags. If some other flags are 11095 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 11096 * below will handle it by bringing it down and then 11097 * bringing it UP. 11098 */ 11099 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 11100 ill_t *ill_v4, *ill_v6; 11101 11102 ill_v4 = phyi->phyint_illv4; 11103 ill_v6 = phyi->phyint_illv6; 11104 11105 /* 11106 * First set the INACTIVE flag if needed. Then delete the ires. 11107 * ire_add will atomically prevent creating new IRE_CACHEs 11108 * unless hidden flag is set. 
11109 * PHYI_FAILED and PHYI_INACTIVE are exclusive 11110 */ 11111 if ((turn_on & PHYI_FAILED) && 11112 ((intf_flags & PHYI_STANDBY) || !ipmp_enable_failback)) { 11113 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 11114 phyi->phyint_flags &= ~PHYI_INACTIVE; 11115 } 11116 if ((turn_off & PHYI_FAILED) && 11117 ((intf_flags & PHYI_STANDBY) || 11118 (!ipmp_enable_failback && ill_is_inactive(ill)))) { 11119 phyint_inactive(phyi); 11120 } 11121 11122 if (turn_on & PHYI_STANDBY) { 11123 /* 11124 * We implicitly set INACTIVE only when STANDBY is set. 11125 * INACTIVE is also set on non-STANDBY phyint when user 11126 * disables FAILBACK using configuration file. 11127 * Do not allow STANDBY to be set on such INACTIVE 11128 * phyint 11129 */ 11130 if (phyi->phyint_flags & PHYI_INACTIVE) 11131 return (EINVAL); 11132 if (!(phyi->phyint_flags & PHYI_FAILED)) 11133 phyint_inactive(phyi); 11134 } 11135 if (turn_off & PHYI_STANDBY) { 11136 if (ipmp_enable_failback) { 11137 /* 11138 * Reset PHYI_INACTIVE. 11139 */ 11140 phyi->phyint_flags &= ~PHYI_INACTIVE; 11141 } else if (ill_is_inactive(ill) && 11142 !(phyi->phyint_flags & PHYI_FAILED)) { 11143 /* 11144 * Need to set INACTIVE, when user sets 11145 * STANDBY on a non-STANDBY phyint and 11146 * later resets STANDBY 11147 */ 11148 phyint_inactive(phyi); 11149 } 11150 } 11151 /* 11152 * We should always send up a message so that the 11153 * daemons come to know of it. Note that the zeroth 11154 * interface can be down and the check below for IPIF_UP 11155 * will not make sense as we are actually setting 11156 * a phyint flag here. We assume that the ipif used 11157 * is always the zeroth ipif. (ip_rts_ifmsg does not 11158 * send up any message for non-zero ipifs). 
11159 */ 11160 phyint_flags_modified = B_TRUE; 11161 11162 if (ill_v4 != NULL) { 11163 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11164 IRE_CACHE, ill_stq_cache_delete, 11165 (char *)ill_v4, ill_v4); 11166 illgrp_reset_schednext(ill_v4); 11167 } 11168 if (ill_v6 != NULL) { 11169 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11170 IRE_CACHE, ill_stq_cache_delete, 11171 (char *)ill_v6, ill_v6); 11172 illgrp_reset_schednext(ill_v6); 11173 } 11174 } 11175 11176 /* 11177 * If ILLF_ROUTER changes, we need to change the ip forwarding 11178 * status of the interface and, if the interface is part of an IPMP 11179 * group, all other interfaces that are part of the same IPMP 11180 * group. 11181 */ 11182 if ((turn_on | turn_off) & ILLF_ROUTER) { 11183 (void) ill_forward_set(q, mp, ((turn_on & ILLF_ROUTER) != 0), 11184 (caddr_t)ill); 11185 } 11186 11187 /* 11188 * If the interface is not UP and we are not going to 11189 * bring it UP, record the flags and return. When the 11190 * interface comes UP later, the right actions will be 11191 * taken. 11192 */ 11193 if (!(ipif->ipif_flags & IPIF_UP) && 11194 !(turn_on & IPIF_UP)) { 11195 /* Record new flags in their respective places. */ 11196 mutex_enter(&ill->ill_lock); 11197 mutex_enter(&ill->ill_phyint->phyint_lock); 11198 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11199 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11200 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11201 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11202 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11203 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11204 mutex_exit(&ill->ill_lock); 11205 mutex_exit(&ill->ill_phyint->phyint_lock); 11206 11207 /* 11208 * We do the broadcast and nomination here rather 11209 * than waiting for a FAILOVER/FAILBACK to happen. In 11210 * the case of FAILBACK from INACTIVE standby to the 11211 * interface that has been repaired, PHYI_FAILED has not 11212 * been cleared yet. 
If there are only two interfaces in 11213 * that group, all we have is a FAILED and INACTIVE 11214 * interface. If we do the nomination soon after a failback, 11215 * the broadcast nomination code would select the 11216 * INACTIVE interface for receiving broadcasts as FAILED is 11217 * not yet cleared. As we don't want STANDBY/INACTIVE to 11218 * receive broadcast packets, we need to redo nomination 11219 * when the FAILED is cleared here. Thus, in general we 11220 * always do the nomination here for FAILED, STANDBY 11221 * and OFFLINE. 11222 */ 11223 if (((turn_on | turn_off) & 11224 (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { 11225 ip_redo_nomination(phyi); 11226 } 11227 if (phyint_flags_modified) { 11228 if (phyi->phyint_illv4 != NULL) { 11229 ip_rts_ifmsg(phyi->phyint_illv4-> 11230 ill_ipif); 11231 } 11232 if (phyi->phyint_illv6 != NULL) { 11233 ip_rts_ifmsg(phyi->phyint_illv6-> 11234 ill_ipif); 11235 } 11236 } 11237 return (0); 11238 } else if (set_linklocal || zero_source) { 11239 mutex_enter(&ill->ill_lock); 11240 if (set_linklocal) 11241 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 11242 if (zero_source) 11243 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 11244 mutex_exit(&ill->ill_lock); 11245 } 11246 11247 /* 11248 * Disallow IPv6 interfaces coming up that have the unspecified address, 11249 * or point-to-point interfaces with an unspecified destination. We do 11250 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 11251 * have a subnet assigned, which is how in.ndpd currently manages its 11252 * onlink prefix list when no addresses are configured with those 11253 * prefixes. 
11254 */ 11255 if (ipif->ipif_isv6 && 11256 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 11257 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 11258 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 11259 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11260 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 11261 return (EINVAL); 11262 } 11263 11264 /* 11265 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 11266 * from being brought up. 11267 */ 11268 if (!ipif->ipif_isv6 && 11269 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11270 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 11271 return (EINVAL); 11272 } 11273 11274 /* 11275 * The only flag changes that we currently take specific action on 11276 * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, 11277 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and 11278 * IPIF_PREFERRED. This is done by bring the ipif down, changing 11279 * the flags and bringing it back up again. 11280 */ 11281 if ((turn_on|turn_off) & 11282 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 11283 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { 11284 /* 11285 * Taking this ipif down, make sure we have 11286 * valid net and subnet bcast ire's for other 11287 * logical interfaces, if we need them. 
11288 */ 11289 if (!ipif->ipif_isv6) 11290 ipif_check_bcast_ires(ipif); 11291 11292 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11293 !(turn_off & IPIF_UP)) { 11294 need_up = B_TRUE; 11295 if (ipif->ipif_flags & IPIF_UP) 11296 ill->ill_logical_down = 1; 11297 turn_on &= ~IPIF_UP; 11298 } 11299 err = ipif_down(ipif, q, mp); 11300 ip1dbg(("ipif_down returns %d err ", err)); 11301 if (err == EINPROGRESS) 11302 return (err); 11303 ipif_down_tail(ipif); 11304 } 11305 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 11306 } 11307 11308 static int 11309 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 11310 boolean_t need_up) 11311 { 11312 ill_t *ill; 11313 phyint_t *phyi; 11314 uint64_t turn_on; 11315 uint64_t turn_off; 11316 uint64_t intf_flags; 11317 boolean_t phyint_flags_modified = B_FALSE; 11318 int err = 0; 11319 boolean_t set_linklocal = B_FALSE; 11320 boolean_t zero_source = B_FALSE; 11321 11322 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 11323 ipif->ipif_ill->ill_name, ipif->ipif_id)); 11324 11325 ASSERT(IAM_WRITER_IPIF(ipif)); 11326 11327 ill = ipif->ipif_ill; 11328 phyi = ill->ill_phyint; 11329 11330 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11331 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 11332 11333 turn_off = intf_flags & turn_on; 11334 turn_on ^= turn_off; 11335 11336 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 11337 phyint_flags_modified = B_TRUE; 11338 11339 /* 11340 * Now we change the flags. Track current value of 11341 * other flags in their respective places. 
11342 */ 11343 mutex_enter(&ill->ill_lock); 11344 mutex_enter(&phyi->phyint_lock); 11345 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11346 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11347 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11348 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11349 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11350 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11351 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 11352 set_linklocal = B_TRUE; 11353 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 11354 } 11355 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 11356 zero_source = B_TRUE; 11357 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 11358 } 11359 mutex_exit(&ill->ill_lock); 11360 mutex_exit(&phyi->phyint_lock); 11361 11362 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 11363 ip_redo_nomination(phyi); 11364 11365 if (set_linklocal) 11366 (void) ipif_setlinklocal(ipif); 11367 11368 if (zero_source) 11369 ipif->ipif_v6src_addr = ipv6_all_zeros; 11370 else 11371 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 11372 11373 if (need_up) { 11374 /* 11375 * XXX ipif_up really does not know whether a phyint flags 11376 * was modified or not. So, it sends up information on 11377 * only one routing sockets message. As we don't bring up 11378 * the interface and also set STANDBY/FAILED simultaneously 11379 * it should be okay. 11380 */ 11381 err = ipif_up(ipif, q, mp); 11382 } else { 11383 /* 11384 * Make sure routing socket sees all changes to the flags. 11385 * ipif_up_done* handles this when we use ipif_up. 
11386 */ 11387 if (phyint_flags_modified) { 11388 if (phyi->phyint_illv4 != NULL) { 11389 ip_rts_ifmsg(phyi->phyint_illv4-> 11390 ill_ipif); 11391 } 11392 if (phyi->phyint_illv6 != NULL) { 11393 ip_rts_ifmsg(phyi->phyint_illv6-> 11394 ill_ipif); 11395 } 11396 } else { 11397 ip_rts_ifmsg(ipif); 11398 } 11399 } 11400 return (err); 11401 } 11402 11403 /* 11404 * Restart entry point to restart the flags restart operation after the 11405 * refcounts have dropped to zero. 11406 */ 11407 /* ARGSUSED */ 11408 int 11409 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11410 ip_ioctl_cmd_t *ipip, void *if_req) 11411 { 11412 int err; 11413 struct ifreq *ifr = (struct ifreq *)if_req; 11414 struct lifreq *lifr = (struct lifreq *)if_req; 11415 11416 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 11417 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11418 11419 ipif_down_tail(ipif); 11420 if (ipip->ipi_cmd_type == IF_CMD) { 11421 /* 11422 * Since ip_sioctl_flags expects an int and ifr_flags 11423 * is a short we need to cast ifr_flags into an int 11424 * to avoid having sign extension cause bits to get 11425 * set that should not be. 11426 */ 11427 err = ip_sioctl_flags_tail(ipif, 11428 (uint64_t)(ifr->ifr_flags & 0x0000ffff), 11429 q, mp, B_TRUE); 11430 } else { 11431 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags, 11432 q, mp, B_TRUE); 11433 } 11434 return (err); 11435 } 11436 11437 /* ARGSUSED */ 11438 int 11439 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11440 ip_ioctl_cmd_t *ipip, void *if_req) 11441 { 11442 /* 11443 * Has the flags been set correctly till now ? 
11444 */ 11445 ill_t *ill = ipif->ipif_ill; 11446 phyint_t *phyi = ill->ill_phyint; 11447 11448 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 11449 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11450 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11451 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11452 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11453 11454 /* 11455 * Need a lock since some flags can be set even when there are 11456 * references to the ipif. 11457 */ 11458 mutex_enter(&ill->ill_lock); 11459 if (ipip->ipi_cmd_type == IF_CMD) { 11460 struct ifreq *ifr = (struct ifreq *)if_req; 11461 11462 /* Get interface flags (low 16 only). */ 11463 ifr->ifr_flags = ((ipif->ipif_flags | 11464 ill->ill_flags | phyi->phyint_flags) & 0xffff); 11465 } else { 11466 struct lifreq *lifr = (struct lifreq *)if_req; 11467 11468 /* Get interface flags. */ 11469 lifr->lifr_flags = ipif->ipif_flags | 11470 ill->ill_flags | phyi->phyint_flags; 11471 } 11472 mutex_exit(&ill->ill_lock); 11473 return (0); 11474 } 11475 11476 /* ARGSUSED */ 11477 int 11478 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11479 ip_ioctl_cmd_t *ipip, void *if_req) 11480 { 11481 int mtu; 11482 int ip_min_mtu; 11483 struct ifreq *ifr; 11484 struct lifreq *lifr; 11485 ire_t *ire; 11486 11487 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 11488 ipif->ipif_id, (void *)ipif)); 11489 if (ipip->ipi_cmd_type == IF_CMD) { 11490 ifr = (struct ifreq *)if_req; 11491 mtu = ifr->ifr_metric; 11492 } else { 11493 lifr = (struct lifreq *)if_req; 11494 mtu = lifr->lifr_mtu; 11495 } 11496 11497 if (ipif->ipif_isv6) 11498 ip_min_mtu = IPV6_MIN_MTU; 11499 else 11500 ip_min_mtu = IP_MIN_MTU; 11501 11502 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 11503 return (EINVAL); 11504 11505 /* 11506 * Change the MTU size in all relevant ire's. 11507 * Mtu change Vs. new ire creation - protocol below. 
11508 * First change ipif_mtu and the ire_max_frag of the 11509 * interface ire. Then do an ire walk and change the 11510 * ire_max_frag of all affected ires. During ire_add 11511 * under the bucket lock, set the ire_max_frag of the 11512 * new ire being created from the ipif/ire from which 11513 * it is being derived. If an mtu change happens after 11514 * the ire is added, the new ire will be cleaned up. 11515 * Conversely if the mtu change happens before the ire 11516 * is added, ire_add will see the new value of the mtu. 11517 */ 11518 ipif->ipif_mtu = mtu; 11519 ipif->ipif_flags |= IPIF_FIXEDMTU; 11520 11521 if (ipif->ipif_isv6) 11522 ire = ipif_to_ire_v6(ipif); 11523 else 11524 ire = ipif_to_ire(ipif); 11525 if (ire != NULL) { 11526 ire->ire_max_frag = ipif->ipif_mtu; 11527 ire_refrele(ire); 11528 } 11529 if (ipif->ipif_flags & IPIF_UP) { 11530 if (ipif->ipif_isv6) 11531 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES); 11532 else 11533 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES); 11534 } 11535 /* Update the MTU in SCTP's list */ 11536 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11537 return (0); 11538 } 11539 11540 /* Get interface MTU. */ 11541 /* ARGSUSED */ 11542 int 11543 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11544 ip_ioctl_cmd_t *ipip, void *if_req) 11545 { 11546 struct ifreq *ifr; 11547 struct lifreq *lifr; 11548 11549 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 11550 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11551 if (ipip->ipi_cmd_type == IF_CMD) { 11552 ifr = (struct ifreq *)if_req; 11553 ifr->ifr_metric = ipif->ipif_mtu; 11554 } else { 11555 lifr = (struct lifreq *)if_req; 11556 lifr->lifr_mtu = ipif->ipif_mtu; 11557 } 11558 return (0); 11559 } 11560 11561 /* Set interface broadcast address. 
*/
/*
 * IPv4 only.  Returns EADDRNOTAVAIL when the ipif does not have
 * IPIF_BROADCAST set, EAFNOSUPPORT when the supplied address is not AF_INET,
 * and EINVAL when the ipif is up and no broadcast IRE for the address exists
 * on the ill.  Returns 0 once ipif_v6brd_addr has been updated.
 */
/* ARGSUSED2 */
int
ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ipaddr_t	new_brd;
	ire_t		*bcast_ire;

	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));
	if (!(ipif->ipif_flags & IPIF_BROADCAST))
		return (EADDRNOTAVAIL);

	ASSERT(!(ipif->ipif_isv6));	/* No IPv6 broadcast */

	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);

	new_brd = sin->sin_addr.s_addr;
	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If we are already up, make sure the new
		 * broadcast address makes sense.  If it does,
		 * there should be an IRE for it already.
		 * Don't match on ipif, only on the ill
		 * since we are sharing these now. Don't use
		 * MATCH_IRE_ILL_GROUP as we are looking for
		 * the broadcast ire on this ill and each ill
		 * in the group has its own broadcast ire.
		 */
		bcast_ire = ire_ctable_lookup(new_brd, 0, IRE_BROADCAST,
		    ipif, ALL_ZONES, (MATCH_IRE_ILL | MATCH_IRE_TYPE));
		if (bcast_ire == NULL)
			return (EINVAL);
		/* Only existence matters; drop the lookup's hold. */
		ire_refrele(bcast_ire);
	}

	/*
	 * Changing the broadcast addr for this ipif.
	 * Make sure we have valid net and subnet bcast
	 * ire's for other logical interfaces, if needed.
	 */
	if (new_brd != ipif->ipif_brd_addr)
		ipif_check_bcast_ires(ipif);
	IN6_IPADDR_TO_V4MAPPED(new_brd, &ipif->ipif_v6brd_addr);
	return (0);
}

/* Get interface broadcast address.
*/ 11614 /* ARGSUSED */ 11615 int 11616 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11617 ip_ioctl_cmd_t *ipip, void *if_req) 11618 { 11619 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 11620 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11621 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 11622 return (EADDRNOTAVAIL); 11623 11624 /* IPIF_BROADCAST not possible with IPv6 */ 11625 ASSERT(!ipif->ipif_isv6); 11626 *sin = sin_null; 11627 sin->sin_family = AF_INET; 11628 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 11629 return (0); 11630 } 11631 11632 /* 11633 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 11634 */ 11635 /* ARGSUSED */ 11636 int 11637 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11638 ip_ioctl_cmd_t *ipip, void *if_req) 11639 { 11640 int err = 0; 11641 in6_addr_t v6mask; 11642 11643 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 11644 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11645 11646 ASSERT(IAM_WRITER_IPIF(ipif)); 11647 11648 if (ipif->ipif_isv6) { 11649 sin6_t *sin6; 11650 11651 if (sin->sin_family != AF_INET6) 11652 return (EAFNOSUPPORT); 11653 11654 sin6 = (sin6_t *)sin; 11655 v6mask = sin6->sin6_addr; 11656 } else { 11657 ipaddr_t mask; 11658 11659 if (sin->sin_family != AF_INET) 11660 return (EAFNOSUPPORT); 11661 11662 mask = sin->sin_addr.s_addr; 11663 V4MASK_TO_V6(mask, v6mask); 11664 } 11665 11666 /* 11667 * No big deal if the interface isn't already up, or the mask 11668 * isn't really changing, or this is pt-pt. 
11669 */ 11670 if (!(ipif->ipif_flags & IPIF_UP) || 11671 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 11672 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 11673 ipif->ipif_v6net_mask = v6mask; 11674 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11675 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 11676 ipif->ipif_v6net_mask, 11677 ipif->ipif_v6subnet); 11678 } 11679 return (0); 11680 } 11681 /* 11682 * Make sure we have valid net and subnet broadcast ire's 11683 * for the old netmask, if needed by other logical interfaces. 11684 */ 11685 if (!ipif->ipif_isv6) 11686 ipif_check_bcast_ires(ipif); 11687 11688 err = ipif_logical_down(ipif, q, mp); 11689 if (err == EINPROGRESS) 11690 return (err); 11691 ipif_down_tail(ipif); 11692 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 11693 return (err); 11694 } 11695 11696 static int 11697 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 11698 { 11699 in6_addr_t v6mask; 11700 int err = 0; 11701 11702 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 11703 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11704 11705 if (ipif->ipif_isv6) { 11706 sin6_t *sin6; 11707 11708 sin6 = (sin6_t *)sin; 11709 v6mask = sin6->sin6_addr; 11710 } else { 11711 ipaddr_t mask; 11712 11713 mask = sin->sin_addr.s_addr; 11714 V4MASK_TO_V6(mask, v6mask); 11715 } 11716 11717 ipif->ipif_v6net_mask = v6mask; 11718 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11719 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 11720 ipif->ipif_v6subnet); 11721 } 11722 err = ipif_up(ipif, q, mp); 11723 11724 if (err == 0 || err == EINPROGRESS) { 11725 /* 11726 * The interface must be DL_BOUND if this packet has to 11727 * go out on the wire. Since we only go through a logical 11728 * down and are bound with the driver during an internal 11729 * down/up that is satisfied. 11730 */ 11731 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 11732 /* Potentially broadcast an address mask reply. 
*/ 11733 ipif_mask_reply(ipif); 11734 } 11735 } 11736 return (err); 11737 } 11738 11739 /* ARGSUSED */ 11740 int 11741 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11742 ip_ioctl_cmd_t *ipip, void *if_req) 11743 { 11744 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 11745 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11746 ipif_down_tail(ipif); 11747 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 11748 } 11749 11750 /* Get interface net mask. */ 11751 /* ARGSUSED */ 11752 int 11753 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11754 ip_ioctl_cmd_t *ipip, void *if_req) 11755 { 11756 struct lifreq *lifr = (struct lifreq *)if_req; 11757 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 11758 11759 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 11760 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11761 11762 /* 11763 * net mask can't change since we have a reference to the ipif. 11764 */ 11765 if (ipif->ipif_isv6) { 11766 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11767 *sin6 = sin6_null; 11768 sin6->sin6_family = AF_INET6; 11769 sin6->sin6_addr = ipif->ipif_v6net_mask; 11770 lifr->lifr_addrlen = 11771 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11772 } else { 11773 *sin = sin_null; 11774 sin->sin_family = AF_INET; 11775 sin->sin_addr.s_addr = ipif->ipif_net_mask; 11776 if (ipip->ipi_cmd_type == LIF_CMD) { 11777 lifr->lifr_addrlen = 11778 ip_mask_to_plen(ipif->ipif_net_mask); 11779 } 11780 } 11781 return (0); 11782 } 11783 11784 /* ARGSUSED */ 11785 int 11786 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11787 ip_ioctl_cmd_t *ipip, void *if_req) 11788 { 11789 11790 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 11791 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11792 /* 11793 * Set interface metric. We don't use this for 11794 * anything but we keep track of it in case it is 11795 * important to routing applications or such. 
11796 */ 11797 if (ipip->ipi_cmd_type == IF_CMD) { 11798 struct ifreq *ifr; 11799 11800 ifr = (struct ifreq *)if_req; 11801 ipif->ipif_metric = ifr->ifr_metric; 11802 } else { 11803 struct lifreq *lifr; 11804 11805 lifr = (struct lifreq *)if_req; 11806 ipif->ipif_metric = lifr->lifr_metric; 11807 } 11808 return (0); 11809 } 11810 11811 11812 /* ARGSUSED */ 11813 int 11814 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11815 ip_ioctl_cmd_t *ipip, void *if_req) 11816 { 11817 11818 /* Get interface metric. */ 11819 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 11820 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11821 if (ipip->ipi_cmd_type == IF_CMD) { 11822 struct ifreq *ifr; 11823 11824 ifr = (struct ifreq *)if_req; 11825 ifr->ifr_metric = ipif->ipif_metric; 11826 } else { 11827 struct lifreq *lifr; 11828 11829 lifr = (struct lifreq *)if_req; 11830 lifr->lifr_metric = ipif->ipif_metric; 11831 } 11832 11833 return (0); 11834 } 11835 11836 /* ARGSUSED */ 11837 int 11838 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11839 ip_ioctl_cmd_t *ipip, void *if_req) 11840 { 11841 11842 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 11843 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11844 /* 11845 * Set the muxid returned from I_PLINK. 
11846 */ 11847 if (ipip->ipi_cmd_type == IF_CMD) { 11848 struct ifreq *ifr = (struct ifreq *)if_req; 11849 11850 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 11851 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 11852 } else { 11853 struct lifreq *lifr = (struct lifreq *)if_req; 11854 11855 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 11856 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 11857 } 11858 return (0); 11859 } 11860 11861 /* ARGSUSED */ 11862 int 11863 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11864 ip_ioctl_cmd_t *ipip, void *if_req) 11865 { 11866 11867 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 11868 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11869 /* 11870 * Get the muxid saved in ill for I_PUNLINK. 11871 */ 11872 if (ipip->ipi_cmd_type == IF_CMD) { 11873 struct ifreq *ifr = (struct ifreq *)if_req; 11874 11875 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 11876 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 11877 } else { 11878 struct lifreq *lifr = (struct lifreq *)if_req; 11879 11880 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 11881 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 11882 } 11883 return (0); 11884 } 11885 11886 /* 11887 * Set the subnet prefix. Does not modify the broadcast address. 
11888 */ 11889 /* ARGSUSED */ 11890 int 11891 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11892 ip_ioctl_cmd_t *ipip, void *if_req) 11893 { 11894 int err = 0; 11895 in6_addr_t v6addr; 11896 in6_addr_t v6mask; 11897 boolean_t need_up = B_FALSE; 11898 int addrlen; 11899 11900 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 11901 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11902 11903 ASSERT(IAM_WRITER_IPIF(ipif)); 11904 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 11905 11906 if (ipif->ipif_isv6) { 11907 sin6_t *sin6; 11908 11909 if (sin->sin_family != AF_INET6) 11910 return (EAFNOSUPPORT); 11911 11912 sin6 = (sin6_t *)sin; 11913 v6addr = sin6->sin6_addr; 11914 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 11915 return (EADDRNOTAVAIL); 11916 } else { 11917 ipaddr_t addr; 11918 11919 if (sin->sin_family != AF_INET) 11920 return (EAFNOSUPPORT); 11921 11922 addr = sin->sin_addr.s_addr; 11923 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 11924 return (EADDRNOTAVAIL); 11925 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11926 /* Add 96 bits */ 11927 addrlen += IPV6_ABITS - IP_ABITS; 11928 } 11929 11930 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 11931 return (EINVAL); 11932 11933 /* Check if bits in the address is set past the mask */ 11934 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 11935 return (EINVAL); 11936 11937 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 11938 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 11939 return (0); /* No change */ 11940 11941 if (ipif->ipif_flags & IPIF_UP) { 11942 /* 11943 * If the interface is already marked up, 11944 * we call ipif_down which will take care 11945 * of ditching any IREs that have been set 11946 * up based on the old interface address. 
11947 */ 11948 err = ipif_logical_down(ipif, q, mp); 11949 if (err == EINPROGRESS) 11950 return (err); 11951 ipif_down_tail(ipif); 11952 need_up = B_TRUE; 11953 } 11954 11955 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 11956 return (err); 11957 } 11958 11959 static int 11960 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 11961 queue_t *q, mblk_t *mp, boolean_t need_up) 11962 { 11963 ill_t *ill = ipif->ipif_ill; 11964 int err = 0; 11965 11966 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 11967 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11968 11969 /* Set the new address. */ 11970 mutex_enter(&ill->ill_lock); 11971 ipif->ipif_v6net_mask = v6mask; 11972 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11973 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 11974 ipif->ipif_v6subnet); 11975 } 11976 mutex_exit(&ill->ill_lock); 11977 11978 if (need_up) { 11979 /* 11980 * Now bring the interface back up. If this 11981 * is the only IPIF for the ILL, ipif_up 11982 * will have to re-bind to the device, so 11983 * we may get back EINPROGRESS, in which 11984 * case, this IOCTL will get completed in 11985 * ip_rput_dlpi when we see the DL_BIND_ACK. 
11986 */ 11987 err = ipif_up(ipif, q, mp); 11988 if (err == EINPROGRESS) 11989 return (err); 11990 } 11991 return (err); 11992 } 11993 11994 /* ARGSUSED */ 11995 int 11996 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11997 ip_ioctl_cmd_t *ipip, void *if_req) 11998 { 11999 int addrlen; 12000 in6_addr_t v6addr; 12001 in6_addr_t v6mask; 12002 struct lifreq *lifr = (struct lifreq *)if_req; 12003 12004 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12005 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12006 ipif_down_tail(ipif); 12007 12008 addrlen = lifr->lifr_addrlen; 12009 if (ipif->ipif_isv6) { 12010 sin6_t *sin6; 12011 12012 sin6 = (sin6_t *)sin; 12013 v6addr = sin6->sin6_addr; 12014 } else { 12015 ipaddr_t addr; 12016 12017 addr = sin->sin_addr.s_addr; 12018 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12019 addrlen += IPV6_ABITS - IP_ABITS; 12020 } 12021 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12022 12023 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12024 } 12025 12026 /* ARGSUSED */ 12027 int 12028 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12029 ip_ioctl_cmd_t *ipip, void *if_req) 12030 { 12031 struct lifreq *lifr = (struct lifreq *)if_req; 12032 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12033 12034 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12035 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12036 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12037 12038 if (ipif->ipif_isv6) { 12039 *sin6 = sin6_null; 12040 sin6->sin6_family = AF_INET6; 12041 sin6->sin6_addr = ipif->ipif_v6subnet; 12042 lifr->lifr_addrlen = 12043 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12044 } else { 12045 *sin = sin_null; 12046 sin->sin_family = AF_INET; 12047 sin->sin_addr.s_addr = ipif->ipif_subnet; 12048 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12049 } 12050 return (0); 12051 } 12052 12053 /* 12054 * Set the IPv6 address token. 
12055 */ 12056 /* ARGSUSED */ 12057 int 12058 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12059 ip_ioctl_cmd_t *ipi, void *if_req) 12060 { 12061 ill_t *ill = ipif->ipif_ill; 12062 int err; 12063 in6_addr_t v6addr; 12064 in6_addr_t v6mask; 12065 boolean_t need_up = B_FALSE; 12066 int i; 12067 sin6_t *sin6 = (sin6_t *)sin; 12068 struct lifreq *lifr = (struct lifreq *)if_req; 12069 int addrlen; 12070 12071 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12072 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12073 ASSERT(IAM_WRITER_IPIF(ipif)); 12074 12075 addrlen = lifr->lifr_addrlen; 12076 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12077 if (ipif->ipif_id != 0) 12078 return (EINVAL); 12079 12080 if (!ipif->ipif_isv6) 12081 return (EINVAL); 12082 12083 if (addrlen > IPV6_ABITS) 12084 return (EINVAL); 12085 12086 v6addr = sin6->sin6_addr; 12087 12088 /* 12089 * The length of the token is the length from the end. To get 12090 * the proper mask for this, compute the mask of the bits not 12091 * in the token; ie. the prefix, and then xor to get the mask. 
12092 */ 12093 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12094 return (EINVAL); 12095 for (i = 0; i < 4; i++) { 12096 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12097 } 12098 12099 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12100 ill->ill_token_length == addrlen) 12101 return (0); /* No change */ 12102 12103 if (ipif->ipif_flags & IPIF_UP) { 12104 err = ipif_logical_down(ipif, q, mp); 12105 if (err == EINPROGRESS) 12106 return (err); 12107 ipif_down_tail(ipif); 12108 need_up = B_TRUE; 12109 } 12110 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12111 return (err); 12112 } 12113 12114 static int 12115 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12116 mblk_t *mp, boolean_t need_up) 12117 { 12118 in6_addr_t v6addr; 12119 in6_addr_t v6mask; 12120 ill_t *ill = ipif->ipif_ill; 12121 int i; 12122 int err = 0; 12123 12124 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12125 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12126 v6addr = sin6->sin6_addr; 12127 /* 12128 * The length of the token is the length from the end. To get 12129 * the proper mask for this, compute the mask of the bits not 12130 * in the token; ie. the prefix, and then xor to get the mask. 12131 */ 12132 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12133 for (i = 0; i < 4; i++) 12134 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12135 12136 mutex_enter(&ill->ill_lock); 12137 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12138 ill->ill_token_length = addrlen; 12139 mutex_exit(&ill->ill_lock); 12140 12141 if (need_up) { 12142 /* 12143 * Now bring the interface back up. If this 12144 * is the only IPIF for the ILL, ipif_up 12145 * will have to re-bind to the device, so 12146 * we may get back EINPROGRESS, in which 12147 * case, this IOCTL will get completed in 12148 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12149 */ 12150 err = ipif_up(ipif, q, mp); 12151 if (err == EINPROGRESS) 12152 return (err); 12153 } 12154 return (err); 12155 } 12156 12157 /* ARGSUSED */ 12158 int 12159 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12160 ip_ioctl_cmd_t *ipi, void *if_req) 12161 { 12162 ill_t *ill; 12163 sin6_t *sin6 = (sin6_t *)sin; 12164 struct lifreq *lifr = (struct lifreq *)if_req; 12165 12166 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 12167 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12168 if (ipif->ipif_id != 0) 12169 return (EINVAL); 12170 12171 ill = ipif->ipif_ill; 12172 if (!ill->ill_isv6) 12173 return (ENXIO); 12174 12175 *sin6 = sin6_null; 12176 sin6->sin6_family = AF_INET6; 12177 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 12178 sin6->sin6_addr = ill->ill_token; 12179 lifr->lifr_addrlen = ill->ill_token_length; 12180 return (0); 12181 } 12182 12183 /* 12184 * Set (hardware) link specific information that might override 12185 * what was acquired through the DL_INFO_ACK. 12186 * The logic is as follows. 12187 * 12188 * become exclusive 12189 * set CHANGING flag 12190 * change mtu on affected IREs 12191 * clear CHANGING flag 12192 * 12193 * An ire add that occurs before the CHANGING flag is set will have its mtu 12194 * changed by the ip_sioctl_lnkinfo. 12195 * 12196 * During the time the CHANGING flag is set, no new ires will be added to the 12197 * bucket, and ire add will fail (due the CHANGING flag). 12198 * 12199 * An ire add that occurs after the CHANGING flag is set will have the right mtu 12200 * before it is added to the bucket. 12201 * 12202 * Obviously only 1 thread can set the CHANGING flag and we need to become 12203 * exclusive to set the flag. 
12204 */ 12205 /* ARGSUSED */ 12206 int 12207 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12208 ip_ioctl_cmd_t *ipi, void *if_req) 12209 { 12210 ill_t *ill = ipif->ipif_ill; 12211 ipif_t *nipif; 12212 int ip_min_mtu; 12213 boolean_t mtu_walk = B_FALSE; 12214 struct lifreq *lifr = (struct lifreq *)if_req; 12215 lif_ifinfo_req_t *lir; 12216 ire_t *ire; 12217 12218 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 12219 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12220 lir = &lifr->lifr_ifinfo; 12221 ASSERT(IAM_WRITER_IPIF(ipif)); 12222 12223 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12224 if (ipif->ipif_id != 0) 12225 return (EINVAL); 12226 12227 /* Set interface MTU. */ 12228 if (ipif->ipif_isv6) 12229 ip_min_mtu = IPV6_MIN_MTU; 12230 else 12231 ip_min_mtu = IP_MIN_MTU; 12232 12233 /* 12234 * Verify values before we set anything. Allow zero to 12235 * mean unspecified. 12236 */ 12237 if (lir->lir_maxmtu != 0 && 12238 (lir->lir_maxmtu > ill->ill_max_frag || 12239 lir->lir_maxmtu < ip_min_mtu)) 12240 return (EINVAL); 12241 if (lir->lir_reachtime != 0 && 12242 lir->lir_reachtime > ND_MAX_REACHTIME) 12243 return (EINVAL); 12244 if (lir->lir_reachretrans != 0 && 12245 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12246 return (EINVAL); 12247 12248 mutex_enter(&ill->ill_lock); 12249 ill->ill_state_flags |= ILL_CHANGING; 12250 for (nipif = ill->ill_ipif; nipif != NULL; 12251 nipif = nipif->ipif_next) { 12252 nipif->ipif_state_flags |= IPIF_CHANGING; 12253 } 12254 12255 mutex_exit(&ill->ill_lock); 12256 12257 if (lir->lir_maxmtu != 0) { 12258 ill->ill_max_mtu = lir->lir_maxmtu; 12259 ill->ill_mtu_userspecified = 1; 12260 mtu_walk = B_TRUE; 12261 } 12262 12263 if (lir->lir_reachtime != 0) 12264 ill->ill_reachable_time = lir->lir_reachtime; 12265 12266 if (lir->lir_reachretrans != 0) 12267 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12268 12269 ill->ill_max_hops = lir->lir_maxhops; 12270 12271 ill->ill_max_buf = 
ND_MAX_Q; 12272 12273 if (mtu_walk) { 12274 /* 12275 * Set the MTU on all ipifs associated with this ill except 12276 * for those whose MTU was fixed via SIOCSLIFMTU. 12277 */ 12278 for (nipif = ill->ill_ipif; nipif != NULL; 12279 nipif = nipif->ipif_next) { 12280 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12281 continue; 12282 12283 nipif->ipif_mtu = ill->ill_max_mtu; 12284 12285 if (!(nipif->ipif_flags & IPIF_UP)) 12286 continue; 12287 12288 if (nipif->ipif_isv6) 12289 ire = ipif_to_ire_v6(nipif); 12290 else 12291 ire = ipif_to_ire(nipif); 12292 if (ire != NULL) { 12293 ire->ire_max_frag = ipif->ipif_mtu; 12294 ire_refrele(ire); 12295 } 12296 if (ill->ill_isv6) { 12297 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 12298 ipif_mtu_change, (char *)nipif, 12299 ill); 12300 } else { 12301 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 12302 ipif_mtu_change, (char *)nipif, 12303 ill); 12304 } 12305 } 12306 } 12307 12308 mutex_enter(&ill->ill_lock); 12309 for (nipif = ill->ill_ipif; nipif != NULL; 12310 nipif = nipif->ipif_next) { 12311 nipif->ipif_state_flags &= ~IPIF_CHANGING; 12312 } 12313 ILL_UNMARK_CHANGING(ill); 12314 mutex_exit(&ill->ill_lock); 12315 12316 return (0); 12317 } 12318 12319 /* ARGSUSED */ 12320 int 12321 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12322 ip_ioctl_cmd_t *ipi, void *if_req) 12323 { 12324 struct lif_ifinfo_req *lir; 12325 ill_t *ill = ipif->ipif_ill; 12326 12327 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 12328 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12329 if (ipif->ipif_id != 0) 12330 return (EINVAL); 12331 12332 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 12333 lir->lir_maxhops = ill->ill_max_hops; 12334 lir->lir_reachtime = ill->ill_reachable_time; 12335 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 12336 lir->lir_maxmtu = ill->ill_max_mtu; 12337 12338 return (0); 12339 } 12340 12341 /* 12342 * Return best guess as to the subnet mask for the specified address. 
12343 * Based on the subnet masks for all the configured interfaces. 12344 * 12345 * We end up returning a zero mask in the case of default, multicast or 12346 * experimental. 12347 */ 12348 static ipaddr_t 12349 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp) 12350 { 12351 ipaddr_t net_mask; 12352 ill_t *ill; 12353 ipif_t *ipif; 12354 ill_walk_context_t ctx; 12355 ipif_t *fallback_ipif = NULL; 12356 12357 net_mask = ip_net_mask(addr); 12358 if (net_mask == 0) { 12359 *ipifp = NULL; 12360 return (0); 12361 } 12362 12363 /* Let's check to see if this is maybe a local subnet route. */ 12364 /* this function only applies to IPv4 interfaces */ 12365 rw_enter(&ill_g_lock, RW_READER); 12366 ill = ILL_START_WALK_V4(&ctx); 12367 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 12368 mutex_enter(&ill->ill_lock); 12369 for (ipif = ill->ill_ipif; ipif != NULL; 12370 ipif = ipif->ipif_next) { 12371 if (!IPIF_CAN_LOOKUP(ipif)) 12372 continue; 12373 if (!(ipif->ipif_flags & IPIF_UP)) 12374 continue; 12375 if ((ipif->ipif_subnet & net_mask) == 12376 (addr & net_mask)) { 12377 /* 12378 * Don't trust pt-pt interfaces if there are 12379 * other interfaces. 12380 */ 12381 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 12382 if (fallback_ipif == NULL) { 12383 ipif_refhold_locked(ipif); 12384 fallback_ipif = ipif; 12385 } 12386 continue; 12387 } 12388 12389 /* 12390 * Fine. Just assume the same net mask as the 12391 * directly attached subnet interface is using. 12392 */ 12393 ipif_refhold_locked(ipif); 12394 mutex_exit(&ill->ill_lock); 12395 rw_exit(&ill_g_lock); 12396 if (fallback_ipif != NULL) 12397 ipif_refrele(fallback_ipif); 12398 *ipifp = ipif; 12399 return (ipif->ipif_net_mask); 12400 } 12401 } 12402 mutex_exit(&ill->ill_lock); 12403 } 12404 rw_exit(&ill_g_lock); 12405 12406 *ipifp = fallback_ipif; 12407 return ((fallback_ipif != NULL) ? 
12408 fallback_ipif->ipif_net_mask : net_mask); 12409 } 12410 12411 /* 12412 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 12413 */ 12414 static void 12415 ip_wput_ioctl(queue_t *q, mblk_t *mp) 12416 { 12417 IOCP iocp; 12418 ipft_t *ipft; 12419 ipllc_t *ipllc; 12420 mblk_t *mp1; 12421 cred_t *cr; 12422 int error = 0; 12423 conn_t *connp; 12424 12425 ip1dbg(("ip_wput_ioctl")); 12426 iocp = (IOCP)mp->b_rptr; 12427 mp1 = mp->b_cont; 12428 if (mp1 == NULL) { 12429 iocp->ioc_error = EINVAL; 12430 mp->b_datap->db_type = M_IOCNAK; 12431 iocp->ioc_count = 0; 12432 qreply(q, mp); 12433 return; 12434 } 12435 12436 /* 12437 * These IOCTLs provide various control capabilities to 12438 * upstream agents such as ULPs and processes. There 12439 * are currently two such IOCTLs implemented. They 12440 * are used by TCP to provide update information for 12441 * existing IREs and to forcibly delete an IRE for a 12442 * host that is not responding, thereby forcing an 12443 * attempt at a new route. 12444 */ 12445 iocp->ioc_error = EINVAL; 12446 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 12447 goto done; 12448 12449 ipllc = (ipllc_t *)mp1->b_rptr; 12450 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 12451 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 12452 break; 12453 } 12454 /* 12455 * prefer credential from mblk over ioctl; 12456 * see ip_sioctl_copyin_setup 12457 */ 12458 cr = DB_CREDDEF(mp, iocp->ioc_cr); 12459 12460 /* 12461 * Refhold the conn in case the request gets queued up in some lookup 12462 */ 12463 ASSERT(CONN_Q(q)); 12464 connp = Q_TO_CONN(q); 12465 CONN_INC_REF(connp); 12466 if (ipft->ipft_pfi && 12467 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 12468 pullupmsg(mp1, ipft->ipft_min_size))) { 12469 error = (*ipft->ipft_pfi)(q, 12470 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? 
mp : mp1, cr); 12471 } 12472 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 12473 /* 12474 * CONN_OPER_PENDING_DONE happens in the function called 12475 * through ipft_pfi above. 12476 */ 12477 return; 12478 } 12479 12480 CONN_OPER_PENDING_DONE(connp); 12481 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 12482 freemsg(mp); 12483 return; 12484 } 12485 iocp->ioc_error = error; 12486 12487 done: 12488 mp->b_datap->db_type = M_IOCACK; 12489 if (iocp->ioc_error) 12490 iocp->ioc_count = 0; 12491 qreply(q, mp); 12492 } 12493 12494 /* 12495 * Lookup an ipif using the sequence id (ipif_seqid) 12496 */ 12497 ipif_t * 12498 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 12499 { 12500 ipif_t *ipif; 12501 12502 ASSERT(MUTEX_HELD(&ill->ill_lock)); 12503 12504 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12505 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 12506 return (ipif); 12507 } 12508 return (NULL); 12509 } 12510 12511 uint64_t ipif_g_seqid; 12512 12513 /* 12514 * Assign a unique id for the ipif. This is used later when we send 12515 * IRES to ARP for resolution where we initialize ire_ipif_seqid 12516 * to the value pointed by ire_ipif->ipif_seqid. Later when the 12517 * IRE is added, we verify that ipif has not disappeared. 12518 */ 12519 12520 static void 12521 ipif_assign_seqid(ipif_t *ipif) 12522 { 12523 ipif->ipif_seqid = atomic_add_64_nv(&ipif_g_seqid, 1); 12524 } 12525 12526 /* 12527 * Insert the ipif, so that the list of ipifs on the ill will be sorted 12528 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 12529 * be inserted into the first space available in the list. The value of 12530 * ipif_id will then be set to the appropriate value for its position. 
12531 */ 12532 static int 12533 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 12534 { 12535 ill_t *ill; 12536 ipif_t *tipif; 12537 ipif_t **tipifp; 12538 int id; 12539 12540 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 12541 IAM_WRITER_IPIF(ipif)); 12542 12543 ill = ipif->ipif_ill; 12544 ASSERT(ill != NULL); 12545 12546 /* 12547 * In the case of lo0:0 we already hold the ill_g_lock. 12548 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 12549 * ipif_insert. Another such caller is ipif_move. 12550 */ 12551 if (acquire_g_lock) 12552 rw_enter(&ill_g_lock, RW_WRITER); 12553 if (acquire_ill_lock) 12554 mutex_enter(&ill->ill_lock); 12555 id = ipif->ipif_id; 12556 tipifp = &(ill->ill_ipif); 12557 if (id == -1) { /* need to find a real id */ 12558 id = 0; 12559 while ((tipif = *tipifp) != NULL) { 12560 ASSERT(tipif->ipif_id >= id); 12561 if (tipif->ipif_id != id) 12562 break; /* non-consecutive id */ 12563 id++; 12564 tipifp = &(tipif->ipif_next); 12565 } 12566 /* limit number of logical interfaces */ 12567 if (id >= ip_addrs_per_if) { 12568 if (acquire_ill_lock) 12569 mutex_exit(&ill->ill_lock); 12570 if (acquire_g_lock) 12571 rw_exit(&ill_g_lock); 12572 return (-1); 12573 } 12574 ipif->ipif_id = id; /* assign new id */ 12575 } else if (id < ip_addrs_per_if) { 12576 /* we have a real id; insert ipif in the right place */ 12577 while ((tipif = *tipifp) != NULL) { 12578 ASSERT(tipif->ipif_id != id); 12579 if (tipif->ipif_id > id) 12580 break; /* found correct location */ 12581 tipifp = &(tipif->ipif_next); 12582 } 12583 } else { 12584 if (acquire_ill_lock) 12585 mutex_exit(&ill->ill_lock); 12586 if (acquire_g_lock) 12587 rw_exit(&ill_g_lock); 12588 return (-1); 12589 } 12590 12591 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 12592 12593 ipif->ipif_next = tipif; 12594 *tipifp = ipif; 12595 if (acquire_ill_lock) 12596 mutex_exit(&ill->ill_lock); 12597 if (acquire_g_lock) 12598 rw_exit(&ill_g_lock); 12599 return (0); 
12600 } 12601 12602 /* 12603 * Allocate and initialize a new interface control structure. (Always 12604 * called as writer.) 12605 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 12606 * is not part of the global linked list of ills. ipif_seqid is unique 12607 * in the system and to preserve the uniqueness, it is assigned only 12608 * when ill becomes part of the global list. At that point ill will 12609 * have a name. If it doesn't get assigned here, it will get assigned 12610 * in ipif_set_values() as part of SIOCSLIFNAME processing. 12611 * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set 12612 * the interface flags or any other information from the DL_INFO_ACK for 12613 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 12614 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 12615 * second DL_INFO_ACK comes in from the driver. 12616 */ 12617 static ipif_t * 12618 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) 12619 { 12620 ipif_t *ipif; 12621 phyint_t *phyi; 12622 12623 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 12624 ill->ill_name, id, (void *)ill)); 12625 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 12626 12627 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 12628 return (NULL); 12629 *ipif = ipif_zero; /* start clean */ 12630 12631 ipif->ipif_ill = ill; 12632 ipif->ipif_id = id; /* could be -1 */ 12633 ipif->ipif_zoneid = GLOBAL_ZONEID; 12634 12635 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 12636 12637 ipif->ipif_refcnt = 0; 12638 ipif->ipif_saved_ire_cnt = 0; 12639 12640 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { 12641 mi_free(ipif); 12642 return (NULL); 12643 } 12644 /* -1 id should have been replaced by real id */ 12645 id = ipif->ipif_id; 12646 ASSERT(id >= 0); 12647 12648 if (ill->ill_name[0] != '\0') { 12649 ipif_assign_seqid(ipif); 12650 if 
(ill->ill_phyint->phyint_ifindex != 0) 12651 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 12652 } 12653 /* 12654 * Keep a copy of original id in ipif_orig_ipifid. Failback 12655 * will attempt to restore the original id. The SIOCSLIFOINDEX 12656 * ioctl sets ipif_orig_ipifid to zero. 12657 */ 12658 ipif->ipif_orig_ipifid = id; 12659 12660 /* 12661 * We grab the ill_lock and phyint_lock to protect the flag changes. 12662 * The ipif is still not up and can't be looked up until the 12663 * ioctl completes and the IPIF_CHANGING flag is cleared. 12664 */ 12665 mutex_enter(&ill->ill_lock); 12666 mutex_enter(&ill->ill_phyint->phyint_lock); 12667 /* 12668 * Set the running flag when logical interface zero is created. 12669 * For subsequent logical interfaces, a DLPI link down 12670 * notification message may have cleared the running flag to 12671 * indicate the link is down, so we shouldn't just blindly set it. 12672 */ 12673 if (id == 0) 12674 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 12675 ipif->ipif_ire_type = ire_type; 12676 phyi = ill->ill_phyint; 12677 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 12678 12679 if (ipif->ipif_isv6) { 12680 ill->ill_flags |= ILLF_IPV6; 12681 } else { 12682 ipaddr_t inaddr_any = INADDR_ANY; 12683 12684 ill->ill_flags |= ILLF_IPV4; 12685 12686 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 12687 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12688 &ipif->ipif_v6lcl_addr); 12689 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12690 &ipif->ipif_v6src_addr); 12691 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12692 &ipif->ipif_v6subnet); 12693 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12694 &ipif->ipif_v6net_mask); 12695 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12696 &ipif->ipif_v6brd_addr); 12697 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 12698 &ipif->ipif_v6pp_dst_addr); 12699 } 12700 12701 /* 12702 * Don't set the interface flags etc. now, will do it in 12703 * ip_ll_subnet_defaults. 
12704 */ 12705 if (!initialize) { 12706 mutex_exit(&ill->ill_lock); 12707 mutex_exit(&ill->ill_phyint->phyint_lock); 12708 return (ipif); 12709 } 12710 ipif->ipif_mtu = ill->ill_max_mtu; 12711 12712 if (ill->ill_bcast_addr_length != 0) { 12713 /* 12714 * Later detect lack of DLPI driver multicast 12715 * capability by catching DL_ENABMULTI errors in 12716 * ip_rput_dlpi. 12717 */ 12718 ill->ill_flags |= ILLF_MULTICAST; 12719 if (!ipif->ipif_isv6) 12720 ipif->ipif_flags |= IPIF_BROADCAST; 12721 } else { 12722 if (ill->ill_net_type != IRE_LOOPBACK) { 12723 if (ipif->ipif_isv6) 12724 /* 12725 * Note: xresolv interfaces will eventually need 12726 * NOARP set here as well, but that will require 12727 * those external resolvers to have some 12728 * knowledge of that flag and act appropriately. 12729 * Not to be changed at present. 12730 */ 12731 ill->ill_flags |= ILLF_NONUD; 12732 else 12733 ill->ill_flags |= ILLF_NOARP; 12734 } 12735 if (ill->ill_phys_addr_length == 0) { 12736 if (ill->ill_media && 12737 ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 12738 ipif->ipif_flags |= IPIF_NOXMIT; 12739 phyi->phyint_flags |= PHYI_VIRTUAL; 12740 } else { 12741 /* pt-pt supports multicast. */ 12742 ill->ill_flags |= ILLF_MULTICAST; 12743 if (ill->ill_net_type == IRE_LOOPBACK) { 12744 phyi->phyint_flags |= 12745 (PHYI_LOOPBACK | PHYI_VIRTUAL); 12746 } else { 12747 ipif->ipif_flags |= IPIF_POINTOPOINT; 12748 } 12749 } 12750 } 12751 } 12752 mutex_exit(&ill->ill_lock); 12753 mutex_exit(&ill->ill_phyint->phyint_lock); 12754 return (ipif); 12755 } 12756 12757 /* 12758 * If appropriate, send a message up to the resolver delete the entry 12759 * for the address of this interface which is going out of business. 12760 * (Always called as writer). 12761 * 12762 * NOTE : We need to check for NULL mps as some of the fields are 12763 * initialized only for some interface types. See ipif_resolver_up() 12764 * for details. 
12765 */ 12766 void 12767 ipif_arp_down(ipif_t *ipif) 12768 { 12769 mblk_t *mp; 12770 12771 ip1dbg(("ipif_arp_down(%s:%u)\n", 12772 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12773 ASSERT(IAM_WRITER_IPIF(ipif)); 12774 12775 /* Delete the mapping for the local address */ 12776 mp = ipif->ipif_arp_del_mp; 12777 if (mp != NULL) { 12778 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 12779 dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 12780 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12781 putnext(ipif->ipif_ill->ill_rq, mp); 12782 ipif->ipif_arp_del_mp = NULL; 12783 } 12784 12785 /* 12786 * If this is the last ipif that is going down, we need 12787 * to clean up ARP completely. 12788 */ 12789 if (ipif->ipif_ill->ill_ipif_up_count == 0) { 12790 12791 /* Send up AR_INTERFACE_DOWN message */ 12792 mp = ipif->ipif_ill->ill_arp_down_mp; 12793 if (mp != NULL) { 12794 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 12795 dlpi_prim_str(*(int *)mp->b_rptr), 12796 *(int *)mp->b_rptr, ipif->ipif_ill->ill_name, 12797 ipif->ipif_id)); 12798 putnext(ipif->ipif_ill->ill_rq, mp); 12799 ipif->ipif_ill->ill_arp_down_mp = NULL; 12800 } 12801 12802 /* Tell ARP to delete the multicast mappings */ 12803 mp = ipif->ipif_ill->ill_arp_del_mapping_mp; 12804 if (mp != NULL) { 12805 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 12806 dlpi_prim_str(*(int *)mp->b_rptr), 12807 *(int *)mp->b_rptr, ipif->ipif_ill->ill_name, 12808 ipif->ipif_id)); 12809 putnext(ipif->ipif_ill->ill_rq, mp); 12810 ipif->ipif_ill->ill_arp_del_mapping_mp = NULL; 12811 } 12812 } 12813 } 12814 12815 /* 12816 * This function sets up the multicast mappings in ARP. When ipif_resolver_up 12817 * calls this function, it passes a non-NULL arp_add_mapping_mp indicating 12818 * that it wants the add_mp allocated in this function to be returned 12819 * wihtout sending it to arp. When ip_rput_dlpi_writer calls this to 12820 * just re-do the multicast, it wants us to send the add_mp to ARP also. 
12821 * ipif_resolver_up does not want us to do the "add" i.e sending to ARP, 12822 * as it does a ipif_arp_down after calling this function - which will 12823 * remove what we add here. 12824 * 12825 * Returns -1 on failures and 0 on success. 12826 */ 12827 int 12828 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) 12829 { 12830 mblk_t *del_mp = NULL; 12831 mblk_t *add_mp = NULL; 12832 mblk_t *mp; 12833 ill_t *ill = ipif->ipif_ill; 12834 phyint_t *phyi = ill->ill_phyint; 12835 ipaddr_t addr, mask, extract_mask = 0; 12836 arma_t *arma; 12837 uint8_t *maddr, *bphys_addr; 12838 uint32_t hw_start; 12839 dl_unitdata_req_t *dlur; 12840 12841 ASSERT(IAM_WRITER_IPIF(ipif)); 12842 if (ipif->ipif_flags & IPIF_POINTOPOINT) 12843 return (0); 12844 12845 /* 12846 * Delete the existing mapping from ARP. Normally ipif_down 12847 * -> ipif_arp_down should send this up to ARP. The only 12848 * reason we would find this when we are switching from 12849 * Multicast to Broadcast where we did not do a down. 12850 */ 12851 mp = ill->ill_arp_del_mapping_mp; 12852 if (mp != NULL) { 12853 ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", 12854 dlpi_prim_str(*(int *)mp->b_rptr), 12855 *(int *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 12856 putnext(ill->ill_rq, mp); 12857 ill->ill_arp_del_mapping_mp = NULL; 12858 } 12859 12860 if (arp_add_mapping_mp != NULL) 12861 *arp_add_mapping_mp = NULL; 12862 12863 /* 12864 * Check that the address is not to long for the constant 12865 * length reserved in the template arma_t. 
12866 */ 12867 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) 12868 return (-1); 12869 12870 /* Add mapping mblk */ 12871 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); 12872 mask = (ipaddr_t)htonl(IN_CLASSD_NET); 12873 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, 12874 (caddr_t)&addr); 12875 if (add_mp == NULL) 12876 return (-1); 12877 arma = (arma_t *)add_mp->b_rptr; 12878 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; 12879 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); 12880 arma->arma_hw_addr_length = ill->ill_phys_addr_length; 12881 12882 /* 12883 * Determine the broadcast address. 12884 */ 12885 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 12886 if (ill->ill_sap_length < 0) 12887 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 12888 else 12889 bphys_addr = (uchar_t *)dlur + 12890 dlur->dl_dest_addr_offset + ill->ill_sap_length; 12891 /* 12892 * Check PHYI_MULTI_BCAST and length of physical 12893 * address to determine if we use the mapping or the 12894 * broadcast address. 12895 */ 12896 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 12897 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 12898 bphys_addr, maddr, &hw_start, &extract_mask)) 12899 phyi->phyint_flags |= PHYI_MULTI_BCAST; 12900 12901 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 12902 (ill->ill_flags & ILLF_MULTICAST)) { 12903 /* Make sure this will not match the "exact" entry. 
*/ 12904 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 12905 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 12906 (caddr_t)&addr); 12907 if (del_mp == NULL) { 12908 freemsg(add_mp); 12909 return (-1); 12910 } 12911 bcopy(&extract_mask, (char *)arma + 12912 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 12913 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 12914 /* Use link-layer broadcast address for MULTI_BCAST */ 12915 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 12916 ip2dbg(("ipif_arp_setup_multicast: adding" 12917 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 12918 } else { 12919 arma->arma_hw_mapping_start = hw_start; 12920 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 12921 " ARP setup for %s\n", ill->ill_name)); 12922 } 12923 } else { 12924 freemsg(add_mp); 12925 ASSERT(del_mp == NULL); 12926 /* It is neither MULTICAST nor MULTI_BCAST */ 12927 return (0); 12928 } 12929 ASSERT(add_mp != NULL && del_mp != NULL); 12930 ill->ill_arp_del_mapping_mp = del_mp; 12931 if (arp_add_mapping_mp != NULL) { 12932 /* The caller just wants the mblks allocated */ 12933 *arp_add_mapping_mp = add_mp; 12934 } else { 12935 /* The caller wants us to send it to arp */ 12936 putnext(ill->ill_rq, add_mp); 12937 } 12938 return (0); 12939 } 12940 12941 /* 12942 * Get the resolver set up for a new interface address. 12943 * (Always called as writer.) 12944 * Called both for IPv4 and IPv6 interfaces, 12945 * though it only sets up the resolver for v6 12946 * if it's an xresolv interface (one using an external resolver). 12947 * Honors ILLF_NOARP. 12948 * The boolean value arp_just_publish, if B_TRUE, indicates that 12949 * it only needs to send an AR_ENTRY_ADD message up to ARP for 12950 * IPv4 interfaces. Currently, B_TRUE is only set when this 12951 * function is called by ip_rput_dlpi_writer() to handle 12952 * asynchronous hardware address change notification. 12953 * Returns error on failure. 
12954 */ 12955 int 12956 ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish) 12957 { 12958 caddr_t addr; 12959 mblk_t *arp_up_mp = NULL; 12960 mblk_t *arp_down_mp = NULL; 12961 mblk_t *arp_add_mp = NULL; 12962 mblk_t *arp_del_mp = NULL; 12963 mblk_t *arp_add_mapping_mp = NULL; 12964 mblk_t *arp_del_mapping_mp = NULL; 12965 ill_t *ill = ipif->ipif_ill; 12966 uchar_t *area_p = NULL; 12967 uchar_t *ared_p = NULL; 12968 int err = ENOMEM; 12969 12970 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 12971 ipif->ipif_ill->ill_name, ipif->ipif_id, 12972 (uint_t)ipif->ipif_flags)); 12973 ASSERT(IAM_WRITER_IPIF(ipif)); 12974 12975 if ((ill->ill_net_type != IRE_IF_RESOLVER) || 12976 (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))) { 12977 return (0); 12978 } 12979 12980 if (ill->ill_isv6) { 12981 /* 12982 * External resolver for IPv6 12983 */ 12984 ASSERT(!arp_just_publish); 12985 if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 12986 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 12987 area_p = (uchar_t *)&ip6_area_template; 12988 ared_p = (uchar_t *)&ip6_ared_template; 12989 } 12990 } else { 12991 /* 12992 * IPv4 arp case. If the ARP stream has already started 12993 * closing, fail this request for ARP bringup. Else 12994 * record the fact that an ARP bringup is pending. 12995 */ 12996 mutex_enter(&ill->ill_lock); 12997 if (ill->ill_arp_closing) { 12998 mutex_exit(&ill->ill_lock); 12999 err = EINVAL; 13000 goto failed; 13001 } else { 13002 if (ill->ill_ipif_up_count == 0) 13003 ill->ill_arp_bringup_pending = 1; 13004 mutex_exit(&ill->ill_lock); 13005 } 13006 if (ipif->ipif_lcl_addr != INADDR_ANY) { 13007 addr = (caddr_t)&ipif->ipif_lcl_addr; 13008 area_p = (uchar_t *)&ip_area_template; 13009 ared_p = (uchar_t *)&ip_ared_template; 13010 } 13011 } 13012 13013 /* 13014 * Add an entry for the local address in ARP only if it 13015 * is not UNNUMBERED and the address is not INADDR_ANY. 
13016 */ 13017 if (((ipif->ipif_flags & IPIF_UNNUMBERED) == 0) && area_p != NULL) { 13018 /* Now ask ARP to publish our address. */ 13019 arp_add_mp = ill_arp_alloc(ill, area_p, addr); 13020 if (arp_add_mp == NULL) 13021 goto failed; 13022 if (arp_just_publish) { 13023 /* 13024 * Copy the new hardware address and length into 13025 * arp_add_mp to be sent to ARP. 13026 */ 13027 area_t *area = (area_t *)arp_add_mp->b_rptr; 13028 area->area_hw_addr_length = 13029 ill->ill_phys_addr_length; 13030 bcopy((char *)ill->ill_phys_addr, 13031 ((char *)area + area->area_hw_addr_offset), 13032 area->area_hw_addr_length); 13033 } 13034 13035 ((area_t *)arp_add_mp->b_rptr)->area_flags = 13036 ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; 13037 13038 if (arp_just_publish) 13039 goto arp_setup_multicast; 13040 13041 /* 13042 * Allocate an ARP deletion message so we know we can tell ARP 13043 * when the interface goes down. 13044 */ 13045 arp_del_mp = ill_arp_alloc(ill, ared_p, addr); 13046 if (arp_del_mp == NULL) 13047 goto failed; 13048 13049 } else { 13050 if (arp_just_publish) 13051 goto done; 13052 } 13053 /* 13054 * Need to bring up ARP or setup multicast mapping only 13055 * when the first interface is coming UP. 13056 */ 13057 if (ill->ill_ipif_up_count != 0) 13058 goto done; 13059 13060 /* 13061 * Allocate an ARP down message (to be saved) and an ARP up 13062 * message. 13063 */ 13064 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 13065 if (arp_down_mp == NULL) 13066 goto failed; 13067 13068 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 13069 if (arp_up_mp == NULL) 13070 goto failed; 13071 13072 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13073 goto done; 13074 13075 arp_setup_multicast: 13076 /* 13077 * Setup the multicast mappings. This function initializes 13078 * ill_arp_del_mapping_mp also. This does not need to be done for 13079 * IPv6. 
13080 */ 13081 if (!ill->ill_isv6) { 13082 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 13083 if (err != 0) 13084 goto failed; 13085 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 13086 ASSERT(arp_add_mapping_mp != NULL); 13087 } 13088 13089 done:; 13090 if (arp_del_mp != NULL) { 13091 ASSERT(ipif->ipif_arp_del_mp == NULL); 13092 ipif->ipif_arp_del_mp = arp_del_mp; 13093 } 13094 if (arp_down_mp != NULL) { 13095 ASSERT(ill->ill_arp_down_mp == NULL); 13096 ill->ill_arp_down_mp = arp_down_mp; 13097 } 13098 if (arp_del_mapping_mp != NULL) { 13099 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13100 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 13101 } 13102 if (arp_up_mp != NULL) { 13103 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 13104 ipif->ipif_ill->ill_name, ipif->ipif_id)); 13105 putnext(ill->ill_rq, arp_up_mp); 13106 } 13107 if (arp_add_mp != NULL) { 13108 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 13109 ipif->ipif_ill->ill_name, ipif->ipif_id)); 13110 putnext(ill->ill_rq, arp_add_mp); 13111 } 13112 if (arp_add_mapping_mp != NULL) { 13113 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 13114 ipif->ipif_ill->ill_name, ipif->ipif_id)); 13115 putnext(ill->ill_rq, arp_add_mapping_mp); 13116 } 13117 if (arp_just_publish) 13118 return (0); 13119 13120 if (ill->ill_flags & ILLF_NOARP) 13121 err = ill_arp_off(ill); 13122 else 13123 err = ill_arp_on(ill); 13124 if (err) { 13125 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); 13126 freemsg(ipif->ipif_arp_del_mp); 13127 if (arp_down_mp != NULL) 13128 freemsg(ill->ill_arp_down_mp); 13129 if (ill->ill_arp_del_mapping_mp != NULL) 13130 freemsg(ill->ill_arp_del_mapping_mp); 13131 ipif->ipif_arp_del_mp = NULL; 13132 ill->ill_arp_down_mp = NULL; 13133 ill->ill_arp_del_mapping_mp = NULL; 13134 return (err); 13135 } 13136 return (ill->ill_ipif_up_count != 0 ? 
0 : EINPROGRESS); 13137 13138 failed:; 13139 ip1dbg(("ipif_resolver_up: FAILED\n")); 13140 freemsg(arp_add_mp); 13141 freemsg(arp_del_mp); 13142 freemsg(arp_add_mapping_mp); 13143 freemsg(arp_up_mp); 13144 freemsg(arp_down_mp); 13145 ill->ill_arp_bringup_pending = 0; 13146 return (err); 13147 } 13148 13149 /* 13150 * Wakeup all threads waiting to enter the ipsq, and sleeping 13151 * on any of the ills in this ipsq. The ill_lock of the ill 13152 * must be held so that waiters don't miss wakeups 13153 */ 13154 static void 13155 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) 13156 { 13157 phyint_t *phyint; 13158 13159 phyint = ipsq->ipsq_phyint_list; 13160 while (phyint != NULL) { 13161 if (phyint->phyint_illv4) { 13162 if (!caller_holds_lock) 13163 mutex_enter(&phyint->phyint_illv4->ill_lock); 13164 ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 13165 cv_broadcast(&phyint->phyint_illv4->ill_cv); 13166 if (!caller_holds_lock) 13167 mutex_exit(&phyint->phyint_illv4->ill_lock); 13168 } 13169 if (phyint->phyint_illv6) { 13170 if (!caller_holds_lock) 13171 mutex_enter(&phyint->phyint_illv6->ill_lock); 13172 ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 13173 cv_broadcast(&phyint->phyint_illv6->ill_cv); 13174 if (!caller_holds_lock) 13175 mutex_exit(&phyint->phyint_illv6->ill_lock); 13176 } 13177 phyint = phyint->phyint_ipsq_next; 13178 } 13179 } 13180 13181 static ipsq_t * 13182 ipsq_create(char *groupname) 13183 { 13184 ipsq_t *ipsq; 13185 13186 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 13187 ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 13188 if (ipsq == NULL) { 13189 return (NULL); 13190 } 13191 13192 if (groupname != NULL) 13193 (void) strcpy(ipsq->ipsq_name, groupname); 13194 else 13195 ipsq->ipsq_name[0] = '\0'; 13196 13197 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); 13198 ipsq->ipsq_flags |= IPSQ_GROUP; 13199 ipsq->ipsq_next = ipsq_g_head; 13200 ipsq_g_head = ipsq; 13201 return (ipsq); 13202 } 13203 13204 /* 13205 * Return an 
ipsq correspoding to the groupname. If 'create' is true 13206 * allocate a new ipsq if one does not exist. Usually an ipsq is associated 13207 * uniquely with an IPMP group. However during IPMP groupname operations, 13208 * multiple IPMP groups may be associated with a single ipsq. But no 13209 * IPMP group can be associated with more than 1 ipsq at any time. 13210 * For example 13211 * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs 13212 * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 13213 * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 13214 * 13215 * Now the command ifconfig hme3 group mpk17-84 results in the temporary 13216 * status shown below during the execution of the above command. 13217 * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 13218 * 13219 * After the completion of the above groupname command we return to the stable 13220 * state shown below. 13221 * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 13222 * hme4 mpk17-85 ipsq2 mpk17-85 1 13223 * 13224 * Because of the above, we don't search based on the ipsq_name since that 13225 * would miss the correct ipsq during certain windows as shown above. 13226 * The ipsq_name is only used during split of an ipsq to return the ipsq to its 13227 * natural state. 13228 */ 13229 static ipsq_t * 13230 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq) 13231 { 13232 ipsq_t *ipsq; 13233 int group_len; 13234 phyint_t *phyint; 13235 13236 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 13237 13238 group_len = strlen(groupname); 13239 ASSERT(group_len != 0); 13240 group_len++; 13241 13242 for (ipsq = ipsq_g_head; ipsq != NULL; ipsq = ipsq->ipsq_next) { 13243 /* 13244 * When an ipsq is being split, and ill_split_ipsq 13245 * calls this function, we exclude it from being considered. 13246 */ 13247 if (ipsq == exclude_ipsq) 13248 continue; 13249 13250 /* 13251 * Compare against the ipsq_name. The groupname change happens 13252 * in 2 phases. 
The 1st phase merges the from group into 13253 * the to group's ipsq, by calling ill_merge_groups and restarts 13254 * the ioctl. The 2nd phase then locates the ipsq again thru 13255 * ipsq_name. At this point the phyint_groupname has not been 13256 * updated. 13257 */ 13258 if ((group_len == strlen(ipsq->ipsq_name) + 1) && 13259 (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { 13260 /* 13261 * Verify that an ipmp groupname is exactly 13262 * part of 1 ipsq and is not found in any other 13263 * ipsq. 13264 */ 13265 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) == 13266 NULL); 13267 return (ipsq); 13268 } 13269 13270 /* 13271 * Comparison against ipsq_name alone is not sufficient. 13272 * In the case when groups are currently being 13273 * merged, the ipsq could hold other IPMP groups temporarily. 13274 * so we walk the phyint list and compare against the 13275 * phyint_groupname as well. 13276 */ 13277 phyint = ipsq->ipsq_phyint_list; 13278 while (phyint != NULL) { 13279 if ((group_len == phyint->phyint_groupname_len) && 13280 (bcmp(phyint->phyint_groupname, groupname, 13281 group_len) == 0)) { 13282 /* 13283 * Verify that an ipmp groupname is exactly 13284 * part of 1 ipsq and is not found in any other 13285 * ipsq. 13286 */ 13287 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) 13288 == NULL); 13289 return (ipsq); 13290 } 13291 phyint = phyint->phyint_ipsq_next; 13292 } 13293 } 13294 if (create) 13295 ipsq = ipsq_create(groupname); 13296 return (ipsq); 13297 } 13298 13299 static void 13300 ipsq_delete(ipsq_t *ipsq) 13301 { 13302 ipsq_t *nipsq; 13303 ipsq_t *pipsq = NULL; 13304 13305 /* 13306 * We don't hold the ipsq lock, but we are sure no new 13307 * messages can land up, since the ipsq_refs is zero. 13308 * i.e. this ipsq is unnamed and no phyint or phyint group 13309 * is associated with this ipsq. 
(Lookups are based on ill_name 13310 * or phyint_group_name) 13311 */ 13312 ASSERT(ipsq->ipsq_refs == 0); 13313 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); 13314 ASSERT(ipsq->ipsq_pending_mp == NULL); 13315 if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { 13316 /* 13317 * This is not the ipsq of an IPMP group. 13318 */ 13319 kmem_free(ipsq, sizeof (ipsq_t)); 13320 return; 13321 } 13322 13323 rw_enter(&ill_g_lock, RW_WRITER); 13324 13325 /* 13326 * Locate the ipsq before we can remove it from 13327 * the singly linked list of ipsq's. 13328 */ 13329 for (nipsq = ipsq_g_head; nipsq != NULL; nipsq = nipsq->ipsq_next) { 13330 if (nipsq == ipsq) { 13331 break; 13332 } 13333 pipsq = nipsq; 13334 } 13335 13336 ASSERT(nipsq == ipsq); 13337 13338 /* unlink ipsq from the list */ 13339 if (pipsq != NULL) 13340 pipsq->ipsq_next = ipsq->ipsq_next; 13341 else 13342 ipsq_g_head = ipsq->ipsq_next; 13343 kmem_free(ipsq, sizeof (ipsq_t)); 13344 rw_exit(&ill_g_lock); 13345 } 13346 13347 static void 13348 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, 13349 queue_t *q) 13350 13351 { 13352 13353 ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); 13354 ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); 13355 ASSERT(old_ipsq->ipsq_pending_ipif == NULL); 13356 ASSERT(old_ipsq->ipsq_pending_mp == NULL); 13357 ASSERT(current_mp != NULL); 13358 13359 ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, 13360 NEW_OP, NULL); 13361 13362 ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && 13363 new_ipsq->ipsq_xopq_mphead != NULL); 13364 13365 /* 13366 * move from old ipsq to the new ipsq. 
13367 */ 13368 new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; 13369 if (old_ipsq->ipsq_xopq_mphead != NULL) 13370 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; 13371 13372 old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; 13373 } 13374 13375 void 13376 ill_group_cleanup(ill_t *ill) 13377 { 13378 ill_t *ill_v4; 13379 ill_t *ill_v6; 13380 ipif_t *ipif; 13381 13382 ill_v4 = ill->ill_phyint->phyint_illv4; 13383 ill_v6 = ill->ill_phyint->phyint_illv6; 13384 13385 if (ill_v4 != NULL) { 13386 mutex_enter(&ill_v4->ill_lock); 13387 for (ipif = ill_v4->ill_ipif; ipif != NULL; 13388 ipif = ipif->ipif_next) { 13389 IPIF_UNMARK_MOVING(ipif); 13390 } 13391 ill_v4->ill_up_ipifs = B_FALSE; 13392 mutex_exit(&ill_v4->ill_lock); 13393 } 13394 13395 if (ill_v6 != NULL) { 13396 mutex_enter(&ill_v6->ill_lock); 13397 for (ipif = ill_v6->ill_ipif; ipif != NULL; 13398 ipif = ipif->ipif_next) { 13399 IPIF_UNMARK_MOVING(ipif); 13400 } 13401 ill_v6->ill_up_ipifs = B_FALSE; 13402 mutex_exit(&ill_v6->ill_lock); 13403 } 13404 } 13405 /* 13406 * This function is called when an ill has had a change in its group status 13407 * to bring up all the ipifs that were up before the change. 13408 */ 13409 int 13410 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 13411 { 13412 ipif_t *ipif; 13413 ill_t *ill_v4; 13414 ill_t *ill_v6; 13415 ill_t *from_ill; 13416 int err = 0; 13417 13418 13419 ASSERT(IAM_WRITER_ILL(ill)); 13420 13421 /* 13422 * Except for ipif_state_flags and ill_state_flags the other 13423 * fields of the ipif/ill that are modified below are protected 13424 * implicitly since we are a writer. We would have tried to down 13425 * even an ipif that was already down, in ill_down_ipifs. So we 13426 * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
13427 */ 13428 ill_v4 = ill->ill_phyint->phyint_illv4; 13429 ill_v6 = ill->ill_phyint->phyint_illv6; 13430 if (ill_v4 != NULL) { 13431 ill_v4->ill_up_ipifs = B_TRUE; 13432 for (ipif = ill_v4->ill_ipif; ipif != NULL; 13433 ipif = ipif->ipif_next) { 13434 mutex_enter(&ill_v4->ill_lock); 13435 ipif->ipif_state_flags &= ~IPIF_CHANGING; 13436 IPIF_UNMARK_MOVING(ipif); 13437 mutex_exit(&ill_v4->ill_lock); 13438 if (ipif->ipif_was_up) { 13439 if (!(ipif->ipif_flags & IPIF_UP)) 13440 err = ipif_up(ipif, q, mp); 13441 ipif->ipif_was_up = B_FALSE; 13442 if (err != 0) { 13443 /* 13444 * Can there be any other error ? 13445 */ 13446 ASSERT(err == EINPROGRESS); 13447 return (err); 13448 } 13449 } 13450 } 13451 mutex_enter(&ill_v4->ill_lock); 13452 ill_v4->ill_state_flags &= ~ILL_CHANGING; 13453 mutex_exit(&ill_v4->ill_lock); 13454 ill_v4->ill_up_ipifs = B_FALSE; 13455 if (ill_v4->ill_move_in_progress) { 13456 ASSERT(ill_v4->ill_move_peer != NULL); 13457 ill_v4->ill_move_in_progress = B_FALSE; 13458 from_ill = ill_v4->ill_move_peer; 13459 from_ill->ill_move_in_progress = B_FALSE; 13460 from_ill->ill_move_peer = NULL; 13461 mutex_enter(&from_ill->ill_lock); 13462 from_ill->ill_state_flags &= ~ILL_CHANGING; 13463 mutex_exit(&from_ill->ill_lock); 13464 if (ill_v6 == NULL) { 13465 if (from_ill->ill_phyint->phyint_flags & 13466 PHYI_STANDBY) { 13467 phyint_inactive(from_ill->ill_phyint); 13468 } 13469 if (ill_v4->ill_phyint->phyint_flags & 13470 PHYI_STANDBY) { 13471 phyint_inactive(ill_v4->ill_phyint); 13472 } 13473 } 13474 ill_v4->ill_move_peer = NULL; 13475 } 13476 } 13477 13478 if (ill_v6 != NULL) { 13479 ill_v6->ill_up_ipifs = B_TRUE; 13480 for (ipif = ill_v6->ill_ipif; ipif != NULL; 13481 ipif = ipif->ipif_next) { 13482 mutex_enter(&ill_v6->ill_lock); 13483 ipif->ipif_state_flags &= ~IPIF_CHANGING; 13484 IPIF_UNMARK_MOVING(ipif); 13485 mutex_exit(&ill_v6->ill_lock); 13486 if (ipif->ipif_was_up) { 13487 if (!(ipif->ipif_flags & IPIF_UP)) 13488 err = ipif_up(ipif, q, mp); 13489 
ipif->ipif_was_up = B_FALSE; 13490 if (err != 0) { 13491 /* 13492 * Can there be any other error ? 13493 */ 13494 ASSERT(err == EINPROGRESS); 13495 return (err); 13496 } 13497 } 13498 } 13499 mutex_enter(&ill_v6->ill_lock); 13500 ill_v6->ill_state_flags &= ~ILL_CHANGING; 13501 mutex_exit(&ill_v6->ill_lock); 13502 ill_v6->ill_up_ipifs = B_FALSE; 13503 if (ill_v6->ill_move_in_progress) { 13504 ASSERT(ill_v6->ill_move_peer != NULL); 13505 ill_v6->ill_move_in_progress = B_FALSE; 13506 from_ill = ill_v6->ill_move_peer; 13507 from_ill->ill_move_in_progress = B_FALSE; 13508 from_ill->ill_move_peer = NULL; 13509 mutex_enter(&from_ill->ill_lock); 13510 from_ill->ill_state_flags &= ~ILL_CHANGING; 13511 mutex_exit(&from_ill->ill_lock); 13512 if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 13513 phyint_inactive(from_ill->ill_phyint); 13514 } 13515 if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { 13516 phyint_inactive(ill_v6->ill_phyint); 13517 } 13518 ill_v6->ill_move_peer = NULL; 13519 } 13520 } 13521 return (0); 13522 } 13523 13524 /* 13525 * bring down all the approriate ipifs. 13526 */ 13527 /* ARGSUSED */ 13528 static void 13529 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) 13530 { 13531 ipif_t *ipif; 13532 13533 ASSERT(IAM_WRITER_ILL(ill)); 13534 13535 /* 13536 * Except for ipif_state_flags the other fields of the ipif/ill that 13537 * are modified below are protected implicitly since we are a writer 13538 */ 13539 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13540 if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) 13541 continue; 13542 if (index == 0 || index == ipif->ipif_orig_ifindex) { 13543 /* 13544 * We go through the ipif_down logic even if the ipif 13545 * is already down, since routes can be added based 13546 * on down ipifs. Going through ipif_down once again 13547 * will delete any IREs created based on these routes. 
13548 */ 13549 if (ipif->ipif_flags & IPIF_UP) 13550 ipif->ipif_was_up = B_TRUE; 13551 /* 13552 * If called with chk_nofailover true ipif is moving. 13553 */ 13554 mutex_enter(&ill->ill_lock); 13555 if (chk_nofailover) { 13556 ipif->ipif_state_flags |= 13557 IPIF_MOVING | IPIF_CHANGING; 13558 } else { 13559 ipif->ipif_state_flags |= IPIF_CHANGING; 13560 } 13561 mutex_exit(&ill->ill_lock); 13562 /* 13563 * Need to re-create net/subnet bcast ires if 13564 * they are dependent on ipif. 13565 */ 13566 if (!ipif->ipif_isv6) 13567 ipif_check_bcast_ires(ipif); 13568 (void) ipif_logical_down(ipif, NULL, NULL); 13569 ipif_down_tail(ipif); 13570 /* 13571 * We don't do ipif_multicast_down for IPv4 in 13572 * ipif_down. We need to set this so that 13573 * ipif_multicast_up will join the 13574 * ALLHOSTS_GROUP on to_ill. 13575 */ 13576 ipif->ipif_multicast_up = B_FALSE; 13577 } 13578 } 13579 } 13580 13581 #define IPSQ_INC_REF(ipsq) { \ 13582 ASSERT(RW_WRITE_HELD(&ill_g_lock)); \ 13583 (ipsq)->ipsq_refs++; \ 13584 } 13585 13586 #define IPSQ_DEC_REF(ipsq) { \ 13587 ASSERT(RW_WRITE_HELD(&ill_g_lock)); \ 13588 (ipsq)->ipsq_refs--; \ 13589 if ((ipsq)->ipsq_refs == 0) \ 13590 (ipsq)->ipsq_name[0] = '\0'; \ 13591 } 13592 13593 /* 13594 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 13595 * new_ipsq. 13596 */ 13597 static void 13598 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq) 13599 { 13600 phyint_t *phyint; 13601 phyint_t *next_phyint; 13602 13603 /* 13604 * To change the ipsq of an ill, we need to hold the ill_g_lock as 13605 * writer and the ill_lock of the ill in question. Also the dest 13606 * ipsq can't vanish while we hold the ill_g_lock as writer. 
13607 */ 13608 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 13609 13610 phyint = cur_ipsq->ipsq_phyint_list; 13611 cur_ipsq->ipsq_phyint_list = NULL; 13612 while (phyint != NULL) { 13613 next_phyint = phyint->phyint_ipsq_next; 13614 IPSQ_DEC_REF(cur_ipsq); 13615 phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; 13616 new_ipsq->ipsq_phyint_list = phyint; 13617 IPSQ_INC_REF(new_ipsq); 13618 phyint->phyint_ipsq = new_ipsq; 13619 phyint = next_phyint; 13620 } 13621 } 13622 13623 #define SPLIT_SUCCESS 0 13624 #define SPLIT_NOT_NEEDED 1 13625 #define SPLIT_FAILED 2 13626 13627 int 13628 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry) 13629 { 13630 ipsq_t *newipsq = NULL; 13631 13632 /* 13633 * Assertions denote pre-requisites for changing the ipsq of 13634 * a phyint 13635 */ 13636 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 13637 /* 13638 * <ill-phyint> assocs can't change while ill_g_lock 13639 * is held as writer. See ill_phyint_reinit() 13640 */ 13641 ASSERT(phyint->phyint_illv4 == NULL || 13642 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 13643 ASSERT(phyint->phyint_illv6 == NULL || 13644 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 13645 13646 if ((phyint->phyint_groupname_len != 13647 (strlen(cur_ipsq->ipsq_name) + 1) || 13648 bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, 13649 phyint->phyint_groupname_len) != 0)) { 13650 /* 13651 * Once we fail in creating a new ipsq due to memory shortage, 13652 * don't attempt to create new ipsq again, based on another 13653 * phyint, since we want all phyints belonging to an IPMP group 13654 * to be in the same ipsq even in the event of mem alloc fails. 
13655 */ 13656 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, 13657 cur_ipsq); 13658 if (newipsq == NULL) { 13659 /* Memory allocation failure */ 13660 return (SPLIT_FAILED); 13661 } else { 13662 /* ipsq_refs protected by ill_g_lock (writer) */ 13663 IPSQ_DEC_REF(cur_ipsq); 13664 phyint->phyint_ipsq = newipsq; 13665 phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; 13666 newipsq->ipsq_phyint_list = phyint; 13667 IPSQ_INC_REF(newipsq); 13668 return (SPLIT_SUCCESS); 13669 } 13670 } 13671 return (SPLIT_NOT_NEEDED); 13672 } 13673 13674 /* 13675 * The ill locks of the phyint and the ill_g_lock (writer) must be held 13676 * to do this split 13677 */ 13678 static int 13679 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq) 13680 { 13681 ipsq_t *newipsq; 13682 13683 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 13684 /* 13685 * <ill-phyint> assocs can't change while ill_g_lock 13686 * is held as writer. See ill_phyint_reinit() 13687 */ 13688 13689 ASSERT(phyint->phyint_illv4 == NULL || 13690 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 13691 ASSERT(phyint->phyint_illv6 == NULL || 13692 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 13693 13694 if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 13695 phyint->phyint_illv4: phyint->phyint_illv6)) { 13696 /* 13697 * ipsq_init failed due to no memory 13698 * caller will use the same ipsq 13699 */ 13700 return (SPLIT_FAILED); 13701 } 13702 13703 /* ipsq_ref is protected by ill_g_lock (writer) */ 13704 IPSQ_DEC_REF(cur_ipsq); 13705 13706 /* 13707 * This is a new ipsq that is unknown to the world. 
13708 * So we don't need to hold ipsq_lock, 13709 */ 13710 newipsq = phyint->phyint_ipsq; 13711 newipsq->ipsq_writer = NULL; 13712 newipsq->ipsq_reentry_cnt--; 13713 ASSERT(newipsq->ipsq_reentry_cnt == 0); 13714 #ifdef ILL_DEBUG 13715 newipsq->ipsq_depth = 0; 13716 #endif 13717 13718 return (SPLIT_SUCCESS); 13719 } 13720 13721 /* 13722 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 13723 * ipsq's representing their individual groups or themselves. Return 13724 * whether split needs to be retried again later. 13725 */ 13726 static boolean_t 13727 ill_split_ipsq(ipsq_t *cur_ipsq) 13728 { 13729 phyint_t *phyint; 13730 phyint_t *next_phyint; 13731 int error; 13732 boolean_t need_retry = B_FALSE; 13733 13734 phyint = cur_ipsq->ipsq_phyint_list; 13735 cur_ipsq->ipsq_phyint_list = NULL; 13736 while (phyint != NULL) { 13737 next_phyint = phyint->phyint_ipsq_next; 13738 /* 13739 * 'created' will tell us whether the callee actually 13740 * created an ipsq. Lack of memory may force the callee 13741 * to return without creating an ipsq. 13742 */ 13743 if (phyint->phyint_groupname == NULL) { 13744 error = ill_split_to_own_ipsq(phyint, cur_ipsq); 13745 } else { 13746 error = ill_split_to_grp_ipsq(phyint, cur_ipsq, 13747 need_retry); 13748 } 13749 13750 switch (error) { 13751 case SPLIT_FAILED: 13752 need_retry = B_TRUE; 13753 /* FALLTHRU */ 13754 case SPLIT_NOT_NEEDED: 13755 /* 13756 * Keep it on the list. 13757 */ 13758 phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; 13759 cur_ipsq->ipsq_phyint_list = phyint; 13760 break; 13761 case SPLIT_SUCCESS: 13762 break; 13763 default: 13764 ASSERT(0); 13765 } 13766 13767 phyint = next_phyint; 13768 } 13769 return (need_retry); 13770 } 13771 13772 /* 13773 * given an ipsq 'ipsq' lock all ills associated with this ipsq. 13774 * and return the ills in the list. This list will be 13775 * needed to unlock all the ills later on by the caller. 
13776 * The <ill-ipsq> associations could change between the 13777 * lock and unlock. Hence the unlock can't traverse the 13778 * ipsq to get the list of ills. 13779 */ 13780 static int 13781 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) 13782 { 13783 int cnt = 0; 13784 phyint_t *phyint; 13785 13786 /* 13787 * The caller holds ill_g_lock to ensure that the ill memberships 13788 * of the ipsq don't change 13789 */ 13790 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 13791 13792 phyint = ipsq->ipsq_phyint_list; 13793 while (phyint != NULL) { 13794 if (phyint->phyint_illv4 != NULL) { 13795 ASSERT(cnt < list_max); 13796 list[cnt++] = phyint->phyint_illv4; 13797 } 13798 if (phyint->phyint_illv6 != NULL) { 13799 ASSERT(cnt < list_max); 13800 list[cnt++] = phyint->phyint_illv6; 13801 } 13802 phyint = phyint->phyint_ipsq_next; 13803 } 13804 ill_lock_ills(list, cnt); 13805 return (cnt); 13806 } 13807 13808 void 13809 ill_lock_ills(ill_t **list, int cnt) 13810 { 13811 int i; 13812 13813 if (cnt > 1) { 13814 boolean_t try_again; 13815 do { 13816 try_again = B_FALSE; 13817 for (i = 0; i < cnt - 1; i++) { 13818 if (list[i] < list[i + 1]) { 13819 ill_t *tmp; 13820 13821 /* swap the elements */ 13822 tmp = list[i]; 13823 list[i] = list[i + 1]; 13824 list[i + 1] = tmp; 13825 try_again = B_TRUE; 13826 } 13827 } 13828 } while (try_again); 13829 } 13830 13831 for (i = 0; i < cnt; i++) { 13832 if (i == 0) { 13833 if (list[i] != NULL) 13834 mutex_enter(&list[i]->ill_lock); 13835 else 13836 return; 13837 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 13838 mutex_enter(&list[i]->ill_lock); 13839 } 13840 } 13841 } 13842 13843 void 13844 ill_unlock_ills(ill_t **list, int cnt) 13845 { 13846 int i; 13847 13848 for (i = 0; i < cnt; i++) { 13849 if ((i == 0) && (list[i] != NULL)) { 13850 mutex_exit(&list[i]->ill_lock); 13851 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 13852 mutex_exit(&list[i]->ill_lock); 13853 } 13854 } 13855 } 13856 13857 /* 13858 * Merge all the 
ills from 1 ipsq group into another ipsq group. 13859 * The source ipsq group is specified by the ipsq associated with 13860 * 'from_ill'. The destination ipsq group is specified by the ipsq 13861 * associated with 'to_ill' or 'groupname' respectively. 13862 * Note that ipsq itself does not have a reference count mechanism 13863 * and functions don't look up an ipsq and pass it around. Instead 13864 * functions pass around an ill or groupname, and the ipsq is looked 13865 * up from the ill or groupname and the required operation performed 13866 * atomically with the lookup on the ipsq. 13867 */ 13868 static int 13869 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, 13870 queue_t *q) 13871 { 13872 ipsq_t *old_ipsq; 13873 ipsq_t *new_ipsq; 13874 ill_t **ill_list; 13875 int cnt; 13876 size_t ill_list_size; 13877 boolean_t became_writer_on_new_sq = B_FALSE; 13878 13879 /* Exactly 1 of 'to_ill' and groupname can be specified. */ 13880 ASSERT((to_ill != NULL) ^ (groupname != NULL)); 13881 13882 /* 13883 * Need to hold ill_g_lock as writer and also the ill_lock to 13884 * change the <ill-ipsq> assoc of an ill. Need to hold the 13885 * ipsq_lock to prevent new messages from landing on an ipsq. 13886 */ 13887 rw_enter(&ill_g_lock, RW_WRITER); 13888 13889 old_ipsq = from_ill->ill_phyint->phyint_ipsq; 13890 if (groupname != NULL) 13891 new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL); 13892 else { 13893 new_ipsq = to_ill->ill_phyint->phyint_ipsq; 13894 } 13895 13896 ASSERT(old_ipsq != NULL && new_ipsq != NULL); 13897 13898 /* 13899 * both groups are on the same ipsq. 
13900 */ 13901 if (old_ipsq == new_ipsq) { 13902 rw_exit(&ill_g_lock); 13903 return (0); 13904 } 13905 13906 cnt = old_ipsq->ipsq_refs << 1; 13907 ill_list_size = cnt * sizeof (ill_t *); 13908 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 13909 if (ill_list == NULL) { 13910 rw_exit(&ill_g_lock); 13911 return (ENOMEM); 13912 } 13913 cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); 13914 13915 /* Need ipsq lock to enque messages on new ipsq or to become writer */ 13916 mutex_enter(&new_ipsq->ipsq_lock); 13917 if ((new_ipsq->ipsq_writer == NULL && 13918 new_ipsq->ipsq_current_ipif == NULL) || 13919 (new_ipsq->ipsq_writer == curthread)) { 13920 new_ipsq->ipsq_writer = curthread; 13921 new_ipsq->ipsq_reentry_cnt++; 13922 became_writer_on_new_sq = B_TRUE; 13923 } 13924 13925 /* 13926 * We are holding ill_g_lock as writer and all the ill locks of 13927 * the old ipsq. So the old_ipsq can't be looked up, and hence no new 13928 * message can land up on the old ipsq even though we don't hold the 13929 * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. 13930 */ 13931 ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); 13932 13933 /* 13934 * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. 13935 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> 13936 * assocs. till we release the ill_g_lock, and hence it can't vanish. 13937 */ 13938 ill_merge_ipsq(old_ipsq, new_ipsq); 13939 13940 /* 13941 * Mark the new ipsq as needing a split since it is currently 13942 * being shared by more than 1 IPMP group. 
The split will 13943 * occur at the end of ipsq_exit 13944 */ 13945 new_ipsq->ipsq_split = B_TRUE; 13946 13947 /* Now release all the locks */ 13948 mutex_exit(&new_ipsq->ipsq_lock); 13949 ill_unlock_ills(ill_list, cnt); 13950 rw_exit(&ill_g_lock); 13951 13952 kmem_free(ill_list, ill_list_size); 13953 13954 /* 13955 * If we succeeded in becoming writer on the new ipsq, then 13956 * drain the new ipsq and start processing all enqueued messages 13957 * including the current ioctl we are processing which is either 13958 * a set groupname or failover/failback. 13959 */ 13960 if (became_writer_on_new_sq) 13961 ipsq_exit(new_ipsq, B_TRUE, B_TRUE); 13962 13963 /* 13964 * syncq has been changed and all the messages have been moved. 13965 */ 13966 mutex_enter(&old_ipsq->ipsq_lock); 13967 old_ipsq->ipsq_current_ipif = NULL; 13968 mutex_exit(&old_ipsq->ipsq_lock); 13969 return (EINPROGRESS); 13970 } 13971 13972 /* 13973 * Delete and add the loopback copy and non-loopback copy of 13974 * the BROADCAST ire corresponding to ill and addr. Used to 13975 * group broadcast ires together when ill becomes part of 13976 * a group. 13977 * 13978 * This function is also called when ill is leaving the group 13979 * so that the ires belonging to the group gets re-grouped. 13980 */ 13981 static void 13982 ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) 13983 { 13984 ire_t *ire, *nire, *nire_next, *ire_head = NULL; 13985 ire_t **ire_ptpn = &ire_head; 13986 13987 /* 13988 * The loopback and non-loopback IREs are inserted in the order in which 13989 * they're found, on the basis that they are correctly ordered (loopback 13990 * first). 13991 */ 13992 for (;;) { 13993 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 13994 ALL_ZONES, MATCH_IRE_TYPE | MATCH_IRE_ILL); 13995 if (ire == NULL) 13996 break; 13997 13998 /* 13999 * we are passing in KM_SLEEP because it is not easy to 14000 * go back to a sane state in case of memory failure. 
14001 */ 14002 nire = kmem_cache_alloc(ire_cache, KM_SLEEP); 14003 ASSERT(nire != NULL); 14004 bzero(nire, sizeof (ire_t)); 14005 /* 14006 * Don't use ire_max_frag directly since we don't 14007 * hold on to 'ire' until we add the new ire 'nire' and 14008 * we don't want the new ire to have a dangling reference 14009 * to 'ire'. The ire_max_frag of a broadcast ire must 14010 * be in sync with the ipif_mtu of the associate ipif. 14011 * For eg. this happens as a result of SIOCSLIFNAME, 14012 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE inititated by 14013 * the driver. A change in ire_max_frag triggered as 14014 * as a result of path mtu discovery, or due to an 14015 * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due a 14016 * route change -mtu command does not apply to broadcast ires. 14017 * 14018 * XXX We need a recovery strategy here if ire_init fails 14019 */ 14020 if (ire_init(nire, 14021 (uchar_t *)&ire->ire_addr, 14022 (uchar_t *)&ire->ire_mask, 14023 (uchar_t *)&ire->ire_src_addr, 14024 (uchar_t *)&ire->ire_gateway_addr, 14025 (uchar_t *)&ire->ire_in_src_addr, 14026 ire->ire_stq == NULL ? &ip_loopback_mtu : 14027 &ire->ire_ipif->ipif_mtu, 14028 ire->ire_fp_mp, 14029 ire->ire_rfq, 14030 ire->ire_stq, 14031 ire->ire_type, 14032 ire->ire_dlureq_mp, 14033 ire->ire_ipif, 14034 ire->ire_in_ill, 14035 ire->ire_cmask, 14036 ire->ire_phandle, 14037 ire->ire_ihandle, 14038 ire->ire_flags, 14039 &ire->ire_uinfo) == NULL) { 14040 cmn_err(CE_PANIC, "ire_init() failed"); 14041 } 14042 ire_delete(ire); 14043 ire_refrele(ire); 14044 14045 /* 14046 * The newly created IREs are inserted at the tail of the list 14047 * starting with ire_head. As we've just allocated them no one 14048 * knows about them so it's safe. 
14049 */ 14050 *ire_ptpn = nire; 14051 ire_ptpn = &nire->ire_next; 14052 } 14053 14054 for (nire = ire_head; nire != NULL; nire = nire_next) { 14055 int error; 14056 ire_t *oire; 14057 /* unlink the IRE from our list before calling ire_add() */ 14058 nire_next = nire->ire_next; 14059 nire->ire_next = NULL; 14060 14061 /* ire_add adds the ire at the right place in the list */ 14062 oire = nire; 14063 error = ire_add(&nire, NULL, NULL, NULL); 14064 ASSERT(error == 0); 14065 ASSERT(oire == nire); 14066 ire_refrele(nire); /* Held in ire_add */ 14067 } 14068 } 14069 14070 /* 14071 * This function is usually called when an ill is inserted in 14072 * a group and all the ipifs are already UP. As all the ipifs 14073 * are already UP, the broadcast ires have already been created 14074 * and been inserted. But, ire_add_v4 would not have grouped properly. 14075 * We need to re-group for the benefit of ip_wput_ire which 14076 * expects BROADCAST ires to be grouped properly to avoid sending 14077 * more than one copy of the broadcast packet per group. 14078 * 14079 * NOTE : We don't check for ill_ipif_up_count to be non-zero here 14080 * because when ipif_up_done ends up calling this, ires have 14081 * already been added before illgrp_insert i.e before ill_group 14082 * has been initialized. 14083 */ 14084 static void 14085 ill_group_bcast_for_xmit(ill_t *ill) 14086 { 14087 ill_group_t *illgrp; 14088 ipif_t *ipif; 14089 ipaddr_t addr; 14090 ipaddr_t net_mask; 14091 ipaddr_t subnet_netmask; 14092 14093 illgrp = ill->ill_group; 14094 14095 /* 14096 * This function is called even when an ill is deleted from 14097 * the group. Hence, illgrp could be null. 14098 */ 14099 if (illgrp != NULL && illgrp->illgrp_ill_count == 1) 14100 return; 14101 14102 /* 14103 * Delete all the BROADCAST ires matching this ill and add 14104 * them back. This time, ire_add_v4 should take care of 14105 * grouping them with others because ill is part of the 14106 * group. 
14107 */ 14108 ill_bcast_delete_and_add(ill, 0); 14109 ill_bcast_delete_and_add(ill, INADDR_BROADCAST); 14110 14111 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14112 14113 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14114 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14115 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14116 } else { 14117 net_mask = htonl(IN_CLASSA_NET); 14118 } 14119 addr = net_mask & ipif->ipif_subnet; 14120 ill_bcast_delete_and_add(ill, addr); 14121 ill_bcast_delete_and_add(ill, ~net_mask | addr); 14122 14123 subnet_netmask = ipif->ipif_net_mask; 14124 addr = ipif->ipif_subnet; 14125 ill_bcast_delete_and_add(ill, addr); 14126 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); 14127 } 14128 } 14129 14130 /* 14131 * This function is called from illgrp_delete when ill is being deleted 14132 * from the group. 14133 * 14134 * As ill is not there in the group anymore, any address belonging 14135 * to this ill should be cleared of IRE_MARK_NORECV. 14136 */ 14137 static void 14138 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) 14139 { 14140 ire_t *ire; 14141 irb_t *irb; 14142 14143 ASSERT(ill->ill_group == NULL); 14144 14145 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 14146 ALL_ZONES, MATCH_IRE_TYPE | MATCH_IRE_ILL); 14147 14148 if (ire != NULL) { 14149 /* 14150 * IPMP and plumbing operations are serialized on the ipsq, so 14151 * no one will insert or delete a broadcast ire under our feet. 14152 */ 14153 irb = ire->ire_bucket; 14154 rw_enter(&irb->irb_lock, RW_READER); 14155 ire_refrele(ire); 14156 14157 for (; ire != NULL; ire = ire->ire_next) { 14158 if (ire->ire_addr != addr) 14159 break; 14160 if (ire_to_ill(ire) != ill) 14161 continue; 14162 14163 ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); 14164 ire->ire_marks &= ~IRE_MARK_NORECV; 14165 } 14166 rw_exit(&irb->irb_lock); 14167 } 14168 } 14169 14170 /* 14171 * This function must be called only after the broadcast ires 14172 * have been grouped together. 
For a given address addr, nominate
 * only one of the ires whose interface is not FAILED or OFFLINE.
 *
 * This is also called when an ipif goes down, so that we can nominate
 * a different ire with the same address for receiving.
 *
 * The nominated ire is the one left with IRE_MARK_NORECV cleared; every
 * other ire for 'addr' in 'illgrp' gets IRE_MARK_NORECV set.  A first
 * pass skips ires on FAILED and INACTIVE interfaces; if that nominates
 * nobody but an INACTIVE (and not FAILED) interface was seen, a second
 * pass skips only FAILED ones.  OFFLINE interfaces are never nominated.
 * If the nominated ire is not already first among the group's ires for
 * 'addr', it is marked CONDEMNED and a freshly initialized copy is linked
 * in at the front instead (together with a copy of its non-loopback
 * companion, when the nominated ire is a loopback one).
 */
static void
ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr)
{
	irb_t *irb;
	ire_t *ire;
	ire_t *ire1;
	ire_t *save_ire;
	ire_t **irep = NULL;
	boolean_t first = B_TRUE;
	ire_t *clear_ire = NULL;
	ire_t *start_ire = NULL;
	ire_t *new_lb_ire;
	ire_t *new_nlb_ire;
	boolean_t new_lb_ire_used = B_FALSE;
	boolean_t new_nlb_ire_used = B_FALSE;
	uint64_t match_flags;
	uint64_t phyi_flags;
	boolean_t fallback = B_FALSE;

	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
	    MATCH_IRE_TYPE);
	/*
	 * We may not be able to find some ires if a previous
	 * ire_create failed. This happens when an ipif goes
	 * down and we are unable to create BROADCAST ires due
	 * to memory failure. Thus, we have to check for NULL
	 * below. This should handle the case for LOOPBACK,
	 * POINTOPOINT and interfaces with some POINTOPOINT
	 * logicals for which there are no BROADCAST ires.
	 */
	if (ire == NULL)
		return;
	/*
	 * Currently IRE_BROADCASTS are deleted when an ipif
	 * goes down which runs exclusively. Thus, setting
	 * IRE_MARK_NORECV should not race with ire_delete marking
	 * IRE_MARK_CONDEMNED. We grab the lock below just to
	 * be consistent with other parts of the code that walks
	 * a given bucket.
	 */
	save_ire = ire;
	irb = ire->ire_bucket;
	/*
	 * Allocate both replacement ires up front, before the bucket lock
	 * is taken; whichever is not used is freed at the end.
	 */
	new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
	if (new_lb_ire == NULL) {
		ire_refrele(ire);
		return;
	}
	new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
	if (new_nlb_ire == NULL) {
		ire_refrele(ire);
		kmem_cache_free(ire_cache, new_lb_ire);
		return;
	}
	IRB_REFHOLD(irb);
	rw_enter(&irb->irb_lock, RW_WRITER);
	/*
	 * Get to the first ire matching the address and the
	 * group. If the address does not match we are done
	 * as we could not find the IRE. If the address matches
	 * we should get to the first one matching the group.
	 */
	while (ire != NULL) {
		if (ire->ire_addr != addr ||
		    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
			break;
		}
		ire = ire->ire_next;
	}
	match_flags = PHYI_FAILED | PHYI_INACTIVE;
	start_ire = ire;
redo:
	while (ire != NULL && ire->ire_addr == addr &&
	    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
		/*
		 * The first ire for any address within a group
		 * should always be the one with IRE_MARK_NORECV cleared
		 * so that ip_wput_ire can avoid searching for one.
		 * Note down the insertion point which will be used
		 * later.
		 */
		if (first && (irep == NULL))
			irep = ire->ire_ptpn;
		/*
		 * PHYI_FAILED is set when the interface fails.
		 * This interface might have become good, but the
		 * daemon has not yet detected. We should still
		 * not receive on this. PHYI_OFFLINE should never
		 * be picked as this has been offlined and soon
		 * be removed.
		 */
		phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
		if (phyi_flags & PHYI_OFFLINE) {
			ire->ire_marks |= IRE_MARK_NORECV;
			ire = ire->ire_next;
			continue;
		}
		if (phyi_flags & match_flags) {
			ire->ire_marks |= IRE_MARK_NORECV;
			ire = ire->ire_next;
			/* INACTIVE but not FAILED: usable on a second pass. */
			if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
			    PHYI_INACTIVE) {
				fallback = B_TRUE;
			}
			continue;
		}
		if (first) {
			/*
			 * We will move this to the front of the list later
			 * on.
			 */
			clear_ire = ire;
			ire->ire_marks &= ~IRE_MARK_NORECV;
		} else {
			ire->ire_marks |= IRE_MARK_NORECV;
		}
		first = B_FALSE;
		ire = ire->ire_next;
	}
	/*
	 * If we never nominated anybody, try nominating at least
	 * an INACTIVE, if we found one. Do it only once though.
	 */
	if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
	    fallback) {
		match_flags = PHYI_FAILED;
		ire = start_ire;
		irep = NULL;
		goto redo;
	}
	ire_refrele(save_ire);

	/*
	 * irep non-NULL indicates that we entered the while loop
	 * above. If clear_ire is at the insertion point, we don't
	 * have to do anything. clear_ire will be NULL if all the
	 * interfaces are failed.
	 *
	 * We cannot unlink and reinsert the ire at the right place
	 * in the list since there can be other walkers of this bucket.
	 * Instead we delete and recreate the ire
	 */
	if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
		ire_t *clear_ire_stq = NULL;
		bzero(new_lb_ire, sizeof (ire_t));
		/* XXX We need a recovery strategy here. */
		if (ire_init(new_lb_ire,
		    (uchar_t *)&clear_ire->ire_addr,
		    (uchar_t *)&clear_ire->ire_mask,
		    (uchar_t *)&clear_ire->ire_src_addr,
		    (uchar_t *)&clear_ire->ire_gateway_addr,
		    (uchar_t *)&clear_ire->ire_in_src_addr,
		    &clear_ire->ire_max_frag,
		    clear_ire->ire_fp_mp,
		    clear_ire->ire_rfq,
		    clear_ire->ire_stq,
		    clear_ire->ire_type,
		    clear_ire->ire_dlureq_mp,
		    clear_ire->ire_ipif,
		    clear_ire->ire_in_ill,
		    clear_ire->ire_cmask,
		    clear_ire->ire_phandle,
		    clear_ire->ire_ihandle,
		    clear_ire->ire_flags,
		    &clear_ire->ire_uinfo) == NULL)
			cmn_err(CE_PANIC, "ire_init() failed");
		/*
		 * clear_ire has no send queue: if the next ire in the
		 * bucket has one and is for the same address and ill,
		 * it is replaced along with clear_ire.
		 */
		if (clear_ire->ire_stq == NULL) {
			ire_t *ire_next = clear_ire->ire_next;
			if (ire_next != NULL &&
			    ire_next->ire_stq != NULL &&
			    ire_next->ire_addr == clear_ire->ire_addr &&
			    ire_next->ire_ipif->ipif_ill ==
			    clear_ire->ire_ipif->ipif_ill) {
				clear_ire_stq = ire_next;

				bzero(new_nlb_ire, sizeof (ire_t));
				/* XXX We need a recovery strategy here. */
				if (ire_init(new_nlb_ire,
				    (uchar_t *)&clear_ire_stq->ire_addr,
				    (uchar_t *)&clear_ire_stq->ire_mask,
				    (uchar_t *)&clear_ire_stq->ire_src_addr,
				    (uchar_t *)&clear_ire_stq->ire_gateway_addr,
				    (uchar_t *)&clear_ire_stq->ire_in_src_addr,
				    &clear_ire_stq->ire_max_frag,
				    clear_ire_stq->ire_fp_mp,
				    clear_ire_stq->ire_rfq,
				    clear_ire_stq->ire_stq,
				    clear_ire_stq->ire_type,
				    clear_ire_stq->ire_dlureq_mp,
				    clear_ire_stq->ire_ipif,
				    clear_ire_stq->ire_in_ill,
				    clear_ire_stq->ire_cmask,
				    clear_ire_stq->ire_phandle,
				    clear_ire_stq->ire_ihandle,
				    clear_ire_stq->ire_flags,
				    &clear_ire_stq->ire_uinfo) == NULL)
					cmn_err(CE_PANIC, "ire_init() failed");
			}
		}

		/*
		 * Delete the ire. We can't call ire_delete() since
		 * we are holding the bucket lock. We can't release the
		 * bucket lock since we can't allow irep to change. So just
		 * mark it CONDEMNED. The IRB_REFRELE will delete the
		 * ire from the list and do the refrele.
		 */
		clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
		irb->irb_marks |= IRE_MARK_CONDEMNED;

		if (clear_ire_stq != NULL) {
			ire_fastpath_list_delete(
			    (ill_t *)clear_ire_stq->ire_stq->q_ptr,
			    clear_ire_stq);
			clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
		}

		/*
		 * Also take care of other fields like ib/ob pkt count
		 * etc. Need to dup them. ditto in ill_bcast_delete_and_add
		 */

		/* Add the new ire's. Insert at *irep */
		new_lb_ire->ire_bucket = clear_ire->ire_bucket;
		ire1 = *irep;
		if (ire1 != NULL)
			ire1->ire_ptpn = &new_lb_ire->ire_next;
		new_lb_ire->ire_next = ire1;
		/* Link the new one in. */
		new_lb_ire->ire_ptpn = irep;
		/* Barrier sits between initializing the ire and publishing it. */
		membar_producer();
		*irep = new_lb_ire;
		new_lb_ire_used = B_TRUE;
		BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
		new_lb_ire->ire_bucket->irb_ire_cnt++;
		new_lb_ire->ire_ipif->ipif_ire_cnt++;

		if (clear_ire_stq != NULL) {
			new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
			/* The second copy goes right after the first one. */
			irep = &new_lb_ire->ire_next;
			/* Add the new ire. Insert at *irep */
			ire1 = *irep;
			if (ire1 != NULL)
				ire1->ire_ptpn = &new_nlb_ire->ire_next;
			new_nlb_ire->ire_next = ire1;
			/* Link the new one in. */
			new_nlb_ire->ire_ptpn = irep;
			membar_producer();
			*irep = new_nlb_ire;
			new_nlb_ire_used = B_TRUE;
			BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
			new_nlb_ire->ire_bucket->irb_ire_cnt++;
			new_nlb_ire->ire_ipif->ipif_ire_cnt++;
			((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++;
		}
	}
	rw_exit(&irb->irb_lock);
	/* Give back whichever preallocated ires were not linked in. */
	if (!new_lb_ire_used)
		kmem_cache_free(ire_cache, new_lb_ire);
	if (!new_nlb_ire_used)
		kmem_cache_free(ire_cache, new_nlb_ire);
	IRB_REFRELE(irb);
}

/*
 * Whenever an ipif goes down we have to renominate a different
 * broadcast ire to receive. Whenever an ipif comes up, we need
 * to make sure that we have only one nominated to receive.
 *
 * Runs ill_mark_bcast() on 0, INADDR_BROADCAST and the net and subnet
 * addresses and broadcast addresses derived from 'ipif'.
 */
static void
ipif_renominate_bcast(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	ipaddr_t subnet_addr;
	ipaddr_t net_addr;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_netmask;
	ipaddr_t addr;
	ill_group_t *illgrp;

	illgrp = ill->ill_group;
	/*
	 * If this is the last ipif going down, it might take
	 * the ill out of the group. In that case ipif_down ->
	 * illgrp_delete takes care of doing the nomination.
	 * ipif_down does not call for this case.
	 */
	ASSERT(illgrp != NULL);

	/* There could not have been any ires associated with this */
	if (ipif->ipif_subnet == 0)
		return;

	ill_mark_bcast(illgrp, 0);
	ill_mark_bcast(illgrp, INADDR_BROADCAST);

	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET);
	}
	addr = net_mask & ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	net_addr = ~net_mask | addr;
	ill_mark_bcast(illgrp, net_addr);

	subnet_netmask = ipif->ipif_net_mask;
	addr = ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	subnet_addr = ~subnet_netmask | addr;
	ill_mark_bcast(illgrp, subnet_addr);
}

/*
 * Whenever we form or delete ill groups, we need to nominate one set of
 * BROADCAST ires for receiving in the group.
 *
 * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires
 *    have been added, but ill_ipif_up_count is 0. Thus, we don't assert
 *    for ill_ipif_up_count to be non-zero. This is the only case where
 *    ill_ipif_up_count is zero and we would still find the ires.
 *
 * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one
 *    ipif is UP and we just have to do the nomination.
 *
 * 3) When ill_handoff_responsibility calls us, some ill has been removed
 *    from the group. So, we have to do the nomination.
 *
 * Because of (3), there could be just one ill in the group. But we have
 * to nominate still as IRE_MARK_NORECV may have been marked on this.
 * Thus, this function does not optimize when there is only one ill as
 * it is not correct for (3).
14512 */ 14513 static void 14514 ill_nominate_bcast_rcv(ill_group_t *illgrp) 14515 { 14516 ill_t *ill; 14517 ipif_t *ipif; 14518 ipaddr_t subnet_addr; 14519 ipaddr_t prev_subnet_addr = 0; 14520 ipaddr_t net_addr; 14521 ipaddr_t prev_net_addr = 0; 14522 ipaddr_t net_mask = 0; 14523 ipaddr_t subnet_netmask; 14524 ipaddr_t addr; 14525 14526 /* 14527 * When the last memeber is leaving, there is nothing to 14528 * nominate. 14529 */ 14530 if (illgrp->illgrp_ill_count == 0) { 14531 ASSERT(illgrp->illgrp_ill == NULL); 14532 return; 14533 } 14534 14535 ill = illgrp->illgrp_ill; 14536 ASSERT(!ill->ill_isv6); 14537 /* 14538 * We assume that ires with same address and belonging to the 14539 * same group, has been grouped together. Nominating a *single* 14540 * ill in the group for sending and receiving broadcast is done 14541 * by making sure that the first BROADCAST ire (which will be 14542 * the one returned by ire_ctable_lookup for ip_rput and the 14543 * one that will be used in ip_wput_ire) will be the one that 14544 * will not have IRE_MARK_NORECV set. 14545 * 14546 * 1) ip_rput checks and discards packets received on ires marked 14547 * with IRE_MARK_NORECV. Thus, we don't send up duplicate 14548 * broadcast packets. We need to clear IRE_MARK_NORECV on the 14549 * first ire in the group for every broadcast address in the group. 14550 * ip_rput will accept packets only on the first ire i.e only 14551 * one copy of the ill. 14552 * 14553 * 2) ip_wput_ire needs to send out just one copy of the broadcast 14554 * packet for the whole group. It needs to send out on the ill 14555 * whose ire has not been marked with IRE_MARK_NORECV. If it sends 14556 * on the one marked with IRE_MARK_NORECV, ip_rput will accept 14557 * the copy echoed back on other port where the ire is not marked 14558 * with IRE_MARK_NORECV. 
14559 * 14560 * Note that we just need to have the first IRE either loopback or 14561 * non-loopback (either of them may not exist if ire_create failed 14562 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will 14563 * always hit the first one and hence will always accept one copy. 14564 * 14565 * We have a broadcast ire per ill for all the unique prefixes 14566 * hosted on that ill. As we don't have a way of knowing the 14567 * unique prefixes on a given ill and hence in the whole group, 14568 * we just call ill_mark_bcast on all the prefixes that exist 14569 * in the group. For the common case of one prefix, the code 14570 * below optimizes by remebering the last address used for 14571 * markng. In the case of multiple prefixes, this will still 14572 * optimize depending the order of prefixes. 14573 * 14574 * The only unique address across the whole group is 0.0.0.0 and 14575 * 255.255.255.255 and thus we call only once. ill_mark_bcast enables 14576 * the first ire in the bucket for receiving and disables the 14577 * others. 
14578 */ 14579 ill_mark_bcast(illgrp, 0); 14580 ill_mark_bcast(illgrp, INADDR_BROADCAST); 14581 for (; ill != NULL; ill = ill->ill_group_next) { 14582 14583 for (ipif = ill->ill_ipif; ipif != NULL; 14584 ipif = ipif->ipif_next) { 14585 14586 if (!(ipif->ipif_flags & IPIF_UP) || 14587 ipif->ipif_subnet == 0) { 14588 continue; 14589 } 14590 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14591 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14592 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14593 } else { 14594 net_mask = htonl(IN_CLASSA_NET); 14595 } 14596 addr = net_mask & ipif->ipif_subnet; 14597 if (prev_net_addr == 0 || prev_net_addr != addr) { 14598 ill_mark_bcast(illgrp, addr); 14599 net_addr = ~net_mask | addr; 14600 ill_mark_bcast(illgrp, net_addr); 14601 } 14602 prev_net_addr = addr; 14603 14604 subnet_netmask = ipif->ipif_net_mask; 14605 addr = ipif->ipif_subnet; 14606 if (prev_subnet_addr == 0 || 14607 prev_subnet_addr != addr) { 14608 ill_mark_bcast(illgrp, addr); 14609 subnet_addr = ~subnet_netmask | addr; 14610 ill_mark_bcast(illgrp, subnet_addr); 14611 } 14612 prev_subnet_addr = addr; 14613 } 14614 } 14615 } 14616 14617 /* 14618 * This function is called while forming ill groups. 14619 * 14620 * Currently, we handle only allmulti groups. We want to join 14621 * allmulti on only one of the ills in the groups. In future, 14622 * when we have link aggregation, we may have to join normal 14623 * multicast groups on multiple ills as switch does inbound load 14624 * balancing. Following are the functions that calls this 14625 * function : 14626 * 14627 * 1) ill_recover_multicast : Interface is coming back UP. 14628 * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 14629 * will call ill_recover_multicast to recover all the multicast 14630 * groups. We need to make sure that only one member is joined 14631 * in the ill group. 14632 * 14633 * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. 14634 * Somebody is joining allmulti. 
We need to make sure that only one 14635 * member is joined in the group. 14636 * 14637 * 3) illgrp_insert : If allmulti has already joined, we need to make 14638 * sure that only one member is joined in the group. 14639 * 14640 * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving 14641 * allmulti who we have nominated. We need to pick someother ill. 14642 * 14643 * 5) illgrp_delete : The ill we nominated is leaving the group, 14644 * we need to pick a new ill to join the group. 14645 * 14646 * For (1), (2), (5) - we just have to check whether there is 14647 * a good ill joined in the group. If we could not find any ills 14648 * joined the group, we should join. 14649 * 14650 * For (4), the one that was nominated to receive, left the group. 14651 * There could be nobody joined in the group when this function is 14652 * called. 14653 * 14654 * For (3) - we need to explicitly check whether there are multiple 14655 * ills joined in the group. 14656 * 14657 * For simplicity, we don't differentiate any of the above cases. We 14658 * just leave the group if it is joined on any of them and join on 14659 * the first good ill. 14660 */ 14661 int 14662 ill_nominate_mcast_rcv(ill_group_t *illgrp) 14663 { 14664 ilm_t *ilm; 14665 ill_t *ill; 14666 ill_t *fallback_inactive_ill = NULL; 14667 ill_t *fallback_failed_ill = NULL; 14668 int ret = 0; 14669 14670 /* 14671 * Leave the allmulti on all the ills and start fresh. 14672 */ 14673 for (ill = illgrp->illgrp_ill; ill != NULL; 14674 ill = ill->ill_group_next) { 14675 if (ill->ill_join_allmulti) 14676 (void) ip_leave_allmulti(ill->ill_ipif); 14677 } 14678 14679 /* 14680 * Choose a good ill. Fallback to inactive or failed if 14681 * none available. We need to fallback to FAILED in the 14682 * case where we have 2 interfaces in a group - where 14683 * one of them is failed and another is a good one and 14684 * the good one (not marked inactive) is leaving the group. 
14685 */ 14686 ret = 0; 14687 for (ill = illgrp->illgrp_ill; ill != NULL; 14688 ill = ill->ill_group_next) { 14689 /* Never pick an offline interface */ 14690 if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) 14691 continue; 14692 14693 if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { 14694 fallback_failed_ill = ill; 14695 continue; 14696 } 14697 if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { 14698 fallback_inactive_ill = ill; 14699 continue; 14700 } 14701 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14702 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 14703 ret = ip_join_allmulti(ill->ill_ipif); 14704 /* 14705 * ip_join_allmulti can fail because of memory 14706 * failures. So, make sure we join at least 14707 * on one ill. 14708 */ 14709 if (ill->ill_join_allmulti) 14710 return (0); 14711 } 14712 } 14713 } 14714 if (ret != 0) { 14715 /* 14716 * If we tried nominating above and failed to do so, 14717 * return error. We might have tried multiple times. 14718 * But, return the latest error. 14719 */ 14720 return (ret); 14721 } 14722 if ((ill = fallback_inactive_ill) != NULL) { 14723 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14724 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 14725 ret = ip_join_allmulti(ill->ill_ipif); 14726 return (ret); 14727 } 14728 } 14729 } else if ((ill = fallback_failed_ill) != NULL) { 14730 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14731 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 14732 ret = ip_join_allmulti(ill->ill_ipif); 14733 return (ret); 14734 } 14735 } 14736 } 14737 return (0); 14738 } 14739 14740 /* 14741 * This function is called from illgrp_delete after it is 14742 * deleted from the group to reschedule responsibilities 14743 * to a different ill. 
14744 */ 14745 static void 14746 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 14747 { 14748 ilm_t *ilm; 14749 ipif_t *ipif; 14750 ipaddr_t subnet_addr; 14751 ipaddr_t net_addr; 14752 ipaddr_t net_mask = 0; 14753 ipaddr_t subnet_netmask; 14754 ipaddr_t addr; 14755 14756 ASSERT(ill->ill_group == NULL); 14757 /* 14758 * Broadcast Responsibility: 14759 * 14760 * 1. If this ill has been nominated for receiving broadcast 14761 * packets, we need to find a new one. Before we find a new 14762 * one, we need to re-group the ires that are part of this new 14763 * group (assumed by ill_nominate_bcast_rcv). We do this by 14764 * calling ill_group_bcast_for_xmit(ill) which will do the right 14765 * thing for us. 14766 * 14767 * 2. If this ill was not nominated for receiving broadcast 14768 * packets, we need to clear the IRE_MARK_NORECV flag 14769 * so that we continue to send up broadcast packets. 14770 */ 14771 if (!ill->ill_isv6) { 14772 /* 14773 * Case 1 above : No optimization here. Just redo the 14774 * nomination. 14775 */ 14776 ill_group_bcast_for_xmit(ill); 14777 ill_nominate_bcast_rcv(illgrp); 14778 14779 /* 14780 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 
14781 */ 14782 ill_clear_bcast_mark(ill, 0); 14783 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 14784 14785 for (ipif = ill->ill_ipif; ipif != NULL; 14786 ipif = ipif->ipif_next) { 14787 14788 if (!(ipif->ipif_flags & IPIF_UP) || 14789 ipif->ipif_subnet == 0) { 14790 continue; 14791 } 14792 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14793 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14794 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14795 } else { 14796 net_mask = htonl(IN_CLASSA_NET); 14797 } 14798 addr = net_mask & ipif->ipif_subnet; 14799 ill_clear_bcast_mark(ill, addr); 14800 14801 net_addr = ~net_mask | addr; 14802 ill_clear_bcast_mark(ill, net_addr); 14803 14804 subnet_netmask = ipif->ipif_net_mask; 14805 addr = ipif->ipif_subnet; 14806 ill_clear_bcast_mark(ill, addr); 14807 14808 subnet_addr = ~subnet_netmask | addr; 14809 ill_clear_bcast_mark(ill, subnet_addr); 14810 } 14811 } 14812 14813 /* 14814 * Multicast Responsibility. 14815 * 14816 * If we have joined allmulti on this one, find a new member 14817 * in the group to join allmulti. As this ill is already part 14818 * of allmulti, we don't have to join on this one. 14819 * 14820 * If we have not joined allmulti on this one, there is no 14821 * responsibility to handoff. But we need to take new 14822 * responsibility i.e, join allmulti on this one if we need 14823 * to. 14824 */ 14825 if (ill->ill_join_allmulti) { 14826 (void) ill_nominate_mcast_rcv(illgrp); 14827 } else { 14828 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 14829 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 14830 (void) ip_join_allmulti(ill->ill_ipif); 14831 break; 14832 } 14833 } 14834 } 14835 14836 /* 14837 * We intentionally do the flushing of IRE_CACHES only matching 14838 * on the ill and not on groups. Note that we are already deleted 14839 * from the group. 
14840 * 14841 * This will make sure that all IRE_CACHES whose stq is pointing 14842 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 14843 * deleted and IRE_CACHES that are not pointing at this ill will 14844 * be left alone. 14845 */ 14846 if (ill->ill_isv6) { 14847 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 14848 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 14849 } else { 14850 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 14851 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 14852 } 14853 14854 /* 14855 * Some conn may have cached one of the IREs deleted above. By removing 14856 * the ire reference, we clean up the extra reference to the ill held in 14857 * ire->ire_stq. 14858 */ 14859 ipcl_walk(conn_cleanup_stale_ire, NULL); 14860 14861 /* 14862 * Re-do source address selection for all the members in the 14863 * group, if they borrowed source address from one of the ipifs 14864 * in this ill. 14865 */ 14866 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14867 if (ill->ill_isv6) { 14868 ipif_update_other_ipifs_v6(ipif, illgrp); 14869 } else { 14870 ipif_update_other_ipifs(ipif, illgrp); 14871 } 14872 } 14873 } 14874 14875 /* 14876 * Delete the ill from the group. The caller makes sure that it is 14877 * in a group and it okay to delete from the group. So, we always 14878 * delete here. 14879 */ 14880 static void 14881 illgrp_delete(ill_t *ill) 14882 { 14883 ill_group_t *illgrp; 14884 ill_group_t *tmpg; 14885 ill_t *tmp_ill; 14886 14887 /* 14888 * Reset illgrp_ill_schednext if it was pointing at us. 14889 * We need to do this before we set ill_group to NULL. 14890 */ 14891 rw_enter(&ill_g_lock, RW_WRITER); 14892 mutex_enter(&ill->ill_lock); 14893 14894 illgrp_reset_schednext(ill); 14895 14896 illgrp = ill->ill_group; 14897 14898 /* Delete the ill from illgrp. 
*/ 14899 if (illgrp->illgrp_ill == ill) { 14900 illgrp->illgrp_ill = ill->ill_group_next; 14901 } else { 14902 tmp_ill = illgrp->illgrp_ill; 14903 while (tmp_ill->ill_group_next != ill) { 14904 tmp_ill = tmp_ill->ill_group_next; 14905 ASSERT(tmp_ill != NULL); 14906 } 14907 tmp_ill->ill_group_next = ill->ill_group_next; 14908 } 14909 ill->ill_group = NULL; 14910 ill->ill_group_next = NULL; 14911 14912 illgrp->illgrp_ill_count--; 14913 mutex_exit(&ill->ill_lock); 14914 rw_exit(&ill_g_lock); 14915 14916 /* 14917 * As this ill is leaving the group, we need to hand off 14918 * the responsibilities to the other ills in the group, if 14919 * this ill had some responsibilities. 14920 */ 14921 14922 ill_handoff_responsibility(ill, illgrp); 14923 14924 rw_enter(&ill_g_lock, RW_WRITER); 14925 14926 if (illgrp->illgrp_ill_count == 0) { 14927 14928 ASSERT(illgrp->illgrp_ill == NULL); 14929 if (ill->ill_isv6) { 14930 if (illgrp == illgrp_head_v6) { 14931 illgrp_head_v6 = illgrp->illgrp_next; 14932 } else { 14933 tmpg = illgrp_head_v6; 14934 while (tmpg->illgrp_next != illgrp) { 14935 tmpg = tmpg->illgrp_next; 14936 ASSERT(tmpg != NULL); 14937 } 14938 tmpg->illgrp_next = illgrp->illgrp_next; 14939 } 14940 } else { 14941 if (illgrp == illgrp_head_v4) { 14942 illgrp_head_v4 = illgrp->illgrp_next; 14943 } else { 14944 tmpg = illgrp_head_v4; 14945 while (tmpg->illgrp_next != illgrp) { 14946 tmpg = tmpg->illgrp_next; 14947 ASSERT(tmpg != NULL); 14948 } 14949 tmpg->illgrp_next = illgrp->illgrp_next; 14950 } 14951 } 14952 mutex_destroy(&illgrp->illgrp_lock); 14953 mi_free(illgrp); 14954 } 14955 rw_exit(&ill_g_lock); 14956 14957 /* 14958 * Even though the ill is out of the group its not necessary 14959 * to set ipsq_split as TRUE as the ipifs could be down temporarily 14960 * We will split the ipsq when phyint_groupname is set to NULL. 14961 */ 14962 14963 /* 14964 * Send a routing sockets message if we are deleting from 14965 * groups with names. 
14966 */ 14967 if (ill->ill_phyint->phyint_groupname_len != 0) 14968 ip_rts_ifmsg(ill->ill_ipif); 14969 } 14970 14971 /* 14972 * Re-do source address selection. This is normally called when 14973 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 14974 * ipif comes up. 14975 */ 14976 void 14977 ill_update_source_selection(ill_t *ill) 14978 { 14979 ipif_t *ipif; 14980 14981 ASSERT(IAM_WRITER_ILL(ill)); 14982 14983 if (ill->ill_group != NULL) 14984 ill = ill->ill_group->illgrp_ill; 14985 14986 for (; ill != NULL; ill = ill->ill_group_next) { 14987 for (ipif = ill->ill_ipif; ipif != NULL; 14988 ipif = ipif->ipif_next) { 14989 if (ill->ill_isv6) 14990 ipif_recreate_interface_routes_v6(NULL, ipif); 14991 else 14992 ipif_recreate_interface_routes(NULL, ipif); 14993 } 14994 } 14995 } 14996 14997 /* 14998 * Insert ill in a group headed by illgrp_head. The caller can either 14999 * pass a groupname in which case we search for a group with the 15000 * same name to insert in or pass a group to insert in. This function 15001 * would only search groups with names. 15002 * 15003 * NOTE : The caller should make sure that there is at least one ipif 15004 * UP on this ill so that illgrp_scheduler can pick this ill 15005 * for outbound packets. If ill_ipif_up_count is zero, we have 15006 * already sent a DL_UNBIND to the driver and we don't want to 15007 * send anymore packets. We don't assert for ipif_up_count 15008 * to be greater than zero, because ipif_up_done wants to call 15009 * this function before bumping up the ipif_up_count. See 15010 * ipif_up_done() for details. 
15011 */ 15012 int 15013 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 15014 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 15015 { 15016 ill_group_t *illgrp; 15017 ill_t *prev_ill; 15018 phyint_t *phyi; 15019 15020 ASSERT(ill->ill_group == NULL); 15021 15022 rw_enter(&ill_g_lock, RW_WRITER); 15023 mutex_enter(&ill->ill_lock); 15024 15025 if (groupname != NULL) { 15026 /* 15027 * Look for a group with a matching groupname to insert. 15028 */ 15029 for (illgrp = *illgrp_head; illgrp != NULL; 15030 illgrp = illgrp->illgrp_next) { 15031 15032 ill_t *tmp_ill; 15033 15034 /* 15035 * If we have an ill_group_t in the list which has 15036 * no ill_t assigned then we must be in the process of 15037 * removing this group. We skip this as illgrp_delete() 15038 * will remove it from the list. 15039 */ 15040 if ((tmp_ill = illgrp->illgrp_ill) == NULL) { 15041 ASSERT(illgrp->illgrp_ill_count == 0); 15042 continue; 15043 } 15044 15045 ASSERT(tmp_ill->ill_phyint != NULL); 15046 phyi = tmp_ill->ill_phyint; 15047 /* 15048 * Look at groups which has names only. 15049 */ 15050 if (phyi->phyint_groupname_len == 0) 15051 continue; 15052 /* 15053 * Names are stored in the phyint common to both 15054 * IPv4 and IPv6. 15055 */ 15056 if (mi_strcmp(phyi->phyint_groupname, 15057 groupname) == 0) { 15058 break; 15059 } 15060 } 15061 } else { 15062 /* 15063 * If the caller passes in a NULL "grp_to_insert", we 15064 * allocate one below and insert this singleton. 
15065 */ 15066 illgrp = grp_to_insert; 15067 } 15068 15069 ill->ill_group_next = NULL; 15070 15071 if (illgrp == NULL) { 15072 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); 15073 if (illgrp == NULL) { 15074 return (ENOMEM); 15075 } 15076 illgrp->illgrp_next = *illgrp_head; 15077 *illgrp_head = illgrp; 15078 illgrp->illgrp_ill = ill; 15079 illgrp->illgrp_ill_count = 1; 15080 ill->ill_group = illgrp; 15081 /* 15082 * Used in illgrp_scheduler to protect multiple threads 15083 * from traversing the list. 15084 */ 15085 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); 15086 } else { 15087 ASSERT(ill->ill_net_type == 15088 illgrp->illgrp_ill->ill_net_type); 15089 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); 15090 15091 /* Insert ill at tail of this group */ 15092 prev_ill = illgrp->illgrp_ill; 15093 while (prev_ill->ill_group_next != NULL) 15094 prev_ill = prev_ill->ill_group_next; 15095 prev_ill->ill_group_next = ill; 15096 ill->ill_group = illgrp; 15097 illgrp->illgrp_ill_count++; 15098 /* 15099 * Inherit group properties. Currently only forwarding 15100 * is the property we try to keep the same with all the 15101 * ills. When there are more, we will abstract this into 15102 * a function. 15103 */ 15104 ill->ill_flags &= ~ILLF_ROUTER; 15105 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); 15106 } 15107 mutex_exit(&ill->ill_lock); 15108 rw_exit(&ill_g_lock); 15109 15110 /* 15111 * 1) When ipif_up_done() calls this function, ipif_up_count 15112 * may be zero as it has not yet been bumped. But the ires 15113 * have already been added. So, we do the nomination here 15114 * itself. But, when ip_sioctl_groupname calls this, it checks 15115 * for ill_ipif_up_count != 0. Thus we don't check for 15116 * ill_ipif_up_count here while nominating broadcast ires for 15117 * receive. 
15118 * 15119 * 2) Similarly, we need to call ill_group_bcast_for_xmit here 15120 * to group them properly as ire_add() has already happened 15121 * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert 15122 * case, we need to do it here anyway. 15123 */ 15124 if (!ill->ill_isv6) { 15125 ill_group_bcast_for_xmit(ill); 15126 ill_nominate_bcast_rcv(illgrp); 15127 } 15128 15129 if (!ipif_is_coming_up) { 15130 /* 15131 * When ipif_up_done() calls this function, the multicast 15132 * groups have not been joined yet. So, there is no point in 15133 * nomination. ip_join_allmulti will handle groups when 15134 * ill_recover_multicast is called from ipif_up_done() later. 15135 */ 15136 (void) ill_nominate_mcast_rcv(illgrp); 15137 /* 15138 * ipif_up_done calls ill_update_source_selection 15139 * anyway. Moreover, we don't want to re-create 15140 * interface routes while ipif_up_done() still has reference 15141 * to them. Refer to ipif_up_done() for more details. 15142 */ 15143 ill_update_source_selection(ill); 15144 } 15145 15146 /* 15147 * Send a routing sockets message if we are inserting into 15148 * groups with names. 15149 */ 15150 if (groupname != NULL) 15151 ip_rts_ifmsg(ill->ill_ipif); 15152 return (0); 15153 } 15154 15155 /* 15156 * Return the first phyint matching the groupname. There could 15157 * be more than one when there are ill groups. 15158 * 15159 * Needs work: called only from ip_sioctl_groupname 15160 */ 15161 static phyint_t * 15162 phyint_lookup_group(char *groupname) 15163 { 15164 phyint_t *phyi; 15165 15166 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 15167 /* 15168 * Group names are stored in the phyint - a common structure 15169 * to both IPv4 and IPv6. 
15170 */ 15171 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 15172 for (; phyi != NULL; 15173 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 15174 phyi, AVL_AFTER)) { 15175 if (phyi->phyint_groupname_len == 0) 15176 continue; 15177 ASSERT(phyi->phyint_groupname != NULL); 15178 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) 15179 return (phyi); 15180 } 15181 return (NULL); 15182 } 15183 15184 15185 15186 /* 15187 * MT notes on creation and deletion of IPMP groups 15188 * 15189 * Creation and deletion of IPMP groups introduce the need to merge or 15190 * split the associated serialization objects i.e the ipsq's. Normally all 15191 * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled 15192 * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during 15193 * the execution of the SIOCSLIFGROUPNAME command the picture changes. There 15194 * is a need to change the <ill-ipsq> association and we have to operate on both 15195 * the source and destination IPMP groups. For eg. attempting to set the 15196 * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to 15197 * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the 15198 * source or destination IPMP group are mapped to a single ipsq for executing 15199 * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. 15200 * The <ill-ipsq> mapping is restored back to normal at a later point. This is 15201 * termed as a split of the ipsq. The converse of the merge i.e. a split of the 15202 * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname 15203 * occurred on the ipsq, then the ipsq_split flag is set. This indicates the 15204 * ipsq has to be examined for redoing the <ill-ipsq> associations. 15205 * 15206 * In the above example the ioctl handling code locates the current ipsq of hme0 15207 * which is ipsq(mpk17-84). 
It then enters the above ipsq immediately or 15208 * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates 15209 * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into 15210 * the destination ipsq. If the destination ipsq is not busy, it also enters 15211 * the destination ipsq exclusively. Now the actual groupname setting operation 15212 * can proceed. If the destination ipsq is busy, the operation is enqueued 15213 * on the destination (merged) ipsq and will be handled in the unwind from 15214 * ipsq_exit. 15215 * 15216 * To prevent other threads accessing the ill while the group name change is 15217 * in progres, we bring down the ipifs which also removes the ill from the 15218 * group. The group is changed in phyint and when the first ipif on the ill 15219 * is brought up, the ill is inserted into the right IPMP group by 15220 * illgrp_insert. 15221 */ 15222 /* ARGSUSED */ 15223 int 15224 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15225 ip_ioctl_cmd_t *ipip, void *ifreq) 15226 { 15227 int i; 15228 char *tmp; 15229 int namelen; 15230 ill_t *ill = ipif->ipif_ill; 15231 ill_t *ill_v4, *ill_v6; 15232 int err = 0; 15233 phyint_t *phyi; 15234 phyint_t *phyi_tmp; 15235 struct lifreq *lifr; 15236 mblk_t *mp1; 15237 char *groupname; 15238 ipsq_t *ipsq; 15239 15240 ASSERT(IAM_WRITER_IPIF(ipif)); 15241 15242 /* Existance verified in ip_wput_nondata */ 15243 mp1 = mp->b_cont->b_cont; 15244 lifr = (struct lifreq *)mp1->b_rptr; 15245 groupname = lifr->lifr_groupname; 15246 15247 if (ipif->ipif_id != 0) 15248 return (EINVAL); 15249 15250 phyi = ill->ill_phyint; 15251 ASSERT(phyi != NULL); 15252 15253 if (phyi->phyint_flags & PHYI_VIRTUAL) 15254 return (EINVAL); 15255 15256 tmp = groupname; 15257 for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) 15258 ; 15259 15260 if (i == LIFNAMSIZ) { 15261 /* no null termination */ 15262 return (EINVAL); 15263 } 15264 15265 /* 15266 * Calculate the namelen exclusive 
of the null 15267 * termination character. 15268 */ 15269 namelen = tmp - groupname; 15270 15271 ill_v4 = phyi->phyint_illv4; 15272 ill_v6 = phyi->phyint_illv6; 15273 15274 /* 15275 * ILL cannot be part of a usesrc group and and IPMP group at the 15276 * same time. No need to grab the ill_g_usesrc_lock here, see 15277 * synchronization notes in ip.c 15278 */ 15279 if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 15280 return (EINVAL); 15281 } 15282 15283 /* 15284 * mark the ill as changing. 15285 * this should queue all new requests on the syncq. 15286 */ 15287 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15288 15289 if (ill_v4 != NULL) 15290 ill_v4->ill_state_flags |= ILL_CHANGING; 15291 if (ill_v6 != NULL) 15292 ill_v6->ill_state_flags |= ILL_CHANGING; 15293 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15294 15295 if (namelen == 0) { 15296 /* 15297 * Null string means remove this interface from the 15298 * existing group. 15299 */ 15300 if (phyi->phyint_groupname_len == 0) { 15301 /* 15302 * Never was in a group. 15303 */ 15304 err = 0; 15305 goto done; 15306 } 15307 15308 /* 15309 * IPv4 or IPv6 may be temporarily out of the group when all 15310 * the ipifs are down. Thus, we need to check for ill_group to 15311 * be non-NULL. 
15312 */ 15313 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 15314 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 15315 mutex_enter(&ill_v4->ill_lock); 15316 if (!ill_is_quiescent(ill_v4)) { 15317 /* 15318 * ipsq_pending_mp_add will not fail since 15319 * connp is NULL 15320 */ 15321 (void) ipsq_pending_mp_add(NULL, 15322 ill_v4->ill_ipif, q, mp, ILL_DOWN); 15323 mutex_exit(&ill_v4->ill_lock); 15324 err = EINPROGRESS; 15325 goto done; 15326 } 15327 mutex_exit(&ill_v4->ill_lock); 15328 } 15329 15330 if (ill_v6 != NULL && ill_v6->ill_group != NULL) { 15331 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 15332 mutex_enter(&ill_v6->ill_lock); 15333 if (!ill_is_quiescent(ill_v6)) { 15334 (void) ipsq_pending_mp_add(NULL, 15335 ill_v6->ill_ipif, q, mp, ILL_DOWN); 15336 mutex_exit(&ill_v6->ill_lock); 15337 err = EINPROGRESS; 15338 goto done; 15339 } 15340 mutex_exit(&ill_v6->ill_lock); 15341 } 15342 15343 rw_enter(&ill_g_lock, RW_WRITER); 15344 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15345 mutex_enter(&phyi->phyint_lock); 15346 ASSERT(phyi->phyint_groupname != NULL); 15347 mi_free(phyi->phyint_groupname); 15348 phyi->phyint_groupname = NULL; 15349 phyi->phyint_groupname_len = 0; 15350 mutex_exit(&phyi->phyint_lock); 15351 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15352 rw_exit(&ill_g_lock); 15353 err = ill_up_ipifs(ill, q, mp); 15354 15355 /* 15356 * set the split flag so that the ipsq can be split 15357 */ 15358 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15359 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15360 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15361 15362 } else { 15363 if (phyi->phyint_groupname_len != 0) { 15364 ASSERT(phyi->phyint_groupname != NULL); 15365 /* Are we inserting in the same group ? */ 15366 if (mi_strcmp(groupname, 15367 phyi->phyint_groupname) == 0) { 15368 err = 0; 15369 goto done; 15370 } 15371 } 15372 15373 rw_enter(&ill_g_lock, RW_READER); 15374 /* 15375 * Merge ipsq for the group's. 
15376 * This check is here as multiple groups/ills might be 15377 * sharing the same ipsq. 15378 * If we have to merege than the operation is restarted 15379 * on the new ipsq. 15380 */ 15381 ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL); 15382 if (phyi->phyint_ipsq != ipsq) { 15383 rw_exit(&ill_g_lock); 15384 err = ill_merge_groups(ill, NULL, groupname, mp, q); 15385 goto done; 15386 } 15387 /* 15388 * Running exclusive on new ipsq. 15389 */ 15390 15391 ASSERT(ipsq != NULL); 15392 ASSERT(ipsq->ipsq_writer == curthread); 15393 15394 /* 15395 * Check whether the ill_type and ill_net_type matches before 15396 * we allocate any memory so that the cleanup is easier. 15397 * 15398 * We can't group dissimilar ones as we can't load spread 15399 * packets across the group because of potential link-level 15400 * header differences. 15401 */ 15402 phyi_tmp = phyint_lookup_group(groupname); 15403 if (phyi_tmp != NULL) { 15404 if ((ill_v4 != NULL && 15405 phyi_tmp->phyint_illv4 != NULL) && 15406 ((ill_v4->ill_net_type != 15407 phyi_tmp->phyint_illv4->ill_net_type) || 15408 (ill_v4->ill_type != 15409 phyi_tmp->phyint_illv4->ill_type))) { 15410 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15411 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15412 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15413 rw_exit(&ill_g_lock); 15414 return (EINVAL); 15415 } 15416 if ((ill_v6 != NULL && 15417 phyi_tmp->phyint_illv6 != NULL) && 15418 ((ill_v6->ill_net_type != 15419 phyi_tmp->phyint_illv6->ill_net_type) || 15420 (ill_v6->ill_type != 15421 phyi_tmp->phyint_illv6->ill_type))) { 15422 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 15423 phyi->phyint_ipsq->ipsq_split = B_TRUE; 15424 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 15425 rw_exit(&ill_g_lock); 15426 return (EINVAL); 15427 } 15428 } 15429 15430 rw_exit(&ill_g_lock); 15431 15432 /* 15433 * bring down all v4 ipifs. 
15434 */ 15435 if (ill_v4 != NULL) { 15436 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 15437 } 15438 15439 /* 15440 * bring down all v6 ipifs. 15441 */ 15442 if (ill_v6 != NULL) { 15443 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 15444 } 15445 15446 /* 15447 * make sure all ipifs are down and there are no active 15448 * references. Call to ipsq_pending_mp_add will not fail 15449 * since connp is NULL. 15450 */ 15451 if (ill_v4 != NULL) { 15452 mutex_enter(&ill_v4->ill_lock); 15453 if (!ill_is_quiescent(ill_v4)) { 15454 (void) ipsq_pending_mp_add(NULL, 15455 ill_v4->ill_ipif, q, mp, ILL_DOWN); 15456 mutex_exit(&ill_v4->ill_lock); 15457 err = EINPROGRESS; 15458 goto done; 15459 } 15460 mutex_exit(&ill_v4->ill_lock); 15461 } 15462 15463 if (ill_v6 != NULL) { 15464 mutex_enter(&ill_v6->ill_lock); 15465 if (!ill_is_quiescent(ill_v6)) { 15466 (void) ipsq_pending_mp_add(NULL, 15467 ill_v6->ill_ipif, q, mp, ILL_DOWN); 15468 mutex_exit(&ill_v6->ill_lock); 15469 err = EINPROGRESS; 15470 goto done; 15471 } 15472 mutex_exit(&ill_v6->ill_lock); 15473 } 15474 15475 /* 15476 * allocate including space for null terminator 15477 * before we insert. 15478 */ 15479 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 15480 if (tmp == NULL) 15481 return (ENOMEM); 15482 15483 rw_enter(&ill_g_lock, RW_WRITER); 15484 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15485 mutex_enter(&phyi->phyint_lock); 15486 if (phyi->phyint_groupname_len != 0) { 15487 ASSERT(phyi->phyint_groupname != NULL); 15488 mi_free(phyi->phyint_groupname); 15489 } 15490 15491 /* 15492 * setup the new group name. 15493 */ 15494 phyi->phyint_groupname = tmp; 15495 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 15496 phyi->phyint_groupname_len = namelen + 1; 15497 mutex_exit(&phyi->phyint_lock); 15498 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15499 rw_exit(&ill_g_lock); 15500 15501 err = ill_up_ipifs(ill, q, mp); 15502 } 15503 15504 done: 15505 /* 15506 * normally ILL_CHANGING is cleared in ill_up_ipifs. 
15507 */ 15508 if (err != EINPROGRESS) { 15509 GRAB_ILL_LOCKS(ill_v4, ill_v6); 15510 if (ill_v4 != NULL) 15511 ill_v4->ill_state_flags &= ~ILL_CHANGING; 15512 if (ill_v6 != NULL) 15513 ill_v6->ill_state_flags &= ~ILL_CHANGING; 15514 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 15515 } 15516 return (err); 15517 } 15518 15519 /* ARGSUSED */ 15520 int 15521 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 15522 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 15523 { 15524 ill_t *ill; 15525 phyint_t *phyi; 15526 struct lifreq *lifr; 15527 mblk_t *mp1; 15528 15529 /* Existence verified in ip_wput_nondata */ 15530 mp1 = mp->b_cont->b_cont; 15531 lifr = (struct lifreq *)mp1->b_rptr; 15532 ill = ipif->ipif_ill; 15533 phyi = ill->ill_phyint; 15534 15535 lifr->lifr_groupname[0] = '\0'; 15536 /* 15537 * ill_group may be null if all the interfaces 15538 * are down. But still, the phyint should always 15539 * hold the name. 15540 */ 15541 if (phyi->phyint_groupname_len != 0) { 15542 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 15543 phyi->phyint_groupname_len); 15544 } 15545 15546 return (0); 15547 } 15548 15549 15550 typedef struct conn_move_s { 15551 ill_t *cm_from_ill; 15552 ill_t *cm_to_ill; 15553 int cm_ifindex; 15554 } conn_move_t; 15555 15556 /* 15557 * ipcl_walk function for moving conn_multicast_ill for a given ill. 15558 */ 15559 static void 15560 conn_move(conn_t *connp, caddr_t arg) 15561 { 15562 conn_move_t *connm; 15563 int ifindex; 15564 int i; 15565 ill_t *from_ill; 15566 ill_t *to_ill; 15567 ilg_t *ilg; 15568 ilm_t *ret_ilm; 15569 15570 connm = (conn_move_t *)arg; 15571 ifindex = connm->cm_ifindex; 15572 from_ill = connm->cm_from_ill; 15573 to_ill = connm->cm_to_ill; 15574 15575 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. 
*/ 15576 15577 /* All multicast fields protected by conn_lock */ 15578 mutex_enter(&connp->conn_lock); 15579 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 15580 if ((connp->conn_outgoing_ill == from_ill) && 15581 (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { 15582 connp->conn_outgoing_ill = to_ill; 15583 connp->conn_incoming_ill = to_ill; 15584 } 15585 15586 /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ 15587 15588 if ((connp->conn_multicast_ill == from_ill) && 15589 (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { 15590 connp->conn_multicast_ill = connm->cm_to_ill; 15591 } 15592 15593 /* Change IP_XMIT_IF associations */ 15594 if ((connp->conn_xmit_if_ill == from_ill) && 15595 (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) { 15596 connp->conn_xmit_if_ill = to_ill; 15597 } 15598 /* 15599 * Change the ilg_ill to point to the new one. This assumes 15600 * ilm_move_v6 has moved the ilms to new_ill and the driver 15601 * has been told to receive packets on this interface. 15602 * ilm_move_v6 FAILBACKS all the ilms successfully always. 15603 * But when doing a FAILOVER, it might fail with ENOMEM and so 15604 * some ilms may not have moved. We check to see whether 15605 * the ilms have moved to to_ill. We can't check on from_ill 15606 * as in the process of moving, we could have split an ilm 15607 * in to two - which has the same orig_ifindex and v6group. 15608 * 15609 * For IPv4, ilg_ipif moves implicitly. The code below really 15610 * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 
15611 */ 15612 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 15613 ilg = &connp->conn_ilg[i]; 15614 if ((ilg->ilg_ill == from_ill) && 15615 (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { 15616 /* ifindex != 0 indicates failback */ 15617 if (ifindex != 0) { 15618 connp->conn_ilg[i].ilg_ill = to_ill; 15619 continue; 15620 } 15621 15622 ret_ilm = ilm_lookup_ill_index_v6(to_ill, 15623 &ilg->ilg_v6group, ilg->ilg_orig_ifindex, 15624 connp->conn_zoneid); 15625 15626 if (ret_ilm != NULL) 15627 connp->conn_ilg[i].ilg_ill = to_ill; 15628 } 15629 } 15630 mutex_exit(&connp->conn_lock); 15631 } 15632 15633 static void 15634 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) 15635 { 15636 conn_move_t connm; 15637 15638 connm.cm_from_ill = from_ill; 15639 connm.cm_to_ill = to_ill; 15640 connm.cm_ifindex = ifindex; 15641 15642 ipcl_walk(conn_move, (caddr_t)&connm); 15643 } 15644 15645 /* 15646 * ilm has been moved from from_ill to to_ill. 15647 * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill. 15648 * appropriately. 15649 * 15650 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because 15651 * the code there de-references ipif_ill to get the ill to 15652 * send multicast requests. It does not work as ipif is on its 15653 * move and already moved when this function is called. 15654 * Thus, we need to use from_ill and to_ill send down multicast 15655 * requests. 15656 */ 15657 static void 15658 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) 15659 { 15660 ipif_t *ipif; 15661 ilm_t *ilm; 15662 15663 /* 15664 * See whether we need to send down DL_ENABMULTI_REQ on 15665 * to_ill as ilm has just been added. 
15666 */ 15667 ASSERT(IAM_WRITER_ILL(to_ill)); 15668 ASSERT(IAM_WRITER_ILL(from_ill)); 15669 15670 ILM_WALKER_HOLD(to_ill); 15671 for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15672 15673 if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) 15674 continue; 15675 /* 15676 * no locks held, ill/ipif cannot dissappear as long 15677 * as we are writer. 15678 */ 15679 ipif = to_ill->ill_ipif; 15680 /* 15681 * No need to hold any lock as we are the writer and this 15682 * can only be changed by a writer. 15683 */ 15684 ilm->ilm_is_new = B_FALSE; 15685 15686 if (to_ill->ill_net_type != IRE_IF_RESOLVER || 15687 ipif->ipif_flags & IPIF_POINTOPOINT) { 15688 ip1dbg(("ilm_send_multicast_reqs: to_ill not " 15689 "resolver\n")); 15690 continue; /* Must be IRE_IF_NORESOLVER */ 15691 } 15692 15693 15694 if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 15695 ip1dbg(("ilm_send_multicast_reqs: " 15696 "to_ill MULTI_BCAST\n")); 15697 goto from; 15698 } 15699 15700 if (to_ill->ill_isv6) 15701 mld_joingroup(ilm); 15702 else 15703 igmp_joingroup(ilm); 15704 15705 if (to_ill->ill_ipif_up_count == 0) { 15706 /* 15707 * Nobody there. All multicast addresses will be 15708 * re-joined when we get the DL_BIND_ACK bringing the 15709 * interface up. 15710 */ 15711 ilm->ilm_notify_driver = B_FALSE; 15712 ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); 15713 goto from; 15714 } 15715 15716 /* 15717 * For allmulti address, we want to join on only one interface. 15718 * Checking for ilm_numentries_v6 is not correct as you may 15719 * find an ilm with zero address on to_ill, but we may not 15720 * have nominated to_ill for receiving. Thus, if we have 15721 * nominated from_ill (ill_join_allmulti is set), nominate 15722 * only if to_ill is not already nominated (to_ill normally 15723 * should not have been nominated if "from_ill" has already 15724 * been nominated. As we don't prevent failovers from happening 15725 * across groups, we don't assert). 
15726 */ 15727 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15728 /* 15729 * There is no need to hold ill locks as we are 15730 * writer on both ills and when ill_join_allmulti 15731 * is changed the thread is always a writer. 15732 */ 15733 if (from_ill->ill_join_allmulti && 15734 !to_ill->ill_join_allmulti) { 15735 (void) ip_join_allmulti(to_ill->ill_ipif); 15736 } 15737 } else if (ilm->ilm_notify_driver) { 15738 15739 /* 15740 * This is a newly moved ilm so we need to tell the 15741 * driver about the new group. There can be more than 15742 * one ilm's for the same group in the list each with a 15743 * different orig_ifindex. We have to inform the driver 15744 * once. In ilm_move_v[4,6] we only set the flag 15745 * ilm_notify_driver for the first ilm. 15746 */ 15747 15748 (void) ip_ll_send_enabmulti_req(to_ill, 15749 &ilm->ilm_v6addr); 15750 } 15751 15752 ilm->ilm_notify_driver = B_FALSE; 15753 15754 /* 15755 * See whether we need to send down DL_DISABMULTI_REQ on 15756 * from_ill as ilm has just been removed. 15757 */ 15758 from: 15759 ipif = from_ill->ill_ipif; 15760 if (from_ill->ill_net_type != IRE_IF_RESOLVER || 15761 ipif->ipif_flags & IPIF_POINTOPOINT) { 15762 ip1dbg(("ilm_send_multicast_reqs: " 15763 "from_ill not resolver\n")); 15764 continue; /* Must be IRE_IF_NORESOLVER */ 15765 } 15766 15767 if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 15768 ip1dbg(("ilm_send_multicast_reqs: " 15769 "from_ill MULTI_BCAST\n")); 15770 continue; 15771 } 15772 15773 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15774 if (from_ill->ill_join_allmulti) 15775 (void) ip_leave_allmulti(from_ill->ill_ipif); 15776 } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { 15777 (void) ip_ll_send_disabmulti_req(from_ill, 15778 &ilm->ilm_v6addr); 15779 } 15780 } 15781 ILM_WALKER_RELE(to_ill); 15782 } 15783 15784 /* 15785 * This function is called when all multicast memberships needs 15786 * to be moved from "from_ill" to "to_ill" for IPv6. 
This function is 15787 * called only once unlike the IPv4 counterpart where it is called after 15788 * every logical interface is moved. The reason is due to multicast 15789 * memberships are joined using an interface address in IPv4 while in 15790 * IPv6, interface index is used. 15791 */ 15792 static void 15793 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) 15794 { 15795 ilm_t *ilm; 15796 ilm_t *ilm_next; 15797 ilm_t *new_ilm; 15798 ilm_t **ilmp; 15799 int count; 15800 char buf[INET6_ADDRSTRLEN]; 15801 in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; 15802 15803 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 15804 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 15805 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 15806 15807 if (ifindex == 0) { 15808 /* 15809 * Form the solicited node mcast address which is used later. 15810 */ 15811 ipif_t *ipif; 15812 15813 ipif = from_ill->ill_ipif; 15814 ASSERT(ipif->ipif_id == 0); 15815 15816 ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 15817 } 15818 15819 ilmp = &from_ill->ill_ilm; 15820 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 15821 ilm_next = ilm->ilm_next; 15822 15823 if (ilm->ilm_flags & ILM_DELETED) { 15824 ilmp = &ilm->ilm_next; 15825 continue; 15826 } 15827 15828 new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, 15829 ilm->ilm_orig_ifindex, ilm->ilm_zoneid); 15830 ASSERT(ilm->ilm_orig_ifindex != 0); 15831 if (ilm->ilm_orig_ifindex == ifindex) { 15832 /* 15833 * We are failing back multicast memberships. 15834 * If the same ilm exists in to_ill, it means somebody 15835 * has joined the same group there e.g. ff02::1 15836 * is joined within the kernel when the interfaces 15837 * came UP. 
15838 */ 15839 ASSERT(ilm->ilm_ipif == NULL); 15840 if (new_ilm != NULL) { 15841 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 15842 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 15843 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 15844 new_ilm->ilm_is_new = B_TRUE; 15845 } 15846 } else { 15847 /* 15848 * check if we can just move the ilm 15849 */ 15850 if (from_ill->ill_ilm_walker_cnt != 0) { 15851 /* 15852 * We have walkers we cannot move 15853 * the ilm, so allocate a new ilm, 15854 * this (old) ilm will be marked 15855 * ILM_DELETED at the end of the loop 15856 * and will be freed when the 15857 * last walker exits. 15858 */ 15859 new_ilm = (ilm_t *)mi_zalloc 15860 (sizeof (ilm_t)); 15861 if (new_ilm == NULL) { 15862 ip0dbg(("ilm_move_v6: " 15863 "FAILBACK of IPv6" 15864 " multicast address %s : " 15865 "from %s to" 15866 " %s failed : ENOMEM \n", 15867 inet_ntop(AF_INET6, 15868 &ilm->ilm_v6addr, buf, 15869 sizeof (buf)), 15870 from_ill->ill_name, 15871 to_ill->ill_name)); 15872 15873 ilmp = &ilm->ilm_next; 15874 continue; 15875 } 15876 *new_ilm = *ilm; 15877 /* 15878 * we don't want new_ilm linked to 15879 * ilm's filter list. 15880 */ 15881 new_ilm->ilm_filter = NULL; 15882 } else { 15883 /* 15884 * No walkers we can move the ilm. 15885 * lets take it out of the list. 15886 */ 15887 *ilmp = ilm->ilm_next; 15888 ilm->ilm_next = NULL; 15889 new_ilm = ilm; 15890 } 15891 15892 /* 15893 * if this is the first ilm for the group 15894 * set ilm_notify_driver so that we notify the 15895 * driver in ilm_send_multicast_reqs. 15896 */ 15897 if (ilm_lookup_ill_v6(to_ill, 15898 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 15899 new_ilm->ilm_notify_driver = B_TRUE; 15900 15901 new_ilm->ilm_ill = to_ill; 15902 /* Add to the to_ill's list */ 15903 new_ilm->ilm_next = to_ill->ill_ilm; 15904 to_ill->ill_ilm = new_ilm; 15905 /* 15906 * set the flag so that mld_joingroup is 15907 * called in ilm_send_multicast_reqs(). 
15908 */ 15909 new_ilm->ilm_is_new = B_TRUE; 15910 } 15911 goto bottom; 15912 } else if (ifindex != 0) { 15913 /* 15914 * If this is FAILBACK (ifindex != 0) and the ifindex 15915 * has not matched above, look at the next ilm. 15916 */ 15917 ilmp = &ilm->ilm_next; 15918 continue; 15919 } 15920 /* 15921 * If we are here, it means ifindex is 0. Failover 15922 * everything. 15923 * 15924 * We need to handle solicited node mcast address 15925 * and all_nodes mcast address differently as they 15926 * are joined witin the kenrel (ipif_multicast_up) 15927 * and potentially from the userland. We are called 15928 * after the ipifs of from_ill has been moved. 15929 * If we still find ilms on ill with solicited node 15930 * mcast address or all_nodes mcast address, it must 15931 * belong to the UP interface that has not moved e.g. 15932 * ipif_id 0 with the link local prefix does not move. 15933 * We join this on the new ill accounting for all the 15934 * userland memberships so that applications don't 15935 * see any failure. 15936 * 15937 * We need to make sure that we account only for the 15938 * solicited node and all node multicast addresses 15939 * that was brought UP on these. In the case of 15940 * a failover from A to B, we might have ilms belonging 15941 * to A (ilm_orig_ifindex pointing at A) on B accounting 15942 * for the membership from the userland. If we are failing 15943 * over from B to C now, we will find the ones belonging 15944 * to A on B. These don't account for the ill_ipif_up_count. 15945 * They just move from B to C. The check below on 15946 * ilm_orig_ifindex ensures that. 
15947 */ 15948 if ((ilm->ilm_orig_ifindex == 15949 from_ill->ill_phyint->phyint_ifindex) && 15950 (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || 15951 IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, 15952 &ilm->ilm_v6addr))) { 15953 ASSERT(ilm->ilm_refcnt > 0); 15954 count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; 15955 /* 15956 * For indentation reasons, we are not using a 15957 * "else" here. 15958 */ 15959 if (count == 0) { 15960 ilmp = &ilm->ilm_next; 15961 continue; 15962 } 15963 ilm->ilm_refcnt -= count; 15964 if (new_ilm != NULL) { 15965 /* 15966 * Can find one with the same 15967 * ilm_orig_ifindex, if we are failing 15968 * over to a STANDBY. This happens 15969 * when somebody wants to join a group 15970 * on a STANDBY interface and we 15971 * internally join on a different one. 15972 * If we had joined on from_ill then, a 15973 * failover now will find a new ilm 15974 * with this index. 15975 */ 15976 ip1dbg(("ilm_move_v6: FAILOVER, found" 15977 " new ilm on %s, group address %s\n", 15978 to_ill->ill_name, 15979 inet_ntop(AF_INET6, 15980 &ilm->ilm_v6addr, buf, 15981 sizeof (buf)))); 15982 new_ilm->ilm_refcnt += count; 15983 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 15984 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 15985 new_ilm->ilm_is_new = B_TRUE; 15986 } 15987 } else { 15988 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 15989 if (new_ilm == NULL) { 15990 ip0dbg(("ilm_move_v6: FAILOVER of IPv6" 15991 " multicast address %s : from %s to" 15992 " %s failed : ENOMEM \n", 15993 inet_ntop(AF_INET6, 15994 &ilm->ilm_v6addr, buf, 15995 sizeof (buf)), from_ill->ill_name, 15996 to_ill->ill_name)); 15997 ilmp = &ilm->ilm_next; 15998 continue; 15999 } 16000 *new_ilm = *ilm; 16001 new_ilm->ilm_filter = NULL; 16002 new_ilm->ilm_refcnt = count; 16003 new_ilm->ilm_timer = INFINITY; 16004 new_ilm->ilm_rtx.rtx_timer = INFINITY; 16005 new_ilm->ilm_is_new = B_TRUE; 16006 /* 16007 * If the to_ill has not joined this 16008 * group we need to tell the driver in 16009 * 
ill_send_multicast_reqs. 16010 */ 16011 if (ilm_lookup_ill_v6(to_ill, 16012 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16013 new_ilm->ilm_notify_driver = B_TRUE; 16014 16015 new_ilm->ilm_ill = to_ill; 16016 /* Add to the to_ill's list */ 16017 new_ilm->ilm_next = to_ill->ill_ilm; 16018 to_ill->ill_ilm = new_ilm; 16019 ASSERT(new_ilm->ilm_ipif == NULL); 16020 } 16021 if (ilm->ilm_refcnt == 0) { 16022 goto bottom; 16023 } else { 16024 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16025 CLEAR_SLIST(new_ilm->ilm_filter); 16026 ilmp = &ilm->ilm_next; 16027 } 16028 continue; 16029 } else { 16030 /* 16031 * ifindex = 0 means, move everything pointing at 16032 * from_ill. We are doing this becuase ill has 16033 * either FAILED or became INACTIVE. 16034 * 16035 * As we would like to move things later back to 16036 * from_ill, we want to retain the identity of this 16037 * ilm. Thus, we don't blindly increment the reference 16038 * count on the ilms matching the address alone. We 16039 * need to match on the ilm_orig_index also. new_ilm 16040 * was obtained by matching ilm_orig_index also. 16041 */ 16042 if (new_ilm != NULL) { 16043 /* 16044 * This is possible only if a previous restore 16045 * was incomplete i.e restore to 16046 * ilm_orig_ifindex left some ilms because 16047 * of some failures. Thus when we are failing 16048 * again, we might find our old friends there. 
16049 */ 16050 ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" 16051 " on %s, group address %s\n", 16052 to_ill->ill_name, 16053 inet_ntop(AF_INET6, 16054 &ilm->ilm_v6addr, buf, 16055 sizeof (buf)))); 16056 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16057 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16058 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16059 new_ilm->ilm_is_new = B_TRUE; 16060 } 16061 } else { 16062 if (from_ill->ill_ilm_walker_cnt != 0) { 16063 new_ilm = (ilm_t *) 16064 mi_zalloc(sizeof (ilm_t)); 16065 if (new_ilm == NULL) { 16066 ip0dbg(("ilm_move_v6: " 16067 "FAILOVER of IPv6" 16068 " multicast address %s : " 16069 "from %s to" 16070 " %s failed : ENOMEM \n", 16071 inet_ntop(AF_INET6, 16072 &ilm->ilm_v6addr, buf, 16073 sizeof (buf)), 16074 from_ill->ill_name, 16075 to_ill->ill_name)); 16076 16077 ilmp = &ilm->ilm_next; 16078 continue; 16079 } 16080 *new_ilm = *ilm; 16081 new_ilm->ilm_filter = NULL; 16082 } else { 16083 *ilmp = ilm->ilm_next; 16084 new_ilm = ilm; 16085 } 16086 /* 16087 * If the to_ill has not joined this 16088 * group we need to tell the driver in 16089 * ill_send_multicast_reqs. 16090 */ 16091 if (ilm_lookup_ill_v6(to_ill, 16092 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16093 new_ilm->ilm_notify_driver = B_TRUE; 16094 16095 /* Add to the to_ill's list */ 16096 new_ilm->ilm_next = to_ill->ill_ilm; 16097 to_ill->ill_ilm = new_ilm; 16098 ASSERT(ilm->ilm_ipif == NULL); 16099 new_ilm->ilm_ill = to_ill; 16100 new_ilm->ilm_is_new = B_TRUE; 16101 } 16102 16103 } 16104 16105 bottom: 16106 /* 16107 * Revert multicast filter state to (EXCLUDE, NULL). 16108 * new_ilm->ilm_is_new should already be set if needed. 16109 */ 16110 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16111 CLEAR_SLIST(new_ilm->ilm_filter); 16112 /* 16113 * We allocated/got a new ilm, free the old one. 
16114 */ 16115 if (new_ilm != ilm) { 16116 if (from_ill->ill_ilm_walker_cnt == 0) { 16117 *ilmp = ilm->ilm_next; 16118 ilm->ilm_next = NULL; 16119 FREE_SLIST(ilm->ilm_filter); 16120 FREE_SLIST(ilm->ilm_pendsrcs); 16121 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 16122 FREE_SLIST(ilm->ilm_rtx.rtx_block); 16123 mi_free((char *)ilm); 16124 } else { 16125 ilm->ilm_flags |= ILM_DELETED; 16126 from_ill->ill_ilm_cleanup_reqd = 1; 16127 ilmp = &ilm->ilm_next; 16128 } 16129 } 16130 } 16131 } 16132 16133 /* 16134 * Move all the multicast memberships to to_ill. Called when 16135 * an ipif moves from "from_ill" to "to_ill". This function is slightly 16136 * different from IPv6 counterpart as multicast memberships are associated 16137 * with ills in IPv6. This function is called after every ipif is moved 16138 * unlike IPv6, where it is moved only once. 16139 */ 16140 static void 16141 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) 16142 { 16143 ilm_t *ilm; 16144 ilm_t *ilm_next; 16145 ilm_t *new_ilm; 16146 ilm_t **ilmp; 16147 16148 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16149 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16150 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 16151 16152 ilmp = &from_ill->ill_ilm; 16153 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 16154 ilm_next = ilm->ilm_next; 16155 16156 if (ilm->ilm_flags & ILM_DELETED) { 16157 ilmp = &ilm->ilm_next; 16158 continue; 16159 } 16160 16161 ASSERT(ilm->ilm_ipif != NULL); 16162 16163 if (ilm->ilm_ipif != ipif) { 16164 ilmp = &ilm->ilm_next; 16165 continue; 16166 } 16167 16168 if (V4_PART_OF_V6(ilm->ilm_v6addr) == 16169 htonl(INADDR_ALLHOSTS_GROUP)) { 16170 /* 16171 * We joined this in ipif_multicast_up 16172 * and we never did an ipif_multicast_down 16173 * for IPv4. If nobody else from the userland 16174 * has reference, we free the ilm, and later 16175 * when this ipif comes up on the new ill, 16176 * we will join this again. 
16177 */ 16178 if (--ilm->ilm_refcnt == 0) 16179 goto delete_ilm; 16180 16181 new_ilm = ilm_lookup_ipif(ipif, 16182 V4_PART_OF_V6(ilm->ilm_v6addr)); 16183 if (new_ilm != NULL) { 16184 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16185 /* 16186 * We still need to deal with the from_ill. 16187 */ 16188 new_ilm->ilm_is_new = B_TRUE; 16189 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16190 CLEAR_SLIST(new_ilm->ilm_filter); 16191 goto delete_ilm; 16192 } 16193 /* 16194 * If we could not find one e.g. ipif is 16195 * still down on to_ill, we add this ilm 16196 * on ill_new to preserve the reference 16197 * count. 16198 */ 16199 } 16200 /* 16201 * When ipifs move, ilms always move with it 16202 * to the NEW ill. Thus we should never be 16203 * able to find ilm till we really move it here. 16204 */ 16205 ASSERT(ilm_lookup_ipif(ipif, 16206 V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); 16207 16208 if (from_ill->ill_ilm_walker_cnt != 0) { 16209 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 16210 if (new_ilm == NULL) { 16211 char buf[INET6_ADDRSTRLEN]; 16212 ip0dbg(("ilm_move_v4: FAILBACK of IPv4" 16213 " multicast address %s : " 16214 "from %s to" 16215 " %s failed : ENOMEM \n", 16216 inet_ntop(AF_INET, 16217 &ilm->ilm_v6addr, buf, 16218 sizeof (buf)), 16219 from_ill->ill_name, 16220 to_ill->ill_name)); 16221 16222 ilmp = &ilm->ilm_next; 16223 continue; 16224 } 16225 *new_ilm = *ilm; 16226 /* We don't want new_ilm linked to ilm's filter list */ 16227 new_ilm->ilm_filter = NULL; 16228 } else { 16229 /* Remove from the list */ 16230 *ilmp = ilm->ilm_next; 16231 new_ilm = ilm; 16232 } 16233 16234 /* 16235 * If we have never joined this group on the to_ill 16236 * make sure we tell the driver. 
16237 */ 16238 if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, 16239 ALL_ZONES) == NULL) 16240 new_ilm->ilm_notify_driver = B_TRUE; 16241 16242 /* Add to the to_ill's list */ 16243 new_ilm->ilm_next = to_ill->ill_ilm; 16244 to_ill->ill_ilm = new_ilm; 16245 new_ilm->ilm_is_new = B_TRUE; 16246 16247 /* 16248 * Revert multicast filter state to (EXCLUDE, NULL) 16249 */ 16250 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 16251 CLEAR_SLIST(new_ilm->ilm_filter); 16252 16253 /* 16254 * Delete only if we have allocated a new ilm. 16255 */ 16256 if (new_ilm != ilm) { 16257 delete_ilm: 16258 if (from_ill->ill_ilm_walker_cnt == 0) { 16259 /* Remove from the list */ 16260 *ilmp = ilm->ilm_next; 16261 ilm->ilm_next = NULL; 16262 FREE_SLIST(ilm->ilm_filter); 16263 FREE_SLIST(ilm->ilm_pendsrcs); 16264 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 16265 FREE_SLIST(ilm->ilm_rtx.rtx_block); 16266 mi_free((char *)ilm); 16267 } else { 16268 ilm->ilm_flags |= ILM_DELETED; 16269 from_ill->ill_ilm_cleanup_reqd = 1; 16270 ilmp = &ilm->ilm_next; 16271 } 16272 } 16273 } 16274 } 16275 16276 static uint_t 16277 ipif_get_id(ill_t *ill, uint_t id) 16278 { 16279 uint_t unit; 16280 ipif_t *tipif; 16281 boolean_t found = B_FALSE; 16282 16283 /* 16284 * During failback, we want to go back to the same id 16285 * instead of the smallest id so that the original 16286 * configuration is maintained. id is non-zero in that 16287 * case. 16288 */ 16289 if (id != 0) { 16290 /* 16291 * While failing back, if we still have an ipif with 16292 * MAX_ADDRS_PER_IF, it means this will be replaced 16293 * as soon as we return from this function. It was 16294 * to set to MAX_ADDRS_PER_IF by the caller so that 16295 * we can choose the smallest id. Thus we return zero 16296 * in that case ignoring the hint. 
16297 */ 16298 if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) 16299 return (0); 16300 for (tipif = ill->ill_ipif; tipif != NULL; 16301 tipif = tipif->ipif_next) { 16302 if (tipif->ipif_id == id) { 16303 found = B_TRUE; 16304 break; 16305 } 16306 } 16307 /* 16308 * If somebody already plumbed another logical 16309 * with the same id, we won't be able to find it. 16310 */ 16311 if (!found) 16312 return (id); 16313 } 16314 for (unit = 0; unit <= ip_addrs_per_if; unit++) { 16315 found = B_FALSE; 16316 for (tipif = ill->ill_ipif; tipif != NULL; 16317 tipif = tipif->ipif_next) { 16318 if (tipif->ipif_id == unit) { 16319 found = B_TRUE; 16320 break; 16321 } 16322 } 16323 if (!found) 16324 break; 16325 } 16326 return (unit); 16327 } 16328 16329 /* ARGSUSED */ 16330 static int 16331 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, 16332 ipif_t **rep_ipif_ptr) 16333 { 16334 ill_t *from_ill; 16335 ipif_t *rep_ipif; 16336 ipif_t **ipifp; 16337 uint_t unit; 16338 int err = 0; 16339 ipif_t *to_ipif; 16340 struct iocblk *iocp; 16341 boolean_t failback_cmd; 16342 boolean_t remove_ipif; 16343 int rc; 16344 16345 ASSERT(IAM_WRITER_ILL(to_ill)); 16346 ASSERT(IAM_WRITER_IPIF(ipif)); 16347 16348 iocp = (struct iocblk *)mp->b_rptr; 16349 failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); 16350 remove_ipif = B_FALSE; 16351 16352 from_ill = ipif->ipif_ill; 16353 16354 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16355 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16356 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 16357 16358 /* 16359 * Don't move LINK LOCAL addresses as they are tied to 16360 * physical interface. 
16361 */ 16362 if (from_ill->ill_isv6 && 16363 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { 16364 ipif->ipif_was_up = B_FALSE; 16365 IPIF_UNMARK_MOVING(ipif); 16366 return (0); 16367 } 16368 16369 /* 16370 * We set the ipif_id to maximum so that the search for 16371 * ipif_id will pick the lowest number i.e 0 in the 16372 * following 2 cases : 16373 * 16374 * 1) We have a replacement ipif at the head of to_ill. 16375 * We can't remove it yet as we can exceed ip_addrs_per_if 16376 * on to_ill and hence the MOVE might fail. We want to 16377 * remove it only if we could move the ipif. Thus, by 16378 * setting it to the MAX value, we make the search in 16379 * ipif_get_id return the zeroth id. 16380 * 16381 * 2) When DR pulls out the NIC and re-plumbs the interface, 16382 * we might just have a zero address plumbed on the ipif 16383 * with zero id in the case of IPv4. We remove that while 16384 * doing the failback. We want to remove it only if we 16385 * could move the ipif. Thus, by setting it to the MAX 16386 * value, we make the search in ipif_get_id return the 16387 * zeroth id. 16388 * 16389 * Both (1) and (2) are done only when when we are moving 16390 * an ipif (either due to failover/failback) which originally 16391 * belonged to this interface i.e the ipif_orig_ifindex is 16392 * the same as to_ill's ifindex. This is needed so that 16393 * FAILOVER from A -> B ( A failed) followed by FAILOVER 16394 * from B -> A (B is being removed from the group) and 16395 * FAILBACK from A -> B restores the original configuration. 16396 * Without the check for orig_ifindex, the second FAILOVER 16397 * could make the ipif belonging to B replace the A's zeroth 16398 * ipif and the subsequent failback re-creating the replacement 16399 * ipif again. 16400 * 16401 * NOTE : We created the replacement ipif when we did a 16402 * FAILOVER (See below). We could check for FAILBACK and 16403 * then look for replacement ipif to be removed. 
But we don't 16404 * want to do that because we wan't to allow the possibility 16405 * of a FAILOVER from A -> B (which creates the replacement ipif), 16406 * followed by a *FAILOVER* from B -> A instead of a FAILBACK 16407 * from B -> A. 16408 */ 16409 to_ipif = to_ill->ill_ipif; 16410 if ((to_ill->ill_phyint->phyint_ifindex == 16411 ipif->ipif_orig_ifindex) && 16412 IPIF_REPL_CHECK(to_ipif, failback_cmd)) { 16413 ASSERT(to_ipif->ipif_id == 0); 16414 remove_ipif = B_TRUE; 16415 to_ipif->ipif_id = MAX_ADDRS_PER_IF; 16416 } 16417 /* 16418 * Find the lowest logical unit number on the to_ill. 16419 * If we are failing back, try to get the original id 16420 * rather than the lowest one so that the original 16421 * configuration is maintained. 16422 * 16423 * XXX need a better scheme for this. 16424 */ 16425 if (failback_cmd) { 16426 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); 16427 } else { 16428 unit = ipif_get_id(to_ill, 0); 16429 } 16430 16431 /* Reset back to zero in case we fail below */ 16432 if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) 16433 to_ipif->ipif_id = 0; 16434 16435 if (unit == ip_addrs_per_if) { 16436 ipif->ipif_was_up = B_FALSE; 16437 IPIF_UNMARK_MOVING(ipif); 16438 return (EINVAL); 16439 } 16440 16441 /* 16442 * ipif is ready to move from "from_ill" to "to_ill". 16443 * 16444 * 1) If we are moving ipif with id zero, create a 16445 * replacement ipif for this ipif on from_ill. If this fails 16446 * fail the MOVE operation. 16447 * 16448 * 2) Remove the replacement ipif on to_ill if any. 16449 * We could remove the replacement ipif when we are moving 16450 * the ipif with id zero. But what if somebody already 16451 * unplumbed it ? Thus we always remove it if it is present. 16452 * We want to do it only if we are sure we are going to 16453 * move the ipif to to_ill which is why there are no 16454 * returns due to error till ipif is linked to to_ill. 16455 * Note that the first ipif that we failback will always 16456 * be zero if it is present. 
16457 */ 16458 if (ipif->ipif_id == 0) { 16459 ipaddr_t inaddr_any = INADDR_ANY; 16460 16461 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); 16462 if (rep_ipif == NULL) { 16463 ipif->ipif_was_up = B_FALSE; 16464 IPIF_UNMARK_MOVING(ipif); 16465 return (ENOMEM); 16466 } 16467 *rep_ipif = ipif_zero; 16468 /* 16469 * Before we put the ipif on the list, store the addresses 16470 * as mapped addresses as some of the ioctls e.g SIOCGIFADDR 16471 * assumes so. This logic is not any different from what 16472 * ipif_allocate does. 16473 */ 16474 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16475 &rep_ipif->ipif_v6lcl_addr); 16476 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16477 &rep_ipif->ipif_v6src_addr); 16478 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16479 &rep_ipif->ipif_v6subnet); 16480 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16481 &rep_ipif->ipif_v6net_mask); 16482 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16483 &rep_ipif->ipif_v6brd_addr); 16484 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 16485 &rep_ipif->ipif_v6pp_dst_addr); 16486 /* 16487 * We mark IPIF_NOFAILOVER so that this can never 16488 * move. 16489 */ 16490 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; 16491 rep_ipif->ipif_flags &= ~IPIF_UP; 16492 rep_ipif->ipif_replace_zero = B_TRUE; 16493 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, 16494 MUTEX_DEFAULT, NULL); 16495 rep_ipif->ipif_id = 0; 16496 rep_ipif->ipif_ire_type = ipif->ipif_ire_type; 16497 rep_ipif->ipif_ill = from_ill; 16498 rep_ipif->ipif_orig_ifindex = 16499 from_ill->ill_phyint->phyint_ifindex; 16500 /* Insert at head */ 16501 rep_ipif->ipif_next = from_ill->ill_ipif; 16502 from_ill->ill_ipif = rep_ipif; 16503 /* 16504 * We don't really care to let apps know about 16505 * this interface. 16506 */ 16507 } 16508 16509 if (remove_ipif) { 16510 /* 16511 * We set to a max value above for this case to get 16512 * id zero. ASSERT that we did get one. 
16513 */ 16514 ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); 16515 rep_ipif = to_ipif; 16516 to_ill->ill_ipif = rep_ipif->ipif_next; 16517 rep_ipif->ipif_next = NULL; 16518 /* 16519 * If some apps scanned and find this interface, 16520 * it is time to let them know, so that they can 16521 * delete it. 16522 */ 16523 16524 *rep_ipif_ptr = rep_ipif; 16525 } 16526 16527 /* Get it out of the ILL interface list. */ 16528 ipifp = &ipif->ipif_ill->ill_ipif; 16529 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 16530 if (*ipifp == ipif) { 16531 *ipifp = ipif->ipif_next; 16532 break; 16533 } 16534 } 16535 16536 /* Assign the new ill */ 16537 ipif->ipif_ill = to_ill; 16538 ipif->ipif_id = unit; 16539 /* id has already been checked */ 16540 rc = ipif_insert(ipif, B_FALSE, B_FALSE); 16541 ASSERT(rc == 0); 16542 /* Let SCTP update its list */ 16543 sctp_move_ipif(ipif, from_ill, to_ill); 16544 /* 16545 * Handle the failover and failback of ipif_t between 16546 * ill_t that have differing maximum mtu values. 16547 */ 16548 if (ipif->ipif_mtu > to_ill->ill_max_mtu) { 16549 if (ipif->ipif_saved_mtu == 0) { 16550 /* 16551 * As this ipif_t is moving to an ill_t 16552 * that has a lower ill_max_mtu, its 16553 * ipif_mtu needs to be saved so it can 16554 * be restored during failback or during 16555 * failover to an ill_t which has a 16556 * higher ill_max_mtu. 16557 */ 16558 ipif->ipif_saved_mtu = ipif->ipif_mtu; 16559 ipif->ipif_mtu = to_ill->ill_max_mtu; 16560 } else { 16561 /* 16562 * The ipif_t is, once again, moving to 16563 * an ill_t that has a lower maximum mtu 16564 * value. 16565 */ 16566 ipif->ipif_mtu = to_ill->ill_max_mtu; 16567 } 16568 } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && 16569 ipif->ipif_saved_mtu != 0) { 16570 /* 16571 * The mtu of this ipif_t had to be reduced 16572 * during an earlier failover; this is an 16573 * opportunity for it to be increased (either as 16574 * part of another failover or a failback). 
16575 */ 16576 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { 16577 ipif->ipif_mtu = ipif->ipif_saved_mtu; 16578 ipif->ipif_saved_mtu = 0; 16579 } else { 16580 ipif->ipif_mtu = to_ill->ill_max_mtu; 16581 } 16582 } 16583 16584 /* 16585 * We preserve all the other fields of the ipif including 16586 * ipif_saved_ire_mp. The routes that are saved here will 16587 * be recreated on the new interface and back on the old 16588 * interface when we move back. 16589 */ 16590 ASSERT(ipif->ipif_arp_del_mp == NULL); 16591 16592 return (err); 16593 } 16594 /* ipif_move_all: walk from_ill's ipif list and call ipif_move() on every ipif that is not IPIF_NOFAILOVER and whose ipif_orig_ifindex matches ifindex (an ifindex of 0 matches every ipif). For IPv4, the ilms of each ipif that actually changed ill are moved with ilm_move_v4(). Returns the first non-zero error from ipif_move(), or 0. */ 16595 static int 16596 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, 16597 int ifindex, ipif_t **rep_ipif_ptr) 16598 { 16599 ipif_t *mipif; 16600 ipif_t *ipif_next; 16601 int err; 16602 16603 /* 16604 * We don't really try to MOVE back things if some of the 16605 * operations fail. The daemon will take care of moving again 16606 * later on. 16607 */ 16608 for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { 16609 ipif_next = mipif->ipif_next; 16610 if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && 16611 (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { 16612 16613 err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); 16614 16615 /* 16616 * When the MOVE fails, it is the job of the 16617 * application to take care of this properly 16618 * i.e. try again if it is ENOMEM. 16619 */ 16620 if (mipif->ipif_ill != from_ill) { 16621 /* 16622 * ipif has moved. 16623 * 16624 * Move the multicast memberships associated 16625 * with this ipif to the new ill. For IPv6, we 16626 * do it once after all the ipifs are moved 16627 * (in ill_move) as they are not associated 16628 * with ipifs. 16629 * 16630 * We need to move the ilms as the ipif has 16631 * already been moved to a new ill even 16632 * in the case of errors. Neither 16633 * ilm_free(ipif) will find the ilm 16634 * when somebody unplumbs this ipif nor 16635 * ilm_delete(ilm) will be able to find the 16636 * ilm, if we don't move now.
16637 */ 16638 if (!from_ill->ill_isv6) 16639 ilm_move_v4(from_ill, to_ill, mipif); 16640 } 16641 16642 if (err != 0) 16643 return (err); 16644 } 16645 } 16646 return (0); 16647 } 16648 /* ill_move: move the eligible ipifs from from_ill to to_ill for the SIOCLIFFAILOVER or SIOCLIFFAILBACK ioctl carried in mp. Returns EINPROGRESS, after queueing mp with ipsq_pending_mp_add(), while an ipif still has to become quiescent; otherwise returns the result of ipif_move_all(). */ 16649 static int 16650 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) 16651 { 16652 int ifindex; 16653 int err; 16654 struct iocblk *iocp; 16655 ipif_t *ipif; 16656 ipif_t *rep_ipif_ptr = NULL; 16657 ipif_t *from_ipif = NULL; 16658 boolean_t check_rep_if = B_FALSE; 16659 16660 iocp = (struct iocblk *)mp->b_rptr; 16661 if (iocp->ioc_cmd == SIOCLIFFAILOVER) { 16662 /* 16663 * Move everything pointing at from_ill to to_ill. 16664 * We achieve this by passing in 0 as ifindex. 16665 */ 16666 ifindex = 0; 16667 } else { 16668 /* 16669 * Move everything pointing at from_ill whose original 16670 * ifindex of connp, ipif, ilm points at to_ill->ill_index. 16671 * We achieve this by passing in ifindex rather than 0. 16672 * Multicast vifs, ilgs move implicitly because ipifs move. 16673 */ 16674 ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); 16675 ifindex = to_ill->ill_phyint->phyint_ifindex; 16676 } 16677 16678 /* 16679 * Determine if there is at least one ipif that would move from 16680 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement 16681 * ipif (if it exists) on the to_ill would be consumed as a result of 16682 * the move, in which case we need to quiesce the replacement ipif also.
16683 */ 16684 for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; 16685 from_ipif = from_ipif->ipif_next) { 16686 if (((ifindex == 0) || 16687 (ifindex == from_ipif->ipif_orig_ifindex)) && 16688 !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { 16689 check_rep_if = B_TRUE; 16690 break; 16691 } 16692 } 16693 16694 16695 ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); 16696 16697 GRAB_ILL_LOCKS(from_ill, to_ill); 16698 if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { 16699 (void) ipsq_pending_mp_add(NULL, ipif, q, 16700 mp, ILL_MOVE_OK); 16701 RELEASE_ILL_LOCKS(from_ill, to_ill); 16702 return (EINPROGRESS); 16703 } 16704 16705 /* Check if the replacement ipif is quiescent to delete */ 16706 if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, 16707 (iocp->ioc_cmd == SIOCLIFFAILBACK))) { 16708 to_ill->ill_ipif->ipif_state_flags |= 16709 IPIF_MOVING | IPIF_CHANGING; 16710 if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { 16711 (void) ipsq_pending_mp_add(NULL, ipif, q, 16712 mp, ILL_MOVE_OK); 16713 RELEASE_ILL_LOCKS(from_ill, to_ill); 16714 return (EINPROGRESS); 16715 } 16716 } 16717 RELEASE_ILL_LOCKS(from_ill, to_ill); 16718 16719 ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); 16720 rw_enter(&ill_g_lock, RW_WRITER); 16721 GRAB_ILL_LOCKS(from_ill, to_ill); 16722 err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); 16723 16724 /* ilm_move_v4 is done inside ipif_move_all for IPv4 */ 16725 if (err == 0 && from_ill->ill_isv6) 16726 ilm_move_v6(from_ill, to_ill, ifindex); 16727 16728 RELEASE_ILL_LOCKS(from_ill, to_ill); 16729 rw_exit(&ill_g_lock); 16730 16731 /* 16732 * send rts messages and multicast messages.
16733 */ 16734 if (rep_ipif_ptr != NULL) { 16735 ip_rts_ifmsg(rep_ipif_ptr); 16736 ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); 16737 IPIF_TRACE_CLEANUP(rep_ipif_ptr); 16738 mi_free(rep_ipif_ptr); 16739 } 16740 16741 ilm_send_multicast_reqs(from_ill, to_ill); 16742 16743 conn_move_ill(from_ill, to_ill, ifindex); 16744 16745 return (err); 16746 } 16747 16748 /* 16749 * Used to extract arguments for FAILOVER/FAILBACK ioctls. 16750 * Also checks for the validity of the arguments. 16751 * Note: We are already exclusive inside the from group. 16752 * It is up to the caller to release refcnt on the to_ill's. 16753 */ 16754 static int 16755 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, 16756 ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) 16757 { 16758 int dst_index; 16759 ipif_t *ipif_v4, *ipif_v6; 16760 struct lifreq *lifr; 16761 mblk_t *mp1; 16762 boolean_t exists; 16763 sin_t *sin; 16764 int err = 0; 16765 16766 if ((mp1 = mp->b_cont) == NULL) 16767 return (EPROTO); 16768 16769 if ((mp1 = mp1->b_cont) == NULL) 16770 return (EPROTO); 16771 16772 lifr = (struct lifreq *)mp1->b_rptr; 16773 sin = (sin_t *)&lifr->lifr_addr; 16774 16775 /* 16776 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 16777 * specific operations. 16778 */ 16779 if (sin->sin_family != AF_UNSPEC) 16780 return (EINVAL); 16781 16782 /* 16783 * Get ipif with id 0. We are writer on the from ill. So we can pass 16784 * NULLs for the last 4 args and we know the lookup won't fail 16785 * with EINPROGRESS.
16786 */ 16787 ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, 16788 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, 16789 ALL_ZONES, NULL, NULL, NULL, NULL); 16790 ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, 16791 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, 16792 ALL_ZONES, NULL, NULL, NULL, NULL); 16793 16794 if (ipif_v4 == NULL && ipif_v6 == NULL) 16795 return (ENXIO); 16796 16797 if (ipif_v4 != NULL) { 16798 ASSERT(ipif_v4->ipif_refcnt != 0); 16799 if (ipif_v4->ipif_id != 0) { 16800 err = EINVAL; 16801 goto done; 16802 } 16803 16804 ASSERT(IAM_WRITER_IPIF(ipif_v4)); 16805 *ill_from_v4 = ipif_v4->ipif_ill; 16806 } 16807 16808 if (ipif_v6 != NULL) { 16809 ASSERT(ipif_v6->ipif_refcnt != 0); 16810 if (ipif_v6->ipif_id != 0) { 16811 err = EINVAL; 16812 goto done; 16813 } 16814 16815 ASSERT(IAM_WRITER_IPIF(ipif_v6)); 16816 *ill_from_v6 = ipif_v6->ipif_ill; 16817 } 16818 16819 err = 0; 16820 dst_index = lifr->lifr_movetoindex; 16821 *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, 16822 q, mp, ip_process_ioctl, &err); 16823 if (err != 0) { 16824 /* 16825 * There could be only v6. 16826 */ 16827 if (err != ENXIO) 16828 goto done; 16829 err = 0; 16830 } 16831 16832 *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, 16833 q, mp, ip_process_ioctl, &err); 16834 if (err != 0) { 16835 if (err != ENXIO) 16836 goto done; 16837 if (*ill_to_v4 == NULL) { 16838 err = ENXIO; 16839 goto done; 16840 } 16841 err = 0; 16842 } 16843 16844 /* 16845 * If we have something to MOVE i.e. "from" not NULL, 16846 * "to" should be non-NULL. 16847 */ 16848 if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || 16849 (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { 16850 err = EINVAL; 16851 } 16852 16853 done: 16854 if (ipif_v4 != NULL) 16855 ipif_refrele(ipif_v4); 16856 if (ipif_v6 != NULL) 16857 ipif_refrele(ipif_v6); 16858 return (err); 16859 } 16860 16861 /* 16862 * FAILOVER and FAILBACK are modelled as MOVE operations.
16863 * 16864 * We don't check whether the MOVE is within the same group or 16865 * not, because this ioctl can be used as a generic mechanism 16866 * to failover from interface A to B, though things will function 16867 * only if they are really part of the same group. Moreover, 16868 * all ipifs may be down and hence temporarily out of the group. 16869 * 16870 * ipif's that need to be moved are first brought down; V4 ipifs are brought 16871 * down first and then V6. For each we wait for the ipif's to become quiescent. 16872 * Bringing down the ipifs ensures that all ires pointing to these ipifs's 16873 * have been deleted and there are no active references. Once quiescent the 16874 * ipif's are moved and brought up on the new ill. 16875 * 16876 * Normally the source ill and destination ill belong to the same IPMP group 16877 * and hence the same ipsq_t. In the event they don't belong to the 16878 * same group the two ipsq's are first merged into one ipsq - that of the 16879 * to_ill. The multicast memberships on the source and destination ill cannot 16880 * change during the move operation since multicast joins/leaves also have to 16881 * execute on the same ipsq and are hence serialized. 16882 */ 16883 /* ARGSUSED */ 16884 int 16885 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16886 ip_ioctl_cmd_t *ipip, void *ifreq) 16887 { 16888 ill_t *ill_to_v4 = NULL; 16889 ill_t *ill_to_v6 = NULL; 16890 ill_t *ill_from_v4 = NULL; 16891 ill_t *ill_from_v6 = NULL; 16892 int err = 0; 16893 16894 /* 16895 * setup from and to ill's, we can get EINPROGRESS only for 16896 * to_ill's. 16897 */ 16898 err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, 16899 &ill_to_v4, &ill_to_v6); 16900 16901 if (err != 0) { 16902 ip0dbg(("ip_sioctl_move: extract args failed\n")); 16903 goto done; 16904 } 16905 16906 /* 16907 * nothing to do.
16908 */ 16909 if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { 16910 goto done; 16911 } 16912 16913 /* 16914 * nothing to do. 16915 */ 16916 if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { 16917 goto done; 16918 } 16919 16920 /* 16921 * Mark the ill as changing. 16922 * ILL_CHANGING flag is cleared when the ipif's are brought up 16923 * in ill_up_ipifs; in case of error it is cleared below. 16924 */ 16925 16926 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 16927 if (ill_from_v4 != NULL) 16928 ill_from_v4->ill_state_flags |= ILL_CHANGING; 16929 if (ill_from_v6 != NULL) 16930 ill_from_v6->ill_state_flags |= ILL_CHANGING; 16931 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 16932 16933 /* 16934 * Make sure that both src and dst are 16935 * in the same syncq group. If not make it happen. 16936 * We are not holding any locks because we are the writer 16937 * on the from_ipsq and we will hold locks in ill_merge_groups 16938 * to protect to_ipsq against changing. 16939 */ 16940 if (ill_from_v4 != NULL) { 16941 if (ill_from_v4->ill_phyint->phyint_ipsq != 16942 ill_to_v4->ill_phyint->phyint_ipsq) { 16943 err = ill_merge_groups(ill_from_v4, ill_to_v4, 16944 NULL, mp, q); 16945 goto err_ret; 16946 16947 } 16948 ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); 16949 } else { 16950 16951 if (ill_from_v6->ill_phyint->phyint_ipsq != 16952 ill_to_v6->ill_phyint->phyint_ipsq) { 16953 err = ill_merge_groups(ill_from_v6, ill_to_v6, 16954 NULL, mp, q); 16955 goto err_ret; 16956 16957 } 16958 ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); 16959 } 16960 16961 /* 16962 * Now that the ipsq's have been merged and we are the writer 16963 * let's mark to_ill as changing as well.
16964 */ 16965 16966 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 16967 if (ill_to_v4 != NULL) 16968 ill_to_v4->ill_state_flags |= ILL_CHANGING; 16969 if (ill_to_v6 != NULL) 16970 ill_to_v6->ill_state_flags |= ILL_CHANGING; 16971 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 16972 16973 /* 16974 * It's OK for us to proceed with the move even if 16975 * ill_pending_mp is non null on one of the from ill's as the reply 16976 * should not be looking at the ipif, it should only care about the 16977 * ill itself. 16978 */ 16979 16980 /* 16981 * Let's move IPv4 first. 16982 */ 16983 if (ill_from_v4 != NULL) { 16984 ASSERT(IAM_WRITER_ILL(ill_to_v4)); 16985 ill_from_v4->ill_move_in_progress = B_TRUE; 16986 ill_to_v4->ill_move_in_progress = B_TRUE; 16987 ill_to_v4->ill_move_peer = ill_from_v4; 16988 ill_from_v4->ill_move_peer = ill_to_v4; 16989 err = ill_move(ill_from_v4, ill_to_v4, q, mp); 16990 } 16991 16992 /* 16993 * Now let's move IPv6. 16994 */ 16995 if (err == 0 && ill_from_v6 != NULL) { 16996 ASSERT(IAM_WRITER_ILL(ill_to_v6)); 16997 ill_from_v6->ill_move_in_progress = B_TRUE; 16998 ill_to_v6->ill_move_in_progress = B_TRUE; 16999 ill_to_v6->ill_move_peer = ill_from_v6; 17000 ill_from_v6->ill_move_peer = ill_to_v6; 17001 err = ill_move(ill_from_v6, ill_to_v6, q, mp); 17002 } 17003 17004 err_ret: 17005 /* 17006 * EINPROGRESS means we are waiting for the ipif's that need to be 17007 * moved to become quiescent. 17008 */ 17009 if (err == EINPROGRESS) { 17010 goto done; 17011 } 17012 17013 /* 17014 * If err is set, ill_up_ipifs will not be called, 17015 * so let's clear the flags. 17016 */ 17017 17018 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 17019 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 17020 /* 17021 * Some of the clearing may be redundant. But it is simple 17022 * not making any extra checks.
17023 */ 17024 if (ill_from_v6 != NULL) { 17025 ill_from_v6->ill_move_in_progress = B_FALSE; 17026 ill_from_v6->ill_move_peer = NULL; 17027 ill_from_v6->ill_state_flags &= ~ILL_CHANGING; 17028 } 17029 if (ill_from_v4 != NULL) { 17030 ill_from_v4->ill_move_in_progress = B_FALSE; 17031 ill_from_v4->ill_move_peer = NULL; 17032 ill_from_v4->ill_state_flags &= ~ILL_CHANGING; 17033 } 17034 if (ill_to_v6 != NULL) { 17035 ill_to_v6->ill_move_in_progress = B_FALSE; 17036 ill_to_v6->ill_move_peer = NULL; 17037 ill_to_v6->ill_state_flags &= ~ILL_CHANGING; 17038 } 17039 if (ill_to_v4 != NULL) { 17040 ill_to_v4->ill_move_in_progress = B_FALSE; 17041 ill_to_v4->ill_move_peer = NULL; 17042 ill_to_v4->ill_state_flags &= ~ILL_CHANGING; 17043 } 17044 17045 /* 17046 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. 17047 * Do this always to maintain proper state i.e even in case of errors. 17048 * As phyint_inactive looks at both v4 and v6 interfaces, 17049 * we need not call on both v4 and v6 interfaces. 17050 */ 17051 if (ill_from_v4 != NULL) { 17052 if ((ill_from_v4->ill_phyint->phyint_flags & 17053 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 17054 phyint_inactive(ill_from_v4->ill_phyint); 17055 } 17056 } else if (ill_from_v6 != NULL) { 17057 if ((ill_from_v6->ill_phyint->phyint_flags & 17058 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 17059 phyint_inactive(ill_from_v6->ill_phyint); 17060 } 17061 } 17062 17063 if (ill_to_v4 != NULL) { 17064 if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { 17065 ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 17066 } 17067 } else if (ill_to_v6 != NULL) { 17068 if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { 17069 ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 17070 } 17071 } 17072 17073 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 17074 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 17075 17076 no_err: 17077 /* 17078 * Let's bring the interfaces up on the to_ill.
17079 */ 17080 if (err == 0) { 17081 err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4, 17082 q, mp); 17083 } 17084 done: 17085 17086 if (ill_to_v4 != NULL) { 17087 ill_refrele(ill_to_v4); 17088 } 17089 if (ill_to_v6 != NULL) { 17090 ill_refrele(ill_to_v6); 17091 } 17092 17093 return (err); 17094 } 17095 /* ill_dl_down: hand the saved ill_unbind_mp (if any) to ill_dlpi_send(), drop the ill's multicast memberships and clear ill_dl_up under ill_lock. */ 17096 static void 17097 ill_dl_down(ill_t *ill) 17098 { 17099 /* 17100 * The ill is down; unbind but stay attached since we're still 17101 * associated with a PPA. 17102 */ 17103 mblk_t *mp = ill->ill_unbind_mp; 17104 17105 ill->ill_unbind_mp = NULL; 17106 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); 17107 if (mp != NULL) { 17108 ip1dbg(("ill_dl_down: %s (%u) for %s\n", 17109 dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 17110 ill->ill_name)); 17111 ill_dlpi_send(ill, mp); 17112 } 17113 17114 /* 17115 * Toss all of our multicast memberships. We could keep them, but 17116 * then we'd have to do bookkeeping of any joins and leaves performed 17117 * by the application while the interface is down (we can't just 17118 * issue them because arp cannot currently process AR_ENTRY_SQUERY's 17119 * on a downed interface).
17120 */ 17121 ill_leave_multicast(ill); 17122 17123 mutex_enter(&ill->ill_lock); 17124 ill->ill_dl_up = 0; 17125 mutex_exit(&ill->ill_lock); 17126 } 17127 /* ill_dlpi_dispatch: record the primitive carried in mp as ill_dlpi_pending and pass mp down ill_wq with putnext(). DL_PHYS_ADDR_REQ additionally saves the requested address type in ill_phys_addr_pend, and DL_BIND_REQ clears ILL_DL_UNBIND_DONE under ill_lock. */ 17128 void 17129 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 17130 { 17131 union DL_primitives *dlp; 17132 t_uscalar_t prim; 17133 17134 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 17135 17136 dlp = (union DL_primitives *)mp->b_rptr; 17137 prim = dlp->dl_primitive; 17138 17139 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 17140 dlpi_prim_str(prim), prim, ill->ill_name)); 17141 17142 switch (prim) { 17143 case DL_PHYS_ADDR_REQ: 17144 { 17145 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 17146 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 17147 break; 17148 } 17149 case DL_BIND_REQ: 17150 mutex_enter(&ill->ill_lock); 17151 ill->ill_state_flags &= ~ILL_DL_UNBIND_DONE; 17152 mutex_exit(&ill->ill_lock); 17153 break; 17154 } 17155 17156 ill->ill_dlpi_pending = prim; 17157 17158 /* 17159 * Some drivers send M_FLUSH up to IP as part of unbind 17160 * request. When this M_FLUSH is sent back to the driver, 17161 * this can go after we send the detach request if the 17162 * M_FLUSH ends up in IP's syncq. To avoid that, we reply 17163 * to the M_FLUSH in ip_rput and locally generate another 17164 * M_FLUSH for the correctness. This will get freed in 17165 * ip_wput_nondata. 17166 */ 17167 if (prim == DL_UNBIND_REQ) 17168 (void) putnextctl1(ill->ill_rq, M_FLUSH, FLUSHRW); 17169 17170 putnext(ill->ill_wq, mp); 17171 } 17172 17173 /* 17174 * Send a DLPI control message to the driver but make sure there 17175 * is only one outstanding message. Uses ill_dlpi_pending to tell 17176 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 17177 * when an ACK or a NAK is received to process the next queued message. 17178 * 17179 * We don't protect ill_dlpi_pending with any lock.
This is okay as 17180 * every place where it is accessed, ip is exclusive while accessing 17181 * ill_dlpi_pending except when this function is called from ill_init(). 17182 */ 17183 void 17184 ill_dlpi_send(ill_t *ill, mblk_t *mp) 17185 { 17186 mblk_t **mpp; 17187 17188 ASSERT(IAM_WRITER_ILL(ill)); 17189 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 17190 17191 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 17192 /* Must queue message. Tail insertion */ 17193 mpp = &ill->ill_dlpi_deferred; 17194 while (*mpp != NULL) 17195 mpp = &((*mpp)->b_next); 17196 17197 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 17198 ill->ill_name)); 17199 17200 *mpp = mp; 17201 return; 17202 } 17203 17204 ill_dlpi_dispatch(ill, mp); 17205 } 17206 17207 /* 17208 * Called when a DLPI control message has been acked or nacked to 17209 * send down the next queued message (if any). 17210 */ 17211 void 17212 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 17213 { 17214 mblk_t *mp; 17215 17216 ASSERT(IAM_WRITER_ILL(ill)); 17217 17218 ASSERT(prim != DL_PRIM_INVAL); 17219 if (ill->ill_dlpi_pending != prim) { 17220 if (ill->ill_dlpi_pending == DL_PRIM_INVAL) { 17221 (void) mi_strlog(ill->ill_rq, 1, 17222 SL_CONSOLE|SL_ERROR|SL_TRACE, 17223 "ill_dlpi_done: unsolicited ack for %s from %s\n", 17224 dlpi_prim_str(prim), ill->ill_name); 17225 } else { 17226 (void) mi_strlog(ill->ill_rq, 1, 17227 SL_CONSOLE|SL_ERROR|SL_TRACE, 17228 "ill_dlpi_done: unexpected ack for %s from %s " 17229 "(expecting ack for %s)\n", 17230 dlpi_prim_str(prim), ill->ill_name, 17231 dlpi_prim_str(ill->ill_dlpi_pending)); 17232 } 17233 return; 17234 } 17235 17236 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 17237 dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 17238 17239 if ((mp = ill->ill_dlpi_deferred) == NULL) { 17240 ill->ill_dlpi_pending = DL_PRIM_INVAL; 17241 return; 17242 } 17243 17244 ill->ill_dlpi_deferred = mp->b_next; 17245 mp->b_next = NULL; 17246 17247 
ill_dlpi_dispatch(ill, mp); 17248 } 17249 17250 void 17251 conn_delete_ire(conn_t *connp, caddr_t arg) 17252 { 17253 ipif_t *ipif = (ipif_t *)arg; 17254 ire_t *ire; 17255 17256 /* 17257 * Look at the cached ires on conns which have pointers to ipifs. 17258 * We just call ire_refrele which clears up the reference 17259 * to ire. Called when a conn closes. Also called from ipif_free 17260 * to cleanup indirect references to the stale ipif via the cached ire. 17261 */ 17262 mutex_enter(&connp->conn_lock); 17263 ire = connp->conn_ire_cache; 17264 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { 17265 connp->conn_ire_cache = NULL; 17266 mutex_exit(&connp->conn_lock); 17267 IRE_REFRELE_NOTR(ire); 17268 return; 17269 } 17270 mutex_exit(&connp->conn_lock); 17271 17272 } 17273 17274 /* 17275 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number 17276 * of IREs. Those IREs may have been previously cached in the conn structure. 17277 * This ipcl_walk() walker function releases all references to such IREs based 17278 * on the condemned flag. 17279 */ 17280 /* ARGSUSED */ 17281 void 17282 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) 17283 { 17284 ire_t *ire; 17285 17286 mutex_enter(&connp->conn_lock); 17287 ire = connp->conn_ire_cache; 17288 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { 17289 connp->conn_ire_cache = NULL; 17290 mutex_exit(&connp->conn_lock); 17291 IRE_REFRELE_NOTR(ire); 17292 return; 17293 } 17294 mutex_exit(&connp->conn_lock); 17295 } 17296 17297 /* 17298 * Take down a specific interface, but don't lose any information about it. 17299 * Also delete interface from its interface group (ifgrp). 17300 * (Always called as writer.) 17301 * This function goes through the down sequence even if the interface is 17302 * already down. There are 2 reasons. 17303 * a. Currently we permit interface routes that depend on down interfaces 17304 * to be added. This behaviour itself is questionable.
However it appears 17305 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 17306 * time. We go thru the cleanup in order to remove these routes. 17307 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 17308 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 17309 * down, but we need to cleanup i.e. do ill_dl_down and 17310 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 17311 * 17312 * IP-MT notes: 17313 * 17314 * Model of reference to interfaces. 17315 * 17316 * The following members in ipif_t track references to the ipif. 17317 * int ipif_refcnt; Active reference count 17318 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif 17319 * The following members in ill_t track references to the ill. 17320 * int ill_refcnt; active refcnt 17321 * uint_t ill_ire_cnt; Number of ires referencing ill 17322 * uint_t ill_nce_cnt; Number of nces referencing ill 17323 * 17324 * Reference to an ipif or ill can be obtained in any of the following ways. 17325 * 17326 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 17327 * Pointers to ipif / ill from other data structures viz ire and conn. 17328 * Implicit reference to the ipif / ill by holding a reference to the ire. 17329 * 17330 * The ipif/ill lookup functions return a reference held ipif / ill. 17331 * ipif_refcnt and ill_refcnt track the reference counts respectively. 17332 * This is a purely dynamic reference count associated with threads holding 17333 * references to the ipif / ill. Pointers from other structures do not 17334 * count towards this reference count. 17335 * 17336 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the 17337 * ipif/ill. This is incremented whenever a new ire is created referencing the 17338 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is 17339 * actually added to the ire hash table. The count is decremented in 17340 * ire_inactive where the ire is destroyed.
17341 * 17342 * nce's reference ill's thru nce_ill and the count of nce's associated with 17343 * an ill is recorded in ill_nce_cnt. This is incremented atomically in 17344 * ndp_add() where the nce is actually added to the table. Similarly it is 17345 * decremented in ndp_inactive where the nce is destroyed. 17346 * 17347 * Flow of ioctls involving interface down/up 17348 * 17349 * The following is the sequence of an attempt to set some critical flags on an 17350 * up interface. 17351 * ip_sioctl_flags 17352 * ipif_down 17353 * wait for ipif to be quiescent 17354 * ipif_down_tail 17355 * ip_sioctl_flags_tail 17356 * 17357 * All set ioctls that involve down/up sequence would have a skeleton similar 17358 * to the above. All the *tail functions are called after the refcounts have 17359 * dropped to the appropriate values. 17360 * 17361 * The mechanism to quiesce an ipif is as follows. 17362 * 17363 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed 17364 * on the ipif. Callers either pass a flag requesting wait or the lookup 17365 * functions will return NULL. 17366 * 17367 * Delete all ires referencing this ipif 17368 * 17369 * Any thread attempting to do an ipif_refhold on an ipif that has been 17370 * obtained thru a cached pointer will first make sure that 17371 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then 17372 * increment the refcount. 17373 * 17374 * The above guarantees that the ipif refcount will eventually come down to 17375 * zero and the ipif will quiesce, once all threads that currently hold a 17376 * reference to the ipif refrelease the ipif. The ipif is quiescent after the 17377 * ipif_refcount has dropped to zero and all ire's associated with this ipif 17378 * have also been ire_inactive'd. i.e. when ipif_ire_cnt and ipif_refcnt both 17379 * drop to zero. 17380 * 17381 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval. 
17382 * 17383 * Threads trying to lookup an ipif or ill can pass a flag requesting 17384 * wait and restart if the ipif / ill cannot be looked up currently. 17385 * For eg. bind, and route operations (Eg. route add / delete) cannot return 17386 * failure if the ipif is currently undergoing an exclusive operation, and 17387 * hence pass the flag. The mblk is then enqueued in the ipsq and the operation 17388 * is restarted by ipsq_exit() when the currently exclusive ioctl completes. 17389 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The 17390 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 17391 * change while the ill_lock is held. Before dropping the ill_lock we acquire 17392 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 17393 * until we release the ipsq_lock, even though the ill/ipif state flags 17394 * can change after we drop the ill_lock. 17395 * 17396 * An attempt to send out a packet using an ipif that is currently 17397 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this 17398 * operation and restart it later when the exclusive condition on the ipif ends. 17399 * This is an example of not passing the wait flag to the lookup functions. For 17400 * example an attempt to refhold and use conn->conn_multicast_ipif and send 17401 * out a multicast packet on that ipif will fail while the ipif is 17402 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is 17403 * currently IPIF_CHANGING will also fail.
17404 */ 17405 int 17406 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 17407 { 17408 ill_t *ill = ipif->ipif_ill; 17409 phyint_t *phyi; 17410 conn_t *connp; 17411 boolean_t success; 17412 boolean_t ipif_was_up = B_FALSE; 17413 17414 ASSERT(IAM_WRITER_IPIF(ipif)); 17415 17416 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 17417 17418 if (ipif->ipif_flags & IPIF_UP) { 17419 mutex_enter(&ill->ill_lock); 17420 ipif->ipif_flags &= ~IPIF_UP; 17421 ASSERT(ill->ill_ipif_up_count > 0); 17422 --ill->ill_ipif_up_count; 17423 mutex_exit(&ill->ill_lock); 17424 ipif_was_up = B_TRUE; 17425 /* Update status in SCTP's list */ 17426 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 17427 } 17428 17429 /* 17430 * Blow away v6 memberships we established in ipif_multicast_up(); the 17431 * v4 ones are left alone (as is the ipif_multicast_up flag, so we 17432 * know not to rejoin when the interface is brought back up). 17433 */ 17434 if (ipif->ipif_isv6) 17435 ipif_multicast_down(ipif); 17436 /* 17437 * Remove from the mapping for __sin6_src_id. We insert only 17438 * when the address is not INADDR_ANY. As IPv4 addresses are 17439 * stored as mapped addresses, we need to check for mapped 17440 * INADDR_ANY also. 17441 */ 17442 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 17443 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 17444 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 17445 int err; 17446 17447 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 17448 ipif->ipif_zoneid); 17449 if (err != 0) { 17450 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 17451 } 17452 } 17453 17454 /* 17455 * Before we delete the ill from the group (if any), we need 17456 * to make sure that we delete all the routes dependent on 17457 * this and also any ipifs dependent on this ipif for 17458 * source address. We need to do this before we delete from 17459 * the group because 17460 * 17461 * 1) ipif_down_delete_ire de-references ill->ill_group.
17462 * 17463 * 2) ipif_update_other_ipifs needs to walk the whole group 17464 * for re-doing source address selection. Note that 17465 * ipif_select_source[_v6] called from 17466 * ipif_update_other_ipifs[_v6] will not pick this ipif 17467 * because we have already marked it down here, i.e. cleared 17468 * IPIF_UP. 17469 */ 17470 if (ipif->ipif_isv6) 17471 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES); 17472 else 17473 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES); 17474 17475 /* 17476 * Need to add these also to be saved and restored when the 17477 * ipif is brought down and up 17478 */ 17479 mutex_enter(&ire_mrtun_lock); 17480 if (ire_mrtun_count != 0) { 17481 mutex_exit(&ire_mrtun_lock); 17482 ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire, 17483 (char *)ipif, NULL); 17484 } else { 17485 mutex_exit(&ire_mrtun_lock); 17486 } 17487 17488 mutex_enter(&ire_srcif_table_lock); 17489 if (ire_srcif_table_count > 0) { 17490 mutex_exit(&ire_srcif_table_lock); 17491 ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif); 17492 } else { 17493 mutex_exit(&ire_srcif_table_lock); 17494 } 17495 17496 /* 17497 * Cleaning up the conn_ire_cache or conns must be done only after the 17498 * ires have been deleted above. Otherwise a thread could end up 17499 * caching an ire in a conn after we have finished the cleanup of the 17500 * conn. The caching is done after making sure that the ire is not yet 17501 * condemned. Also documented in the block comment above ip_output 17502 */ 17503 ipcl_walk(conn_cleanup_stale_ire, NULL); 17504 /* Also, delete the ires cached in SCTP */ 17505 sctp_ire_cache_flush(ipif); 17506 17507 /* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */ 17508 nattymod_clean_ipif(ipif); 17509 17510 /* 17511 * Update any other ipifs which have used "our" local address as 17512 * a source address. This entails removing and recreating IRE_INTERFACE 17513 * entries for such ipifs.
17514 */ 17515 if (ipif->ipif_isv6) 17516 ipif_update_other_ipifs_v6(ipif, ill->ill_group); 17517 else 17518 ipif_update_other_ipifs(ipif, ill->ill_group); 17519 17520 if (ipif_was_up) { 17521 /* 17522 * Check whether it is last ipif to leave this group. 17523 * If this is the last ipif to leave, we should remove 17524 * this ill from the group as ipif_select_source will not 17525 * be able to find any useful ipifs if this ill is selected 17526 * for load balancing. 17527 * 17528 * For nameless groups, we should call ifgrp_delete if this 17529 * belongs to some group. As this ipif is going down, we may 17530 * need to reconstruct groups. 17531 */ 17532 phyi = ill->ill_phyint; 17533 /* 17534 * If the phyint_groupname_len is 0, it may or may not 17535 * be in the nameless group. If the phyint_groupname_len is 17536 * not 0, then this ill should be part of some group. 17537 * As we always insert this ill in the group if 17538 * phyint_groupname_len is not zero when the first ipif 17539 * comes up (in ipif_up_done), it should be in a group 17540 * when the namelen is not 0. 17541 * 17542 * NOTE: When we delete the ill from the group, it will 17543 * blow away all the IRE_CACHES pointing either at this ipif or 17544 * ill_wq (illgrp_cache_delete does this). Thus, no IRES 17545 * should be pointing at this ill. 17546 */ 17547 ASSERT(phyi->phyint_groupname_len == 0 || 17548 (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); 17549 17550 if (phyi->phyint_groupname_len != 0) { 17551 if (ill->ill_ipif_up_count == 0) 17552 illgrp_delete(ill); 17553 } 17554 17555 /* 17556 * If we have deleted some of the broadcast ires associated 17557 * with this ipif, we need to re-nominate somebody else if 17558 * the ires that we deleted were the nominated ones.
17559 */ 17560 if (ill->ill_group != NULL && !ill->ill_isv6) 17561 ipif_renominate_bcast(ipif); 17562 } 17563 17564 if (ipif->ipif_isv6) 17565 ipif_ndp_down(ipif); 17566 17567 /* 17568 * If mp is NULL the caller will wait for the appropriate refcnt. 17569 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down 17570 * and ill_delete -> ipif_free -> ipif_down 17571 */ 17572 if (mp == NULL) { 17573 ASSERT(q == NULL); 17574 return (0); 17575 } 17576 17577 if (CONN_Q(q)) { 17578 connp = Q_TO_CONN(q); 17579 mutex_enter(&connp->conn_lock); 17580 } else { 17581 connp = NULL; 17582 } 17583 mutex_enter(&ill->ill_lock); 17584 /* 17585 * Are there any ire's pointing to this ipif that are still active ? 17586 * If this is the last ipif going down, are there any ire's pointing 17587 * to this ill that are still active ? 17588 */ 17589 if (ipif_is_quiescent(ipif)) { 17590 mutex_exit(&ill->ill_lock); 17591 if (connp != NULL) 17592 mutex_exit(&connp->conn_lock); 17593 return (0); 17594 } 17595 17596 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 17597 ill->ill_name, (void *)ill)); 17598 /* 17599 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 17600 * drops down, the operation will be restarted by ipif_ill_refrele_tail 17601 * which in turn is called by the last refrele on the ipif/ill/ire. 17602 */ 17603 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 17604 if (!success) { 17605 /* The conn is closing. So just return */ 17606 ASSERT(connp != NULL); 17607 mutex_exit(&ill->ill_lock); 17608 mutex_exit(&connp->conn_lock); 17609 return (EINTR); 17610 } 17611 17612 mutex_exit(&ill->ill_lock); 17613 if (connp != NULL) 17614 mutex_exit(&connp->conn_lock); 17615 return (EINPROGRESS); 17616 } 17617 /* ipif_down_tail: unbinds from the driver via ill_dl_down() when no ipif on the ill is up (unless this is a logical down), clears ill_logical_down, takes ARP down for IPv4 (and for IPv6 ills flagged ILLF_XRESOLV) and sends routing socket messages for the ipif. */ 17618 static void 17619 ipif_down_tail(ipif_t *ipif) 17620 { 17621 ill_t *ill = ipif->ipif_ill; 17622 17623 /* 17624 * Skip any loopback interface (null wq).
17625 * If this is the last logical interface on the ill 17626 * have ill_dl_down tell the driver we are gone (unbind) 17627 * Note that lun 0 can ipif_down even though 17628 * there are other logical units that are up. 17629 * This occurs e.g. when we change a "significant" IFF_ flag. 17630 */ 17631 if (ipif->ipif_ill->ill_wq != NULL) { 17632 if (!ill->ill_logical_down && (ill->ill_ipif_up_count == 0) && 17633 ill->ill_dl_up) { 17634 ill_dl_down(ill); 17635 } 17636 } 17637 ill->ill_logical_down = 0; 17638 17639 /* 17640 * Have to be after removing the routes in ipif_down_delete_ire. 17641 */ 17642 if (ipif->ipif_isv6) { 17643 if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) 17644 ipif_arp_down(ipif); 17645 } else { 17646 ipif_arp_down(ipif); 17647 } 17648 17649 ip_rts_ifmsg(ipif); 17650 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); 17651 } 17652 17653 /* 17654 * Bring interface logically down without bringing the physical interface 17655 * down e.g. when the netmask is changed. This avoids long lasting link 17656 * negotiations between an ethernet interface and certain switches. 17657 */ 17658 static int 17659 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 17660 { 17661 /* 17662 * The ill_logical_down flag is a transient flag. It is set here 17663 * and is cleared once the down has completed in ipif_down_tail. 17664 * This flag does not indicate whether the ill stream is in the 17665 * DL_BOUND state with the driver. Instead this flag is used by 17666 * ipif_down_tail to determine whether to DL_UNBIND the stream with 17667 * the driver. The state of the ill stream i.e. whether it is 17668 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 17669 */ 17670 ipif->ipif_ill->ill_logical_down = 1; 17671 return (ipif_down(ipif, q, mp)); 17672 } 17673 17674 /* 17675 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP.
17676 * If the usesrc client ILL is already part of a usesrc group or not, 17677 * in either case a ire_stq with the matching usesrc client ILL will 17678 * locate the IRE's that need to be deleted. We want IREs to be created 17679 * with the new source address. 17680 */ 17681 static void 17682 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 17683 { 17684 ill_t *ucill = (ill_t *)ill_arg; 17685 17686 ASSERT(IAM_WRITER_ILL(ucill)); 17687 17688 if (ire->ire_stq == NULL) 17689 return; 17690 17691 if ((ire->ire_type == IRE_CACHE) && 17692 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 17693 ire_delete(ire); 17694 } 17695 17696 /* 17697 * ire_walk routine to delete every IRE dependent on the interface 17698 * address that is going down. (Always called as writer.) 17699 * Works for both v4 and v6. 17700 * In addition for checking for ire_ipif matches it also checks for 17701 * IRE_CACHE entries which have the same source address as the 17702 * disappearing ipif since ipif_select_source might have picked 17703 * that source. Note that ipif_down/ipif_update_other_ipifs takes 17704 * care of any IRE_INTERFACE with the disappearing source address. 17705 */ 17706 static void 17707 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 17708 { 17709 ipif_t *ipif = (ipif_t *)ipif_arg; 17710 ill_t *ire_ill; 17711 ill_t *ipif_ill; 17712 17713 ASSERT(IAM_WRITER_IPIF(ipif)); 17714 if (ire->ire_ipif == NULL) 17715 return; 17716 17717 /* 17718 * For IPv4, we derive source addresses for an IRE from ipif's 17719 * belonging to the same IPMP group as the IRE's outgoing 17720 * interface. If an IRE's outgoing interface isn't in the 17721 * same IPMP group as a particular ipif, then that ipif 17722 * couldn't have been used as a source address for this IRE. 17723 * 17724 * For IPv6, source addresses are only restricted to the IPMP group 17725 * if the IRE is for a link-local address or a multicast address. 
17726 * Otherwise, source addresses for an IRE can be chosen from 17727 * interfaces other than the the outgoing interface for that IRE. 17728 * 17729 * For source address selection details, see ipif_select_source() 17730 * and ipif_select_source_v6(). 17731 */ 17732 if (ire->ire_ipversion == IPV4_VERSION || 17733 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || 17734 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 17735 ire_ill = ire->ire_ipif->ipif_ill; 17736 ipif_ill = ipif->ipif_ill; 17737 17738 if (ire_ill->ill_group != ipif_ill->ill_group) { 17739 return; 17740 } 17741 } 17742 17743 17744 if (ire->ire_ipif != ipif) { 17745 /* 17746 * Look for a matching source address. 17747 */ 17748 if (ire->ire_type != IRE_CACHE) 17749 return; 17750 if (ipif->ipif_flags & IPIF_NOLOCAL) 17751 return; 17752 17753 if (ire->ire_ipversion == IPV4_VERSION) { 17754 if (ire->ire_src_addr != ipif->ipif_src_addr) 17755 return; 17756 } else { 17757 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 17758 &ipif->ipif_v6lcl_addr)) 17759 return; 17760 } 17761 ire_delete(ire); 17762 return; 17763 } 17764 /* 17765 * ire_delete() will do an ire_flush_cache which will delete 17766 * all ire_ipif matches 17767 */ 17768 ire_delete(ire); 17769 } 17770 17771 /* 17772 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 17773 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 17774 * 2) when an interface is brought up or down (on that ill). 17775 * This ensures that the IRE_CACHE entries don't retain stale source 17776 * address selection results. 17777 */ 17778 void 17779 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 17780 { 17781 ill_t *ill = (ill_t *)ill_arg; 17782 ill_t *ipif_ill; 17783 17784 ASSERT(IAM_WRITER_ILL(ill)); 17785 /* 17786 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 17787 * Hence this should be IRE_CACHE. 17788 */ 17789 ASSERT(ire->ire_type == IRE_CACHE); 17790 17791 /* 17792 * We are called for IRE_CACHES whose ire_ipif matches ill. 
17793 * We are only interested in IRE_CACHES that has borrowed 17794 * the source address from ill_arg e.g. ipif_up_done[_v6] 17795 * for which we need to look at ire_ipif->ipif_ill match 17796 * with ill. 17797 */ 17798 ASSERT(ire->ire_ipif != NULL); 17799 ipif_ill = ire->ire_ipif->ipif_ill; 17800 if (ipif_ill == ill || (ill->ill_group != NULL && 17801 ipif_ill->ill_group == ill->ill_group)) { 17802 ire_delete(ire); 17803 } 17804 } 17805 17806 /* 17807 * Delete all the ire whose stq references ill_arg. 17808 */ 17809 static void 17810 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 17811 { 17812 ill_t *ill = (ill_t *)ill_arg; 17813 ill_t *ire_ill; 17814 17815 ASSERT(IAM_WRITER_ILL(ill)); 17816 /* 17817 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 17818 * Hence this should be IRE_CACHE. 17819 */ 17820 ASSERT(ire->ire_type == IRE_CACHE); 17821 17822 /* 17823 * We are called for IRE_CACHES whose ire_stq and ire_ipif 17824 * matches ill. We are only interested in IRE_CACHES that 17825 * has ire_stq->q_ptr pointing at ill_arg. Thus we do the 17826 * filtering here. 17827 */ 17828 ire_ill = (ill_t *)ire->ire_stq->q_ptr; 17829 17830 if (ire_ill == ill) 17831 ire_delete(ire); 17832 } 17833 17834 /* 17835 * This is called when an ill leaves the group. We want to delete 17836 * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is 17837 * pointing at ill. 17838 */ 17839 static void 17840 illgrp_cache_delete(ire_t *ire, char *ill_arg) 17841 { 17842 ill_t *ill = (ill_t *)ill_arg; 17843 17844 ASSERT(IAM_WRITER_ILL(ill)); 17845 ASSERT(ill->ill_group == NULL); 17846 /* 17847 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 17848 * Hence this should be IRE_CACHE. 17849 */ 17850 ASSERT(ire->ire_type == IRE_CACHE); 17851 /* 17852 * We are called for IRE_CACHES whose ire_stq and ire_ipif 17853 * matches ill. We are interested in both. 
17854 */ 17855 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || 17856 (ire->ire_ipif->ipif_ill == ill)); 17857 17858 ire_delete(ire); 17859 } 17860 17861 /* 17862 * Initiate deallocate of an IPIF. Always called as writer. Called by 17863 * ill_delete or ip_sioctl_removeif. 17864 */ 17865 static void 17866 ipif_free(ipif_t *ipif) 17867 { 17868 ASSERT(IAM_WRITER_IPIF(ipif)); 17869 17870 /* Remove conn references */ 17871 reset_conn_ipif(ipif); 17872 17873 /* 17874 * Make sure we have valid net and subnet broadcast ire's for the 17875 * other ipif's which share them with this ipif. 17876 */ 17877 if (!ipif->ipif_isv6) 17878 ipif_check_bcast_ires(ipif); 17879 17880 /* 17881 * Take down the interface. We can be called either from ill_delete 17882 * or from ip_sioctl_removeif. 17883 */ 17884 (void) ipif_down(ipif, NULL, NULL); 17885 17886 rw_enter(&ill_g_lock, RW_WRITER); 17887 /* Remove pointers to this ill in the multicast routing tables */ 17888 reset_mrt_vif_ipif(ipif); 17889 rw_exit(&ill_g_lock); 17890 } 17891 17892 static void 17893 ipif_free_tail(ipif_t *ipif) 17894 { 17895 mblk_t *mp; 17896 ipif_t **ipifp; 17897 17898 /* 17899 * Free state for addition IRE_IF_[NO]RESOLVER ire's. 17900 */ 17901 mutex_enter(&ipif->ipif_saved_ire_lock); 17902 mp = ipif->ipif_saved_ire_mp; 17903 ipif->ipif_saved_ire_mp = NULL; 17904 mutex_exit(&ipif->ipif_saved_ire_lock); 17905 freemsg(mp); 17906 17907 /* 17908 * Need to hold both ill_g_lock and ill_lock while 17909 * inserting or removing an ipif from the linked list 17910 * of ipifs hanging off the ill. 17911 */ 17912 rw_enter(&ill_g_lock, RW_WRITER); 17913 /* 17914 * Remove all multicast memberships on the interface now. 17915 * This removes IPv4 multicast memberships joined within 17916 * the kernel as ipif_down does not do ipif_multicast_down 17917 * for IPv4. IPv6 is not handled here as the multicast memberships 17918 * are based on ill and not on ipif. 
17919 */ 17920 ilm_free(ipif); 17921 17922 /* 17923 * Since we held the ill_g_lock while doing the ilm_free above, 17924 * we can assert the ilms were really deleted and not just marked 17925 * ILM_DELETED. 17926 */ 17927 ASSERT(ilm_walk_ipif(ipif) == 0); 17928 17929 17930 IPIF_TRACE_CLEANUP(ipif); 17931 17932 /* Ask SCTP to take it out of it list */ 17933 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 17934 17935 mutex_enter(&ipif->ipif_ill->ill_lock); 17936 /* Get it out of the ILL interface list. */ 17937 ipifp = &ipif->ipif_ill->ill_ipif; 17938 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 17939 if (*ipifp == ipif) { 17940 *ipifp = ipif->ipif_next; 17941 break; 17942 } 17943 } 17944 17945 mutex_exit(&ipif->ipif_ill->ill_lock); 17946 rw_exit(&ill_g_lock); 17947 17948 mutex_destroy(&ipif->ipif_saved_ire_lock); 17949 /* Free the memory. */ 17950 mi_free((char *)ipif); 17951 } 17952 17953 /* 17954 * Returns an ipif name in the form "ill_name/unit" if ipif_id is not zero, 17955 * "ill_name" otherwise. 17956 */ 17957 char * 17958 ipif_get_name(ipif_t *ipif, char *buf, int len) 17959 { 17960 char lbuf[32]; 17961 char *name; 17962 size_t name_len; 17963 17964 buf[0] = '\0'; 17965 if (!ipif) 17966 return (buf); 17967 name = ipif->ipif_ill->ill_name; 17968 name_len = ipif->ipif_ill->ill_name_length; 17969 if (ipif->ipif_id != 0) { 17970 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 17971 ipif->ipif_id); 17972 name = lbuf; 17973 name_len = mi_strlen(name) + 1; 17974 } 17975 len -= 1; 17976 buf[len] = '\0'; 17977 len = MIN(len, name_len); 17978 bcopy(name, buf, len); 17979 return (buf); 17980 } 17981 17982 /* 17983 * Find an IPIF based on the name passed in. Names can be of the 17984 * form <phys> (e.g., le0), <phys>:<#> (e.g., le0:1), 17985 * The <phys> string can have forms like <dev><#> (e.g., le0), 17986 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3). 17987 * When there is no colon, the implied unit id is zero. 
<phys> must 17988 * correspond to the name of an ILL. (May be called as writer.) 17989 */ 17990 static ipif_t * 17991 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 17992 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 17993 mblk_t *mp, ipsq_func_t func, int *error) 17994 { 17995 char *cp; 17996 char *endp; 17997 long id; 17998 ill_t *ill; 17999 ipif_t *ipif; 18000 uint_t ire_type; 18001 boolean_t did_alloc = B_FALSE; 18002 ipsq_t *ipsq; 18003 18004 if (error != NULL) 18005 *error = 0; 18006 18007 /* 18008 * If the caller wants to us to create the ipif, make sure we have a 18009 * valid zoneid 18010 */ 18011 ASSERT(!do_alloc || zoneid != ALL_ZONES); 18012 18013 if (namelen == 0) { 18014 if (error != NULL) 18015 *error = ENXIO; 18016 return (NULL); 18017 } 18018 18019 *exists = B_FALSE; 18020 /* Look for a colon in the name. */ 18021 endp = &name[namelen]; 18022 for (cp = endp; --cp > name; ) { 18023 if (*cp == IPIF_SEPARATOR_CHAR) 18024 break; 18025 } 18026 18027 if (*cp == IPIF_SEPARATOR_CHAR) { 18028 /* 18029 * Reject any non-decimal aliases for logical 18030 * interfaces. Aliases with leading zeroes 18031 * are also rejected as they introduce ambiguity 18032 * in the naming of the interfaces. 18033 * In order to confirm with existing semantics, 18034 * and to not break any programs/script relying 18035 * on that behaviour, if<0>:0 is considered to be 18036 * a valid interface. 18037 * 18038 * If alias has two or more digits and the first 18039 * is zero, fail. 18040 */ 18041 if (&cp[2] < endp && cp[1] == '0') 18042 return (NULL); 18043 } 18044 18045 if (cp <= name) { 18046 cp = endp; 18047 } else { 18048 *cp = '\0'; 18049 } 18050 18051 /* 18052 * Look up the ILL, based on the portion of the name 18053 * before the slash. ill_lookup_on_name returns a held ill. 18054 * Temporary to check whether ill exists already. If so 18055 * ill_lookup_on_name will clear it. 
18056 */ 18057 ill = ill_lookup_on_name(name, do_alloc, isv6, 18058 q, mp, func, error, &did_alloc); 18059 if (cp != endp) 18060 *cp = IPIF_SEPARATOR_CHAR; 18061 if (ill == NULL) 18062 return (NULL); 18063 18064 /* Establish the unit number in the name. */ 18065 id = 0; 18066 if (cp < endp && *endp == '\0') { 18067 /* If there was a colon, the unit number follows. */ 18068 cp++; 18069 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 18070 ill_refrele(ill); 18071 if (error != NULL) 18072 *error = ENXIO; 18073 return (NULL); 18074 } 18075 } 18076 18077 GRAB_CONN_LOCK(q); 18078 mutex_enter(&ill->ill_lock); 18079 /* Now see if there is an IPIF with this unit number. */ 18080 for (ipif = ill->ill_ipif; ipif; ipif = ipif->ipif_next) { 18081 if (ipif->ipif_id == id) { 18082 if (zoneid != ALL_ZONES && 18083 zoneid != ipif->ipif_zoneid) { 18084 mutex_exit(&ill->ill_lock); 18085 RELEASE_CONN_LOCK(q); 18086 ill_refrele(ill); 18087 if (error != NULL) 18088 *error = ENXIO; 18089 return (NULL); 18090 } 18091 /* 18092 * The block comment at the start of ipif_down 18093 * explains the use of the macros used below 18094 */ 18095 if (IPIF_CAN_LOOKUP(ipif)) { 18096 ipif_refhold_locked(ipif); 18097 mutex_exit(&ill->ill_lock); 18098 if (!did_alloc) 18099 *exists = B_TRUE; 18100 /* 18101 * Drop locks before calling ill_refrele 18102 * since it can potentially call into 18103 * ipif_ill_refrele_tail which can end up 18104 * in trying to acquire any lock. 
18105 */ 18106 RELEASE_CONN_LOCK(q); 18107 ill_refrele(ill); 18108 return (ipif); 18109 } else if (IPIF_CAN_WAIT(ipif, q)) { 18110 ipsq = ill->ill_phyint->phyint_ipsq; 18111 mutex_enter(&ipsq->ipsq_lock); 18112 mutex_exit(&ill->ill_lock); 18113 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 18114 mutex_exit(&ipsq->ipsq_lock); 18115 RELEASE_CONN_LOCK(q); 18116 ill_refrele(ill); 18117 *error = EINPROGRESS; 18118 return (NULL); 18119 } 18120 } 18121 } 18122 RELEASE_CONN_LOCK(q); 18123 18124 if (!do_alloc) { 18125 mutex_exit(&ill->ill_lock); 18126 ill_refrele(ill); 18127 if (error != NULL) 18128 *error = ENXIO; 18129 return (NULL); 18130 } 18131 18132 /* 18133 * If none found, atomically allocate and return a new one. 18134 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 18135 * to support "receive only" use of lo0:1 etc. as is still done 18136 * below as an initial guess. 18137 * However, this is now likely to be overriden later in ipif_up_done() 18138 * when we know for sure what address has been configured on the 18139 * interface, since we might have more than one loopback interface 18140 * with a loopback address, e.g. in the case of zones, and all the 18141 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 18142 */ 18143 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 18144 ire_type = IRE_LOOPBACK; 18145 else 18146 ire_type = IRE_LOCAL; 18147 ipif = ipif_allocate(ill, id, ire_type, B_TRUE); 18148 if (ipif != NULL) 18149 ipif_refhold_locked(ipif); 18150 else if (error != NULL) 18151 *error = ENOMEM; 18152 mutex_exit(&ill->ill_lock); 18153 ill_refrele(ill); 18154 return (ipif); 18155 } 18156 18157 /* 18158 * This routine is called whenever a new address comes up on an ipif. If 18159 * we are configured to respond to address mask requests, then we are supposed 18160 * to broadcast an address mask reply at this time. This routine is also 18161 * called if we are already up, but a netmask change is made. 
This is legal 18162 * but might not make the system manager very popular. (May be called 18163 * as writer.) 18164 */ 18165 static void 18166 ipif_mask_reply(ipif_t *ipif) 18167 { 18168 icmph_t *icmph; 18169 ipha_t *ipha; 18170 mblk_t *mp; 18171 18172 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 18173 18174 if (!ip_respond_to_address_mask_broadcast) 18175 return; 18176 18177 /* ICMP mask reply is IPv4 only */ 18178 ASSERT(!ipif->ipif_isv6); 18179 /* ICMP mask reply is not for a loopback interface */ 18180 ASSERT(ipif->ipif_ill->ill_wq != NULL); 18181 18182 mp = allocb(REPLY_LEN, BPRI_HI); 18183 if (mp == NULL) 18184 return; 18185 mp->b_wptr = mp->b_rptr + REPLY_LEN; 18186 18187 ipha = (ipha_t *)mp->b_rptr; 18188 bzero(ipha, REPLY_LEN); 18189 *ipha = icmp_ipha; 18190 ipha->ipha_ttl = ip_broadcast_ttl; 18191 ipha->ipha_src = ipif->ipif_src_addr; 18192 ipha->ipha_dst = ipif->ipif_brd_addr; 18193 ipha->ipha_length = htons(REPLY_LEN); 18194 ipha->ipha_ident = 0; 18195 18196 icmph = (icmph_t *)&ipha[1]; 18197 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 18198 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 18199 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 18200 if (icmph->icmph_checksum == 0) 18201 icmph->icmph_checksum = 0xffff; 18202 18203 put(ipif->ipif_wq, mp); 18204 18205 #undef REPLY_LEN 18206 } 18207 18208 /* 18209 * When the mtu in the ipif changes, we call this routine through ire_walk 18210 * to update all the relevant IREs. 18211 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 18212 */ 18213 static void 18214 ipif_mtu_change(ire_t *ire, char *ipif_arg) 18215 { 18216 ipif_t *ipif = (ipif_t *)ipif_arg; 18217 18218 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 18219 return; 18220 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 18221 } 18222 18223 /* 18224 * When the mtu in the ill changes, we call this routine through ire_walk 18225 * to update all the relevant IREs. 
18226 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 18227 */ 18228 void 18229 ill_mtu_change(ire_t *ire, char *ill_arg) 18230 { 18231 ill_t *ill = (ill_t *)ill_arg; 18232 18233 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 18234 return; 18235 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 18236 } 18237 18238 /* 18239 * Join the ipif specific multicast groups. 18240 * Must be called after a mapping has been set up in the resolver. (Always 18241 * called as writer.) 18242 */ 18243 void 18244 ipif_multicast_up(ipif_t *ipif) 18245 { 18246 int err, index; 18247 ill_t *ill; 18248 18249 ASSERT(IAM_WRITER_IPIF(ipif)); 18250 18251 ill = ipif->ipif_ill; 18252 index = ill->ill_phyint->phyint_ifindex; 18253 18254 ip1dbg(("ipif_multicast_up\n")); 18255 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 18256 return; 18257 18258 if (ipif->ipif_isv6) { 18259 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 18260 return; 18261 18262 /* Join the all hosts multicast address */ 18263 ip1dbg(("ipif_multicast_up - addmulti\n")); 18264 /* 18265 * Passing B_TRUE means we have to join the multicast 18266 * membership on this interface even though this is 18267 * FAILED. If we join on a different one in the group, 18268 * we will not be able to delete the membership later 18269 * as we currently don't track where we join when we 18270 * join within the kernel unlike applications where 18271 * we have ilg/ilg_orig_index. See ip_addmulti_v6 18272 * for more on this. 
18273 */ 18274 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, 18275 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 18276 if (err != 0) { 18277 ip0dbg(("ipif_multicast_up: " 18278 "all_hosts_mcast failed %d\n", 18279 err)); 18280 return; 18281 } 18282 /* 18283 * Enable multicast for the solicited node multicast address 18284 */ 18285 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 18286 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 18287 18288 ipv6_multi.s6_addr32[3] |= 18289 ipif->ipif_v6lcl_addr.s6_addr32[3]; 18290 18291 err = ip_addmulti_v6(&ipv6_multi, ill, index, 18292 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, 18293 NULL); 18294 if (err != 0) { 18295 ip0dbg(("ipif_multicast_up: solicited MC" 18296 " failed %d\n", err)); 18297 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, 18298 ill, ill->ill_phyint->phyint_ifindex, 18299 ipif->ipif_zoneid, B_TRUE, B_TRUE); 18300 return; 18301 } 18302 } 18303 } else { 18304 if (ipif->ipif_lcl_addr == INADDR_ANY) 18305 return; 18306 18307 /* Join the all hosts multicast address */ 18308 ip1dbg(("ipif_multicast_up - addmulti\n")); 18309 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 18310 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 18311 if (err) { 18312 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 18313 return; 18314 } 18315 } 18316 ipif->ipif_multicast_up = 1; 18317 } 18318 18319 /* 18320 * Blow away any IPv6 multicast groups that we joined in ipif_multicast_up(); 18321 * any explicit memberships are blown away in ill_leave_multicast() when the 18322 * ill is brought down. 18323 */ 18324 static void 18325 ipif_multicast_down(ipif_t *ipif) 18326 { 18327 int err; 18328 18329 ASSERT(IAM_WRITER_IPIF(ipif)); 18330 18331 ip1dbg(("ipif_multicast_down\n")); 18332 if (!ipif->ipif_multicast_up) 18333 return; 18334 18335 ASSERT(ipif->ipif_isv6); 18336 18337 ip1dbg(("ipif_multicast_down - delmulti\n")); 18338 18339 /* 18340 * Leave the all hosts multicast address. 
Similar to ip_addmulti_v6, 18341 * we should look for ilms on this ill rather than the ones that have 18342 * been failed over here. They are here temporarily. As 18343 * ipif_multicast_up has joined on this ill, we should delete only 18344 * from this ill. 18345 */ 18346 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 18347 ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, 18348 B_TRUE, B_TRUE); 18349 if (err != 0) { 18350 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", 18351 err)); 18352 } 18353 /* 18354 * Disable multicast for the solicited node multicast address 18355 */ 18356 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 18357 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 18358 18359 ipv6_multi.s6_addr32[3] |= 18360 ipif->ipif_v6lcl_addr.s6_addr32[3]; 18361 18362 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 18363 ipif->ipif_ill->ill_phyint->phyint_ifindex, 18364 ipif->ipif_zoneid, B_TRUE, B_TRUE); 18365 18366 if (err != 0) { 18367 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 18368 err)); 18369 } 18370 } 18371 18372 ipif->ipif_multicast_up = 0; 18373 } 18374 18375 /* 18376 * Used when an interface comes up to recreate any extra routes on this 18377 * interface. 
18378 */ 18379 static ire_t ** 18380 ipif_recover_ire(ipif_t *ipif) 18381 { 18382 mblk_t *mp; 18383 ire_t **ipif_saved_irep; 18384 ire_t **irep; 18385 18386 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 18387 ipif->ipif_id)); 18388 18389 mutex_enter(&ipif->ipif_saved_ire_lock); 18390 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 18391 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 18392 if (ipif_saved_irep == NULL) { 18393 mutex_exit(&ipif->ipif_saved_ire_lock); 18394 return (NULL); 18395 } 18396 18397 irep = ipif_saved_irep; 18398 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 18399 ire_t *ire; 18400 queue_t *rfq; 18401 queue_t *stq; 18402 ifrt_t *ifrt; 18403 uchar_t *src_addr; 18404 uchar_t *gateway_addr; 18405 mblk_t *resolver_mp; 18406 ushort_t type; 18407 18408 /* 18409 * When the ire was initially created and then added in 18410 * ip_rt_add(), it was created either using ipif->ipif_net_type 18411 * in the case of a traditional interface route, or as one of 18412 * the IRE_OFFSUBNET types (with the exception of 18413 * IRE_HOST_REDIRECT which is created by icmp_redirect() and 18414 * which we don't need to save or recover). In the case where 18415 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 18416 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 18417 * to satisfy software like GateD and Sun Cluster which creates 18418 * routes using the the loopback interface's address as a 18419 * gateway. 
18420 * 18421 * As ifrt->ifrt_type reflects the already updated ire_type and 18422 * since ire_create() expects that IRE_IF_NORESOLVER will have 18423 * a valid ire_dlureq_mp field (which doesn't make sense for a 18424 * IRE_LOOPBACK), ire_create() will be called in the same way 18425 * here as in ip_rt_add(), namely using ipif->ipif_net_type when 18426 * the route looks like a traditional interface route (where 18427 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 18428 * the saved ifrt->ifrt_type. This means that in the case where 18429 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 18430 * ire_create() will be an IRE_LOOPBACK, it will then be turned 18431 * into an IRE_IF_NORESOLVER and then added by ire_add(). 18432 */ 18433 ifrt = (ifrt_t *)mp->b_rptr; 18434 if (ifrt->ifrt_type & IRE_INTERFACE) { 18435 rfq = NULL; 18436 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 18437 ? ipif->ipif_rq : ipif->ipif_wq; 18438 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 18439 ? (uint8_t *)&ifrt->ifrt_src_addr 18440 : (uint8_t *)&ipif->ipif_src_addr; 18441 gateway_addr = NULL; 18442 resolver_mp = ipif->ipif_resolver_mp; 18443 type = ipif->ipif_net_type; 18444 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 18445 /* Recover multiroute broadcast IRE. */ 18446 rfq = ipif->ipif_rq; 18447 stq = ipif->ipif_wq; 18448 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 18449 ? (uint8_t *)&ifrt->ifrt_src_addr 18450 : (uint8_t *)&ipif->ipif_src_addr; 18451 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 18452 resolver_mp = ipif->ipif_bcast_mp; 18453 type = ifrt->ifrt_type; 18454 } else { 18455 rfq = NULL; 18456 stq = NULL; 18457 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 18458 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 18459 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 18460 resolver_mp = NULL; 18461 type = ifrt->ifrt_type; 18462 } 18463 18464 /* 18465 * Create a copy of the IRE with the saved address and netmask. 
18466 */ 18467 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 18468 "0x%x/0x%x\n", 18469 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 18470 ntohl(ifrt->ifrt_addr), 18471 ntohl(ifrt->ifrt_mask))); 18472 ire = ire_create( 18473 (uint8_t *)&ifrt->ifrt_addr, 18474 (uint8_t *)&ifrt->ifrt_mask, 18475 src_addr, 18476 gateway_addr, 18477 NULL, 18478 &ifrt->ifrt_max_frag, 18479 NULL, 18480 rfq, 18481 stq, 18482 type, 18483 resolver_mp, 18484 ipif, 18485 NULL, 18486 0, 18487 0, 18488 0, 18489 ifrt->ifrt_flags, 18490 &ifrt->ifrt_iulp_info); 18491 18492 if (ire == NULL) { 18493 mutex_exit(&ipif->ipif_saved_ire_lock); 18494 kmem_free(ipif_saved_irep, 18495 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 18496 return (NULL); 18497 } 18498 18499 /* 18500 * Some software (for example, GateD and Sun Cluster) attempts 18501 * to create (what amount to) IRE_PREFIX routes with the 18502 * loopback address as the gateway. This is primarily done to 18503 * set up prefixes with the RTF_REJECT flag set (for example, 18504 * when generating aggregate routes.) 18505 * 18506 * If the IRE type (as defined by ipif->ipif_net_type) is 18507 * IRE_LOOPBACK, then we map the request into a 18508 * IRE_IF_NORESOLVER. 18509 */ 18510 if (ipif->ipif_net_type == IRE_LOOPBACK) 18511 ire->ire_type = IRE_IF_NORESOLVER; 18512 /* 18513 * ire held by ire_add, will be refreled' towards the 18514 * the end of ipif_up_done 18515 */ 18516 (void) ire_add(&ire, NULL, NULL, NULL); 18517 *irep = ire; 18518 irep++; 18519 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 18520 } 18521 mutex_exit(&ipif->ipif_saved_ire_lock); 18522 return (ipif_saved_irep); 18523 } 18524 18525 /* 18526 * Used to set the netmask and broadcast address to default values when the 18527 * interface is brought up. (Always called as writer.) 
18528 */ 18529 static void 18530 ipif_set_default(ipif_t *ipif) 18531 { 18532 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 18533 18534 if (!ipif->ipif_isv6) { 18535 /* 18536 * Interface holds an IPv4 address. Default 18537 * mask is the natural netmask. 18538 */ 18539 if (!ipif->ipif_net_mask) { 18540 ipaddr_t v4mask; 18541 18542 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 18543 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 18544 } 18545 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 18546 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 18547 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 18548 } else { 18549 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 18550 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 18551 } 18552 /* 18553 * NOTE: SunOS 4.X does this even if the broadcast address 18554 * has been already set thus we do the same here. 18555 */ 18556 if (ipif->ipif_flags & IPIF_BROADCAST) { 18557 ipaddr_t v4addr; 18558 18559 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 18560 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 18561 } 18562 } else { 18563 /* 18564 * Interface holds an IPv6-only address. Default 18565 * mask is all-ones. 18566 */ 18567 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 18568 ipif->ipif_v6net_mask = ipv6_all_ones; 18569 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 18570 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 18571 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 18572 } else { 18573 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 18574 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 18575 } 18576 } 18577 } 18578 18579 /* 18580 * Return 0 if this address can be used as local address without causing 18581 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 18582 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 18583 * Special checks are needed to allow the same IPv6 link-local address 18584 * on different ills. 18585 * TODO: allowing the same site-local address on different ill's. 
18586 */ 18587 int 18588 ip_addr_availability_check(ipif_t *new_ipif) 18589 { 18590 in6_addr_t our_v6addr; 18591 ill_t *ill; 18592 ipif_t *ipif; 18593 ill_walk_context_t ctx; 18594 18595 ASSERT(IAM_WRITER_IPIF(new_ipif)); 18596 ASSERT(MUTEX_HELD(&ip_addr_avail_lock)); 18597 ASSERT(RW_READ_HELD(&ill_g_lock)); 18598 18599 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 18600 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 18601 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 18602 return (0); 18603 18604 our_v6addr = new_ipif->ipif_v6lcl_addr; 18605 18606 if (new_ipif->ipif_isv6) 18607 ill = ILL_START_WALK_V6(&ctx); 18608 else 18609 ill = ILL_START_WALK_V4(&ctx); 18610 18611 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18612 for (ipif = ill->ill_ipif; ipif != NULL; 18613 ipif = ipif->ipif_next) { 18614 if ((ipif == new_ipif) || 18615 !(ipif->ipif_flags & IPIF_UP) || 18616 (ipif->ipif_flags & IPIF_UNNUMBERED)) 18617 continue; 18618 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 18619 &our_v6addr)) { 18620 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 18621 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 18622 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 18623 ipif->ipif_flags |= IPIF_UNNUMBERED; 18624 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && 18625 new_ipif->ipif_ill != ill) 18626 continue; 18627 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && 18628 new_ipif->ipif_ill != ill) 18629 continue; 18630 else if (new_ipif->ipif_zoneid != 18631 ipif->ipif_zoneid && 18632 (ill->ill_phyint->phyint_flags & 18633 PHYI_LOOPBACK)) 18634 continue; 18635 else if (new_ipif->ipif_ill == ill) 18636 return (EADDRINUSE); 18637 else 18638 return (EADDRNOTAVAIL); 18639 } 18640 } 18641 } 18642 18643 return (0); 18644 } 18645 18646 /* 18647 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 18648 * IREs for the ipif. 18649 * When the routine returns EINPROGRESS then mp has been consumed and 18650 * the ioctl will be acked from ip_rput_dlpi. 
18651 */ 18652 static int 18653 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 18654 { 18655 ill_t *ill = ipif->ipif_ill; 18656 boolean_t isv6 = ipif->ipif_isv6; 18657 int err = 0; 18658 boolean_t success; 18659 18660 ASSERT(IAM_WRITER_IPIF(ipif)); 18661 18662 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 18663 18664 /* Shouldn't get here if it is already up. */ 18665 if (ipif->ipif_flags & IPIF_UP) 18666 return (EALREADY); 18667 18668 /* Skip arp/ndp for any loopback interface. */ 18669 if (ill->ill_wq != NULL) { 18670 conn_t *connp = Q_TO_CONN(q); 18671 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 18672 18673 if (!ill->ill_dl_up) { 18674 /* 18675 * ill_dl_up is not yet set. i.e. we are yet to 18676 * DL_BIND with the driver and this is the first 18677 * logical interface on the ill to become "up". 18678 * Tell the driver to get going (via DL_BIND_REQ). 18679 * Note that changing "significant" IFF_ flags 18680 * address/netmask etc cause a down/up dance, but 18681 * does not cause an unbind (DL_UNBIND) with the driver 18682 */ 18683 return (ill_dl_up(ill, ipif, mp, q)); 18684 } 18685 18686 /* 18687 * ipif_resolver_up may end up sending an 18688 * AR_INTERFACE_UP message to ARP, which would, in 18689 * turn send a DLPI message to the driver. ioctls are 18690 * serialized and so we cannot send more than one 18691 * interface up message at a time. If ipif_resolver_up 18692 * does send an interface up message to ARP, we get 18693 * EINPROGRESS and we will complete in ip_arp_done. 18694 */ 18695 18696 ASSERT(connp != NULL); 18697 ASSERT(ipsq->ipsq_pending_mp == NULL); 18698 mutex_enter(&connp->conn_lock); 18699 mutex_enter(&ill->ill_lock); 18700 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 18701 mutex_exit(&ill->ill_lock); 18702 mutex_exit(&connp->conn_lock); 18703 if (!success) 18704 return (EINTR); 18705 18706 /* 18707 * Crank up IPv6 neighbor discovery 18708 * Unlike ARP, this should complete when 18709 * ipif_ndp_up returns. 
However, for 18710 * ILLF_XRESOLV interfaces we also send a 18711 * AR_INTERFACE_UP to the external resolver. 18712 * That ioctl will complete in ip_rput. 18713 */ 18714 if (isv6) { 18715 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr, 18716 B_FALSE); 18717 if (err != 0) { 18718 mp = ipsq_pending_mp_get(ipsq, &connp); 18719 return (err); 18720 } 18721 } 18722 /* Now, ARP */ 18723 if ((err = ipif_resolver_up(ipif, B_FALSE)) == 18724 EINPROGRESS) { 18725 /* We will complete it in ip_arp_done */ 18726 return (err); 18727 } 18728 mp = ipsq_pending_mp_get(ipsq, &connp); 18729 ASSERT(mp != NULL); 18730 if (err != 0) 18731 return (err); 18732 } 18733 return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 18734 } 18735 18736 /* 18737 * Perform a bind for the physical device. 18738 * When the routine returns EINPROGRESS then mp has been consumed and 18739 * the ioctl will be acked from ip_rput_dlpi. 18740 * Allocate an unbind message and save it until ipif_down. 18741 */ 18742 static int 18743 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 18744 { 18745 mblk_t *areq_mp = NULL; 18746 mblk_t *bind_mp = NULL; 18747 mblk_t *unbind_mp = NULL; 18748 conn_t *connp; 18749 boolean_t success; 18750 18751 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 18752 ASSERT(IAM_WRITER_ILL(ill)); 18753 18754 ASSERT(mp != NULL); 18755 18756 /* Create a resolver cookie for ARP */ 18757 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 18758 areq_t *areq; 18759 uint16_t sap_addr; 18760 18761 areq_mp = ill_arp_alloc(ill, 18762 (uchar_t *)&ip_areq_template, 0); 18763 if (areq_mp == NULL) { 18764 return (ENOMEM); 18765 } 18766 freemsg(ill->ill_resolver_mp); 18767 ill->ill_resolver_mp = areq_mp; 18768 areq = (areq_t *)areq_mp->b_rptr; 18769 sap_addr = ill->ill_sap; 18770 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 18771 /* 18772 * Wait till we call ill_pending_mp_add to determine 18773 * the success before we free the ill_resolver_mp and 18774 * attach areq_mp in 
it's place. 18775 */ 18776 } 18777 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 18778 DL_BIND_REQ); 18779 if (bind_mp == NULL) 18780 goto bad; 18781 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 18782 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 18783 18784 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 18785 if (unbind_mp == NULL) 18786 goto bad; 18787 18788 /* 18789 * Record state needed to complete this operation when the 18790 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 18791 */ 18792 if (WR(q)->q_next == NULL) { 18793 connp = Q_TO_CONN(q); 18794 mutex_enter(&connp->conn_lock); 18795 } else { 18796 connp = NULL; 18797 } 18798 mutex_enter(&ipif->ipif_ill->ill_lock); 18799 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 18800 mutex_exit(&ipif->ipif_ill->ill_lock); 18801 if (connp != NULL) 18802 mutex_exit(&connp->conn_lock); 18803 if (!success) 18804 goto bad; 18805 18806 /* 18807 * Save the unbind message for ill_dl_down(); it will be consumed when 18808 * the interface goes down. 18809 */ 18810 ASSERT(ill->ill_unbind_mp == NULL); 18811 ill->ill_unbind_mp = unbind_mp; 18812 18813 ill_dlpi_send(ill, bind_mp); 18814 /* Send down link-layer capabilities probe if not already done. */ 18815 ill_capability_probe(ill); 18816 18817 /* 18818 * Sysid used to rely on the fact that netboots set domainname 18819 * and the like. Now that miniroot boots aren't strictly netboots 18820 * and miniroot network configuration is driven from userland 18821 * these things still need to be set. This situation can be detected 18822 * by comparing the interface being configured here to the one 18823 * dhcack was set to reference by the boot loader. Once sysid is 18824 * converted to use dhcp_ipc_getinfo() this call can go away. 
18825 */ 18826 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) && 18827 (strcmp(ill->ill_name, dhcack) == 0) && 18828 (strlen(srpc_domain) == 0)) { 18829 if (dhcpinit() != 0) 18830 cmn_err(CE_WARN, "no cached dhcp response"); 18831 } 18832 18833 /* 18834 * This operation will complete in ip_rput_dlpi with either 18835 * a DL_BIND_ACK or DL_ERROR_ACK. 18836 */ 18837 return (EINPROGRESS); 18838 bad: 18839 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 18840 /* 18841 * We don't have to check for possible removal from illgrp 18842 * as we have not yet inserted in illgrp. For groups 18843 * without names, this ipif is still not UP and hence 18844 * this could not have possibly had any influence in forming 18845 * groups. 18846 */ 18847 18848 if (bind_mp != NULL) 18849 freemsg(bind_mp); 18850 if (unbind_mp != NULL) 18851 freemsg(unbind_mp); 18852 return (ENOMEM); 18853 } 18854 18855 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 18856 18857 /* 18858 * DLPI and ARP is up. 18859 * Create all the IREs associated with an interface bring up multicast. 18860 * Set the interface flag and finish other initialization 18861 * that potentially had to be differed to after DL_BIND_ACK. 
18862 */ 18863 int 18864 ipif_up_done(ipif_t *ipif) 18865 { 18866 ire_t *ire_array[20]; 18867 ire_t **irep = ire_array; 18868 ire_t **irep1; 18869 ipaddr_t net_mask = 0; 18870 ipaddr_t subnet_mask, route_mask; 18871 ill_t *ill = ipif->ipif_ill; 18872 queue_t *stq; 18873 ipif_t *src_ipif; 18874 ipif_t *tmp_ipif; 18875 boolean_t flush_ire_cache = B_TRUE; 18876 int err = 0; 18877 phyint_t *phyi; 18878 ire_t **ipif_saved_irep = NULL; 18879 int ipif_saved_ire_cnt; 18880 int cnt; 18881 boolean_t src_ipif_held = B_FALSE; 18882 boolean_t ire_added = B_FALSE; 18883 boolean_t loopback = B_FALSE; 18884 18885 ip1dbg(("ipif_up_done(%s:%u)\n", 18886 ipif->ipif_ill->ill_name, ipif->ipif_id)); 18887 /* Check if this is a loopback interface */ 18888 if (ipif->ipif_ill->ill_wq == NULL) 18889 loopback = B_TRUE; 18890 18891 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 18892 /* 18893 * If all other interfaces for this ill are down or DEPRECATED, 18894 * or otherwise unsuitable for source address selection, remove 18895 * any IRE_CACHE entries for this ill to make sure source 18896 * address selection gets to take this new ipif into account. 18897 * No need to hold ill_lock while traversing the ipif list since 18898 * we are writer 18899 */ 18900 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 18901 tmp_ipif = tmp_ipif->ipif_next) { 18902 if (((tmp_ipif->ipif_flags & 18903 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 18904 !(tmp_ipif->ipif_flags & IPIF_UP)) || 18905 (tmp_ipif == ipif)) 18906 continue; 18907 /* first useable pre-existing interface */ 18908 flush_ire_cache = B_FALSE; 18909 break; 18910 } 18911 if (flush_ire_cache) 18912 ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, 18913 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 18914 18915 /* 18916 * Figure out which way the send-to queue should go. Only 18917 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 18918 * should show up here. 
18919 */ 18920 switch (ill->ill_net_type) { 18921 case IRE_IF_RESOLVER: 18922 stq = ill->ill_rq; 18923 break; 18924 case IRE_IF_NORESOLVER: 18925 case IRE_LOOPBACK: 18926 stq = ill->ill_wq; 18927 break; 18928 default: 18929 return (EINVAL); 18930 } 18931 18932 if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) { 18933 /* 18934 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 18935 * ipif_lookup_on_name(), but in the case of zones we can have 18936 * several loopback addresses on lo0. So all the interfaces with 18937 * loopback addresses need to be marked IRE_LOOPBACK. 18938 */ 18939 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 18940 htonl(INADDR_LOOPBACK)) 18941 ipif->ipif_ire_type = IRE_LOOPBACK; 18942 else 18943 ipif->ipif_ire_type = IRE_LOCAL; 18944 } 18945 18946 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { 18947 /* 18948 * Can't use our source address. Select a different 18949 * source address for the IRE_INTERFACE and IRE_LOCAL 18950 */ 18951 src_ipif = ipif_select_source(ipif->ipif_ill, 18952 ipif->ipif_subnet, ipif->ipif_zoneid); 18953 if (src_ipif == NULL) 18954 src_ipif = ipif; /* Last resort */ 18955 else 18956 src_ipif_held = B_TRUE; 18957 } else { 18958 src_ipif = ipif; 18959 } 18960 18961 /* Create all the IREs associated with this interface */ 18962 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 18963 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 18964 /* Register the source address for __sin6_src_id */ 18965 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 18966 ipif->ipif_zoneid); 18967 if (err != 0) { 18968 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 18969 return (err); 18970 } 18971 /* If the interface address is set, create the local IRE. 
*/ 18972 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 18973 (void *)ipif, 18974 ipif->ipif_ire_type, 18975 ntohl(ipif->ipif_lcl_addr))); 18976 *irep++ = ire_create( 18977 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 18978 (uchar_t *)&ip_g_all_ones, /* mask */ 18979 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 18980 NULL, /* no gateway */ 18981 NULL, 18982 &ip_loopback_mtuplus, /* max frag size */ 18983 NULL, 18984 ipif->ipif_rq, /* recv-from queue */ 18985 NULL, /* no send-to queue */ 18986 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 18987 NULL, 18988 ipif, 18989 NULL, 18990 0, 18991 0, 18992 0, 18993 (ipif->ipif_flags & IPIF_PRIVATE) ? 18994 RTF_PRIVATE : 0, 18995 &ire_uinfo_null); 18996 } else { 18997 ip1dbg(( 18998 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 18999 ipif->ipif_ire_type, 19000 ntohl(ipif->ipif_lcl_addr), 19001 (uint_t)ipif->ipif_flags)); 19002 } 19003 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 19004 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 19005 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 19006 } else { 19007 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 19008 } 19009 19010 subnet_mask = ipif->ipif_net_mask; 19011 19012 /* 19013 * If mask was not specified, use natural netmask of 19014 * interface address. Also, store this mask back into the 19015 * ipif struct. 19016 */ 19017 if (subnet_mask == 0) { 19018 subnet_mask = net_mask; 19019 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 19020 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 19021 ipif->ipif_v6subnet); 19022 } 19023 19024 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
*/ 19025 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 19026 ipif->ipif_subnet != INADDR_ANY) { 19027 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19028 19029 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19030 route_mask = IP_HOST_MASK; 19031 } else { 19032 route_mask = subnet_mask; 19033 } 19034 19035 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 19036 "creating if IRE ill_net_type 0x%x for 0x%x\n", 19037 (void *)ipif, (void *)ill, 19038 ill->ill_net_type, 19039 ntohl(ipif->ipif_subnet))); 19040 *irep++ = ire_create( 19041 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 19042 (uchar_t *)&route_mask, /* mask */ 19043 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 19044 NULL, /* no gateway */ 19045 NULL, 19046 &ipif->ipif_mtu, /* max frag */ 19047 NULL, 19048 NULL, /* no recv queue */ 19049 stq, /* send-to queue */ 19050 ill->ill_net_type, /* IF_[NO]RESOLVER */ 19051 ill->ill_resolver_mp, /* xmit header */ 19052 ipif, 19053 NULL, 19054 0, 19055 0, 19056 0, 19057 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 19058 &ire_uinfo_null); 19059 } 19060 19061 /* 19062 * If the interface address is set, create the broadcast IREs. 19063 * 19064 * ire_create_bcast checks if the proposed new IRE matches 19065 * any existing IRE's with the same physical interface (ILL). 19066 * This should get rid of duplicates. 19067 * ire_create_bcast also check IPIF_NOXMIT and does not create 19068 * any broadcast ires. 19069 */ 19070 if ((ipif->ipif_subnet != INADDR_ANY) && 19071 (ipif->ipif_flags & IPIF_BROADCAST)) { 19072 ipaddr_t addr; 19073 19074 ip1dbg(("ipif_up_done: creating broadcast IRE\n")); 19075 irep = ire_check_and_create_bcast(ipif, 0, irep, 19076 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19077 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, 19078 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19079 19080 /* 19081 * For backward compatibility, we need to create net 19082 * broadcast ire's based on the old "IP address class 19083 * system." 
The reason is that some old machines only 19084 * respond to these class derived net broadcast. 19085 * 19086 * But we should not create these net broadcast ire's if 19087 * the subnet_mask is shorter than the IP address class based 19088 * derived netmask. Otherwise, we may create a net 19089 * broadcast address which is the same as an IP address 19090 * on the subnet. Then TCP will refuse to talk to that 19091 * address. 19092 * 19093 * Nor do we need IRE_BROADCAST ire's for the interface 19094 * with the netmask as 0xFFFFFFFF, as IRE_LOCAL for that 19095 * interface is already created. Creating these broadcast 19096 * ire's will only create confusion as the "addr" is going 19097 * to be same as that of the IP address of the interface. 19098 */ 19099 if (net_mask < subnet_mask) { 19100 addr = net_mask & ipif->ipif_subnet; 19101 irep = ire_check_and_create_bcast(ipif, addr, irep, 19102 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19103 irep = ire_check_and_create_bcast(ipif, 19104 ~net_mask | addr, irep, 19105 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19106 } 19107 19108 if (subnet_mask != 0xFFFFFFFF) { 19109 addr = ipif->ipif_subnet; 19110 irep = ire_check_and_create_bcast(ipif, addr, irep, 19111 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19112 irep = ire_check_and_create_bcast(ipif, 19113 ~subnet_mask|addr, irep, 19114 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19115 } 19116 } 19117 19118 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19119 19120 /* If an earlier ire_create failed, get out now */ 19121 for (irep1 = irep; irep1 > ire_array; ) { 19122 irep1--; 19123 if (*irep1 == NULL) { 19124 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 19125 err = ENOMEM; 19126 goto bad; 19127 } 19128 } 19129 19130 /* 19131 * Need to atomically check for ip_addr_availablity_check 19132 * under ip_addr_avail_lock, and if it fails got bad, and remove 19133 * from group also.The ill_g_lock is grabbed as reader 19134 * just to make sure no new ills or new ipifs are being added 19135 * to the system 
while we are checking the uniqueness of addresses. 19136 */ 19137 rw_enter(&ill_g_lock, RW_READER); 19138 mutex_enter(&ip_addr_avail_lock); 19139 /* Mark it up, and increment counters. */ 19140 ill->ill_ipif_up_count++; 19141 ipif->ipif_flags |= IPIF_UP; 19142 err = ip_addr_availability_check(ipif); 19143 mutex_exit(&ip_addr_avail_lock); 19144 rw_exit(&ill_g_lock); 19145 19146 if (err != 0) { 19147 /* 19148 * Our address may already be up on the same ill. In this case, 19149 * the ARP entry for our ipif replaced the one for the other 19150 * ipif. So we don't want to delete it (otherwise the other ipif 19151 * would be unable to send packets). 19152 * ip_addr_availability_check() identifies this case for us and 19153 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 19154 * which is the expected error code. 19155 */ 19156 if (err == EADDRINUSE) { 19157 freemsg(ipif->ipif_arp_del_mp); 19158 ipif->ipif_arp_del_mp = NULL; 19159 err = EADDRNOTAVAIL; 19160 } 19161 ill->ill_ipif_up_count--; 19162 ipif->ipif_flags &= ~IPIF_UP; 19163 goto bad; 19164 } 19165 19166 /* 19167 * Add in all newly created IREs. ire_create_bcast() has 19168 * already checked for duplicates of the IRE_BROADCAST type. 19169 * We want to add before we call ifgrp_insert which wants 19170 * to know whether IRE_IF_RESOLVER exists or not. 19171 * 19172 * NOTE : We refrele the ire though we may branch to "bad" 19173 * later on where we do ire_delete. This is okay 19174 * because nobody can delete it as we are running 19175 * exclusively. 19176 */ 19177 for (irep1 = irep; irep1 > ire_array; ) { 19178 irep1--; 19179 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 19180 /* 19181 * refheld by ire_add. refele towards the end of the func 19182 */ 19183 (void) ire_add(irep1, NULL, NULL, NULL); 19184 } 19185 ire_added = B_TRUE; 19186 /* 19187 * Form groups if possible. 
19188 * 19189 * If we are supposed to be in a ill_group with a name, insert it 19190 * now as we know that at least one ipif is UP. Otherwise form 19191 * nameless groups. 19192 * 19193 * If ip_enable_group_ifs is set and ipif address is not 0, insert 19194 * this ipif into the appropriate interface group, or create a 19195 * new one. If this is already in a nameless group, we try to form 19196 * a bigger group looking at other ills potentially sharing this 19197 * ipif's prefix. 19198 */ 19199 phyi = ill->ill_phyint; 19200 if (phyi->phyint_groupname_len != 0) { 19201 ASSERT(phyi->phyint_groupname != NULL); 19202 if (ill->ill_ipif_up_count == 1) { 19203 ASSERT(ill->ill_group == NULL); 19204 err = illgrp_insert(&illgrp_head_v4, ill, 19205 phyi->phyint_groupname, NULL, B_TRUE); 19206 if (err != 0) { 19207 ip1dbg(("ipif_up_done: illgrp allocation " 19208 "failed, error %d\n", err)); 19209 goto bad; 19210 } 19211 } 19212 ASSERT(ill->ill_group != NULL); 19213 } 19214 19215 /* 19216 * When this is part of group, we need to make sure that 19217 * any broadcast ires created because of this ipif coming 19218 * UP gets marked/cleared with IRE_MARK_NORECV appropriately 19219 * so that we don't receive duplicate broadcast packets. 19220 */ 19221 if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) 19222 ipif_renominate_bcast(ipif); 19223 19224 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 19225 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 19226 ipif_saved_irep = ipif_recover_ire(ipif); 19227 19228 if (!loopback) { 19229 /* 19230 * If the broadcast address has been set, make sure it makes 19231 * sense based on the interface address. 19232 * Only match on ill since we are sharing broadcast addresses. 
19233 */ 19234 if ((ipif->ipif_brd_addr != INADDR_ANY) && 19235 (ipif->ipif_flags & IPIF_BROADCAST)) { 19236 ire_t *ire; 19237 19238 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 19239 IRE_BROADCAST, ipif, ALL_ZONES, 19240 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 19241 19242 if (ire == NULL) { 19243 /* 19244 * If there isn't a matching broadcast IRE, 19245 * revert to the default for this netmask. 19246 */ 19247 ipif->ipif_v6brd_addr = ipv6_all_zeros; 19248 mutex_enter(&ipif->ipif_ill->ill_lock); 19249 ipif_set_default(ipif); 19250 mutex_exit(&ipif->ipif_ill->ill_lock); 19251 } else { 19252 ire_refrele(ire); 19253 } 19254 } 19255 19256 } 19257 19258 19259 /* This is the first interface on this ill */ 19260 if (ipif->ipif_ipif_up_count == 1 && !loopback) { 19261 /* 19262 * Need to recover all multicast memberships in the driver. 19263 * This had to be deferred until we had attached. 19264 */ 19265 ill_recover_multicast(ill); 19266 } 19267 /* Join the allhosts multicast address */ 19268 ipif_multicast_up(ipif); 19269 19270 if (!loopback) { 19271 /* 19272 * See whether anybody else would benefit from the 19273 * new ipif that we added. We call this always rather 19274 * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST 19275 * ipif is for the benefit of illgrp_insert (done above) 19276 * which does not do source address selection as it does 19277 * not want to re-create interface routes that we are 19278 * having reference to it here. 19279 */ 19280 ill_update_source_selection(ill); 19281 } 19282 19283 for (irep1 = irep; irep1 > ire_array; ) { 19284 irep1--; 19285 if (*irep1 != NULL) { 19286 /* was held in ire_add */ 19287 ire_refrele(*irep1); 19288 } 19289 } 19290 19291 cnt = ipif_saved_ire_cnt; 19292 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 19293 if (*irep1 != NULL) { 19294 /* was held in ire_add */ 19295 ire_refrele(*irep1); 19296 } 19297 } 19298 19299 /* 19300 * This had to be deferred until we had bound. 
19301 * tell routing sockets that this interface is up 19302 */ 19303 ip_rts_ifmsg(ipif); 19304 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 19305 19306 if (!loopback) { 19307 /* Broadcast an address mask reply. */ 19308 ipif_mask_reply(ipif); 19309 } 19310 if (ipif_saved_irep != NULL) { 19311 kmem_free(ipif_saved_irep, 19312 ipif_saved_ire_cnt * sizeof (ire_t *)); 19313 } 19314 if (src_ipif_held) 19315 ipif_refrele(src_ipif); 19316 /* Let SCTP update the status for this ipif */ 19317 sctp_update_ipif(ipif, SCTP_IPIF_UP); 19318 return (0); 19319 19320 bad: 19321 ip1dbg(("ipif_up_done: FAILED \n")); 19322 /* 19323 * We don't have to bother removing from ill groups because 19324 * 19325 * 1) For groups with names, we insert only when the first ipif 19326 * comes up. In that case if it fails, it will not be in any 19327 * group. So, we need not try to remove for that case. 19328 * 19329 * 2) For groups without names, either we tried to insert ipif_ill 19330 * in a group as singleton or found some other group to become 19331 * a bigger group. For the former, if it fails we don't have 19332 * anything to do as ipif_ill is not in the group and for the 19333 * latter, there are no failures in illgrp_insert/illgrp_delete 19334 * (ENOMEM can't occur for this. Check ifgrp_insert). 19335 */ 19336 while (irep > ire_array) { 19337 irep--; 19338 if (*irep != NULL) { 19339 ire_delete(*irep); 19340 if (ire_added) 19341 ire_refrele(*irep); 19342 } 19343 } 19344 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid); 19345 19346 if (ipif_saved_irep != NULL) { 19347 kmem_free(ipif_saved_irep, 19348 ipif_saved_ire_cnt * sizeof (ire_t *)); 19349 } 19350 if (src_ipif_held) 19351 ipif_refrele(src_ipif); 19352 19353 ipif_arp_down(ipif); 19354 return (err); 19355 } 19356 19357 /* 19358 * Turn off the ARP with the ILLF_NOARP flag. 
19359 */ 19360 static int 19361 ill_arp_off(ill_t *ill) 19362 { 19363 mblk_t *arp_off_mp = NULL; 19364 mblk_t *arp_on_mp = NULL; 19365 19366 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 19367 19368 ASSERT(IAM_WRITER_ILL(ill)); 19369 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 19370 19371 /* 19372 * If the on message is still around we've already done 19373 * an arp_off without doing an arp_on thus there is no 19374 * work needed. 19375 */ 19376 if (ill->ill_arp_on_mp != NULL) 19377 return (0); 19378 19379 /* 19380 * Allocate an ARP on message (to be saved) and an ARP off message 19381 */ 19382 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 19383 if (!arp_off_mp) 19384 return (ENOMEM); 19385 19386 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 19387 if (!arp_on_mp) 19388 goto failed; 19389 19390 ASSERT(ill->ill_arp_on_mp == NULL); 19391 ill->ill_arp_on_mp = arp_on_mp; 19392 19393 /* Send an AR_INTERFACE_OFF request */ 19394 putnext(ill->ill_rq, arp_off_mp); 19395 return (0); 19396 failed: 19397 19398 if (arp_off_mp) 19399 freemsg(arp_off_mp); 19400 return (ENOMEM); 19401 } 19402 19403 /* 19404 * Turn on ARP by turning off the ILLF_NOARP flag. 19405 */ 19406 static int 19407 ill_arp_on(ill_t *ill) 19408 { 19409 mblk_t *mp; 19410 19411 ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); 19412 19413 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 19414 19415 ASSERT(IAM_WRITER_ILL(ill)); 19416 /* 19417 * Send an AR_INTERFACE_ON request if we have already done 19418 * an arp_off (which allocated the message). 19419 */ 19420 if (ill->ill_arp_on_mp != NULL) { 19421 mp = ill->ill_arp_on_mp; 19422 ill->ill_arp_on_mp = NULL; 19423 putnext(ill->ill_rq, mp); 19424 } 19425 return (0); 19426 } 19427 19428 /* 19429 * Called after either deleting ill from the group or when setting 19430 * FAILED or STANDBY on the interface. 
19431 */ 19432 static void 19433 illgrp_reset_schednext(ill_t *ill) 19434 { 19435 ill_group_t *illgrp; 19436 ill_t *save_ill; 19437 19438 ASSERT(IAM_WRITER_ILL(ill)); 19439 /* 19440 * When called from illgrp_delete, ill_group will be non-NULL. 19441 * But when called from ip_sioctl_flags, it could be NULL if 19442 * somebody is setting FAILED/INACTIVE on some interface which 19443 * is not part of a group. 19444 */ 19445 illgrp = ill->ill_group; 19446 if (illgrp == NULL) 19447 return; 19448 if (illgrp->illgrp_ill_schednext != ill) 19449 return; 19450 19451 illgrp->illgrp_ill_schednext = NULL; 19452 save_ill = ill; 19453 /* 19454 * Choose a good ill to be the next one for 19455 * outbound traffic. As the flags FAILED/STANDBY is 19456 * not yet marked when called from ip_sioctl_flags, 19457 * we check for ill separately. 19458 */ 19459 for (ill = illgrp->illgrp_ill; ill != NULL; 19460 ill = ill->ill_group_next) { 19461 if ((ill != save_ill) && 19462 !(ill->ill_phyint->phyint_flags & 19463 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { 19464 illgrp->illgrp_ill_schednext = ill; 19465 return; 19466 } 19467 } 19468 } 19469 19470 /* 19471 * Given an ill, find the next ill in the group to be scheduled. 19472 * (This should be called by ip_newroute() before ire_create().) 19473 * The passed in ill may be pulled out of the group, after we have picked 19474 * up a different outgoing ill from the same group. However ire add will 19475 * atomically check this. 19476 */ 19477 ill_t * 19478 illgrp_scheduler(ill_t *ill) 19479 { 19480 ill_t *retill; 19481 ill_group_t *illgrp; 19482 int illcnt; 19483 int i; 19484 uint64_t flags; 19485 19486 /* 19487 * We don't use a lock to check for the ill_group. If this ill 19488 * is currently being inserted we may end up just returning this 19489 * ill itself. That is ok. 
19490 */ 19491 if (ill->ill_group == NULL) { 19492 ill_refhold(ill); 19493 return (ill); 19494 } 19495 19496 /* 19497 * Grab the ill_g_lock as reader to make sure we are dealing with 19498 * a set of stable ills. No ill can be added or deleted or change 19499 * group while we hold the reader lock. 19500 */ 19501 rw_enter(&ill_g_lock, RW_READER); 19502 if ((illgrp = ill->ill_group) == NULL) { 19503 rw_exit(&ill_g_lock); 19504 ill_refhold(ill); 19505 return (ill); 19506 } 19507 19508 illcnt = illgrp->illgrp_ill_count; 19509 mutex_enter(&illgrp->illgrp_lock); 19510 retill = illgrp->illgrp_ill_schednext; 19511 19512 if (retill == NULL) 19513 retill = illgrp->illgrp_ill; 19514 19515 /* 19516 * We do a circular search beginning at illgrp_ill_schednext 19517 * or illgrp_ill. We don't check the flags against the ill lock 19518 * since it can change anytime. The ire creation will be atomic 19519 * and will fail if the ill is FAILED or OFFLINE. 19520 */ 19521 for (i = 0; i < illcnt; i++) { 19522 flags = retill->ill_phyint->phyint_flags; 19523 19524 if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 19525 ILL_CAN_LOOKUP(retill)) { 19526 illgrp->illgrp_ill_schednext = retill->ill_group_next; 19527 ill_refhold(retill); 19528 break; 19529 } 19530 retill = retill->ill_group_next; 19531 if (retill == NULL) 19532 retill = illgrp->illgrp_ill; 19533 } 19534 mutex_exit(&illgrp->illgrp_lock); 19535 rw_exit(&ill_g_lock); 19536 19537 return (i == illcnt ? NULL : retill); 19538 } 19539 19540 /* 19541 * Checks for availbility of a usable source address (if there is one) when the 19542 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 19543 * this selection is done regardless of the destination. 
19544 */ 19545 boolean_t 19546 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 19547 { 19548 uint_t ifindex; 19549 ipif_t *ipif = NULL; 19550 ill_t *uill; 19551 boolean_t isv6; 19552 19553 ASSERT(ill != NULL); 19554 19555 isv6 = ill->ill_isv6; 19556 ifindex = ill->ill_usesrc_ifindex; 19557 if (ifindex != 0) { 19558 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 19559 NULL); 19560 if (uill == NULL) 19561 return (NULL); 19562 mutex_enter(&uill->ill_lock); 19563 for (ipif = uill->ill_ipif; ipif != NULL; 19564 ipif = ipif->ipif_next) { 19565 if (!IPIF_CAN_LOOKUP(ipif)) 19566 continue; 19567 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 19568 continue; 19569 if (!(ipif->ipif_flags & IPIF_UP)) 19570 continue; 19571 if (ipif->ipif_zoneid != zoneid) 19572 continue; 19573 if ((isv6 && 19574 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 19575 (ipif->ipif_lcl_addr == INADDR_ANY)) 19576 continue; 19577 mutex_exit(&uill->ill_lock); 19578 ill_refrele(uill); 19579 return (B_TRUE); 19580 } 19581 mutex_exit(&uill->ill_lock); 19582 ill_refrele(uill); 19583 } 19584 return (B_FALSE); 19585 } 19586 19587 /* 19588 * Determine the best source address given a destination address and an ill. 19589 * Prefers non-deprecated over deprecated but will return a deprecated 19590 * address if there is no other choice. If there is a usable source address 19591 * on the interface pointed to by ill_usesrc_ifindex then that is given 19592 * first preference. 19593 * 19594 * Returns NULL if there is no suitable source address for the ill. 19595 * This only occurs when there is no valid source address for the ill. 
19596 */ 19597 ipif_t * 19598 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 19599 { 19600 ipif_t *ipif; 19601 ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ 19602 ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; 19603 int index = 0; 19604 boolean_t wrapped = B_FALSE; 19605 boolean_t same_subnet_only = B_FALSE; 19606 boolean_t ipif_same_found, ipif_other_found; 19607 ill_t *till, *usill = NULL; 19608 19609 if (ill->ill_usesrc_ifindex != 0) { 19610 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, B_FALSE, 19611 NULL, NULL, NULL, NULL); 19612 if (usill != NULL) 19613 ill = usill; /* Select source from usesrc ILL */ 19614 else 19615 return (NULL); 19616 } 19617 19618 /* 19619 * Holds the ill_g_lock as reader. This makes sure that no ipif/ill 19620 * can be deleted. But an ipif/ill can get CONDEMNED any time. 19621 * After selecting the right ipif, under ill_lock make sure ipif is 19622 * not condemned, and increment refcnt. If ipif is CONDEMNED, 19623 * we retry. Inside the loop we still need to check for CONDEMNED, 19624 * but not under a lock. 19625 */ 19626 rw_enter(&ill_g_lock, RW_READER); 19627 19628 retry: 19629 till = ill; 19630 ipif_arr[0] = NULL; 19631 19632 if (till->ill_group != NULL) 19633 till = till->ill_group->illgrp_ill; 19634 19635 /* 19636 * Choose one good source address from each ill across the group. 19637 * If possible choose a source address in the same subnet as 19638 * the destination address. 19639 * 19640 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE 19641 * This is okay because of the following. 19642 * 19643 * If PHYI_FAILED is set and we still have non-deprecated 19644 * addresses, it means the addresses have not yet been 19645 * failed over to a different interface. We potentially 19646 * select them to create IRE_CACHES, which will be later 19647 * flushed when the addresses move over. 
19648 * 19649 * If PHYI_INACTIVE is set and we still have non-deprecated 19650 * addresses, it means either the user has configured them 19651 * or PHYI_INACTIVE has not been cleared after the addresses 19652 * been moved over. For the former, in.mpathd does a failover 19653 * when the interface becomes INACTIVE and hence we should 19654 * not find them. Once INACTIVE is set, we don't allow them 19655 * to create logical interfaces anymore. For the latter, a 19656 * flush will happen when INACTIVE is cleared which will 19657 * flush the IRE_CACHES. 19658 * 19659 * If PHYI_OFFLINE is set, all the addresses will be failed 19660 * over soon. We potentially select them to create IRE_CACHEs, 19661 * which will be later flushed when the addresses move over. 19662 * 19663 * NOTE : As ipif_select_source is called to borrow source address 19664 * for an ipif that is part of a group, source address selection 19665 * will be re-done whenever the group changes i.e either an 19666 * insertion/deletion in the group. 19667 * 19668 * Fill ipif_arr[] with source addresses, using these rules: 19669 * 19670 * 1. At most one source address from a given ill ends up 19671 * in ipif_arr[] -- that is, at most one of the ipif's 19672 * associated with a given ill ends up in ipif_arr[]. 19673 * 19674 * 2. If there is at least one non-deprecated ipif in the 19675 * IPMP group with a source address on the same subnet as 19676 * our destination, then fill ipif_arr[] only with 19677 * source addresses on the same subnet as our destination. 19678 * Note that because of (1), only the first 19679 * non-deprecated ipif found with a source address 19680 * matching the destination ends up in ipif_arr[]. 19681 * 19682 * 3. Otherwise, fill ipif_arr[] with non-deprecated source 19683 * addresses not in the same subnet as our destination. 19684 * Again, because of (1), only the first off-subnet source 19685 * address will be chosen. 19686 * 19687 * 4. 
If there are no non-deprecated ipifs, then just use 19688 * the source address associated with the last deprecated 19689 * one we find that happens to be on the same subnet, 19690 * otherwise the first one not in the same subnet. 19691 */ 19692 for (; till != NULL; till = till->ill_group_next) { 19693 ipif_same_found = B_FALSE; 19694 ipif_other_found = B_FALSE; 19695 for (ipif = till->ill_ipif; ipif != NULL; 19696 ipif = ipif->ipif_next) { 19697 if (!IPIF_CAN_LOOKUP(ipif)) 19698 continue; 19699 /* Always skip NOLOCAL and ANYCAST interfaces */ 19700 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 19701 continue; 19702 if (!(ipif->ipif_flags & IPIF_UP)) 19703 continue; 19704 if (ipif->ipif_zoneid != zoneid) 19705 continue; 19706 /* 19707 * Interfaces with 0.0.0.0 address are allowed to be UP, 19708 * but are not valid as source addresses. 19709 */ 19710 if (ipif->ipif_lcl_addr == INADDR_ANY) 19711 continue; 19712 if (ipif->ipif_flags & IPIF_DEPRECATED) { 19713 if (ipif_dep == NULL || 19714 (ipif->ipif_net_mask & dst) == 19715 ipif->ipif_subnet) 19716 ipif_dep = ipif; 19717 continue; 19718 } 19719 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 19720 /* found a source address in the same subnet */ 19721 if (same_subnet_only == B_FALSE) { 19722 same_subnet_only = B_TRUE; 19723 index = 0; 19724 } 19725 ipif_same_found = B_TRUE; 19726 } else { 19727 if (same_subnet_only == B_TRUE || 19728 ipif_other_found == B_TRUE) 19729 continue; 19730 ipif_other_found = B_TRUE; 19731 } 19732 ipif_arr[index++] = ipif; 19733 if (index == MAX_IPIF_SELECT_SOURCE) { 19734 wrapped = B_TRUE; 19735 index = 0; 19736 } 19737 if (ipif_same_found == B_TRUE) 19738 break; 19739 } 19740 } 19741 19742 if (ipif_arr[0] == NULL) { 19743 ipif = ipif_dep; 19744 } else { 19745 if (wrapped) 19746 index = MAX_IPIF_SELECT_SOURCE; 19747 ipif = ipif_arr[ipif_rand() % index]; 19748 ASSERT(ipif != NULL); 19749 } 19750 19751 if (ipif != NULL) { 19752 mutex_enter(&ipif->ipif_ill->ill_lock); 19753 if 
(!IPIF_CAN_LOOKUP(ipif)) { 19754 mutex_exit(&ipif->ipif_ill->ill_lock); 19755 goto retry; 19756 } 19757 ipif_refhold_locked(ipif); 19758 mutex_exit(&ipif->ipif_ill->ill_lock); 19759 } 19760 19761 rw_exit(&ill_g_lock); 19762 if (usill != NULL) 19763 ill_refrele(usill); 19764 19765 #ifdef DEBUG 19766 if (ipif == NULL) { 19767 char buf1[INET6_ADDRSTRLEN]; 19768 19769 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 19770 ill->ill_name, 19771 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 19772 } else { 19773 char buf1[INET6_ADDRSTRLEN]; 19774 char buf2[INET6_ADDRSTRLEN]; 19775 19776 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 19777 ipif->ipif_ill->ill_name, 19778 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 19779 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 19780 buf2, sizeof (buf2)))); 19781 } 19782 #endif /* DEBUG */ 19783 return (ipif); 19784 } 19785 19786 19787 /* 19788 * If old_ipif is not NULL, see if ipif was derived from old 19789 * ipif and if so, recreate the interface route by re-doing 19790 * source address selection. This happens when ipif_down -> 19791 * ipif_update_other_ipifs calls us. 19792 * 19793 * If old_ipif is NULL, just redo the source address selection 19794 * if needed. This happens when illgrp_insert or ipif_up_done 19795 * calls us. 19796 */ 19797 static void 19798 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 19799 { 19800 ire_t *ire; 19801 ire_t *ipif_ire; 19802 queue_t *stq; 19803 ipif_t *nipif; 19804 ill_t *ill; 19805 boolean_t need_rele = B_FALSE; 19806 19807 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 19808 ASSERT(IAM_WRITER_IPIF(ipif)); 19809 19810 ill = ipif->ipif_ill; 19811 if (!(ipif->ipif_flags & 19812 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 19813 /* 19814 * Can't possibly have borrowed the source 19815 * from old_ipif. 19816 */ 19817 return; 19818 } 19819 19820 /* 19821 * Is there any work to be done? 
No work if the address 19822 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 19823 * ipif_select_source() does not borrow addresses from 19824 * NOLOCAL and ANYCAST interfaces). 19825 */ 19826 if ((old_ipif != NULL) && 19827 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 19828 (old_ipif->ipif_ill->ill_wq == NULL) || 19829 (old_ipif->ipif_flags & 19830 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 19831 return; 19832 } 19833 19834 /* 19835 * Perform the same checks as when creating the 19836 * IRE_INTERFACE in ipif_up_done. 19837 */ 19838 if (!(ipif->ipif_flags & IPIF_UP)) 19839 return; 19840 19841 if ((ipif->ipif_flags & IPIF_NOXMIT) || 19842 (ipif->ipif_subnet == INADDR_ANY)) 19843 return; 19844 19845 ipif_ire = ipif_to_ire(ipif); 19846 if (ipif_ire == NULL) 19847 return; 19848 19849 /* 19850 * We know that ipif uses some other source for its 19851 * IRE_INTERFACE. Is it using the source of this 19852 * old_ipif? 19853 */ 19854 if (old_ipif != NULL && 19855 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 19856 ire_refrele(ipif_ire); 19857 return; 19858 } 19859 if (ip_debug > 2) { 19860 /* ip1dbg */ 19861 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 19862 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 19863 } 19864 19865 stq = ipif_ire->ire_stq; 19866 19867 /* 19868 * Can't use our source address. Select a different 19869 * source address for the IRE_INTERFACE. 
19870 */ 19871 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 19872 if (nipif == NULL) { 19873 /* Last resort - all ipif's have IPIF_NOLOCAL */ 19874 nipif = ipif; 19875 } else { 19876 need_rele = B_TRUE; 19877 } 19878 19879 ire = ire_create( 19880 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 19881 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 19882 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 19883 NULL, /* no gateway */ 19884 NULL, 19885 &ipif->ipif_mtu, /* max frag */ 19886 NULL, /* fast path header */ 19887 NULL, /* no recv from queue */ 19888 stq, /* send-to queue */ 19889 ill->ill_net_type, /* IF_[NO]RESOLVER */ 19890 ill->ill_resolver_mp, /* xmit header */ 19891 ipif, 19892 NULL, 19893 0, 19894 0, 19895 0, 19896 0, 19897 &ire_uinfo_null); 19898 19899 if (ire != NULL) { 19900 ire_t *ret_ire; 19901 int error; 19902 19903 /* 19904 * We don't need ipif_ire anymore. We need to delete 19905 * before we add so that ire_add does not detect 19906 * duplicates. 19907 */ 19908 ire_delete(ipif_ire); 19909 ret_ire = ire; 19910 error = ire_add(&ret_ire, NULL, NULL, NULL); 19911 ASSERT(error == 0); 19912 ASSERT(ire == ret_ire); 19913 /* Held in ire_add */ 19914 ire_refrele(ret_ire); 19915 } 19916 /* 19917 * Either we are falling through from above or could not 19918 * allocate a replacement. 19919 */ 19920 ire_refrele(ipif_ire); 19921 if (need_rele) 19922 ipif_refrele(nipif); 19923 } 19924 19925 /* 19926 * This old_ipif is going away. 19927 * 19928 * Determine if any other ipif's is using our address as 19929 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 19930 * IPIF_DEPRECATED). 19931 * Find the IRE_INTERFACE for such ipifs and recreate them 19932 * to use an different source address following the rules in 19933 * ipif_up_done. 
19934 * 19935 * This function takes an illgrp as an argument so that illgrp_delete 19936 * can call this to update source address even after deleting the 19937 * old_ipif->ipif_ill from the ill group. 19938 */ 19939 static void 19940 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) 19941 { 19942 ipif_t *ipif; 19943 ill_t *ill; 19944 char buf[INET6_ADDRSTRLEN]; 19945 19946 ASSERT(IAM_WRITER_IPIF(old_ipif)); 19947 ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); 19948 19949 ill = old_ipif->ipif_ill; 19950 19951 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", 19952 ill->ill_name, 19953 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, 19954 buf, sizeof (buf)))); 19955 /* 19956 * If this part of a group, look at all ills as ipif_select_source 19957 * borrows source address across all the ills in the group. 19958 */ 19959 if (illgrp != NULL) 19960 ill = illgrp->illgrp_ill; 19961 19962 for (; ill != NULL; ill = ill->ill_group_next) { 19963 for (ipif = ill->ill_ipif; ipif != NULL; 19964 ipif = ipif->ipif_next) { 19965 19966 if (ipif == old_ipif) 19967 continue; 19968 19969 ipif_recreate_interface_routes(old_ipif, ipif); 19970 } 19971 } 19972 } 19973 19974 /* ARGSUSED */ 19975 int 19976 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 19977 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 19978 { 19979 /* 19980 * ill_phyint_reinit merged the v4 and v6 into a single 19981 * ipsq. Could also have become part of a ipmp group in the 19982 * process, and we might not have been able to complete the 19983 * operation in ipif_set_values, if we could not become 19984 * exclusive. If so restart it here. 
19985 */ 19986 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 19987 } 19988 19989 19990 /* ARGSUSED */ 19991 int 19992 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 19993 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 19994 { 19995 queue_t *q1 = q; 19996 char *cp; 19997 char interf_name[LIFNAMSIZ]; 19998 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 19999 20000 if (!q->q_next) { 20001 ip1dbg(( 20002 "if_unitsel: IF_UNITSEL: no q_next\n")); 20003 return (EINVAL); 20004 } 20005 20006 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 20007 return (EALREADY); 20008 20009 do { 20010 q1 = q1->q_next; 20011 } while (q1->q_next); 20012 cp = q1->q_qinfo->qi_minfo->mi_idname; 20013 (void) sprintf(interf_name, "%s%d", cp, ppa); 20014 20015 /* 20016 * Here we are not going to delay the ioack until after 20017 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 20018 * original ioctl message before sending the requests. 20019 */ 20020 return (ipif_set_values(q, mp, interf_name, &ppa)); 20021 } 20022 20023 /* ARGSUSED */ 20024 int 20025 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 20026 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 20027 { 20028 return (ENXIO); 20029 } 20030 20031 /* 20032 * Net and subnet broadcast ire's are now specific to the particular 20033 * physical interface (ill) and not to any one locigal interface (ipif). 20034 * However, if a particular logical interface is being taken down, it's 20035 * associated ire's will be taken down as well. Hence, when we go to 20036 * take down or change the local address, broadcast address or netmask 20037 * of a specific logical interface, we must check to make sure that we 20038 * have valid net and subnet broadcast ire's for the other logical 20039 * interfaces which may have been shared with the logical interface 20040 * being brought down or changed. 20041 * 20042 * There is one set of 0.0.0.0 and 255.255.255.255 per ill. 
 * Usually it
 * is tied to the first interface coming UP. If that ipif is going down,
 * we need to recreate them on the next valid ipif.
 *
 * Note: assume that the ipif passed in is still up so that it's IRE
 * entries are still valid.
 */
static void
ipif_check_bcast_ires(ipif_t *test_ipif)
{
	ipif_t	*ipif;
	ire_t	*test_subnet_ire, *test_net_ire;
	ire_t	*test_allzero_ire, *test_allone_ire;
	/* Room for the IREs produced by up to six ire_create_bcast calls */
	ire_t	*ire_array[12];
	ire_t	**irep = &ire_array[0];
	ire_t	**irep1;

	ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask;
	ipaddr_t test_net_addr, test_subnet_addr;
	ipaddr_t test_net_mask, test_subnet_mask;
	boolean_t need_net_bcast_ire = B_FALSE;
	boolean_t need_subnet_bcast_ire = B_FALSE;
	boolean_t allzero_bcast_ire_created = B_FALSE;
	boolean_t allone_bcast_ire_created = B_FALSE;
	boolean_t net_bcast_ire_created = B_FALSE;
	boolean_t subnet_bcast_ire_created = B_FALSE;

	ipif_t *backup_ipif_net = (ipif_t *)NULL;
	ipif_t *backup_ipif_subnet = (ipif_t *)NULL;
	ipif_t *backup_ipif_allzeros = (ipif_t *)NULL;
	ipif_t *backup_ipif_allones = (ipif_t *)NULL;
	/* ipifs with any of these flags are only used as a last resort */
	uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;

	ASSERT(!test_ipif->ipif_isv6);
	ASSERT(IAM_WRITER_IPIF(test_ipif));

	/*
	 * No broadcast IREs for the LOOPBACK interface
	 * or others such as point to point and IPIF_NOXMIT.
	 */
	if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
	    (test_ipif->ipif_flags & IPIF_NOXMIT))
		return;

	/*
	 * Each successful lookup below returns a held IRE; all four are
	 * released at the 'bad' label at the end of this function.
	 */
	test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));

	test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));

	test_net_mask = ip_net_mask(test_ipif->ipif_subnet);
	test_subnet_mask = test_ipif->ipif_net_mask;

	/*
	 * If no net mask set, assume the default based on net class.
	 */
	if (test_subnet_mask == 0)
		test_subnet_mask = test_net_mask;

	/*
	 * Check if there is a network broadcast ire associated with this ipif
	 */
	test_net_addr = test_net_mask & test_ipif->ipif_subnet;
	test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));

	/*
	 * Check if there is a subnet broadcast IRE associated with this ipif
	 */
	test_subnet_addr = test_subnet_mask & test_ipif->ipif_subnet;
	test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST,
	    test_ipif, ALL_ZONES, (MATCH_IRE_TYPE | MATCH_IRE_IPIF));

	/*
	 * No broadcast ire's associated with this ipif.
	 */
	if ((test_subnet_ire == NULL) && (test_net_ire == NULL) &&
	    (test_allzero_ire == NULL) && (test_allone_ire == NULL)) {
		return;
	}

	/*
	 * We have established which bcast ires have to be replaced.
	 * Next we try to locate ipifs that match these ires.
	 * The rules are simple: If we find an ipif that matches on the subnet
	 * address it will also match on the net address, the allzeros and
	 * allones address. Any ipif that matches only on the net address will
	 * also match the allzeros and allones addresses.
	 * The other criterion is the ipif_flags. We look for non-deprecated
	 * (and non-anycast and non-nolocal) ipifs as the best choice.
	 * ipifs with check_flags matching (deprecated, etc) are used only
	 * if good ipifs are not available. While looping, we save existing
	 * deprecated ipifs as backup_ipif.
	 * We loop through all the ipifs for this ill looking for ipifs
	 * whose broadcast addr match the ipif passed in, but do not have
	 * their own broadcast ires. For creating 0.0.0.0 and
	 * 255.255.255.255 we just need an ipif on this ill to create.
	 */
	for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL;
	    ipif = ipif->ipif_next) {

		ASSERT(!ipif->ipif_isv6);
		/*
		 * Already checked the ipif passed in.
		 */
		if (ipif == test_ipif) {
			continue;
		}

		/*
		 * We only need to recreate broadcast ires if another ipif in
		 * the same zone uses them. The new ires must be created in the
		 * same zone.
		 */
		if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) {
			continue;
		}

		/*
		 * Only interested in logical interfaces with valid local
		 * addresses or with the ability to broadcast.
		 */
		if ((ipif->ipif_subnet == 0) ||
		    !(ipif->ipif_flags & IPIF_BROADCAST) ||
		    (ipif->ipif_flags & IPIF_NOXMIT) ||
		    !(ipif->ipif_flags & IPIF_UP)) {
			continue;
		}
		/*
		 * Check if there is a net broadcast ire for this
		 * net address.  If it turns out that the ipif we are
		 * about to take down owns this ire, we must make a
		 * new one because it is potentially going away.
		 *
		 * NOTE(review): net_mask/net_addr are recomputed for every
		 * ipif reaching this point, whether or not it matches
		 * test_net_addr, while need_net_bcast_ire stays set once
		 * any earlier ipif matched.  The values used later may
		 * therefore belong to a different ipif than the one that
		 * matched -- confirm this is intended.
		 */
		if (test_net_ire && (!net_bcast_ire_created)) {
			net_mask = ip_net_mask(ipif->ipif_subnet);
			net_addr = net_mask & ipif->ipif_subnet;
			if (net_addr == test_net_addr) {
				need_net_bcast_ire = B_TRUE;
				/*
				 * Use DEPRECATED ipif only if no good
				 * ires are available. subnet_addr is
				 * a better match than net_addr.
				 */
				if ((ipif->ipif_flags & check_flags) &&
				    (backup_ipif_net == NULL)) {
					backup_ipif_net = ipif;
				}
			}
		}
		/*
		 * Check if there is a subnet broadcast ire for this
		 * net address.  If it turns out that the ipif we are
		 * about to take down owns this ire, we must make a
		 * new one because it is potentially going away.
		 *
		 * NOTE(review): the same caveat as for net_addr/net_mask
		 * above applies to subnet_addr/subnet_mask.
		 */
		if (test_subnet_ire && (!subnet_bcast_ire_created)) {
			subnet_mask = ipif->ipif_net_mask;
			subnet_addr = ipif->ipif_subnet;
			if (subnet_addr == test_subnet_addr) {
				need_subnet_bcast_ire = B_TRUE;
				if ((ipif->ipif_flags & check_flags) &&
				    (backup_ipif_subnet == NULL)) {
					backup_ipif_subnet = ipif;
				}
			}
		}


		/* Short circuit here if this ipif is deprecated */
		if (ipif->ipif_flags & check_flags) {
			/* Remember the first such ipif as a fallback owner */
			if ((test_allzero_ire != NULL) &&
			    (!allzero_bcast_ire_created) &&
			    (backup_ipif_allzeros == NULL)) {
				backup_ipif_allzeros = ipif;
			}
			if ((test_allone_ire != NULL) &&
			    (!allone_bcast_ire_created) &&
			    (backup_ipif_allones == NULL)) {
				backup_ipif_allones = ipif;
			}
			continue;
		}

		/*
		 * Found an ipif which has the same broadcast ire as the
		 * ipif passed in and the ipif passed in "owns" the ire.
		 * Create new broadcast ire's for this broadcast addr.
		 */
		if (need_net_bcast_ire && !net_bcast_ire_created) {
			irep = ire_create_bcast(ipif, net_addr, irep);
			irep = ire_create_bcast(ipif,
			    ~net_mask | net_addr, irep);
			net_bcast_ire_created = B_TRUE;
		}
		if (need_subnet_bcast_ire && !subnet_bcast_ire_created) {
			irep = ire_create_bcast(ipif, subnet_addr, irep);
			irep = ire_create_bcast(ipif,
			    ~subnet_mask | subnet_addr, irep);
			subnet_bcast_ire_created = B_TRUE;
		}
		if (test_allzero_ire != NULL && !allzero_bcast_ire_created) {
			irep = ire_create_bcast(ipif, 0, irep);
			allzero_bcast_ire_created = B_TRUE;
		}
		if (test_allone_ire != NULL && !allone_bcast_ire_created) {
			irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep);
			allone_bcast_ire_created = B_TRUE;
		}
		/*
		 * Once we have created all the appropriate ires, we
		 * just break out of this loop to add what we have created.
		 * This has been indented similar to ire_match_args for
		 * readability.
		 */
		if (((test_net_ire == NULL) ||
			(net_bcast_ire_created)) &&
		    ((test_subnet_ire == NULL) ||
			(subnet_bcast_ire_created)) &&
		    ((test_allzero_ire == NULL) ||
			(allzero_bcast_ire_created)) &&
		    ((test_allone_ire == NULL) ||
			(allone_bcast_ire_created))) {
			break;
		}
	}

	/*
	 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs
	 * exist. 6 pairs of bcast ires are needed.
	 * Note - the old ires are deleted in ipif_down.
	 */
	if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) {
		ipif = backup_ipif_net;
		irep = ire_create_bcast(ipif, net_addr, irep);
		irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep);
		net_bcast_ire_created = B_TRUE;
	}
	if (need_subnet_bcast_ire && !subnet_bcast_ire_created &&
	    backup_ipif_subnet) {
		ipif = backup_ipif_subnet;
		irep = ire_create_bcast(ipif, subnet_addr, irep);
		irep = ire_create_bcast(ipif,
		    ~subnet_mask | subnet_addr, irep);
		subnet_bcast_ire_created = B_TRUE;
	}
	if (test_allzero_ire != NULL && !allzero_bcast_ire_created &&
	    backup_ipif_allzeros) {
		irep = ire_create_bcast(backup_ipif_allzeros, 0, irep);
		allzero_bcast_ire_created = B_TRUE;
	}
	if (test_allone_ire != NULL && !allone_bcast_ire_created &&
	    backup_ipif_allones) {
		irep = ire_create_bcast(backup_ipif_allones,
		    INADDR_BROADCAST, irep);
		allone_bcast_ire_created = B_TRUE;
	}

	/*
	 * If we can't create all of them, don't add any of them.
	 * Code in ip_wput_ire and ire_to_ill assumes that we
	 * always have a non-loopback copy and loopback copy
	 * for a given address.
	 */
	/* First pass: look for any failed creation (NULL slot) */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 == NULL) {
			ip0dbg(("ipif_check_bcast_ires: can't create "
			    "IRE_BROADCAST, memory allocation failure\n"));
			/* Discard everything that was created */
			while (irep > ire_array) {
				irep--;
				if (*irep != NULL)
					ire_delete(*irep);
			}
			goto bad;
		}
	}
	/* Second pass: all creations succeeded, so add them */
	for (irep1 = irep; irep1 > ire_array; ) {
		int error;

		irep1--;
		error = ire_add(irep1, NULL, NULL, NULL);
		if (error == 0) {
			ire_refrele(*irep1);		/* Held in ire_add */
		}
	}
bad:
	/* Drop the holds taken by the ire_ctable_lookup calls above */
	if (test_allzero_ire != NULL)
		ire_refrele(test_allzero_ire);
	if (test_allone_ire != NULL)
		ire_refrele(test_allone_ire);
	if (test_net_ire != NULL)
		ire_refrele(test_net_ire);
	if (test_subnet_ire != NULL)
		ire_refrele(test_subnet_ire);
}

/*
 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV*
 * from lifr_flags and the name from lifr_name.
 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
 * since ipif_lookup_on_name uses the _isv6 flags when matching.
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ill_pending_mp and the ioctl will complete in ip_rput.
 *
 * Returns EALREADY if the ill on 'q' is not ipif's ill or is already
 * named, and EINVAL if lifr_flags fails any of the sanity checks below.
 */
/* ARGSUSED */
int
ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	err;
	ill_t	*ill;
	struct lifreq *lifr = (struct lifreq *)if_req;

	ASSERT(ipif != NULL);
	ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
	ASSERT(q->q_next != NULL);

	ill = (ill_t *)q->q_ptr;
	/*
	 * If we are not writer on 'q' then this interface exists already
	 * and previous lookups (ipif_extract_lifreq_cmn) found this ipif.
	 * So return EALREADY
	 */
	if (ill != ipif->ipif_ill)
		return (EALREADY);

	/* A non-empty name means the ill has already been named */
	if (ill->ill_name[0] != '\0')
		return (EALREADY);

	/*
	 * Set all the flags. Allows all kinds of override. Provide some
	 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST
	 * unless there is either multicast/broadcast support in the driver
	 * or it is a pt-pt link.
	 */
	if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) {
		/* Meaningless to IP thus don't allow them to be set. */
		ip1dbg(("ip_setname: EINVAL 1\n"));
		return (EINVAL);
	}
	/*
	 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the
	 * ill_bcast_addr_length info.
	 */
	if (!ill->ill_needs_attach &&
	    ((lifr->lifr_flags & IFF_MULTICAST) &&
	    !(lifr->lifr_flags & IFF_POINTOPOINT) &&
	    ill->ill_bcast_addr_length == 0)) {
		/* Link not broadcast/pt-pt capable i.e. no multicast */
		ip1dbg(("ip_setname: EINVAL 2\n"));
		return (EINVAL);
	}
	if ((lifr->lifr_flags & IFF_BROADCAST) &&
	    ((lifr->lifr_flags & IFF_IPV6) ||
	    (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
		/* Link not broadcast capable or IPv6 i.e. no broadcast */
		ip1dbg(("ip_setname: EINVAL 3\n"));
		return (EINVAL);
	}
	if (lifr->lifr_flags & IFF_UP) {
		/* Can only be set with SIOCSLIFFLAGS */
		ip1dbg(("ip_setname: EINVAL 4\n"));
		return (EINVAL);
	}
	/* Exactly one of IFF_IPV4 and IFF_IPV6 must be set */
	if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 &&
	    (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) {
		ip1dbg(("ip_setname: EINVAL 5\n"));
		return (EINVAL);
	}
	/*
	 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces.
	 */
	if ((lifr->lifr_flags & IFF_XRESOLV) &&
	    !(lifr->lifr_flags & IFF_IPV6) &&
	    !(ipif->ipif_isv6)) {
		ip1dbg(("ip_setname: EINVAL 6\n"));
		return (EINVAL);
	}

	/*
	 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence
	 * we have all the flags here. So, we assign rather than we OR.
	 * We can't OR the flags here because we don't want to set
	 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in
	 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending
	 * on lifr_flags value here.
	 */
	/*
	 * This ill has not been inserted into the global list.
	 * So we are still single threaded and don't need any lock
	 */
	ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS;
	ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS;
	ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS;

	/* We started off as V4. */
	if (ill->ill_flags & ILLF_IPV6) {
		ill->ill_phyint->phyint_illv6 = ill;
		ill->ill_phyint->phyint_illv4 = NULL;
	}
	err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa);
	return (err);
}

/*
 * Restart entry point for the slifname operation: finishes the work
 * that ipif_set_values could not complete.
 */
/* ARGSUSED */
int
ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/*
	 * ill_phyint_reinit merged the v4 and v6 into a single
	 * ipsq. Could also have become part of a ipmp group in the
	 * process, and we might not have been able to complete the
	 * slifname in ipif_set_values, if we could not become
	 * exclusive.  If so restart it here
	 */
	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
}

/*
 * Return a pointer to the ipif which matches the index, IP version type and
 * zoneid.
 *
 * On success the returned ipif has been held via ipif_refhold_locked().
 * On failure NULL is returned and, when 'err' is non-NULL, *err is set to
 * ENXIO (no usable match) or EINPROGRESS ('mp' was enqueued on the ipsq
 * with 'func' because the ill cannot be looked up yet).
 *
 * q, mp, func and err must be either all NULL or all non-NULL.
 */
ipif_t *
ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *err)
{
	ill_t	*ill;
	ipsq_t  *ipsq;
	phyint_t *phyi;
	ipif_t	*ipif;

	ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
	    (q != NULL && mp != NULL && func != NULL && err != NULL));

	if (err != NULL)
		*err = 0;

	/*
	 * Indexes are stored in the phyint - a common structure
	 * to both IPv4 and IPv6.
	 */

	rw_enter(&ill_g_lock, RW_READER);
	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
	    (void *) &index, NULL);
	if (phyi != NULL) {
		/* Pick the ill for the requested IP version */
		ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4;
		if (ill == NULL) {
			rw_exit(&ill_g_lock);
			if (err != NULL)
				*err = ENXIO;
			return (NULL);
		}
		GRAB_CONN_LOCK(q);
		mutex_enter(&ill->ill_lock);
		if (ILL_CAN_LOOKUP(ill)) {
			/* Return the first usable ipif in the wanted zone */
			for (ipif = ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IPIF_CAN_LOOKUP(ipif) &&
				    (zoneid == ALL_ZONES ||
				    zoneid == ipif->ipif_zoneid)) {
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					RELEASE_CONN_LOCK(q);
					rw_exit(&ill_g_lock);
					return (ipif);
				}
			}
		} else if (ILL_CAN_WAIT(ill, q)) {
			/*
			 * Queue the request on the ipsq for later.  The
			 * ipsq_lock is taken before ill_g_lock and ill_lock
			 * are dropped.
			 * NOTE(review): *err is written unconditionally
			 * here; presumably ILL_CAN_WAIT is false when q is
			 * NULL, which by the ASSERT above is the only case
			 * where err is NULL -- confirm.
			 */
			ipsq = ill->ill_phyint->phyint_ipsq;
			mutex_enter(&ipsq->ipsq_lock);
			rw_exit(&ill_g_lock);
			mutex_exit(&ill->ill_lock);
			ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
			mutex_exit(&ipsq->ipsq_lock);
			RELEASE_CONN_LOCK(q);
			*err = EINPROGRESS;
			return (NULL);
		}
		/* No matching ipif, or the ill can neither be used nor waited on */
		mutex_exit(&ill->ill_lock);
		RELEASE_CONN_LOCK(q);
	}
	rw_exit(&ill_g_lock);
	if (err != NULL)
		*err = ENXIO;
	return (NULL);
}

/* Old/new interface index pair passed as the argument to the walkers below */
typedef struct conn_change_s {
	uint_t cc_old_ifindex;
	uint_t cc_new_ifindex;
} conn_change_t;

/*
20538 * ipcl_walk function for changing interface index. 20539 */ 20540 static void 20541 conn_change_ifindex(conn_t *connp, caddr_t arg) 20542 { 20543 conn_change_t *connc; 20544 uint_t old_ifindex; 20545 uint_t new_ifindex; 20546 int i; 20547 ilg_t *ilg; 20548 20549 connc = (conn_change_t *)arg; 20550 old_ifindex = connc->cc_old_ifindex; 20551 new_ifindex = connc->cc_new_ifindex; 20552 20553 if (connp->conn_orig_bound_ifindex == old_ifindex) 20554 connp->conn_orig_bound_ifindex = new_ifindex; 20555 20556 if (connp->conn_orig_multicast_ifindex == old_ifindex) 20557 connp->conn_orig_multicast_ifindex = new_ifindex; 20558 20559 if (connp->conn_orig_xmit_ifindex == old_ifindex) 20560 connp->conn_orig_xmit_ifindex = new_ifindex; 20561 20562 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 20563 ilg = &connp->conn_ilg[i]; 20564 if (ilg->ilg_orig_ifindex == old_ifindex) 20565 ilg->ilg_orig_ifindex = new_ifindex; 20566 } 20567 } 20568 20569 /* 20570 * Walk all the ipifs and ilms on this ill and change the orig_ifindex 20571 * to new_index if it matches the old_index. 20572 * 20573 * Failovers typically happen within a group of ills. But somebody 20574 * can remove an ill from the group after a failover happened. If 20575 * we are setting the ifindex after this, we potentially need to 20576 * look at all the ills rather than just the ones in the group. 20577 * We cut down the work by looking at matching ill_net_types 20578 * and ill_types as we could not possibly grouped them together. 
20579 */ 20580 static void 20581 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 20582 { 20583 ill_t *ill; 20584 ipif_t *ipif; 20585 uint_t old_ifindex; 20586 uint_t new_ifindex; 20587 ilm_t *ilm; 20588 ill_walk_context_t ctx; 20589 20590 old_ifindex = connc->cc_old_ifindex; 20591 new_ifindex = connc->cc_new_ifindex; 20592 20593 rw_enter(&ill_g_lock, RW_READER); 20594 ill = ILL_START_WALK_ALL(&ctx); 20595 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 20596 if ((ill_orig->ill_net_type != ill->ill_net_type) || 20597 (ill_orig->ill_type != ill->ill_type)) { 20598 continue; 20599 } 20600 for (ipif = ill->ill_ipif; ipif != NULL; 20601 ipif = ipif->ipif_next) { 20602 if (ipif->ipif_orig_ifindex == old_ifindex) 20603 ipif->ipif_orig_ifindex = new_ifindex; 20604 } 20605 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 20606 if (ilm->ilm_orig_ifindex == old_ifindex) 20607 ilm->ilm_orig_ifindex = new_ifindex; 20608 } 20609 } 20610 rw_exit(&ill_g_lock); 20611 } 20612 20613 /* 20614 * We first need to ensure that the new index is unique, and 20615 * then carry the change across both v4 and v6 ill representation 20616 * of the physical interface. 20617 */ 20618 /* ARGSUSED */ 20619 int 20620 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20621 ip_ioctl_cmd_t *ipip, void *ifreq) 20622 { 20623 ill_t *ill; 20624 ill_t *ill_other; 20625 phyint_t *phyi; 20626 int old_index; 20627 conn_change_t connc; 20628 struct ifreq *ifr = (struct ifreq *)ifreq; 20629 struct lifreq *lifr = (struct lifreq *)ifreq; 20630 uint_t index; 20631 ill_t *ill_v4; 20632 ill_t *ill_v6; 20633 20634 if (ipip->ipi_cmd_type == IF_CMD) 20635 index = ifr->ifr_index; 20636 else 20637 index = lifr->lifr_index; 20638 20639 /* 20640 * Only allow on physical interface. Also, index zero is illegal. 
20641 * 20642 * Need to check for PHYI_FAILED and PHYI_INACTIVE 20643 * 20644 * 1) If PHYI_FAILED is set, a failover could have happened which 20645 * implies a possible failback might have to happen. As failback 20646 * depends on the old index, we should fail setting the index. 20647 * 20648 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 20649 * any addresses or multicast memberships are failed over to 20650 * a non-STANDBY interface. As failback depends on the old 20651 * index, we should fail setting the index for this case also. 20652 * 20653 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 20654 * Be consistent with PHYI_FAILED and fail the ioctl. 20655 */ 20656 ill = ipif->ipif_ill; 20657 phyi = ill->ill_phyint; 20658 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 20659 ipif->ipif_id != 0 || index == 0) { 20660 return (EINVAL); 20661 } 20662 old_index = phyi->phyint_ifindex; 20663 20664 /* If the index is not changing, no work to do */ 20665 if (old_index == index) 20666 return (0); 20667 20668 /* 20669 * Use ill_lookup_on_ifindex to determine if the 20670 * new index is unused and if so allow the change. 20671 */ 20672 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL); 20673 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL); 20674 if (ill_v6 != NULL || ill_v4 != NULL) { 20675 if (ill_v4 != NULL) 20676 ill_refrele(ill_v4); 20677 if (ill_v6 != NULL) 20678 ill_refrele(ill_v6); 20679 return (EBUSY); 20680 } 20681 20682 /* 20683 * The new index is unused. Set it in the phyint. 20684 * Locate the other ill so that we can send a routing 20685 * sockets message. 
20686 */ 20687 if (ill->ill_isv6) { 20688 ill_other = phyi->phyint_illv4; 20689 } else { 20690 ill_other = phyi->phyint_illv6; 20691 } 20692 20693 phyi->phyint_ifindex = index; 20694 20695 connc.cc_old_ifindex = old_index; 20696 connc.cc_new_ifindex = index; 20697 ip_change_ifindex(ill, &connc); 20698 ipcl_walk(conn_change_ifindex, (caddr_t)&connc); 20699 20700 /* Send the routing sockets message */ 20701 ip_rts_ifmsg(ipif); 20702 if (ill_other != NULL) 20703 ip_rts_ifmsg(ill_other->ill_ipif); 20704 20705 return (0); 20706 } 20707 20708 /* ARGSUSED */ 20709 int 20710 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20711 ip_ioctl_cmd_t *ipip, void *ifreq) 20712 { 20713 struct ifreq *ifr = (struct ifreq *)ifreq; 20714 struct lifreq *lifr = (struct lifreq *)ifreq; 20715 20716 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 20717 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 20718 /* Get the interface index */ 20719 if (ipip->ipi_cmd_type == IF_CMD) { 20720 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 20721 } else { 20722 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 20723 } 20724 return (0); 20725 } 20726 20727 /* ARGSUSED */ 20728 int 20729 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20730 ip_ioctl_cmd_t *ipip, void *ifreq) 20731 { 20732 struct lifreq *lifr = (struct lifreq *)ifreq; 20733 20734 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 20735 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 20736 /* Get the interface zone */ 20737 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 20738 lifr->lifr_zoneid = ipif->ipif_zoneid; 20739 return (0); 20740 } 20741 20742 /* 20743 * Set the zoneid of an interface. 
20744 */ 20745 /* ARGSUSED */ 20746 int 20747 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20748 ip_ioctl_cmd_t *ipip, void *ifreq) 20749 { 20750 struct lifreq *lifr = (struct lifreq *)ifreq; 20751 int err = 0; 20752 boolean_t need_up = B_FALSE; 20753 zone_t *zptr; 20754 zone_status_t status; 20755 zoneid_t zoneid; 20756 20757 /* cannot assign instance zero to a non-global zone */ 20758 if (ipif->ipif_id == 0) 20759 return (ENOTSUP); 20760 20761 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 20762 zoneid = lifr->lifr_zoneid; 20763 20764 /* 20765 * Cannot assign to a zone that doesn't exist or is shutting down. In 20766 * the event of a race with the zone shutdown processing, since IP 20767 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 20768 * interface will be cleaned up even if the zone is shut down 20769 * immediately after the status check. If the interface can't be brought 20770 * down right away, and the zone is shut down before the restart 20771 * function is called, we resolve the possible races by rechecking the 20772 * zone status in the restart function. 20773 */ 20774 if ((zptr = zone_find_by_id(zoneid)) == NULL) 20775 return (EINVAL); 20776 status = zone_status_get(zptr); 20777 zone_rele(zptr); 20778 20779 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 20780 return (EINVAL); 20781 20782 if (ipif->ipif_flags & IPIF_UP) { 20783 /* 20784 * If the interface is already marked up, 20785 * we call ipif_down which will take care 20786 * of ditching any IREs that have been set 20787 * up based on the old interface address. 
20788 */ 20789 err = ipif_logical_down(ipif, q, mp); 20790 if (err == EINPROGRESS) 20791 return (err); 20792 ipif_down_tail(ipif); 20793 need_up = B_TRUE; 20794 } 20795 20796 err = ip_sioctl_slifzone_tail(ipif, zoneid, q, mp, need_up); 20797 return (err); 20798 } 20799 20800 static int 20801 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 20802 queue_t *q, mblk_t *mp, boolean_t need_up) 20803 { 20804 int err = 0; 20805 20806 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 20807 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 20808 20809 /* Set the new zone id. */ 20810 ipif->ipif_zoneid = zoneid; 20811 20812 /* Update sctp list */ 20813 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 20814 20815 if (need_up) { 20816 /* 20817 * Now bring the interface back up. If this 20818 * is the only IPIF for the ILL, ipif_up 20819 * will have to re-bind to the device, so 20820 * we may get back EINPROGRESS, in which 20821 * case, this IOCTL will get completed in 20822 * ip_rput_dlpi when we see the DL_BIND_ACK. 
20823 */ 20824 err = ipif_up(ipif, q, mp); 20825 } 20826 return (err); 20827 } 20828 20829 /* ARGSUSED */ 20830 int 20831 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20832 ip_ioctl_cmd_t *ipip, void *if_req) 20833 { 20834 struct lifreq *lifr = (struct lifreq *)if_req; 20835 zoneid_t zoneid; 20836 zone_t *zptr; 20837 zone_status_t status; 20838 20839 ASSERT(ipif->ipif_id != 0); 20840 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 20841 zoneid = lifr->lifr_zoneid; 20842 20843 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 20844 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 20845 20846 /* 20847 * We recheck the zone status to resolve the following race condition: 20848 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 20849 * 2) hme0:1 is up and can't be brought down right away; 20850 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 20851 * 3) zone "myzone" is halted; the zone status switches to 20852 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 20853 * the interfaces to remove - hme0:1 is not returned because it's not 20854 * yet in "myzone", so it won't be removed; 20855 * 4) the restart function for SIOCSLIFZONE is called; without the 20856 * status check here, we would have hme0:1 in "myzone" after it's been 20857 * destroyed. 20858 * Note that if the status check fails, we need to bring the interface 20859 * back to its state prior to ip_sioctl_slifzone(), hence the call to 20860 * ipif_up_done[_v6](). 
20861 */ 20862 status = ZONE_IS_UNINITIALIZED; 20863 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 20864 status = zone_status_get(zptr); 20865 zone_rele(zptr); 20866 } 20867 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 20868 if (ipif->ipif_isv6) { 20869 (void) ipif_up_done_v6(ipif); 20870 } else { 20871 (void) ipif_up_done(ipif); 20872 } 20873 return (EINVAL); 20874 } 20875 20876 ipif_down_tail(ipif); 20877 20878 return (ip_sioctl_slifzone_tail(ipif, zoneid, q, mp, B_TRUE)); 20879 } 20880 20881 /* ARGSUSED */ 20882 int 20883 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 20884 ip_ioctl_cmd_t *ipip, void *ifreq) 20885 { 20886 struct lifreq *lifr = ifreq; 20887 20888 ASSERT(q->q_next == NULL); 20889 ASSERT(CONN_Q(q)); 20890 20891 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 20892 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 20893 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 20894 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 20895 20896 return (0); 20897 } 20898 20899 20900 /* Find the previous ILL in this usesrc group */ 20901 static ill_t * 20902 ill_prev_usesrc(ill_t *uill) 20903 { 20904 ill_t *ill; 20905 20906 for (ill = uill->ill_usesrc_grp_next; 20907 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 20908 ill = ill->ill_usesrc_grp_next) 20909 /* do nothing */; 20910 return (ill); 20911 } 20912 20913 /* 20914 * Release all members of the usesrc group. This routine is called 20915 * from ill_delete when the interface being unplumbed is the 20916 * group head. 
 *
 * Starting at uill's successor, each member has its ill_usesrc_grp_next and
 * ill_usesrc_ifindex cleared; the walk stops at the first ill whose
 * ill_usesrc_ifindex is already zero, and finally uill's own link is
 * cleared.  The caller must hold ill_g_usesrc_lock as writer.
 */
static void
ill_disband_usesrc_group(ill_t *uill)
{
	ill_t *next_ill, *tmp_ill;
	ASSERT(RW_WRITE_HELD(&ill_g_usesrc_lock));
	next_ill = uill->ill_usesrc_grp_next;

	do {
		ASSERT(next_ill != NULL);
		/* Save the successor before the link is cleared. */
		tmp_ill = next_ill->ill_usesrc_grp_next;
		ASSERT(tmp_ill != NULL);
		next_ill->ill_usesrc_grp_next = NULL;
		next_ill->ill_usesrc_ifindex = 0;
		next_ill = tmp_ill;
	} while (next_ill->ill_usesrc_ifindex != 0);
	uill->ill_usesrc_grp_next = NULL;
}

/*
 * Remove the client usesrc ILL from the list and relink to a new list
 *
 * ucill is the client ill, which must currently be linked into a group;
 * uill is the usesrc ill of the destination group.  With ifindex == 0 the
 * client is only removed from its current group.  Otherwise it is also
 * inserted right after uill and its ill_usesrc_ifindex is set to ifindex.
 *
 * Returns 0 on success, or -1 if ucill is not a client or uill is itself a
 * client.  The caller must hold ill_g_usesrc_lock as writer.
 */
int
ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
{
	ill_t *ill, *tmp_ill;

	ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
	    (uill != NULL) && RW_WRITE_HELD(&ill_g_usesrc_lock));

	/*
	 * Check if the usesrc client ILL passed in is not already
	 * in use as a usesrc ILL i.e one whose source address is
	 * in use OR a usesrc ILL is not already in use as a usesrc
	 * client ILL
	 */
	if ((ucill->ill_usesrc_ifindex == 0) ||
	    (uill->ill_usesrc_ifindex != 0)) {
		return (-1);
	}

	/* ill is the member that currently points at ucill. */
	ill = ill_prev_usesrc(ucill);
	ASSERT(ill->ill_usesrc_grp_next != NULL);

	/* Remove from the current list */
	if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
		/* Only two elements in the list */
		ASSERT(ill->ill_usesrc_ifindex == 0);
		ill->ill_usesrc_grp_next = NULL;
	} else {
		ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
	}

	/* A zero ifindex means "reset": leave ucill outside any group. */
	if (ifindex == 0) {
		ucill->ill_usesrc_ifindex = 0;
		ucill->ill_usesrc_grp_next = NULL;
		return (0);
	}

	/*
	 * Insert ucill right after uill.  If uill had no group yet, the two
	 * of them form a new two-element circular list.
	 */
	ucill->ill_usesrc_ifindex = ifindex;
	tmp_ill = uill->ill_usesrc_grp_next;
	uill->ill_usesrc_grp_next = ucill;
	ucill->ill_usesrc_grp_next =
	    (tmp_ill != NULL) ? tmp_ill : uill;
	return (0);
}

/*
 * Set the ill_usesrc and ill_usesrc_head fields.  See synchronization notes in
 * ip.c for locking details.
 *
 * The ipif's ill is the usesrc client; lifr_index names the usesrc ill.  A
 * lifr_index of zero resets the client, i.e. removes it from its current
 * usesrc group (a no-op if it is not in one).
 *
 * Returns:
 *	0		success, nothing to reset, or duplicate request
 *	EINVAL		either ill is in an IPMP group or is a standby
 *			interface, the client is already a usesrc ill, the
 *			usesrc ill is already a client, or relinking failed
 *	EINPROGRESS	the operation was enqueued on the usesrc ill's ipsq
 *	other		the error set by ill_lookup_on_ifindex when the
 *			usesrc ill could not be looked up
 */
/* ARGSUSED */
int
ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq *lifr = (struct lifreq *)ifreq;
	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
	    ill_flag_changed = B_FALSE;
	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
	int err = 0, ret;
	uint_t ifindex;
	phyint_t *us_phyint, *us_cli_phyint;
	ipsq_t *ipsq = NULL;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(q->q_next == NULL);
	ASSERT(CONN_Q(q));

	isv6 = (Q_TO_CONN(q))->conn_af_isv6;
	us_cli_phyint = usesrc_cli_ill->ill_phyint;

	ASSERT(us_cli_phyint != NULL);

	/*
	 * If the client ILL is being used for IPMP, abort.
	 * Note, this can be done before ipsq_try_enter since we are already
	 * exclusive on this ILL
	 */
	if ((us_cli_phyint->phyint_groupname != NULL) ||
	    (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
		return (EINVAL);
	}

	ifindex = lifr->lifr_index;
	if (ifindex == 0) {
		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
			/* non usesrc group interface, nothing to reset */
			return (0);
		}
		/* Look up the client's current usesrc ill instead. */
		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
		/* valid reset request */
		reset_flg = B_TRUE;
	}

	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
	    ip_process_ioctl, &err);

	if (usesrc_ill == NULL) {
		return (err);
	}

	/*
	 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
	 * group nor can either of the interfaces be used for standby. So
	 * to guarantee mutual exclusion with ip_sioctl_flags (which sets
	 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
	 * we need to be exclusive on the ipsq belonging to the usesrc_ill.
	 * We are already exclusive on this ipsq i.e ipsq corresponding to
	 * the usesrc_cli_ill
	 */
	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
	    NEW_OP, B_TRUE);
	if (ipsq == NULL) {
		err = EINPROGRESS;
		/* Operation enqueued on the ipsq of the usesrc ILL */
		goto done;
	}

	/* Check if the usesrc_ill is used for IPMP */
	us_phyint = usesrc_ill->ill_phyint;
	if ((us_phyint->phyint_groupname != NULL) ||
	    (us_phyint->phyint_flags & PHYI_STANDBY)) {
		err = EINVAL;
		goto done;
	}

	/*
	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
	 * already a client then return EINVAL
	 */
	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
		err = EINVAL;
		goto done;
	}

	/*
	 * If the ill_usesrc_ifindex field is already set to what it needs to
	 * be then this is a duplicate operation.
	 */
	if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
		err = 0;
		goto done;
	}

	ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
	    " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
	    usesrc_ill->ill_isv6));

	/*
	 * The next step ensures that no new ires will be created referencing
	 * the client ill, until the ILL_CHANGING flag is cleared. Then
	 * we go through an ire walk deleting all ire caches that reference
	 * the client ill. New ires referencing the client ill that are added
	 * to the ire table before the ILL_CHANGING flag is set, will be
	 * cleaned up by the ire walk below. Attempt to add new ires referencing
	 * the client ill while the ILL_CHANGING flag is set will be failed
	 * during the ire_add in ire_atomic_start. ire_atomic_start atomically
	 * checks (under the ill_g_usesrc_lock) that the ire being added
	 * is not stale, i.e the ire_stq and ire_ipif are consistent and
	 * belong to the same usesrc group.
	 */
	mutex_enter(&usesrc_cli_ill->ill_lock);
	usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
	mutex_exit(&usesrc_cli_ill->ill_lock);
	/* Remember to clear ILL_CHANGING again at "done". */
	ill_flag_changed = B_TRUE;

	if (ipif->ipif_isv6)
		ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
		    ALL_ZONES);
	else
		ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
		    ALL_ZONES);

	/*
	 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
	 * and the ill_usesrc_ifindex fields
	 */
	rw_enter(&ill_g_usesrc_lock, RW_WRITER);

	if (reset_flg) {
		/* ifindex 0: only unlink the client from its group. */
		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
		if (ret != 0) {
			err = EINVAL;
		}
		rw_exit(&ill_g_usesrc_lock);
		goto done;
	}

	/*
	 * Four possibilities to consider:
	 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
	 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
	 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
	 * 4. Both are part of their respective usesrc groups
	 *
	 * Cases 3 and 4 are both handled by ill_relink_usesrc_ills.
	 */
	if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
		/* Case 1: create a new two-element circular list. */
		ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
		usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
	} else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
		/* Case 2: add the client to the existing group. */
		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
		/* Insert at head of list */
		usesrc_cli_ill->ill_usesrc_grp_next =
		    usesrc_ill->ill_usesrc_grp_next;
		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
	} else {
		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
		    ifindex);
		if (ret != 0)
			err = EINVAL;
	}
	rw_exit(&ill_g_usesrc_lock);

done:
	if (ill_flag_changed) {
		mutex_enter(&usesrc_cli_ill->ill_lock);
		usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
		mutex_exit(&usesrc_cli_ill->ill_lock);
	}
	if (ipsq != NULL)
		ipsq_exit(ipsq, B_TRUE, B_TRUE);
	/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
	ill_refrele(usesrc_ill);
	return (err);
}

/*
 * comparison function used by avl.
 *
 * index_ptr points at the uint_t interface index being searched for and
 * phyip at a phyint_t already in the tree.  Returns 1 if the phyint's index
 * is lower than the searched index, -1 if it is higher, and 0 if they are
 * equal.
 */
static int
ill_phyint_compare_index(const void *index_ptr, const void *phyip)
{

	uint_t index;

	ASSERT(phyip != NULL && index_ptr != NULL);

	index = *((uint_t *)index_ptr);
	/*
	 * let the phyint with the lowest index be on top.
	 */
	if (((phyint_t *)phyip)->phyint_ifindex < index)
		return (1);
	if (((phyint_t *)phyip)->phyint_ifindex > index)
		return (-1);
	return (0);
}

/*
 * comparison function used by avl.
21191 */ 21192 static int 21193 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 21194 { 21195 ill_t *ill; 21196 int res = 0; 21197 21198 ASSERT(phyip != NULL && name_ptr != NULL); 21199 21200 if (((phyint_t *)phyip)->phyint_illv4) 21201 ill = ((phyint_t *)phyip)->phyint_illv4; 21202 else 21203 ill = ((phyint_t *)phyip)->phyint_illv6; 21204 ASSERT(ill != NULL); 21205 21206 res = strcmp(ill->ill_name, (char *)name_ptr); 21207 if (res > 0) 21208 return (1); 21209 else if (res < 0) 21210 return (-1); 21211 return (0); 21212 } 21213 /* 21214 * This function is called from ill_delete when the ill is being 21215 * unplumbed. We remove the reference from the phyint and we also 21216 * free the phyint when there are no more references to it. 21217 */ 21218 static void 21219 ill_phyint_free(ill_t *ill) 21220 { 21221 phyint_t *phyi; 21222 phyint_t *next_phyint; 21223 ipsq_t *cur_ipsq; 21224 21225 ASSERT(ill->ill_phyint != NULL); 21226 21227 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 21228 phyi = ill->ill_phyint; 21229 ill->ill_phyint = NULL; 21230 /* 21231 * ill_init allocates a phyint always to store the copy 21232 * of flags relevant to phyint. At that point in time, we could 21233 * not assign the name and hence phyint_illv4/v6 could not be 21234 * initialized. Later in ipif_set_values, we assign the name to 21235 * the ill, at which point in time we assign phyint_illv4/v6. 21236 * Thus we don't rely on phyint_illv6 to be initialized always. 21237 */ 21238 if (ill->ill_flags & ILLF_IPV6) { 21239 phyi->phyint_illv6 = NULL; 21240 } else { 21241 phyi->phyint_illv4 = NULL; 21242 } 21243 /* 21244 * ipif_down removes it from the group when the last ipif goes 21245 * down. 21246 */ 21247 ASSERT(ill->ill_group == NULL); 21248 21249 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 21250 return; 21251 21252 /* 21253 * Make sure this phyint was put in the list. 
21254 */ 21255 if (phyi->phyint_ifindex > 0) { 21256 avl_remove(&phyint_g_list.phyint_list_avl_by_index, 21257 phyi); 21258 avl_remove(&phyint_g_list.phyint_list_avl_by_name, 21259 phyi); 21260 } 21261 /* 21262 * remove phyint from the ipsq list. 21263 */ 21264 cur_ipsq = phyi->phyint_ipsq; 21265 if (phyi == cur_ipsq->ipsq_phyint_list) { 21266 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 21267 } else { 21268 next_phyint = cur_ipsq->ipsq_phyint_list; 21269 while (next_phyint != NULL) { 21270 if (next_phyint->phyint_ipsq_next == phyi) { 21271 next_phyint->phyint_ipsq_next = 21272 phyi->phyint_ipsq_next; 21273 break; 21274 } 21275 next_phyint = next_phyint->phyint_ipsq_next; 21276 } 21277 ASSERT(next_phyint != NULL); 21278 } 21279 IPSQ_DEC_REF(cur_ipsq); 21280 21281 if (phyi->phyint_groupname_len != 0) { 21282 ASSERT(phyi->phyint_groupname != NULL); 21283 mi_free(phyi->phyint_groupname); 21284 } 21285 mi_free(phyi); 21286 } 21287 21288 /* 21289 * Attach the ill to the phyint structure which can be shared by both 21290 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 21291 * function is called from ipif_set_values and ill_lookup_on_name (for 21292 * loopback) where we know the name of the ill. We lookup the ill and if 21293 * there is one present already with the name use that phyint. Otherwise 21294 * reuse the one allocated by ill_init. 
 *
 * The caller must hold ill_g_lock as writer.  On return ill->ill_phyint
 * points at the phyint in use, and the ifindex has been copied to the ill's
 * first ipif (ipif_orig_ifindex) and, for IPv6, to the ill's MIBs.
 */
static void
ill_phyint_reinit(ill_t *ill)
{
	boolean_t isv6 = ill->ill_isv6;
	phyint_t *phyi_old;
	phyint_t *phyi;
	avl_index_t where = 0;
	ill_t	*ill_other = NULL;
	ipsq_t	*ipsq;

	ASSERT(RW_WRITE_HELD(&ill_g_lock));

	/* The placeholder phyint must reference only this ill, no index. */
	phyi_old = ill->ill_phyint;
	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
	    phyi_old->phyint_illv6 == NULL));
	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
	    phyi_old->phyint_illv4 == NULL));
	ASSERT(phyi_old->phyint_ifindex == 0);

	/* `where' is reused below as the insertion point if nothing found. */
	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name,
	    ill->ill_name, &where);

	/*
	 * 1. We grabbed the ill_g_lock before inserting this ill into
	 *    the global list of ills. So no other thread could have located
	 *    this ill and hence the ipsq of this ill is guaranteed to be empty.
	 * 2. Now locate the other protocol instance of this ill.
	 * 3. Now grab both ill locks in the right order, and the phyint lock of
	 *    the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
	 *    of neither ill can change.
	 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
	 *    other ill.
	 * 5. Release all locks.
	 */

	/*
	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
	 * we are initializing IPv4.
	 */
	if (phyi != NULL) {
		ill_other = (isv6) ? phyi->phyint_illv4 :
		    phyi->phyint_illv6;
		ASSERT(ill_other->ill_phyint != NULL);
		ASSERT((isv6 && !ill_other->ill_isv6) ||
		    (!isv6 && ill_other->ill_isv6));
		GRAB_ILL_LOCKS(ill, ill_other);
		/*
		 * We are potentially throwing away phyint_flags which
		 * could be different from the one that we obtain from
		 * ill_other->ill_phyint. But it is okay as we are assuming
		 * that the state maintained within IP is correct.
		 */
		mutex_enter(&phyi->phyint_lock);
		if (isv6) {
			ASSERT(phyi->phyint_illv6 == NULL);
			phyi->phyint_illv6 = ill;
		} else {
			ASSERT(phyi->phyint_illv4 == NULL);
			phyi->phyint_illv4 = ill;
		}
		/*
		 * This is a new ill, currently undergoing SLIFNAME
		 * So we could not have joined an IPMP group until now.
		 */
		ASSERT(phyi_old->phyint_ipsq_next == NULL &&
		    phyi_old->phyint_groupname == NULL);

		/*
		 * This phyi_old is going away. Decref ipsq_refs and
		 * assert it is zero. The ipsq itself will be freed in
		 * ipsq_exit
		 */
		ipsq = phyi_old->phyint_ipsq;
		IPSQ_DEC_REF(ipsq);
		ASSERT(ipsq->ipsq_refs == 0);
		/* Get the singleton phyint out of the ipsq list */
		ASSERT(phyi_old->phyint_ipsq_next == NULL);
		ipsq->ipsq_phyint_list = NULL;
		phyi_old->phyint_illv4 = NULL;
		phyi_old->phyint_illv6 = NULL;
		mi_free(phyi_old);
	} else {
		mutex_enter(&ill->ill_lock);
		/*
		 * We don't need to acquire any lock, since
		 * the ill is not yet visible globally and we
		 * have not yet released the ill_g_lock.
		 *
		 * NOTE(review): ill_lock and phyint_lock are nevertheless
		 * taken here; presumably so that the common release code at
		 * the end of the function applies to both branches.
		 */
		phyi = phyi_old;
		mutex_enter(&phyi->phyint_lock);
		/* XXX We need a recovery strategy here. */
		if (!phyint_assign_ifindex(phyi))
			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");

		/* Insert by name at the point found by avl_find above. */
		avl_insert(&phyint_g_list.phyint_list_avl_by_name,
		    (void *)phyi, where);

		/* Find the insertion point for the newly assigned index. */
		(void) avl_find(&phyint_g_list.phyint_list_avl_by_index,
		    &phyi->phyint_ifindex, &where);
		avl_insert(&phyint_g_list.phyint_list_avl_by_index,
		    (void *)phyi, where);
	}

	/*
	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * pending mp is not affected because that is per ill basis.
	 */
	ill->ill_phyint = phyi;

	/*
	 * Keep the index on ipif_orig_index to be used by FAILOVER.
	 * We do this here as when the first ipif was allocated,
	 * ipif_allocate does not know the right interface index.
	 */

	ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex;
	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining ifindex assignments, i.e. those in the IPv6 MIBs.
	 */
	if (ill->ill_isv6) {
		ill->ill_ip6_mib->ipv6IfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
	}

	RELEASE_ILL_LOCKS(ill, ill_other);
	mutex_exit(&phyi->phyint_lock);
}

/*
 * Notify any downstream modules of the name of this interface.
 * An M_IOCTL is used even though we don't expect a successful reply.
 * Any reply message from the driver (presumably an M_IOCNAK) will
 * eventually get discarded somewhere upstream.  The message format is
 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
 * to IP.
21434 */ 21435 static void 21436 ip_ifname_notify(ill_t *ill, queue_t *q) 21437 { 21438 mblk_t *mp1, *mp2; 21439 struct iocblk *iocp; 21440 struct lifreq *lifr; 21441 21442 mp1 = mkiocb(SIOCSLIFNAME); 21443 if (mp1 == NULL) 21444 return; 21445 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 21446 if (mp2 == NULL) { 21447 freeb(mp1); 21448 return; 21449 } 21450 21451 mp1->b_cont = mp2; 21452 iocp = (struct iocblk *)mp1->b_rptr; 21453 iocp->ioc_count = sizeof (struct lifreq); 21454 21455 lifr = (struct lifreq *)mp2->b_rptr; 21456 mp2->b_wptr += sizeof (struct lifreq); 21457 bzero(lifr, sizeof (struct lifreq)); 21458 21459 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 21460 lifr->lifr_ppa = ill->ill_ppa; 21461 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 21462 21463 putnext(q, mp1); 21464 } 21465 21466 static boolean_t ip_trash_timer_started = B_FALSE; 21467 21468 static int 21469 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 21470 { 21471 int err; 21472 21473 /* Set the obsolete NDD per-interface forwarding name. */ 21474 err = ill_set_ndd_name(ill); 21475 if (err != 0) { 21476 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 21477 err); 21478 } 21479 21480 /* Tell downstream modules where they are. */ 21481 ip_ifname_notify(ill, q); 21482 21483 /* 21484 * ill_dl_phys returns EINPROGRESS in the usual case. 21485 * Error cases are ENOMEM ... 21486 */ 21487 err = ill_dl_phys(ill, ipif, mp, q); 21488 21489 /* 21490 * If there is no IRE expiration timer running, get one started. 21491 * igmp and mld timers will be triggered by the first multicast 21492 */ 21493 if (!ip_trash_timer_started) { 21494 /* 21495 * acquire the lock and check again. 
21496 */ 21497 mutex_enter(&ip_trash_timer_lock); 21498 if (!ip_trash_timer_started) { 21499 ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL, 21500 MSEC_TO_TICK(ip_timer_interval)); 21501 ip_trash_timer_started = B_TRUE; 21502 } 21503 mutex_exit(&ip_trash_timer_lock); 21504 } 21505 21506 if (ill->ill_isv6) { 21507 mutex_enter(&mld_slowtimeout_lock); 21508 if (mld_slowtimeout_id == 0) { 21509 mld_slowtimeout_id = timeout(mld_slowtimo, NULL, 21510 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 21511 } 21512 mutex_exit(&mld_slowtimeout_lock); 21513 } else { 21514 mutex_enter(&igmp_slowtimeout_lock); 21515 if (igmp_slowtimeout_id == 0) { 21516 igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL, 21517 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 21518 } 21519 mutex_exit(&igmp_slowtimeout_lock); 21520 } 21521 21522 return (err); 21523 } 21524 21525 /* 21526 * Common routine for ppa and ifname setting. Should be called exclusive. 21527 * 21528 * Returns EINPROGRESS when mp has been consumed by queueing it on 21529 * ill_pending_mp and the ioctl will complete in ip_rput. 21530 * 21531 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 21532 * the new name and new ppa in lifr_name and lifr_ppa respectively. 21533 * For SLIFNAME, we pass these values back to the userland. 
21534 */ 21535 static int 21536 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 21537 { 21538 ill_t *ill; 21539 ipif_t *ipif; 21540 ipsq_t *ipsq; 21541 char *ppa_ptr; 21542 char *old_ptr; 21543 char old_char; 21544 int error; 21545 21546 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 21547 ASSERT(q->q_next != NULL); 21548 ASSERT(interf_name != NULL); 21549 21550 ill = (ill_t *)q->q_ptr; 21551 21552 ASSERT(ill->ill_name[0] == '\0'); 21553 ASSERT(IAM_WRITER_ILL(ill)); 21554 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 21555 ASSERT(ill->ill_ppa == UINT_MAX); 21556 21557 /* The ppa is sent down by ifconfig or is chosen */ 21558 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 21559 return (EINVAL); 21560 } 21561 21562 /* 21563 * make sure ppa passed in is same as ppa in the name. 21564 * This check is not made when ppa == UINT_MAX in that case ppa 21565 * in the name could be anything. System will choose a ppa and 21566 * update new_ppa_ptr and inter_name to contain the choosen ppa. 21567 */ 21568 if (*new_ppa_ptr != UINT_MAX) { 21569 /* stoi changes the pointer */ 21570 old_ptr = ppa_ptr; 21571 /* 21572 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 21573 * (they don't have an externally visible ppa). We assign one 21574 * here so that we can manage the interface. Note that in 21575 * the past this value was always 0 for DLPI 1 drivers. 21576 */ 21577 if (*new_ppa_ptr == 0) 21578 *new_ppa_ptr = stoi(&old_ptr); 21579 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 21580 return (EINVAL); 21581 } 21582 /* 21583 * terminate string before ppa 21584 * save char at that location. 21585 */ 21586 old_char = ppa_ptr[0]; 21587 ppa_ptr[0] = '\0'; 21588 21589 ill->ill_ppa = *new_ppa_ptr; 21590 /* 21591 * Finish as much work now as possible before calling ill_glist_insert 21592 * which makes the ill globally visible and also merges it with the 21593 * other protocol instance of this phyint. 
The remaining work is 21594 * done after entering the ipsq which may happen sometime later. 21595 * ill_set_ndd_name occurs after the ill has been made globally visible. 21596 */ 21597 ipif = ill->ill_ipif; 21598 21599 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 21600 ipif_assign_seqid(ipif); 21601 21602 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 21603 ill->ill_flags |= ILLF_IPV4; 21604 21605 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 21606 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 21607 21608 if (ill->ill_flags & ILLF_IPV6) { 21609 21610 ill->ill_isv6 = B_TRUE; 21611 if (ill->ill_rq != NULL) { 21612 ill->ill_rq->q_qinfo = &rinit_ipv6; 21613 ill->ill_wq->q_qinfo = &winit_ipv6; 21614 } 21615 21616 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 21617 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 21618 ipif->ipif_v6src_addr = ipv6_all_zeros; 21619 ipif->ipif_v6subnet = ipv6_all_zeros; 21620 ipif->ipif_v6net_mask = ipv6_all_zeros; 21621 ipif->ipif_v6brd_addr = ipv6_all_zeros; 21622 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 21623 /* 21624 * point-to-point or Non-mulicast capable 21625 * interfaces won't do NUD unless explicitly 21626 * configured to do so. 21627 */ 21628 if (ipif->ipif_flags & IPIF_POINTOPOINT || 21629 !(ill->ill_flags & ILLF_MULTICAST)) { 21630 ill->ill_flags |= ILLF_NONUD; 21631 } 21632 /* Make sure IPv4 specific flag is not set on IPv6 if */ 21633 if (ill->ill_flags & ILLF_NOARP) { 21634 /* 21635 * Note: xresolv interfaces will eventually need 21636 * NOARP set here as well, but that will require 21637 * those external resolvers to have some 21638 * knowledge of that flag and act appropriately. 21639 * Not to be changed at present. 21640 */ 21641 ill->ill_flags &= ~ILLF_NOARP; 21642 } 21643 /* 21644 * Set the ILLF_ROUTER flag according to the global 21645 * IPv6 forwarding policy. 
21646 */ 21647 if (ipv6_forward != 0) 21648 ill->ill_flags |= ILLF_ROUTER; 21649 } else if (ill->ill_flags & ILLF_IPV4) { 21650 ill->ill_isv6 = B_FALSE; 21651 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 21652 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 21653 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 21654 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 21655 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 21656 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 21657 /* 21658 * Set the ILLF_ROUTER flag according to the global 21659 * IPv4 forwarding policy. 21660 */ 21661 if (ip_g_forward != 0) 21662 ill->ill_flags |= ILLF_ROUTER; 21663 } 21664 21665 ASSERT(ill->ill_phyint != NULL); 21666 21667 /* 21668 * The ipv6Ifindex and ipv6IfIcmpIfIndex assignments will 21669 * be completed in ill_glist_insert -> ill_phyint_reinit 21670 */ 21671 if (ill->ill_isv6) { 21672 /* allocate v6 mib */ 21673 if (!ill_allocate_mibs(ill)) 21674 return (ENOMEM); 21675 } 21676 21677 /* 21678 * Pick a default sap until we get the DL_INFO_ACK back from 21679 * the driver. 21680 */ 21681 if (ill->ill_sap == 0) { 21682 if (ill->ill_isv6) 21683 ill->ill_sap = IP6_DL_SAP; 21684 else 21685 ill->ill_sap = IP_DL_SAP; 21686 } 21687 21688 ill->ill_ifname_pending = 1; 21689 ill->ill_ifname_pending_err = 0; 21690 21691 ill_refhold(ill); 21692 rw_enter(&ill_g_lock, RW_WRITER); 21693 if ((error = ill_glist_insert(ill, interf_name, 21694 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 21695 ill->ill_ppa = UINT_MAX; 21696 ill->ill_name[0] = '\0'; 21697 /* 21698 * undo null termination done above. 21699 */ 21700 ppa_ptr[0] = old_char; 21701 rw_exit(&ill_g_lock); 21702 ill_refrele(ill); 21703 return (error); 21704 } 21705 21706 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 21707 21708 /* 21709 * When we return the buffer pointed to by interf_name should contain 21710 * the same name as in ill_name. 
21711 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 21712 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 21713 * so copy full name and update the ppa ptr. 21714 * When ppa passed in != UINT_MAX all values are correct just undo 21715 * null termination, this saves a bcopy. 21716 */ 21717 if (*new_ppa_ptr == UINT_MAX) { 21718 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 21719 *new_ppa_ptr = ill->ill_ppa; 21720 } else { 21721 /* 21722 * undo null termination done above. 21723 */ 21724 ppa_ptr[0] = old_char; 21725 } 21726 21727 /* Let SCTP know about this ILL */ 21728 sctp_update_ill(ill, SCTP_ILL_INSERT); 21729 21730 /* and also about the first ipif */ 21731 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 21732 21733 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP, 21734 B_TRUE); 21735 21736 rw_exit(&ill_g_lock); 21737 ill_refrele(ill); 21738 if (ipsq == NULL) 21739 return (EINPROGRESS); 21740 21741 /* 21742 * Need to set the ipsq_current_ipif now, if we have changed ipsq 21743 * due to the phyint merge in ill_phyint_reinit. 21744 */ 21745 ASSERT(ipsq->ipsq_current_ipif == NULL || 21746 ipsq->ipsq_current_ipif == ipif); 21747 ipsq->ipsq_current_ipif = ipif; 21748 ipsq->ipsq_last_cmd = SIOCSLIFNAME; 21749 error = ipif_set_values_tail(ill, ipif, mp, q); 21750 ipsq_exit(ipsq, B_TRUE, B_TRUE); 21751 if (error != 0 && error != EINPROGRESS) { 21752 /* 21753 * restore previous values 21754 */ 21755 ill->ill_isv6 = B_FALSE; 21756 } 21757 return (error); 21758 } 21759 21760 21761 extern void (*ip_cleanup_func)(void); 21762 21763 void 21764 ipif_init(void) 21765 { 21766 hrtime_t hrt; 21767 int i; 21768 21769 /* 21770 * Can't call drv_getparm here as it is too early in the boot. 21771 * As we use ipif_src_random just for picking a different 21772 * source address everytime, this need not be really random. 
21773 */ 21774 hrt = gethrtime(); 21775 ipif_src_random = ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); 21776 21777 for (i = 0; i < MAX_G_HEADS; i++) { 21778 ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ill_g_heads[i]; 21779 ill_g_heads[i].ill_g_list_tail = (ill_if_t *)&ill_g_heads[i]; 21780 } 21781 21782 avl_create(&phyint_g_list.phyint_list_avl_by_index, 21783 ill_phyint_compare_index, 21784 sizeof (phyint_t), 21785 offsetof(struct phyint, phyint_avl_by_index)); 21786 avl_create(&phyint_g_list.phyint_list_avl_by_name, 21787 ill_phyint_compare_name, 21788 sizeof (phyint_t), 21789 offsetof(struct phyint, phyint_avl_by_name)); 21790 21791 ip_cleanup_func = ip_thread_exit; 21792 } 21793 21794 /* 21795 * This is called by ip_rt_add when src_addr value is other than zero. 21796 * src_addr signifies the source address of the incoming packet. For 21797 * reverse tunnel route we need to create a source addr based routing 21798 * table. This routine creates ip_mrtun_table if it's empty and then 21799 * it adds the route entry hashed by source address. It verifies that 21800 * the outgoing interface is always a non-resolver interface (tunnel). 21801 */ 21802 int 21803 ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg, 21804 ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func) 21805 { 21806 ire_t *ire; 21807 ire_t *save_ire; 21808 ipif_t *ipif; 21809 ill_t *in_ill = NULL; 21810 ill_t *out_ill; 21811 queue_t *stq; 21812 mblk_t *dlureq_mp; 21813 int error; 21814 21815 if (ire_arg != NULL) 21816 *ire_arg = NULL; 21817 ASSERT(in_src_addr != INADDR_ANY); 21818 21819 ipif = ipif_arg; 21820 if (ipif != NULL) { 21821 out_ill = ipif->ipif_ill; 21822 } else { 21823 ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n")); 21824 return (EINVAL); 21825 } 21826 21827 if (src_ipif == NULL) { 21828 ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n")); 21829 return (EINVAL); 21830 } 21831 in_ill = src_ipif->ipif_ill; 21832 21833 /* 21834 * Check for duplicates. 
We don't need to 21835 * match out_ill, because the uniqueness of 21836 * a route is only dependent on src_addr and 21837 * in_ill. 21838 */ 21839 ire = ire_mrtun_lookup(in_src_addr, in_ill); 21840 if (ire != NULL) { 21841 ire_refrele(ire); 21842 return (EEXIST); 21843 } 21844 if (ipif->ipif_net_type != IRE_IF_NORESOLVER) { 21845 ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n", 21846 ipif->ipif_net_type)); 21847 return (EINVAL); 21848 } 21849 21850 stq = ipif->ipif_wq; 21851 ASSERT(stq != NULL); 21852 21853 /* 21854 * The outgoing interface must be non-resolver 21855 * interface. 21856 */ 21857 dlureq_mp = ill_dlur_gen(NULL, 21858 out_ill->ill_phys_addr_length, out_ill->ill_sap, 21859 out_ill->ill_sap_length); 21860 21861 if (dlureq_mp == NULL) { 21862 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 21863 return (ENOMEM); 21864 } 21865 21866 /* Create the IRE. */ 21867 21868 ire = ire_create( 21869 NULL, /* Zero dst addr */ 21870 NULL, /* Zero mask */ 21871 NULL, /* Zero gateway addr */ 21872 NULL, /* Zero ipif_src addr */ 21873 (uint8_t *)&in_src_addr, /* in_src-addr */ 21874 &ipif->ipif_mtu, 21875 NULL, 21876 NULL, /* rfq */ 21877 stq, 21878 IRE_MIPRTUN, 21879 dlureq_mp, 21880 ipif, 21881 in_ill, 21882 0, 21883 0, 21884 0, 21885 flags, 21886 &ire_uinfo_null); 21887 21888 if (ire == NULL) 21889 return (ENOMEM); 21890 ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n", 21891 ire->ire_type)); 21892 save_ire = ire; 21893 ASSERT(save_ire != NULL); 21894 error = ire_add_mrtun(&ire, q, mp, func); 21895 /* 21896 * If ire_add_mrtun() failed, the ire passed in was freed 21897 * so there is no need to do so here. 21898 */ 21899 if (error != 0) { 21900 return (error); 21901 } 21902 21903 /* Duplicate check */ 21904 if (ire != save_ire) { 21905 /* route already exists by now */ 21906 ire_refrele(ire); 21907 return (EEXIST); 21908 } 21909 21910 if (ire_arg != NULL) { 21911 /* 21912 * Store the ire that was just added. 
the caller 21913 * ip_rts_request responsible for doing ire_refrele() 21914 * on it. 21915 */ 21916 *ire_arg = ire; 21917 } else { 21918 ire_refrele(ire); /* held in ire_add_mrtun */ 21919 } 21920 21921 return (0); 21922 } 21923 21924 /* 21925 * It is called by ip_rt_delete() only when mipagent requests to delete 21926 * a reverse tunnel route that was added by ip_mrtun_rt_add() before. 21927 */ 21928 21929 int 21930 ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif) 21931 { 21932 ire_t *ire = NULL; 21933 21934 if (in_src_addr == INADDR_ANY) 21935 return (EINVAL); 21936 if (src_ipif == NULL) 21937 return (EINVAL); 21938 21939 /* search if this route exists in the ip_mrtun_table */ 21940 ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill); 21941 if (ire == NULL) { 21942 ip2dbg(("ip_mrtun_rt_delete: ire not found\n")); 21943 return (ESRCH); 21944 } 21945 ire_delete(ire); 21946 ire_refrele(ire); 21947 return (0); 21948 } 21949 21950 /* 21951 * Lookup the ipif corresponding to the onlink destination address. For 21952 * point-to-point interfaces, it matches with remote endpoint destination 21953 * address. For point-to-multipoint interfaces it only tries to match the 21954 * destination with the interface's subnet address. The longest, most specific 21955 * match is found to take care of such rare network configurations like - 21956 * le0: 129.146.1.1/16 21957 * le1: 129.146.2.2/24 21958 * It is used only by SO_DONTROUTE at the moment. 
21959 */ 21960 ipif_t * 21961 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid) 21962 { 21963 ipif_t *ipif, *best_ipif; 21964 ill_t *ill; 21965 ill_walk_context_t ctx; 21966 21967 ASSERT(zoneid != ALL_ZONES); 21968 best_ipif = NULL; 21969 21970 rw_enter(&ill_g_lock, RW_READER); 21971 ill = ILL_START_WALK_V4(&ctx); 21972 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 21973 mutex_enter(&ill->ill_lock); 21974 for (ipif = ill->ill_ipif; ipif != NULL; 21975 ipif = ipif->ipif_next) { 21976 if (!IPIF_CAN_LOOKUP(ipif)) 21977 continue; 21978 if (ipif->ipif_zoneid != zoneid) 21979 continue; 21980 /* 21981 * Point-to-point case. Look for exact match with 21982 * destination address. 21983 */ 21984 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 21985 if (ipif->ipif_pp_dst_addr == addr) { 21986 ipif_refhold_locked(ipif); 21987 mutex_exit(&ill->ill_lock); 21988 rw_exit(&ill_g_lock); 21989 if (best_ipif != NULL) 21990 ipif_refrele(best_ipif); 21991 return (ipif); 21992 } 21993 } else if (ipif->ipif_subnet == (addr & 21994 ipif->ipif_net_mask)) { 21995 /* 21996 * Point-to-multipoint case. Looping through to 21997 * find the most specific match. If there are 21998 * multiple best match ipif's then prefer ipif's 21999 * that are UP. If there is only one best match 22000 * ipif and it is DOWN we must still return it. 
22001 */ 22002 if ((best_ipif == NULL) || 22003 (ipif->ipif_net_mask > 22004 best_ipif->ipif_net_mask) || 22005 ((ipif->ipif_net_mask == 22006 best_ipif->ipif_net_mask) && 22007 ((ipif->ipif_flags & IPIF_UP) && 22008 (!(best_ipif->ipif_flags & IPIF_UP))))) { 22009 ipif_refhold_locked(ipif); 22010 mutex_exit(&ill->ill_lock); 22011 rw_exit(&ill_g_lock); 22012 if (best_ipif != NULL) 22013 ipif_refrele(best_ipif); 22014 best_ipif = ipif; 22015 rw_enter(&ill_g_lock, RW_READER); 22016 mutex_enter(&ill->ill_lock); 22017 } 22018 } 22019 } 22020 mutex_exit(&ill->ill_lock); 22021 } 22022 rw_exit(&ill_g_lock); 22023 return (best_ipif); 22024 } 22025 22026 22027 /* 22028 * Save enough information so that we can recreate the IRE if 22029 * the interface goes down and then up. 22030 */ 22031 static void 22032 ipif_save_ire(ipif_t *ipif, ire_t *ire) 22033 { 22034 mblk_t *save_mp; 22035 22036 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 22037 if (save_mp != NULL) { 22038 ifrt_t *ifrt; 22039 22040 save_mp->b_wptr += sizeof (ifrt_t); 22041 ifrt = (ifrt_t *)save_mp->b_rptr; 22042 bzero(ifrt, sizeof (ifrt_t)); 22043 ifrt->ifrt_type = ire->ire_type; 22044 ifrt->ifrt_addr = ire->ire_addr; 22045 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 22046 ifrt->ifrt_src_addr = ire->ire_src_addr; 22047 ifrt->ifrt_mask = ire->ire_mask; 22048 ifrt->ifrt_flags = ire->ire_flags; 22049 ifrt->ifrt_max_frag = ire->ire_max_frag; 22050 mutex_enter(&ipif->ipif_saved_ire_lock); 22051 save_mp->b_cont = ipif->ipif_saved_ire_mp; 22052 ipif->ipif_saved_ire_mp = save_mp; 22053 ipif->ipif_saved_ire_cnt++; 22054 mutex_exit(&ipif->ipif_saved_ire_lock); 22055 } 22056 } 22057 22058 22059 static void 22060 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 22061 { 22062 mblk_t **mpp; 22063 mblk_t *mp; 22064 ifrt_t *ifrt; 22065 22066 /* Remove from ipif_saved_ire_mp list if it is there */ 22067 mutex_enter(&ipif->ipif_saved_ire_lock); 22068 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 22069 mpp = &(*mpp)->b_cont) { 
22070 /* 22071 * On a given ipif, the triple of address, gateway and 22072 * mask is unique for each saved IRE (in the case of 22073 * ordinary interface routes, the gateway address is 22074 * all-zeroes). 22075 */ 22076 mp = *mpp; 22077 ifrt = (ifrt_t *)mp->b_rptr; 22078 if (ifrt->ifrt_addr == ire->ire_addr && 22079 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 22080 ifrt->ifrt_mask == ire->ire_mask) { 22081 *mpp = mp->b_cont; 22082 ipif->ipif_saved_ire_cnt--; 22083 freeb(mp); 22084 break; 22085 } 22086 } 22087 mutex_exit(&ipif->ipif_saved_ire_lock); 22088 } 22089 22090 22091 /* 22092 * IP multirouting broadcast routes handling 22093 * Append CGTP broadcast IREs to regular ones created 22094 * at ifconfig time. 22095 */ 22096 static void 22097 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst) 22098 { 22099 ire_t *ire_prim; 22100 22101 ASSERT(ire != NULL); 22102 ASSERT(ire_dst != NULL); 22103 22104 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 22105 IRE_BROADCAST, NULL, NULL, MATCH_IRE_TYPE); 22106 if (ire_prim != NULL) { 22107 /* 22108 * We are in the special case of broadcasts for 22109 * CGTP. We add an IRE_BROADCAST that holds 22110 * the RTF_MULTIRT flag, the destination 22111 * address of ire_dst and the low level 22112 * info of ire_prim. In other words, CGTP 22113 * broadcast is added to the redundant ipif. 
22114 */ 22115 ipif_t *ipif_prim; 22116 ire_t *bcast_ire; 22117 22118 ipif_prim = ire_prim->ire_ipif; 22119 22120 ip2dbg(("ip_cgtp_filter_bcast_add: " 22121 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 22122 (void *)ire_dst, (void *)ire_prim, 22123 (void *)ipif_prim)); 22124 22125 bcast_ire = ire_create( 22126 (uchar_t *)&ire->ire_addr, 22127 (uchar_t *)&ip_g_all_ones, 22128 (uchar_t *)&ire_dst->ire_src_addr, 22129 (uchar_t *)&ire->ire_gateway_addr, 22130 NULL, 22131 &ipif_prim->ipif_mtu, 22132 NULL, 22133 ipif_prim->ipif_rq, 22134 ipif_prim->ipif_wq, 22135 IRE_BROADCAST, 22136 ipif_prim->ipif_bcast_mp, 22137 ipif_prim, 22138 NULL, 22139 0, 22140 0, 22141 0, 22142 ire->ire_flags, 22143 &ire_uinfo_null); 22144 22145 if (bcast_ire != NULL) { 22146 22147 if (ire_add(&bcast_ire, NULL, NULL, NULL) == 0) { 22148 ip2dbg(("ip_cgtp_filter_bcast_add: " 22149 "added bcast_ire %p\n", 22150 (void *)bcast_ire)); 22151 22152 ipif_save_ire(bcast_ire->ire_ipif, 22153 bcast_ire); 22154 ire_refrele(bcast_ire); 22155 } 22156 } 22157 ire_refrele(ire_prim); 22158 } 22159 } 22160 22161 22162 /* 22163 * IP multirouting broadcast routes handling 22164 * Remove the broadcast ire 22165 */ 22166 static void 22167 ip_cgtp_bcast_delete(ire_t *ire) 22168 { 22169 ire_t *ire_dst; 22170 22171 ASSERT(ire != NULL); 22172 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 22173 NULL, NULL, MATCH_IRE_TYPE); 22174 if (ire_dst != NULL) { 22175 ire_t *ire_prim; 22176 22177 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 22178 IRE_BROADCAST, NULL, NULL, MATCH_IRE_TYPE); 22179 if (ire_prim != NULL) { 22180 ipif_t *ipif_prim; 22181 ire_t *bcast_ire; 22182 22183 ipif_prim = ire_prim->ire_ipif; 22184 22185 ip2dbg(("ip_cgtp_filter_bcast_delete: " 22186 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 22187 (void *)ire_dst, (void *)ire_prim, 22188 (void *)ipif_prim)); 22189 22190 bcast_ire = ire_ctable_lookup(ire->ire_addr, 22191 ire->ire_gateway_addr, 22192 IRE_BROADCAST, 22193 ipif_prim, 22194 NULL, 
22195 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 22196 MATCH_IRE_MASK); 22197 22198 if (bcast_ire != NULL) { 22199 ip2dbg(("ip_cgtp_filter_bcast_delete: " 22200 "looked up bcast_ire %p\n", 22201 (void *)bcast_ire)); 22202 ipif_remove_ire(bcast_ire->ire_ipif, 22203 bcast_ire); 22204 ire_delete(bcast_ire); 22205 } 22206 ire_refrele(ire_prim); 22207 } 22208 ire_refrele(ire_dst); 22209 } 22210 } 22211 22212 /* 22213 * IPsec hardware acceleration capabilities related functions. 22214 */ 22215 22216 /* 22217 * Free a per-ill IPsec capabilities structure. 22218 */ 22219 static void 22220 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 22221 { 22222 if (capab->auth_hw_algs != NULL) 22223 kmem_free(capab->auth_hw_algs, capab->algs_size); 22224 if (capab->encr_hw_algs != NULL) 22225 kmem_free(capab->encr_hw_algs, capab->algs_size); 22226 if (capab->encr_algparm != NULL) 22227 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 22228 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 22229 } 22230 22231 /* 22232 * Allocate a new per-ill IPsec capabilities structure. This structure 22233 * is specific to an IPsec protocol (AH or ESP). It is implemented as 22234 * an array which specifies, for each algorithm, whether this algorithm 22235 * is supported by the ill or not. 
22236 */ 22237 static ill_ipsec_capab_t * 22238 ill_ipsec_capab_alloc(void) 22239 { 22240 ill_ipsec_capab_t *capab; 22241 uint_t nelems; 22242 22243 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 22244 if (capab == NULL) 22245 return (NULL); 22246 22247 /* we need one bit per algorithm */ 22248 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 22249 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 22250 22251 /* allocate memory to store algorithm flags */ 22252 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 22253 if (capab->encr_hw_algs == NULL) 22254 goto nomem; 22255 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 22256 if (capab->auth_hw_algs == NULL) 22257 goto nomem; 22258 /* 22259 * Leave encr_algparm NULL for now since we won't need it half 22260 * the time 22261 */ 22262 return (capab); 22263 22264 nomem: 22265 ill_ipsec_capab_free(capab); 22266 return (NULL); 22267 } 22268 22269 /* 22270 * Resize capability array. Since we're exclusive, this is OK. 22271 */ 22272 static boolean_t 22273 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 22274 { 22275 ipsec_capab_algparm_t *nalp, *oalp; 22276 uint32_t olen, nlen; 22277 22278 oalp = capab->encr_algparm; 22279 olen = capab->encr_algparm_size; 22280 22281 if (oalp != NULL) { 22282 if (algid < capab->encr_algparm_end) 22283 return (B_TRUE); 22284 } 22285 22286 nlen = (algid + 1) * sizeof (*nalp); 22287 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 22288 if (nalp == NULL) 22289 return (B_FALSE); 22290 22291 if (oalp != NULL) { 22292 bcopy(oalp, nalp, olen); 22293 kmem_free(oalp, olen); 22294 } 22295 capab->encr_algparm = nalp; 22296 capab->encr_algparm_size = nlen; 22297 capab->encr_algparm_end = algid + 1; 22298 22299 return (B_TRUE); 22300 } 22301 22302 /* 22303 * Compare the capabilities of the specified ill with the protocol 22304 * and algorithms specified by the SA passed as argument. 
22305 * If they match, returns B_TRUE, B_FALSE if they do not match. 22306 * 22307 * The ill can be passed as a pointer to it, or by specifying its index 22308 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 22309 * 22310 * Called by ipsec_out_is_accelerated() do decide whether an outbound 22311 * packet is eligible for hardware acceleration, and by 22312 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down 22313 * to a particular ill. 22314 */ 22315 boolean_t 22316 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 22317 ipsa_t *sa) 22318 { 22319 boolean_t sa_isv6; 22320 uint_t algid; 22321 struct ill_ipsec_capab_s *cpp; 22322 boolean_t need_refrele = B_FALSE; 22323 22324 if (ill == NULL) { 22325 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 22326 NULL, NULL, NULL); 22327 if (ill == NULL) { 22328 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 22329 return (B_FALSE); 22330 } 22331 need_refrele = B_TRUE; 22332 } 22333 22334 /* 22335 * Use the address length specified by the SA to determine 22336 * if it corresponds to a IPv6 address, and fail the matching 22337 * if the isv6 flag passed as argument does not match. 22338 * Note: this check is used for SADB capability checking before 22339 * sending SA information to an ill. 22340 */ 22341 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 22342 if (sa_isv6 != ill_isv6) 22343 /* protocol mismatch */ 22344 goto done; 22345 22346 /* 22347 * Check if the ill supports the protocol, algorithm(s) and 22348 * key size(s) specified by the SA, and get the pointers to 22349 * the algorithms supported by the ill. 
22350 */ 22351 switch (sa->ipsa_type) { 22352 22353 case SADB_SATYPE_ESP: 22354 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 22355 /* ill does not support ESP acceleration */ 22356 goto done; 22357 cpp = ill->ill_ipsec_capab_esp; 22358 algid = sa->ipsa_auth_alg; 22359 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 22360 goto done; 22361 algid = sa->ipsa_encr_alg; 22362 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 22363 goto done; 22364 if (algid < cpp->encr_algparm_end) { 22365 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 22366 if (sa->ipsa_encrkeybits < alp->minkeylen) 22367 goto done; 22368 if (sa->ipsa_encrkeybits > alp->maxkeylen) 22369 goto done; 22370 } 22371 break; 22372 22373 case SADB_SATYPE_AH: 22374 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 22375 /* ill does not support AH acceleration */ 22376 goto done; 22377 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 22378 ill->ill_ipsec_capab_ah->auth_hw_algs)) 22379 goto done; 22380 break; 22381 } 22382 22383 if (need_refrele) 22384 ill_refrele(ill); 22385 return (B_TRUE); 22386 done: 22387 if (need_refrele) 22388 ill_refrele(ill); 22389 return (B_FALSE); 22390 } 22391 22392 22393 /* 22394 * Add a new ill to the list of IPsec capable ills. 22395 * Called from ill_capability_ipsec_ack() when an ACK was received 22396 * indicating that IPsec hardware processing was enabled for an ill. 22397 * 22398 * ill must point to the ill for which acceleration was enabled. 22399 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
22400 */ 22401 static void 22402 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 22403 { 22404 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 22405 uint_t sa_type; 22406 uint_t ipproto; 22407 22408 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 22409 (dl_cap == DL_CAPAB_IPSEC_ESP)); 22410 22411 switch (dl_cap) { 22412 case DL_CAPAB_IPSEC_AH: 22413 sa_type = SADB_SATYPE_AH; 22414 ills = &ipsec_capab_ills_ah; 22415 ipproto = IPPROTO_AH; 22416 break; 22417 case DL_CAPAB_IPSEC_ESP: 22418 sa_type = SADB_SATYPE_ESP; 22419 ills = &ipsec_capab_ills_esp; 22420 ipproto = IPPROTO_ESP; 22421 break; 22422 } 22423 22424 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 22425 22426 /* 22427 * Add ill index to list of hardware accelerators. If 22428 * already in list, do nothing. 22429 */ 22430 for (cur_ill = *ills; cur_ill != NULL && 22431 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 22432 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 22433 ; 22434 22435 if (cur_ill == NULL) { 22436 /* if this is a new entry for this ill */ 22437 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 22438 if (new_ill == NULL) { 22439 rw_exit(&ipsec_capab_ills_lock); 22440 return; 22441 } 22442 22443 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 22444 new_ill->ill_isv6 = ill->ill_isv6; 22445 new_ill->next = *ills; 22446 *ills = new_ill; 22447 } else if (!sadb_resync) { 22448 /* not resync'ing SADB and an entry exists for this ill */ 22449 rw_exit(&ipsec_capab_ills_lock); 22450 return; 22451 } 22452 22453 rw_exit(&ipsec_capab_ills_lock); 22454 22455 if (ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 22456 /* 22457 * IPsec module for protocol loaded, initiate dump 22458 * of the SADB to this ill. 22459 */ 22460 sadb_ill_download(ill, sa_type); 22461 } 22462 22463 /* 22464 * Remove an ill from the list of IPsec capable ills. 
22465 */ 22466 static void 22467 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 22468 { 22469 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 22470 22471 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 22472 dl_cap == DL_CAPAB_IPSEC_ESP); 22473 22474 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipsec_capab_ills_ah : 22475 &ipsec_capab_ills_esp; 22476 22477 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 22478 22479 prev_ill = NULL; 22480 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 22481 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 22482 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 22483 ; 22484 if (cur_ill == NULL) { 22485 /* entry not found */ 22486 rw_exit(&ipsec_capab_ills_lock); 22487 return; 22488 } 22489 if (prev_ill == NULL) { 22490 /* entry at front of list */ 22491 *ills = NULL; 22492 } else { 22493 prev_ill->next = cur_ill->next; 22494 } 22495 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 22496 rw_exit(&ipsec_capab_ills_lock); 22497 } 22498 22499 22500 /* 22501 * Handling of DL_CONTROL_REQ messages that must be sent down to 22502 * an ill while having exclusive access. 22503 */ 22504 /* ARGSUSED */ 22505 static void 22506 ill_ipsec_capab_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 22507 { 22508 ill_t *ill = (ill_t *)q->q_ptr; 22509 22510 ill_dlpi_send(ill, mp); 22511 } 22512 22513 22514 /* 22515 * Called by SADB to send a DL_CONTROL_REQ message to every ill 22516 * supporting the specified IPsec protocol acceleration. 22517 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 22518 * We free the mblk and, if sa is non-null, release the held referece. 22519 */ 22520 void 22521 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa) 22522 { 22523 ipsec_capab_ill_t *ici, *cur_ici; 22524 ill_t *ill; 22525 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 22526 22527 ici = (sa_type == SADB_SATYPE_AH) ? 
ipsec_capab_ills_ah : 22528 ipsec_capab_ills_esp; 22529 22530 rw_enter(&ipsec_capab_ills_lock, RW_READER); 22531 22532 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 22533 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 22534 cur_ici->ill_isv6, NULL, NULL, NULL, NULL); 22535 22536 /* 22537 * Handle the case where the ill goes away while the SADB is 22538 * attempting to send messages. If it's going away, it's 22539 * nuking its shadow SADB, so we don't care.. 22540 */ 22541 22542 if (ill == NULL) 22543 continue; 22544 22545 if (sa != NULL) { 22546 /* 22547 * Make sure capabilities match before 22548 * sending SA to ill. 22549 */ 22550 if (!ipsec_capab_match(ill, cur_ici->ill_index, 22551 cur_ici->ill_isv6, sa)) { 22552 ill_refrele(ill); 22553 continue; 22554 } 22555 22556 mutex_enter(&sa->ipsa_lock); 22557 sa->ipsa_flags |= IPSA_F_HW; 22558 mutex_exit(&sa->ipsa_lock); 22559 } 22560 22561 /* 22562 * Copy template message, and add it to the front 22563 * of the mblk ship list. We want to avoid holding 22564 * the ipsec_capab_ills_lock while sending the 22565 * message to the ills. 22566 * 22567 * The b_next and b_prev are temporarily used 22568 * to build a list of mblks to be sent down, and to 22569 * save the ill to which they must be sent. 22570 */ 22571 nmp = copymsg(mp); 22572 if (nmp == NULL) { 22573 ill_refrele(ill); 22574 continue; 22575 } 22576 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 22577 nmp->b_next = mp_ship_list; 22578 mp_ship_list = nmp; 22579 nmp->b_prev = (mblk_t *)ill; 22580 } 22581 22582 rw_exit(&ipsec_capab_ills_lock); 22583 22584 nmp = mp_ship_list; 22585 while (nmp != NULL) { 22586 /* restore the mblk to a sane state */ 22587 next_mp = nmp->b_next; 22588 nmp->b_next = NULL; 22589 ill = (ill_t *)nmp->b_prev; 22590 nmp->b_prev = NULL; 22591 22592 /* 22593 * Ship the mblk to the ill, must be exclusive. Keep the 22594 * reference to the ill as qwriter_ip() does a ill_referele(). 
22595 */ 22596 (void) qwriter_ip(NULL, ill, ill->ill_wq, nmp, 22597 ill_ipsec_capab_send_writer, NEW_OP, B_TRUE); 22598 22599 nmp = next_mp; 22600 } 22601 22602 if (sa != NULL) 22603 IPSA_REFRELE(sa); 22604 freemsg(mp); 22605 } 22606 22607 22608 /* 22609 * Derive an interface id from the link layer address. 22610 * Knows about IEEE 802 and IEEE EUI-64 mappings. 22611 */ 22612 static boolean_t 22613 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 22614 { 22615 char *addr; 22616 22617 if (phys_length != ETHERADDRL) 22618 return (B_FALSE); 22619 22620 /* Form EUI-64 like address */ 22621 addr = (char *)&v6addr->s6_addr32[2]; 22622 bcopy((char *)phys_addr, addr, 3); 22623 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 22624 addr[3] = (char)0xff; 22625 addr[4] = (char)0xfe; 22626 bcopy((char *)phys_addr + 3, addr + 5, 3); 22627 return (B_TRUE); 22628 } 22629 22630 /* ARGSUSED */ 22631 static boolean_t 22632 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 22633 { 22634 return (B_FALSE); 22635 } 22636 22637 /* ARGSUSED */ 22638 static boolean_t 22639 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 22640 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 22641 { 22642 /* 22643 * Multicast address mappings used over Ethernet/802.X. 22644 * This address is used as a base for mappings. 22645 */ 22646 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 22647 0x00, 0x00, 0x00}; 22648 22649 /* 22650 * Extract low order 32 bits from IPv6 multicast address. 22651 * Or that into the link layer address, starting from the 22652 * second byte. 
22653 */ 22654 *hw_start = 2; 22655 v6_extract_mask->s6_addr32[0] = 0; 22656 v6_extract_mask->s6_addr32[1] = 0; 22657 v6_extract_mask->s6_addr32[2] = 0; 22658 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 22659 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 22660 return (B_TRUE); 22661 } 22662 22663 /* 22664 * Indicate by return value whether multicast is supported. If not, 22665 * this code should not touch/change any parameters. 22666 */ 22667 /* ARGSUSED */ 22668 static boolean_t 22669 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 22670 uint32_t *hw_start, ipaddr_t *extract_mask) 22671 { 22672 /* 22673 * Multicast address mappings used over Ethernet/802.X. 22674 * This address is used as a base for mappings. 22675 */ 22676 static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, 22677 0x00, 0x00, 0x00 }; 22678 22679 if (phys_length != ETHERADDRL) 22680 return (B_FALSE); 22681 22682 *extract_mask = htonl(0x007fffff); 22683 *hw_start = 2; 22684 bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); 22685 return (B_TRUE); 22686 } 22687 22688 /* 22689 * Derive IPoIB interface id from the link layer address. 22690 */ 22691 static boolean_t 22692 ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 22693 { 22694 char *addr; 22695 22696 if (phys_length != 20) 22697 return (B_FALSE); 22698 addr = (char *)&v6addr->s6_addr32[2]; 22699 bcopy(phys_addr + 12, addr, 8); 22700 /* 22701 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 22702 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 22703 * rules. In these cases, the IBA considers these GUIDs to be in 22704 * "Modified EUI-64" format, and thus toggling the u/l bit is not 22705 * required; vendors are required not to assign global EUI-64's 22706 * that differ only in u/l bit values, thus guaranteeing uniqueness 22707 * of the interface identifier. 
Whether the GUID is in modified 22708 * or proper EUI-64 format, the ipv6 identifier must have the u/l 22709 * bit set to 1. 22710 */ 22711 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 22712 return (B_TRUE); 22713 } 22714 22715 /* 22716 * Note on mapping from multicast IP addresses to IPoIB multicast link 22717 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 22718 * The format of an IPoIB multicast address is: 22719 * 22720 * 4 byte QPN Scope Sign. Pkey 22721 * +--------------------------------------------+ 22722 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 22723 * +--------------------------------------------+ 22724 * 22725 * The Scope and Pkey components are properties of the IBA port and 22726 * network interface. They can be ascertained from the broadcast address. 22727 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 22728 */ 22729 22730 static boolean_t 22731 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 22732 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 22733 { 22734 /* 22735 * Base IPoIB IPv6 multicast address used for mappings. 22736 * Does not contain the IBA scope/Pkey values. 22737 */ 22738 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 22739 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 22740 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 22741 22742 /* 22743 * Extract low order 80 bits from IPv6 multicast address. 22744 * Or that into the link layer address, starting from the 22745 * sixth byte. 22746 */ 22747 *hw_start = 6; 22748 bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); 22749 22750 /* 22751 * Now fill in the IBA scope/Pkey values from the broadcast address. 
22752 */ 22753 *(maddr + 5) = *(bphys_addr + 5); 22754 *(maddr + 8) = *(bphys_addr + 8); 22755 *(maddr + 9) = *(bphys_addr + 9); 22756 22757 v6_extract_mask->s6_addr32[0] = 0; 22758 v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); 22759 v6_extract_mask->s6_addr32[2] = 0xffffffffU; 22760 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 22761 return (B_TRUE); 22762 } 22763 22764 static boolean_t 22765 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 22766 uint32_t *hw_start, ipaddr_t *extract_mask) 22767 { 22768 /* 22769 * Base IPoIB IPv4 multicast address used for mappings. 22770 * Does not contain the IBA scope/Pkey values. 22771 */ 22772 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 22773 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 22774 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 22775 22776 if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) 22777 return (B_FALSE); 22778 22779 /* 22780 * Extract low order 28 bits from IPv4 multicast address. 22781 * Or that into the link layer address, starting from the 22782 * sixteenth byte. 22783 */ 22784 *extract_mask = htonl(0x0fffffff); 22785 *hw_start = 16; 22786 bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); 22787 22788 /* 22789 * Now fill in the IBA scope/Pkey values from the broadcast address. 22790 */ 22791 *(maddr + 5) = *(bphys_addr + 5); 22792 *(maddr + 8) = *(bphys_addr + 8); 22793 *(maddr + 9) = *(bphys_addr + 9); 22794 return (B_TRUE); 22795 } 22796 22797 /* 22798 * Returns B_TRUE if an ipif is present in the given zone, matching some flags 22799 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. 22800 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with 22801 * the link-local address is preferred. 
22802 */ 22803 boolean_t 22804 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 22805 { 22806 ipif_t *ipif; 22807 ipif_t *maybe_ipif = NULL; 22808 22809 mutex_enter(&ill->ill_lock); 22810 if (ill->ill_state_flags & ILL_CONDEMNED) { 22811 mutex_exit(&ill->ill_lock); 22812 if (ipifp != NULL) 22813 *ipifp = NULL; 22814 return (B_FALSE); 22815 } 22816 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 22817 if (!IPIF_CAN_LOOKUP(ipif)) 22818 continue; 22819 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid) 22820 continue; 22821 if ((ipif->ipif_flags & flags) != flags) 22822 continue; 22823 22824 if (ipifp == NULL) { 22825 mutex_exit(&ill->ill_lock); 22826 ASSERT(maybe_ipif == NULL); 22827 return (B_TRUE); 22828 } 22829 if (!ill->ill_isv6 || 22830 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { 22831 ipif_refhold_locked(ipif); 22832 mutex_exit(&ill->ill_lock); 22833 *ipifp = ipif; 22834 return (B_TRUE); 22835 } 22836 if (maybe_ipif == NULL) 22837 maybe_ipif = ipif; 22838 } 22839 if (ipifp != NULL) { 22840 if (maybe_ipif != NULL) 22841 ipif_refhold_locked(maybe_ipif); 22842 *ipifp = maybe_ipif; 22843 } 22844 mutex_exit(&ill->ill_lock); 22845 return (maybe_ipif != NULL); 22846 } 22847 22848 /* 22849 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. 22850 */ 22851 boolean_t 22852 ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 22853 { 22854 ill_t *illg; 22855 22856 /* 22857 * We look at the passed-in ill first without grabbing ill_g_lock. 22858 */ 22859 if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { 22860 return (B_TRUE); 22861 } 22862 rw_enter(&ill_g_lock, RW_READER); 22863 if (ill->ill_group == NULL) { 22864 /* ill not in a group */ 22865 rw_exit(&ill_g_lock); 22866 return (B_FALSE); 22867 } 22868 22869 /* 22870 * There's no ipif in the zone on ill, however ill is part of an IPMP 22871 * group. 
We need to look for an ipif in the zone on all the ills in the 22872 * group. 22873 */ 22874 illg = ill->ill_group->illgrp_ill; 22875 do { 22876 /* 22877 * We don't call ipif_lookup_zoneid() on ill as we already know 22878 * that it's not there. 22879 */ 22880 if (illg != ill && 22881 ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { 22882 break; 22883 } 22884 } while ((illg = illg->ill_group_next) != NULL); 22885 rw_exit(&ill_g_lock); 22886 return (illg != NULL); 22887 } 22888 22889 /* 22890 * Check if this ill is only being used to send ICMP probes for IPMP 22891 */ 22892 boolean_t 22893 ill_is_probeonly(ill_t *ill) 22894 { 22895 /* 22896 * Check if the interface is FAILED, or INACTIVE 22897 */ 22898 if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) 22899 return (B_TRUE); 22900 22901 return (B_FALSE); 22902 } 22903