1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * This file contains the interface control functions for IP. 
31 */ 32 33 #include <sys/types.h> 34 #include <sys/stream.h> 35 #include <sys/dlpi.h> 36 #include <sys/stropts.h> 37 #include <sys/strsun.h> 38 #include <sys/sysmacros.h> 39 #include <sys/strlog.h> 40 #include <sys/ddi.h> 41 #include <sys/sunddi.h> 42 #include <sys/cmn_err.h> 43 #include <sys/kstat.h> 44 #include <sys/debug.h> 45 #include <sys/zone.h> 46 47 #include <sys/kmem.h> 48 #include <sys/systm.h> 49 #include <sys/param.h> 50 #include <sys/socket.h> 51 #include <sys/isa_defs.h> 52 #include <net/if.h> 53 #include <net/if_arp.h> 54 #include <net/if_types.h> 55 #include <net/if_dl.h> 56 #include <net/route.h> 57 #include <sys/sockio.h> 58 #include <netinet/in.h> 59 #include <netinet/ip6.h> 60 #include <netinet/icmp6.h> 61 #include <netinet/igmp_var.h> 62 #include <sys/strsun.h> 63 #include <sys/policy.h> 64 #include <sys/ethernet.h> 65 66 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 67 #include <inet/mi.h> 68 #include <inet/nd.h> 69 #include <inet/arp.h> 70 #include <inet/mib2.h> 71 #include <inet/ip.h> 72 #include <inet/ip6.h> 73 #include <inet/ip6_asp.h> 74 #include <inet/tcp.h> 75 #include <inet/ip_multi.h> 76 #include <inet/ip_ire.h> 77 #include <inet/ip_ftable.h> 78 #include <inet/ip_rts.h> 79 #include <inet/ip_ndp.h> 80 #include <inet/ip_if.h> 81 #include <inet/ip_impl.h> 82 #include <inet/tun.h> 83 #include <inet/sctp_ip.h> 84 #include <inet/ip_netinfo.h> 85 #include <inet/mib2.h> 86 87 #include <net/pfkeyv2.h> 88 #include <inet/ipsec_info.h> 89 #include <inet/sadb.h> 90 #include <inet/ipsec_impl.h> 91 #include <sys/iphada.h> 92 93 94 #include <netinet/igmp.h> 95 #include <inet/ip_listutils.h> 96 #include <inet/ipclassifier.h> 97 #include <sys/mac.h> 98 99 #include <sys/systeminfo.h> 100 #include <sys/bootconf.h> 101 102 #include <sys/tsol/tndb.h> 103 #include <sys/tsol/tnet.h> 104 105 /* The character which tells where the ill_name ends */ 106 #define IPIF_SEPARATOR_CHAR ':' 107 108 /* IP ioctl function table entry */ 109 
typedef struct ipft_s {
	int	ipft_cmd;	/* command code; see ip_ioctl_ftbl entries */
	pfi_t	ipft_pfi;	/* function registered for ipft_cmd */
	int	ipft_min_size;	/* presumably min. request size - confirm */
	int	ipft_flags;	/* zero or more IPFT_F_* bits */
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

/*
 * Storage big enough for any one of the three ARP entry commands, followed
 * by a queue pointer.  The ARP area templates in this file use
 * sizeof (ip_sock_ar_t) as the offset of the first address that follows
 * the command header.
 * NOTE(review): ip_sock_ar_q is presumably the queue the request arrived
 * on; confirm against the code that fills it in.
 */
typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;

/* ndd get/set handlers (per-ill forwarding). */
static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

/* Address helpers, ioctl "tail" routines and ipsq maintenance. */
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
		    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
		    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
		    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
		    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
		    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
		    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp,
		    sin_t *sin, boolean_t x_arp_ioctl, boolean_t if_arp_ioctl);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);
static void	ipsq_clean_all(ill_t *ill);
static void	ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring);
static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
		    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

/* Logical interface (ipif) routines. */
static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
		    boolean_t initialize);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_multicast_down(ipif_t *ipif);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
		    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
		    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
		    boolean_t do_alloc, boolean_t *exists, boolean_t isv6,
		    zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
		    int *error);
static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);

/* Lower-level interface (ill) routines. */
static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg);
static void	ill_down_tail(ill_t *ill);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static boolean_t ill_has_usable_ipif(ill_t *);
static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
static void	ill_phyint_free(ill_t *ill);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
static void	ill_stq_cache_delete(ire_t *, char *);

/*
 * Per-media helpers; these are the functions stored in ip_m_tbl.
 */
static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);
static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);

static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *);
static void	ip_cgtp_bcast_delete(ire_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_proto(ill_t *, int, mblk_t *);
static void	ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
    boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_reset(ill_t *, mblk_t **);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_ipsec_reset(ill_t *, mblk_t **);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_hcksum_reset(ill_t *, mblk_t **);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset(ill_t *, mblk_t **);
static void	ill_capability_lso_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_lso_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
static void	ill_capability_dls_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_disable(ill_t *);

static void	illgrp_cache_delete(ire_t *, char *);
static void	illgrp_delete(ill_t *ill);
static void	illgrp_reset_schednext(ill_t *ill);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);

static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * max # of IPsec algorithms supported.  Limited to 1 byte by PF_KEY
 * and the IPsec DOI
 */
#define	MAX_IPSEC_ALGS	256

#define	BITSPERBYTE	8
/* Number of bits in an object of the given type. */
#define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))

/*
 * Set bit number algid in algs, a bitmap stored as an array of
 * ipsec_capab_elem_t.
 */
#define	IPSEC_ALG_ENABLE(algs, algid) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

/*
 * Non-zero when bit number algid is set in algs.  Note the argument order
 * is (algid, algs), the reverse of IPSEC_ALG_ENABLE.
 */
#define	IPSEC_ALG_IS_ENABLED(algid, algs) \
		((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
		(1 << ((algid) % BITS(ipsec_capab_elem_t))))

/* Element type of the algorithm bitmaps used by the two macros above. */
typedef uint8_t ipsec_capab_elem_t;

/*
 * Per-algorithm parameters.  Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t	minkeylen;
	uint16_t	maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;	/* encryption algorithm bitmap */
	ipsec_capab_elem_t *auth_hw_algs;	/* authentication algorithm bitmap */
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};

/*
 * List of AH and ESP IPsec acceleration capable ills
 */
typedef struct ipsec_capab_ill_s {
	uint_t ill_index;
	boolean_t ill_isv6;
	struct ipsec_capab_ill_s *next;
} ipsec_capab_ill_t;

static ipsec_capab_ill_t *ipsec_capab_ills_ah;
static ipsec_capab_ill_t *ipsec_capab_ills_esp;
/* NOTE(review): presumably guards the two lists above; confirm at users. */
krwlock_t ipsec_capab_ills_lock;

/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
*/
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),			/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),			/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

/*
 * AR_ENTRY_DELETE templates.
 * NOTE(review): the initializers are presumably, in order: command, name
 * offset, name length (holding the template length, as in the area
 * templates), protocol, protocol address offset, protocol address length.
 * Confirm against the declaration of ared_t.
 */
static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IP_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IP_ADDR_LEN
};

static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IPV6_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IPV6_ADDR_LEN
};

/*
 * A template for an IPv6 AR_ENTRY_QUERY has not been created, as the
 * areq doesn't include an IP address in ill_dl_up() (the only place an
 * areq is used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,		/* protocol, from arps perspective */
	sizeof (areq_t),			/* target addr offset */
	IP_ADDR_LEN,			/* target addr_length */
	0,				/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,			/* sender addr length */
	AR_EQ_DEFAULT_XMIT_COUNT,	/* xmit_count */
	AR_EQ_DEFAULT_XMIT_INTERVAL,	/* (re)xmit_interval in milliseconds */
	AR_EQ_DEFAULT_MAX_BUFFERED	/* max # of requests to buffer */
	/* anything else filled in by the code */
};

/*
 * Interface control templates (up / down / on / off).  They differ only in
 * the command; the name fields are handled by ill_arp_alloc.
 */
static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};


static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};

/*
 * IP ioctl dispatch table: { command, handler, size, IPFT_F_* flags }.
 * The list ends with an all-zero entry.
 */
static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/*
 * Flag descriptors for ip_ipif_report.  The table mixes IPIF_*, ILLF_* and
 * PHYI_* flag values, each paired with the name to print.
 */
static nv_t	ipif_nv_tbl[] = {
	{ IPIF_UP,		"UP" },
	{ IPIF_BROADCAST,	"BROADCAST" },
	{ ILLF_DEBUG,		"DEBUG" },
	{ PHYI_LOOPBACK,	"LOOPBACK" },
	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
	{ PHYI_RUNNING,		"RUNNING" },
	{ ILLF_NOARP,		"NOARP" },
	{ PHYI_PROMISC,		"PROMISC" },
	{ PHYI_ALLMULTI,	"ALLMULTI" },
	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
	{ ILLF_MULTICAST,	"MULTICAST" },
	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
	{ IPIF_DHCPRUNNING,	"DHCP" },
	{ IPIF_PRIVATE,		"PRIVATE" },
	{ IPIF_NOXMIT,		"NOXMIT" },
	{ IPIF_NOLOCAL,		"NOLOCAL" },
	{ IPIF_DEPRECATED,	"DEPRECATED" },
	{ IPIF_PREFERRED,	"PREFERRED" },
	{ IPIF_TEMPORARY,	"TEMPORARY" },
	{ IPIF_ADDRCONF,	"ADDRCONF" },
	{ PHYI_VIRTUAL,		"VIRTUAL" },
	{ ILLF_ROUTER,		"ROUTER" },
	{ ILLF_NONUD,		"NONUD" },
	{ IPIF_ANYCAST,		"ANYCAST" },
	{ ILLF_NORTEXCH,	"NORTEXCH" },
	{ ILLF_IPV4,		"IPV4" },
	{ ILLF_IPV6,		"IPV6" },
	{ IPIF_MIPRUNNING,	"MIP" },
	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
	{ PHYI_FAILED,		"FAILED" },
	{ PHYI_STANDBY,		"STANDBY" },
	{ PHYI_INACTIVE,	"INACTIVE" },
	{ PHYI_OFFLINE,		"OFFLINE" },
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

/*
 * Per-media table: DLPI mac type, interface type, then the IPv4 mapinfo,
 * IPv6 mapinfo and IPv6 interface-id functions for that medium.  The
 * SUNW_DL_VNI entry supplies no functions.
 */
static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
	    ip_ib_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL},
	{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char	*ipv4_forward_suffix = ":ip_forwarding";
static char	*ipv6_forward_suffix = ":ip6_forwarding";
static kstat_t	*loopback_ksp = NULL;
static	sin6_t	sin6_null;	/* Zero address for quick clears */
static	sin_t	sin_null;	/* Zero address for quick clears */
static	uint_t	ill_index = 1;	/* Used to assign interface indices */
/* When set search for unused index */
static boolean_t ill_index_wrap = B_FALSE;
/*
 * When set search for unused ipif_seqid
 * NOTE(review): the sentence above does not describe ipif_zero; it looks
 * like a leftover from a variable that is no longer declared here.
 */
static ipif_t	ipif_zero;
/* Generator state; read and advanced by ipif_rand(). */
uint_t	ipif_src_random;

/*
 * For details on the protection offered by these locks please refer
 * to the notes under the Synchronization section at the start of ip.c
 */
krwlock_t	ill_g_lock;		/* The global ill_g_lock */
kmutex_t	ip_addr_avail_lock;	/* Address availability check lock */
ipsq_t		*ipsq_g_head;		/* List of all ipsq's on the system */

krwlock_t	ill_g_usesrc_lock;	/* Protects usesrc related fields */

/*
 * illgrp_head/ifgrp_head is protected by IP's perimeter.
544 */ 545 static ill_group_t *illgrp_head_v4; /* Head of IPv4 ill groups */ 546 ill_group_t *illgrp_head_v6; /* Head of IPv6 ill groups */ 547 548 ill_g_head_t ill_g_heads[MAX_G_HEADS]; /* ILL List Head */ 549 550 /* 551 * ppa arena is created after these many 552 * interfaces have been plumbed. 553 */ 554 uint_t ill_no_arena = 12; 555 556 #pragma align CACHE_ALIGN_SIZE(phyint_g_list) 557 static phyint_list_t phyint_g_list; /* start of phyint list */ 558 559 /* 560 * Reflects value of FAILBACK variable in IPMP config file 561 * /etc/default/mpathd. Default value is B_TRUE. 562 * Set to B_FALSE if user disabled failback by configuring "FAILBACK=no" 563 * in.mpathd uses SIOCSIPMPFAILBACK ioctl to pass this information to kernel. 564 */ 565 static boolean_t ipmp_enable_failback = B_TRUE; 566 567 /* 568 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout 569 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is 570 * set through platform specific code (Niagara/Ontario). 571 */ 572 #define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ 573 (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) 574 575 #define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) 576 577 static uint_t 578 ipif_rand(void) 579 { 580 ipif_src_random = ipif_src_random * 1103515245 + 12345; 581 return ((ipif_src_random >> 16) & 0x7fff); 582 } 583 584 /* 585 * Allocate per-interface mibs. 586 * Returns true if ok. False otherwise. 587 * ipsq may not yet be allocated (loopback case ). 588 */ 589 static boolean_t 590 ill_allocate_mibs(ill_t *ill) 591 { 592 /* Already allocated? 
*/ 593 if (ill->ill_ip_mib != NULL) { 594 if (ill->ill_isv6) 595 ASSERT(ill->ill_icmp6_mib != NULL); 596 return (B_TRUE); 597 } 598 599 ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), 600 KM_NOSLEEP); 601 if (ill->ill_ip_mib == NULL) { 602 return (B_FALSE); 603 } 604 605 /* Setup static information */ 606 SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, 607 sizeof (mib2_ipIfStatsEntry_t)); 608 if (ill->ill_isv6) { 609 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 610 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 611 sizeof (mib2_ipv6AddrEntry_t)); 612 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 613 sizeof (mib2_ipv6RouteEntry_t)); 614 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 615 sizeof (mib2_ipv6NetToMediaEntry_t)); 616 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 617 sizeof (ipv6_member_t)); 618 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 619 sizeof (ipv6_grpsrc_t)); 620 } else { 621 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 622 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 623 sizeof (mib2_ipAddrEntry_t)); 624 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 625 sizeof (mib2_ipRouteEntry_t)); 626 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 627 sizeof (mib2_ipNetToMediaEntry_t)); 628 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 629 sizeof (ip_member_t)); 630 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 631 sizeof (ip_grpsrc_t)); 632 633 /* 634 * For a v4 ill, we are done at this point, because per ill 635 * icmp mibs are only used for v6. 
636 */ 637 return (B_TRUE); 638 } 639 640 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 641 KM_NOSLEEP); 642 if (ill->ill_icmp6_mib == NULL) { 643 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 644 ill->ill_ip_mib = NULL; 645 return (B_FALSE); 646 } 647 /* static icmp info */ 648 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = 649 sizeof (mib2_ipv6IfIcmpEntry_t); 650 /* 651 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later 652 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 653 * -> ill_phyint_reinit 654 */ 655 return (B_TRUE); 656 } 657 658 /* 659 * Common code for preparation of ARP commands. Two points to remember: 660 * 1) The ill_name is tacked on at the end of the allocated space so 661 * the templates name_offset field must contain the total space 662 * to allocate less the name length. 663 * 664 * 2) The templates name_length field should contain the *template* 665 * length. We use it as a parameter to bcopy() and then write 666 * the real ill_name_length into the name_length field of the copy. 667 * (Always called as writer.) 
668 */ 669 mblk_t * 670 ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) 671 { 672 arc_t *arc = (arc_t *)template; 673 char *cp; 674 int len; 675 mblk_t *mp; 676 uint_t name_length = ill->ill_name_length; 677 uint_t template_len = arc->arc_name_length; 678 679 len = arc->arc_name_offset + name_length; 680 mp = allocb(len, BPRI_HI); 681 if (mp == NULL) 682 return (NULL); 683 cp = (char *)mp->b_rptr; 684 mp->b_wptr = (uchar_t *)&cp[len]; 685 if (template_len) 686 bcopy(template, cp, template_len); 687 if (len > template_len) 688 bzero(&cp[template_len], len - template_len); 689 mp->b_datap->db_type = M_PROTO; 690 691 arc = (arc_t *)cp; 692 arc->arc_name_length = name_length; 693 cp = (char *)arc + arc->arc_name_offset; 694 bcopy(ill->ill_name, cp, name_length); 695 696 if (addr) { 697 area_t *area = (area_t *)mp->b_rptr; 698 699 cp = (char *)area + area->area_proto_addr_offset; 700 bcopy(addr, cp, area->area_proto_addr_length); 701 if (area->area_cmd == AR_ENTRY_ADD) { 702 cp = (char *)area; 703 len = area->area_proto_addr_length; 704 if (area->area_proto_mask_offset) 705 cp += area->area_proto_mask_offset; 706 else 707 cp += area->area_proto_addr_offset + len; 708 while (len-- > 0) 709 *cp++ = (char)~0; 710 } 711 } 712 return (mp); 713 } 714 715 mblk_t * 716 ipif_area_alloc(ipif_t *ipif) 717 { 718 return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template, 719 (char *)&ipif->ipif_lcl_addr)); 720 } 721 722 mblk_t * 723 ipif_ared_alloc(ipif_t *ipif) 724 { 725 return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template, 726 (char *)&ipif->ipif_lcl_addr)); 727 } 728 729 mblk_t * 730 ill_ared_alloc(ill_t *ill, ipaddr_t addr) 731 { 732 return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 733 (char *)&addr)); 734 } 735 736 /* 737 * Completely vaporize a lower level tap and all associated interfaces. 738 * ill_delete is called only out of ip_close when the device control 739 * stream is being closed. 
740 */ 741 void 742 ill_delete(ill_t *ill) 743 { 744 ipif_t *ipif; 745 ill_t *prev_ill; 746 747 /* 748 * ill_delete may be forcibly entering the ipsq. The previous 749 * ioctl may not have completed and may need to be aborted. 750 * ipsq_flush takes care of it. If we don't need to enter the 751 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 752 * ill_delete_tail is sufficient. 753 */ 754 ipsq_flush(ill); 755 756 /* 757 * Nuke all interfaces. ipif_free will take down the interface, 758 * remove it from the list, and free the data structure. 759 * Walk down the ipif list and remove the logical interfaces 760 * first before removing the main ipif. We can't unplumb 761 * zeroth interface first in the case of IPv6 as reset_conn_ill 762 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking 763 * POINTOPOINT. 764 * 765 * If ill_ipif was not properly initialized (i.e low on memory), 766 * then no interfaces to clean up. In this case just clean up the 767 * ill. 768 */ 769 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 770 ipif_free(ipif); 771 772 /* 773 * Used only by ill_arp_on and ill_arp_off, which are writers. 774 * So nobody can be using this mp now. Free the mp allocated for 775 * honoring ILLF_NOARP 776 */ 777 freemsg(ill->ill_arp_on_mp); 778 ill->ill_arp_on_mp = NULL; 779 780 /* Clean up msgs on pending upcalls for mrouted */ 781 reset_mrt_ill(ill); 782 783 /* 784 * ipif_free -> reset_conn_ipif will remove all multicast 785 * references for IPv4. For IPv6, we need to do it here as 786 * it points only at ills. 787 */ 788 reset_conn_ill(ill); 789 790 /* 791 * ill_down will arrange to blow off any IRE's dependent on this 792 * ILL, and shut down fragmentation reassembly. 793 */ 794 ill_down(ill); 795 796 /* Let SCTP know, so that it can remove this from its list. 
*/ 797 sctp_update_ill(ill, SCTP_ILL_REMOVE); 798 799 /* 800 * If an address on this ILL is being used as a source address then 801 * clear out the pointers in other ILLs that point to this ILL. 802 */ 803 rw_enter(&ill_g_usesrc_lock, RW_WRITER); 804 if (ill->ill_usesrc_grp_next != NULL) { 805 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ 806 ill_disband_usesrc_group(ill); 807 } else { /* consumer of the usesrc ILL */ 808 prev_ill = ill_prev_usesrc(ill); 809 prev_ill->ill_usesrc_grp_next = 810 ill->ill_usesrc_grp_next; 811 } 812 } 813 rw_exit(&ill_g_usesrc_lock); 814 } 815 816 static void 817 ipif_non_duplicate(ipif_t *ipif) 818 { 819 ill_t *ill = ipif->ipif_ill; 820 mutex_enter(&ill->ill_lock); 821 if (ipif->ipif_flags & IPIF_DUPLICATE) { 822 ipif->ipif_flags &= ~IPIF_DUPLICATE; 823 ASSERT(ill->ill_ipif_dup_count > 0); 824 ill->ill_ipif_dup_count--; 825 } 826 mutex_exit(&ill->ill_lock); 827 } 828 829 /* 830 * Send all deferred messages without waiting for their ACKs. 831 */ 832 void 833 ill_send_all_deferred_mp(ill_t *ill) 834 { 835 mblk_t *mp, *next; 836 837 /* 838 * Clear ill_dlpi_pending so that the message is not queued in 839 * ill_dlpi_send(). 840 */ 841 ill->ill_dlpi_pending = DL_PRIM_INVAL; 842 843 for (mp = ill->ill_dlpi_deferred; mp != NULL; mp = next) { 844 next = mp->b_next; 845 mp->b_next = NULL; 846 ill_dlpi_send(ill, mp); 847 } 848 ill->ill_dlpi_deferred = NULL; 849 } 850 851 /* 852 * ill_delete_tail is called from ip_modclose after all references 853 * to the closing ill are gone. 
The wait is done in ip_modclose
 */
/*
 * Order of teardown, as performed below:
 *  - finish bringing down every ipif on the ill;
 *  - wait until ILL_DL_UNBIND_IN_PROGRESS is clear;
 *  - disable the poll/soft-ring (DLS) capabilities if either is set;
 *  - send the saved detach message, if any;
 *  - qprocsoff() the read queue (skipped for IRE_LOOPBACK);
 *  - flush the ipsq once more;
 *  - free all per-ill capability state;
 *  - free every remaining ipif, then run ill_down_tail();
 *  - remove the ill from the global list and unload its ndd name;
 *  - free fragment hash state, ill_nd_lla_mp and the retained control
 *    messages, then fold the per-ill MIBs into the global ones.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0 &&
	    ill->ill_arp_down_mp == NULL &&
	    ill->ill_arp_del_mapping_mp == NULL);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);

	/*
	 * Clean up polling and soft ring capabilities
	 */
	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
		ill_capability_dls_disable(ill);

	/*
	 * Send the detach if there's one to send (i.e., if we're above a
	 * style 2 DLPI driver).
	 */
	if (ill->ill_detach_mp != NULL) {
		ill_dlpi_send(ill, ill->ill_detach_mp);
		ill->ill_detach_mp = NULL;
	}

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.  Each pointer is cleared after its storage
	 * is released.
	 */
	if (ill->ill_ipsec_capab_ah != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
		ill->ill_ipsec_capab_ah = NULL;
	}

	if (ill->ill_ipsec_capab_esp != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
		ill->ill_ipsec_capab_esp = NULL;
	}

	if (ill->ill_mdt_capab != NULL) {
		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
		ill->ill_mdt_capab = NULL;
	}

	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dls_capab != NULL) {
		/* Drop the reference held on the unbind conn first. */
		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
		ill->ill_dls_capab->ill_unbind_conn = NULL;
		/* The freed size covers the struct plus ILL_MAX_RINGS rings. */
		kmem_free(ill->ill_dls_capab,
		    sizeof (ill_dls_capab_t) +
		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
		ill->ill_dls_capab = NULL;
	}

	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

	/* ipif_free_tail() is called on the list head until it is empty. */
	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	ill_down_tail(ill);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down ->ill_downi walks all the ires and cleans up
	 *    ill references.
	 */
	ASSERT(ilm_walk_ill(ill) == 0);
	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ip_g_nd_lock);


	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		/* Destroy every bucket lock before freeing the table. */
		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/*
	 * Free all retained control messages.  mpp steps through the
	 * mblk_t pointer slots from ill_first_mp_to_free up to and
	 * including ill_last_mp_to_free.  Each slot heads a b_next chain;
	 * b_next/b_prev are cleared on every mblk of a message's b_cont
	 * chain before the message is handed to freemsg().
	 */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);
	ILL_TRACE_CLEANUP(ill);
}

/*
 * Add this ill's per-interface IP and ICMPv6 MIB counters to the global
 * MIBs, then free the per-ill MIB structures and clear the pointers.
 * Either pointer may already be NULL, in which case it is skipped.
 */
static void
ill_free_mib(ill_t *ill)
{
	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		/* The IP counters go to ip6_mib or ip_mib per ill_isv6. */
		if (ill->ill_isv6)
			ip_mib2_add_ip_stats(&ip6_mib, ill->ill_ip_mib);
		else
			ip_mib2_add_ip_stats(&ip_mib, ill->ill_ip_mib);

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&icmp6_mib, ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
1044 * 1045 * Sap_lengths are interpreted as follows: 1046 * sap_length == 0 ==> no sap 1047 * sap_length > 0 ==> sap is at the head of the dlpi address 1048 * sap_length < 0 ==> sap is at the tail of the dlpi address 1049 */ 1050 static void 1051 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 1052 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 1053 { 1054 uint16_t sap_addr = (uint16_t)sap_src; 1055 1056 if (sap_length == 0) { 1057 if (phys_src == NULL) 1058 bzero(dst, phys_length); 1059 else 1060 bcopy(phys_src, dst, phys_length); 1061 } else if (sap_length < 0) { 1062 if (phys_src == NULL) 1063 bzero(dst, phys_length); 1064 else 1065 bcopy(phys_src, dst, phys_length); 1066 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 1067 } else { 1068 bcopy(&sap_addr, dst, sizeof (sap_addr)); 1069 if (phys_src == NULL) 1070 bzero((char *)dst + sap_length, phys_length); 1071 else 1072 bcopy(phys_src, (char *)dst + sap_length, phys_length); 1073 } 1074 } 1075 1076 /* 1077 * Generate a dl_unitdata_req mblk for the device and address given. 1078 * addr_length is the length of the physical portion of the address. 1079 * If addr is NULL include an all zero address of the specified length. 1080 * TRUE? In any case, addr_length is taken to be the entire length of the 1081 * dlpi address, including the absolute value of sap_length. 
1082 */ 1083 mblk_t * 1084 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 1085 t_scalar_t sap_length) 1086 { 1087 dl_unitdata_req_t *dlur; 1088 mblk_t *mp; 1089 t_scalar_t abs_sap_length; /* absolute value */ 1090 1091 abs_sap_length = ABS(sap_length); 1092 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 1093 DL_UNITDATA_REQ); 1094 if (mp == NULL) 1095 return (NULL); 1096 dlur = (dl_unitdata_req_t *)mp->b_rptr; 1097 /* HACK: accomodate incompatible DLPI drivers */ 1098 if (addr_length == 8) 1099 addr_length = 6; 1100 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 1101 dlur->dl_dest_addr_offset = sizeof (*dlur); 1102 dlur->dl_priority.dl_min = 0; 1103 dlur->dl_priority.dl_max = 0; 1104 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 1105 (uchar_t *)&dlur[1]); 1106 return (mp); 1107 } 1108 1109 /* 1110 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp 1111 * Return an error if we already have 1 or more ioctls in progress. 1112 * This is used only for non-exclusive ioctls. Currently this is used 1113 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive 1114 * and thus need to use ipsq_pending_mp_add. 1115 */ 1116 boolean_t 1117 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) 1118 { 1119 ASSERT(MUTEX_HELD(&ill->ill_lock)); 1120 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1121 /* 1122 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls. 1123 */ 1124 ASSERT((add_mp->b_datap->db_type == M_IOCDATA) || 1125 (add_mp->b_datap->db_type == M_IOCTL)); 1126 1127 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1128 /* 1129 * Return error if the conn has started closing. The conn 1130 * could have finished cleaning up the pending mp list, 1131 * If so we should not add another mp to the list negating 1132 * the cleanup. 
1133 */ 1134 if (connp->conn_state_flags & CONN_CLOSING) 1135 return (B_FALSE); 1136 /* 1137 * Add the pending mp to the head of the list, chained by b_next. 1138 * Note down the conn on which the ioctl request came, in b_prev. 1139 * This will be used to later get the conn, when we get a response 1140 * on the ill queue, from some other module (typically arp) 1141 */ 1142 add_mp->b_next = (void *)ill->ill_pending_mp; 1143 add_mp->b_queue = CONNP_TO_WQ(connp); 1144 ill->ill_pending_mp = add_mp; 1145 if (connp != NULL) 1146 connp->conn_oper_pending_ill = ill; 1147 return (B_TRUE); 1148 } 1149 1150 /* 1151 * Retrieve the ill_pending_mp and return it. We have to walk the list 1152 * of mblks starting at ill_pending_mp, and match based on the ioc_id. 1153 */ 1154 mblk_t * 1155 ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) 1156 { 1157 mblk_t *prev = NULL; 1158 mblk_t *curr = NULL; 1159 uint_t id; 1160 conn_t *connp; 1161 1162 /* 1163 * When the conn closes, conn_ioctl_cleanup needs to clean 1164 * up the pending mp, but it does not know the ioc_id and 1165 * passes in a zero for it. 1166 */ 1167 mutex_enter(&ill->ill_lock); 1168 if (ioc_id != 0) 1169 *connpp = NULL; 1170 1171 /* Search the list for the appropriate ioctl based on ioc_id */ 1172 for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; 1173 prev = curr, curr = curr->b_next) { 1174 id = ((struct iocblk *)curr->b_rptr)->ioc_id; 1175 connp = Q_TO_CONN(curr->b_queue); 1176 /* Match based on the ioc_id or based on the conn */ 1177 if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) 1178 break; 1179 } 1180 1181 if (curr != NULL) { 1182 /* Unlink the mblk from the pending mp list */ 1183 if (prev != NULL) { 1184 prev->b_next = curr->b_next; 1185 } else { 1186 ASSERT(ill->ill_pending_mp == curr); 1187 ill->ill_pending_mp = curr->b_next; 1188 } 1189 1190 /* 1191 * conn refcnt must have been bumped up at the start of 1192 * the ioctl. So we can safely access the conn. 
1193 */ 1194 ASSERT(CONN_Q(curr->b_queue)); 1195 *connpp = Q_TO_CONN(curr->b_queue); 1196 curr->b_next = NULL; 1197 curr->b_queue = NULL; 1198 } 1199 1200 mutex_exit(&ill->ill_lock); 1201 1202 return (curr); 1203 } 1204 1205 /* 1206 * Add the pending mp to the list. There can be only 1 pending mp 1207 * in the list. Any exclusive ioctl that needs to wait for a response 1208 * from another module or driver needs to use this function to set 1209 * the ipsq_pending_mp to the ioctl mblk and wait for the response from 1210 * the other module/driver. This is also used while waiting for the 1211 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 1212 */ 1213 boolean_t 1214 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 1215 int waitfor) 1216 { 1217 ipsq_t *ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 1218 1219 ASSERT(IAM_WRITER_IPIF(ipif)); 1220 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 1221 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1222 ASSERT(ipsq->ipsq_pending_mp == NULL); 1223 /* 1224 * The caller may be using a different ipif than the one passed into 1225 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 1226 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT 1227 * that `ipsq_current_ipif == ipif'. 1228 */ 1229 ASSERT(ipsq->ipsq_current_ipif != NULL); 1230 1231 /* 1232 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, 1233 * M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver. 1234 */ 1235 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) || 1236 (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) || 1237 (DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO)); 1238 1239 if (connp != NULL) { 1240 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1241 /* 1242 * Return error if the conn has started closing. 
The conn 1243 * could have finished cleaning up the pending mp list, 1244 * If so we should not add another mp to the list negating 1245 * the cleanup. 1246 */ 1247 if (connp->conn_state_flags & CONN_CLOSING) 1248 return (B_FALSE); 1249 } 1250 mutex_enter(&ipsq->ipsq_lock); 1251 ipsq->ipsq_pending_ipif = ipif; 1252 /* 1253 * Note down the queue in b_queue. This will be returned by 1254 * ipsq_pending_mp_get. Caller will then use these values to restart 1255 * the processing 1256 */ 1257 add_mp->b_next = NULL; 1258 add_mp->b_queue = q; 1259 ipsq->ipsq_pending_mp = add_mp; 1260 ipsq->ipsq_waitfor = waitfor; 1261 1262 if (connp != NULL) 1263 connp->conn_oper_pending_ill = ipif->ipif_ill; 1264 mutex_exit(&ipsq->ipsq_lock); 1265 return (B_TRUE); 1266 } 1267 1268 /* 1269 * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp 1270 * queued in the list. 1271 */ 1272 mblk_t * 1273 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 1274 { 1275 mblk_t *curr = NULL; 1276 1277 mutex_enter(&ipsq->ipsq_lock); 1278 *connpp = NULL; 1279 if (ipsq->ipsq_pending_mp == NULL) { 1280 mutex_exit(&ipsq->ipsq_lock); 1281 return (NULL); 1282 } 1283 1284 /* There can be only 1 such excl message */ 1285 curr = ipsq->ipsq_pending_mp; 1286 ASSERT(curr != NULL && curr->b_next == NULL); 1287 ipsq->ipsq_pending_ipif = NULL; 1288 ipsq->ipsq_pending_mp = NULL; 1289 ipsq->ipsq_waitfor = 0; 1290 mutex_exit(&ipsq->ipsq_lock); 1291 1292 if (CONN_Q(curr->b_queue)) { 1293 /* 1294 * This mp did a refhold on the conn, at the start of the ioctl. 1295 * So we can safely return a pointer to the conn to the caller. 1296 */ 1297 *connpp = Q_TO_CONN(curr->b_queue); 1298 } else { 1299 *connpp = NULL; 1300 } 1301 curr->b_next = NULL; 1302 curr->b_prev = NULL; 1303 return (curr); 1304 } 1305 1306 /* 1307 * Cleanup the ioctl mp queued in ipsq_pending_mp 1308 * - Called in the ill_delete path 1309 * - Called in the M_ERROR or M_HANGUP path on the ill. 1310 * - Called in the conn close path. 
 */
/*
 * Returns B_TRUE if a pending mp was removed and disposed of, B_FALSE if
 * nothing was pending or (connp != NULL) the pending mp was not queued
 * on behalf of connp's write queue.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipsq_t	*ipsq;
	queue_t	*q;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	/*
	 * If connp is null, unconditionally clean up the ipsq_pending_mp.
	 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
	 * even if it is meant for another ill, since we have to enqueue
	 * a new mp now in ipsq_pending_mp to complete the ipif_down.
	 * If connp is non-null we are called from the conn close path.
	 */
	mp = ipsq->ipsq_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipsq->ipsq_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipsq_pending_mp */
	ipsq->ipsq_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	/*
	 * If MOVE was in progress, clear the move_in_progress fields also.
	 * Note that from here on `ill' is the ill of the pending ipif,
	 * which is not necessarily the ill passed in.
	 */
	ill = ipsq->ipsq_pending_ipif->ipif_ill;
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	} else if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	/* Remember the ipif before the ipsq's pending/current state is reset */
	ipif = ipsq->ipsq_pending_ipif;
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_waitfor = 0;
	ipsq->ipsq_current_ipif = NULL;
	ipsq->ipsq_current_ioctl = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			/* Conn close path only: clear IPIF_CHANGING. */
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg. we have to restart it
		 * otherwise the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * The ill is closing. Cleanup all the pending mps. Called exclusively
 * towards the end of ill_delete. The refcount has gone to 0. So nobody
 * knows this ill, and hence nobody can add an mp to this list
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
	mblk_t	*mp;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);
	/*
	 * Every mp on the pending mp list originating from an ioctl
	 * added 1 to the conn refcnt, at the start of the ioctl.
	 * So bump it down now.  See comments in ip_wput_nondata()
	 *
	 * ill_lock is dropped around each ip_ioctl_finish() call and
	 * retaken before the list head is examined again.
	 */
	while (ill->ill_pending_mp != NULL) {
		mp = ill->ill_pending_mp;
		ill->ill_pending_mp = mp->b_next;
		mutex_exit(&ill->ill_lock);

		q = mp->b_queue;
		ASSERT(CONN_Q(q));
		mp->b_next = NULL;
		mp->b_prev = NULL;
		mp->b_queue = NULL;
		ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		mutex_enter(&ill->ill_lock);
	}
	ill->ill_pending_ipif = NULL;

	mutex_exit(&ill->ill_lock);
}

/*
 * Called in the conn close path and ill delete path
 *
 * Removes from the ipsq's xop queue every mblk whose b_queue is the
 * queue of interest (connp's write queue if connp is non-NULL, else the
 * ill's write queue) or that queue's read side, then disposes of them
 * after ipsq_lock has been dropped.
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*q;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		q = CONNP_TO_WQ(connp);
	else
		q = ill->ill_wq;

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP message.
	 * ioctls meant for this ill from conn's are not flushed. They will
	 * be processed during ipsq_exit and will not find the ill and will
	 * return error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		/* Saved up front: curr->b_next is overwritten on a match. */
		next = curr->b_next;
		if (curr->b_queue == q || curr->b_queue == RD(q)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. we have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	mblk_t *curr;
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Is any exclusive ioctl pending ? If so clean it up. If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_head. If the ioctl has started the mp could be present in
	 * ipsq_pending_mp. If the ioctl timed out in the streamhead but
	 * is currently executing now the mp is not queued anywhere but
	 * conn_oper_pending_ill is null. The conn close will wait
	 * till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * First look on the ill's non-exclusive pending list; an ioc_id
	 * of zero asks ill_pending_mp_get() to match on the conn.
	 */
	curr = ill_pending_mp_get(ill, &connp, 0);
	if (curr != NULL) {
		mutex_exit(&connp->conn_lock);
		CONN_DEC_REF(connp);
		inet_freemsg(curr);
		return;
	}
	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending now in ipsq_pending_mp. If it is not
			 * found there then check whether this ioctl has
			 * not even started and is in the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	/* Drop the waiter count taken above if ipsq_enter() failed. */
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
/*
 * `arg' is the ill going away.  Under conn_lock, every conn_*_ill field
 * that points at it is cleared, and a cached ire that refers to the ill
 * (through ire_ipif or, for IRE_CACHE, through ire_stq) is dropped.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
		connp->conn_orig_multicast_ifindex = 0;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_outgoing_pill == ill)
		connp->conn_outgoing_pill = NULL;
	if (connp->conn_nofailover_ill == ill)
		connp->conn_nofailover_ill = NULL;
	if (connp->conn_xmit_if_ill == ill)
		connp->conn_xmit_if_ill = NULL;
	if (connp->conn_ire_cache != NULL) {
		ire = connp->conn_ire_cache;
		/*
		 * ip_newroute creates IRE_CACHE with ire_stq coming from
		 * interface X and ipif coming from interface Y, if interface
		 * X and Y are part of the same IPMP group. Thus whenever
		 * interface X goes down, remove all references to it by
		 * checking both on ire_ipif and ire_stq.
		 */
		if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
		    (ire->ire_type == IRE_CACHE &&
		    ire->ire_stq == ill->ill_wq)) {
			connp->conn_ire_cache = NULL;
			/* The ire is released only after conn_lock is dropped */
			mutex_exit(&connp->conn_lock);
			ire_refrele_notr(ire);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);

}

/*
 * Finish bringing down every ipif on the ill found in q->q_ptr, run
 * ill_down_tail(), free mp and end the current ipsq operation.  The
 * caller must be writer on the ipsq.
 */
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}
	ill_down_tail(ill);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up again
 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
 * all interfaces, but don't tear down any plumbing.
 *
 * Returns B_TRUE if the ill is already quiescent.  Returns B_FALSE if
 * references are still active, in which case mp has been queued as the
 * ipsq pending mp with a waitfor of ILL_DOWN.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/* Abort whatever was pending so that mp can be queued below. */
	(void) ipsq_pending_mp_cleanup(ill, NULL);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

/*
 * Remove references to the ill: delete dependent IREs (including the
 * reverse-tunnel and source-interface tables when they are non-empty),
 * clear conn_*_ill fields on all conns, and take the ill out of its
 * group if it is in one.
 */
static void
ill_down(ill_t *ill)
{
	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, (char *)ill);

	/* The count is sampled under the lock; the walk runs without it. */
	mutex_enter(&ire_mrtun_lock);
	if (ire_mrtun_count != 0) {
		mutex_exit(&ire_mrtun_lock);
		ire_walk_ill_mrtun(0, 0, ill_downi_mrtun_srcif,
		    (char *)ill, NULL);
	} else {
		mutex_exit(&ire_mrtun_lock);
	}

	/*
	 * If any interface based forwarding table exists
	 * Blow off the ires there dependent on this ill
	 */
	mutex_enter(&ire_srcif_table_lock);
	if (ire_srcif_table_count > 0) {
		mutex_exit(&ire_srcif_table_lock);
		ire_walk_srcif_table_v4(ill_downi_mrtun_srcif, (char *)ill);
	} else {
		mutex_exit(&ire_srcif_table_lock);
	}

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill);

	if (ill->ill_group != NULL) {
		illgrp_delete(ill);
	}

}

/*
 * Free the ill's source-interface table, if it has one, and zero the
 * srcif/mrtun reference counts.
 */
static void
ill_down_tail(ill_t *ill)
{
	int	i;

	/* Destroy ill_srcif_table if it exists */
	/* Lock not reqd really because nobody should be able to access */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_srcif_table != NULL) {
		/*
		 * NOTE(review): ill_srcif_refcnt is zeroed here and again
		 * after the table is freed; one of the two looks redundant.
		 */
		ill->ill_srcif_refcnt = 0;
		for (i = 0; i < IP_SRCIF_TABLE_SIZE; i++) {
			rw_destroy(&ill->ill_srcif_table[i].irb_lock);
		}
		kmem_free(ill->ill_srcif_table,
		    IP_SRCIF_TABLE_SIZE * sizeof (irb_t));
		ill->ill_srcif_table = NULL;
		ill->ill_srcif_refcnt = 0;
		ill->ill_mrtun_refcnt = 0;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'.  (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/*
	 * ip_newroute creates IRE_CACHE with ire_stq coming from
	 * interface X and ipif coming from interface Y, if interface
	 * X and Y are part of the same IPMP group. Thus whenever interface
	 * X goes down, remove all references to it by checking both
	 * on ire_ipif and ire_stq.
	 */
	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
		ire_delete(ire);
	}
}

/*
 * A separate routine for deleting revtun and srcif based routes
 * is needed because these ires are only deleted when the interface
 * is unplumbed. Also these ires have ire_in_ill non-null as well.
 * We want to keep mobile IP specific code separate.
 */
static void
ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg)
{
	ill_t   *ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_in_ill != NULL);

	/* Match on the incoming ill, or on either of the ill's queues. */
	if ((ire->ire_in_ill != NULL && ire->ire_in_ill == ill) ||
	    (ire->ire_stq == ill->ill_wq) || (ire->ire_stq == ill->ill_rq)) {
		ire_delete(ire);
	}
}

/*
 * Remove ire/nce from the fastpath list.
 * The v6 (nce) or v4 (ire) dispatcher is called with a NULL callback
 * and a NULL argument.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	if (ill->ill_isv6) {
		nce_fastpath_list_dispatch(ill, NULL, NULL);
	} else {
		ire_fastpath_list_dispatch(ill, NULL, NULL);
	}
}

/*
 * Consume an M_IOCACK of the fastpath probe.
 * Every mblk of the message is freed here; the caller must not touch
 * mp afterwards.
 */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL) {
		/*
		 * Update all IRE's or NCE's that are waiting for
		 * fastpath update.
		 */
		if (ill->ill_isv6) {
			/*
			 * update nce's in the fastpath list.
			 */
			nce_fastpath_list_dispatch(ill,
			    ndp_fastpath_update, mp);
		} else {

			/*
			 * update ire's in the fastpath list.
			 */
			ire_fastpath_list_dispatch(ill,
			    ire_fastpath_update, mp);
			/*
			 * Check if we need to traverse reverse tunnel table.
			 * Since there is only single ire_type (IRE_MIPRTUN)
			 * in the table, we don't need to match on ire_type.
			 * We have to check ire_mrtun_count and not the
			 * ill_mrtun_refcnt since ill_mrtun_refcnt is set
			 * on the incoming ill and here we are dealing with
			 * outgoing ill.
			 */
			mutex_enter(&ire_mrtun_lock);
			if (ire_mrtun_count != 0) {
				mutex_exit(&ire_mrtun_lock);
				ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN,
				    (void (*)(ire_t *, void *))
					ire_fastpath_update, mp, ill);
			} else {
				mutex_exit(&ire_mrtun_lock);
			}
		}
		/* Free the first data mblk; the trailing freeb() gets the next */
		mp1 = mp->b_cont;
		freeb(mp);
		mp = mp1;
	} else {
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	}

	freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 *
 * Returns 0 once the probe has been sent, EINVAL if dlur_mp is NULL,
 * ENOTSUP if the fastpath state is IDS_FAILED, or EAGAIN if an mblk
 * could not be allocated.  dlur_mp is copied, not consumed.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	putnext(ill->ill_wq, mp);
	return (0);
}

/*
 * Start DLPI capability negotiation by sending a DL_CAPABILITY_REQ with
 * no sub-capabilities.  Nothing is done unless the capability state is
 * IDS_UNKNOWN or IDS_RENEG.
 */
void
ill_capability_probe(ill_t *ill)
{
	/*
	 * Do so only if negotiation is enabled, capabilities are unknown,
	 * and a capability negotiation is not already in progress.
	 */
	if (ill->ill_dlpi_capab_state != IDS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDS_RENEG)
		return;

	ill->ill_dlpi_capab_state = IDS_INPROGRESS;
	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
}

/*
 * Reset the capability state to IDS_UNKNOWN, let each sub-capability
 * reset handler contribute its part of the request, and send the
 * pulled-up result down as a DL_CAPABILITY_REQ.  Nothing is sent if no
 * handler produced a message or if the pullup fails.
 */
void
ill_capability_reset(ill_t *ill)
{
	mblk_t *sc_mp = NULL;
	mblk_t *tmp;

	/*
	 * Note here that we reset the state to UNKNOWN, and later send
	 * down the DL_CAPABILITY_REQ without first setting the state to
	 * INPROGRESS.  We do this in order to distinguish the
	 * DL_CAPABILITY_ACK response which may come back in response to
	 * a "reset" apart from the "probe" DL_CAPABILITY_REQ.  This would
	 * also handle the case where the driver doesn't send us back
	 * a DL_CAPABILITY_ACK in response, since the "probe" routine
	 * requires the state to be in UNKNOWN anyway.  In any case, all
	 * features are turned off until the state reaches IDS_OK.
	 */
	ill->ill_dlpi_capab_state = IDS_UNKNOWN;

	/*
	 * Disable sub-capabilities and request a list of sub-capability
	 * messages which will be sent down to the driver.  Each handler
	 * allocates the corresponding dl_capability_sub_t inside an
	 * mblk, and links it to the existing sc_mp mblk, or return it
	 * as sc_mp if it's the first sub-capability (the passed in
	 * sc_mp is NULL).  Upon returning from all capability handlers,
	 * sc_mp will be pulled-up, before passing it downstream.
	 */
	ill_capability_mdt_reset(ill, &sc_mp);
	ill_capability_hcksum_reset(ill, &sc_mp);
	ill_capability_zerocopy_reset(ill, &sc_mp);
	ill_capability_ipsec_reset(ill, &sc_mp);
	ill_capability_dls_reset(ill, &sc_mp);
	ill_capability_lso_reset(ill, &sc_mp);

	/* Nothing to send down in order to disable the capabilities? */
	if (sc_mp == NULL)
		return;

	/* The original chain is freed whether or not the pullup succeeds. */
	tmp = msgpullup(sc_mp, -1);
	freemsg(sc_mp);
	if ((sc_mp = tmp) == NULL) {
		cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
		    "DL_CAPABILITY_REQ (ENOMEM)\n");
		return;
	}

	ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp);
}

/*
 * Request or set new-style hardware capabilities supported by DLS provider.
1958 */ 1959 static void 1960 ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) 1961 { 1962 mblk_t *mp; 1963 dl_capability_req_t *capb; 1964 size_t size = 0; 1965 uint8_t *ptr; 1966 1967 if (reqp != NULL) 1968 size = MBLKL(reqp); 1969 1970 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); 1971 if (mp == NULL) { 1972 freemsg(reqp); 1973 return; 1974 } 1975 ptr = mp->b_rptr; 1976 1977 capb = (dl_capability_req_t *)ptr; 1978 ptr += sizeof (dl_capability_req_t); 1979 1980 if (reqp != NULL) { 1981 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1982 capb->dl_sub_length = size; 1983 bcopy(reqp->b_rptr, ptr, size); 1984 ptr += size; 1985 mp->b_cont = reqp->b_cont; 1986 freeb(reqp); 1987 } 1988 ASSERT(ptr == mp->b_wptr); 1989 1990 ill_dlpi_send(ill, mp); 1991 } 1992 1993 static void 1994 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1995 { 1996 dl_capab_id_t *id_ic; 1997 uint_t sub_dl_cap = outers->dl_cap; 1998 dl_capability_sub_t *inners; 1999 uint8_t *capend; 2000 2001 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 2002 2003 /* 2004 * Note: range checks here are not absolutely sufficient to 2005 * make us robust against malformed messages sent by drivers; 2006 * this is in keeping with the rest of IP's dlpi handling. 
2007 * (Remember, it's coming from something else in the kernel 2008 * address space) 2009 */ 2010 2011 capend = (uint8_t *)(outers + 1) + outers->dl_length; 2012 if (capend > mp->b_wptr) { 2013 cmn_err(CE_WARN, "ill_capability_id_ack: " 2014 "malformed sub-capability too long for mblk"); 2015 return; 2016 } 2017 2018 id_ic = (dl_capab_id_t *)(outers + 1); 2019 2020 if (outers->dl_length < sizeof (*id_ic) || 2021 (inners = &id_ic->id_subcap, 2022 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 2023 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 2024 "encapsulated capab type %d too long for mblk", 2025 inners->dl_cap); 2026 return; 2027 } 2028 2029 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 2030 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 2031 "isn't as expected; pass-thru module(s) detected, " 2032 "discarding capability\n", inners->dl_cap)); 2033 return; 2034 } 2035 2036 /* Process the encapsulated sub-capability */ 2037 ill_capability_dispatch(ill, mp, inners, B_TRUE); 2038 } 2039 2040 /* 2041 * Process Multidata Transmit capability negotiation ack received from a 2042 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 2043 * DL_CAPABILITY_ACK message. 2044 */ 2045 static void 2046 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2047 { 2048 mblk_t *nmp = NULL; 2049 dl_capability_req_t *oc; 2050 dl_capab_mdt_t *mdt_ic, *mdt_oc; 2051 ill_mdt_capab_t **ill_mdt_capab; 2052 uint_t sub_dl_cap = isub->dl_cap; 2053 uint8_t *capend; 2054 2055 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 2056 2057 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 2058 2059 /* 2060 * Note: range checks here are not absolutely sufficient to 2061 * make us robust against malformed messages sent by drivers; 2062 * this is in keeping with the rest of IP's dlpi handling. 
2063 * (Remember, it's coming from something else in the kernel 2064 * address space) 2065 */ 2066 2067 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2068 if (capend > mp->b_wptr) { 2069 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2070 "malformed sub-capability too long for mblk"); 2071 return; 2072 } 2073 2074 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 2075 2076 if (mdt_ic->mdt_version != MDT_VERSION_2) { 2077 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 2078 "unsupported MDT sub-capability (version %d, expected %d)", 2079 mdt_ic->mdt_version, MDT_VERSION_2); 2080 return; 2081 } 2082 2083 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 2084 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 2085 "capability isn't as expected; pass-thru module(s) " 2086 "detected, discarding capability\n")); 2087 return; 2088 } 2089 2090 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 2091 2092 if (*ill_mdt_capab == NULL) { 2093 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 2094 KM_NOSLEEP); 2095 2096 if (*ill_mdt_capab == NULL) { 2097 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2098 "could not enable MDT version %d " 2099 "for %s (ENOMEM)\n", MDT_VERSION_2, 2100 ill->ill_name); 2101 return; 2102 } 2103 } 2104 2105 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 2106 "MDT version %d (%d bytes leading, %d bytes trailing " 2107 "header spaces, %d max pld bufs, %d span limit)\n", 2108 ill->ill_name, MDT_VERSION_2, 2109 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 2110 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 2111 2112 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 2113 (*ill_mdt_capab)->ill_mdt_on = 1; 2114 /* 2115 * Round the following values to the nearest 32-bit; ULP 2116 * may further adjust them to accomodate for additional 2117 * protocol headers. We pass these values to ULP during 2118 * bind time. 
2119 */ 2120 (*ill_mdt_capab)->ill_mdt_hdr_head = 2121 roundup(mdt_ic->mdt_hdr_head, 4); 2122 (*ill_mdt_capab)->ill_mdt_hdr_tail = 2123 roundup(mdt_ic->mdt_hdr_tail, 4); 2124 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 2125 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 2126 2127 ill->ill_capabilities |= ILL_CAPAB_MDT; 2128 } else { 2129 uint_t size; 2130 uchar_t *rptr; 2131 2132 size = sizeof (dl_capability_req_t) + 2133 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2134 2135 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2136 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2137 "could not enable MDT for %s (ENOMEM)\n", 2138 ill->ill_name); 2139 return; 2140 } 2141 2142 rptr = nmp->b_rptr; 2143 /* initialize dl_capability_req_t */ 2144 oc = (dl_capability_req_t *)nmp->b_rptr; 2145 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2146 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2147 sizeof (dl_capab_mdt_t); 2148 nmp->b_rptr += sizeof (dl_capability_req_t); 2149 2150 /* initialize dl_capability_sub_t */ 2151 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2152 nmp->b_rptr += sizeof (*isub); 2153 2154 /* initialize dl_capab_mdt_t */ 2155 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2156 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2157 2158 nmp->b_rptr = rptr; 2159 2160 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2161 "to enable MDT version %d\n", ill->ill_name, 2162 MDT_VERSION_2)); 2163 2164 /* set ENABLE flag */ 2165 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2166 2167 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2168 ill_dlpi_send(ill, nmp); 2169 } 2170 } 2171 2172 static void 2173 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2174 { 2175 mblk_t *mp; 2176 dl_capab_mdt_t *mdt_subcap; 2177 dl_capability_sub_t *dl_subcap; 2178 int size; 2179 2180 if (!ILL_MDT_CAPABLE(ill)) 2181 return; 2182 2183 ASSERT(ill->ill_mdt_capab != NULL); 2184 /* 2185 * Clear the capability flag for MDT but retain 
the ill_mdt_capab 2186 * structure since it's possible that another thread is still 2187 * referring to it. The structure only gets deallocated when 2188 * we destroy the ill. 2189 */ 2190 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2191 2192 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2193 2194 mp = allocb(size, BPRI_HI); 2195 if (mp == NULL) { 2196 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2197 "request to disable MDT\n")); 2198 return; 2199 } 2200 2201 mp->b_wptr = mp->b_rptr + size; 2202 2203 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2204 dl_subcap->dl_cap = DL_CAPAB_MDT; 2205 dl_subcap->dl_length = sizeof (*mdt_subcap); 2206 2207 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2208 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2209 mdt_subcap->mdt_flags = 0; 2210 mdt_subcap->mdt_hdr_head = 0; 2211 mdt_subcap->mdt_hdr_tail = 0; 2212 2213 if (*sc_mp != NULL) 2214 linkb(*sc_mp, mp); 2215 else 2216 *sc_mp = mp; 2217 } 2218 2219 /* 2220 * Send a DL_NOTIFY_REQ to the specified ill to enable 2221 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2222 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2223 * acceleration. 2224 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 2225 */ 2226 static boolean_t 2227 ill_enable_promisc_notify(ill_t *ill) 2228 { 2229 mblk_t *mp; 2230 dl_notify_req_t *req; 2231 2232 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2233 2234 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2235 if (mp == NULL) 2236 return (B_FALSE); 2237 2238 req = (dl_notify_req_t *)mp->b_rptr; 2239 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2240 DL_NOTE_PROMISC_OFF_PHYS; 2241 2242 ill_dlpi_send(ill, mp); 2243 2244 return (B_TRUE); 2245 } 2246 2247 2248 /* 2249 * Allocate an IPsec capability request which will be filled by our 2250 * caller to turn on support for one or more algorithms. 
2251 */ 2252 static mblk_t * 2253 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2254 { 2255 mblk_t *nmp; 2256 dl_capability_req_t *ocap; 2257 dl_capab_ipsec_t *ocip; 2258 dl_capab_ipsec_t *icip; 2259 uint8_t *ptr; 2260 icip = (dl_capab_ipsec_t *)(isub + 1); 2261 2262 /* 2263 * The first time around, we send a DL_NOTIFY_REQ to enable 2264 * PROMISC_ON/OFF notification from the provider. We need to 2265 * do this before enabling the algorithms to avoid leakage of 2266 * cleartext packets. 2267 */ 2268 2269 if (!ill_enable_promisc_notify(ill)) 2270 return (NULL); 2271 2272 /* 2273 * Allocate new mblk which will contain a new capability 2274 * request to enable the capabilities. 2275 */ 2276 2277 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2278 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2279 if (nmp == NULL) 2280 return (NULL); 2281 2282 ptr = nmp->b_rptr; 2283 2284 /* initialize dl_capability_req_t */ 2285 ocap = (dl_capability_req_t *)ptr; 2286 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2287 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2288 ptr += sizeof (dl_capability_req_t); 2289 2290 /* initialize dl_capability_sub_t */ 2291 bcopy(isub, ptr, sizeof (*isub)); 2292 ptr += sizeof (*isub); 2293 2294 /* initialize dl_capab_ipsec_t */ 2295 ocip = (dl_capab_ipsec_t *)ptr; 2296 bcopy(icip, ocip, sizeof (*icip)); 2297 2298 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2299 return (nmp); 2300 } 2301 2302 /* 2303 * Process an IPsec capability negotiation ack received from a DLS Provider. 2304 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2305 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2306 */ 2307 static void 2308 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2309 { 2310 dl_capab_ipsec_t *icip; 2311 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2312 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. 
*/ 2313 uint_t cipher, nciphers; 2314 mblk_t *nmp; 2315 uint_t alg_len; 2316 boolean_t need_sadb_dump; 2317 uint_t sub_dl_cap = isub->dl_cap; 2318 ill_ipsec_capab_t **ill_capab; 2319 uint64_t ill_capab_flag; 2320 uint8_t *capend, *ciphend; 2321 boolean_t sadb_resync; 2322 2323 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2324 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2325 2326 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2327 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2328 ill_capab_flag = ILL_CAPAB_AH; 2329 } else { 2330 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2331 ill_capab_flag = ILL_CAPAB_ESP; 2332 } 2333 2334 /* 2335 * If the ill capability structure exists, then this incoming 2336 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2337 * If this is so, then we'd need to resynchronize the SADB 2338 * after re-enabling the offloaded ciphers. 2339 */ 2340 sadb_resync = (*ill_capab != NULL); 2341 2342 /* 2343 * Note: range checks here are not absolutely sufficient to 2344 * make us robust against malformed messages sent by drivers; 2345 * this is in keeping with the rest of IP's dlpi handling. 2346 * (Remember, it's coming from something else in the kernel 2347 * address space) 2348 */ 2349 2350 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2351 if (capend > mp->b_wptr) { 2352 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2353 "malformed sub-capability too long for mblk"); 2354 return; 2355 } 2356 2357 /* 2358 * There are two types of acks we process here: 2359 * 1. acks in reply to a (first form) generic capability req 2360 * (no ENABLE flag set) 2361 * 2. acks in reply to a ENABLE capability req. 
2362 * (ENABLE flag set) 2363 * 2364 * We process the subcapability passed as argument as follows: 2365 * 1 do initializations 2366 * 1.1 initialize nmp = NULL 2367 * 1.2 set need_sadb_dump to B_FALSE 2368 * 2 for each cipher in subcapability: 2369 * 2.1 if ENABLE flag is set: 2370 * 2.1.1 update per-ill ipsec capabilities info 2371 * 2.1.2 set need_sadb_dump to B_TRUE 2372 * 2.2 if ENABLE flag is not set: 2373 * 2.2.1 if nmp is NULL: 2374 * 2.2.1.1 allocate and initialize nmp 2375 * 2.2.1.2 init current pos in nmp 2376 * 2.2.2 copy current cipher to current pos in nmp 2377 * 2.2.3 set ENABLE flag in nmp 2378 * 2.2.4 update current pos 2379 * 3 if nmp is not equal to NULL, send enable request 2380 * 3.1 send capability request 2381 * 4 if need_sadb_dump is B_TRUE 2382 * 4.1 enable promiscuous on/off notifications 2383 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2384 * AH or ESP SA's to interface. 2385 */ 2386 2387 nmp = NULL; 2388 oalg = NULL; 2389 need_sadb_dump = B_FALSE; 2390 icip = (dl_capab_ipsec_t *)(isub + 1); 2391 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2392 2393 nciphers = icip->cip_nciphers; 2394 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2395 2396 if (ciphend > capend) { 2397 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2398 "too many ciphers for sub-capability len"); 2399 return; 2400 } 2401 2402 for (cipher = 0; cipher < nciphers; cipher++) { 2403 alg_len = sizeof (dl_capab_ipsec_alg_t); 2404 2405 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2406 /* 2407 * TBD: when we provide a way to disable capabilities 2408 * from above, need to manage the request-pending state 2409 * and fail if we were not expecting this ACK. 
2410 */ 2411 IPSECHW_DEBUG(IPSECHW_CAPAB, 2412 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2413 2414 /* 2415 * Update IPsec capabilities for this ill 2416 */ 2417 2418 if (*ill_capab == NULL) { 2419 IPSECHW_DEBUG(IPSECHW_CAPAB, 2420 ("ill_capability_ipsec_ack: " 2421 "allocating ipsec_capab for ill\n")); 2422 *ill_capab = ill_ipsec_capab_alloc(); 2423 2424 if (*ill_capab == NULL) { 2425 cmn_err(CE_WARN, 2426 "ill_capability_ipsec_ack: " 2427 "could not enable IPsec Hardware " 2428 "acceleration for %s (ENOMEM)\n", 2429 ill->ill_name); 2430 return; 2431 } 2432 } 2433 2434 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2435 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2436 2437 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2438 cmn_err(CE_WARN, 2439 "ill_capability_ipsec_ack: " 2440 "malformed IPsec algorithm id %d", 2441 ialg->alg_prim); 2442 continue; 2443 } 2444 2445 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2446 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2447 ialg->alg_prim); 2448 } else { 2449 ipsec_capab_algparm_t *alp; 2450 2451 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2452 ialg->alg_prim); 2453 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2454 ialg->alg_prim)) { 2455 cmn_err(CE_WARN, 2456 "ill_capability_ipsec_ack: " 2457 "no space for IPsec alg id %d", 2458 ialg->alg_prim); 2459 continue; 2460 } 2461 alp = &((*ill_capab)->encr_algparm[ 2462 ialg->alg_prim]); 2463 alp->minkeylen = ialg->alg_minbits; 2464 alp->maxkeylen = ialg->alg_maxbits; 2465 } 2466 ill->ill_capabilities |= ill_capab_flag; 2467 /* 2468 * indicate that a capability was enabled, which 2469 * will be used below to kick off a SADB dump 2470 * to the ill. 
2471 */ 2472 need_sadb_dump = B_TRUE; 2473 } else { 2474 IPSECHW_DEBUG(IPSECHW_CAPAB, 2475 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2476 ialg->alg_prim)); 2477 2478 if (nmp == NULL) { 2479 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2480 if (nmp == NULL) { 2481 /* 2482 * Sending the PROMISC_ON/OFF 2483 * notification request failed. 2484 * We cannot enable the algorithms 2485 * since the Provider will not 2486 * notify IP of promiscous mode 2487 * changes, which could lead 2488 * to leakage of packets. 2489 */ 2490 cmn_err(CE_WARN, 2491 "ill_capability_ipsec_ack: " 2492 "could not enable IPsec Hardware " 2493 "acceleration for %s (ENOMEM)\n", 2494 ill->ill_name); 2495 return; 2496 } 2497 /* ptr to current output alg specifier */ 2498 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2499 } 2500 2501 /* 2502 * Copy current alg specifier, set ENABLE 2503 * flag, and advance to next output alg. 2504 * For now we enable all IPsec capabilities. 2505 */ 2506 ASSERT(oalg != NULL); 2507 bcopy(ialg, oalg, alg_len); 2508 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2509 nmp->b_wptr += alg_len; 2510 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2511 } 2512 2513 /* move to next input algorithm specifier */ 2514 ialg = (dl_capab_ipsec_alg_t *) 2515 ((char *)ialg + alg_len); 2516 } 2517 2518 if (nmp != NULL) 2519 /* 2520 * nmp points to a DL_CAPABILITY_REQ message to enable 2521 * IPsec hardware acceleration. 2522 */ 2523 ill_dlpi_send(ill, nmp); 2524 2525 if (need_sadb_dump) 2526 /* 2527 * An acknowledgement corresponding to a request to 2528 * enable acceleration was received, notify SADB. 2529 */ 2530 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2531 } 2532 2533 /* 2534 * Given an mblk with enough space in it, create sub-capability entries for 2535 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2536 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2537 * in preparation for the reset the DL_CAPABILITY_REQ message. 
2538 */ 2539 static void 2540 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2541 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2542 { 2543 dl_capab_ipsec_t *oipsec; 2544 dl_capab_ipsec_alg_t *oalg; 2545 dl_capability_sub_t *dl_subcap; 2546 int i, k; 2547 2548 ASSERT(nciphers > 0); 2549 ASSERT(ill_cap != NULL); 2550 ASSERT(mp != NULL); 2551 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2552 2553 /* dl_capability_sub_t for "stype" */ 2554 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2555 dl_subcap->dl_cap = stype; 2556 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2557 mp->b_wptr += sizeof (dl_capability_sub_t); 2558 2559 /* dl_capab_ipsec_t for "stype" */ 2560 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2561 oipsec->cip_version = 1; 2562 oipsec->cip_nciphers = nciphers; 2563 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2564 2565 /* create entries for "stype" AUTH ciphers */ 2566 for (i = 0; i < ill_cap->algs_size; i++) { 2567 for (k = 0; k < BITSPERBYTE; k++) { 2568 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2569 continue; 2570 2571 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2572 bzero((void *)oalg, sizeof (*oalg)); 2573 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2574 oalg->alg_prim = k + (BITSPERBYTE * i); 2575 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2576 } 2577 } 2578 /* create entries for "stype" ENCR ciphers */ 2579 for (i = 0; i < ill_cap->algs_size; i++) { 2580 for (k = 0; k < BITSPERBYTE; k++) { 2581 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2582 continue; 2583 2584 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2585 bzero((void *)oalg, sizeof (*oalg)); 2586 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2587 oalg->alg_prim = k + (BITSPERBYTE * i); 2588 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2589 } 2590 } 2591 } 2592 2593 /* 2594 * Macro to count number of 1s in a byte (8-bit word). The total count is 2595 * accumulated into the passed-in argument (sum). 
We could use SPARCv9's 2596 * POPC instruction, but our macro is more flexible for an arbitrary length 2597 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2598 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2599 * stays that way, we can reduce the number of iterations required. 2600 */ 2601 #define COUNT_1S(val, sum) { \ 2602 uint8_t x = val & 0xff; \ 2603 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2604 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2605 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2606 } 2607 2608 /* ARGSUSED */ 2609 static void 2610 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2611 { 2612 mblk_t *mp; 2613 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2614 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2615 uint64_t ill_capabilities = ill->ill_capabilities; 2616 int ah_cnt = 0, esp_cnt = 0; 2617 int ah_len = 0, esp_len = 0; 2618 int i, size = 0; 2619 2620 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2621 return; 2622 2623 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2624 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2625 2626 /* Find out the number of ciphers for AH */ 2627 if (cap_ah != NULL) { 2628 for (i = 0; i < cap_ah->algs_size; i++) { 2629 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2630 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2631 } 2632 if (ah_cnt > 0) { 2633 size += sizeof (dl_capability_sub_t) + 2634 sizeof (dl_capab_ipsec_t); 2635 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2636 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2637 size += ah_len; 2638 } 2639 } 2640 2641 /* Find out the number of ciphers for ESP */ 2642 if (cap_esp != NULL) { 2643 for (i = 0; i < cap_esp->algs_size; i++) { 2644 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2645 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2646 } 2647 if (esp_cnt > 0) { 2648 size += sizeof (dl_capability_sub_t) + 2649 sizeof (dl_capab_ipsec_t); 2650 /* 
dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2651 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2652 size += esp_len; 2653 } 2654 } 2655 2656 if (size == 0) { 2657 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2658 "there's nothing to reset\n")); 2659 return; 2660 } 2661 2662 mp = allocb(size, BPRI_HI); 2663 if (mp == NULL) { 2664 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2665 "request to disable IPSEC Hardware Acceleration\n")); 2666 return; 2667 } 2668 2669 /* 2670 * Clear the capability flags for IPSec HA but retain the ill 2671 * capability structures since it's possible that another thread 2672 * is still referring to them. The structures only get deallocated 2673 * when we destroy the ill. 2674 * 2675 * Various places check the flags to see if the ill is capable of 2676 * hardware acceleration, and by clearing them we ensure that new 2677 * outbound IPSec packets are sent down encrypted. 2678 */ 2679 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2680 2681 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2682 if (ah_cnt > 0) { 2683 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2684 cap_ah, mp); 2685 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2686 } 2687 2688 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2689 if (esp_cnt > 0) { 2690 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2691 cap_esp, mp); 2692 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2693 } 2694 2695 /* 2696 * At this point we've composed a bunch of sub-capabilities to be 2697 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2698 * by the caller. Upon receiving this reset message, the driver 2699 * must stop inbound decryption (by destroying all inbound SAs) 2700 * and let the corresponding packets come in encrypted. 
2701 */ 2702 2703 if (*sc_mp != NULL) 2704 linkb(*sc_mp, mp); 2705 else 2706 *sc_mp = mp; 2707 } 2708 2709 static void 2710 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2711 boolean_t encapsulated) 2712 { 2713 boolean_t legacy = B_FALSE; 2714 2715 /* 2716 * If this DL_CAPABILITY_ACK came in as a response to our "reset" 2717 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just 2718 * instructed the driver to disable its advertised capabilities, 2719 * so there's no point in accepting any response at this moment. 2720 */ 2721 if (ill->ill_dlpi_capab_state == IDS_UNKNOWN) 2722 return; 2723 2724 /* 2725 * Note that only the following two sub-capabilities may be 2726 * considered as "legacy", since their original definitions 2727 * do not incorporate the dl_mid_t module ID token, and hence 2728 * may require the use of the wrapper sub-capability. 2729 */ 2730 switch (subp->dl_cap) { 2731 case DL_CAPAB_IPSEC_AH: 2732 case DL_CAPAB_IPSEC_ESP: 2733 legacy = B_TRUE; 2734 break; 2735 } 2736 2737 /* 2738 * For legacy sub-capabilities which don't incorporate a queue_t 2739 * pointer in their structures, discard them if we detect that 2740 * there are intermediate modules in between IP and the driver. 
2741 */ 2742 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2743 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2744 "%d discarded; %d module(s) present below IP\n", 2745 subp->dl_cap, ill->ill_lmod_cnt)); 2746 return; 2747 } 2748 2749 switch (subp->dl_cap) { 2750 case DL_CAPAB_IPSEC_AH: 2751 case DL_CAPAB_IPSEC_ESP: 2752 ill_capability_ipsec_ack(ill, mp, subp); 2753 break; 2754 case DL_CAPAB_MDT: 2755 ill_capability_mdt_ack(ill, mp, subp); 2756 break; 2757 case DL_CAPAB_HCKSUM: 2758 ill_capability_hcksum_ack(ill, mp, subp); 2759 break; 2760 case DL_CAPAB_ZEROCOPY: 2761 ill_capability_zerocopy_ack(ill, mp, subp); 2762 break; 2763 case DL_CAPAB_POLL: 2764 if (!SOFT_RINGS_ENABLED()) 2765 ill_capability_dls_ack(ill, mp, subp); 2766 break; 2767 case DL_CAPAB_SOFT_RING: 2768 if (SOFT_RINGS_ENABLED()) 2769 ill_capability_dls_ack(ill, mp, subp); 2770 break; 2771 case DL_CAPAB_LSO: 2772 ill_capability_lso_ack(ill, mp, subp); 2773 break; 2774 default: 2775 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2776 subp->dl_cap)); 2777 } 2778 } 2779 2780 /* 2781 * As part of negotiating polling capability, the driver tells us 2782 * the default (or normal) blanking interval and packet threshold 2783 * (the receive timer fires if blanking interval is reached or 2784 * the packet threshold is reached). 2785 * 2786 * As part of manipulating the polling interval, we always use our 2787 * estimated interval (avg service time * number of packets queued 2788 * on the squeue) but we try to blank for a minimum of 2789 * rr_normal_blank_time * rr_max_blank_ratio. We disable the 2790 * packet threshold during this time. When we are not in polling mode 2791 * we set the blank interval typically lower, rr_normal_pkt_cnt * 2792 * rr_min_blank_ratio but up the packet cnt by a ratio of 2793 * rr_min_pkt_cnt_ratio so that we are still getting chains if 2794 * possible although for a shorter interval. 
2795 */ 2796 #define RR_MAX_BLANK_RATIO 20 2797 #define RR_MIN_BLANK_RATIO 10 2798 #define RR_MAX_PKT_CNT_RATIO 3 2799 #define RR_MIN_PKT_CNT_RATIO 3 2800 2801 /* 2802 * These can be tuned via /etc/system. 2803 */ 2804 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; 2805 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; 2806 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; 2807 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; 2808 2809 static mac_resource_handle_t 2810 ill_ring_add(void *arg, mac_resource_t *mrp) 2811 { 2812 ill_t *ill = (ill_t *)arg; 2813 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2814 ill_rx_ring_t *rx_ring; 2815 int ip_rx_index; 2816 2817 ASSERT(mrp != NULL); 2818 if (mrp->mr_type != MAC_RX_FIFO) { 2819 return (NULL); 2820 } 2821 ASSERT(ill != NULL); 2822 ASSERT(ill->ill_dls_capab != NULL); 2823 2824 mutex_enter(&ill->ill_lock); 2825 for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { 2826 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; 2827 ASSERT(rx_ring != NULL); 2828 2829 if (rx_ring->rr_ring_state == ILL_RING_FREE) { 2830 time_t normal_blank_time = 2831 mrfp->mrf_normal_blank_time; 2832 uint_t normal_pkt_cnt = 2833 mrfp->mrf_normal_pkt_count; 2834 2835 bzero(rx_ring, sizeof (ill_rx_ring_t)); 2836 2837 rx_ring->rr_blank = mrfp->mrf_blank; 2838 rx_ring->rr_handle = mrfp->mrf_arg; 2839 rx_ring->rr_ill = ill; 2840 rx_ring->rr_normal_blank_time = normal_blank_time; 2841 rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; 2842 2843 rx_ring->rr_max_blank_time = 2844 normal_blank_time * rr_max_blank_ratio; 2845 rx_ring->rr_min_blank_time = 2846 normal_blank_time * rr_min_blank_ratio; 2847 rx_ring->rr_max_pkt_cnt = 2848 normal_pkt_cnt * rr_max_pkt_cnt_ratio; 2849 rx_ring->rr_min_pkt_cnt = 2850 normal_pkt_cnt * rr_min_pkt_cnt_ratio; 2851 2852 rx_ring->rr_ring_state = ILL_RING_INUSE; 2853 mutex_exit(&ill->ill_lock); 2854 2855 DTRACE_PROBE2(ill__ring__add, (void *), ill, 2856 (int), ip_rx_index); 2857 return 
((mac_resource_handle_t)rx_ring); 2858 } 2859 } 2860 2861 /* 2862 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If 2863 * we have devices which can overwhelm this limit, ILL_MAX_RING 2864 * should be made configurable. Meanwhile it cause no panic because 2865 * driver will pass ip_input a NULL handle which will make 2866 * IP allocate the default squeue and Polling mode will not 2867 * be used for this ring. 2868 */ 2869 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2870 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2871 2872 mutex_exit(&ill->ill_lock); 2873 return (NULL); 2874 } 2875 2876 static boolean_t 2877 ill_capability_dls_init(ill_t *ill) 2878 { 2879 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2880 conn_t *connp; 2881 size_t sz; 2882 2883 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2884 if (ill_dls == NULL) { 2885 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2886 "soft_ring enabled for ill=%s (%p) but data " 2887 "structs uninitialized\n", ill->ill_name, 2888 (void *)ill); 2889 } 2890 return (B_TRUE); 2891 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2892 if (ill_dls == NULL) { 2893 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2894 "polling enabled for ill=%s (%p) but data " 2895 "structs uninitialized\n", ill->ill_name, 2896 (void *)ill); 2897 } 2898 return (B_TRUE); 2899 } 2900 2901 if (ill_dls != NULL) { 2902 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2903 /* Soft_Ring or polling is being re-enabled */ 2904 2905 connp = ill_dls->ill_unbind_conn; 2906 ASSERT(rx_ring != NULL); 2907 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2908 bzero((void *)rx_ring, 2909 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2910 ill_dls->ill_ring_tbl = rx_ring; 2911 ill_dls->ill_unbind_conn = connp; 2912 return (B_TRUE); 2913 } 2914 2915 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 2916 return (B_FALSE); 2917 2918 sz = sizeof (ill_dls_capab_t); 2919 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2920 2921 
ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2922 if (ill_dls == NULL) { 2923 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2924 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2925 (void *)ill); 2926 CONN_DEC_REF(connp); 2927 return (B_FALSE); 2928 } 2929 2930 /* Allocate space to hold ring table */ 2931 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2932 ill->ill_dls_capab = ill_dls; 2933 ill_dls->ill_unbind_conn = connp; 2934 return (B_TRUE); 2935 } 2936 2937 /* 2938 * ill_capability_dls_disable: disable soft_ring and/or polling 2939 * capability. Since any of the rings might already be in use, need 2940 * to call ipsq_clean_all() which gets behind the squeue to disable 2941 * direct calls if necessary. 2942 */ 2943 static void 2944 ill_capability_dls_disable(ill_t *ill) 2945 { 2946 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2947 2948 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2949 ipsq_clean_all(ill); 2950 ill_dls->ill_tx = NULL; 2951 ill_dls->ill_tx_handle = NULL; 2952 ill_dls->ill_dls_change_status = NULL; 2953 ill_dls->ill_dls_bind = NULL; 2954 ill_dls->ill_dls_unbind = NULL; 2955 } 2956 2957 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2958 } 2959 2960 static void 2961 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2962 dl_capability_sub_t *isub) 2963 { 2964 uint_t size; 2965 uchar_t *rptr; 2966 dl_capab_dls_t dls, *odls; 2967 ill_dls_capab_t *ill_dls; 2968 mblk_t *nmp = NULL; 2969 dl_capability_req_t *ocap; 2970 uint_t sub_dl_cap = isub->dl_cap; 2971 2972 if (!ill_capability_dls_init(ill)) 2973 return; 2974 ill_dls = ill->ill_dls_capab; 2975 2976 /* Copy locally to get the members aligned */ 2977 bcopy((void *)idls, (void *)&dls, 2978 sizeof (dl_capab_dls_t)); 2979 2980 /* Get the tx function and handle from dld */ 2981 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2982 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2983 2984 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2985 ill_dls->ill_dls_change_status = 2986 
(ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2987 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2988 ill_dls->ill_dls_unbind = 2989 (ip_dls_unbind_t)dls.dls_ring_unbind; 2990 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2991 } 2992 2993 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2994 isub->dl_length; 2995 2996 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2997 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2998 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2999 ill->ill_name, (void *)ill); 3000 return; 3001 } 3002 3003 /* initialize dl_capability_req_t */ 3004 rptr = nmp->b_rptr; 3005 ocap = (dl_capability_req_t *)rptr; 3006 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 3007 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 3008 rptr += sizeof (dl_capability_req_t); 3009 3010 /* initialize dl_capability_sub_t */ 3011 bcopy(isub, rptr, sizeof (*isub)); 3012 rptr += sizeof (*isub); 3013 3014 odls = (dl_capab_dls_t *)rptr; 3015 rptr += sizeof (dl_capab_dls_t); 3016 3017 /* initialize dl_capab_dls_t to be sent down */ 3018 dls.dls_rx_handle = (uintptr_t)ill; 3019 dls.dls_rx = (uintptr_t)ip_input; 3020 dls.dls_ring_add = (uintptr_t)ill_ring_add; 3021 3022 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 3023 dls.dls_ring_cnt = ip_soft_rings_cnt; 3024 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 3025 dls.dls_flags = SOFT_RING_ENABLE; 3026 } else { 3027 dls.dls_flags = POLL_ENABLE; 3028 ip1dbg(("ill_capability_dls_capable: asking interface %s " 3029 "to enable polling\n", ill->ill_name)); 3030 } 3031 bcopy((void *)&dls, (void *)odls, 3032 sizeof (dl_capab_dls_t)); 3033 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3034 /* 3035 * nmp points to a DL_CAPABILITY_REQ message to 3036 * enable either soft_ring or polling 3037 */ 3038 ill_dlpi_send(ill, nmp); 3039 } 3040 3041 static void 3042 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 3043 { 3044 mblk_t *mp; 3045 
dl_capab_dls_t *idls; 3046 dl_capability_sub_t *dl_subcap; 3047 int size; 3048 3049 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 3050 return; 3051 3052 ASSERT(ill->ill_dls_capab != NULL); 3053 3054 size = sizeof (*dl_subcap) + sizeof (*idls); 3055 3056 mp = allocb(size, BPRI_HI); 3057 if (mp == NULL) { 3058 ip1dbg(("ill_capability_dls_reset: unable to allocate " 3059 "request to disable soft_ring\n")); 3060 return; 3061 } 3062 3063 mp->b_wptr = mp->b_rptr + size; 3064 3065 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3066 dl_subcap->dl_length = sizeof (*idls); 3067 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 3068 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; 3069 else 3070 dl_subcap->dl_cap = DL_CAPAB_POLL; 3071 3072 idls = (dl_capab_dls_t *)(dl_subcap + 1); 3073 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) 3074 idls->dls_flags = SOFT_RING_DISABLE; 3075 else 3076 idls->dls_flags = POLL_DISABLE; 3077 3078 if (*sc_mp != NULL) 3079 linkb(*sc_mp, mp); 3080 else 3081 *sc_mp = mp; 3082 } 3083 3084 /* 3085 * Process a soft_ring/poll capability negotiation ack received 3086 * from a DLS Provider.isub must point to the sub-capability 3087 * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. 3088 */ 3089 static void 3090 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3091 { 3092 dl_capab_dls_t *idls; 3093 uint_t sub_dl_cap = isub->dl_cap; 3094 uint8_t *capend; 3095 3096 ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || 3097 sub_dl_cap == DL_CAPAB_POLL); 3098 3099 if (ill->ill_isv6) 3100 return; 3101 3102 /* 3103 * Note: range checks here are not absolutely sufficient to 3104 * make us robust against malformed messages sent by drivers; 3105 * this is in keeping with the rest of IP's dlpi handling. 
3106 * (Remember, it's coming from something else in the kernel 3107 * address space) 3108 */ 3109 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3110 if (capend > mp->b_wptr) { 3111 cmn_err(CE_WARN, "ill_capability_dls_ack: " 3112 "malformed sub-capability too long for mblk"); 3113 return; 3114 } 3115 3116 /* 3117 * There are two types of acks we process here: 3118 * 1. acks in reply to a (first form) generic capability req 3119 * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) 3120 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE 3121 * capability req. 3122 */ 3123 idls = (dl_capab_dls_t *)(isub + 1); 3124 3125 if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { 3126 ip1dbg(("ill_capability_dls_ack: mid token for dls " 3127 "capability isn't as expected; pass-thru " 3128 "module(s) detected, discarding capability\n")); 3129 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 3130 /* 3131 * This is a capability renegotitation case. 3132 * The interface better be unusable at this 3133 * point other wise bad things will happen 3134 * if we disable direct calls on a running 3135 * and up interface. 3136 */ 3137 ill_capability_dls_disable(ill); 3138 } 3139 return; 3140 } 3141 3142 switch (idls->dls_flags) { 3143 default: 3144 /* Disable if unknown flag */ 3145 case SOFT_RING_DISABLE: 3146 case POLL_DISABLE: 3147 ill_capability_dls_disable(ill); 3148 break; 3149 case SOFT_RING_CAPABLE: 3150 case POLL_CAPABLE: 3151 /* 3152 * If the capability was already enabled, its safe 3153 * to disable it first to get rid of stale information 3154 * and then start enabling it again. 
3155 */ 3156 ill_capability_dls_disable(ill); 3157 ill_capability_dls_capable(ill, idls, isub); 3158 break; 3159 case SOFT_RING_ENABLE: 3160 case POLL_ENABLE: 3161 mutex_enter(&ill->ill_lock); 3162 if (sub_dl_cap == DL_CAPAB_SOFT_RING && 3163 !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { 3164 ASSERT(ill->ill_dls_capab != NULL); 3165 ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; 3166 } 3167 if (sub_dl_cap == DL_CAPAB_POLL && 3168 !(ill->ill_capabilities & ILL_CAPAB_POLL)) { 3169 ASSERT(ill->ill_dls_capab != NULL); 3170 ill->ill_capabilities |= ILL_CAPAB_POLL; 3171 ip1dbg(("ill_capability_dls_ack: interface %s " 3172 "has enabled polling\n", ill->ill_name)); 3173 } 3174 mutex_exit(&ill->ill_lock); 3175 break; 3176 } 3177 } 3178 3179 /* 3180 * Process a hardware checksum offload capability negotiation ack received 3181 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 3182 * of a DL_CAPABILITY_ACK message. 3183 */ 3184 static void 3185 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3186 { 3187 dl_capability_req_t *ocap; 3188 dl_capab_hcksum_t *ihck, *ohck; 3189 ill_hcksum_capab_t **ill_hcksum; 3190 mblk_t *nmp = NULL; 3191 uint_t sub_dl_cap = isub->dl_cap; 3192 uint8_t *capend; 3193 3194 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 3195 3196 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 3197 3198 /* 3199 * Note: range checks here are not absolutely sufficient to 3200 * make us robust against malformed messages sent by drivers; 3201 * this is in keeping with the rest of IP's dlpi handling. 3202 * (Remember, it's coming from something else in the kernel 3203 * address space) 3204 */ 3205 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3206 if (capend > mp->b_wptr) { 3207 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3208 "malformed sub-capability too long for mblk"); 3209 return; 3210 } 3211 3212 /* 3213 * There are two types of acks we process here: 3214 * 1. 
acks in reply to a (first form) generic capability req 3215 * (no ENABLE flag set) 3216 * 2. acks in reply to a ENABLE capability req. 3217 * (ENABLE flag set) 3218 */ 3219 ihck = (dl_capab_hcksum_t *)(isub + 1); 3220 3221 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3222 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3223 "unsupported hardware checksum " 3224 "sub-capability (version %d, expected %d)", 3225 ihck->hcksum_version, HCKSUM_VERSION_1); 3226 return; 3227 } 3228 3229 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3230 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3231 "checksum capability isn't as expected; pass-thru " 3232 "module(s) detected, discarding capability\n")); 3233 return; 3234 } 3235 3236 #define CURR_HCKSUM_CAPAB \ 3237 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3238 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3239 3240 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3241 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3242 /* do ENABLE processing */ 3243 if (*ill_hcksum == NULL) { 3244 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3245 KM_NOSLEEP); 3246 3247 if (*ill_hcksum == NULL) { 3248 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3249 "could not enable hcksum version %d " 3250 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3251 ill->ill_name); 3252 return; 3253 } 3254 } 3255 3256 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3257 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3258 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3259 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3260 "has enabled hardware checksumming\n ", 3261 ill->ill_name)); 3262 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3263 /* 3264 * Enabling hardware checksum offload 3265 * Currently IP supports {TCP,UDP}/IPv4 3266 * partial and full cksum offload and 3267 * IPv4 header checksum offload. 
3268 * Allocate new mblk which will 3269 * contain a new capability request 3270 * to enable hardware checksum offload. 3271 */ 3272 uint_t size; 3273 uchar_t *rptr; 3274 3275 size = sizeof (dl_capability_req_t) + 3276 sizeof (dl_capability_sub_t) + isub->dl_length; 3277 3278 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3279 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3280 "could not enable hardware cksum for %s (ENOMEM)\n", 3281 ill->ill_name); 3282 return; 3283 } 3284 3285 rptr = nmp->b_rptr; 3286 /* initialize dl_capability_req_t */ 3287 ocap = (dl_capability_req_t *)nmp->b_rptr; 3288 ocap->dl_sub_offset = 3289 sizeof (dl_capability_req_t); 3290 ocap->dl_sub_length = 3291 sizeof (dl_capability_sub_t) + 3292 isub->dl_length; 3293 nmp->b_rptr += sizeof (dl_capability_req_t); 3294 3295 /* initialize dl_capability_sub_t */ 3296 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3297 nmp->b_rptr += sizeof (*isub); 3298 3299 /* initialize dl_capab_hcksum_t */ 3300 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3301 bcopy(ihck, ohck, sizeof (*ihck)); 3302 3303 nmp->b_rptr = rptr; 3304 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3305 3306 /* Set ENABLE flag */ 3307 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3308 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3309 3310 /* 3311 * nmp points to a DL_CAPABILITY_REQ message to enable 3312 * hardware checksum acceleration. 
3313 */ 3314 ill_dlpi_send(ill, nmp); 3315 } else { 3316 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3317 "advertised %x hardware checksum capability flags\n", 3318 ill->ill_name, ihck->hcksum_txflags)); 3319 } 3320 } 3321 3322 static void 3323 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3324 { 3325 mblk_t *mp; 3326 dl_capab_hcksum_t *hck_subcap; 3327 dl_capability_sub_t *dl_subcap; 3328 int size; 3329 3330 if (!ILL_HCKSUM_CAPABLE(ill)) 3331 return; 3332 3333 ASSERT(ill->ill_hcksum_capab != NULL); 3334 /* 3335 * Clear the capability flag for hardware checksum offload but 3336 * retain the ill_hcksum_capab structure since it's possible that 3337 * another thread is still referring to it. The structure only 3338 * gets deallocated when we destroy the ill. 3339 */ 3340 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3341 3342 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3343 3344 mp = allocb(size, BPRI_HI); 3345 if (mp == NULL) { 3346 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3347 "request to disable hardware checksum offload\n")); 3348 return; 3349 } 3350 3351 mp->b_wptr = mp->b_rptr + size; 3352 3353 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3354 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3355 dl_subcap->dl_length = sizeof (*hck_subcap); 3356 3357 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3358 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3359 hck_subcap->hcksum_txflags = 0; 3360 3361 if (*sc_mp != NULL) 3362 linkb(*sc_mp, mp); 3363 else 3364 *sc_mp = mp; 3365 } 3366 3367 static void 3368 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3369 { 3370 mblk_t *nmp = NULL; 3371 dl_capability_req_t *oc; 3372 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3373 ill_zerocopy_capab_t **ill_zerocopy_capab; 3374 uint_t sub_dl_cap = isub->dl_cap; 3375 uint8_t *capend; 3376 3377 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3378 3379 ill_zerocopy_capab = (ill_zerocopy_capab_t 
**)&ill->ill_zerocopy_capab; 3380 3381 /* 3382 * Note: range checks here are not absolutely sufficient to 3383 * make us robust against malformed messages sent by drivers; 3384 * this is in keeping with the rest of IP's dlpi handling. 3385 * (Remember, it's coming from something else in the kernel 3386 * address space) 3387 */ 3388 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3389 if (capend > mp->b_wptr) { 3390 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3391 "malformed sub-capability too long for mblk"); 3392 return; 3393 } 3394 3395 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3396 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3397 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3398 "unsupported ZEROCOPY sub-capability (version %d, " 3399 "expected %d)", zc_ic->zerocopy_version, 3400 ZEROCOPY_VERSION_1); 3401 return; 3402 } 3403 3404 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3405 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3406 "capability isn't as expected; pass-thru module(s) " 3407 "detected, discarding capability\n")); 3408 return; 3409 } 3410 3411 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3412 if (*ill_zerocopy_capab == NULL) { 3413 *ill_zerocopy_capab = 3414 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3415 KM_NOSLEEP); 3416 3417 if (*ill_zerocopy_capab == NULL) { 3418 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3419 "could not enable Zero-copy version %d " 3420 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3421 ill->ill_name); 3422 return; 3423 } 3424 } 3425 3426 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3427 "supports Zero-copy version %d\n", ill->ill_name, 3428 ZEROCOPY_VERSION_1)); 3429 3430 (*ill_zerocopy_capab)->ill_zerocopy_version = 3431 zc_ic->zerocopy_version; 3432 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3433 zc_ic->zerocopy_flags; 3434 3435 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3436 } else { 3437 uint_t size; 3438 uchar_t *rptr; 3439 3440 size = sizeof 
(dl_capability_req_t) + 3441 sizeof (dl_capability_sub_t) + 3442 sizeof (dl_capab_zerocopy_t); 3443 3444 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3445 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3446 "could not enable zerocopy for %s (ENOMEM)\n", 3447 ill->ill_name); 3448 return; 3449 } 3450 3451 rptr = nmp->b_rptr; 3452 /* initialize dl_capability_req_t */ 3453 oc = (dl_capability_req_t *)rptr; 3454 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3455 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3456 sizeof (dl_capab_zerocopy_t); 3457 rptr += sizeof (dl_capability_req_t); 3458 3459 /* initialize dl_capability_sub_t */ 3460 bcopy(isub, rptr, sizeof (*isub)); 3461 rptr += sizeof (*isub); 3462 3463 /* initialize dl_capab_zerocopy_t */ 3464 zc_oc = (dl_capab_zerocopy_t *)rptr; 3465 *zc_oc = *zc_ic; 3466 3467 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3468 "to enable zero-copy version %d\n", ill->ill_name, 3469 ZEROCOPY_VERSION_1)); 3470 3471 /* set VMSAFE_MEM flag */ 3472 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3473 3474 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3475 ill_dlpi_send(ill, nmp); 3476 } 3477 } 3478 3479 static void 3480 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3481 { 3482 mblk_t *mp; 3483 dl_capab_zerocopy_t *zerocopy_subcap; 3484 dl_capability_sub_t *dl_subcap; 3485 int size; 3486 3487 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3488 return; 3489 3490 ASSERT(ill->ill_zerocopy_capab != NULL); 3491 /* 3492 * Clear the capability flag for Zero-copy but retain the 3493 * ill_zerocopy_capab structure since it's possible that another 3494 * thread is still referring to it. The structure only gets 3495 * deallocated when we destroy the ill. 
3496 */ 3497 ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; 3498 3499 size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 3500 3501 mp = allocb(size, BPRI_HI); 3502 if (mp == NULL) { 3503 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " 3504 "request to disable Zero-copy\n")); 3505 return; 3506 } 3507 3508 mp->b_wptr = mp->b_rptr + size; 3509 3510 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3511 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 3512 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 3513 3514 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 3515 zerocopy_subcap->zerocopy_version = 3516 ill->ill_zerocopy_capab->ill_zerocopy_version; 3517 zerocopy_subcap->zerocopy_flags = 0; 3518 3519 if (*sc_mp != NULL) 3520 linkb(*sc_mp, mp); 3521 else 3522 *sc_mp = mp; 3523 } 3524 3525 /* 3526 * Process Large Segment Offload capability negotiation ack received from a 3527 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_LSO) of a 3528 * DL_CAPABILITY_ACK message. 3529 */ 3530 static void 3531 ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3532 { 3533 mblk_t *nmp = NULL; 3534 dl_capability_req_t *oc; 3535 dl_capab_lso_t *lso_ic, *lso_oc; 3536 ill_lso_capab_t **ill_lso_capab; 3537 uint_t sub_dl_cap = isub->dl_cap; 3538 uint8_t *capend; 3539 3540 ASSERT(sub_dl_cap == DL_CAPAB_LSO); 3541 3542 ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab; 3543 3544 /* 3545 * Note: range checks here are not absolutely sufficient to 3546 * make us robust against malformed messages sent by drivers; 3547 * this is in keeping with the rest of IP's dlpi handling. 
3548 * (Remember, it's coming from something else in the kernel 3549 * address space) 3550 */ 3551 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3552 if (capend > mp->b_wptr) { 3553 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3554 "malformed sub-capability too long for mblk"); 3555 return; 3556 } 3557 3558 lso_ic = (dl_capab_lso_t *)(isub + 1); 3559 3560 if (lso_ic->lso_version != LSO_VERSION_1) { 3561 cmn_err(CE_CONT, "ill_capability_lso_ack: " 3562 "unsupported LSO sub-capability (version %d, expected %d)", 3563 lso_ic->lso_version, LSO_VERSION_1); 3564 return; 3565 } 3566 3567 if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) { 3568 ip1dbg(("ill_capability_lso_ack: mid token for LSO " 3569 "capability isn't as expected; pass-thru module(s) " 3570 "detected, discarding capability\n")); 3571 return; 3572 } 3573 3574 if ((lso_ic->lso_flags & LSO_TX_ENABLE) && 3575 (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) { 3576 if (*ill_lso_capab == NULL) { 3577 *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 3578 KM_NOSLEEP); 3579 3580 if (*ill_lso_capab == NULL) { 3581 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3582 "could not enable LSO version %d " 3583 "for %s (ENOMEM)\n", LSO_VERSION_1, 3584 ill->ill_name); 3585 return; 3586 } 3587 } 3588 3589 (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version; 3590 (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags; 3591 (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max; 3592 ill->ill_capabilities |= ILL_CAPAB_LSO; 3593 3594 ip1dbg(("ill_capability_lso_ack: interface %s " 3595 "has enabled LSO\n ", ill->ill_name)); 3596 } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) { 3597 uint_t size; 3598 uchar_t *rptr; 3599 3600 size = sizeof (dl_capability_req_t) + 3601 sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t); 3602 3603 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3604 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3605 "could not enable LSO for %s (ENOMEM)\n", 3606 ill->ill_name); 3607 
return; 3608 } 3609 3610 rptr = nmp->b_rptr; 3611 /* initialize dl_capability_req_t */ 3612 oc = (dl_capability_req_t *)nmp->b_rptr; 3613 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3614 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3615 sizeof (dl_capab_lso_t); 3616 nmp->b_rptr += sizeof (dl_capability_req_t); 3617 3618 /* initialize dl_capability_sub_t */ 3619 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3620 nmp->b_rptr += sizeof (*isub); 3621 3622 /* initialize dl_capab_lso_t */ 3623 lso_oc = (dl_capab_lso_t *)nmp->b_rptr; 3624 bcopy(lso_ic, lso_oc, sizeof (*lso_ic)); 3625 3626 nmp->b_rptr = rptr; 3627 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3628 3629 /* set ENABLE flag */ 3630 lso_oc->lso_flags |= LSO_TX_ENABLE; 3631 3632 /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */ 3633 ill_dlpi_send(ill, nmp); 3634 } else { 3635 ip1dbg(("ill_capability_lso_ack: interface %s has " 3636 "advertised %x LSO capability flags\n", 3637 ill->ill_name, lso_ic->lso_flags)); 3638 } 3639 } 3640 3641 3642 static void 3643 ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp) 3644 { 3645 mblk_t *mp; 3646 dl_capab_lso_t *lso_subcap; 3647 dl_capability_sub_t *dl_subcap; 3648 int size; 3649 3650 if (!(ill->ill_capabilities & ILL_CAPAB_LSO)) 3651 return; 3652 3653 ASSERT(ill->ill_lso_capab != NULL); 3654 /* 3655 * Clear the capability flag for LSO but retain the 3656 * ill_lso_capab structure since it's possible that another 3657 * thread is still referring to it. The structure only gets 3658 * deallocated when we destroy the ill. 
3659 */ 3660 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 3661 3662 size = sizeof (*dl_subcap) + sizeof (*lso_subcap); 3663 3664 mp = allocb(size, BPRI_HI); 3665 if (mp == NULL) { 3666 ip1dbg(("ill_capability_lso_reset: unable to allocate " 3667 "request to disable LSO\n")); 3668 return; 3669 } 3670 3671 mp->b_wptr = mp->b_rptr + size; 3672 3673 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3674 dl_subcap->dl_cap = DL_CAPAB_LSO; 3675 dl_subcap->dl_length = sizeof (*lso_subcap); 3676 3677 lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1); 3678 lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version; 3679 lso_subcap->lso_flags = 0; 3680 3681 if (*sc_mp != NULL) 3682 linkb(*sc_mp, mp); 3683 else 3684 *sc_mp = mp; 3685 } 3686 3687 /* 3688 * Consume a new-style hardware capabilities negotiation ack. 3689 * Called from ip_rput_dlpi_writer(). 3690 */ 3691 void 3692 ill_capability_ack(ill_t *ill, mblk_t *mp) 3693 { 3694 dl_capability_ack_t *capp; 3695 dl_capability_sub_t *subp, *endp; 3696 3697 if (ill->ill_dlpi_capab_state == IDS_INPROGRESS) 3698 ill->ill_dlpi_capab_state = IDS_OK; 3699 3700 capp = (dl_capability_ack_t *)mp->b_rptr; 3701 3702 if (capp->dl_sub_length == 0) 3703 /* no new-style capabilities */ 3704 return; 3705 3706 /* make sure the driver supplied correct dl_sub_length */ 3707 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3708 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3709 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3710 return; 3711 } 3712 3713 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3714 /* 3715 * There are sub-capabilities. Process the ones we know about. 3716 * Loop until we don't have room for another sub-cap header.. 
3717 */ 3718 for (subp = SC(capp, capp->dl_sub_offset), 3719 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3720 subp <= endp; 3721 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3722 3723 switch (subp->dl_cap) { 3724 case DL_CAPAB_ID_WRAPPER: 3725 ill_capability_id_ack(ill, mp, subp); 3726 break; 3727 default: 3728 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3729 break; 3730 } 3731 } 3732 #undef SC 3733 } 3734 3735 /* 3736 * This routine is called to scan the fragmentation reassembly table for 3737 * the specified ILL for any packets that are starting to smell. 3738 * dead_interval is the maximum time in seconds that will be tolerated. It 3739 * will either be the value specified in ip_g_frag_timeout, or zero if the 3740 * ILL is shutting down and it is time to blow everything off. 3741 * 3742 * It returns the number of seconds (as a time_t) that the next frag timer 3743 * should be scheduled for, 0 meaning that the timer doesn't need to be 3744 * re-started. Note that the method of calculating next_timeout isn't 3745 * entirely accurate since time will flow between the time we grab 3746 * current_time and the time we schedule the next timeout. This isn't a 3747 * big problem since this is the timer for sending an ICMP reassembly time 3748 * exceeded messages, and it doesn't have to be exactly accurate. 3749 * 3750 * This function is 3751 * sometimes called as writer, although this is not required. 3752 */ 3753 time_t 3754 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3755 { 3756 ipfb_t *ipfb; 3757 ipfb_t *endp; 3758 ipf_t *ipf; 3759 ipf_t *ipfnext; 3760 mblk_t *mp; 3761 time_t current_time = gethrestime_sec(); 3762 time_t next_timeout = 0; 3763 uint32_t hdr_length; 3764 mblk_t *send_icmp_head; 3765 mblk_t *send_icmp_head_v6; 3766 zoneid_t zoneid; 3767 3768 ipfb = ill->ill_frag_hash_tbl; 3769 if (ipfb == NULL) 3770 return (B_FALSE); 3771 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3772 /* Walk the frag hash table. 
*/ 3773 for (; ipfb < endp; ipfb++) { 3774 send_icmp_head = NULL; 3775 send_icmp_head_v6 = NULL; 3776 mutex_enter(&ipfb->ipfb_lock); 3777 while ((ipf = ipfb->ipfb_ipf) != 0) { 3778 time_t frag_time = current_time - ipf->ipf_timestamp; 3779 time_t frag_timeout; 3780 3781 if (frag_time < dead_interval) { 3782 /* 3783 * There are some outstanding fragments 3784 * that will timeout later. Make note of 3785 * the time so that we can reschedule the 3786 * next timeout appropriately. 3787 */ 3788 frag_timeout = dead_interval - frag_time; 3789 if (next_timeout == 0 || 3790 frag_timeout < next_timeout) { 3791 next_timeout = frag_timeout; 3792 } 3793 break; 3794 } 3795 /* Time's up. Get it out of here. */ 3796 hdr_length = ipf->ipf_nf_hdr_len; 3797 ipfnext = ipf->ipf_hash_next; 3798 if (ipfnext) 3799 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3800 *ipf->ipf_ptphn = ipfnext; 3801 mp = ipf->ipf_mp->b_cont; 3802 for (; mp; mp = mp->b_cont) { 3803 /* Extra points for neatness. */ 3804 IP_REASS_SET_START(mp, 0); 3805 IP_REASS_SET_END(mp, 0); 3806 } 3807 mp = ipf->ipf_mp->b_cont; 3808 ill->ill_frag_count -= ipf->ipf_count; 3809 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3810 ipfb->ipfb_count -= ipf->ipf_count; 3811 ASSERT(ipfb->ipfb_frag_pkts > 0); 3812 ipfb->ipfb_frag_pkts--; 3813 /* 3814 * We do not send any icmp message from here because 3815 * we currently are holding the ipfb_lock for this 3816 * hash chain. If we try and send any icmp messages 3817 * from here we may end up via a put back into ip 3818 * trying to get the same lock, causing a recursive 3819 * mutex panic. Instead we build a list and send all 3820 * the icmp messages after we have dropped the lock. 
3821 */ 3822 if (ill->ill_isv6) { 3823 if (hdr_length != 0) { 3824 mp->b_next = send_icmp_head_v6; 3825 send_icmp_head_v6 = mp; 3826 } else { 3827 freemsg(mp); 3828 } 3829 } else { 3830 if (hdr_length != 0) { 3831 mp->b_next = send_icmp_head; 3832 send_icmp_head = mp; 3833 } else { 3834 freemsg(mp); 3835 } 3836 } 3837 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3838 freeb(ipf->ipf_mp); 3839 } 3840 mutex_exit(&ipfb->ipfb_lock); 3841 /* 3842 * Now need to send any icmp messages that we delayed from 3843 * above. 3844 */ 3845 while (send_icmp_head_v6 != NULL) { 3846 ip6_t *ip6h; 3847 3848 mp = send_icmp_head_v6; 3849 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3850 mp->b_next = NULL; 3851 if (mp->b_datap->db_type == M_CTL) 3852 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3853 else 3854 ip6h = (ip6_t *)mp->b_rptr; 3855 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3856 ill); 3857 if (zoneid == ALL_ZONES) { 3858 freemsg(mp); 3859 } else { 3860 icmp_time_exceeded_v6(ill->ill_wq, mp, 3861 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3862 B_FALSE, zoneid); 3863 } 3864 } 3865 while (send_icmp_head != NULL) { 3866 ipaddr_t dst; 3867 3868 mp = send_icmp_head; 3869 send_icmp_head = send_icmp_head->b_next; 3870 mp->b_next = NULL; 3871 3872 if (mp->b_datap->db_type == M_CTL) 3873 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3874 else 3875 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3876 3877 zoneid = ipif_lookup_addr_zoneid(dst, ill); 3878 if (zoneid == ALL_ZONES) { 3879 freemsg(mp); 3880 } else { 3881 icmp_time_exceeded(ill->ill_wq, mp, 3882 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid); 3883 } 3884 } 3885 } 3886 /* 3887 * A non-dying ILL will use the return value to decide whether to 3888 * restart the frag timer, and for how long. 3889 */ 3890 return (next_timeout); 3891 } 3892 3893 /* 3894 * This routine is called when the approximate count of mblk memory used 3895 * for the specified ILL has exceeded max_count. 
3896 */ 3897 void 3898 ill_frag_prune(ill_t *ill, uint_t max_count) 3899 { 3900 ipfb_t *ipfb; 3901 ipf_t *ipf; 3902 size_t count; 3903 3904 /* 3905 * If we are here within ip_min_frag_prune_time msecs remove 3906 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3907 * ill_frag_free_num_pkts. 3908 */ 3909 mutex_enter(&ill->ill_lock); 3910 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3911 (ip_min_frag_prune_time != 0 ? 3912 ip_min_frag_prune_time : msec_per_tick)) { 3913 3914 ill->ill_frag_free_num_pkts++; 3915 3916 } else { 3917 ill->ill_frag_free_num_pkts = 0; 3918 } 3919 ill->ill_last_frag_clean_time = lbolt; 3920 mutex_exit(&ill->ill_lock); 3921 3922 /* 3923 * free ill_frag_free_num_pkts oldest packets from each bucket. 3924 */ 3925 if (ill->ill_frag_free_num_pkts != 0) { 3926 int ix; 3927 3928 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3929 ipfb = &ill->ill_frag_hash_tbl[ix]; 3930 mutex_enter(&ipfb->ipfb_lock); 3931 if (ipfb->ipfb_ipf != NULL) { 3932 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3933 ill->ill_frag_free_num_pkts); 3934 } 3935 mutex_exit(&ipfb->ipfb_lock); 3936 } 3937 } 3938 /* 3939 * While the reassembly list for this ILL is too big, prune a fragment 3940 * queue by age, oldest first. Note that the per ILL count is 3941 * approximate, while the per frag hash bucket counts are accurate. 
3942 */ 3943 while (ill->ill_frag_count > max_count) { 3944 int ix; 3945 ipfb_t *oipfb = NULL; 3946 uint_t oldest = UINT_MAX; 3947 3948 count = 0; 3949 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3950 ipfb = &ill->ill_frag_hash_tbl[ix]; 3951 mutex_enter(&ipfb->ipfb_lock); 3952 ipf = ipfb->ipfb_ipf; 3953 if (ipf != NULL && ipf->ipf_gen < oldest) { 3954 oldest = ipf->ipf_gen; 3955 oipfb = ipfb; 3956 } 3957 count += ipfb->ipfb_count; 3958 mutex_exit(&ipfb->ipfb_lock); 3959 } 3960 /* Refresh the per ILL count */ 3961 ill->ill_frag_count = count; 3962 if (oipfb == NULL) { 3963 ill->ill_frag_count = 0; 3964 break; 3965 } 3966 if (count <= max_count) 3967 return; /* Somebody beat us to it, nothing to do */ 3968 mutex_enter(&oipfb->ipfb_lock); 3969 ipf = oipfb->ipfb_ipf; 3970 if (ipf != NULL) { 3971 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3972 } 3973 mutex_exit(&oipfb->ipfb_lock); 3974 } 3975 } 3976 3977 /* 3978 * free 'free_cnt' fragmented packets starting at ipf. 3979 */ 3980 void 3981 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3982 { 3983 size_t count; 3984 mblk_t *mp; 3985 mblk_t *tmp; 3986 ipf_t **ipfp = ipf->ipf_ptphn; 3987 3988 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3989 ASSERT(ipfp != NULL); 3990 ASSERT(ipf != NULL); 3991 3992 while (ipf != NULL && free_cnt-- > 0) { 3993 count = ipf->ipf_count; 3994 mp = ipf->ipf_mp; 3995 ipf = ipf->ipf_hash_next; 3996 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3997 IP_REASS_SET_START(tmp, 0); 3998 IP_REASS_SET_END(tmp, 0); 3999 } 4000 ill->ill_frag_count -= count; 4001 ASSERT(ipfb->ipfb_count >= count); 4002 ipfb->ipfb_count -= count; 4003 ASSERT(ipfb->ipfb_frag_pkts > 0); 4004 ipfb->ipfb_frag_pkts--; 4005 freemsg(mp); 4006 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 4007 } 4008 4009 if (ipf) 4010 ipf->ipf_ptphn = ipfp; 4011 ipfp[0] = ipf; 4012 } 4013 4014 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 4015 "obsolete and may be removed in a future release of Solaris. 
Use " \ 4016 "ifconfig(1M) to manipulate the forwarding status of an interface." 4017 4018 /* 4019 * For obsolete per-interface forwarding configuration; 4020 * called in response to ND_GET. 4021 */ 4022 /* ARGSUSED */ 4023 static int 4024 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 4025 { 4026 ill_t *ill = (ill_t *)cp; 4027 4028 cmn_err(CE_WARN, ND_FORWARD_WARNING); 4029 4030 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 4031 return (0); 4032 } 4033 4034 /* 4035 * For obsolete per-interface forwarding configuration; 4036 * called in response to ND_SET. 4037 */ 4038 /* ARGSUSED */ 4039 static int 4040 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 4041 cred_t *ioc_cr) 4042 { 4043 long value; 4044 int retval; 4045 4046 cmn_err(CE_WARN, ND_FORWARD_WARNING); 4047 4048 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 4049 value < 0 || value > 1) { 4050 return (EINVAL); 4051 } 4052 4053 rw_enter(&ill_g_lock, RW_READER); 4054 retval = ill_forward_set(q, mp, (value != 0), cp); 4055 rw_exit(&ill_g_lock); 4056 return (retval); 4057 } 4058 4059 /* 4060 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 4061 * IPMP group, make sure all ill's in the group adopt the new policy. Send 4062 * up RTS_IFINFO routing socket messages for each interface whose flags we 4063 * change. 4064 */ 4065 /* ARGSUSED */ 4066 int 4067 ill_forward_set(queue_t *q, mblk_t *mp, boolean_t enable, caddr_t cp) 4068 { 4069 ill_t *ill = (ill_t *)cp; 4070 ill_group_t *illgrp; 4071 4072 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ill_g_lock)); 4073 4074 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 4075 (!enable && !(ill->ill_flags & ILLF_ROUTER)) || 4076 (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK)) 4077 return (EINVAL); 4078 4079 /* 4080 * If the ill is in an IPMP group, set the forwarding policy on all 4081 * members of the group to the same value. 
4082 */ 4083 illgrp = ill->ill_group; 4084 if (illgrp != NULL) { 4085 ill_t *tmp_ill; 4086 4087 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 4088 tmp_ill = tmp_ill->ill_group_next) { 4089 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 4090 (enable ? "Enabling" : "Disabling"), 4091 (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), 4092 tmp_ill->ill_name)); 4093 mutex_enter(&tmp_ill->ill_lock); 4094 if (enable) 4095 tmp_ill->ill_flags |= ILLF_ROUTER; 4096 else 4097 tmp_ill->ill_flags &= ~ILLF_ROUTER; 4098 mutex_exit(&tmp_ill->ill_lock); 4099 if (tmp_ill->ill_isv6) 4100 ill_set_nce_router_flags(tmp_ill, enable); 4101 /* Notify routing socket listeners of this change. */ 4102 ip_rts_ifmsg(tmp_ill->ill_ipif); 4103 } 4104 } else { 4105 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 4106 (enable ? "Enabling" : "Disabling"), 4107 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 4108 mutex_enter(&ill->ill_lock); 4109 if (enable) 4110 ill->ill_flags |= ILLF_ROUTER; 4111 else 4112 ill->ill_flags &= ~ILLF_ROUTER; 4113 mutex_exit(&ill->ill_lock); 4114 if (ill->ill_isv6) 4115 ill_set_nce_router_flags(ill, enable); 4116 /* Notify routing socket listeners of this change. */ 4117 ip_rts_ifmsg(ill->ill_ipif); 4118 } 4119 4120 return (0); 4121 } 4122 4123 /* 4124 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 4125 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 4126 * set or clear. 
4127 */ 4128 static void 4129 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 4130 { 4131 ipif_t *ipif; 4132 nce_t *nce; 4133 4134 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4135 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 4136 if (nce != NULL) { 4137 mutex_enter(&nce->nce_lock); 4138 if (enable) 4139 nce->nce_flags |= NCE_F_ISROUTER; 4140 else 4141 nce->nce_flags &= ~NCE_F_ISROUTER; 4142 mutex_exit(&nce->nce_lock); 4143 NCE_REFRELE(nce); 4144 } 4145 } 4146 } 4147 4148 /* 4149 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 4150 * for this ill. Make sure the v6/v4 question has been answered about this 4151 * ill. The creation of this ndd variable is only for backwards compatibility. 4152 * The preferred way to control per-interface IP forwarding is through the 4153 * ILLF_ROUTER interface flag. 4154 */ 4155 static int 4156 ill_set_ndd_name(ill_t *ill) 4157 { 4158 char *suffix; 4159 4160 ASSERT(IAM_WRITER_ILL(ill)); 4161 4162 if (ill->ill_isv6) 4163 suffix = ipv6_forward_suffix; 4164 else 4165 suffix = ipv4_forward_suffix; 4166 4167 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 4168 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 4169 /* 4170 * Copies over the '\0'. 4171 * Note that strlen(suffix) is always bounded. 4172 */ 4173 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 4174 strlen(suffix) + 1); 4175 4176 /* 4177 * Use of the nd table requires holding the reader lock. 4178 * Modifying the nd table thru nd_load/nd_unload requires 4179 * the writer lock. 4180 */ 4181 rw_enter(&ip_g_nd_lock, RW_WRITER); 4182 if (!nd_load(&ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 4183 nd_ill_forward_set, (caddr_t)ill)) { 4184 /* 4185 * If the nd_load failed, it only meant that it could not 4186 * allocate a new bunch of room for further NDD expansion. 
4187 * Because of that, the ill_ndd_name will be set to 0, and 4188 * this interface is at the mercy of the global ip_forwarding 4189 * variable. 4190 */ 4191 rw_exit(&ip_g_nd_lock); 4192 ill->ill_ndd_name = NULL; 4193 return (ENOMEM); 4194 } 4195 rw_exit(&ip_g_nd_lock); 4196 return (0); 4197 } 4198 4199 /* 4200 * Intializes the context structure and returns the first ill in the list 4201 * cuurently start_list and end_list can have values: 4202 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 4203 * IP_V4_G_HEAD Traverse IPV4 list only. 4204 * IP_V6_G_HEAD Traverse IPV6 list only. 4205 */ 4206 4207 /* 4208 * We don't check for CONDEMNED ills here. Caller must do that if 4209 * necessary under the ill lock. 4210 */ 4211 ill_t * 4212 ill_first(int start_list, int end_list, ill_walk_context_t *ctx) 4213 { 4214 ill_if_t *ifp; 4215 ill_t *ill; 4216 avl_tree_t *avl_tree; 4217 4218 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4219 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 4220 4221 /* 4222 * setup the lists to search 4223 */ 4224 if (end_list != MAX_G_HEADS) { 4225 ctx->ctx_current_list = start_list; 4226 ctx->ctx_last_list = end_list; 4227 } else { 4228 ctx->ctx_last_list = MAX_G_HEADS - 1; 4229 ctx->ctx_current_list = 0; 4230 } 4231 4232 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 4233 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 4234 if (ifp != (ill_if_t *) 4235 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 4236 avl_tree = &ifp->illif_avl_by_ppa; 4237 ill = avl_first(avl_tree); 4238 /* 4239 * ill is guaranteed to be non NULL or ifp should have 4240 * not existed. 4241 */ 4242 ASSERT(ill != NULL); 4243 return (ill); 4244 } 4245 ctx->ctx_current_list++; 4246 } 4247 4248 return (NULL); 4249 } 4250 4251 /* 4252 * returns the next ill in the list. ill_first() must have been called 4253 * before calling ill_next() or bad things will happen. 4254 */ 4255 4256 /* 4257 * We don't check for CONDEMNED ills here. 
Caller must do that if 4258 * necessary under the ill lock. 4259 */ 4260 ill_t * 4261 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 4262 { 4263 ill_if_t *ifp; 4264 ill_t *ill; 4265 4266 4267 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 4268 ASSERT(lastill->ill_ifptr != (ill_if_t *) 4269 &IP_VX_ILL_G_LIST(ctx->ctx_current_list)); 4270 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 4271 AVL_AFTER)) != NULL) { 4272 return (ill); 4273 } 4274 4275 /* goto next ill_ifp in the list. */ 4276 ifp = lastill->ill_ifptr->illif_next; 4277 4278 /* make sure not at end of circular list */ 4279 while (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list)) { 4280 if (++ctx->ctx_current_list > ctx->ctx_last_list) 4281 return (NULL); 4282 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list); 4283 } 4284 4285 return (avl_first(&ifp->illif_avl_by_ppa)); 4286 } 4287 4288 /* 4289 * Check interface name for correct format which is name+ppa. 4290 * name can contain characters and digits, the right most digits 4291 * make up the ppa number. use of octal is not allowed, name must contain 4292 * a ppa, return pointer to the start of ppa. 4293 * In case of error return NULL. 4294 */ 4295 static char * 4296 ill_get_ppa_ptr(char *name) 4297 { 4298 int namelen = mi_strlen(name); 4299 4300 int len = namelen; 4301 4302 name += len; 4303 while (len > 0) { 4304 name--; 4305 if (*name < '0' || *name > '9') 4306 break; 4307 len--; 4308 } 4309 4310 /* empty string, all digits, or no trailing digits */ 4311 if (len == 0 || len == (int)namelen) 4312 return (NULL); 4313 4314 name++; 4315 /* check for attempted use of octal */ 4316 if (*name == '0' && len != (int)namelen - 1) 4317 return (NULL); 4318 return (name); 4319 } 4320 4321 /* 4322 * use avl tree to locate the ill. 
4323 */ 4324 static ill_t * 4325 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4326 ipsq_func_t func, int *error) 4327 { 4328 char *ppa_ptr = NULL; 4329 int len; 4330 uint_t ppa; 4331 ill_t *ill = NULL; 4332 ill_if_t *ifp; 4333 int list; 4334 ipsq_t *ipsq; 4335 4336 if (error != NULL) 4337 *error = 0; 4338 4339 /* 4340 * get ppa ptr 4341 */ 4342 if (isv6) 4343 list = IP_V6_G_HEAD; 4344 else 4345 list = IP_V4_G_HEAD; 4346 4347 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4348 if (error != NULL) 4349 *error = ENXIO; 4350 return (NULL); 4351 } 4352 4353 len = ppa_ptr - name + 1; 4354 4355 ppa = stoi(&ppa_ptr); 4356 4357 ifp = IP_VX_ILL_G_LIST(list); 4358 4359 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4360 /* 4361 * match is done on len - 1 as the name is not null 4362 * terminated it contains ppa in addition to the interface 4363 * name. 4364 */ 4365 if ((ifp->illif_name_len == len) && 4366 bcmp(ifp->illif_name, name, len - 1) == 0) { 4367 break; 4368 } else { 4369 ifp = ifp->illif_next; 4370 } 4371 } 4372 4373 4374 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list)) { 4375 /* 4376 * Even the interface type does not exist. 
4377 */ 4378 if (error != NULL) 4379 *error = ENXIO; 4380 return (NULL); 4381 } 4382 4383 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4384 if (ill != NULL) { 4385 /* 4386 * The block comment at the start of ipif_down 4387 * explains the use of the macros used below 4388 */ 4389 GRAB_CONN_LOCK(q); 4390 mutex_enter(&ill->ill_lock); 4391 if (ILL_CAN_LOOKUP(ill)) { 4392 ill_refhold_locked(ill); 4393 mutex_exit(&ill->ill_lock); 4394 RELEASE_CONN_LOCK(q); 4395 return (ill); 4396 } else if (ILL_CAN_WAIT(ill, q)) { 4397 ipsq = ill->ill_phyint->phyint_ipsq; 4398 mutex_enter(&ipsq->ipsq_lock); 4399 mutex_exit(&ill->ill_lock); 4400 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4401 mutex_exit(&ipsq->ipsq_lock); 4402 RELEASE_CONN_LOCK(q); 4403 *error = EINPROGRESS; 4404 return (NULL); 4405 } 4406 mutex_exit(&ill->ill_lock); 4407 RELEASE_CONN_LOCK(q); 4408 } 4409 if (error != NULL) 4410 *error = ENXIO; 4411 return (NULL); 4412 } 4413 4414 /* 4415 * comparison function for use with avl. 4416 */ 4417 static int 4418 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4419 { 4420 uint_t ppa; 4421 uint_t ill_ppa; 4422 4423 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4424 4425 ppa = *((uint_t *)ppa_ptr); 4426 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4427 /* 4428 * We want the ill with the lowest ppa to be on the 4429 * top. 4430 */ 4431 if (ill_ppa < ppa) 4432 return (1); 4433 if (ill_ppa > ppa) 4434 return (-1); 4435 return (0); 4436 } 4437 4438 /* 4439 * remove an interface type from the global list. 
4440 */ 4441 static void 4442 ill_delete_interface_type(ill_if_t *interface) 4443 { 4444 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4445 4446 ASSERT(interface != NULL); 4447 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4448 4449 avl_destroy(&interface->illif_avl_by_ppa); 4450 if (interface->illif_ppa_arena != NULL) 4451 vmem_destroy(interface->illif_ppa_arena); 4452 4453 remque(interface); 4454 4455 mi_free(interface); 4456 } 4457 4458 /* Defined in ip_netinfo.c */ 4459 extern ddi_taskq_t *eventq_queue_nic; 4460 4461 /* 4462 * remove ill from the global list. 4463 */ 4464 static void 4465 ill_glist_delete(ill_t *ill) 4466 { 4467 char *nicname; 4468 size_t nicnamelen; 4469 hook_nic_event_t *info; 4470 4471 if (ill == NULL) 4472 return; 4473 4474 rw_enter(&ill_g_lock, RW_WRITER); 4475 4476 if (ill->ill_name != NULL) { 4477 nicname = kmem_alloc(ill->ill_name_length, KM_NOSLEEP); 4478 if (nicname != NULL) { 4479 bcopy(ill->ill_name, nicname, ill->ill_name_length); 4480 nicnamelen = ill->ill_name_length; 4481 } 4482 } else { 4483 nicname = NULL; 4484 nicnamelen = 0; 4485 } 4486 4487 /* 4488 * If the ill was never inserted into the AVL tree 4489 * we skip the if branch. 4490 */ 4491 if (ill->ill_ifptr != NULL) { 4492 /* 4493 * remove from AVL tree and free ppa number 4494 */ 4495 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4496 4497 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4498 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4499 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4500 } 4501 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4502 ill_delete_interface_type(ill->ill_ifptr); 4503 } 4504 4505 /* 4506 * Indicate ill is no longer in the list. 4507 */ 4508 ill->ill_ifptr = NULL; 4509 ill->ill_name_length = 0; 4510 ill->ill_name[0] = '\0'; 4511 ill->ill_ppa = UINT_MAX; 4512 } 4513 4514 /* 4515 * Run the unplumb hook after the NIC has disappeared from being 4516 * visible so that attempts to revalidate its existance will fail. 
4517 * 4518 * This needs to be run inside the ill_g_lock perimeter to ensure 4519 * that the ordering of delivered events to listeners matches the 4520 * order of them in the kernel. 4521 */ 4522 if ((info = ill->ill_nic_event_info) != NULL) { 4523 if (info->hne_event != NE_DOWN) { 4524 ip2dbg(("ill_glist_delete: unexpected nic event %d " 4525 "attached for %s\n", info->hne_event, 4526 ill->ill_name)); 4527 if (info->hne_data != NULL) 4528 kmem_free(info->hne_data, info->hne_datalen); 4529 kmem_free(info, sizeof (hook_nic_event_t)); 4530 } else { 4531 if (ddi_taskq_dispatch(eventq_queue_nic, 4532 ip_ne_queue_func, (void *)info, DDI_SLEEP) 4533 == DDI_FAILURE) { 4534 ip2dbg(("ill_glist_delete: ddi_taskq_dispatch " 4535 "failed\n")); 4536 if (info->hne_data != NULL) 4537 kmem_free(info->hne_data, 4538 info->hne_datalen); 4539 kmem_free(info, sizeof (hook_nic_event_t)); 4540 } 4541 } 4542 } 4543 4544 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 4545 if (info != NULL) { 4546 info->hne_nic = ill->ill_phyint->phyint_ifindex; 4547 info->hne_lif = 0; 4548 info->hne_event = NE_UNPLUMB; 4549 info->hne_data = nicname; 4550 info->hne_datalen = nicnamelen; 4551 info->hne_family = ill->ill_isv6 ? ipv6 : ipv4; 4552 } else { 4553 ip2dbg(("ill_glist_delete: could not attach UNPLUMB nic event " 4554 "information for %s (ENOMEM)\n", ill->ill_name)); 4555 if (nicname != NULL) 4556 kmem_free(nicname, nicnamelen); 4557 } 4558 4559 ill->ill_nic_event_info = info; 4560 4561 ill_phyint_free(ill); 4562 4563 rw_exit(&ill_g_lock); 4564 } 4565 4566 /* 4567 * allocate a ppa, if the number of plumbed interfaces of this type are 4568 * less than ill_no_arena do a linear search to find a unused ppa. 4569 * When the number goes beyond ill_no_arena switch to using an arena. 4570 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4571 * is the return value for an error condition, so allocation starts at one 4572 * and is decremented by one. 
4573 */ 4574 static int 4575 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4576 { 4577 ill_t *tmp_ill; 4578 uint_t start, end; 4579 int ppa; 4580 4581 if (ifp->illif_ppa_arena == NULL && 4582 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4583 /* 4584 * Create an arena. 4585 */ 4586 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4587 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4588 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4589 /* allocate what has already been assigned */ 4590 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4591 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4592 tmp_ill, AVL_AFTER)) { 4593 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4594 1, /* size */ 4595 1, /* align/quantum */ 4596 0, /* phase */ 4597 0, /* nocross */ 4598 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), /* minaddr */ 4599 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), /* maxaddr */ 4600 VM_NOSLEEP|VM_FIRSTFIT); 4601 if (ppa == 0) { 4602 ip1dbg(("ill_alloc_ppa: ppa allocation" 4603 " failed while switching")); 4604 vmem_destroy(ifp->illif_ppa_arena); 4605 ifp->illif_ppa_arena = NULL; 4606 break; 4607 } 4608 } 4609 } 4610 4611 if (ifp->illif_ppa_arena != NULL) { 4612 if (ill->ill_ppa == UINT_MAX) { 4613 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4614 1, VM_NOSLEEP|VM_FIRSTFIT); 4615 if (ppa == 0) 4616 return (EAGAIN); 4617 ill->ill_ppa = --ppa; 4618 } else { 4619 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4620 1, /* size */ 4621 1, /* align/quantum */ 4622 0, /* phase */ 4623 0, /* nocross */ 4624 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4625 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4626 VM_NOSLEEP|VM_FIRSTFIT); 4627 /* 4628 * Most likely the allocation failed because 4629 * the requested ppa was in use. 4630 */ 4631 if (ppa == 0) 4632 return (EEXIST); 4633 } 4634 return (0); 4635 } 4636 4637 /* 4638 * No arena is in use and not enough (>ill_no_arena) interfaces have 4639 * been plumbed to create one. 
Do a linear search to get a unused ppa. 4640 */ 4641 if (ill->ill_ppa == UINT_MAX) { 4642 end = UINT_MAX - 1; 4643 start = 0; 4644 } else { 4645 end = start = ill->ill_ppa; 4646 } 4647 4648 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4649 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4650 if (start++ >= end) { 4651 if (ill->ill_ppa == UINT_MAX) 4652 return (EAGAIN); 4653 else 4654 return (EEXIST); 4655 } 4656 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4657 } 4658 ill->ill_ppa = start; 4659 return (0); 4660 } 4661 4662 /* 4663 * Insert ill into the list of configured ill's. Once this function completes, 4664 * the ill is globally visible and is available through lookups. More precisely 4665 * this happens after the caller drops the ill_g_lock. 4666 */ 4667 static int 4668 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4669 { 4670 ill_if_t *ill_interface; 4671 avl_index_t where = 0; 4672 int error; 4673 int name_length; 4674 int index; 4675 boolean_t check_length = B_FALSE; 4676 4677 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 4678 4679 name_length = mi_strlen(name) + 1; 4680 4681 if (isv6) 4682 index = IP_V6_G_HEAD; 4683 else 4684 index = IP_V4_G_HEAD; 4685 4686 ill_interface = IP_VX_ILL_G_LIST(index); 4687 /* 4688 * Search for interface type based on name 4689 */ 4690 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4691 if ((ill_interface->illif_name_len == name_length) && 4692 (strcmp(ill_interface->illif_name, name) == 0)) { 4693 break; 4694 } 4695 ill_interface = ill_interface->illif_next; 4696 } 4697 4698 /* 4699 * Interface type not found, create one. 
4700 */ 4701 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index)) { 4702 4703 ill_g_head_t ghead; 4704 4705 /* 4706 * allocate ill_if_t structure 4707 */ 4708 4709 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4710 if (ill_interface == NULL) { 4711 return (ENOMEM); 4712 } 4713 4714 4715 4716 (void) strcpy(ill_interface->illif_name, name); 4717 ill_interface->illif_name_len = name_length; 4718 4719 avl_create(&ill_interface->illif_avl_by_ppa, 4720 ill_compare_ppa, sizeof (ill_t), 4721 offsetof(struct ill_s, ill_avl_byppa)); 4722 4723 /* 4724 * link the structure in the back to maintain order 4725 * of configuration for ifconfig output. 4726 */ 4727 ghead = ill_g_heads[index]; 4728 insque(ill_interface, ghead.ill_g_list_tail); 4729 4730 } 4731 4732 if (ill->ill_ppa == UINT_MAX) 4733 check_length = B_TRUE; 4734 4735 error = ill_alloc_ppa(ill_interface, ill); 4736 if (error != 0) { 4737 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4738 ill_delete_interface_type(ill->ill_ifptr); 4739 return (error); 4740 } 4741 4742 /* 4743 * When the ppa is choosen by the system, check that there is 4744 * enough space to insert ppa. if a specific ppa was passed in this 4745 * check is not required as the interface name passed in will have 4746 * the right ppa in it. 4747 */ 4748 if (check_length) { 4749 /* 4750 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4751 */ 4752 char buf[sizeof (uint_t) * 3]; 4753 4754 /* 4755 * convert ppa to string to calculate the amount of space 4756 * required for it in the name. 4757 */ 4758 numtos(ill->ill_ppa, buf); 4759 4760 /* Do we have enough space to insert ppa ? 
*/ 4761 4762 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4763 /* Free ppa and interface type struct */ 4764 if (ill_interface->illif_ppa_arena != NULL) { 4765 vmem_free(ill_interface->illif_ppa_arena, 4766 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4767 } 4768 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4769 0) { 4770 ill_delete_interface_type(ill->ill_ifptr); 4771 } 4772 4773 return (EINVAL); 4774 } 4775 } 4776 4777 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4778 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4779 4780 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4781 &where); 4782 ill->ill_ifptr = ill_interface; 4783 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4784 4785 ill_phyint_reinit(ill); 4786 return (0); 4787 } 4788 4789 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4790 static boolean_t 4791 ipsq_init(ill_t *ill) 4792 { 4793 ipsq_t *ipsq; 4794 4795 /* Init the ipsq and impicitly enter as writer */ 4796 ill->ill_phyint->phyint_ipsq = 4797 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4798 if (ill->ill_phyint->phyint_ipsq == NULL) 4799 return (B_FALSE); 4800 ipsq = ill->ill_phyint->phyint_ipsq; 4801 ipsq->ipsq_phyint_list = ill->ill_phyint; 4802 ill->ill_phyint->phyint_ipsq_next = NULL; 4803 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4804 ipsq->ipsq_refs = 1; 4805 ipsq->ipsq_writer = curthread; 4806 ipsq->ipsq_reentry_cnt = 1; 4807 #ifdef ILL_DEBUG 4808 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, IP_STACK_DEPTH); 4809 #endif 4810 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4811 return (B_TRUE); 4812 } 4813 4814 /* 4815 * ill_init is called by ip_open when a device control stream is opened. 4816 * It does a few initializations, and shoots a DL_INFO_REQ message down 4817 * to the driver. The response is later picked up in ip_rput_dlpi and 4818 * used to set up default mechanisms for talking to the driver. 
(Always 4819 * called as writer.) 4820 * 4821 * If this function returns error, ip_open will call ip_close which in 4822 * turn will call ill_delete to clean up any memory allocated here that 4823 * is not yet freed. 4824 */ 4825 int 4826 ill_init(queue_t *q, ill_t *ill) 4827 { 4828 int count; 4829 dl_info_req_t *dlir; 4830 mblk_t *info_mp; 4831 uchar_t *frag_ptr; 4832 4833 /* 4834 * The ill is initialized to zero by mi_alloc*(). In addition 4835 * some fields already contain valid values, initialized in 4836 * ip_open(), before we reach here. 4837 */ 4838 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4839 4840 ill->ill_rq = q; 4841 ill->ill_wq = WR(q); 4842 4843 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4844 BPRI_HI); 4845 if (info_mp == NULL) 4846 return (ENOMEM); 4847 4848 /* 4849 * Allocate sufficient space to contain our fragment hash table and 4850 * the device name. 4851 */ 4852 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4853 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4854 if (frag_ptr == NULL) { 4855 freemsg(info_mp); 4856 return (ENOMEM); 4857 } 4858 ill->ill_frag_ptr = frag_ptr; 4859 ill->ill_frag_free_num_pkts = 0; 4860 ill->ill_last_frag_clean_time = 0; 4861 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4862 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4863 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4864 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4865 NULL, MUTEX_DEFAULT, NULL); 4866 } 4867 4868 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4869 if (ill->ill_phyint == NULL) { 4870 freemsg(info_mp); 4871 mi_free(frag_ptr); 4872 return (ENOMEM); 4873 } 4874 4875 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4876 /* 4877 * For now pretend this is a v4 ill. We need to set phyint_ill* 4878 * at this point because of the following reason. 
If we can't 4879 * enter the ipsq at some point and cv_wait, the writer that 4880 * wakes us up tries to locate us using the list of all phyints 4881 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4882 * If we don't set it now, we risk a missed wakeup. 4883 */ 4884 ill->ill_phyint->phyint_illv4 = ill; 4885 ill->ill_ppa = UINT_MAX; 4886 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4887 4888 if (!ipsq_init(ill)) { 4889 freemsg(info_mp); 4890 mi_free(frag_ptr); 4891 mi_free(ill->ill_phyint); 4892 return (ENOMEM); 4893 } 4894 4895 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4896 4897 4898 /* Frag queue limit stuff */ 4899 ill->ill_frag_count = 0; 4900 ill->ill_ipf_gen = 0; 4901 4902 ill->ill_global_timer = INFINITY; 4903 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 4904 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4905 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4906 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4907 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4908 4909 /* 4910 * Initialize IPv6 configuration variables. The IP module is always 4911 * opened as an IPv4 module. Instead tracking down the cases where 4912 * it switches to do ipv6, we'll just initialize the IPv6 configuration 4913 * here for convenience, this has no effect until the ill is set to do 4914 * IPv6. 4915 */ 4916 ill->ill_reachable_time = ND_REACHABLE_TIME; 4917 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4918 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4919 ill->ill_max_buf = ND_MAX_Q; 4920 ill->ill_refcnt = 0; 4921 4922 /* Send down the Info Request to the driver. 
*/ 4923 info_mp->b_datap->db_type = M_PCPROTO; 4924 dlir = (dl_info_req_t *)info_mp->b_rptr; 4925 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4926 dlir->dl_primitive = DL_INFO_REQ; 4927 4928 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4929 4930 qprocson(q); 4931 ill_dlpi_send(ill, info_mp); 4932 4933 return (0); 4934 } 4935 4936 /* 4937 * ill_dls_info 4938 * creates datalink socket info from the device. 4939 */ 4940 int 4941 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4942 { 4943 size_t len; 4944 ill_t *ill = ipif->ipif_ill; 4945 4946 sdl->sdl_family = AF_LINK; 4947 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4948 sdl->sdl_type = ill->ill_type; 4949 (void) ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4950 len = strlen(sdl->sdl_data); 4951 ASSERT(len < 256); 4952 sdl->sdl_nlen = (uchar_t)len; 4953 sdl->sdl_alen = ill->ill_phys_addr_length; 4954 sdl->sdl_slen = 0; 4955 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 4956 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 4957 4958 return (sizeof (struct sockaddr_dl)); 4959 } 4960 4961 /* 4962 * ill_xarp_info 4963 * creates xarp info from the device. 4964 */ 4965 static int 4966 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4967 { 4968 sdl->sdl_family = AF_LINK; 4969 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4970 sdl->sdl_type = ill->ill_type; 4971 (void) ipif_get_name(ill->ill_ipif, sdl->sdl_data, 4972 sizeof (sdl->sdl_data)); 4973 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4974 sdl->sdl_alen = ill->ill_phys_addr_length; 4975 sdl->sdl_slen = 0; 4976 return (sdl->sdl_nlen); 4977 } 4978 4979 static int 4980 loopback_kstat_update(kstat_t *ksp, int rw) 4981 { 4982 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 4983 4984 if (rw == KSTAT_WRITE) 4985 return (EACCES); 4986 kn[0].value.ui32 = loopback_packets; 4987 kn[1].value.ui32 = loopback_packets; 4988 return (0); 4989 } 4990 4991 4992 /* 4993 * Has ifindex been plumbed already. 
4994 */ 4995 static boolean_t 4996 phyint_exists(uint_t index) 4997 { 4998 phyint_t *phyi; 4999 5000 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 5001 /* 5002 * Indexes are stored in the phyint - a common structure 5003 * to both IPv4 and IPv6. 5004 */ 5005 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 5006 (void *) &index, NULL); 5007 return (phyi != NULL); 5008 } 5009 5010 /* 5011 * Assign a unique interface index for the phyint. 5012 */ 5013 static boolean_t 5014 phyint_assign_ifindex(phyint_t *phyi) 5015 { 5016 uint_t starting_index; 5017 5018 ASSERT(phyi->phyint_ifindex == 0); 5019 if (!ill_index_wrap) { 5020 phyi->phyint_ifindex = ill_index++; 5021 if (ill_index == 0) { 5022 /* Reached the uint_t limit Next time wrap */ 5023 ill_index_wrap = B_TRUE; 5024 } 5025 return (B_TRUE); 5026 } 5027 5028 /* 5029 * Start reusing unused indexes. Note that we hold the ill_g_lock 5030 * at this point and don't want to call any function that attempts 5031 * to get the lock again. 5032 */ 5033 starting_index = ill_index++; 5034 for (; ill_index != starting_index; ill_index++) { 5035 if (ill_index != 0 && !phyint_exists(ill_index)) { 5036 /* found unused index - use it */ 5037 phyi->phyint_ifindex = ill_index; 5038 return (B_TRUE); 5039 } 5040 } 5041 5042 /* 5043 * all interface indicies are inuse. 5044 */ 5045 return (B_FALSE); 5046 } 5047 5048 /* 5049 * Return a pointer to the ill which matches the supplied name. Note that 5050 * the ill name length includes the null termination character. (May be 5051 * called as writer.) 5052 * If do_alloc and the interface is "lo0" it will be automatically created. 5053 * Cannot bump up reference on condemned ills. So dup detect can't be done 5054 * using this func. 
5055 */ 5056 ill_t * 5057 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 5058 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc) 5059 { 5060 ill_t *ill; 5061 ipif_t *ipif; 5062 kstat_named_t *kn; 5063 boolean_t isloopback; 5064 ipsq_t *old_ipsq; 5065 5066 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 5067 5068 rw_enter(&ill_g_lock, RW_READER); 5069 ill = ill_find_by_name(name, isv6, q, mp, func, error); 5070 rw_exit(&ill_g_lock); 5071 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 5072 return (ill); 5073 5074 /* 5075 * Couldn't find it. Does this happen to be a lookup for the 5076 * loopback device and are we allowed to allocate it? 5077 */ 5078 if (!isloopback || !do_alloc) 5079 return (NULL); 5080 5081 rw_enter(&ill_g_lock, RW_WRITER); 5082 5083 ill = ill_find_by_name(name, isv6, q, mp, func, error); 5084 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 5085 rw_exit(&ill_g_lock); 5086 return (ill); 5087 } 5088 5089 /* Create the loopback device on demand */ 5090 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 5091 sizeof (ipif_loopback_name), BPRI_MED)); 5092 if (ill == NULL) 5093 goto done; 5094 5095 *ill = ill_null; 5096 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 5097 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 5098 if (ill->ill_phyint == NULL) 5099 goto done; 5100 5101 if (isv6) 5102 ill->ill_phyint->phyint_illv6 = ill; 5103 else 5104 ill->ill_phyint->phyint_illv4 = ill; 5105 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 5106 ill->ill_max_frag = IP_LOOPBACK_MTU; 5107 /* Add room for tcp+ip headers */ 5108 if (isv6) { 5109 ill->ill_isv6 = B_TRUE; 5110 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 5111 } else { 5112 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 5113 } 5114 if (!ill_allocate_mibs(ill)) 5115 goto done; 5116 ill->ill_max_mtu = ill->ill_max_frag; 5117 /* 5118 * ipif_loopback_name can't be pointed at directly because 
its used 5119 * by both the ipv4 and ipv6 interfaces. When the ill is removed 5120 * from the glist, ill_glist_delete() sets the first character of 5121 * ill_name to '\0'. 5122 */ 5123 ill->ill_name = (char *)ill + sizeof (*ill); 5124 (void) strcpy(ill->ill_name, ipif_loopback_name); 5125 ill->ill_name_length = sizeof (ipif_loopback_name); 5126 /* Set ill_name_set for ill_phyint_reinit to work properly */ 5127 5128 ill->ill_global_timer = INFINITY; 5129 ill->ill_mcast_type = IGMP_V3_ROUTER; /* == MLD_V2_ROUTER */ 5130 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 5131 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 5132 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 5133 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 5134 5135 /* No resolver here. */ 5136 ill->ill_net_type = IRE_LOOPBACK; 5137 5138 /* Initialize the ipsq */ 5139 if (!ipsq_init(ill)) 5140 goto done; 5141 5142 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 5143 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 5144 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 5145 #ifdef ILL_DEBUG 5146 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 5147 #endif 5148 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 5149 if (ipif == NULL) 5150 goto done; 5151 5152 ill->ill_flags = ILLF_MULTICAST; 5153 5154 /* Set up default loopback address and mask. 
*/ 5155 if (!isv6) { 5156 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 5157 5158 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 5159 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5160 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 5161 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5162 ipif->ipif_v6subnet); 5163 ill->ill_flags |= ILLF_IPV4; 5164 } else { 5165 ipif->ipif_v6lcl_addr = ipv6_loopback; 5166 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5167 ipif->ipif_v6net_mask = ipv6_all_ones; 5168 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5169 ipif->ipif_v6subnet); 5170 ill->ill_flags |= ILLF_IPV6; 5171 } 5172 5173 /* 5174 * Chain us in at the end of the ill list. hold the ill 5175 * before we make it globally visible. 1 for the lookup. 5176 */ 5177 ill->ill_refcnt = 0; 5178 ill_refhold(ill); 5179 5180 ill->ill_frag_count = 0; 5181 ill->ill_frag_free_num_pkts = 0; 5182 ill->ill_last_frag_clean_time = 0; 5183 5184 old_ipsq = ill->ill_phyint->phyint_ipsq; 5185 5186 if (ill_glist_insert(ill, "lo", isv6) != 0) 5187 cmn_err(CE_PANIC, "cannot insert loopback interface"); 5188 5189 /* Let SCTP know so that it can add this to its list */ 5190 sctp_update_ill(ill, SCTP_ILL_INSERT); 5191 5192 /* Let SCTP know about this IPIF, so that it can add it to its list */ 5193 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 5194 5195 /* 5196 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 5197 */ 5198 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 5199 /* Loopback ills aren't in any IPMP group */ 5200 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 5201 ipsq_delete(old_ipsq); 5202 } 5203 5204 /* 5205 * Delay this till the ipif is allocated as ipif_allocate 5206 * de-references ill_phyint for getting the ifindex. We 5207 * can't do this before ipif_allocate because ill_phyint_reinit 5208 * -> phyint_assign_ifindex expects ipif to be present. 
*/
	mutex_enter(&ill->ill_phyint->phyint_lock);
	ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL;
	mutex_exit(&ill->ill_phyint->phyint_lock);

	if (loopback_ksp == NULL) {
		/* Export loopback interface statistics */
		loopback_ksp = kstat_create("lo", 0, ipif_loopback_name, "net",
		    KSTAT_TYPE_NAMED, 2, 0);
		if (loopback_ksp != NULL) {
			loopback_ksp->ks_update = loopback_kstat_update;
			kn = KSTAT_NAMED_PTR(loopback_ksp);
			kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
			kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
			kstat_install(loopback_ksp);
		}
	}

	if (error != NULL)
		*error = 0;
	*did_alloc = B_TRUE;
	rw_exit(&ill_g_lock);
	return (ill);
done:
	/*
	 * Failure path: release whatever was allocated so far, drop
	 * ill_g_lock and report ENOMEM through *error when it is supplied.
	 */
	if (ill != NULL) {
		if (ill->ill_phyint != NULL) {
			ipsq_t	*ipsq;

			ipsq = ill->ill_phyint->phyint_ipsq;
			if (ipsq != NULL)
				kmem_free(ipsq, sizeof (ipsq_t));
			mi_free(ill->ill_phyint);
		}
		ill_free_mib(ill);
		mi_free(ill);
	}
	rw_exit(&ill_g_lock);
	if (error != NULL)
		*error = ENOMEM;
	return (NULL);
}

/*
 * Return a pointer to the ill which matches the index and IP version type.
 *
 * index - ifindex looked up in phyint_g_list.phyint_list_avl_by_index.
 * isv6	 - selects the phyint's phyint_illv6 (true) or phyint_illv4 (false).
 * q, mp, func, err - either all NULL or all non-NULL (asserted below).
 *
 * Outcomes:
 *  - ILL_CAN_LOOKUP(ill) holds: the ill is returned with a reference taken
 *    through ill_refhold_locked().
 *  - otherwise, if ILL_CAN_WAIT(ill, q) holds: mp is queued on the phyint's
 *    ipsq with ipsq_enq(..., func, NEW_OP, ill), *err is set to EINPROGRESS
 *    and NULL is returned.
 *  - otherwise (including no phyint or no ill of that family): NULL is
 *    returned and, when err is non-NULL, *err is set to ENXIO.
 */
ill_t *
ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp,
    ipsq_func_t func, int *err)
{
	ill_t	*ill;
	ipsq_t  *ipsq;
	phyint_t *phyi;

	ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
	    (q != NULL && mp != NULL && func != NULL && err != NULL));

	if (err != NULL)
		*err = 0;

	/*
	 * Indexes are stored in the phyint - a common structure
	 * to both IPv4 and IPv6.
	 */
	rw_enter(&ill_g_lock, RW_READER);
	phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index,
	    (void *) &index, NULL);
	if (phyi != NULL) {
		ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
		if (ill != NULL) {
			/*
			 * The block comment at the start of ipif_down
			 * explains the use of the macros used below
			 */
			GRAB_CONN_LOCK(q);
			mutex_enter(&ill->ill_lock);
			if (ILL_CAN_LOOKUP(ill)) {
				ill_refhold_locked(ill);
				mutex_exit(&ill->ill_lock);
				RELEASE_CONN_LOCK(q);
				rw_exit(&ill_g_lock);
				return (ill);
			} else if (ILL_CAN_WAIT(ill, q)) {
				/*
				 * ipsq_lock is taken before ill_g_lock and
				 * ill_lock are dropped, and is held across
				 * the enqueue.
				 */
				ipsq = ill->ill_phyint->phyint_ipsq;
				mutex_enter(&ipsq->ipsq_lock);
				rw_exit(&ill_g_lock);
				mutex_exit(&ill->ill_lock);
				ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
				mutex_exit(&ipsq->ipsq_lock);
				RELEASE_CONN_LOCK(q);
				*err = EINPROGRESS;
				return (NULL);
			}
			/* Neither usable now nor worth waiting for. */
			RELEASE_CONN_LOCK(q);
			mutex_exit(&ill->ill_lock);
		}
	}
	rw_exit(&ill_g_lock);
	if (err != NULL)
		*err = ENXIO;
	return (NULL);
}

/*
 * Return the ifindex next in sequence after the passed in ifindex.
 * If there is no next ifindex for the given protocol, return 0.
5314 */ 5315 uint_t 5316 ill_get_next_ifindex(uint_t index, boolean_t isv6) 5317 { 5318 phyint_t *phyi; 5319 phyint_t *phyi_initial; 5320 uint_t ifindex; 5321 5322 rw_enter(&ill_g_lock, RW_READER); 5323 5324 if (index == 0) { 5325 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 5326 } else { 5327 phyi = phyi_initial = avl_find( 5328 &phyint_g_list.phyint_list_avl_by_index, 5329 (void *) &index, NULL); 5330 } 5331 5332 for (; phyi != NULL; 5333 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 5334 phyi, AVL_AFTER)) { 5335 /* 5336 * If we're not returning the first interface in the tree 5337 * and we still haven't moved past the phyint_t that 5338 * corresponds to index, avl_walk needs to be called again 5339 */ 5340 if (!((index != 0) && (phyi == phyi_initial))) { 5341 if (isv6) { 5342 if ((phyi->phyint_illv6) && 5343 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5344 (phyi->phyint_illv6->ill_isv6 == 1)) 5345 break; 5346 } else { 5347 if ((phyi->phyint_illv4) && 5348 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5349 (phyi->phyint_illv4->ill_isv6 == 0)) 5350 break; 5351 } 5352 } 5353 } 5354 5355 rw_exit(&ill_g_lock); 5356 5357 if (phyi != NULL) 5358 ifindex = phyi->phyint_ifindex; 5359 else 5360 ifindex = 0; 5361 5362 return (ifindex); 5363 } 5364 5365 5366 /* 5367 * Return the ifindex for the named interface. 5368 * If there is no next ifindex for the interface, return 0. 5369 */ 5370 uint_t 5371 ill_get_ifindex_by_name(char *name) 5372 { 5373 phyint_t *phyi; 5374 avl_index_t where = 0; 5375 uint_t ifindex; 5376 5377 rw_enter(&ill_g_lock, RW_READER); 5378 5379 if ((phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name, 5380 name, &where)) == NULL) { 5381 rw_exit(&ill_g_lock); 5382 return (0); 5383 } 5384 5385 ifindex = phyi->phyint_ifindex; 5386 5387 rw_exit(&ill_g_lock); 5388 5389 return (ifindex); 5390 } 5391 5392 5393 /* 5394 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5395 * that gives a running thread a reference to the ill. 
This reference must be 5396 * released by the thread when it is done accessing the ill and related 5397 * objects. ill_refcnt can not be used to account for static references 5398 * such as other structures pointing to an ill. Callers must generally 5399 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5400 * or be sure that the ill is not being deleted or changing state before 5401 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5402 * ill won't change any of its critical state such as address, netmask etc. 5403 */ 5404 void 5405 ill_refhold(ill_t *ill) 5406 { 5407 mutex_enter(&ill->ill_lock); 5408 ill->ill_refcnt++; 5409 ILL_TRACE_REF(ill); 5410 mutex_exit(&ill->ill_lock); 5411 } 5412 5413 void 5414 ill_refhold_locked(ill_t *ill) 5415 { 5416 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5417 ill->ill_refcnt++; 5418 ILL_TRACE_REF(ill); 5419 } 5420 5421 int 5422 ill_check_and_refhold(ill_t *ill) 5423 { 5424 mutex_enter(&ill->ill_lock); 5425 if (ILL_CAN_LOOKUP(ill)) { 5426 ill_refhold_locked(ill); 5427 mutex_exit(&ill->ill_lock); 5428 return (0); 5429 } 5430 mutex_exit(&ill->ill_lock); 5431 return (ILL_LOOKUP_FAILED); 5432 } 5433 5434 /* 5435 * Must not be called while holding any locks. Otherwise if this is 5436 * the last reference to be released, there is a chance of recursive mutex 5437 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5438 * to restart an ioctl. 5439 */ 5440 void 5441 ill_refrele(ill_t *ill) 5442 { 5443 mutex_enter(&ill->ill_lock); 5444 ASSERT(ill->ill_refcnt != 0); 5445 ill->ill_refcnt--; 5446 ILL_UNTRACE_REF(ill); 5447 if (ill->ill_refcnt != 0) { 5448 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5449 mutex_exit(&ill->ill_lock); 5450 return; 5451 } 5452 5453 /* Drops the ill_lock */ 5454 ipif_ill_refrele_tail(ill); 5455 } 5456 5457 /* 5458 * Obtain a weak reference count on the ill. 
This reference ensures the 5459 * ill won't be freed, but the ill may change any of its critical state 5460 * such as netmask, address etc. Returns an error if the ill has started 5461 * closing. 5462 */ 5463 boolean_t 5464 ill_waiter_inc(ill_t *ill) 5465 { 5466 mutex_enter(&ill->ill_lock); 5467 if (ill->ill_state_flags & ILL_CONDEMNED) { 5468 mutex_exit(&ill->ill_lock); 5469 return (B_FALSE); 5470 } 5471 ill->ill_waiters++; 5472 mutex_exit(&ill->ill_lock); 5473 return (B_TRUE); 5474 } 5475 5476 void 5477 ill_waiter_dcr(ill_t *ill) 5478 { 5479 mutex_enter(&ill->ill_lock); 5480 ill->ill_waiters--; 5481 if (ill->ill_waiters == 0) 5482 cv_broadcast(&ill->ill_cv); 5483 mutex_exit(&ill->ill_lock); 5484 } 5485 5486 /* 5487 * Named Dispatch routine to produce a formatted report on all ILLs. 5488 * This report is accessed by using the ndd utility to "get" ND variable 5489 * "ip_ill_status". 5490 */ 5491 /* ARGSUSED */ 5492 int 5493 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5494 { 5495 ill_t *ill; 5496 ill_walk_context_t ctx; 5497 5498 (void) mi_mpprintf(mp, 5499 "ILL " MI_COL_HDRPAD_STR 5500 /* 01234567[89ABCDEF] */ 5501 "rq " MI_COL_HDRPAD_STR 5502 /* 01234567[89ABCDEF] */ 5503 "wq " MI_COL_HDRPAD_STR 5504 /* 01234567[89ABCDEF] */ 5505 "upcnt mxfrg err name"); 5506 /* 12345 12345 123 xxxxxxxx */ 5507 5508 rw_enter(&ill_g_lock, RW_READER); 5509 ill = ILL_START_WALK_ALL(&ctx); 5510 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5511 (void) mi_mpprintf(mp, 5512 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5513 "%05u %05u %03d %s", 5514 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5515 ill->ill_ipif_up_count, 5516 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5517 } 5518 rw_exit(&ill_g_lock); 5519 5520 return (0); 5521 } 5522 5523 /* 5524 * Named Dispatch routine to produce a formatted report on all IPIFs. 5525 * This report is accessed by using the ndd utility to "get" ND variable 5526 * "ip_ipif_status". 
*/
/* ARGSUSED */
int
ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	char	buf1[INET6_ADDRSTRLEN];
	char	buf2[INET6_ADDRSTRLEN];
	char	buf3[INET6_ADDRSTRLEN];
	char	buf4[INET6_ADDRSTRLEN];
	char	buf5[INET6_ADDRSTRLEN];
	char	buf6[INET6_ADDRSTRLEN];
	char	buf[LIFNAMSIZ];
	ill_t	*ill;
	ipif_t	*ipif;
	nv_t	*nvp;
	uint64_t flags;
	zoneid_t zoneid;
	ill_walk_context_t ctx;

	/* Legend describing the lines emitted for every ipif below. */
	(void) mi_mpprintf(mp,
	    "IPIF metric mtu in/out/forward name zone flags...\n"
	    "\tlocal address\n"
	    "\tsrc address\n"
	    "\tsubnet\n"
	    "\tmask\n"
	    "\tbroadcast\n"
	    "\tp-p-dst");

	ASSERT(q->q_next == NULL);
	zoneid = Q_TO_CONN(q)->conn_zoneid;	/* IP is a driver */

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_ALL(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/*
			 * A caller outside the global zone only sees ipifs
			 * of its own zone and those marked ALL_ZONES.
			 */
			if (zoneid != GLOBAL_ZONEID &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			(void) mi_mpprintf(mp,
			    MI_COL_PTRFMT_STR
			    "%04u %05u %u/%u/%u %s %d",
			    (void *)ipif,
			    ipif->ipif_metric, ipif->ipif_mtu,
			    ipif->ipif_ib_pkt_count,
			    ipif->ipif_ob_pkt_count,
			    ipif->ipif_fo_pkt_count,
			    ipif_get_name(ipif, buf, sizeof (buf)),
			    ipif->ipif_zoneid);

			/* Union of the ipif, ill and phyint flag words. */
			flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags |
			    ipif->ipif_ill->ill_phyint->phyint_flags;

			/* Tack on text strings for any flags. */
			nvp = ipif_nv_tbl;
			for (; nvp < A_END(ipif_nv_tbl); nvp++) {
				if (nvp->nv_value & flags)
					(void) mi_mpprintf_nr(mp, " %s",
					    nvp->nv_name);
			}
			/*
			 * All six addresses are formatted from their
			 * in6_addr (v6-style) fields.
			 */
			(void) mi_mpprintf(mp,
			    "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s",
			    inet_ntop(AF_INET6,
				&ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)),
			    inet_ntop(AF_INET6,
				&ipif->ipif_v6src_addr, buf2, sizeof (buf2)),
			    inet_ntop(AF_INET6,
				&ipif->ipif_v6subnet, buf3, sizeof (buf3)),
			    inet_ntop(AF_INET6,
				&ipif->ipif_v6net_mask, buf4, sizeof (buf4)),
			    inet_ntop(AF_INET6,
				&ipif->ipif_v6brd_addr, buf5, sizeof (buf5)),
			    inet_ntop(AF_INET6,
				&ipif->ipif_v6pp_dst_addr,
				buf6, sizeof (buf6)));
		}
	}
	rw_exit(&ill_g_lock);
	return (0);
}

/*
 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
 * driver.  We construct best guess defaults for lower level information that
 * we need.  If an interface is brought up without injection of any overriding
 * information from outside, we have to be ready to go with these defaults.
 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
 * we primarily want the dl_provider_style.
 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
 * at which point we assume the other part of the information is valid.
 *
 * ill - the ill the DL_INFO_ACK belongs to; the caller must be the writer
 *	 on it (asserted).
 * mp  - the DL_INFO_ACK message; it is freed on every return path.
 */
void
ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
{
	uchar_t		*brdcst_addr;
	uint_t		brdcst_addr_length, phys_addr_length;
	t_scalar_t	sap_length;
	dl_info_ack_t	*dlia;
	ip_m_t		*ipm;
	dl_qos_cl_sel1_t *sel1;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Till the ill is fully up ILL_CHANGING will be set and
	 * the ill is not globally visible. So no need for a lock.
	 */
	dlia = (dl_info_ack_t *)mp->b_rptr;
	ill->ill_mactype = dlia->dl_mac_type;

	/* Unknown MAC types fall back to the DL_OTHER media entry. */
	ipm = ip_m_lookup(dlia->dl_mac_type);
	if (ipm == NULL) {
		ipm = ip_m_lookup(DL_OTHER);
		ASSERT(ipm != NULL);
	}
	ill->ill_media = ipm;

	/*
	 * When the new DLPI stuff is ready we'll pull lengths
	 * from dlia.
	 */
	if (dlia->dl_version == DL_VERSION_2) {
		brdcst_addr_length = dlia->dl_brdcst_addr_length;
		brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
		    brdcst_addr_length);
		if (brdcst_addr == NULL) {
			brdcst_addr_length = 0;
		}
		sap_length = dlia->dl_sap_length;
		/* dl_addr_length covers the physical address plus the SAP. */
		phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
		ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
		    brdcst_addr_length, sap_length, phys_addr_length));
	} else {
		/* Not DLPI version 2: fixed six-byte-address defaults. */
		brdcst_addr_length = 6;
		brdcst_addr = ip_six_byte_all_ones;
		sap_length = -2;
		phys_addr_length = brdcst_addr_length;
	}

	ill->ill_bcast_addr_length = brdcst_addr_length;
	ill->ill_phys_addr_length = phys_addr_length;
	ill->ill_sap_length = sap_length;
	ill->ill_max_frag = dlia->dl_max_sdu;
	ill->ill_max_mtu = ill->ill_max_frag;

	ill->ill_type = ipm->ip_m_type;

	if (!ill->ill_dlpi_style_set) {
		if (dlia->dl_provider_style == DL_STYLE2)
			ill->ill_needs_attach = 1;

		/*
		 * Allocate the first ipif on this ill. We don't delay it
		 * further as ioctl handling assumes at least one ipif to
		 * be present.
		 *
		 * At this point we don't know whether the ill is v4 or v6.
		 * We will know this when the SIOCSLIFNAME happens and
		 * the correct value for ill_isv6 will be assigned in
		 * ipif_set_values(). We need to hold the ill lock and
		 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
		 * the wakeup.
		 *
		 * NOTE(review): the ipif_allocate() result is discarded, so
		 * an allocation failure is not reported here - confirm that
		 * callers cope with ill_ipif being NULL afterwards.
		 */
		(void) ipif_allocate(ill, 0, IRE_LOCAL,
		    dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE);
		mutex_enter(&ill->ill_lock);
		ASSERT(ill->ill_dlpi_style_set == 0);
		ill->ill_dlpi_style_set = 1;
		ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
		cv_broadcast(&ill->ill_cv);
		mutex_exit(&ill->ill_lock);
		freemsg(mp);
		return;
	}
	ASSERT(ill->ill_ipif != NULL);
	/*
	 * We know whether it is IPv4 or IPv6 now, as this is the
	 * second DL_INFO_ACK we are receiving in response to the
	 * DL_INFO_REQ sent in ipif_set_values.
	 */
	if (ill->ill_isv6)
		ill->ill_sap = IP6_DL_SAP;
	else
		ill->ill_sap = IP_DL_SAP;
	/*
	 * Set ipif_mtu which is used to set the IRE's
	 * ire_max_frag value. The driver could have sent
	 * a different mtu from what it sent last time. No
	 * need to call ipif_mtu_change because IREs have
	 * not yet been created.
	 */
	ill->ill_ipif->ipif_mtu = ill->ill_max_mtu;
	/*
	 * Clear all the flags that were set based on ill_bcast_addr_length
	 * and ill_phys_addr_length (in ipif_set_values) as these could have
	 * changed now and we need to re-evaluate.
	 */
	ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
	ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);

	/*
	 * Free ill_resolver_mp and ill_bcast_mp as things could have
	 * changed now.
	 */
	if (ill->ill_bcast_addr_length == 0) {
		/* No broadcast address reported by the driver. */
		if (ill->ill_resolver_mp != NULL)
			freemsg(ill->ill_resolver_mp);
		if (ill->ill_bcast_mp != NULL)
			freemsg(ill->ill_bcast_mp);
		if (ill->ill_flags & ILLF_XRESOLV)
			ill->ill_net_type = IRE_IF_RESOLVER;
		else
			ill->ill_net_type = IRE_IF_NORESOLVER;
		ill->ill_resolver_mp = ill_dlur_gen(NULL,
		    ill->ill_phys_addr_length,
		    ill->ill_sap,
		    ill->ill_sap_length);
		ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp);

		if (ill->ill_isv6)
			/*
			 * Note: xresolv interfaces will eventually need NOARP
			 * set here as well, but that will require those
			 * external resolvers to have some knowledge of
			 * that flag and act appropriately. Not to be changed
			 * at present.
			 */
			ill->ill_flags |= ILLF_NONUD;
		else
			ill->ill_flags |= ILLF_NOARP;

		if (ill->ill_phys_addr_length == 0) {
			if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
				ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
				ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL;
			} else {
				/* pt-pt supports multicast. */
				ill->ill_flags |= ILLF_MULTICAST;
				ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
			}
		}
	} else {
		/* Broadcast-capable link: addresses need a resolver. */
		ill->ill_net_type = IRE_IF_RESOLVER;
		if (ill->ill_bcast_mp != NULL)
			freemsg(ill->ill_bcast_mp);
		ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
		    ill->ill_bcast_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		/*
		 * Later detect lack of DLPI driver multicast
		 * capability by catching DL_ENABMULTI errors in
		 * ip_rput_dlpi.
		 */
		ill->ill_flags |= ILLF_MULTICAST;
		if (!ill->ill_isv6)
			ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
	}
	/* By default an interface does not support any CoS marking */
	ill->ill_flags &= ~ILLF_COS_ENABLED;

	/*
	 * If we get QoS information in DL_INFO_ACK, the device supports
	 * some form of CoS marking, set ILLF_COS_ENABLED.
	 */
	sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
	    dlia->dl_qos_length);
	if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
		ill->ill_flags |= ILLF_COS_ENABLED;
	}

	/* Clear any previous error indication. */
	ill->ill_error = 0;
	freemsg(mp);
}

/*
 * Perform various checks to verify that an address would make sense as a
 * local, remote, or subnet interface address.
 *
 * addr	       - IPv4 address to validate.
 * subnet_mask - mask to check against; 0 means use the mask that
 *		 ip_net_mask() derives from addr.
 *
 * Returns B_FALSE when ip_net_mask() yields 0 for addr, when the host part
 * under the effective mask is all zeroes or all ones (unless that mask is
 * all ones), or when addr is a class D address; B_TRUE otherwise.
 */
static boolean_t
ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
{
	ipaddr_t	net_mask;

	/*
	 * Don't allow all zeroes, all ones or experimental address, but allow
	 * all ones netmask.
	 */
	if ((net_mask = ip_net_mask(addr)) == 0)
		return (B_FALSE);
	/* A given netmask overrides the "guess" netmask */
	if (subnet_mask != 0)
		net_mask = subnet_mask;
	if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
	    (addr == (addr | ~net_mask)))) {
		return (B_FALSE);
	}
	if (CLASSD(addr))
		return (B_FALSE);

	return (B_TRUE);
}

/*
 * ipif_lookup_group
 * Returns held ipif
 *
 * The ipif is the ire_ipif of the ire that ire_lookup_multi() finds for
 * group/zoneid; NULL is returned when no such ire exists.  The ire itself
 * is released before returning.
 */
ipif_t *
ipif_lookup_group(ipaddr_t group, zoneid_t zoneid)
{
	ire_t	*ire;
	ipif_t	*ipif;

	ire = ire_lookup_multi(group, zoneid);
	if (ire == NULL)
		return (NULL);
	ipif = ire->ire_ipif;
	/* Hold the ipif before letting go of the ire it came from. */
	ipif_refhold(ipif);
	ire_refrele(ire);
	return (ipif);
}

/*
 * Look for an ipif with the specified interface address and destination.
 * The destination address is used only for matching point-to-point interfaces.
5853 */ 5854 ipif_t * 5855 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5856 ipsq_func_t func, int *error) 5857 { 5858 ipif_t *ipif; 5859 ill_t *ill; 5860 ill_walk_context_t ctx; 5861 ipsq_t *ipsq; 5862 5863 if (error != NULL) 5864 *error = 0; 5865 5866 /* 5867 * First match all the point-to-point interfaces 5868 * before looking at non-point-to-point interfaces. 5869 * This is done to avoid returning non-point-to-point 5870 * ipif instead of unnumbered point-to-point ipif. 5871 */ 5872 rw_enter(&ill_g_lock, RW_READER); 5873 ill = ILL_START_WALK_V4(&ctx); 5874 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5875 GRAB_CONN_LOCK(q); 5876 mutex_enter(&ill->ill_lock); 5877 for (ipif = ill->ill_ipif; ipif != NULL; 5878 ipif = ipif->ipif_next) { 5879 /* Allow the ipif to be down */ 5880 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5881 (ipif->ipif_lcl_addr == if_addr) && 5882 (ipif->ipif_pp_dst_addr == dst)) { 5883 /* 5884 * The block comment at the start of ipif_down 5885 * explains the use of the macros used below 5886 */ 5887 if (IPIF_CAN_LOOKUP(ipif)) { 5888 ipif_refhold_locked(ipif); 5889 mutex_exit(&ill->ill_lock); 5890 RELEASE_CONN_LOCK(q); 5891 rw_exit(&ill_g_lock); 5892 return (ipif); 5893 } else if (IPIF_CAN_WAIT(ipif, q)) { 5894 ipsq = ill->ill_phyint->phyint_ipsq; 5895 mutex_enter(&ipsq->ipsq_lock); 5896 mutex_exit(&ill->ill_lock); 5897 rw_exit(&ill_g_lock); 5898 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5899 ill); 5900 mutex_exit(&ipsq->ipsq_lock); 5901 RELEASE_CONN_LOCK(q); 5902 *error = EINPROGRESS; 5903 return (NULL); 5904 } 5905 } 5906 } 5907 mutex_exit(&ill->ill_lock); 5908 RELEASE_CONN_LOCK(q); 5909 } 5910 rw_exit(&ill_g_lock); 5911 5912 /* lookup the ipif based on interface address */ 5913 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error); 5914 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5915 return (ipif); 5916 } 5917 5918 /* 5919 * Look for an ipif with the specified address. 
For point-point links 5920 * we look for matches on either the destination address and the local 5921 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5922 * is set. 5923 * Matches on a specific ill if match_ill is set. 5924 */ 5925 ipif_t * 5926 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 5927 mblk_t *mp, ipsq_func_t func, int *error) 5928 { 5929 ipif_t *ipif; 5930 ill_t *ill; 5931 boolean_t ptp = B_FALSE; 5932 ipsq_t *ipsq; 5933 ill_walk_context_t ctx; 5934 5935 if (error != NULL) 5936 *error = 0; 5937 5938 rw_enter(&ill_g_lock, RW_READER); 5939 /* 5940 * Repeat twice, first based on local addresses and 5941 * next time for pointopoint. 5942 */ 5943 repeat: 5944 ill = ILL_START_WALK_V4(&ctx); 5945 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5946 if (match_ill != NULL && ill != match_ill) { 5947 continue; 5948 } 5949 GRAB_CONN_LOCK(q); 5950 mutex_enter(&ill->ill_lock); 5951 for (ipif = ill->ill_ipif; ipif != NULL; 5952 ipif = ipif->ipif_next) { 5953 if (zoneid != ALL_ZONES && 5954 zoneid != ipif->ipif_zoneid && 5955 ipif->ipif_zoneid != ALL_ZONES) 5956 continue; 5957 /* Allow the ipif to be down */ 5958 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5959 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5960 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5961 (ipif->ipif_pp_dst_addr == addr))) { 5962 /* 5963 * The block comment at the start of ipif_down 5964 * explains the use of the macros used below 5965 */ 5966 if (IPIF_CAN_LOOKUP(ipif)) { 5967 ipif_refhold_locked(ipif); 5968 mutex_exit(&ill->ill_lock); 5969 RELEASE_CONN_LOCK(q); 5970 rw_exit(&ill_g_lock); 5971 return (ipif); 5972 } else if (IPIF_CAN_WAIT(ipif, q)) { 5973 ipsq = ill->ill_phyint->phyint_ipsq; 5974 mutex_enter(&ipsq->ipsq_lock); 5975 mutex_exit(&ill->ill_lock); 5976 rw_exit(&ill_g_lock); 5977 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5978 ill); 5979 mutex_exit(&ipsq->ipsq_lock); 5980 RELEASE_CONN_LOCK(q); 5981 *error = EINPROGRESS; 5982 
return (NULL); 5983 } 5984 } 5985 } 5986 mutex_exit(&ill->ill_lock); 5987 RELEASE_CONN_LOCK(q); 5988 } 5989 5990 /* If we already did the ptp case, then we are done */ 5991 if (ptp) { 5992 rw_exit(&ill_g_lock); 5993 if (error != NULL) 5994 *error = ENXIO; 5995 return (NULL); 5996 } 5997 ptp = B_TRUE; 5998 goto repeat; 5999 } 6000 6001 /* 6002 * Look for an ipif with the specified address. For point-point links 6003 * we look for matches on either the destination address and the local 6004 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 6005 * is set. 6006 * Matches on a specific ill if match_ill is set. 6007 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 6008 */ 6009 zoneid_t 6010 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill) 6011 { 6012 zoneid_t zoneid; 6013 ipif_t *ipif; 6014 ill_t *ill; 6015 boolean_t ptp = B_FALSE; 6016 ill_walk_context_t ctx; 6017 6018 rw_enter(&ill_g_lock, RW_READER); 6019 /* 6020 * Repeat twice, first based on local addresses and 6021 * next time for pointopoint. 6022 */ 6023 repeat: 6024 ill = ILL_START_WALK_V4(&ctx); 6025 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6026 if (match_ill != NULL && ill != match_ill) { 6027 continue; 6028 } 6029 mutex_enter(&ill->ill_lock); 6030 for (ipif = ill->ill_ipif; ipif != NULL; 6031 ipif = ipif->ipif_next) { 6032 /* Allow the ipif to be down */ 6033 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 6034 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 6035 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 6036 (ipif->ipif_pp_dst_addr == addr)) && 6037 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 6038 zoneid = ipif->ipif_zoneid; 6039 mutex_exit(&ill->ill_lock); 6040 rw_exit(&ill_g_lock); 6041 /* 6042 * If ipif_zoneid was ALL_ZONES then we have 6043 * a trusted extensions shared IP address. 6044 * In that case GLOBAL_ZONEID works to send. 
6045 */ 6046 if (zoneid == ALL_ZONES) 6047 zoneid = GLOBAL_ZONEID; 6048 return (zoneid); 6049 } 6050 } 6051 mutex_exit(&ill->ill_lock); 6052 } 6053 6054 /* If we already did the ptp case, then we are done */ 6055 if (ptp) { 6056 rw_exit(&ill_g_lock); 6057 return (ALL_ZONES); 6058 } 6059 ptp = B_TRUE; 6060 goto repeat; 6061 } 6062 6063 /* 6064 * Look for an ipif that matches the specified remote address i.e. the 6065 * ipif that would receive the specified packet. 6066 * First look for directly connected interfaces and then do a recursive 6067 * IRE lookup and pick the first ipif corresponding to the source address in the 6068 * ire. 6069 * Returns: held ipif 6070 */ 6071 ipif_t * 6072 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 6073 { 6074 ipif_t *ipif; 6075 ire_t *ire; 6076 6077 ASSERT(!ill->ill_isv6); 6078 6079 /* 6080 * Someone could be changing this ipif currently or change it 6081 * after we return this. Thus a few packets could use the old 6082 * old values. However structure updates/creates (ire, ilg, ilm etc) 6083 * will atomically be updated or cleaned up with the new value 6084 * Thus we don't need a lock to check the flags or other attrs below. 
6085 */ 6086 mutex_enter(&ill->ill_lock); 6087 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6088 if (!IPIF_CAN_LOOKUP(ipif)) 6089 continue; 6090 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 6091 ipif->ipif_zoneid != ALL_ZONES) 6092 continue; 6093 /* Allow the ipif to be down */ 6094 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 6095 if ((ipif->ipif_pp_dst_addr == addr) || 6096 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 6097 ipif->ipif_lcl_addr == addr)) { 6098 ipif_refhold_locked(ipif); 6099 mutex_exit(&ill->ill_lock); 6100 return (ipif); 6101 } 6102 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 6103 ipif_refhold_locked(ipif); 6104 mutex_exit(&ill->ill_lock); 6105 return (ipif); 6106 } 6107 } 6108 mutex_exit(&ill->ill_lock); 6109 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 6110 NULL, MATCH_IRE_RECURSIVE); 6111 if (ire != NULL) { 6112 /* 6113 * The callers of this function wants to know the 6114 * interface on which they have to send the replies 6115 * back. For IRE_CACHES that have ire_stq and ire_ipif 6116 * derived from different ills, we really don't care 6117 * what we return here. 6118 */ 6119 ipif = ire->ire_ipif; 6120 if (ipif != NULL) { 6121 ipif_refhold(ipif); 6122 ire_refrele(ire); 6123 return (ipif); 6124 } 6125 ire_refrele(ire); 6126 } 6127 /* Pick the first interface */ 6128 ipif = ipif_get_next_ipif(NULL, ill); 6129 return (ipif); 6130 } 6131 6132 /* 6133 * This func does not prevent refcnt from increasing. 
But if 6134 * the caller has taken steps to that effect, then this func 6135 * can be used to determine whether the ill has become quiescent 6136 */ 6137 boolean_t 6138 ill_is_quiescent(ill_t *ill) 6139 { 6140 ipif_t *ipif; 6141 6142 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6143 6144 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6145 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6146 return (B_FALSE); 6147 } 6148 } 6149 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 6150 ill->ill_nce_cnt != 0 || ill->ill_srcif_refcnt != 0 || 6151 ill->ill_mrtun_refcnt != 0) { 6152 return (B_FALSE); 6153 } 6154 return (B_TRUE); 6155 } 6156 6157 /* 6158 * This func does not prevent refcnt from increasing. But if 6159 * the caller has taken steps to that effect, then this func 6160 * can be used to determine whether the ipif has become quiescent 6161 */ 6162 static boolean_t 6163 ipif_is_quiescent(ipif_t *ipif) 6164 { 6165 ill_t *ill; 6166 6167 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6168 6169 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6170 return (B_FALSE); 6171 } 6172 6173 ill = ipif->ipif_ill; 6174 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 6175 ill->ill_logical_down) { 6176 return (B_TRUE); 6177 } 6178 6179 /* This is the last ipif going down or being deleted on this ill */ 6180 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 6181 return (B_FALSE); 6182 } 6183 6184 return (B_TRUE); 6185 } 6186 6187 /* 6188 * This func does not prevent refcnt from increasing. But if 6189 * the caller has taken steps to that effect, then this func 6190 * can be used to determine whether the ipifs marked with IPIF_MOVING 6191 * have become quiescent and can be moved in a failover/failback. 
*
 * Note the sense of the return value: it is the first IPIF_MOVING ipif
 * that is NOT yet quiescent (non-zero ipif_refcnt or ipif_ire_cnt), or
 * NULL when every moving ipif is quiescent.
 */
static ipif_t *
ill_quiescent_to_move(ill_t *ill)
{
	ipif_t  *ipif;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_state_flags & IPIF_MOVING) {
			if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) {
				return (ipif);
			}
		}
	}
	return (NULL);
}

/*
 * The ipif/ill/ire has been refreled. Do the tail processing.
 * Determine if the ipif or ill in question has become quiescent and if so
 * wakeup close and/or restart any queued pending ioctl that is waiting
 * for the ipif_down (or ill_down)
 *
 * Called with ill_lock held; the lock is dropped on every path before
 * returning.
 */
void
ipif_ill_refrele_tail(ill_t *ill)
{
	mblk_t	*mp;
	conn_t	*connp;
	ipsq_t	*ipsq;
	ipif_t	*ipif;
	dl_notify_ind_t *dlindp;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if ((ill->ill_state_flags & ILL_CONDEMNED) &&
	    ill_is_quiescent(ill)) {
		/* ill_close may be waiting */
		cv_broadcast(&ill->ill_cv);
	}

	/* ipsq can't change because ill_lock  is held */
	ipsq = ill->ill_phyint->phyint_ipsq;
	if (ipsq->ipsq_waitfor == 0) {
		/* Not waiting for anything, just return. */
		mutex_exit(&ill->ill_lock);
		return;
	}
	ASSERT(ipsq->ipsq_pending_mp != NULL &&
	    ipsq->ipsq_pending_ipif != NULL);
	/*
	 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF.
	 * Last ipif going down needs to down the ill, so ill_ire_cnt must
	 * be zero for restarting an ioctl that ends up downing the ill.
	 */
	ipif = ipsq->ipsq_pending_ipif;
	if (ipif->ipif_ill != ill) {
		/* The ioctl is pending on some other ill. */
		mutex_exit(&ill->ill_lock);
		return;
	}

	/* Return unless the awaited object has actually gone quiescent. */
	switch (ipsq->ipsq_waitfor) {
	case IPIF_DOWN:
	case IPIF_FREE:
		if (!ipif_is_quiescent(ipif)) {
			mutex_exit(&ill->ill_lock);
			return;
		}
		break;

	case ILL_DOWN:
	case ILL_FREE:
		/*
		 * case ILL_FREE arises only for loopback. otherwise ill_delete
		 * waits synchronously in ip_close, and no message is queued in
		 * ipsq_pending_mp at all in this case
		 */
		if (!ill_is_quiescent(ill)) {
			mutex_exit(&ill->ill_lock);
			return;
		}

		break;

	case ILL_MOVE_OK:
		/* Non-NULL means some moving ipif is still referenced. */
		if (ill_quiescent_to_move(ill) != NULL) {
			mutex_exit(&ill->ill_lock);
			return;
		}

		break;
	default:
		cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n",
		    (void *)ipsq, ipsq->ipsq_waitfor);
	}

	/*
	 * Incr refcnt for the qwriter_ip call below which
	 * does a refrele
	 */
	ill_refhold_locked(ill);
	mutex_exit(&ill->ill_lock);

	mp = ipsq_pending_mp_get(ipsq, &connp);
	ASSERT(mp != NULL);

	/* Restart the pending operation according to its message type. */
	switch (mp->b_datap->db_type) {
	case M_PCPROTO:
	case M_PROTO:
		/*
		 * For now, only DL_NOTIFY_IND messages can use this facility.
		 */
		dlindp = (dl_notify_ind_t *)mp->b_rptr;
		ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);

		switch (dlindp->dl_notification) {
		case DL_NOTE_PHYS_ADDR:
			qwriter_ip(NULL, ill, ill->ill_rq, mp,
			    ill_set_phys_addr_tail, CUR_OP, B_TRUE);
			return;
		default:
			ASSERT(0);
		}
		break;

	case M_ERROR:
	case M_HANGUP:
		qwriter_ip(NULL, ill, ill->ill_rq, mp, ipif_all_down_tail,
		    CUR_OP, B_TRUE);
		return;

	case M_IOCTL:
	case M_IOCDATA:
		qwriter_ip(NULL, ill, (connp != NULL ? CONNP_TO_WQ(connp) :
		    ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
		return;

	default:
		cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
		    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
	}
}

#ifdef ILL_DEBUG
/* Reuse trace buffer from beginning (if reached the end) and record trace */
void
th_trace_rrecord(th_trace_t *th_trace)
{
	tr_buf_t *tr_buf;
	uint_t lastref;

	/* Advance to the next slot, wrapping to 0 at TR_BUF_MAX. */
	lastref = th_trace->th_trace_lastref;
	lastref++;
	if (lastref == TR_BUF_MAX)
		lastref = 0;
	th_trace->th_trace_lastref = lastref;
	tr_buf = &th_trace->th_trbuf[lastref];
	tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, IP_STACK_DEPTH);
}

/*
 * Find curthread's trace record in the ipif's hash of per-thread trace
 * records.  Returns NULL if there is none.  ill_lock must be held.
 */
th_trace_t *
th_trace_ipif_lookup(ipif_t *ipif)
{
	int bucket_id;
	th_trace_t *th_trace;

	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	bucket_id = IP_TR_HASH(curthread);
	ASSERT(bucket_id < IP_TR_HASH_MAX);

	for (th_trace = ipif->ipif_trace[bucket_id]; th_trace != NULL;
	    th_trace = th_trace->th_next) {
		if (th_trace->th_id == curthread)
			return (th_trace);
	}
	return (NULL);
}

/*
 * Record that curthread took a reference on the ipif.  Does nothing once
 * ipif_trace_disable is set.  ill_lock must be held.
 */
void
ipif_trace_ref(ipif_t *ipif)
{
	int bucket_id;
	th_trace_t *th_trace;

	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	if (ipif->ipif_trace_disable)
		return;

	/*
	 * Attempt to locate the trace buffer for the curthread.
	 * If it does not exist, then allocate a new trace buffer
	 * and link it in list of trace bufs for this ipif, at the head
	 */
	th_trace = th_trace_ipif_lookup(ipif);
	if (th_trace == NULL) {
		bucket_id = IP_TR_HASH(curthread);
		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
		    KM_NOSLEEP);
		if (th_trace == NULL) {
			/* Out of memory: give up tracing on this ipif. */
			ipif->ipif_trace_disable = B_TRUE;
			ipif_trace_cleanup(ipif);
			return;
		}
		th_trace->th_id = curthread;
		th_trace->th_next = ipif->ipif_trace[bucket_id];
		th_trace->th_prev = &ipif->ipif_trace[bucket_id];
		if (th_trace->th_next != NULL)
			th_trace->th_next->th_prev = &th_trace->th_next;
		ipif->ipif_trace[bucket_id] = th_trace;
	}
	ASSERT(th_trace->th_refcnt >= 0 &&
	    th_trace->th_refcnt < TR_BUF_MAX -1);
	th_trace->th_refcnt++;
	th_trace_rrecord(th_trace);
}

/*
 * Record that curthread released a reference on the ipif.  Does nothing
 * once ipif_trace_disable is set.  ill_lock must be held.
 */
void
ipif_untrace_ref(ipif_t *ipif)
{
	th_trace_t *th_trace;

	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	if (ipif->ipif_trace_disable)
		return;
	th_trace = th_trace_ipif_lookup(ipif);
	ASSERT(th_trace != NULL);
	ASSERT(th_trace->th_refcnt > 0);

	th_trace->th_refcnt--;
	th_trace_rrecord(th_trace);
}

/*
 * Find curthread's trace record in the ill's hash of per-thread trace
 * records.  Returns NULL if there is none.  ill_lock must be held.
 */
th_trace_t *
th_trace_ill_lookup(ill_t *ill)
{
	th_trace_t *th_trace;
	int bucket_id;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	bucket_id = IP_TR_HASH(curthread);
	ASSERT(bucket_id < IP_TR_HASH_MAX);

	for (th_trace = ill->ill_trace[bucket_id]; th_trace != NULL;
	    th_trace = th_trace->th_next) {
		if (th_trace->th_id == curthread)
			return (th_trace);
	}
	return (NULL);
}

/*
 * Record that curthread took a reference on the ill.  Does nothing once
 * ill_trace_disable is set.  ill_lock must be held.
 */
void
ill_trace_ref(ill_t *ill)
{
	int bucket_id;
	th_trace_t *th_trace;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	if (ill->ill_trace_disable)
		return;
	/*
	 * Attempt to locate the trace buffer for the curthread.
	 * If it does not exist, then allocate a new trace buffer
	 * and link it in list of trace bufs for this ill, at the head
	 */
	th_trace = th_trace_ill_lookup(ill);
	if (th_trace == NULL) {
		bucket_id = IP_TR_HASH(curthread);
		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
		    KM_NOSLEEP);
		if (th_trace == NULL) {
			/* Out of memory: give up tracing on this ill. */
			ill->ill_trace_disable = B_TRUE;
			ill_trace_cleanup(ill);
			return;
		}
		th_trace->th_id = curthread;
		th_trace->th_next = ill->ill_trace[bucket_id];
		th_trace->th_prev = &ill->ill_trace[bucket_id];
		if (th_trace->th_next != NULL)
			th_trace->th_next->th_prev = &th_trace->th_next;
		ill->ill_trace[bucket_id] = th_trace;
	}
	ASSERT(th_trace->th_refcnt >= 0 &&
	    th_trace->th_refcnt < TR_BUF_MAX - 1);

	th_trace->th_refcnt++;
	th_trace_rrecord(th_trace);
}

/*
 * Record that curthread released a reference on the ill.  Does nothing
 * once ill_trace_disable is set.  ill_lock must be held.
 */
void
ill_untrace_ref(ill_t *ill)
{
	th_trace_t *th_trace;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if (ill->ill_trace_disable)
		return;
	th_trace = th_trace_ill_lookup(ill);
	ASSERT(th_trace != NULL);
	ASSERT(th_trace->th_refcnt > 0);

	th_trace->th_refcnt--;
	th_trace_rrecord(th_trace);
}

/*
 * Verify that this thread has no refs to the ipif and free
 * the trace buffers
 */
/* ARGSUSED */
void
ipif_thread_exit(ipif_t *ipif, void *dummy)
{
	th_trace_t	*th_trace;

	mutex_enter(&ipif->ipif_ill->ill_lock);

	th_trace = th_trace_ipif_lookup(ipif);
	if (th_trace == NULL) {
		mutex_exit(&ipif->ipif_ill->ill_lock);
		return;
	}
	ASSERT(th_trace->th_refcnt == 0);
	/* unlink th_trace and free it */
	*th_trace->th_prev = th_trace->th_next;
	if (th_trace->th_next != NULL)
		th_trace->th_next->th_prev = th_trace->th_prev;
	th_trace->th_next = NULL;
	th_trace->th_prev = NULL;
	kmem_free(th_trace, sizeof (th_trace_t));

mutex_exit(&ipif->ipif_ill->ill_lock); 6529 } 6530 6531 /* 6532 * Verify that this thread has no refs to the ill and free 6533 * the trace buffers 6534 */ 6535 /* ARGSUSED */ 6536 void 6537 ill_thread_exit(ill_t *ill, void *dummy) 6538 { 6539 th_trace_t *th_trace; 6540 6541 mutex_enter(&ill->ill_lock); 6542 6543 th_trace = th_trace_ill_lookup(ill); 6544 if (th_trace == NULL) { 6545 mutex_exit(&ill->ill_lock); 6546 return; 6547 } 6548 ASSERT(th_trace->th_refcnt == 0); 6549 /* unlink th_trace and free it */ 6550 *th_trace->th_prev = th_trace->th_next; 6551 if (th_trace->th_next != NULL) 6552 th_trace->th_next->th_prev = th_trace->th_prev; 6553 th_trace->th_next = NULL; 6554 th_trace->th_prev = NULL; 6555 kmem_free(th_trace, sizeof (th_trace_t)); 6556 6557 mutex_exit(&ill->ill_lock); 6558 } 6559 #endif 6560 6561 #ifdef ILL_DEBUG 6562 void 6563 ip_thread_exit(void) 6564 { 6565 ill_t *ill; 6566 ipif_t *ipif; 6567 ill_walk_context_t ctx; 6568 6569 rw_enter(&ill_g_lock, RW_READER); 6570 ill = ILL_START_WALK_ALL(&ctx); 6571 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6572 for (ipif = ill->ill_ipif; ipif != NULL; 6573 ipif = ipif->ipif_next) { 6574 ipif_thread_exit(ipif, NULL); 6575 } 6576 ill_thread_exit(ill, NULL); 6577 } 6578 rw_exit(&ill_g_lock); 6579 6580 ire_walk(ire_thread_exit, NULL); 6581 ndp_walk_common(&ndp4, NULL, nce_thread_exit, NULL, B_FALSE); 6582 ndp_walk_common(&ndp6, NULL, nce_thread_exit, NULL, B_FALSE); 6583 } 6584 6585 /* 6586 * Called when ipif is unplumbed or when memory alloc fails 6587 */ 6588 void 6589 ipif_trace_cleanup(ipif_t *ipif) 6590 { 6591 int i; 6592 th_trace_t *th_trace; 6593 th_trace_t *th_trace_next; 6594 6595 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6596 for (th_trace = ipif->ipif_trace[i]; th_trace != NULL; 6597 th_trace = th_trace_next) { 6598 th_trace_next = th_trace->th_next; 6599 kmem_free(th_trace, sizeof (th_trace_t)); 6600 } 6601 ipif->ipif_trace[i] = NULL; 6602 } 6603 } 6604 6605 /* 6606 * Called when ill is unplumbed or 
when memory alloc fails 6607 */ 6608 void 6609 ill_trace_cleanup(ill_t *ill) 6610 { 6611 int i; 6612 th_trace_t *th_trace; 6613 th_trace_t *th_trace_next; 6614 6615 for (i = 0; i < IP_TR_HASH_MAX; i++) { 6616 for (th_trace = ill->ill_trace[i]; th_trace != NULL; 6617 th_trace = th_trace_next) { 6618 th_trace_next = th_trace->th_next; 6619 kmem_free(th_trace, sizeof (th_trace_t)); 6620 } 6621 ill->ill_trace[i] = NULL; 6622 } 6623 } 6624 6625 #else 6626 void ip_thread_exit(void) {} 6627 #endif 6628 6629 void 6630 ipif_refhold_locked(ipif_t *ipif) 6631 { 6632 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6633 ipif->ipif_refcnt++; 6634 IPIF_TRACE_REF(ipif); 6635 } 6636 6637 void 6638 ipif_refhold(ipif_t *ipif) 6639 { 6640 ill_t *ill; 6641 6642 ill = ipif->ipif_ill; 6643 mutex_enter(&ill->ill_lock); 6644 ipif->ipif_refcnt++; 6645 IPIF_TRACE_REF(ipif); 6646 mutex_exit(&ill->ill_lock); 6647 } 6648 6649 /* 6650 * Must not be called while holding any locks. Otherwise if this is 6651 * the last reference to be released there is a chance of recursive mutex 6652 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6653 * to restart an ioctl. 6654 */ 6655 void 6656 ipif_refrele(ipif_t *ipif) 6657 { 6658 ill_t *ill; 6659 6660 ill = ipif->ipif_ill; 6661 6662 mutex_enter(&ill->ill_lock); 6663 ASSERT(ipif->ipif_refcnt != 0); 6664 ipif->ipif_refcnt--; 6665 IPIF_UNTRACE_REF(ipif); 6666 if (ipif->ipif_refcnt != 0) { 6667 mutex_exit(&ill->ill_lock); 6668 return; 6669 } 6670 6671 /* Drops the ill_lock */ 6672 ipif_ill_refrele_tail(ill); 6673 } 6674 6675 ipif_t * 6676 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6677 { 6678 ipif_t *ipif; 6679 6680 mutex_enter(&ill->ill_lock); 6681 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 6682 ipif != NULL; ipif = ipif->ipif_next) { 6683 if (!IPIF_CAN_LOOKUP(ipif)) 6684 continue; 6685 ipif_refhold_locked(ipif); 6686 mutex_exit(&ill->ill_lock); 6687 return (ipif); 6688 } 6689 mutex_exit(&ill->ill_lock); 6690 return (NULL); 6691 } 6692 6693 /* 6694 * TODO: make this table extendible at run time 6695 * Return a pointer to the mac type info for 'mac_type' 6696 */ 6697 static ip_m_t * 6698 ip_m_lookup(t_uscalar_t mac_type) 6699 { 6700 ip_m_t *ipm; 6701 6702 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6703 if (ipm->ip_m_mac_type == mac_type) 6704 return (ipm); 6705 return (NULL); 6706 } 6707 6708 /* 6709 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6710 * ipif_arg is passed in to associate it with the correct interface. 6711 * We may need to restart this operation if the ipif cannot be looked up 6712 * due to an exclusive operation that is currently in progress. The restart 6713 * entry point is specified by 'func' 6714 */ 6715 int 6716 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6717 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 6718 ire_t **ire_arg, boolean_t ioctl_msg, queue_t *q, mblk_t *mp, 6719 ipsq_func_t func, struct rtsa_s *sp) 6720 { 6721 ire_t *ire; 6722 ire_t *gw_ire = NULL; 6723 ipif_t *ipif = NULL; 6724 boolean_t ipif_refheld = B_FALSE; 6725 uint_t type; 6726 int match_flags = MATCH_IRE_TYPE; 6727 int error; 6728 tsol_gc_t *gc = NULL; 6729 tsol_gcgrp_t *gcgrp = NULL; 6730 boolean_t gcgrp_xtraref = B_FALSE; 6731 6732 ip1dbg(("ip_rt_add:")); 6733 6734 if (ire_arg != NULL) 6735 *ire_arg = NULL; 6736 6737 /* 6738 * If this is the case of RTF_HOST being set, then we set the netmask 6739 * to all ones (regardless if one was supplied). 
6740 */ 6741 if (flags & RTF_HOST) 6742 mask = IP_HOST_MASK; 6743 6744 /* 6745 * Prevent routes with a zero gateway from being created (since 6746 * interfaces can currently be plumbed and brought up no assigned 6747 * address). 6748 * For routes with RTA_SRCIFP, the gateway address can be 0.0.0.0. 6749 */ 6750 if (gw_addr == 0 && src_ipif == NULL) 6751 return (ENETUNREACH); 6752 /* 6753 * Get the ipif, if any, corresponding to the gw_addr 6754 */ 6755 if (gw_addr != 0) { 6756 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, 6757 &error); 6758 if (ipif != NULL) { 6759 if (IS_VNI(ipif->ipif_ill)) { 6760 ipif_refrele(ipif); 6761 return (EINVAL); 6762 } 6763 ipif_refheld = B_TRUE; 6764 } else if (error == EINPROGRESS) { 6765 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6766 return (EINPROGRESS); 6767 } else { 6768 error = 0; 6769 } 6770 } 6771 6772 if (ipif != NULL) { 6773 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6774 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6775 } else { 6776 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6777 } 6778 6779 /* 6780 * GateD will attempt to create routes with a loopback interface 6781 * address as the gateway and with RTF_GATEWAY set. We allow 6782 * these routes to be added, but create them as interface routes 6783 * since the gateway is an interface address. 
6784 */ 6785 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6786 flags &= ~RTF_GATEWAY; 6787 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6788 mask == IP_HOST_MASK) { 6789 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6790 ALL_ZONES, NULL, match_flags); 6791 if (ire != NULL) { 6792 ire_refrele(ire); 6793 if (ipif_refheld) 6794 ipif_refrele(ipif); 6795 return (EEXIST); 6796 } 6797 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x" 6798 "for 0x%x\n", (void *)ipif, 6799 ipif->ipif_ire_type, 6800 ntohl(ipif->ipif_lcl_addr))); 6801 ire = ire_create( 6802 (uchar_t *)&dst_addr, /* dest address */ 6803 (uchar_t *)&mask, /* mask */ 6804 (uchar_t *)&ipif->ipif_src_addr, 6805 NULL, /* no gateway */ 6806 NULL, 6807 &ipif->ipif_mtu, 6808 NULL, 6809 ipif->ipif_rq, /* recv-from queue */ 6810 NULL, /* no send-to queue */ 6811 ipif->ipif_ire_type, /* LOOPBACK */ 6812 NULL, 6813 ipif, 6814 NULL, 6815 0, 6816 0, 6817 0, 6818 (ipif->ipif_flags & IPIF_PRIVATE) ? 6819 RTF_PRIVATE : 0, 6820 &ire_uinfo_null, 6821 NULL, 6822 NULL); 6823 6824 if (ire == NULL) { 6825 if (ipif_refheld) 6826 ipif_refrele(ipif); 6827 return (ENOMEM); 6828 } 6829 error = ire_add(&ire, q, mp, func, B_FALSE); 6830 if (error == 0) 6831 goto save_ire; 6832 if (ipif_refheld) 6833 ipif_refrele(ipif); 6834 return (error); 6835 6836 } 6837 } 6838 6839 /* 6840 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6841 * and the gateway address provided is one of the system's interface 6842 * addresses. By using the routing socket interface and supplying an 6843 * RTA_IFP sockaddr with an interface index, an alternate method of 6844 * specifying an interface route to be created is available which uses 6845 * the interface index that specifies the outgoing interface rather than 6846 * the address of an outgoing interface (which may not be able to 6847 * uniquely identify an interface). 
When coupled with the RTF_GATEWAY 6848 * flag, routes can be specified which not only specify the next-hop to 6849 * be used when routing to a certain prefix, but also which outgoing 6850 * interface should be used. 6851 * 6852 * Previously, interfaces would have unique addresses assigned to them 6853 * and so the address assigned to a particular interface could be used 6854 * to identify a particular interface. One exception to this was the 6855 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6856 * 6857 * With the advent of IPv6 and its link-local addresses, this 6858 * restriction was relaxed and interfaces could share addresses between 6859 * themselves. In fact, typically all of the link-local interfaces on 6860 * an IPv6 node or router will have the same link-local address. In 6861 * order to differentiate between these interfaces, the use of an 6862 * interface index is necessary and this index can be carried inside a 6863 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6864 * of using the interface index, however, is that all of the ipif's that 6865 * are part of an ill have the same index and so the RTA_IFP sockaddr 6866 * cannot be used to differentiate between ipif's (or logical 6867 * interfaces) that belong to the same ill (physical interface). 6868 * 6869 * For example, in the following case involving IPv4 interfaces and 6870 * logical interfaces 6871 * 6872 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6873 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6874 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6875 * 6876 * the ipif's corresponding to each of these interface routes can be 6877 * uniquely identified by the "gateway" (actually interface address). 
6878 * 6879 * In this case involving multiple IPv6 default routes to a particular 6880 * link-local gateway, the use of RTA_IFP is necessary to specify which 6881 * default route is of interest: 6882 * 6883 * default fe80::123:4567:89ab:cdef U if0 6884 * default fe80::123:4567:89ab:cdef U if1 6885 */ 6886 6887 /* RTF_GATEWAY not set */ 6888 if (!(flags & RTF_GATEWAY)) { 6889 queue_t *stq; 6890 queue_t *rfq = NULL; 6891 ill_t *in_ill = NULL; 6892 6893 if (sp != NULL) { 6894 ip2dbg(("ip_rt_add: gateway security attributes " 6895 "cannot be set with interface route\n")); 6896 if (ipif_refheld) 6897 ipif_refrele(ipif); 6898 return (EINVAL); 6899 } 6900 6901 /* 6902 * As the interface index specified with the RTA_IFP sockaddr is 6903 * the same for all ipif's off of an ill, the matching logic 6904 * below uses MATCH_IRE_ILL if such an index was specified. 6905 * This means that routes sharing the same prefix when added 6906 * using a RTA_IFP sockaddr must have distinct interface 6907 * indices (namely, they must be on distinct ill's). 6908 * 6909 * On the other hand, since the gateway address will usually be 6910 * different for each ipif on the system, the matching logic 6911 * uses MATCH_IRE_IPIF in the case of a traditional interface 6912 * route. This means that interface routes for the same prefix 6913 * can be created if they belong to distinct ipif's and if a 6914 * RTA_IFP sockaddr is not present. 
6915 */ 6916 if (ipif_arg != NULL) { 6917 if (ipif_refheld) { 6918 ipif_refrele(ipif); 6919 ipif_refheld = B_FALSE; 6920 } 6921 ipif = ipif_arg; 6922 match_flags |= MATCH_IRE_ILL; 6923 } else { 6924 /* 6925 * Check the ipif corresponding to the gw_addr 6926 */ 6927 if (ipif == NULL) 6928 return (ENETUNREACH); 6929 match_flags |= MATCH_IRE_IPIF; 6930 } 6931 ASSERT(ipif != NULL); 6932 /* 6933 * If src_ipif is not NULL, we have to create 6934 * an ire with non-null ire_in_ill value 6935 */ 6936 if (src_ipif != NULL) { 6937 in_ill = src_ipif->ipif_ill; 6938 } 6939 6940 /* 6941 * We check for an existing entry at this point. 6942 * 6943 * Since a netmask isn't passed in via the ioctl interface 6944 * (SIOCADDRT), we don't check for a matching netmask in that 6945 * case. 6946 */ 6947 if (!ioctl_msg) 6948 match_flags |= MATCH_IRE_MASK; 6949 if (src_ipif != NULL) { 6950 /* Look up in the special table */ 6951 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 6952 ipif, src_ipif->ipif_ill, match_flags); 6953 } else { 6954 ire = ire_ftable_lookup(dst_addr, mask, 0, 6955 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 6956 NULL, match_flags); 6957 } 6958 if (ire != NULL) { 6959 ire_refrele(ire); 6960 if (ipif_refheld) 6961 ipif_refrele(ipif); 6962 return (EEXIST); 6963 } 6964 6965 if (src_ipif != NULL) { 6966 /* 6967 * Create the special ire for the IRE table 6968 * which hangs out of ire_in_ill. This ire 6969 * is in-between IRE_CACHE and IRE_INTERFACE. 6970 * Thus rfq is non-NULL. 6971 */ 6972 rfq = ipif->ipif_rq; 6973 } 6974 /* Create the usual interface ires */ 6975 6976 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6977 ? ipif->ipif_rq : ipif->ipif_wq; 6978 6979 /* 6980 * Create a copy of the IRE_LOOPBACK, 6981 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6982 * the modified address and netmask. 
6983 */ 6984 ire = ire_create( 6985 (uchar_t *)&dst_addr, 6986 (uint8_t *)&mask, 6987 (uint8_t *)&ipif->ipif_src_addr, 6988 NULL, 6989 NULL, 6990 &ipif->ipif_mtu, 6991 NULL, 6992 rfq, 6993 stq, 6994 ipif->ipif_net_type, 6995 ipif->ipif_resolver_mp, 6996 ipif, 6997 in_ill, 6998 0, 6999 0, 7000 0, 7001 flags, 7002 &ire_uinfo_null, 7003 NULL, 7004 NULL); 7005 if (ire == NULL) { 7006 if (ipif_refheld) 7007 ipif_refrele(ipif); 7008 return (ENOMEM); 7009 } 7010 7011 /* 7012 * Some software (for example, GateD and Sun Cluster) attempts 7013 * to create (what amount to) IRE_PREFIX routes with the 7014 * loopback address as the gateway. This is primarily done to 7015 * set up prefixes with the RTF_REJECT flag set (for example, 7016 * when generating aggregate routes.) 7017 * 7018 * If the IRE type (as defined by ipif->ipif_net_type) is 7019 * IRE_LOOPBACK, then we map the request into a 7020 * IRE_IF_NORESOLVER. 7021 * 7022 * Needless to say, the real IRE_LOOPBACK is NOT created by this 7023 * routine, but rather using ire_create() directly. 7024 * 7025 */ 7026 if (ipif->ipif_net_type == IRE_LOOPBACK) 7027 ire->ire_type = IRE_IF_NORESOLVER; 7028 7029 error = ire_add(&ire, q, mp, func, B_FALSE); 7030 if (error == 0) 7031 goto save_ire; 7032 7033 /* 7034 * In the result of failure, ire_add() will have already 7035 * deleted the ire in question, so there is no need to 7036 * do that here. 7037 */ 7038 if (ipif_refheld) 7039 ipif_refrele(ipif); 7040 return (error); 7041 } 7042 if (ipif_refheld) { 7043 ipif_refrele(ipif); 7044 ipif_refheld = B_FALSE; 7045 } 7046 7047 if (src_ipif != NULL) { 7048 /* RTA_SRCIFP is not supported on RTF_GATEWAY */ 7049 ip2dbg(("ip_rt_add: SRCIF cannot be set with gateway route\n")); 7050 return (EINVAL); 7051 } 7052 /* 7053 * Get an interface IRE for the specified gateway. 7054 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 7055 * gateway, it is currently unreachable and we fail the request 7056 * accordingly. 
7057 */ 7058 ipif = ipif_arg; 7059 if (ipif_arg != NULL) 7060 match_flags |= MATCH_IRE_ILL; 7061 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 7062 ALL_ZONES, 0, NULL, match_flags); 7063 if (gw_ire == NULL) 7064 return (ENETUNREACH); 7065 7066 /* 7067 * We create one of three types of IREs as a result of this request 7068 * based on the netmask. A netmask of all ones (which is automatically 7069 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 7070 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 7071 * created. Otherwise, an IRE_PREFIX route is created for the 7072 * destination prefix. 7073 */ 7074 if (mask == IP_HOST_MASK) 7075 type = IRE_HOST; 7076 else if (mask == 0) 7077 type = IRE_DEFAULT; 7078 else 7079 type = IRE_PREFIX; 7080 7081 /* check for a duplicate entry */ 7082 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7083 NULL, ALL_ZONES, 0, NULL, 7084 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW); 7085 if (ire != NULL) { 7086 ire_refrele(gw_ire); 7087 ire_refrele(ire); 7088 return (EEXIST); 7089 } 7090 7091 /* Security attribute exists */ 7092 if (sp != NULL) { 7093 tsol_gcgrp_addr_t ga; 7094 7095 /* find or create the gateway credentials group */ 7096 ga.ga_af = AF_INET; 7097 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 7098 7099 /* we hold reference to it upon success */ 7100 gcgrp = gcgrp_lookup(&ga, B_TRUE); 7101 if (gcgrp == NULL) { 7102 ire_refrele(gw_ire); 7103 return (ENOMEM); 7104 } 7105 7106 /* 7107 * Create and add the security attribute to the group; a 7108 * reference to the group is made upon allocating a new 7109 * entry successfully. If it finds an already-existing 7110 * entry for the security attribute in the group, it simply 7111 * returns it and no new reference is made to the group. 
7112 */ 7113 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 7114 if (gc == NULL) { 7115 /* release reference held by gcgrp_lookup */ 7116 GCGRP_REFRELE(gcgrp); 7117 ire_refrele(gw_ire); 7118 return (ENOMEM); 7119 } 7120 } 7121 7122 /* Create the IRE. */ 7123 ire = ire_create( 7124 (uchar_t *)&dst_addr, /* dest address */ 7125 (uchar_t *)&mask, /* mask */ 7126 /* src address assigned by the caller? */ 7127 (uchar_t *)(((src_addr != INADDR_ANY) && 7128 (flags & RTF_SETSRC)) ? &src_addr : NULL), 7129 (uchar_t *)&gw_addr, /* gateway address */ 7130 NULL, /* no in-srcaddress */ 7131 &gw_ire->ire_max_frag, 7132 NULL, /* no Fast Path header */ 7133 NULL, /* no recv-from queue */ 7134 NULL, /* no send-to queue */ 7135 (ushort_t)type, /* IRE type */ 7136 NULL, 7137 ipif_arg, 7138 NULL, 7139 0, 7140 0, 7141 0, 7142 flags, 7143 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 7144 gc, /* security attribute */ 7145 NULL); 7146 /* 7147 * The ire holds a reference to the 'gc' and the 'gc' holds a 7148 * reference to the 'gcgrp'. We can now release the extra reference 7149 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 7150 */ 7151 if (gcgrp_xtraref) 7152 GCGRP_REFRELE(gcgrp); 7153 if (ire == NULL) { 7154 if (gc != NULL) 7155 GC_REFRELE(gc); 7156 ire_refrele(gw_ire); 7157 return (ENOMEM); 7158 } 7159 7160 /* 7161 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 7162 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 7163 */ 7164 7165 /* Add the new IRE. */ 7166 error = ire_add(&ire, q, mp, func, B_FALSE); 7167 if (error != 0) { 7168 /* 7169 * In the result of failure, ire_add() will have already 7170 * deleted the ire in question, so there is no need to 7171 * do that here. 7172 */ 7173 ire_refrele(gw_ire); 7174 return (error); 7175 } 7176 7177 if (flags & RTF_MULTIRT) { 7178 /* 7179 * Invoke the CGTP (multirouting) filtering module 7180 * to add the dst address in the filtering database. 
7181 * Replicated inbound packets coming from that address 7182 * will be filtered to discard the duplicates. 7183 * It is not necessary to call the CGTP filter hook 7184 * when the dst address is a broadcast or multicast, 7185 * because an IP source address cannot be a broadcast 7186 * or a multicast. 7187 */ 7188 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 7189 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 7190 if (ire_dst != NULL) { 7191 ip_cgtp_bcast_add(ire, ire_dst); 7192 ire_refrele(ire_dst); 7193 goto save_ire; 7194 } 7195 if ((ip_cgtp_filter_ops != NULL) && !CLASSD(ire->ire_addr)) { 7196 int res = ip_cgtp_filter_ops->cfo_add_dest_v4( 7197 ire->ire_addr, 7198 ire->ire_gateway_addr, 7199 ire->ire_src_addr, 7200 gw_ire->ire_src_addr); 7201 if (res != 0) { 7202 ire_refrele(gw_ire); 7203 ire_delete(ire); 7204 return (res); 7205 } 7206 } 7207 } 7208 7209 /* 7210 * Now that the prefix IRE entry has been created, delete any 7211 * existing gateway IRE cache entries as well as any IRE caches 7212 * using the gateway, and force them to be created through 7213 * ip_newroute. 7214 */ 7215 if (gc != NULL) { 7216 ASSERT(gcgrp != NULL); 7217 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES); 7218 } 7219 7220 save_ire: 7221 if (gw_ire != NULL) { 7222 ire_refrele(gw_ire); 7223 } 7224 /* 7225 * We do not do save_ire for the routes added with RTA_SRCIFP 7226 * flag. This route is only added and deleted by mipagent. 7227 * So, for simplicity of design, we refrain from saving 7228 * ires that are created with srcif value. This may change 7229 * in future if we find more usage of srcifp feature. 7230 */ 7231 if (ipif != NULL && src_ipif == NULL) { 7232 /* 7233 * Save enough information so that we can recreate the IRE if 7234 * the interface goes down and then up. The metrics associated 7235 * with the route will be saved as well when rts_setmetrics() is 7236 * called after the IRE has been created. 
In the case where 7237 * memory cannot be allocated, none of this information will be 7238 * saved. 7239 */ 7240 ipif_save_ire(ipif, ire); 7241 } 7242 if (ioctl_msg) 7243 ip_rts_rtmsg(RTM_OLDADD, ire, 0); 7244 if (ire_arg != NULL) { 7245 /* 7246 * Store the ire that was successfully added into where ire_arg 7247 * points to so that callers don't have to look it up 7248 * themselves (but they are responsible for ire_refrele()ing 7249 * the ire when they are finished with it). 7250 */ 7251 *ire_arg = ire; 7252 } else { 7253 ire_refrele(ire); /* Held in ire_add */ 7254 } 7255 if (ipif_refheld) 7256 ipif_refrele(ipif); 7257 return (0); 7258 } 7259 7260 /* 7261 * ip_rt_delete is called to delete an IPv4 route. 7262 * ipif_arg is passed in to associate it with the correct interface. 7263 * src_ipif is passed to associate the incoming interface of the packet. 7264 * We may need to restart this operation if the ipif cannot be looked up 7265 * due to an exclusive operation that is currently in progress. The restart 7266 * entry point is specified by 'func' 7267 */ 7268 /* ARGSUSED4 */ 7269 int 7270 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 7271 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, ipif_t *src_ipif, 7272 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func) 7273 { 7274 ire_t *ire = NULL; 7275 ipif_t *ipif; 7276 boolean_t ipif_refheld = B_FALSE; 7277 uint_t type; 7278 uint_t match_flags = MATCH_IRE_TYPE; 7279 int err = 0; 7280 7281 ip1dbg(("ip_rt_delete:")); 7282 /* 7283 * If this is the case of RTF_HOST being set, then we set the netmask 7284 * to all ones. Otherwise, we use the netmask if one was supplied. 
7285 */ 7286 if (flags & RTF_HOST) { 7287 mask = IP_HOST_MASK; 7288 match_flags |= MATCH_IRE_MASK; 7289 } else if (rtm_addrs & RTA_NETMASK) { 7290 match_flags |= MATCH_IRE_MASK; 7291 } 7292 7293 /* 7294 * Note that RTF_GATEWAY is never set on a delete, therefore 7295 * we check if the gateway address is one of our interfaces first, 7296 * and fall back on RTF_GATEWAY routes. 7297 * 7298 * This makes it possible to delete an original 7299 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 7300 * 7301 * As the interface index specified with the RTA_IFP sockaddr is the 7302 * same for all ipif's off of an ill, the matching logic below uses 7303 * MATCH_IRE_ILL if such an index was specified. This means a route 7304 * sharing the same prefix and interface index as the the route 7305 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 7306 * is specified in the request. 7307 * 7308 * On the other hand, since the gateway address will usually be 7309 * different for each ipif on the system, the matching logic 7310 * uses MATCH_IRE_IPIF in the case of a traditional interface 7311 * route. This means that interface routes for the same prefix can be 7312 * uniquely identified if they belong to distinct ipif's and if a 7313 * RTA_IFP sockaddr is not present. 7314 * 7315 * For more detail on specifying routes by gateway address and by 7316 * interface index, see the comments in ip_rt_add(). 7317 * gw_addr could be zero in some cases when both RTA_SRCIFP and 7318 * RTA_IFP are specified. If RTA_SRCIFP is specified and both 7319 * RTA_IFP and gateway_addr are NULL/zero, then delete will not 7320 * succeed. 7321 */ 7322 if (src_ipif != NULL) { 7323 if (ipif_arg == NULL && gw_addr != 0) { 7324 ipif_arg = ipif_lookup_interface(gw_addr, dst_addr, 7325 q, mp, func, &err); 7326 if (ipif_arg != NULL) 7327 ipif_refheld = B_TRUE; 7328 } 7329 if (ipif_arg == NULL) { 7330 err = (err == EINPROGRESS) ? 
err : ESRCH; 7331 return (err); 7332 } 7333 ipif = ipif_arg; 7334 } else { 7335 ipif = ipif_lookup_interface(gw_addr, dst_addr, 7336 q, mp, func, &err); 7337 if (ipif != NULL) 7338 ipif_refheld = B_TRUE; 7339 else if (err == EINPROGRESS) 7340 return (err); 7341 else 7342 err = 0; 7343 } 7344 if (ipif != NULL) { 7345 if (ipif_arg != NULL) { 7346 if (ipif_refheld) { 7347 ipif_refrele(ipif); 7348 ipif_refheld = B_FALSE; 7349 } 7350 ipif = ipif_arg; 7351 match_flags |= MATCH_IRE_ILL; 7352 } else { 7353 match_flags |= MATCH_IRE_IPIF; 7354 } 7355 if (src_ipif != NULL) { 7356 ire = ire_srcif_table_lookup(dst_addr, IRE_INTERFACE, 7357 ipif, src_ipif->ipif_ill, match_flags); 7358 } else { 7359 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 7360 ire = ire_ctable_lookup(dst_addr, 0, 7361 IRE_LOOPBACK, ipif, ALL_ZONES, NULL, 7362 match_flags); 7363 } 7364 if (ire == NULL) { 7365 ire = ire_ftable_lookup(dst_addr, mask, 0, 7366 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, 7367 NULL, match_flags); 7368 } 7369 } 7370 } 7371 7372 if (ire == NULL) { 7373 /* 7374 * At this point, the gateway address is not one of our own 7375 * addresses or a matching interface route was not found. We 7376 * set the IRE type to lookup based on whether 7377 * this is a host route, a default route or just a prefix. 7378 * 7379 * If an ipif_arg was passed in, then the lookup is based on an 7380 * interface index so MATCH_IRE_ILL is added to match_flags. 7381 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 7382 * set as the route being looked up is not a traditional 7383 * interface route. 7384 * Since we do not add gateway route with srcipif, we don't 7385 * expect to find it either. 
7386 */ 7387 if (src_ipif != NULL) { 7388 if (ipif_refheld) 7389 ipif_refrele(ipif); 7390 return (ESRCH); 7391 } else { 7392 match_flags &= ~MATCH_IRE_IPIF; 7393 match_flags |= MATCH_IRE_GW; 7394 if (ipif_arg != NULL) 7395 match_flags |= MATCH_IRE_ILL; 7396 if (mask == IP_HOST_MASK) 7397 type = IRE_HOST; 7398 else if (mask == 0) 7399 type = IRE_DEFAULT; 7400 else 7401 type = IRE_PREFIX; 7402 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, 7403 ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags); 7404 } 7405 } 7406 7407 if (ipif_refheld) 7408 ipif_refrele(ipif); 7409 7410 /* ipif is not refheld anymore */ 7411 if (ire == NULL) 7412 return (ESRCH); 7413 7414 if (ire->ire_flags & RTF_MULTIRT) { 7415 /* 7416 * Invoke the CGTP (multirouting) filtering module 7417 * to remove the dst address from the filtering database. 7418 * Packets coming from that address will no longer be 7419 * filtered to remove duplicates. 7420 */ 7421 if (ip_cgtp_filter_ops != NULL) { 7422 err = ip_cgtp_filter_ops->cfo_del_dest_v4(ire->ire_addr, 7423 ire->ire_gateway_addr); 7424 } 7425 ip_cgtp_bcast_delete(ire); 7426 } 7427 7428 ipif = ire->ire_ipif; 7429 /* 7430 * Removing from ipif_saved_ire_mp is not necessary 7431 * when src_ipif being non-NULL. ip_rt_add does not 7432 * save the ires which src_ipif being non-NULL. 7433 */ 7434 if (ipif != NULL && src_ipif == NULL) { 7435 ipif_remove_ire(ipif, ire); 7436 } 7437 if (ioctl_msg) 7438 ip_rts_rtmsg(RTM_OLDDEL, ire, 0); 7439 ire_delete(ire); 7440 ire_refrele(ire); 7441 return (err); 7442 } 7443 7444 /* 7445 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
7446 */ 7447 /* ARGSUSED */ 7448 int 7449 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7450 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7451 { 7452 ipaddr_t dst_addr; 7453 ipaddr_t gw_addr; 7454 ipaddr_t mask; 7455 int error = 0; 7456 mblk_t *mp1; 7457 struct rtentry *rt; 7458 ipif_t *ipif = NULL; 7459 7460 ip1dbg(("ip_siocaddrt:")); 7461 /* Existence of mp1 verified in ip_wput_nondata */ 7462 mp1 = mp->b_cont->b_cont; 7463 rt = (struct rtentry *)mp1->b_rptr; 7464 7465 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7466 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7467 7468 /* 7469 * If the RTF_HOST flag is on, this is a request to assign a gateway 7470 * to a particular host address. In this case, we set the netmask to 7471 * all ones for the particular destination address. Otherwise, 7472 * determine the netmask to be used based on dst_addr and the interfaces 7473 * in use. 7474 */ 7475 if (rt->rt_flags & RTF_HOST) { 7476 mask = IP_HOST_MASK; 7477 } else { 7478 /* 7479 * Note that ip_subnet_mask returns a zero mask in the case of 7480 * default (an all-zeroes address). 7481 */ 7482 mask = ip_subnet_mask(dst_addr, &ipif); 7483 } 7484 7485 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7486 NULL, B_TRUE, q, mp, ip_process_ioctl, NULL); 7487 if (ipif != NULL) 7488 ipif_refrele(ipif); 7489 return (error); 7490 } 7491 7492 /* 7493 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 
7494 */ 7495 /* ARGSUSED */ 7496 int 7497 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7498 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7499 { 7500 ipaddr_t dst_addr; 7501 ipaddr_t gw_addr; 7502 ipaddr_t mask; 7503 int error; 7504 mblk_t *mp1; 7505 struct rtentry *rt; 7506 ipif_t *ipif = NULL; 7507 7508 ip1dbg(("ip_siocdelrt:")); 7509 /* Existence of mp1 verified in ip_wput_nondata */ 7510 mp1 = mp->b_cont->b_cont; 7511 rt = (struct rtentry *)mp1->b_rptr; 7512 7513 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7514 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7515 7516 /* 7517 * If the RTF_HOST flag is on, this is a request to delete a gateway 7518 * to a particular host address. In this case, we set the netmask to 7519 * all ones for the particular destination address. Otherwise, 7520 * determine the netmask to be used based on dst_addr and the interfaces 7521 * in use. 7522 */ 7523 if (rt->rt_flags & RTF_HOST) { 7524 mask = IP_HOST_MASK; 7525 } else { 7526 /* 7527 * Note that ip_subnet_mask returns a zero mask in the case of 7528 * default (an all-zeroes address). 7529 */ 7530 mask = ip_subnet_mask(dst_addr, &ipif); 7531 } 7532 7533 error = ip_rt_delete(dst_addr, mask, gw_addr, 7534 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, NULL, 7535 B_TRUE, q, mp, ip_process_ioctl); 7536 if (ipif != NULL) 7537 ipif_refrele(ipif); 7538 return (error); 7539 } 7540 7541 /* 7542 * Enqueue the mp onto the ipsq, chained by b_next. 7543 * b_prev stores the function to be executed later, and b_queue the queue 7544 * where this mp originated. 
7545 */ 7546 void 7547 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7548 ill_t *pending_ill) 7549 { 7550 conn_t *connp = NULL; 7551 7552 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7553 ASSERT(func != NULL); 7554 7555 mp->b_queue = q; 7556 mp->b_prev = (void *)func; 7557 mp->b_next = NULL; 7558 7559 switch (type) { 7560 case CUR_OP: 7561 if (ipsq->ipsq_mptail != NULL) { 7562 ASSERT(ipsq->ipsq_mphead != NULL); 7563 ipsq->ipsq_mptail->b_next = mp; 7564 } else { 7565 ASSERT(ipsq->ipsq_mphead == NULL); 7566 ipsq->ipsq_mphead = mp; 7567 } 7568 ipsq->ipsq_mptail = mp; 7569 break; 7570 7571 case NEW_OP: 7572 if (ipsq->ipsq_xopq_mptail != NULL) { 7573 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7574 ipsq->ipsq_xopq_mptail->b_next = mp; 7575 } else { 7576 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7577 ipsq->ipsq_xopq_mphead = mp; 7578 } 7579 ipsq->ipsq_xopq_mptail = mp; 7580 break; 7581 default: 7582 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7583 } 7584 7585 if (CONN_Q(q) && pending_ill != NULL) { 7586 connp = Q_TO_CONN(q); 7587 7588 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7589 connp->conn_oper_pending_ill = pending_ill; 7590 } 7591 } 7592 7593 /* 7594 * Return the mp at the head of the ipsq. After emptying the ipsq 7595 * look at the next ioctl, if this ioctl is complete. Otherwise 7596 * return, we will resume when we complete the current ioctl. 7597 * The current ioctl will wait till it gets a response from the 7598 * driver below. 
7599 */ 7600 static mblk_t * 7601 ipsq_dq(ipsq_t *ipsq) 7602 { 7603 mblk_t *mp; 7604 7605 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7606 7607 mp = ipsq->ipsq_mphead; 7608 if (mp != NULL) { 7609 ipsq->ipsq_mphead = mp->b_next; 7610 if (ipsq->ipsq_mphead == NULL) 7611 ipsq->ipsq_mptail = NULL; 7612 mp->b_next = NULL; 7613 return (mp); 7614 } 7615 if (ipsq->ipsq_current_ipif != NULL) 7616 return (NULL); 7617 mp = ipsq->ipsq_xopq_mphead; 7618 if (mp != NULL) { 7619 ipsq->ipsq_xopq_mphead = mp->b_next; 7620 if (ipsq->ipsq_xopq_mphead == NULL) 7621 ipsq->ipsq_xopq_mptail = NULL; 7622 mp->b_next = NULL; 7623 return (mp); 7624 } 7625 return (NULL); 7626 } 7627 7628 /* 7629 * Enter the ipsq corresponding to ill, by waiting synchronously till 7630 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7631 * will have to drain completely before ipsq_enter returns success. 7632 * ipsq_current_ipif will be set if some exclusive ioctl is in progress, 7633 * and the ipsq_exit logic will start the next enqueued ioctl after 7634 * completion of the current ioctl. If 'force' is used, we don't wait 7635 * for the enqueued ioctls. This is needed when a conn_close wants to 7636 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7637 * of an ill can also use this option. But we dont' use it currently. 7638 */ 7639 #define ENTER_SQ_WAIT_TICKS 100 7640 boolean_t 7641 ipsq_enter(ill_t *ill, boolean_t force) 7642 { 7643 ipsq_t *ipsq; 7644 boolean_t waited_enough = B_FALSE; 7645 7646 /* 7647 * Holding the ill_lock prevents <ill-ipsq> assocs from changing. 7648 * Since the <ill-ipsq> assocs could change while we wait for the 7649 * writer, it is easier to wait on a fixed global rather than try to 7650 * cv_wait on a changing ipsq. 
7651 */ 7652 mutex_enter(&ill->ill_lock); 7653 for (;;) { 7654 if (ill->ill_state_flags & ILL_CONDEMNED) { 7655 mutex_exit(&ill->ill_lock); 7656 return (B_FALSE); 7657 } 7658 7659 ipsq = ill->ill_phyint->phyint_ipsq; 7660 mutex_enter(&ipsq->ipsq_lock); 7661 if (ipsq->ipsq_writer == NULL && 7662 (ipsq->ipsq_current_ipif == NULL || waited_enough)) { 7663 break; 7664 } else if (ipsq->ipsq_writer != NULL) { 7665 mutex_exit(&ipsq->ipsq_lock); 7666 cv_wait(&ill->ill_cv, &ill->ill_lock); 7667 } else { 7668 mutex_exit(&ipsq->ipsq_lock); 7669 if (force) { 7670 (void) cv_timedwait(&ill->ill_cv, 7671 &ill->ill_lock, 7672 lbolt + ENTER_SQ_WAIT_TICKS); 7673 waited_enough = B_TRUE; 7674 continue; 7675 } else { 7676 cv_wait(&ill->ill_cv, &ill->ill_lock); 7677 } 7678 } 7679 } 7680 7681 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); 7682 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7683 ipsq->ipsq_writer = curthread; 7684 ipsq->ipsq_reentry_cnt++; 7685 #ifdef ILL_DEBUG 7686 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7687 #endif 7688 mutex_exit(&ipsq->ipsq_lock); 7689 mutex_exit(&ill->ill_lock); 7690 return (B_TRUE); 7691 } 7692 7693 /* 7694 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7695 * certain critical operations like plumbing (i.e. most set ioctls), 7696 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP 7697 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per 7698 * IPMP group. The ipsq serializes exclusive ioctls issued by applications 7699 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple 7700 * threads executing in the ipsq. Responses from the driver pertain to the 7701 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated 7702 * as part of bringing up the interface) and are enqueued in ipsq_mphead. 
 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must make sure that neither the specified reentry point (which is
 * called later, when the ipsq is empty) nor any code path starting from
 * that reentry point ever tries to enter the ipsq again. Otherwise it can
 * lead to an infinite loop. The reentry point ip_rput_dlpi_writer is an
 * example.
 * When the thread that is currently exclusive finishes, it (ipsq_exit)
 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls
 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit
 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (arp or the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp
 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the
 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
 * ipsq_current_ipif is clear, which happens only on ioctl completion.
 */

/*
 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of
 * ipif or ill can be specified). The caller ensures ipif or ill is valid by
 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued
 * on the ipsq for later completion.
7727 */ 7728 ipsq_t * 7729 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7730 ipsq_func_t func, int type, boolean_t reentry_ok) 7731 { 7732 ipsq_t *ipsq; 7733 7734 /* Only 1 of ipif or ill can be specified */ 7735 ASSERT((ipif != NULL) ^ (ill != NULL)); 7736 if (ipif != NULL) 7737 ill = ipif->ipif_ill; 7738 7739 /* 7740 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 7741 * ipsq of an ill can't change when ill_lock is held. 7742 */ 7743 GRAB_CONN_LOCK(q); 7744 mutex_enter(&ill->ill_lock); 7745 ipsq = ill->ill_phyint->phyint_ipsq; 7746 mutex_enter(&ipsq->ipsq_lock); 7747 7748 /* 7749 * 1. Enter the ipsq if we are already writer and reentry is ok. 7750 * (Note: If the caller does not specify reentry_ok then neither 7751 * 'func' nor any of its callees must ever attempt to enter the ipsq 7752 * again. Otherwise it can lead to an infinite loop 7753 * 2. Enter the ipsq if there is no current writer and this attempted 7754 * entry is part of the current ioctl or operation 7755 * 3. Enter the ipsq if there is no current writer and this is a new 7756 * ioctl (or operation) and the ioctl (or operation) queue is 7757 * empty and there is no ioctl (or operation) currently in progress 7758 */ 7759 if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || 7760 (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && 7761 ipsq->ipsq_current_ipif == NULL))) || 7762 (ipsq->ipsq_writer == curthread && reentry_ok)) { 7763 /* Success. 
*/ 7764 ipsq->ipsq_reentry_cnt++; 7765 ipsq->ipsq_writer = curthread; 7766 mutex_exit(&ipsq->ipsq_lock); 7767 mutex_exit(&ill->ill_lock); 7768 RELEASE_CONN_LOCK(q); 7769 #ifdef ILL_DEBUG 7770 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IP_STACK_DEPTH); 7771 #endif 7772 return (ipsq); 7773 } 7774 7775 ipsq_enq(ipsq, q, mp, func, type, ill); 7776 7777 mutex_exit(&ipsq->ipsq_lock); 7778 mutex_exit(&ill->ill_lock); 7779 RELEASE_CONN_LOCK(q); 7780 return (NULL); 7781 } 7782 7783 /* 7784 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of 7785 * ipif or ill can be specified). The caller ensures ipif or ill is valid by 7786 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7787 * completion. 7788 * 7789 * This function does a refrele on the ipif/ill. 7790 */ 7791 void 7792 qwriter_ip(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7793 ipsq_func_t func, int type, boolean_t reentry_ok) 7794 { 7795 ipsq_t *ipsq; 7796 7797 ipsq = ipsq_try_enter(ipif, ill, q, mp, func, type, reentry_ok); 7798 /* 7799 * Caller must have done a refhold on the ipif. ipif_refrele 7800 * happens on the passed ipif. We can do this since we are 7801 * already exclusive, or we won't access ipif henceforth, Both 7802 * this func and caller will just return if we ipsq_try_enter 7803 * fails above. This is needed because func needs to 7804 * see the correct refcount. Eg. removeif can work only then. 7805 */ 7806 if (ipif != NULL) 7807 ipif_refrele(ipif); 7808 else 7809 ill_refrele(ill); 7810 if (ipsq != NULL) { 7811 (*func)(ipsq, q, mp, NULL); 7812 ipsq_exit(ipsq, B_TRUE, B_TRUE); 7813 } 7814 } 7815 7816 /* 7817 * If there are more than ILL_GRP_CNT ills in a group, 7818 * we use kmem alloc'd buffers, else use the stack 7819 */ 7820 #define ILL_GRP_CNT 14 7821 /* 7822 * Drain the ipsq, if there are messages on it, and then leave the ipsq. 7823 * Called by a thread that is currently exclusive on this ipsq. 
7824 */ 7825 void 7826 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer) 7827 { 7828 queue_t *q; 7829 mblk_t *mp; 7830 ipsq_func_t func; 7831 int next; 7832 ill_t **ill_list = NULL; 7833 size_t ill_list_size = 0; 7834 int cnt = 0; 7835 boolean_t need_ipsq_free = B_FALSE; 7836 7837 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7838 mutex_enter(&ipsq->ipsq_lock); 7839 ASSERT(ipsq->ipsq_reentry_cnt >= 1); 7840 if (ipsq->ipsq_reentry_cnt != 1) { 7841 ipsq->ipsq_reentry_cnt--; 7842 mutex_exit(&ipsq->ipsq_lock); 7843 return; 7844 } 7845 7846 mp = ipsq_dq(ipsq); 7847 while (mp != NULL) { 7848 again: 7849 mutex_exit(&ipsq->ipsq_lock); 7850 func = (ipsq_func_t)mp->b_prev; 7851 q = (queue_t *)mp->b_queue; 7852 mp->b_prev = NULL; 7853 mp->b_queue = NULL; 7854 7855 /* 7856 * If 'q' is an conn queue, it is valid, since we did a 7857 * a refhold on the connp, at the start of the ioctl. 7858 * If 'q' is an ill queue, it is valid, since close of an 7859 * ill will clean up the 'ipsq'. 7860 */ 7861 (*func)(ipsq, q, mp, NULL); 7862 7863 mutex_enter(&ipsq->ipsq_lock); 7864 mp = ipsq_dq(ipsq); 7865 } 7866 7867 mutex_exit(&ipsq->ipsq_lock); 7868 7869 /* 7870 * Need to grab the locks in the right order. Need to 7871 * atomically check (under ipsq_lock) that there are no 7872 * messages before relinquishing the ipsq. Also need to 7873 * atomically wakeup waiters on ill_cv while holding ill_lock. 7874 * Holding ill_g_lock ensures that ipsq list of ills is stable. 7875 * If we need to call ill_split_ipsq and change <ill-ipsq> we need 7876 * to grab ill_g_lock as writer. 7877 */ 7878 rw_enter(&ill_g_lock, ipsq->ipsq_split ? 
RW_WRITER : RW_READER); 7879 7880 /* ipsq_refs can't change while ill_g_lock is held as reader */ 7881 if (ipsq->ipsq_refs != 0) { 7882 /* At most 2 ills v4/v6 per phyint */ 7883 cnt = ipsq->ipsq_refs << 1; 7884 ill_list_size = cnt * sizeof (ill_t *); 7885 /* 7886 * If memory allocation fails, we will do the split 7887 * the next time ipsq_exit is called for whatever reason. 7888 * As long as the ipsq_split flag is set the need to 7889 * split is remembered. 7890 */ 7891 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 7892 if (ill_list != NULL) 7893 cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); 7894 } 7895 mutex_enter(&ipsq->ipsq_lock); 7896 mp = ipsq_dq(ipsq); 7897 if (mp != NULL) { 7898 /* oops, some message has landed up, we can't get out */ 7899 if (ill_list != NULL) 7900 ill_unlock_ills(ill_list, cnt); 7901 rw_exit(&ill_g_lock); 7902 if (ill_list != NULL) 7903 kmem_free(ill_list, ill_list_size); 7904 ill_list = NULL; 7905 ill_list_size = 0; 7906 cnt = 0; 7907 goto again; 7908 } 7909 7910 /* 7911 * Split only if no ioctl is pending and if memory alloc succeeded 7912 * above. 7913 */ 7914 if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && 7915 ill_list != NULL) { 7916 /* 7917 * No new ill can join this ipsq since we are holding the 7918 * ill_g_lock. Hence ill_split_ipsq can safely traverse the 7919 * ipsq. ill_split_ipsq may fail due to memory shortage. 7920 * If so we will retry on the next ipsq_exit. 7921 */ 7922 ipsq->ipsq_split = ill_split_ipsq(ipsq); 7923 } 7924 7925 /* 7926 * We are holding the ipsq lock, hence no new messages can 7927 * land up on the ipsq, and there are no messages currently. 7928 * Now safe to get out. Wake up waiters and relinquish ipsq 7929 * atomically while holding ill locks. 
7930 */ 7931 ipsq->ipsq_writer = NULL; 7932 ipsq->ipsq_reentry_cnt--; 7933 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7934 #ifdef ILL_DEBUG 7935 ipsq->ipsq_depth = 0; 7936 #endif 7937 mutex_exit(&ipsq->ipsq_lock); 7938 /* 7939 * For IPMP this should wake up all ills in this ipsq. 7940 * We need to hold the ill_lock while waking up waiters to 7941 * avoid missed wakeups. But there is no need to acquire all 7942 * the ill locks and then wakeup. If we have not acquired all 7943 * the locks (due to memory failure above) ill_signal_ipsq_ills 7944 * wakes up ills one at a time after getting the right ill_lock 7945 */ 7946 ill_signal_ipsq_ills(ipsq, ill_list != NULL); 7947 if (ill_list != NULL) 7948 ill_unlock_ills(ill_list, cnt); 7949 if (ipsq->ipsq_refs == 0) 7950 need_ipsq_free = B_TRUE; 7951 rw_exit(&ill_g_lock); 7952 if (ill_list != 0) 7953 kmem_free(ill_list, ill_list_size); 7954 7955 if (need_ipsq_free) { 7956 /* 7957 * Free the ipsq. ipsq_refs can't increase because ipsq can't be 7958 * looked up. ipsq can be looked up only thru ill or phyint 7959 * and there are no ills/phyint on this ipsq. 7960 */ 7961 ipsq_delete(ipsq); 7962 } 7963 /* 7964 * Now start any igmp or mld timers that could not be started 7965 * while inside the ipsq. The timers can't be started while inside 7966 * the ipsq, since igmp_start_timers may need to call untimeout() 7967 * which can't be done while holding a lock i.e. the ipsq. Otherwise 7968 * there could be a deadlock since the timeout handlers 7969 * mld_timeout_handler / igmp_timeout_handler also synchronously 7970 * wait in ipsq_enter() trying to get the ipsq. 7971 * 7972 * However there is one exception to the above. If this thread is 7973 * itself the igmp/mld timeout handler thread, then we don't want 7974 * to start any new timer until the current handler is done. The 7975 * handler thread passes in B_FALSE for start_igmp/mld_timers, while 7976 * all others pass B_TRUE. 
7977 */ 7978 if (start_igmp_timer) { 7979 mutex_enter(&igmp_timer_lock); 7980 next = igmp_deferred_next; 7981 igmp_deferred_next = INFINITY; 7982 mutex_exit(&igmp_timer_lock); 7983 7984 if (next != INFINITY) 7985 igmp_start_timers(next); 7986 } 7987 7988 if (start_mld_timer) { 7989 mutex_enter(&mld_timer_lock); 7990 next = mld_deferred_next; 7991 mld_deferred_next = INFINITY; 7992 mutex_exit(&mld_timer_lock); 7993 7994 if (next != INFINITY) 7995 mld_start_timers(next); 7996 } 7997 } 7998 7999 /* 8000 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 8001 * and `ioccmd'. 8002 */ 8003 void 8004 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 8005 { 8006 ASSERT(IAM_WRITER_IPSQ(ipsq)); 8007 8008 mutex_enter(&ipsq->ipsq_lock); 8009 ASSERT(ipsq->ipsq_current_ipif == NULL); 8010 ASSERT(ipsq->ipsq_current_ioctl == 0); 8011 ipsq->ipsq_current_ipif = ipif; 8012 ipsq->ipsq_current_ioctl = ioccmd; 8013 mutex_exit(&ipsq->ipsq_lock); 8014 } 8015 8016 /* 8017 * Finish the current exclusive operation on `ipsq'. Note that other 8018 * operations will not be able to proceed until an ipsq_exit() is done. 8019 */ 8020 void 8021 ipsq_current_finish(ipsq_t *ipsq) 8022 { 8023 ipif_t *ipif = ipsq->ipsq_current_ipif; 8024 hook_nic_event_t *info; 8025 8026 ASSERT(IAM_WRITER_IPSQ(ipsq)); 8027 8028 /* 8029 * For SIOCSLIFREMOVEIF, the ipif has been already been blown away 8030 * (but we're careful to never set IPIF_CHANGING in that case). 8031 */ 8032 if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) { 8033 mutex_enter(&ipif->ipif_ill->ill_lock); 8034 ipif->ipif_state_flags &= ~IPIF_CHANGING; 8035 /* 8036 * Unhook the nic event message from the ill and enqueue it 8037 * into the nic event taskq. 
8038 */ 8039 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) { 8040 if (ddi_taskq_dispatch(eventq_queue_nic, 8041 ip_ne_queue_func, info, DDI_SLEEP) == DDI_FAILURE) { 8042 ip2dbg(("ipsq_current_finish: " 8043 "ddi_taskq_dispatch failed\n")); 8044 if (info->hne_data != NULL) 8045 kmem_free(info->hne_data, 8046 info->hne_datalen); 8047 kmem_free(info, sizeof (hook_nic_event_t)); 8048 } 8049 ipif->ipif_ill->ill_nic_event_info = NULL; 8050 } 8051 mutex_exit(&ipif->ipif_ill->ill_lock); 8052 } 8053 8054 mutex_enter(&ipsq->ipsq_lock); 8055 ASSERT(ipsq->ipsq_current_ipif != NULL); 8056 ipsq->ipsq_current_ipif = NULL; 8057 ipsq->ipsq_current_ioctl = 0; 8058 mutex_exit(&ipsq->ipsq_lock); 8059 } 8060 8061 /* 8062 * The ill is closing. Flush all messages on the ipsq that originated 8063 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 8064 * for this ill since ipsq_enter could not have entered until then. 8065 * New messages can't be queued since the CONDEMNED flag is set. 8066 */ 8067 static void 8068 ipsq_flush(ill_t *ill) 8069 { 8070 queue_t *q; 8071 mblk_t *prev; 8072 mblk_t *mp; 8073 mblk_t *mp_next; 8074 ipsq_t *ipsq; 8075 8076 ASSERT(IAM_WRITER_ILL(ill)); 8077 ipsq = ill->ill_phyint->phyint_ipsq; 8078 /* 8079 * Flush any messages sent up by the driver. 
8080 */ 8081 mutex_enter(&ipsq->ipsq_lock); 8082 for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { 8083 mp_next = mp->b_next; 8084 q = mp->b_queue; 8085 if (q == ill->ill_rq || q == ill->ill_wq) { 8086 /* Remove the mp from the ipsq */ 8087 if (prev == NULL) 8088 ipsq->ipsq_mphead = mp->b_next; 8089 else 8090 prev->b_next = mp->b_next; 8091 if (ipsq->ipsq_mptail == mp) { 8092 ASSERT(mp_next == NULL); 8093 ipsq->ipsq_mptail = prev; 8094 } 8095 inet_freemsg(mp); 8096 } else { 8097 prev = mp; 8098 } 8099 } 8100 mutex_exit(&ipsq->ipsq_lock); 8101 (void) ipsq_pending_mp_cleanup(ill, NULL); 8102 ipsq_xopq_mp_cleanup(ill, NULL); 8103 ill_pending_mp_cleanup(ill); 8104 } 8105 8106 /* 8107 * Clean up one squeue element. ill_inuse_ref is protected by ill_lock. 8108 * The real cleanup happens behind the squeue via ip_squeue_clean function but 8109 * we need to protect ourselfs from 2 threads trying to cleanup at the same 8110 * time (possible with one port going down for aggr and someone tearing down the 8111 * entire aggr simultaneously. So we use ill_inuse_ref protected by ill_lock 8112 * to indicate when the cleanup has started (1 ref) and when the cleanup 8113 * is done (0 ref). When a new ring gets assigned to squeue, we start by 8114 * putting 2 ref on ill_inuse_ref. 8115 */ 8116 static void 8117 ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) 8118 { 8119 conn_t *connp; 8120 squeue_t *sqp; 8121 mblk_t *mp; 8122 8123 ASSERT(rx_ring != NULL); 8124 8125 /* Just clean one squeue */ 8126 mutex_enter(&ill->ill_lock); 8127 /* 8128 * Reset the ILL_SOFT_RING_ASSIGN bit so that 8129 * ip_squeue_soft_ring_affinty() will not go 8130 * ahead with assigning rings. 8131 */ 8132 ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; 8133 while (rx_ring->rr_ring_state == ILL_RING_INPROC) 8134 /* Some operations pending on the ring. 
Wait */ 8135 cv_wait(&ill->ill_cv, &ill->ill_lock); 8136 8137 if (rx_ring->rr_ring_state != ILL_RING_INUSE) { 8138 /* 8139 * Someone already trying to clean 8140 * this squeue or its already been cleaned. 8141 */ 8142 mutex_exit(&ill->ill_lock); 8143 return; 8144 } 8145 sqp = rx_ring->rr_sqp; 8146 8147 if (sqp == NULL) { 8148 /* 8149 * The rx_ring never had a squeue assigned to it. 8150 * We are under ill_lock so we can clean it up 8151 * here itself since no one can get to it. 8152 */ 8153 rx_ring->rr_blank = NULL; 8154 rx_ring->rr_handle = NULL; 8155 rx_ring->rr_sqp = NULL; 8156 rx_ring->rr_ring_state = ILL_RING_FREE; 8157 mutex_exit(&ill->ill_lock); 8158 return; 8159 } 8160 8161 /* Set the state that its being cleaned */ 8162 rx_ring->rr_ring_state = ILL_RING_BEING_FREED; 8163 ASSERT(sqp != NULL); 8164 mutex_exit(&ill->ill_lock); 8165 8166 /* 8167 * Use the preallocated ill_unbind_conn for this purpose 8168 */ 8169 connp = ill->ill_dls_capab->ill_unbind_conn; 8170 8171 ASSERT(!connp->conn_tcp->tcp_closemp.b_prev); 8172 TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15); 8173 if (connp->conn_tcp->tcp_closemp.b_prev == NULL) 8174 connp->conn_tcp->tcp_closemp_used = 1; 8175 else 8176 connp->conn_tcp->tcp_closemp_used++; 8177 mp = &connp->conn_tcp->tcp_closemp; 8178 CONN_INC_REF(connp); 8179 squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); 8180 8181 mutex_enter(&ill->ill_lock); 8182 while (rx_ring->rr_ring_state != ILL_RING_FREE) 8183 cv_wait(&ill->ill_cv, &ill->ill_lock); 8184 8185 mutex_exit(&ill->ill_lock); 8186 } 8187 8188 static void 8189 ipsq_clean_all(ill_t *ill) 8190 { 8191 int idx; 8192 8193 /* 8194 * No need to clean if poll_capab isn't set for this ill 8195 */ 8196 if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) 8197 return; 8198 8199 for (idx = 0; idx < ILL_MAX_RINGS; idx++) { 8200 ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; 8201 ipsq_clean_ring(ill, ipr); 8202 } 8203 8204 ill->ill_capabilities &= 
~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); 8205 } 8206 8207 /* ARGSUSED */ 8208 int 8209 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8210 ip_ioctl_cmd_t *ipip, void *ifreq) 8211 { 8212 ill_t *ill; 8213 struct lifreq *lifr = (struct lifreq *)ifreq; 8214 boolean_t isv6; 8215 conn_t *connp; 8216 8217 connp = Q_TO_CONN(q); 8218 isv6 = connp->conn_af_isv6; 8219 /* 8220 * Set original index. 8221 * Failover and failback move logical interfaces 8222 * from one physical interface to another. The 8223 * original index indicates the parent of a logical 8224 * interface, in other words, the physical interface 8225 * the logical interface will be moved back to on 8226 * failback. 8227 */ 8228 8229 /* 8230 * Don't allow the original index to be changed 8231 * for non-failover addresses, autoconfigured 8232 * addresses, or IPv6 link local addresses. 8233 */ 8234 if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != NULL) || 8235 (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { 8236 return (EINVAL); 8237 } 8238 /* 8239 * The new original index must be in use by some 8240 * physical interface. 8241 */ 8242 ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, 8243 NULL, NULL); 8244 if (ill == NULL) 8245 return (ENXIO); 8246 ill_refrele(ill); 8247 8248 ipif->ipif_orig_ifindex = lifr->lifr_index; 8249 /* 8250 * When this ipif gets failed back, don't 8251 * preserve the original id, as it is no 8252 * longer applicable. 8253 */ 8254 ipif->ipif_orig_ipifid = 0; 8255 /* 8256 * For IPv4, change the original index of any 8257 * multicast addresses associated with the 8258 * ipif to the new value. 
8259 */ 8260 if (!isv6) { 8261 ilm_t *ilm; 8262 8263 mutex_enter(&ipif->ipif_ill->ill_lock); 8264 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 8265 ilm = ilm->ilm_next) { 8266 if (ilm->ilm_ipif == ipif) { 8267 ilm->ilm_orig_ifindex = lifr->lifr_index; 8268 } 8269 } 8270 mutex_exit(&ipif->ipif_ill->ill_lock); 8271 } 8272 return (0); 8273 } 8274 8275 /* ARGSUSED */ 8276 int 8277 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8278 ip_ioctl_cmd_t *ipip, void *ifreq) 8279 { 8280 struct lifreq *lifr = (struct lifreq *)ifreq; 8281 8282 /* 8283 * Get the original interface index i.e the one 8284 * before FAILOVER if it ever happened. 8285 */ 8286 lifr->lifr_index = ipif->ipif_orig_ifindex; 8287 return (0); 8288 } 8289 8290 /* 8291 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, 8292 * refhold and return the associated ipif 8293 */ 8294 int 8295 ip_extract_tunreq(queue_t *q, mblk_t *mp, ipif_t **ipifp, ipsq_func_t func) 8296 { 8297 boolean_t exists; 8298 struct iftun_req *ta; 8299 ipif_t *ipif; 8300 ill_t *ill; 8301 boolean_t isv6; 8302 mblk_t *mp1; 8303 int error; 8304 conn_t *connp; 8305 8306 /* Existence verified in ip_wput_nondata */ 8307 mp1 = mp->b_cont->b_cont; 8308 ta = (struct iftun_req *)mp1->b_rptr; 8309 /* 8310 * Null terminate the string to protect against buffer 8311 * overrun. String was generated by user code and may not 8312 * be trusted. 8313 */ 8314 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 8315 8316 connp = Q_TO_CONN(q); 8317 isv6 = connp->conn_af_isv6; 8318 8319 /* Disallows implicit create */ 8320 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 8321 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 8322 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error); 8323 if (ipif == NULL) 8324 return (error); 8325 8326 if (ipif->ipif_id != 0) { 8327 /* 8328 * We really don't want to set/get tunnel parameters 8329 * on virtual tunnel interfaces. Only allow the 8330 * base tunnel to do these. 
8331 */ 8332 ipif_refrele(ipif); 8333 return (EINVAL); 8334 } 8335 8336 /* 8337 * Send down to tunnel mod for ioctl processing. 8338 * Will finish ioctl in ip_rput_other(). 8339 */ 8340 ill = ipif->ipif_ill; 8341 if (ill->ill_net_type == IRE_LOOPBACK) { 8342 ipif_refrele(ipif); 8343 return (EOPNOTSUPP); 8344 } 8345 8346 if (ill->ill_wq == NULL) { 8347 ipif_refrele(ipif); 8348 return (ENXIO); 8349 } 8350 /* 8351 * Mark the ioctl as coming from an IPv6 interface for 8352 * tun's convenience. 8353 */ 8354 if (ill->ill_isv6) 8355 ta->ifta_flags |= 0x80000000; 8356 *ipifp = ipif; 8357 return (0); 8358 } 8359 8360 /* 8361 * Parse an ifreq or lifreq struct coming down ioctls and refhold 8362 * and return the associated ipif. 8363 * Return value: 8364 * Non zero: An error has occurred. ci may not be filled out. 8365 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 8366 * a held ipif in ci.ci_ipif. 8367 */ 8368 int 8369 ip_extract_lifreq_cmn(queue_t *q, mblk_t *mp, int cmd_type, int flags, 8370 cmd_info_t *ci, ipsq_func_t func) 8371 { 8372 sin_t *sin; 8373 sin6_t *sin6; 8374 char *name; 8375 struct ifreq *ifr; 8376 struct lifreq *lifr; 8377 ipif_t *ipif = NULL; 8378 ill_t *ill; 8379 conn_t *connp; 8380 boolean_t isv6; 8381 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8382 boolean_t exists; 8383 int err; 8384 mblk_t *mp1; 8385 zoneid_t zoneid; 8386 8387 if (q->q_next != NULL) { 8388 ill = (ill_t *)q->q_ptr; 8389 isv6 = ill->ill_isv6; 8390 connp = NULL; 8391 zoneid = ALL_ZONES; 8392 } else { 8393 ill = NULL; 8394 connp = Q_TO_CONN(q); 8395 isv6 = connp->conn_af_isv6; 8396 zoneid = connp->conn_zoneid; 8397 if (zoneid == GLOBAL_ZONEID) { 8398 /* global zone can access ipifs in all zones */ 8399 zoneid = ALL_ZONES; 8400 } 8401 } 8402 8403 /* Has been checked in ip_wput_nondata */ 8404 mp1 = mp->b_cont->b_cont; 8405 8406 8407 if (cmd_type == IF_CMD) { 8408 /* This a old style SIOC[GS]IF* command */ 8409 ifr = (struct ifreq *)mp1->b_rptr; 8410 /* 8411 * Null 
terminate the string to protect against buffer 8412 * overrun. String was generated by user code and may not 8413 * be trusted. 8414 */ 8415 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 8416 sin = (sin_t *)&ifr->ifr_addr; 8417 name = ifr->ifr_name; 8418 ci->ci_sin = sin; 8419 ci->ci_sin6 = NULL; 8420 ci->ci_lifr = (struct lifreq *)ifr; 8421 } else { 8422 /* This a new style SIOC[GS]LIF* command */ 8423 ASSERT(cmd_type == LIF_CMD); 8424 lifr = (struct lifreq *)mp1->b_rptr; 8425 /* 8426 * Null terminate the string to protect against buffer 8427 * overrun. String was generated by user code and may not 8428 * be trusted. 8429 */ 8430 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 8431 name = lifr->lifr_name; 8432 sin = (sin_t *)&lifr->lifr_addr; 8433 sin6 = (sin6_t *)&lifr->lifr_addr; 8434 if (iocp->ioc_cmd == SIOCSLIFGROUPNAME) { 8435 (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, 8436 LIFNAMSIZ); 8437 } 8438 ci->ci_sin = sin; 8439 ci->ci_sin6 = sin6; 8440 ci->ci_lifr = lifr; 8441 } 8442 8443 8444 if (iocp->ioc_cmd == SIOCSLIFNAME) { 8445 /* 8446 * The ioctl will be failed if the ioctl comes down 8447 * an conn stream 8448 */ 8449 if (ill == NULL) { 8450 /* 8451 * Not an ill queue, return EINVAL same as the 8452 * old error code. 8453 */ 8454 return (ENXIO); 8455 } 8456 ipif = ill->ill_ipif; 8457 ipif_refhold(ipif); 8458 } else { 8459 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 8460 &exists, isv6, zoneid, 8461 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err); 8462 if (ipif == NULL) { 8463 if (err == EINPROGRESS) 8464 return (err); 8465 if (iocp->ioc_cmd == SIOCLIFFAILOVER || 8466 iocp->ioc_cmd == SIOCLIFFAILBACK) { 8467 /* 8468 * Need to try both v4 and v6 since this 8469 * ioctl can come down either v4 or v6 8470 * socket. The lifreq.lifr_family passed 8471 * down by this ioctl is AF_UNSPEC. 8472 */ 8473 ipif = ipif_lookup_on_name(name, 8474 mi_strlen(name), B_FALSE, &exists, !isv6, 8475 zoneid, (connp == NULL) ? 
q : 8476 CONNP_TO_WQ(connp), mp, func, &err); 8477 if (err == EINPROGRESS) 8478 return (err); 8479 } 8480 err = 0; /* Ensure we don't use it below */ 8481 } 8482 } 8483 8484 /* 8485 * Old style [GS]IFCMD does not admit IPv6 ipif 8486 */ 8487 if (ipif != NULL && ipif->ipif_isv6 && cmd_type == IF_CMD) { 8488 ipif_refrele(ipif); 8489 return (ENXIO); 8490 } 8491 8492 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 8493 name[0] == '\0') { 8494 /* 8495 * Handle a or a SIOC?IF* with a null name 8496 * during plumb (on the ill queue before the I_PLINK). 8497 */ 8498 ipif = ill->ill_ipif; 8499 ipif_refhold(ipif); 8500 } 8501 8502 if (ipif == NULL) 8503 return (ENXIO); 8504 8505 /* 8506 * Allow only GET operations if this ipif has been created 8507 * temporarily due to a MOVE operation. 8508 */ 8509 if (ipif->ipif_replace_zero && !(flags & IPI_REPL)) { 8510 ipif_refrele(ipif); 8511 return (EINVAL); 8512 } 8513 8514 ci->ci_ipif = ipif; 8515 return (0); 8516 } 8517 8518 /* 8519 * Return the total number of ipifs. 8520 */ 8521 static uint_t 8522 ip_get_numifs(zoneid_t zoneid) 8523 { 8524 uint_t numifs = 0; 8525 ill_t *ill; 8526 ill_walk_context_t ctx; 8527 ipif_t *ipif; 8528 8529 rw_enter(&ill_g_lock, RW_READER); 8530 ill = ILL_START_WALK_V4(&ctx); 8531 8532 while (ill != NULL) { 8533 for (ipif = ill->ill_ipif; ipif != NULL; 8534 ipif = ipif->ipif_next) { 8535 if (ipif->ipif_zoneid == zoneid || 8536 ipif->ipif_zoneid == ALL_ZONES) 8537 numifs++; 8538 } 8539 ill = ill_next(&ctx, ill); 8540 } 8541 rw_exit(&ill_g_lock); 8542 return (numifs); 8543 } 8544 8545 /* 8546 * Return the total number of ipifs. 
8547 */ 8548 static uint_t 8549 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid) 8550 { 8551 uint_t numifs = 0; 8552 ill_t *ill; 8553 ipif_t *ipif; 8554 ill_walk_context_t ctx; 8555 8556 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 8557 8558 rw_enter(&ill_g_lock, RW_READER); 8559 if (family == AF_INET) 8560 ill = ILL_START_WALK_V4(&ctx); 8561 else if (family == AF_INET6) 8562 ill = ILL_START_WALK_V6(&ctx); 8563 else 8564 ill = ILL_START_WALK_ALL(&ctx); 8565 8566 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8567 for (ipif = ill->ill_ipif; ipif != NULL; 8568 ipif = ipif->ipif_next) { 8569 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8570 !(lifn_flags & LIFC_NOXMIT)) 8571 continue; 8572 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8573 !(lifn_flags & LIFC_TEMPORARY)) 8574 continue; 8575 if (((ipif->ipif_flags & 8576 (IPIF_NOXMIT|IPIF_NOLOCAL| 8577 IPIF_DEPRECATED)) || 8578 (ill->ill_phyint->phyint_flags & 8579 PHYI_LOOPBACK) || 8580 !(ipif->ipif_flags & IPIF_UP)) && 8581 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8582 continue; 8583 8584 if (zoneid != ipif->ipif_zoneid && 8585 ipif->ipif_zoneid != ALL_ZONES && 8586 (zoneid != GLOBAL_ZONEID || 8587 !(lifn_flags & LIFC_ALLZONES))) 8588 continue; 8589 8590 numifs++; 8591 } 8592 } 8593 rw_exit(&ill_g_lock); 8594 return (numifs); 8595 } 8596 8597 uint_t 8598 ip_get_lifsrcofnum(ill_t *ill) 8599 { 8600 uint_t numifs = 0; 8601 ill_t *ill_head = ill; 8602 8603 /* 8604 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 8605 * other thread may be trying to relink the ILLs in this usesrc group 8606 * and adjusting the ill_usesrc_grp_next pointers 8607 */ 8608 rw_enter(&ill_g_usesrc_lock, RW_READER); 8609 if ((ill->ill_usesrc_ifindex == 0) && 8610 (ill->ill_usesrc_grp_next != NULL)) { 8611 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8612 ill = ill->ill_usesrc_grp_next) 8613 numifs++; 8614 } 8615 rw_exit(&ill_g_usesrc_lock); 8616 8617 return (numifs); 8618 } 8619 
8620 /* Null values are passed in for ipif, sin, and ifreq */ 8621 /* ARGSUSED */ 8622 int 8623 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8624 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8625 { 8626 int *nump; 8627 8628 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8629 8630 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8631 nump = (int *)mp->b_cont->b_cont->b_rptr; 8632 8633 *nump = ip_get_numifs(Q_TO_CONN(q)->conn_zoneid); 8634 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8635 return (0); 8636 } 8637 8638 /* Null values are passed in for ipif, sin, and ifreq */ 8639 /* ARGSUSED */ 8640 int 8641 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8642 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8643 { 8644 struct lifnum *lifn; 8645 mblk_t *mp1; 8646 8647 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8648 8649 /* Existence checked in ip_wput_nondata */ 8650 mp1 = mp->b_cont->b_cont; 8651 8652 lifn = (struct lifnum *)mp1->b_rptr; 8653 switch (lifn->lifn_family) { 8654 case AF_UNSPEC: 8655 case AF_INET: 8656 case AF_INET6: 8657 break; 8658 default: 8659 return (EAFNOSUPPORT); 8660 } 8661 8662 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8663 Q_TO_CONN(q)->conn_zoneid); 8664 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8665 return (0); 8666 } 8667 8668 /* ARGSUSED */ 8669 int 8670 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8671 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8672 { 8673 STRUCT_HANDLE(ifconf, ifc); 8674 mblk_t *mp1; 8675 struct iocblk *iocp; 8676 struct ifreq *ifr; 8677 ill_walk_context_t ctx; 8678 ill_t *ill; 8679 ipif_t *ipif; 8680 struct sockaddr_in *sin; 8681 int32_t ifclen; 8682 zoneid_t zoneid; 8683 8684 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8685 8686 ip1dbg(("ip_sioctl_get_ifconf")); 8687 /* Existence verified in 
ip_wput_nondata */ 8688 mp1 = mp->b_cont->b_cont; 8689 iocp = (struct iocblk *)mp->b_rptr; 8690 zoneid = Q_TO_CONN(q)->conn_zoneid; 8691 8692 /* 8693 * The original SIOCGIFCONF passed in a struct ifconf which specified 8694 * the user buffer address and length into which the list of struct 8695 * ifreqs was to be copied. Since AT&T Streams does not seem to 8696 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8697 * the SIOCGIFCONF operation was redefined to simply provide 8698 * a large output buffer into which we are supposed to jam the ifreq 8699 * array. The same ioctl command code was used, despite the fact that 8700 * both the applications and the kernel code had to change, thus making 8701 * it impossible to support both interfaces. 8702 * 8703 * For reasons not good enough to try to explain, the following 8704 * algorithm is used for deciding what to do with one of these: 8705 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8706 * form with the output buffer coming down as the continuation message. 8707 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8708 * and we have to copy in the ifconf structure to find out how big the 8709 * output buffer is and where to copy out to. Sure no problem... 8710 * 8711 */ 8712 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8713 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8714 int numifs = 0; 8715 size_t ifc_bufsize; 8716 8717 /* 8718 * Must be (better be!) continuation of a TRANSPARENT 8719 * IOCTL. We just copied in the ifconf structure. 8720 */ 8721 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8722 (struct ifconf *)mp1->b_rptr); 8723 8724 /* 8725 * Allocate a buffer to hold requested information. 8726 * 8727 * If ifc_len is larger than what is needed, we only 8728 * allocate what we will use. 8729 * 8730 * If ifc_len is smaller than what is needed, return 8731 * EINVAL. 
8732 * 8733 * XXX: the ill_t structure can hava 2 counters, for 8734 * v4 and v6 (not just ill_ipif_up_count) to store the 8735 * number of interfaces for a device, so we don't need 8736 * to count them here... 8737 */ 8738 numifs = ip_get_numifs(zoneid); 8739 8740 ifclen = STRUCT_FGET(ifc, ifc_len); 8741 ifc_bufsize = numifs * sizeof (struct ifreq); 8742 if (ifc_bufsize > ifclen) { 8743 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8744 /* old behaviour */ 8745 return (EINVAL); 8746 } else { 8747 ifc_bufsize = ifclen; 8748 } 8749 } 8750 8751 mp1 = mi_copyout_alloc(q, mp, 8752 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8753 if (mp1 == NULL) 8754 return (ENOMEM); 8755 8756 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8757 } 8758 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8759 /* 8760 * the SIOCGIFCONF ioctl only knows about 8761 * IPv4 addresses, so don't try to tell 8762 * it about interfaces with IPv6-only 8763 * addresses. (Last parm 'isv6' is B_FALSE) 8764 */ 8765 8766 ifr = (struct ifreq *)mp1->b_rptr; 8767 8768 rw_enter(&ill_g_lock, RW_READER); 8769 ill = ILL_START_WALK_V4(&ctx); 8770 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8771 for (ipif = ill->ill_ipif; ipif != NULL; 8772 ipif = ipif->ipif_next) { 8773 if (zoneid != ipif->ipif_zoneid && 8774 ipif->ipif_zoneid != ALL_ZONES) 8775 continue; 8776 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8777 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8778 /* old behaviour */ 8779 rw_exit(&ill_g_lock); 8780 return (EINVAL); 8781 } else { 8782 goto if_copydone; 8783 } 8784 } 8785 (void) ipif_get_name(ipif, 8786 ifr->ifr_name, 8787 sizeof (ifr->ifr_name)); 8788 sin = (sin_t *)&ifr->ifr_addr; 8789 *sin = sin_null; 8790 sin->sin_family = AF_INET; 8791 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8792 ifr++; 8793 } 8794 } 8795 if_copydone: 8796 rw_exit(&ill_g_lock); 8797 mp1->b_wptr = (uchar_t *)ifr; 8798 8799 if (STRUCT_BUF(ifc) != NULL) { 8800 STRUCT_FSET(ifc, ifc_len, 8801 (int)((uchar_t *)ifr - mp1->b_rptr)); 8802 } 8803 return 
(0); 8804 } 8805 8806 /* 8807 * Get the interfaces using the address hosted on the interface passed in, 8808 * as a source adddress 8809 */ 8810 /* ARGSUSED */ 8811 int 8812 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8813 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8814 { 8815 mblk_t *mp1; 8816 ill_t *ill, *ill_head; 8817 ipif_t *ipif, *orig_ipif; 8818 int numlifs = 0; 8819 size_t lifs_bufsize, lifsmaxlen; 8820 struct lifreq *lifr; 8821 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8822 uint_t ifindex; 8823 zoneid_t zoneid; 8824 int err = 0; 8825 boolean_t isv6 = B_FALSE; 8826 struct sockaddr_in *sin; 8827 struct sockaddr_in6 *sin6; 8828 8829 STRUCT_HANDLE(lifsrcof, lifs); 8830 8831 ASSERT(q->q_next == NULL); 8832 8833 zoneid = Q_TO_CONN(q)->conn_zoneid; 8834 8835 /* Existence verified in ip_wput_nondata */ 8836 mp1 = mp->b_cont->b_cont; 8837 8838 /* 8839 * Must be (better be!) continuation of a TRANSPARENT 8840 * IOCTL. We just copied in the lifsrcof structure. 
8841 */ 8842 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8843 (struct lifsrcof *)mp1->b_rptr); 8844 8845 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8846 return (EINVAL); 8847 8848 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8849 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8850 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8851 ip_process_ioctl, &err); 8852 if (ipif == NULL) { 8853 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8854 ifindex)); 8855 return (err); 8856 } 8857 8858 8859 /* Allocate a buffer to hold requested information */ 8860 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8861 lifs_bufsize = numlifs * sizeof (struct lifreq); 8862 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8863 /* The actual size needed is always returned in lifs_len */ 8864 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8865 8866 /* If the amount we need is more than what is passed in, abort */ 8867 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8868 ipif_refrele(ipif); 8869 return (0); 8870 } 8871 8872 mp1 = mi_copyout_alloc(q, mp, 8873 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8874 if (mp1 == NULL) { 8875 ipif_refrele(ipif); 8876 return (ENOMEM); 8877 } 8878 8879 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8880 bzero(mp1->b_rptr, lifs_bufsize); 8881 8882 lifr = (struct lifreq *)mp1->b_rptr; 8883 8884 ill = ill_head = ipif->ipif_ill; 8885 orig_ipif = ipif; 8886 8887 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8888 rw_enter(&ill_g_usesrc_lock, RW_READER); 8889 rw_enter(&ill_g_lock, RW_READER); 8890 8891 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8892 for (; (ill != NULL) && (ill != ill_head); 8893 ill = ill->ill_usesrc_grp_next) { 8894 8895 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8896 break; 8897 8898 ipif = ill->ill_ipif; 8899 (void) ipif_get_name(ipif, 8900 lifr->lifr_name, sizeof (lifr->lifr_name)); 8901 if (ipif->ipif_isv6) { 8902 sin6 = (sin6_t *)&lifr->lifr_addr; 8903 *sin6 = sin6_null; 8904 sin6->sin6_family = AF_INET6; 8905 
sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8906 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8907 &ipif->ipif_v6net_mask); 8908 } else { 8909 sin = (sin_t *)&lifr->lifr_addr; 8910 *sin = sin_null; 8911 sin->sin_family = AF_INET; 8912 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8913 lifr->lifr_addrlen = ip_mask_to_plen( 8914 ipif->ipif_net_mask); 8915 } 8916 lifr++; 8917 } 8918 rw_exit(&ill_g_usesrc_lock); 8919 rw_exit(&ill_g_lock); 8920 ipif_refrele(orig_ipif); 8921 mp1->b_wptr = (uchar_t *)lifr; 8922 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8923 8924 return (0); 8925 } 8926 8927 /* ARGSUSED */ 8928 int 8929 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8930 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8931 { 8932 mblk_t *mp1; 8933 int list; 8934 ill_t *ill; 8935 ipif_t *ipif; 8936 int flags; 8937 int numlifs = 0; 8938 size_t lifc_bufsize; 8939 struct lifreq *lifr; 8940 sa_family_t family; 8941 struct sockaddr_in *sin; 8942 struct sockaddr_in6 *sin6; 8943 ill_walk_context_t ctx; 8944 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8945 int32_t lifclen; 8946 zoneid_t zoneid; 8947 STRUCT_HANDLE(lifconf, lifc); 8948 8949 ip1dbg(("ip_sioctl_get_lifconf")); 8950 8951 ASSERT(q->q_next == NULL); 8952 8953 zoneid = Q_TO_CONN(q)->conn_zoneid; 8954 8955 /* Existence verified in ip_wput_nondata */ 8956 mp1 = mp->b_cont->b_cont; 8957 8958 /* 8959 * An extended version of SIOCGIFCONF that takes an 8960 * additional address family and flags field. 8961 * AF_UNSPEC retrieve both IPv4 and IPv6. 8962 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8963 * interfaces are omitted. 8964 * Similarly, IPIF_TEMPORARY interfaces are omitted 8965 * unless LIFC_TEMPORARY is specified. 8966 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8967 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8968 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8969 * has priority over LIFC_NOXMIT. 
8970 */ 8971 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8972 8973 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8974 return (EINVAL); 8975 8976 /* 8977 * Must be (better be!) continuation of a TRANSPARENT 8978 * IOCTL. We just copied in the lifconf structure. 8979 */ 8980 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8981 8982 family = STRUCT_FGET(lifc, lifc_family); 8983 flags = STRUCT_FGET(lifc, lifc_flags); 8984 8985 switch (family) { 8986 case AF_UNSPEC: 8987 /* 8988 * walk all ILL's. 8989 */ 8990 list = MAX_G_HEADS; 8991 break; 8992 case AF_INET: 8993 /* 8994 * walk only IPV4 ILL's. 8995 */ 8996 list = IP_V4_G_HEAD; 8997 break; 8998 case AF_INET6: 8999 /* 9000 * walk only IPV6 ILL's. 9001 */ 9002 list = IP_V6_G_HEAD; 9003 break; 9004 default: 9005 return (EAFNOSUPPORT); 9006 } 9007 9008 /* 9009 * Allocate a buffer to hold requested information. 9010 * 9011 * If lifc_len is larger than what is needed, we only 9012 * allocate what we will use. 9013 * 9014 * If lifc_len is smaller than what is needed, return 9015 * EINVAL. 
9016 */ 9017 numlifs = ip_get_numlifs(family, flags, zoneid); 9018 lifc_bufsize = numlifs * sizeof (struct lifreq); 9019 lifclen = STRUCT_FGET(lifc, lifc_len); 9020 if (lifc_bufsize > lifclen) { 9021 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 9022 return (EINVAL); 9023 else 9024 lifc_bufsize = lifclen; 9025 } 9026 9027 mp1 = mi_copyout_alloc(q, mp, 9028 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 9029 if (mp1 == NULL) 9030 return (ENOMEM); 9031 9032 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 9033 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 9034 9035 lifr = (struct lifreq *)mp1->b_rptr; 9036 9037 rw_enter(&ill_g_lock, RW_READER); 9038 ill = ill_first(list, list, &ctx); 9039 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 9040 for (ipif = ill->ill_ipif; ipif != NULL; 9041 ipif = ipif->ipif_next) { 9042 if ((ipif->ipif_flags & IPIF_NOXMIT) && 9043 !(flags & LIFC_NOXMIT)) 9044 continue; 9045 9046 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 9047 !(flags & LIFC_TEMPORARY)) 9048 continue; 9049 9050 if (((ipif->ipif_flags & 9051 (IPIF_NOXMIT|IPIF_NOLOCAL| 9052 IPIF_DEPRECATED)) || 9053 (ill->ill_phyint->phyint_flags & 9054 PHYI_LOOPBACK) || 9055 !(ipif->ipif_flags & IPIF_UP)) && 9056 (flags & LIFC_EXTERNAL_SOURCE)) 9057 continue; 9058 9059 if (zoneid != ipif->ipif_zoneid && 9060 ipif->ipif_zoneid != ALL_ZONES && 9061 (zoneid != GLOBAL_ZONEID || 9062 !(flags & LIFC_ALLZONES))) 9063 continue; 9064 9065 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 9066 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 9067 rw_exit(&ill_g_lock); 9068 return (EINVAL); 9069 } else { 9070 goto lif_copydone; 9071 } 9072 } 9073 9074 (void) ipif_get_name(ipif, 9075 lifr->lifr_name, 9076 sizeof (lifr->lifr_name)); 9077 if (ipif->ipif_isv6) { 9078 sin6 = (sin6_t *)&lifr->lifr_addr; 9079 *sin6 = sin6_null; 9080 sin6->sin6_family = AF_INET6; 9081 sin6->sin6_addr = 9082 ipif->ipif_v6lcl_addr; 9083 lifr->lifr_addrlen = 9084 ip_mask_to_plen_v6( 9085 &ipif->ipif_v6net_mask); 9086 } else { 9087 sin = (sin_t 
*)&lifr->lifr_addr; 9088 *sin = sin_null; 9089 sin->sin_family = AF_INET; 9090 sin->sin_addr.s_addr = 9091 ipif->ipif_lcl_addr; 9092 lifr->lifr_addrlen = 9093 ip_mask_to_plen( 9094 ipif->ipif_net_mask); 9095 } 9096 lifr++; 9097 } 9098 } 9099 lif_copydone: 9100 rw_exit(&ill_g_lock); 9101 9102 mp1->b_wptr = (uchar_t *)lifr; 9103 if (STRUCT_BUF(lifc) != NULL) { 9104 STRUCT_FSET(lifc, lifc_len, 9105 (int)((uchar_t *)lifr - mp1->b_rptr)); 9106 } 9107 return (0); 9108 } 9109 9110 /* ARGSUSED */ 9111 int 9112 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 9113 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 9114 { 9115 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 9116 ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 9117 return (0); 9118 } 9119 9120 static void 9121 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 9122 { 9123 ip6_asp_t *table; 9124 size_t table_size; 9125 mblk_t *data_mp; 9126 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9127 9128 /* These two ioctls are I_STR only */ 9129 if (iocp->ioc_count == TRANSPARENT) { 9130 miocnak(q, mp, 0, EINVAL); 9131 return; 9132 } 9133 9134 data_mp = mp->b_cont; 9135 if (data_mp == NULL) { 9136 /* The user passed us a NULL argument */ 9137 table = NULL; 9138 table_size = iocp->ioc_count; 9139 } else { 9140 /* 9141 * The user provided a table. The stream head 9142 * may have copied in the user data in chunks, 9143 * so make sure everything is pulled up 9144 * properly. 
9145 */ 9146 if (MBLKL(data_mp) < iocp->ioc_count) { 9147 mblk_t *new_data_mp; 9148 if ((new_data_mp = msgpullup(data_mp, -1)) == 9149 NULL) { 9150 miocnak(q, mp, 0, ENOMEM); 9151 return; 9152 } 9153 freemsg(data_mp); 9154 data_mp = new_data_mp; 9155 mp->b_cont = data_mp; 9156 } 9157 table = (ip6_asp_t *)data_mp->b_rptr; 9158 table_size = iocp->ioc_count; 9159 } 9160 9161 switch (iocp->ioc_cmd) { 9162 case SIOCGIP6ADDRPOLICY: 9163 iocp->ioc_rval = ip6_asp_get(table, table_size); 9164 if (iocp->ioc_rval == -1) 9165 iocp->ioc_error = EINVAL; 9166 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 9167 else if (table != NULL && 9168 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 9169 ip6_asp_t *src = table; 9170 ip6_asp32_t *dst = (void *)table; 9171 int count = table_size / sizeof (ip6_asp_t); 9172 int i; 9173 9174 /* 9175 * We need to do an in-place shrink of the array 9176 * to match the alignment attributes of the 9177 * 32-bit ABI looking at it. 9178 */ 9179 /* LINTED: logical expression always true: op "||" */ 9180 ASSERT(sizeof (*src) > sizeof (*dst)); 9181 for (i = 1; i < count; i++) 9182 bcopy(src + i, dst + i, sizeof (*dst)); 9183 } 9184 #endif 9185 break; 9186 9187 case SIOCSIP6ADDRPOLICY: 9188 ASSERT(mp->b_prev == NULL); 9189 mp->b_prev = (void *)q; 9190 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 9191 /* 9192 * We pass in the datamodel here so that the ip6_asp_replace() 9193 * routine can handle converting from 32-bit to native formats 9194 * where necessary. 9195 * 9196 * A better way to handle this might be to convert the inbound 9197 * data structure here, and hang it off a new 'mp'; thus the 9198 * ip6_asp_replace() logic would always be dealing with native 9199 * format data structures.. 9200 * 9201 * (An even simpler way to handle these ioctls is to just 9202 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 9203 * and just recompile everything that depends on it.) 
9204 */ 9205 #endif 9206 ip6_asp_replace(mp, table, table_size, B_FALSE, 9207 iocp->ioc_flag & IOC_MODELS); 9208 return; 9209 } 9210 9211 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 9212 qreply(q, mp); 9213 } 9214 9215 static void 9216 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 9217 { 9218 mblk_t *data_mp; 9219 struct dstinforeq *dir; 9220 uint8_t *end, *cur; 9221 in6_addr_t *daddr, *saddr; 9222 ipaddr_t v4daddr; 9223 ire_t *ire; 9224 char *slabel, *dlabel; 9225 boolean_t isipv4; 9226 int match_ire; 9227 ill_t *dst_ill; 9228 ipif_t *src_ipif, *ire_ipif; 9229 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9230 zoneid_t zoneid; 9231 9232 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9233 zoneid = Q_TO_CONN(q)->conn_zoneid; 9234 9235 /* 9236 * This ioctl is I_STR only, and must have a 9237 * data mblk following the M_IOCTL mblk. 9238 */ 9239 data_mp = mp->b_cont; 9240 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 9241 miocnak(q, mp, 0, EINVAL); 9242 return; 9243 } 9244 9245 if (MBLKL(data_mp) < iocp->ioc_count) { 9246 mblk_t *new_data_mp; 9247 9248 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 9249 miocnak(q, mp, 0, ENOMEM); 9250 return; 9251 } 9252 freemsg(data_mp); 9253 data_mp = new_data_mp; 9254 mp->b_cont = data_mp; 9255 } 9256 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 9257 9258 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 9259 end - cur >= sizeof (struct dstinforeq); 9260 cur += sizeof (struct dstinforeq)) { 9261 dir = (struct dstinforeq *)cur; 9262 daddr = &dir->dir_daddr; 9263 saddr = &dir->dir_saddr; 9264 9265 /* 9266 * ip_addr_scope_v6() and ip6_asp_lookup() handle 9267 * v4 mapped addresses; ire_ftable_lookup[_v6]() 9268 * and ipif_select_source[_v6]() do not. 
9269 */ 9270 dir->dir_dscope = ip_addr_scope_v6(daddr); 9271 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence); 9272 9273 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 9274 if (isipv4) { 9275 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 9276 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 9277 0, NULL, NULL, zoneid, 0, NULL, match_ire); 9278 } else { 9279 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 9280 0, NULL, NULL, zoneid, 0, NULL, match_ire); 9281 } 9282 if (ire == NULL) { 9283 dir->dir_dreachable = 0; 9284 9285 /* move on to next dst addr */ 9286 continue; 9287 } 9288 dir->dir_dreachable = 1; 9289 9290 ire_ipif = ire->ire_ipif; 9291 if (ire_ipif == NULL) 9292 goto next_dst; 9293 9294 /* 9295 * We expect to get back an interface ire or a 9296 * gateway ire cache entry. For both types, the 9297 * output interface is ire_ipif->ipif_ill. 9298 */ 9299 dst_ill = ire_ipif->ipif_ill; 9300 dir->dir_dmactype = dst_ill->ill_mactype; 9301 9302 if (isipv4) { 9303 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 9304 } else { 9305 src_ipif = ipif_select_source_v6(dst_ill, 9306 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 9307 zoneid); 9308 } 9309 if (src_ipif == NULL) 9310 goto next_dst; 9311 9312 *saddr = src_ipif->ipif_v6lcl_addr; 9313 dir->dir_sscope = ip_addr_scope_v6(saddr); 9314 slabel = ip6_asp_lookup(saddr, NULL); 9315 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 9316 dir->dir_sdeprecated = 9317 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 9318 ipif_refrele(src_ipif); 9319 next_dst: 9320 ire_refrele(ire); 9321 } 9322 miocack(q, mp, iocp->ioc_count, 0); 9323 } 9324 9325 9326 /* 9327 * Check if this is an address assigned to this machine. 9328 * Skips interfaces that are down by using ire checks. 9329 * Translates mapped addresses to v4 addresses and then 9330 * treats them as such, returning true if the v4 address 9331 * associated with this mapped address is configured. 
9332 * Note: Applications will have to be careful what they do 9333 * with the response; use of mapped addresses limits 9334 * what can be done with the socket, especially with 9335 * respect to socket options and ioctls - neither IPv4 9336 * options nor IPv6 sticky options/ancillary data options 9337 * may be used. 9338 */ 9339 /* ARGSUSED */ 9340 int 9341 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9342 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9343 { 9344 struct sioc_addrreq *sia; 9345 sin_t *sin; 9346 ire_t *ire; 9347 mblk_t *mp1; 9348 zoneid_t zoneid; 9349 9350 ip1dbg(("ip_sioctl_tmyaddr")); 9351 9352 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9353 zoneid = Q_TO_CONN(q)->conn_zoneid; 9354 9355 /* Existence verified in ip_wput_nondata */ 9356 mp1 = mp->b_cont->b_cont; 9357 sia = (struct sioc_addrreq *)mp1->b_rptr; 9358 sin = (sin_t *)&sia->sa_addr; 9359 switch (sin->sin_family) { 9360 case AF_INET6: { 9361 sin6_t *sin6 = (sin6_t *)sin; 9362 9363 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9364 ipaddr_t v4_addr; 9365 9366 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9367 v4_addr); 9368 ire = ire_ctable_lookup(v4_addr, 0, 9369 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9370 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 9371 } else { 9372 in6_addr_t v6addr; 9373 9374 v6addr = sin6->sin6_addr; 9375 ire = ire_ctable_lookup_v6(&v6addr, 0, 9376 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9377 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 9378 } 9379 break; 9380 } 9381 case AF_INET: { 9382 ipaddr_t v4addr; 9383 9384 v4addr = sin->sin_addr.s_addr; 9385 ire = ire_ctable_lookup(v4addr, 0, 9386 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9387 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY); 9388 break; 9389 } 9390 default: 9391 return (EAFNOSUPPORT); 9392 } 9393 if (ire != NULL) { 9394 sia->sa_res = 1; 9395 ire_refrele(ire); 9396 } else { 9397 sia->sa_res = 0; 9398 } 9399 return (0); 9400 } 9401 9402 /* 9403 * Check if this is an address 
assigned on-link i.e. neighbor, 9404 * and makes sure it's reachable from the current zone. 9405 * Returns true for my addresses as well. 9406 * Translates mapped addresses to v4 addresses and then 9407 * treats them as such, returning true if the v4 address 9408 * associated with this mapped address is configured. 9409 * Note: Applications will have to be careful what they do 9410 * with the response; use of mapped addresses limits 9411 * what can be done with the socket, especially with 9412 * respect to socket options and ioctls - neither IPv4 9413 * options nor IPv6 sticky options/ancillary data options 9414 * may be used. 9415 */ 9416 /* ARGSUSED */ 9417 int 9418 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9419 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq) 9420 { 9421 struct sioc_addrreq *sia; 9422 sin_t *sin; 9423 mblk_t *mp1; 9424 ire_t *ire = NULL; 9425 zoneid_t zoneid; 9426 9427 ip1dbg(("ip_sioctl_tonlink")); 9428 9429 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9430 zoneid = Q_TO_CONN(q)->conn_zoneid; 9431 9432 /* Existence verified in ip_wput_nondata */ 9433 mp1 = mp->b_cont->b_cont; 9434 sia = (struct sioc_addrreq *)mp1->b_rptr; 9435 sin = (sin_t *)&sia->sa_addr; 9436 9437 /* 9438 * Match addresses with a zero gateway field to avoid 9439 * routes going through a router. 9440 * Exclude broadcast and multicast addresses. 
9441 */ 9442 switch (sin->sin_family) { 9443 case AF_INET6: { 9444 sin6_t *sin6 = (sin6_t *)sin; 9445 9446 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9447 ipaddr_t v4_addr; 9448 9449 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9450 v4_addr); 9451 if (!CLASSD(v4_addr)) { 9452 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9453 NULL, NULL, zoneid, NULL, 9454 MATCH_IRE_GW); 9455 } 9456 } else { 9457 in6_addr_t v6addr; 9458 in6_addr_t v6gw; 9459 9460 v6addr = sin6->sin6_addr; 9461 v6gw = ipv6_all_zeros; 9462 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9463 ire = ire_route_lookup_v6(&v6addr, 0, 9464 &v6gw, 0, NULL, NULL, zoneid, 9465 NULL, MATCH_IRE_GW); 9466 } 9467 } 9468 break; 9469 } 9470 case AF_INET: { 9471 ipaddr_t v4addr; 9472 9473 v4addr = sin->sin_addr.s_addr; 9474 if (!CLASSD(v4addr)) { 9475 ire = ire_route_lookup(v4addr, 0, 0, 0, 9476 NULL, NULL, zoneid, NULL, 9477 MATCH_IRE_GW); 9478 } 9479 break; 9480 } 9481 default: 9482 return (EAFNOSUPPORT); 9483 } 9484 sia->sa_res = 0; 9485 if (ire != NULL) { 9486 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9487 IRE_LOCAL|IRE_LOOPBACK)) { 9488 sia->sa_res = 1; 9489 } 9490 ire_refrele(ire); 9491 } 9492 return (0); 9493 } 9494 9495 /* 9496 * TBD: implement when kernel maintaines a list of site prefixes. 
9497 */ 9498 /* ARGSUSED */ 9499 int 9500 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9501 ip_ioctl_cmd_t *ipip, void *ifreq) 9502 { 9503 return (ENXIO); 9504 } 9505 9506 /* ARGSUSED */ 9507 int 9508 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9509 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9510 { 9511 ill_t *ill; 9512 mblk_t *mp1; 9513 conn_t *connp; 9514 boolean_t success; 9515 9516 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 9517 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9518 /* ioctl comes down on an conn */ 9519 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9520 connp = Q_TO_CONN(q); 9521 9522 mp->b_datap->db_type = M_IOCTL; 9523 9524 /* 9525 * Send down a copy. (copymsg does not copy b_next/b_prev). 9526 * The original mp contains contaminated b_next values due to 'mi', 9527 * which is needed to do the mi_copy_done. Unfortunately if we 9528 * send down the original mblk itself and if we are popped due to an 9529 * an unplumb before the response comes back from tunnel, 9530 * the streamhead (which does a freemsg) will see this contaminated 9531 * message and the assertion in freemsg about non-null b_next/b_prev 9532 * will panic a DEBUG kernel. 
9533 */ 9534 mp1 = copymsg(mp); 9535 if (mp1 == NULL) 9536 return (ENOMEM); 9537 9538 ill = ipif->ipif_ill; 9539 mutex_enter(&connp->conn_lock); 9540 mutex_enter(&ill->ill_lock); 9541 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 9542 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9543 mp, 0); 9544 } else { 9545 success = ill_pending_mp_add(ill, connp, mp); 9546 } 9547 mutex_exit(&ill->ill_lock); 9548 mutex_exit(&connp->conn_lock); 9549 9550 if (success) { 9551 ip1dbg(("sending down tunparam request ")); 9552 putnext(ill->ill_wq, mp1); 9553 return (EINPROGRESS); 9554 } else { 9555 /* The conn has started closing */ 9556 freemsg(mp1); 9557 return (EINTR); 9558 } 9559 } 9560 9561 static int 9562 ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, 9563 boolean_t x_arp_ioctl, boolean_t if_arp_ioctl) 9564 { 9565 mblk_t *mp1; 9566 mblk_t *mp2; 9567 mblk_t *pending_mp; 9568 ipaddr_t ipaddr; 9569 area_t *area; 9570 struct iocblk *iocp; 9571 conn_t *connp; 9572 struct arpreq *ar; 9573 struct xarpreq *xar; 9574 boolean_t success; 9575 int flags, alength; 9576 char *lladdr; 9577 9578 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9579 connp = Q_TO_CONN(q); 9580 9581 iocp = (struct iocblk *)mp->b_rptr; 9582 /* 9583 * ill has already been set depending on whether 9584 * bsd style or interface style ioctl. 9585 */ 9586 ASSERT(ill != NULL); 9587 9588 /* 9589 * Is this one of the new SIOC*XARP ioctls? 9590 */ 9591 if (x_arp_ioctl) { 9592 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9593 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9594 ar = NULL; 9595 9596 flags = xar->xarp_flags; 9597 lladdr = LLADDR(&xar->xarp_ha); 9598 /* 9599 * Validate against user's link layer address length 9600 * input and name and addr length limits. 
9601 */ 9602 alength = ill->ill_phys_addr_length; 9603 if (iocp->ioc_cmd == SIOCSXARP) { 9604 if (alength != xar->xarp_ha.sdl_alen || 9605 (alength + xar->xarp_ha.sdl_nlen > 9606 sizeof (xar->xarp_ha.sdl_data))) 9607 return (EINVAL); 9608 } 9609 } else { 9610 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9611 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9612 xar = NULL; 9613 9614 flags = ar->arp_flags; 9615 lladdr = ar->arp_ha.sa_data; 9616 /* 9617 * Theoretically, the sa_family could tell us what link 9618 * layer type this operation is trying to deal with. By 9619 * common usage AF_UNSPEC means ethernet. We'll assume 9620 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9621 * for now. Our new SIOC*XARP ioctls can be used more 9622 * generally. 9623 * 9624 * If the underlying media happens to have a non 6 byte 9625 * address, arp module will fail set/get, but the del 9626 * operation will succeed. 9627 */ 9628 alength = 6; 9629 if ((iocp->ioc_cmd != SIOCDARP) && 9630 (alength != ill->ill_phys_addr_length)) { 9631 return (EINVAL); 9632 } 9633 } 9634 9635 /* 9636 * We are going to pass up to ARP a packet chain that looks 9637 * like: 9638 * 9639 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9640 * 9641 * Get a copy of the original IOCTL mblk to head the chain, 9642 * to be sent up (in mp1). Also get another copy to store 9643 * in the ill_pending_mp list, for matching the response 9644 * when it comes back from ARP. 9645 */ 9646 mp1 = copyb(mp); 9647 pending_mp = copymsg(mp); 9648 if (mp1 == NULL || pending_mp == NULL) { 9649 if (mp1 != NULL) 9650 freeb(mp1); 9651 if (pending_mp != NULL) 9652 inet_freemsg(pending_mp); 9653 return (ENOMEM); 9654 } 9655 9656 ipaddr = sin->sin_addr.s_addr; 9657 9658 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9659 (caddr_t)&ipaddr); 9660 if (mp2 == NULL) { 9661 freeb(mp1); 9662 inet_freemsg(pending_mp); 9663 return (ENOMEM); 9664 } 9665 /* Put together the chain. 
*/ 9666 mp1->b_cont = mp2; 9667 mp1->b_datap->db_type = M_IOCTL; 9668 mp2->b_cont = mp; 9669 mp2->b_datap->db_type = M_DATA; 9670 9671 iocp = (struct iocblk *)mp1->b_rptr; 9672 9673 /* 9674 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9675 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9676 * cp_private field (or cp_rval on 32-bit systems) in place of the 9677 * ioc_count field; set ioc_count to be correct. 9678 */ 9679 iocp->ioc_count = MBLKL(mp1->b_cont); 9680 9681 /* 9682 * Set the proper command in the ARP message. 9683 * Convert the SIOC{G|S|D}ARP calls into our 9684 * AR_ENTRY_xxx calls. 9685 */ 9686 area = (area_t *)mp2->b_rptr; 9687 switch (iocp->ioc_cmd) { 9688 case SIOCDARP: 9689 case SIOCDXARP: 9690 /* 9691 * We defer deleting the corresponding IRE until 9692 * we return from arp. 9693 */ 9694 area->area_cmd = AR_ENTRY_DELETE; 9695 area->area_proto_mask_offset = 0; 9696 break; 9697 case SIOCGARP: 9698 case SIOCGXARP: 9699 area->area_cmd = AR_ENTRY_SQUERY; 9700 area->area_proto_mask_offset = 0; 9701 break; 9702 case SIOCSARP: 9703 case SIOCSXARP: { 9704 /* 9705 * Delete the corresponding ire to make sure IP will 9706 * pick up any change from arp. 9707 */ 9708 if (!if_arp_ioctl) { 9709 (void) ip_ire_clookup_and_delete(ipaddr, NULL); 9710 break; 9711 } else { 9712 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9713 if (ipif != NULL) { 9714 (void) ip_ire_clookup_and_delete(ipaddr, ipif); 9715 ipif_refrele(ipif); 9716 } 9717 break; 9718 } 9719 } 9720 } 9721 iocp->ioc_cmd = area->area_cmd; 9722 9723 /* 9724 * Before sending 'mp' to ARP, we have to clear the b_next 9725 * and b_prev. Otherwise if STREAMS encounters such a message 9726 * in freemsg(), (because ARP can close any time) it can cause 9727 * a panic. But mi code needs the b_next and b_prev values of 9728 * mp->b_cont, to complete the ioctl. 
So we store it here 9729 * in pending_mp->bcont, and restore it in ip_sioctl_iocack() 9730 * when the response comes down from ARP. 9731 */ 9732 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9733 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9734 mp->b_cont->b_next = NULL; 9735 mp->b_cont->b_prev = NULL; 9736 9737 mutex_enter(&connp->conn_lock); 9738 mutex_enter(&ill->ill_lock); 9739 /* conn has not yet started closing, hence this can't fail */ 9740 success = ill_pending_mp_add(ill, connp, pending_mp); 9741 ASSERT(success); 9742 mutex_exit(&ill->ill_lock); 9743 mutex_exit(&connp->conn_lock); 9744 9745 /* 9746 * Fill in the rest of the ARP operation fields. 9747 */ 9748 area->area_hw_addr_length = alength; 9749 bcopy(lladdr, 9750 (char *)area + area->area_hw_addr_offset, 9751 area->area_hw_addr_length); 9752 /* Translate the flags. */ 9753 if (flags & ATF_PERM) 9754 area->area_flags |= ACE_F_PERMANENT; 9755 if (flags & ATF_PUBL) 9756 area->area_flags |= ACE_F_PUBLISH; 9757 if (flags & ATF_AUTHORITY) 9758 area->area_flags |= ACE_F_AUTHORITY; 9759 9760 /* 9761 * Up to ARP it goes. The response will come 9762 * back in ip_wput as an M_IOCACK message, and 9763 * will be handed to ip_sioctl_iocack for 9764 * completion. 
9765 */ 9766 putnext(ill->ill_rq, mp1); 9767 return (EINPROGRESS); 9768 } 9769 9770 /* ARGSUSED */ 9771 int 9772 ip_sioctl_xarp(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9773 ip_ioctl_cmd_t *ipip, void *ifreq) 9774 { 9775 struct xarpreq *xar; 9776 boolean_t isv6; 9777 mblk_t *mp1; 9778 int err; 9779 conn_t *connp; 9780 int ifnamelen; 9781 ire_t *ire = NULL; 9782 ill_t *ill = NULL; 9783 struct sockaddr_in *sin; 9784 boolean_t if_arp_ioctl = B_FALSE; 9785 9786 /* ioctl comes down on an conn */ 9787 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9788 connp = Q_TO_CONN(q); 9789 isv6 = connp->conn_af_isv6; 9790 9791 /* Existance verified in ip_wput_nondata */ 9792 mp1 = mp->b_cont->b_cont; 9793 9794 ASSERT(MBLKL(mp1) >= sizeof (*xar)); 9795 xar = (struct xarpreq *)mp1->b_rptr; 9796 sin = (sin_t *)&xar->xarp_pa; 9797 9798 if (isv6 || (xar->xarp_ha.sdl_family != AF_LINK) || 9799 (xar->xarp_pa.ss_family != AF_INET)) 9800 return (ENXIO); 9801 9802 ifnamelen = xar->xarp_ha.sdl_nlen; 9803 if (ifnamelen != 0) { 9804 char *cptr, cval; 9805 9806 if (ifnamelen >= LIFNAMSIZ) 9807 return (EINVAL); 9808 9809 /* 9810 * Instead of bcopying a bunch of bytes, 9811 * null-terminate the string in-situ. 9812 */ 9813 cptr = xar->xarp_ha.sdl_data + ifnamelen; 9814 cval = *cptr; 9815 *cptr = '\0'; 9816 ill = ill_lookup_on_name(xar->xarp_ha.sdl_data, 9817 B_FALSE, isv6, CONNP_TO_WQ(connp), mp, ip_process_ioctl, 9818 &err, NULL); 9819 *cptr = cval; 9820 if (ill == NULL) 9821 return (err); 9822 if (ill->ill_net_type != IRE_IF_RESOLVER) { 9823 ill_refrele(ill); 9824 return (ENXIO); 9825 } 9826 9827 if_arp_ioctl = B_TRUE; 9828 } else { 9829 /* 9830 * PSARC 2003/088 states that if sdl_nlen == 0, it behaves 9831 * as an extended BSD ioctl. The kernel uses the IP address 9832 * to figure out the network interface. 
9833 */ 9834 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9835 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9836 ((ill = ire_to_ill(ire)) == NULL) || 9837 (ill->ill_net_type != IRE_IF_RESOLVER)) { 9838 if (ire != NULL) 9839 ire_refrele(ire); 9840 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9841 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9842 NULL, MATCH_IRE_TYPE); 9843 if ((ire == NULL) || 9844 ((ill = ire_to_ill(ire)) == NULL)) { 9845 if (ire != NULL) 9846 ire_refrele(ire); 9847 return (ENXIO); 9848 } 9849 } 9850 ASSERT(ire != NULL && ill != NULL); 9851 } 9852 9853 err = ip_sioctl_arp_common(ill, q, mp, sin, B_TRUE, if_arp_ioctl); 9854 if (if_arp_ioctl) 9855 ill_refrele(ill); 9856 if (ire != NULL) 9857 ire_refrele(ire); 9858 9859 return (err); 9860 } 9861 9862 /* 9863 * ARP IOCTLs. 9864 * How does IP get in the business of fronting ARP configuration/queries? 9865 * Well its like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9866 * are by tradition passed in through a datagram socket. That lands in IP. 9867 * As it happens, this is just as well since the interface is quite crude in 9868 * that it passes in no information about protocol or hardware types, or 9869 * interface association. After making the protocol assumption, IP is in 9870 * the position to look up the name of the ILL, which ARP will need, and 9871 * format a request that can be handled by ARP. The request is passed up 9872 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9873 * back a response. ARP supports its own set of more general IOCTLs, in 9874 * case anyone is interested. 
9875 */ 9876 /* ARGSUSED */ 9877 int 9878 ip_sioctl_arp(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9879 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9880 { 9881 struct arpreq *ar; 9882 struct sockaddr_in *sin; 9883 ire_t *ire; 9884 boolean_t isv6; 9885 mblk_t *mp1; 9886 int err; 9887 conn_t *connp; 9888 ill_t *ill; 9889 9890 /* ioctl comes down on an conn */ 9891 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9892 connp = Q_TO_CONN(q); 9893 isv6 = connp->conn_af_isv6; 9894 if (isv6) 9895 return (ENXIO); 9896 9897 /* Existance verified in ip_wput_nondata */ 9898 mp1 = mp->b_cont->b_cont; 9899 9900 ar = (struct arpreq *)mp1->b_rptr; 9901 sin = (sin_t *)&ar->arp_pa; 9902 9903 /* 9904 * We need to let ARP know on which interface the IP 9905 * address has an ARP mapping. In the IPMP case, a 9906 * simple forwarding table lookup will return the 9907 * IRE_IF_RESOLVER for the first interface in the group, 9908 * which might not be the interface on which the 9909 * requested IP address was resolved due to the ill 9910 * selection algorithm (see ip_newroute_get_dst_ill()). 9911 * So we do a cache table lookup first: if the IRE cache 9912 * entry for the IP address is still there, it will 9913 * contain the ill pointer for the right interface, so 9914 * we use that. If the cache entry has been flushed, we 9915 * fall back to the forwarding table lookup. This should 9916 * be rare enough since IRE cache entries have a longer 9917 * life expectancy than ARP cache entries. 
9918 */ 9919 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL); 9920 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9921 ((ill = ire_to_ill(ire)) == NULL)) { 9922 if (ire != NULL) 9923 ire_refrele(ire); 9924 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9925 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9926 NULL, MATCH_IRE_TYPE); 9927 if ((ire == NULL) || ((ill = ire_to_ill(ire)) == NULL)) { 9928 if (ire != NULL) 9929 ire_refrele(ire); 9930 return (ENXIO); 9931 } 9932 } 9933 ASSERT(ire != NULL && ill != NULL); 9934 9935 err = ip_sioctl_arp_common(ill, q, mp, sin, B_FALSE, B_FALSE); 9936 ire_refrele(ire); 9937 return (err); 9938 } 9939 9940 /* 9941 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9942 * atomically set/clear the muxids. Also complete the ioctl by acking or 9943 * naking it. Note that the code is structured such that the link type, 9944 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9945 * its clones use the persistent link, while pppd(1M) and perhaps many 9946 * other daemons may use non-persistent link. When combined with some 9947 * ill_t states, linking and unlinking lower streams may be used as 9948 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9949 */ 9950 /* ARGSUSED */ 9951 void 9952 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9953 { 9954 mblk_t *mp1; 9955 mblk_t *mp2; 9956 struct linkblk *li; 9957 queue_t *ipwq; 9958 char *name; 9959 struct qinit *qinfo; 9960 struct ipmx_s *ipmxp; 9961 ill_t *ill = NULL; 9962 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9963 int err = 0; 9964 boolean_t entered_ipsq = B_FALSE; 9965 boolean_t islink; 9966 queue_t *dwq = NULL; 9967 9968 ASSERT(iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_PUNLINK || 9969 iocp->ioc_cmd == I_LINK || iocp->ioc_cmd == I_UNLINK); 9970 9971 islink = (iocp->ioc_cmd == I_PLINK || iocp->ioc_cmd == I_LINK) ? 
9972 B_TRUE : B_FALSE; 9973 9974 mp1 = mp->b_cont; /* This is the linkblk info */ 9975 li = (struct linkblk *)mp1->b_rptr; 9976 9977 /* 9978 * ARP has added this special mblk, and the utility is asking us 9979 * to perform consistency checks, and also atomically set the 9980 * muxid. Ifconfig is an example. It achieves this by using 9981 * /dev/arp as the mux to plink the arp stream, and pushes arp on 9982 * to /dev/udp[6] stream for use as the mux when plinking the IP 9983 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9984 * and other comments in this routine for more details. 9985 */ 9986 mp2 = mp1->b_cont; /* This is added by ARP */ 9987 9988 /* 9989 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9990 * ifconfig which didn't push ARP on top of the dummy mux, we won't 9991 * get the special mblk above. For backward compatibility, we just 9992 * return success. The utility will use SIOCSLIFMUXID to store 9993 * the muxids. This is not atomic, and can leave the streams 9994 * unplumbable if the utility is interrrupted, before it does the 9995 * SIOCSLIFMUXID. 9996 */ 9997 if (mp2 == NULL) { 9998 /* 9999 * At this point we don't know whether or not this is the 10000 * IP module stream or the ARP device stream. We need to 10001 * walk the lower stream in order to find this out, since 10002 * the capability negotiation is done only on the IP module 10003 * stream. IP module instance is identified by the module 10004 * name IP, non-null q_next, and it's wput not being ip_lwput. 10005 * STREAMS ensures that the lower stream (l_qbot) will not 10006 * vanish until this ioctl completes. So we can safely walk 10007 * the stream or refer to the q_ptr. 
10008 */ 10009 ipwq = li->l_qbot; 10010 while (ipwq != NULL) { 10011 qinfo = ipwq->q_qinfo; 10012 name = qinfo->qi_minfo->mi_idname; 10013 if (name != NULL && name[0] != NULL && 10014 (strcmp(name, ip_mod_info.mi_idname) == 0) && 10015 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 10016 (ipwq->q_next != NULL)) { 10017 break; 10018 } 10019 ipwq = ipwq->q_next; 10020 } 10021 /* 10022 * This looks like an IP module stream, so trigger 10023 * the capability reset or re-negotiation if necessary. 10024 */ 10025 if (ipwq != NULL) { 10026 ill = ipwq->q_ptr; 10027 ASSERT(ill != NULL); 10028 10029 if (ipsq == NULL) { 10030 ipsq = ipsq_try_enter(NULL, ill, q, mp, 10031 ip_sioctl_plink, NEW_OP, B_TRUE); 10032 if (ipsq == NULL) 10033 return; 10034 entered_ipsq = B_TRUE; 10035 } 10036 ASSERT(IAM_WRITER_ILL(ill)); 10037 /* 10038 * Store the upper read queue of the module 10039 * immediately below IP, and count the total 10040 * number of lower modules. Do this only 10041 * for I_PLINK or I_LINK event. 10042 */ 10043 ill->ill_lmod_rq = NULL; 10044 ill->ill_lmod_cnt = 0; 10045 if (islink && (dwq = ipwq->q_next) != NULL) { 10046 ill->ill_lmod_rq = RD(dwq); 10047 10048 while (dwq != NULL) { 10049 ill->ill_lmod_cnt++; 10050 dwq = dwq->q_next; 10051 } 10052 } 10053 /* 10054 * There's no point in resetting or re-negotiating if 10055 * we are not bound to the driver, so only do this if 10056 * the DLPI state is idle (up); we assume such state 10057 * since ill_ipif_up_count gets incremented in 10058 * ipif_up_done(), which is after we are bound to the 10059 * driver. Note that in the case of logical 10060 * interfaces, IP won't rebind to the driver unless 10061 * the ill_ipif_up_count is 0, meaning that all other 10062 * IP interfaces (including the main ipif) are in the 10063 * down state. Because of this, we use such counter 10064 * as an indicator, instead of relying on the IPIF_UP 10065 * flag, which is per ipif instance. 
10066 */ 10067 if (ill->ill_ipif_up_count > 0) { 10068 if (islink) 10069 ill_capability_probe(ill); 10070 else 10071 ill_capability_reset(ill); 10072 } 10073 } 10074 goto done; 10075 } 10076 10077 /* 10078 * This is an I_{P}LINK sent down by ifconfig on 10079 * /dev/arp. ARP has appended this last (3rd) mblk, 10080 * giving more info. STREAMS ensures that the lower 10081 * stream (l_qbot) will not vanish until this ioctl 10082 * completes. So we can safely walk the stream or refer 10083 * to the q_ptr. 10084 */ 10085 ipmxp = (struct ipmx_s *)mp2->b_rptr; 10086 if (ipmxp->ipmx_arpdev_stream) { 10087 /* 10088 * The operation is occuring on the arp-device 10089 * stream. 10090 */ 10091 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 10092 q, mp, ip_sioctl_plink, &err, NULL); 10093 if (ill == NULL) { 10094 if (err == EINPROGRESS) { 10095 return; 10096 } else { 10097 err = EINVAL; 10098 goto done; 10099 } 10100 } 10101 10102 if (ipsq == NULL) { 10103 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 10104 NEW_OP, B_TRUE); 10105 if (ipsq == NULL) { 10106 ill_refrele(ill); 10107 return; 10108 } 10109 entered_ipsq = B_TRUE; 10110 } 10111 ASSERT(IAM_WRITER_ILL(ill)); 10112 ill_refrele(ill); 10113 /* 10114 * To ensure consistency between IP and ARP, 10115 * the following LIFO scheme is used in 10116 * plink/punlink. (IP first, ARP last). 10117 * This is because the muxid's are stored 10118 * in the IP stream on the ill. 10119 * 10120 * I_{P}LINK: ifconfig plinks the IP stream before 10121 * plinking the ARP stream. On an arp-dev 10122 * stream, IP checks that it is not yet 10123 * plinked, and it also checks that the 10124 * corresponding IP stream is already plinked. 10125 * 10126 * I_{P}UNLINK: ifconfig punlinks the ARP stream 10127 * before punlinking the IP stream. IP does 10128 * not allow punlink of the IP stream unless 10129 * the arp stream has been punlinked. 
10130 * 10131 */ 10132 if ((islink && 10133 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 10134 (!islink && 10135 ill->ill_arp_muxid != li->l_index)) { 10136 err = EINVAL; 10137 goto done; 10138 } 10139 if (islink) { 10140 ill->ill_arp_muxid = li->l_index; 10141 } else { 10142 ill->ill_arp_muxid = 0; 10143 } 10144 } else { 10145 /* 10146 * This must be the IP module stream with or 10147 * without arp. Walk the stream and locate the 10148 * IP module. An IP module instance is 10149 * identified by the module name IP, non-null 10150 * q_next, and it's wput not being ip_lwput. 10151 */ 10152 ipwq = li->l_qbot; 10153 while (ipwq != NULL) { 10154 qinfo = ipwq->q_qinfo; 10155 name = qinfo->qi_minfo->mi_idname; 10156 if (name != NULL && name[0] != NULL && 10157 (strcmp(name, ip_mod_info.mi_idname) == 0) && 10158 ((void *)(qinfo->qi_putp) != (void *)ip_lwput) && 10159 (ipwq->q_next != NULL)) { 10160 break; 10161 } 10162 ipwq = ipwq->q_next; 10163 } 10164 if (ipwq != NULL) { 10165 ill = ipwq->q_ptr; 10166 ASSERT(ill != NULL); 10167 10168 if (ipsq == NULL) { 10169 ipsq = ipsq_try_enter(NULL, ill, q, mp, 10170 ip_sioctl_plink, NEW_OP, B_TRUE); 10171 if (ipsq == NULL) 10172 return; 10173 entered_ipsq = B_TRUE; 10174 } 10175 ASSERT(IAM_WRITER_ILL(ill)); 10176 /* 10177 * Return error if the ip_mux_id is 10178 * non-zero and command is I_{P}LINK. 10179 * If command is I_{P}UNLINK, return 10180 * error if the arp-devstr is not 10181 * yet punlinked. 10182 */ 10183 if ((islink && ill->ill_ip_muxid != 0) || 10184 (!islink && ill->ill_arp_muxid != 0)) { 10185 err = EINVAL; 10186 goto done; 10187 } 10188 ill->ill_lmod_rq = NULL; 10189 ill->ill_lmod_cnt = 0; 10190 if (islink) { 10191 /* 10192 * Store the upper read queue of the module 10193 * immediately below IP, and count the total 10194 * number of lower modules. 
10195 */ 10196 if ((dwq = ipwq->q_next) != NULL) { 10197 ill->ill_lmod_rq = RD(dwq); 10198 10199 while (dwq != NULL) { 10200 ill->ill_lmod_cnt++; 10201 dwq = dwq->q_next; 10202 } 10203 } 10204 ill->ill_ip_muxid = li->l_index; 10205 } else { 10206 ill->ill_ip_muxid = 0; 10207 } 10208 10209 /* 10210 * See comments above about resetting/re- 10211 * negotiating driver sub-capabilities. 10212 */ 10213 if (ill->ill_ipif_up_count > 0) { 10214 if (islink) 10215 ill_capability_probe(ill); 10216 else 10217 ill_capability_reset(ill); 10218 } 10219 } 10220 } 10221 done: 10222 iocp->ioc_count = 0; 10223 iocp->ioc_error = err; 10224 if (err == 0) 10225 mp->b_datap->db_type = M_IOCACK; 10226 else 10227 mp->b_datap->db_type = M_IOCNAK; 10228 qreply(q, mp); 10229 10230 /* Conn was refheld in ip_sioctl_copyin_setup */ 10231 if (CONN_Q(q)) 10232 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 10233 if (entered_ipsq) 10234 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10235 } 10236 10237 /* 10238 * Search the ioctl command in the ioctl tables and return a pointer 10239 * to the ioctl command information. The ioctl command tables are 10240 * static and fully populated at compile time. 10241 */ 10242 ip_ioctl_cmd_t * 10243 ip_sioctl_lookup(int ioc_cmd) 10244 { 10245 int index; 10246 ip_ioctl_cmd_t *ipip; 10247 ip_ioctl_cmd_t *ipip_end; 10248 10249 if (ioc_cmd == IPI_DONTCARE) 10250 return (NULL); 10251 10252 /* 10253 * Do a 2 step search. First search the indexed table 10254 * based on the least significant byte of the ioctl cmd. 10255 * If we don't find a match, then search the misc table 10256 * serially. 
10257 */ 10258 index = ioc_cmd & 0xFF; 10259 if (index < ip_ndx_ioctl_count) { 10260 ipip = &ip_ndx_ioctl_table[index]; 10261 if (ipip->ipi_cmd == ioc_cmd) { 10262 /* Found a match in the ndx table */ 10263 return (ipip); 10264 } 10265 } 10266 10267 /* Search the misc table */ 10268 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 10269 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 10270 if (ipip->ipi_cmd == ioc_cmd) 10271 /* Found a match in the misc table */ 10272 return (ipip); 10273 } 10274 10275 return (NULL); 10276 } 10277 10278 /* 10279 * Wrapper function for resuming deferred ioctl processing 10280 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 10281 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 10282 */ 10283 /* ARGSUSED */ 10284 void 10285 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 10286 void *dummy_arg) 10287 { 10288 ip_sioctl_copyin_setup(q, mp); 10289 } 10290 10291 /* 10292 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 10293 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 10294 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 10295 * We establish here the size of the block to be copied in. mi_copyin 10296 * arranges for this to happen, an processing continues in ip_wput with 10297 * an M_IOCDATA message. 10298 */ 10299 void 10300 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 10301 { 10302 int copyin_size; 10303 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10304 ip_ioctl_cmd_t *ipip; 10305 cred_t *cr; 10306 10307 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 10308 if (ipip == NULL) { 10309 /* 10310 * The ioctl is not one we understand or own. 10311 * Pass it along to be processed down stream, 10312 * if this is a module instance of IP, else nak 10313 * the ioctl. 
10314 */ 10315 if (q->q_next == NULL) { 10316 goto nak; 10317 } else { 10318 putnext(q, mp); 10319 return; 10320 } 10321 } 10322 10323 /* 10324 * If this is deferred, then we will do all the checks when we 10325 * come back. 10326 */ 10327 if ((iocp->ioc_cmd == SIOCGDSTINFO || 10328 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup()) { 10329 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 10330 return; 10331 } 10332 10333 /* 10334 * Only allow a very small subset of IP ioctls on this stream if 10335 * IP is a module and not a driver. Allowing ioctls to be processed 10336 * in this case may cause assert failures or data corruption. 10337 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 10338 * ioctls allowed on an IP module stream, after which this stream 10339 * normally becomes a multiplexor (at which time the stream head 10340 * will fail all ioctls). 10341 */ 10342 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 10343 if (ipip->ipi_flags & IPI_PASS_DOWN) { 10344 /* 10345 * Pass common Streams ioctls which the IP 10346 * module does not own or consume along to 10347 * be processed down stream. 10348 */ 10349 putnext(q, mp); 10350 return; 10351 } else { 10352 goto nak; 10353 } 10354 } 10355 10356 /* Make sure we have ioctl data to process. */ 10357 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 10358 goto nak; 10359 10360 /* 10361 * Prefer dblk credential over ioctl credential; some synthesized 10362 * ioctls have kcred set because there's no way to crhold() 10363 * a credential in some contexts. (ioc_cr is not crfree() by 10364 * the framework; the caller of ioctl needs to hold the reference 10365 * for the duration of the call). 
10366 */ 10367 cr = DB_CREDDEF(mp, iocp->ioc_cr); 10368 10369 /* Make sure normal users don't send down privileged ioctls */ 10370 if ((ipip->ipi_flags & IPI_PRIV) && 10371 (cr != NULL) && secpolicy_net_config(cr, B_TRUE) != 0) { 10372 /* We checked the privilege earlier but log it here */ 10373 miocnak(q, mp, 0, secpolicy_net_config(cr, B_FALSE)); 10374 return; 10375 } 10376 10377 /* 10378 * The ioctl command tables can only encode fixed length 10379 * ioctl data. If the length is variable, the table will 10380 * encode the length as zero. Such special cases are handled 10381 * below in the switch. 10382 */ 10383 if (ipip->ipi_copyin_size != 0) { 10384 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 10385 return; 10386 } 10387 10388 switch (iocp->ioc_cmd) { 10389 case O_SIOCGIFCONF: 10390 case SIOCGIFCONF: 10391 /* 10392 * This IOCTL is hilarious. See comments in 10393 * ip_sioctl_get_ifconf for the story. 10394 */ 10395 if (iocp->ioc_count == TRANSPARENT) 10396 copyin_size = SIZEOF_STRUCT(ifconf, 10397 iocp->ioc_flag); 10398 else 10399 copyin_size = iocp->ioc_count; 10400 mi_copyin(q, mp, NULL, copyin_size); 10401 return; 10402 10403 case O_SIOCGLIFCONF: 10404 case SIOCGLIFCONF: 10405 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 10406 mi_copyin(q, mp, NULL, copyin_size); 10407 return; 10408 10409 case SIOCGLIFSRCOF: 10410 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 10411 mi_copyin(q, mp, NULL, copyin_size); 10412 return; 10413 case SIOCGIP6ADDRPOLICY: 10414 ip_sioctl_ip6addrpolicy(q, mp); 10415 ip6_asp_table_refrele(); 10416 return; 10417 10418 case SIOCSIP6ADDRPOLICY: 10419 ip_sioctl_ip6addrpolicy(q, mp); 10420 return; 10421 10422 case SIOCGDSTINFO: 10423 ip_sioctl_dstinfo(q, mp); 10424 ip6_asp_table_refrele(); 10425 return; 10426 10427 case I_PLINK: 10428 case I_PUNLINK: 10429 case I_LINK: 10430 case I_UNLINK: 10431 /* 10432 * We treat non-persistent link similarly as the persistent 10433 * link case, in terms of plumbing/unplumbing, as well 
as 10434 * dynamic re-plumbing events indicator. See comments 10435 * in ip_sioctl_plink() for more. 10436 * 10437 * Request can be enqueued in the 'ipsq' while waiting 10438 * to become exclusive. So bump up the conn ref. 10439 */ 10440 if (CONN_Q(q)) 10441 CONN_INC_REF(Q_TO_CONN(q)); 10442 ip_sioctl_plink(NULL, q, mp, NULL); 10443 return; 10444 10445 case ND_GET: 10446 case ND_SET: 10447 /* 10448 * Use of the nd table requires holding the reader lock. 10449 * Modifying the nd table thru nd_load/nd_unload requires 10450 * the writer lock. 10451 */ 10452 rw_enter(&ip_g_nd_lock, RW_READER); 10453 if (nd_getset(q, ip_g_nd, mp)) { 10454 rw_exit(&ip_g_nd_lock); 10455 10456 if (iocp->ioc_error) 10457 iocp->ioc_count = 0; 10458 mp->b_datap->db_type = M_IOCACK; 10459 qreply(q, mp); 10460 return; 10461 } 10462 rw_exit(&ip_g_nd_lock); 10463 /* 10464 * We don't understand this subioctl of ND_GET / ND_SET. 10465 * Maybe intended for some driver / module below us 10466 */ 10467 if (q->q_next) { 10468 putnext(q, mp); 10469 } else { 10470 iocp->ioc_error = ENOENT; 10471 mp->b_datap->db_type = M_IOCNAK; 10472 iocp->ioc_count = 0; 10473 qreply(q, mp); 10474 } 10475 return; 10476 10477 case IP_IOCTL: 10478 ip_wput_ioctl(q, mp); 10479 return; 10480 default: 10481 cmn_err(CE_PANIC, "should not happen "); 10482 } 10483 nak: 10484 if (mp->b_cont != NULL) { 10485 freemsg(mp->b_cont); 10486 mp->b_cont = NULL; 10487 } 10488 iocp->ioc_error = EINVAL; 10489 mp->b_datap->db_type = M_IOCNAK; 10490 iocp->ioc_count = 0; 10491 qreply(q, mp); 10492 } 10493 10494 /* ip_wput hands off ARP IOCTL responses to us */ 10495 void 10496 ip_sioctl_iocack(queue_t *q, mblk_t *mp) 10497 { 10498 struct arpreq *ar; 10499 struct xarpreq *xar; 10500 area_t *area; 10501 mblk_t *area_mp; 10502 struct iocblk *iocp; 10503 mblk_t *orig_ioc_mp, *tmp; 10504 struct iocblk *orig_iocp; 10505 ill_t *ill; 10506 conn_t *connp = NULL; 10507 uint_t ioc_id; 10508 mblk_t *pending_mp; 10509 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl 
= B_FALSE; 10510 int *flagsp; 10511 char *storage = NULL; 10512 sin_t *sin; 10513 ipaddr_t addr; 10514 int err; 10515 10516 ill = q->q_ptr; 10517 ASSERT(ill != NULL); 10518 10519 /* 10520 * We should get back from ARP a packet chain that looks like: 10521 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 10522 */ 10523 if (!(area_mp = mp->b_cont) || 10524 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 10525 !(orig_ioc_mp = area_mp->b_cont) || 10526 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 10527 freemsg(mp); 10528 return; 10529 } 10530 10531 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 10532 10533 tmp = (orig_ioc_mp->b_cont)->b_cont; 10534 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 10535 (orig_iocp->ioc_cmd == SIOCSXARP) || 10536 (orig_iocp->ioc_cmd == SIOCDXARP)) { 10537 x_arp_ioctl = B_TRUE; 10538 xar = (struct xarpreq *)tmp->b_rptr; 10539 sin = (sin_t *)&xar->xarp_pa; 10540 flagsp = &xar->xarp_flags; 10541 storage = xar->xarp_ha.sdl_data; 10542 if (xar->xarp_ha.sdl_nlen != 0) 10543 ifx_arp_ioctl = B_TRUE; 10544 } else { 10545 ar = (struct arpreq *)tmp->b_rptr; 10546 sin = (sin_t *)&ar->arp_pa; 10547 flagsp = &ar->arp_flags; 10548 storage = ar->arp_ha.sa_data; 10549 } 10550 10551 iocp = (struct iocblk *)mp->b_rptr; 10552 10553 /* 10554 * Pick out the originating queue based on the ioc_id. 10555 */ 10556 ioc_id = iocp->ioc_id; 10557 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); 10558 if (pending_mp == NULL) { 10559 ASSERT(connp == NULL); 10560 inet_freemsg(mp); 10561 return; 10562 } 10563 ASSERT(connp != NULL); 10564 q = CONNP_TO_WQ(connp); 10565 10566 /* Uncouple the internally generated IOCTL from the original one */ 10567 area = (area_t *)area_mp->b_rptr; 10568 area_mp->b_cont = NULL; 10569 10570 /* 10571 * Restore the b_next and b_prev used by mi code. This is needed 10572 * to complete the ioctl using mi* functions. We stored them in 10573 * the pending mp prior to sending the request to ARP. 
10574 */ 10575 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 10576 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 10577 inet_freemsg(pending_mp); 10578 10579 /* 10580 * We're done if there was an error or if this is not an SIOCG{X}ARP 10581 * Catch the case where there is an IRE_CACHE by no entry in the 10582 * arp table. 10583 */ 10584 addr = sin->sin_addr.s_addr; 10585 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 10586 ire_t *ire; 10587 dl_unitdata_req_t *dlup; 10588 mblk_t *llmp; 10589 int addr_len; 10590 ill_t *ipsqill = NULL; 10591 10592 if (ifx_arp_ioctl) { 10593 /* 10594 * There's no need to lookup the ill, since 10595 * we've already done that when we started 10596 * processing the ioctl and sent the message 10597 * to ARP on that ill. So use the ill that 10598 * is stored in q->q_ptr. 10599 */ 10600 ipsqill = ill; 10601 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10602 ipsqill->ill_ipif, ALL_ZONES, 10603 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL); 10604 } else { 10605 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10606 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 10607 if (ire != NULL) 10608 ipsqill = ire_to_ill(ire); 10609 } 10610 10611 if ((x_arp_ioctl) && (ipsqill != NULL)) 10612 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10613 10614 if (ire != NULL) { 10615 /* 10616 * Since the ire obtained from cachetable is used for 10617 * mac addr copying below, treat an incomplete ire as if 10618 * as if we never found it. 10619 */ 10620 if (ire->ire_nce != NULL && 10621 ire->ire_nce->nce_state != ND_REACHABLE) { 10622 ire_refrele(ire); 10623 ire = NULL; 10624 ipsqill = NULL; 10625 goto errack; 10626 } 10627 *flagsp = ATF_INUSE; 10628 llmp = (ire->ire_nce != NULL ? 
10629 ire->ire_nce->nce_res_mp : NULL); 10630 if (llmp != NULL && ipsqill != NULL) { 10631 uchar_t *macaddr; 10632 10633 addr_len = ipsqill->ill_phys_addr_length; 10634 if (x_arp_ioctl && ((addr_len + 10635 ipsqill->ill_name_length) > 10636 sizeof (xar->xarp_ha.sdl_data))) { 10637 ire_refrele(ire); 10638 freemsg(mp); 10639 ip_ioctl_finish(q, orig_ioc_mp, 10640 EINVAL, NO_COPYOUT, NULL); 10641 return; 10642 } 10643 *flagsp |= ATF_COM; 10644 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10645 if (ipsqill->ill_sap_length < 0) 10646 macaddr = llmp->b_rptr + 10647 dlup->dl_dest_addr_offset; 10648 else 10649 macaddr = llmp->b_rptr + 10650 dlup->dl_dest_addr_offset + 10651 ipsqill->ill_sap_length; 10652 /* 10653 * For SIOCGARP, MAC address length 10654 * validation has already been done 10655 * before the ioctl was issued to ARP to 10656 * allow it to progress only on 6 byte 10657 * addressable (ethernet like) media. Thus 10658 * the mac address copying can not overwrite 10659 * the sa_data area below. 10660 */ 10661 bcopy(macaddr, storage, addr_len); 10662 } 10663 /* Ditch the internal IOCTL. */ 10664 freemsg(mp); 10665 ire_refrele(ire); 10666 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10667 return; 10668 } 10669 } 10670 10671 /* 10672 * Delete the coresponding IRE_CACHE if any. 10673 * Reset the error if there was one (in case there was no entry 10674 * in arp.) 10675 */ 10676 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10677 ipif_t *ipintf = NULL; 10678 10679 if (ifx_arp_ioctl) { 10680 /* 10681 * There's no need to lookup the ill, since 10682 * we've already done that when we started 10683 * processing the ioctl and sent the message 10684 * to ARP on that ill. So use the ill that 10685 * is stored in q->q_ptr. 10686 */ 10687 ipintf = ill->ill_ipif; 10688 } 10689 if (ip_ire_clookup_and_delete(addr, ipintf)) { 10690 /* 10691 * The address in "addr" may be an entry for a 10692 * router. 
If that's true, then any off-net 10693 * IRE_CACHE entries that go through the router 10694 * with address "addr" must be clobbered. Use 10695 * ire_walk to achieve this goal. 10696 */ 10697 if (ifx_arp_ioctl) 10698 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10699 ire_delete_cache_gw, (char *)&addr, ill); 10700 else 10701 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10702 ALL_ZONES); 10703 iocp->ioc_error = 0; 10704 } 10705 } 10706 errack: 10707 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10708 err = iocp->ioc_error; 10709 freemsg(mp); 10710 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); 10711 return; 10712 } 10713 10714 /* 10715 * Completion of an SIOCG{X}ARP. Translate the information from 10716 * the area_t into the struct {x}arpreq. 10717 */ 10718 if (x_arp_ioctl) { 10719 storage += ill_xarp_info(&xar->xarp_ha, ill); 10720 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10721 sizeof (xar->xarp_ha.sdl_data)) { 10722 freemsg(mp); 10723 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, 10724 NULL); 10725 return; 10726 } 10727 } 10728 *flagsp = ATF_INUSE; 10729 if (area->area_flags & ACE_F_PERMANENT) 10730 *flagsp |= ATF_PERM; 10731 if (area->area_flags & ACE_F_PUBLISH) 10732 *flagsp |= ATF_PUBL; 10733 if (area->area_flags & ACE_F_AUTHORITY) 10734 *flagsp |= ATF_AUTHORITY; 10735 if (area->area_hw_addr_length != 0) { 10736 *flagsp |= ATF_COM; 10737 /* 10738 * For SIOCGARP, MAC address length validation has 10739 * already been done before the ioctl was issued to ARP 10740 * to allow it to progress only on 6 byte addressable 10741 * (ethernet like) media. Thus the mac address copying 10742 * can not overwrite the sa_data area below. 10743 */ 10744 bcopy((char *)area + area->area_hw_addr_offset, 10745 storage, area->area_hw_addr_length); 10746 } 10747 10748 /* Ditch the internal IOCTL. */ 10749 freemsg(mp); 10750 /* Complete the original. 
*/ 10751 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10752 } 10753 10754 /* 10755 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 10756 * interface) create the next available logical interface for this 10757 * physical interface. 10758 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10759 * ipif with the specified name. 10760 * 10761 * If the address family is not AF_UNSPEC then set the address as well. 10762 * 10763 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10764 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 10765 * 10766 * Executed as a writer on the ill or ill group. 10767 * So no lock is needed to traverse the ipif chain, or examine the 10768 * phyint flags. 10769 */ 10770 /* ARGSUSED */ 10771 int 10772 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10773 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10774 { 10775 mblk_t *mp1; 10776 struct lifreq *lifr; 10777 boolean_t isv6; 10778 boolean_t exists; 10779 char *name; 10780 char *endp; 10781 char *cp; 10782 int namelen; 10783 ipif_t *ipif; 10784 long id; 10785 ipsq_t *ipsq; 10786 ill_t *ill; 10787 sin_t *sin; 10788 int err = 0; 10789 boolean_t found_sep = B_FALSE; 10790 conn_t *connp; 10791 zoneid_t zoneid; 10792 int orig_ifindex = 0; 10793 10794 ip1dbg(("ip_sioctl_addif\n")); 10795 /* Existence of mp1 has been checked in ip_wput_nondata */ 10796 mp1 = mp->b_cont->b_cont; 10797 /* 10798 * Null terminate the string to protect against buffer 10799 * overrun. String was generated by user code and may not 10800 * be trusted. 
10801 */ 10802 lifr = (struct lifreq *)mp1->b_rptr; 10803 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10804 name = lifr->lifr_name; 10805 ASSERT(CONN_Q(q)); 10806 connp = Q_TO_CONN(q); 10807 isv6 = connp->conn_af_isv6; 10808 zoneid = connp->conn_zoneid; 10809 namelen = mi_strlen(name); 10810 if (namelen == 0) 10811 return (EINVAL); 10812 10813 exists = B_FALSE; 10814 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10815 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10816 /* 10817 * Allow creating lo0 using SIOCLIFADDIF. 10818 * can't be any other writer thread. So can pass null below 10819 * for the last 4 args to ipif_lookup_name. 10820 */ 10821 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, 10822 B_TRUE, &exists, isv6, zoneid, NULL, NULL, NULL, NULL); 10823 /* Prevent any further action */ 10824 if (ipif == NULL) { 10825 return (ENOBUFS); 10826 } else if (!exists) { 10827 /* We created the ipif now and as writer */ 10828 ipif_refrele(ipif); 10829 return (0); 10830 } else { 10831 ill = ipif->ipif_ill; 10832 ill_refhold(ill); 10833 ipif_refrele(ipif); 10834 } 10835 } else { 10836 /* Look for a colon in the name. */ 10837 endp = &name[namelen]; 10838 for (cp = endp; --cp > name; ) { 10839 if (*cp == IPIF_SEPARATOR_CHAR) { 10840 found_sep = B_TRUE; 10841 /* 10842 * Reject any non-decimal aliases for plumbing 10843 * of logical interfaces. Aliases with leading 10844 * zeroes are also rejected as they introduce 10845 * ambiguity in the naming of the interfaces. 10846 * Comparing with "0" takes care of all such 10847 * cases. 
10848 */ 10849 if ((strncmp("0", cp+1, 1)) == 0) 10850 return (EINVAL); 10851 10852 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10853 id <= 0 || *endp != '\0') { 10854 return (EINVAL); 10855 } 10856 *cp = '\0'; 10857 break; 10858 } 10859 } 10860 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10861 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL); 10862 if (found_sep) 10863 *cp = IPIF_SEPARATOR_CHAR; 10864 if (ill == NULL) 10865 return (err); 10866 } 10867 10868 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10869 B_TRUE); 10870 10871 /* 10872 * Release the refhold due to the lookup, now that we are excl 10873 * or we are just returning 10874 */ 10875 ill_refrele(ill); 10876 10877 if (ipsq == NULL) 10878 return (EINPROGRESS); 10879 10880 /* 10881 * If the interface is failed, inactive or offlined, look for a working 10882 * interface in the ill group and create the ipif there. If we can't 10883 * find a good interface, create the ipif anyway so that in.mpathd can 10884 * move it to the first repaired interface. 10885 */ 10886 if ((ill->ill_phyint->phyint_flags & 10887 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10888 ill->ill_phyint->phyint_groupname_len != 0) { 10889 phyint_t *phyi; 10890 char *groupname = ill->ill_phyint->phyint_groupname; 10891 10892 /* 10893 * We're looking for a working interface, but it doesn't matter 10894 * if it's up or down; so instead of following the group lists, 10895 * we look at each physical interface and compare the groupname. 10896 * We're only interested in interfaces with IPv4 (resp. IPv6) 10897 * plumbed when we're adding an IPv4 (resp. IPv6) ipif. 10898 * Otherwise we create the ipif on the failed interface. 
10899 */ 10900 rw_enter(&ill_g_lock, RW_READER); 10901 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 10902 for (; phyi != NULL; 10903 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 10904 phyi, AVL_AFTER)) { 10905 if (phyi->phyint_groupname_len == 0) 10906 continue; 10907 ASSERT(phyi->phyint_groupname != NULL); 10908 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10909 !(phyi->phyint_flags & 10910 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10911 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10912 (phyi->phyint_illv4 != NULL))) { 10913 break; 10914 } 10915 } 10916 rw_exit(&ill_g_lock); 10917 10918 if (phyi != NULL) { 10919 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10920 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10921 phyi->phyint_illv4); 10922 } 10923 } 10924 10925 /* 10926 * We are now exclusive on the ipsq, so an ill move will be serialized 10927 * before or after us. 10928 */ 10929 ASSERT(IAM_WRITER_ILL(ill)); 10930 ASSERT(ill->ill_move_in_progress == B_FALSE); 10931 10932 if (found_sep && orig_ifindex == 0) { 10933 /* Now see if there is an IPIF with this unit number. */ 10934 for (ipif = ill->ill_ipif; ipif != NULL; 10935 ipif = ipif->ipif_next) { 10936 if (ipif->ipif_id == id) { 10937 err = EEXIST; 10938 goto done; 10939 } 10940 } 10941 } 10942 10943 /* 10944 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10945 * of lo0. We never come here when we plumb lo0:0. It 10946 * happens in ipif_lookup_on_name. 10947 * The specified unit number is ignored when we create the ipif on a 10948 * different interface. However, we save it in ipif_orig_ipifid below so 10949 * that the ipif fails back to the right position. 10950 */ 10951 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 
10952 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10953 err = ENOBUFS; 10954 goto done; 10955 } 10956 10957 /* Return created name with ioctl */ 10958 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10959 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10960 ip1dbg(("created %s\n", lifr->lifr_name)); 10961 10962 /* Set address */ 10963 sin = (sin_t *)&lifr->lifr_addr; 10964 if (sin->sin_family != AF_UNSPEC) { 10965 err = ip_sioctl_addr(ipif, sin, q, mp, 10966 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10967 } 10968 10969 /* Set ifindex and unit number for failback */ 10970 if (err == 0 && orig_ifindex != 0) { 10971 ipif->ipif_orig_ifindex = orig_ifindex; 10972 if (found_sep) { 10973 ipif->ipif_orig_ipifid = id; 10974 } 10975 } 10976 10977 done: 10978 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10979 return (err); 10980 } 10981 10982 /* 10983 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10984 * interface) delete it based on the IP address (on this physical interface). 10985 * Otherwise delete it based on the ipif_id. 10986 * Also, special handling to allow a removeif of lo0. 10987 */ 10988 /* ARGSUSED */ 10989 int 10990 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10991 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10992 { 10993 conn_t *connp; 10994 ill_t *ill = ipif->ipif_ill; 10995 boolean_t success; 10996 10997 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10998 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10999 ASSERT(IAM_WRITER_IPIF(ipif)); 11000 11001 connp = Q_TO_CONN(q); 11002 /* 11003 * Special case for unplumbing lo0 (the loopback physical interface). 11004 * If unplumbing lo0, the incoming address structure has been 11005 * initialized to all zeros. When unplumbing lo0, all its logical 11006 * interfaces must be removed too. 11007 * 11008 * Note that this interface may be called to remove a specific 11009 * loopback logical interface (eg, lo0:1). 
But in that case 11010 * ipif->ipif_id != 0 so that the code path for that case is the 11011 * same as any other interface (meaning it skips the code directly 11012 * below). 11013 */ 11014 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 11015 if (sin->sin_family == AF_UNSPEC && 11016 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 11017 /* 11018 * Mark it condemned. No new ref. will be made to ill. 11019 */ 11020 mutex_enter(&ill->ill_lock); 11021 ill->ill_state_flags |= ILL_CONDEMNED; 11022 for (ipif = ill->ill_ipif; ipif != NULL; 11023 ipif = ipif->ipif_next) { 11024 ipif->ipif_state_flags |= IPIF_CONDEMNED; 11025 } 11026 mutex_exit(&ill->ill_lock); 11027 11028 ipif = ill->ill_ipif; 11029 /* unplumb the loopback interface */ 11030 ill_delete(ill); 11031 mutex_enter(&connp->conn_lock); 11032 mutex_enter(&ill->ill_lock); 11033 ASSERT(ill->ill_group == NULL); 11034 11035 /* Are any references to this ill active */ 11036 if (ill_is_quiescent(ill)) { 11037 mutex_exit(&ill->ill_lock); 11038 mutex_exit(&connp->conn_lock); 11039 ill_delete_tail(ill); 11040 mi_free(ill); 11041 return (0); 11042 } 11043 success = ipsq_pending_mp_add(connp, ipif, 11044 CONNP_TO_WQ(connp), mp, ILL_FREE); 11045 mutex_exit(&connp->conn_lock); 11046 mutex_exit(&ill->ill_lock); 11047 if (success) 11048 return (EINPROGRESS); 11049 else 11050 return (EINTR); 11051 } 11052 } 11053 11054 /* 11055 * We are exclusive on the ipsq, so an ill move will be serialized 11056 * before or after us. 
11057 */ 11058 ASSERT(ill->ill_move_in_progress == B_FALSE); 11059 11060 if (ipif->ipif_id == 0) { 11061 /* Find based on address */ 11062 if (ipif->ipif_isv6) { 11063 sin6_t *sin6; 11064 11065 if (sin->sin_family != AF_INET6) 11066 return (EAFNOSUPPORT); 11067 11068 sin6 = (sin6_t *)sin; 11069 /* We are a writer, so we should be able to lookup */ 11070 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 11071 ill, ALL_ZONES, NULL, NULL, NULL, NULL); 11072 if (ipif == NULL) { 11073 /* 11074 * Maybe the address in on another interface in 11075 * the same IPMP group? We check this below. 11076 */ 11077 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 11078 NULL, ALL_ZONES, NULL, NULL, NULL, NULL); 11079 } 11080 } else { 11081 ipaddr_t addr; 11082 11083 if (sin->sin_family != AF_INET) 11084 return (EAFNOSUPPORT); 11085 11086 addr = sin->sin_addr.s_addr; 11087 /* We are a writer, so we should be able to lookup */ 11088 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, 11089 NULL, NULL, NULL); 11090 if (ipif == NULL) { 11091 /* 11092 * Maybe the address in on another interface in 11093 * the same IPMP group? We check this below. 11094 */ 11095 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, 11096 NULL, NULL, NULL, NULL); 11097 } 11098 } 11099 if (ipif == NULL) { 11100 return (EADDRNOTAVAIL); 11101 } 11102 /* 11103 * When the address to be removed is hosted on a different 11104 * interface, we check if the interface is in the same IPMP 11105 * group as the specified one; if so we proceed with the 11106 * removal. 11107 * ill->ill_group is NULL when the ill is down, so we have to 11108 * compare the group names instead. 
11109 */ 11110 if (ipif->ipif_ill != ill && 11111 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || 11112 ill->ill_phyint->phyint_groupname_len == 0 || 11113 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, 11114 ill->ill_phyint->phyint_groupname) != 0)) { 11115 ipif_refrele(ipif); 11116 return (EADDRNOTAVAIL); 11117 } 11118 11119 /* This is a writer */ 11120 ipif_refrele(ipif); 11121 } 11122 11123 /* 11124 * Can not delete instance zero since it is tied to the ill. 11125 */ 11126 if (ipif->ipif_id == 0) 11127 return (EBUSY); 11128 11129 mutex_enter(&ill->ill_lock); 11130 ipif->ipif_state_flags |= IPIF_CONDEMNED; 11131 mutex_exit(&ill->ill_lock); 11132 11133 ipif_free(ipif); 11134 11135 mutex_enter(&connp->conn_lock); 11136 mutex_enter(&ill->ill_lock); 11137 11138 /* Are any references to this ipif active */ 11139 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { 11140 mutex_exit(&ill->ill_lock); 11141 mutex_exit(&connp->conn_lock); 11142 ipif_non_duplicate(ipif); 11143 ipif_down_tail(ipif); 11144 ipif_free_tail(ipif); 11145 return (0); 11146 } 11147 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 11148 IPIF_FREE); 11149 mutex_exit(&ill->ill_lock); 11150 mutex_exit(&connp->conn_lock); 11151 if (success) 11152 return (EINPROGRESS); 11153 else 11154 return (EINTR); 11155 } 11156 11157 /* 11158 * Restart the removeif ioctl. The refcnt has gone down to 0. 11159 * The ipif is already condemned. So can't find it thru lookups. 
11160 */ 11161 /* ARGSUSED */ 11162 int 11163 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 11164 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 11165 { 11166 ill_t *ill; 11167 11168 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 11169 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11170 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 11171 ill = ipif->ipif_ill; 11172 ASSERT(IAM_WRITER_ILL(ill)); 11173 ASSERT((ipif->ipif_state_flags & IPIF_CONDEMNED) && 11174 (ill->ill_state_flags & IPIF_CONDEMNED)); 11175 ill_delete_tail(ill); 11176 mi_free(ill); 11177 return (0); 11178 } 11179 11180 ill = ipif->ipif_ill; 11181 ASSERT(IAM_WRITER_IPIF(ipif)); 11182 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 11183 11184 ipif_non_duplicate(ipif); 11185 ipif_down_tail(ipif); 11186 ipif_free_tail(ipif); 11187 11188 ILL_UNMARK_CHANGING(ill); 11189 return (0); 11190 } 11191 11192 /* 11193 * Set the local interface address. 11194 * Allow an address of all zero when the interface is down. 11195 */ 11196 /* ARGSUSED */ 11197 int 11198 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11199 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 11200 { 11201 int err = 0; 11202 in6_addr_t v6addr; 11203 boolean_t need_up = B_FALSE; 11204 11205 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 11206 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11207 11208 ASSERT(IAM_WRITER_IPIF(ipif)); 11209 11210 if (ipif->ipif_isv6) { 11211 sin6_t *sin6; 11212 ill_t *ill; 11213 phyint_t *phyi; 11214 11215 if (sin->sin_family != AF_INET6) 11216 return (EAFNOSUPPORT); 11217 11218 sin6 = (sin6_t *)sin; 11219 v6addr = sin6->sin6_addr; 11220 ill = ipif->ipif_ill; 11221 phyi = ill->ill_phyint; 11222 11223 /* 11224 * Enforce that true multicast interfaces have a link-local 11225 * address for logical unit 0. 
11226 */ 11227 if (ipif->ipif_id == 0 && 11228 (ill->ill_flags & ILLF_MULTICAST) && 11229 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 11230 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 11231 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 11232 return (EADDRNOTAVAIL); 11233 } 11234 11235 /* 11236 * up interfaces shouldn't have the unspecified address 11237 * unless they also have the IPIF_NOLOCAL flags set and 11238 * have a subnet assigned. 11239 */ 11240 if ((ipif->ipif_flags & IPIF_UP) && 11241 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 11242 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 11243 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 11244 return (EADDRNOTAVAIL); 11245 } 11246 11247 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11248 return (EADDRNOTAVAIL); 11249 } else { 11250 ipaddr_t addr; 11251 11252 if (sin->sin_family != AF_INET) 11253 return (EAFNOSUPPORT); 11254 11255 addr = sin->sin_addr.s_addr; 11256 11257 /* Allow 0 as the local address. */ 11258 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11259 return (EADDRNOTAVAIL); 11260 11261 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11262 } 11263 11264 11265 /* 11266 * Even if there is no change we redo things just to rerun 11267 * ipif_set_default. 11268 */ 11269 if (ipif->ipif_flags & IPIF_UP) { 11270 /* 11271 * Setting a new local address, make sure 11272 * we have net and subnet bcast ire's for 11273 * the old address if we need them. 11274 */ 11275 if (!ipif->ipif_isv6) 11276 ipif_check_bcast_ires(ipif); 11277 /* 11278 * If the interface is already marked up, 11279 * we call ipif_down which will take care 11280 * of ditching any IREs that have been set 11281 * up based on the old interface address. 
11282 */ 11283 err = ipif_logical_down(ipif, q, mp); 11284 if (err == EINPROGRESS) 11285 return (err); 11286 ipif_down_tail(ipif); 11287 need_up = 1; 11288 } 11289 11290 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 11291 return (err); 11292 } 11293 11294 int 11295 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11296 boolean_t need_up) 11297 { 11298 in6_addr_t v6addr; 11299 ipaddr_t addr; 11300 sin6_t *sin6; 11301 int sinlen; 11302 int err = 0; 11303 ill_t *ill = ipif->ipif_ill; 11304 boolean_t need_dl_down; 11305 boolean_t need_arp_down; 11306 struct iocblk *iocp; 11307 11308 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 11309 11310 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 11311 ill->ill_name, ipif->ipif_id, (void *)ipif)); 11312 ASSERT(IAM_WRITER_IPIF(ipif)); 11313 11314 /* Must cancel any pending timer before taking the ill_lock */ 11315 if (ipif->ipif_recovery_id != 0) 11316 (void) untimeout(ipif->ipif_recovery_id); 11317 ipif->ipif_recovery_id = 0; 11318 11319 if (ipif->ipif_isv6) { 11320 sin6 = (sin6_t *)sin; 11321 v6addr = sin6->sin6_addr; 11322 sinlen = sizeof (struct sockaddr_in6); 11323 } else { 11324 addr = sin->sin_addr.s_addr; 11325 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11326 sinlen = sizeof (struct sockaddr_in); 11327 } 11328 mutex_enter(&ill->ill_lock); 11329 ipif->ipif_v6lcl_addr = v6addr; 11330 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 11331 ipif->ipif_v6src_addr = ipv6_all_zeros; 11332 } else { 11333 ipif->ipif_v6src_addr = v6addr; 11334 } 11335 ipif->ipif_addr_ready = 0; 11336 11337 /* 11338 * If the interface was previously marked as a duplicate, then since 11339 * we've now got a "new" address, it should no longer be considered a 11340 * duplicate -- even if the "new" address is the same as the old one. 11341 * Note that if all ipifs are down, we may have a pending ARP down 11342 * event to handle. 
This is because we want to recover from duplicates 11343 * and thus delay tearing down ARP until the duplicates have been 11344 * removed or disabled. 11345 */ 11346 need_dl_down = need_arp_down = B_FALSE; 11347 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11348 need_arp_down = !need_up; 11349 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11350 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11351 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11352 need_dl_down = B_TRUE; 11353 } 11354 } 11355 11356 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 11357 !ill->ill_is_6to4tun) { 11358 queue_t *wqp = ill->ill_wq; 11359 11360 /* 11361 * The local address of this interface is a 6to4 address, 11362 * check if this interface is in fact a 6to4 tunnel or just 11363 * an interface configured with a 6to4 address. We are only 11364 * interested in the former. 11365 */ 11366 if (wqp != NULL) { 11367 while ((wqp->q_next != NULL) && 11368 (wqp->q_next->q_qinfo != NULL) && 11369 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 11370 11371 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 11372 == TUN6TO4_MODID) { 11373 /* set for use in IP */ 11374 ill->ill_is_6to4tun = 1; 11375 break; 11376 } 11377 wqp = wqp->q_next; 11378 } 11379 } 11380 } 11381 11382 ipif_set_default(ipif); 11383 11384 /* 11385 * When publishing an interface address change event, we only notify 11386 * the event listeners of the new address. It is assumed that if they 11387 * actively care about the addresses assigned that they will have 11388 * already discovered the previous address assigned (if there was one.) 11389 * 11390 * Don't attach nic event message for SIOCLIFADDIF ioctl. 
11391 */ 11392 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 11393 hook_nic_event_t *info; 11394 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) { 11395 ip2dbg(("ip_sioctl_addr_tail: unexpected nic event %d " 11396 "attached for %s\n", info->hne_event, 11397 ill->ill_name)); 11398 if (info->hne_data != NULL) 11399 kmem_free(info->hne_data, info->hne_datalen); 11400 kmem_free(info, sizeof (hook_nic_event_t)); 11401 } 11402 11403 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 11404 if (info != NULL) { 11405 info->hne_nic = 11406 ipif->ipif_ill->ill_phyint->phyint_ifindex; 11407 info->hne_lif = MAP_IPIF_ID(ipif->ipif_id); 11408 info->hne_event = NE_ADDRESS_CHANGE; 11409 info->hne_family = ipif->ipif_isv6 ? ipv6 : ipv4; 11410 info->hne_data = kmem_alloc(sinlen, KM_NOSLEEP); 11411 if (info->hne_data != NULL) { 11412 info->hne_datalen = sinlen; 11413 bcopy(sin, info->hne_data, sinlen); 11414 } else { 11415 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11416 "address information for ADDRESS_CHANGE nic" 11417 " event of %s (ENOMEM)\n", 11418 ipif->ipif_ill->ill_name)); 11419 kmem_free(info, sizeof (hook_nic_event_t)); 11420 } 11421 } else 11422 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11423 "ADDRESS_CHANGE nic event information for %s " 11424 "(ENOMEM)\n", ipif->ipif_ill->ill_name)); 11425 11426 ipif->ipif_ill->ill_nic_event_info = info; 11427 } 11428 11429 mutex_exit(&ipif->ipif_ill->ill_lock); 11430 11431 if (need_up) { 11432 /* 11433 * Now bring the interface back up. If this 11434 * is the only IPIF for the ILL, ipif_up 11435 * will have to re-bind to the device, so 11436 * we may get back EINPROGRESS, in which 11437 * case, this IOCTL will get completed in 11438 * ip_rput_dlpi when we see the DL_BIND_ACK. 11439 */ 11440 err = ipif_up(ipif, q, mp); 11441 } else { 11442 /* 11443 * Update the IPIF list in SCTP, ipif_up_done() will do it 11444 * if need_up is true. 
11445 */ 11446 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11447 } 11448 11449 if (need_dl_down) 11450 ill_dl_down(ill); 11451 if (need_arp_down) 11452 ipif_arp_down(ipif); 11453 11454 return (err); 11455 } 11456 11457 11458 /* 11459 * Restart entry point to restart the address set operation after the 11460 * refcounts have dropped to zero. 11461 */ 11462 /* ARGSUSED */ 11463 int 11464 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11465 ip_ioctl_cmd_t *ipip, void *ifreq) 11466 { 11467 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11468 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11469 ASSERT(IAM_WRITER_IPIF(ipif)); 11470 ipif_down_tail(ipif); 11471 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11472 } 11473 11474 /* ARGSUSED */ 11475 int 11476 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11477 ip_ioctl_cmd_t *ipip, void *if_req) 11478 { 11479 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11480 struct lifreq *lifr = (struct lifreq *)if_req; 11481 11482 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11483 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11484 /* 11485 * The net mask and address can't change since we have a 11486 * reference to the ipif. So no lock is necessary. 11487 */ 11488 if (ipif->ipif_isv6) { 11489 *sin6 = sin6_null; 11490 sin6->sin6_family = AF_INET6; 11491 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11492 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11493 lifr->lifr_addrlen = 11494 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11495 } else { 11496 *sin = sin_null; 11497 sin->sin_family = AF_INET; 11498 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11499 if (ipip->ipi_cmd_type == LIF_CMD) { 11500 lifr->lifr_addrlen = 11501 ip_mask_to_plen(ipif->ipif_net_mask); 11502 } 11503 } 11504 return (0); 11505 } 11506 11507 /* 11508 * Set the destination address for a pt-pt interface. 
11509 */ 11510 /* ARGSUSED */ 11511 int 11512 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11513 ip_ioctl_cmd_t *ipip, void *if_req) 11514 { 11515 int err = 0; 11516 in6_addr_t v6addr; 11517 boolean_t need_up = B_FALSE; 11518 11519 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11520 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11521 ASSERT(IAM_WRITER_IPIF(ipif)); 11522 11523 if (ipif->ipif_isv6) { 11524 sin6_t *sin6; 11525 11526 if (sin->sin_family != AF_INET6) 11527 return (EAFNOSUPPORT); 11528 11529 sin6 = (sin6_t *)sin; 11530 v6addr = sin6->sin6_addr; 11531 11532 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11533 return (EADDRNOTAVAIL); 11534 } else { 11535 ipaddr_t addr; 11536 11537 if (sin->sin_family != AF_INET) 11538 return (EAFNOSUPPORT); 11539 11540 addr = sin->sin_addr.s_addr; 11541 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11542 return (EADDRNOTAVAIL); 11543 11544 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11545 } 11546 11547 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11548 return (0); /* No change */ 11549 11550 if (ipif->ipif_flags & IPIF_UP) { 11551 /* 11552 * If the interface is already marked up, 11553 * we call ipif_down which will take care 11554 * of ditching any IREs that have been set 11555 * up based on the old pp dst address. 11556 */ 11557 err = ipif_logical_down(ipif, q, mp); 11558 if (err == EINPROGRESS) 11559 return (err); 11560 ipif_down_tail(ipif); 11561 need_up = B_TRUE; 11562 } 11563 /* 11564 * could return EINPROGRESS. 
If so ioctl will complete in 11565 * ip_rput_dlpi_writer 11566 */ 11567 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 11568 return (err); 11569 } 11570 11571 static int 11572 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11573 boolean_t need_up) 11574 { 11575 in6_addr_t v6addr; 11576 ill_t *ill = ipif->ipif_ill; 11577 int err = 0; 11578 boolean_t need_dl_down; 11579 boolean_t need_arp_down; 11580 11581 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 11582 ipif->ipif_id, (void *)ipif)); 11583 11584 /* Must cancel any pending timer before taking the ill_lock */ 11585 if (ipif->ipif_recovery_id != 0) 11586 (void) untimeout(ipif->ipif_recovery_id); 11587 ipif->ipif_recovery_id = 0; 11588 11589 if (ipif->ipif_isv6) { 11590 sin6_t *sin6; 11591 11592 sin6 = (sin6_t *)sin; 11593 v6addr = sin6->sin6_addr; 11594 } else { 11595 ipaddr_t addr; 11596 11597 addr = sin->sin_addr.s_addr; 11598 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11599 } 11600 mutex_enter(&ill->ill_lock); 11601 /* Set point to point destination address. */ 11602 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11603 /* 11604 * Allow this as a means of creating logical 11605 * pt-pt interfaces on top of e.g. an Ethernet. 11606 * XXX Undocumented HACK for testing. 11607 * pt-pt interfaces are created with NUD disabled. 11608 */ 11609 ipif->ipif_flags |= IPIF_POINTOPOINT; 11610 ipif->ipif_flags &= ~IPIF_BROADCAST; 11611 if (ipif->ipif_isv6) 11612 ill->ill_flags |= ILLF_NONUD; 11613 } 11614 11615 /* 11616 * If the interface was previously marked as a duplicate, then since 11617 * we've now got a "new" address, it should no longer be considered a 11618 * duplicate -- even if the "new" address is the same as the old one. 11619 * Note that if all ipifs are down, we may have a pending ARP down 11620 * event to handle. 
11621 */ 11622 need_dl_down = need_arp_down = B_FALSE; 11623 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11624 need_arp_down = !need_up; 11625 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11626 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11627 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11628 need_dl_down = B_TRUE; 11629 } 11630 } 11631 11632 /* Set the new address. */ 11633 ipif->ipif_v6pp_dst_addr = v6addr; 11634 /* Make sure subnet tracks pp_dst */ 11635 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 11636 mutex_exit(&ill->ill_lock); 11637 11638 if (need_up) { 11639 /* 11640 * Now bring the interface back up. If this 11641 * is the only IPIF for the ILL, ipif_up 11642 * will have to re-bind to the device, so 11643 * we may get back EINPROGRESS, in which 11644 * case, this IOCTL will get completed in 11645 * ip_rput_dlpi when we see the DL_BIND_ACK. 11646 */ 11647 err = ipif_up(ipif, q, mp); 11648 } 11649 11650 if (need_dl_down) 11651 ill_dl_down(ill); 11652 11653 if (need_arp_down) 11654 ipif_arp_down(ipif); 11655 return (err); 11656 } 11657 11658 /* 11659 * Restart entry point to restart the dstaddress set operation after the 11660 * refcounts have dropped to zero. 11661 */ 11662 /* ARGSUSED */ 11663 int 11664 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11665 ip_ioctl_cmd_t *ipip, void *ifreq) 11666 { 11667 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 11668 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11669 ipif_down_tail(ipif); 11670 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 11671 } 11672 11673 /* ARGSUSED */ 11674 int 11675 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11676 ip_ioctl_cmd_t *ipip, void *if_req) 11677 { 11678 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11679 11680 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 11681 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11682 /* 11683 * Get point to point destination address. 
The addresses can't 11684 * change since we hold a reference to the ipif. 11685 */ 11686 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 11687 return (EADDRNOTAVAIL); 11688 11689 if (ipif->ipif_isv6) { 11690 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11691 *sin6 = sin6_null; 11692 sin6->sin6_family = AF_INET6; 11693 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 11694 } else { 11695 *sin = sin_null; 11696 sin->sin_family = AF_INET; 11697 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 11698 } 11699 return (0); 11700 } 11701 11702 /* 11703 * part of ipmp, make this func return the active/inactive state and 11704 * caller can set once atomically instead of multiple mutex_enter/mutex_exit 11705 */ 11706 /* 11707 * This function either sets or clears the IFF_INACTIVE flag. 11708 * 11709 * As long as there are some addresses or multicast memberships on the 11710 * IPv4 or IPv6 interface of the "phyi" that does not belong in here, we 11711 * will consider it to be ACTIVE (clear IFF_INACTIVE) i.e the interface 11712 * will be used for outbound packets. 11713 * 11714 * Caller needs to verify the validity of setting IFF_INACTIVE. 
11715 */ 11716 static void 11717 phyint_inactive(phyint_t *phyi) 11718 { 11719 ill_t *ill_v4; 11720 ill_t *ill_v6; 11721 ipif_t *ipif; 11722 ilm_t *ilm; 11723 11724 ill_v4 = phyi->phyint_illv4; 11725 ill_v6 = phyi->phyint_illv6; 11726 11727 /* 11728 * No need for a lock while traversing the list since iam 11729 * a writer 11730 */ 11731 if (ill_v4 != NULL) { 11732 ASSERT(IAM_WRITER_ILL(ill_v4)); 11733 for (ipif = ill_v4->ill_ipif; ipif != NULL; 11734 ipif = ipif->ipif_next) { 11735 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11736 mutex_enter(&phyi->phyint_lock); 11737 phyi->phyint_flags &= ~PHYI_INACTIVE; 11738 mutex_exit(&phyi->phyint_lock); 11739 return; 11740 } 11741 } 11742 for (ilm = ill_v4->ill_ilm; ilm != NULL; 11743 ilm = ilm->ilm_next) { 11744 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11745 mutex_enter(&phyi->phyint_lock); 11746 phyi->phyint_flags &= ~PHYI_INACTIVE; 11747 mutex_exit(&phyi->phyint_lock); 11748 return; 11749 } 11750 } 11751 } 11752 if (ill_v6 != NULL) { 11753 ill_v6 = phyi->phyint_illv6; 11754 for (ipif = ill_v6->ill_ipif; ipif != NULL; 11755 ipif = ipif->ipif_next) { 11756 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11757 mutex_enter(&phyi->phyint_lock); 11758 phyi->phyint_flags &= ~PHYI_INACTIVE; 11759 mutex_exit(&phyi->phyint_lock); 11760 return; 11761 } 11762 } 11763 for (ilm = ill_v6->ill_ilm; ilm != NULL; 11764 ilm = ilm->ilm_next) { 11765 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11766 mutex_enter(&phyi->phyint_lock); 11767 phyi->phyint_flags &= ~PHYI_INACTIVE; 11768 mutex_exit(&phyi->phyint_lock); 11769 return; 11770 } 11771 } 11772 } 11773 mutex_enter(&phyi->phyint_lock); 11774 phyi->phyint_flags |= PHYI_INACTIVE; 11775 mutex_exit(&phyi->phyint_lock); 11776 } 11777 11778 /* 11779 * This function is called only when the phyint flags change. Currently 11780 * called from ip_sioctl_flags. We re-do the broadcast nomination so 11781 * that we can select a good ill. 
11782 */ 11783 static void 11784 ip_redo_nomination(phyint_t *phyi) 11785 { 11786 ill_t *ill_v4; 11787 11788 ill_v4 = phyi->phyint_illv4; 11789 11790 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 11791 ASSERT(IAM_WRITER_ILL(ill_v4)); 11792 if (ill_v4->ill_group->illgrp_ill_count > 1) 11793 ill_nominate_bcast_rcv(ill_v4->ill_group); 11794 } 11795 } 11796 11797 /* 11798 * Heuristic to check if ill is INACTIVE. 11799 * Checks if ill has an ipif with an usable ip address. 11800 * 11801 * Return values: 11802 * B_TRUE - ill is INACTIVE; has no usable ipif 11803 * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif 11804 */ 11805 static boolean_t 11806 ill_is_inactive(ill_t *ill) 11807 { 11808 ipif_t *ipif; 11809 11810 /* Check whether it is in an IPMP group */ 11811 if (ill->ill_phyint->phyint_groupname == NULL) 11812 return (B_FALSE); 11813 11814 if (ill->ill_ipif_up_count == 0) 11815 return (B_TRUE); 11816 11817 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11818 uint64_t flags = ipif->ipif_flags; 11819 11820 /* 11821 * This ipif is usable if it is IPIF_UP and not a 11822 * dedicated test address. A dedicated test address 11823 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED 11824 * (note in particular that V6 test addresses are 11825 * link-local data addresses and thus are marked 11826 * IPIF_NOFAILOVER but not IPIF_DEPRECATED). 11827 */ 11828 if ((flags & IPIF_UP) && 11829 ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != 11830 (IPIF_DEPRECATED|IPIF_NOFAILOVER))) 11831 return (B_FALSE); 11832 } 11833 return (B_TRUE); 11834 } 11835 11836 /* 11837 * Set interface flags. 11838 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, 11839 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, 11840 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. 11841 * 11842 * NOTE : We really don't enforce that ipif_id zero should be used 11843 * for setting any flags other than IFF_LOGINT_FLAGS. 
This 11844 * is because applications generally does SICGLIFFLAGS and 11845 * ORs in the new flags (that affects the logical) and does a 11846 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 11847 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 11848 * flags that will be turned on is correct with respect to 11849 * ipif_id 0. For backward compatibility reasons, it is not done. 11850 */ 11851 /* ARGSUSED */ 11852 int 11853 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11854 ip_ioctl_cmd_t *ipip, void *if_req) 11855 { 11856 uint64_t turn_on; 11857 uint64_t turn_off; 11858 int err; 11859 boolean_t need_up = B_FALSE; 11860 phyint_t *phyi; 11861 ill_t *ill; 11862 uint64_t intf_flags; 11863 boolean_t phyint_flags_modified = B_FALSE; 11864 uint64_t flags; 11865 struct ifreq *ifr; 11866 struct lifreq *lifr; 11867 boolean_t set_linklocal = B_FALSE; 11868 boolean_t zero_source = B_FALSE; 11869 11870 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 11871 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11872 11873 ASSERT(IAM_WRITER_IPIF(ipif)); 11874 11875 ill = ipif->ipif_ill; 11876 phyi = ill->ill_phyint; 11877 11878 if (ipip->ipi_cmd_type == IF_CMD) { 11879 ifr = (struct ifreq *)if_req; 11880 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 11881 } else { 11882 lifr = (struct lifreq *)if_req; 11883 flags = lifr->lifr_flags; 11884 } 11885 11886 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11887 11888 /* 11889 * Has the flags been set correctly till now ? 11890 */ 11891 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11892 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11893 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11894 /* 11895 * Compare the new flags to the old, and partition 11896 * into those coming on and those going off. 11897 * For the 16 bit command keep the bits above bit 16 unchanged. 
11898 */ 11899 if (ipip->ipi_cmd == SIOCSIFFLAGS) 11900 flags |= intf_flags & ~0xFFFF; 11901 11902 /* 11903 * First check which bits will change and then which will 11904 * go on and off 11905 */ 11906 turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; 11907 if (!turn_on) 11908 return (0); /* No change */ 11909 11910 turn_off = intf_flags & turn_on; 11911 turn_on ^= turn_off; 11912 err = 0; 11913 11914 /* 11915 * Don't allow any bits belonging to the logical interface 11916 * to be set or cleared on the replacement ipif that was 11917 * created temporarily during a MOVE. 11918 */ 11919 if (ipif->ipif_replace_zero && 11920 ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { 11921 return (EINVAL); 11922 } 11923 11924 /* 11925 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 11926 * IPv6 interfaces. 11927 */ 11928 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 11929 return (EINVAL); 11930 11931 /* 11932 * Don't allow the IFF_ROUTER flag to be turned on on loopback 11933 * interfaces. It makes no sense in that context. 11934 */ 11935 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11936 return (EINVAL); 11937 11938 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11939 zero_source = B_TRUE; 11940 11941 /* 11942 * For IPv6 ipif_id 0, don't allow the interface to be up without 11943 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11944 * If the link local address isn't set, and can be set, it will get 11945 * set later on in this function. 11946 */ 11947 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11948 (flags & IFF_UP) && !zero_source && 11949 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11950 if (ipif_cant_setlinklocal(ipif)) 11951 return (EINVAL); 11952 set_linklocal = B_TRUE; 11953 } 11954 11955 /* 11956 * ILL cannot be part of a usesrc group and and IPMP group at the 11957 * same time. 
No need to grab ill_g_usesrc_lock here, see 11958 * synchronization notes in ip.c 11959 */ 11960 if (turn_on & PHYI_STANDBY && 11961 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 11962 return (EINVAL); 11963 } 11964 11965 /* 11966 * If we modify physical interface flags, we'll potentially need to 11967 * send up two routing socket messages for the changes (one for the 11968 * IPv4 ill, and another for the IPv6 ill). Note that here. 11969 */ 11970 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11971 phyint_flags_modified = B_TRUE; 11972 11973 /* 11974 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 11975 * we need to flush the IRE_CACHES belonging to this ill. 11976 * We handle this case here without doing the DOWN/UP dance 11977 * like it is done for other flags. If some other flags are 11978 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 11979 * below will handle it by bringing it down and then 11980 * bringing it UP. 11981 */ 11982 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 11983 ill_t *ill_v4, *ill_v6; 11984 11985 ill_v4 = phyi->phyint_illv4; 11986 ill_v6 = phyi->phyint_illv6; 11987 11988 /* 11989 * First set the INACTIVE flag if needed. Then delete the ires. 11990 * ire_add will atomically prevent creating new IRE_CACHEs 11991 * unless hidden flag is set. 11992 * PHYI_FAILED and PHYI_INACTIVE are exclusive 11993 */ 11994 if ((turn_on & PHYI_FAILED) && 11995 ((intf_flags & PHYI_STANDBY) || !ipmp_enable_failback)) { 11996 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 11997 phyi->phyint_flags &= ~PHYI_INACTIVE; 11998 } 11999 if ((turn_off & PHYI_FAILED) && 12000 ((intf_flags & PHYI_STANDBY) || 12001 (!ipmp_enable_failback && ill_is_inactive(ill)))) { 12002 phyint_inactive(phyi); 12003 } 12004 12005 if (turn_on & PHYI_STANDBY) { 12006 /* 12007 * We implicitly set INACTIVE only when STANDBY is set. 
12008 * INACTIVE is also set on non-STANDBY phyint when user 12009 * disables FAILBACK using configuration file. 12010 * Do not allow STANDBY to be set on such INACTIVE 12011 * phyint 12012 */ 12013 if (phyi->phyint_flags & PHYI_INACTIVE) 12014 return (EINVAL); 12015 if (!(phyi->phyint_flags & PHYI_FAILED)) 12016 phyint_inactive(phyi); 12017 } 12018 if (turn_off & PHYI_STANDBY) { 12019 if (ipmp_enable_failback) { 12020 /* 12021 * Reset PHYI_INACTIVE. 12022 */ 12023 phyi->phyint_flags &= ~PHYI_INACTIVE; 12024 } else if (ill_is_inactive(ill) && 12025 !(phyi->phyint_flags & PHYI_FAILED)) { 12026 /* 12027 * Need to set INACTIVE, when user sets 12028 * STANDBY on a non-STANDBY phyint and 12029 * later resets STANDBY 12030 */ 12031 phyint_inactive(phyi); 12032 } 12033 } 12034 /* 12035 * We should always send up a message so that the 12036 * daemons come to know of it. Note that the zeroth 12037 * interface can be down and the check below for IPIF_UP 12038 * will not make sense as we are actually setting 12039 * a phyint flag here. We assume that the ipif used 12040 * is always the zeroth ipif. (ip_rts_ifmsg does not 12041 * send up any message for non-zero ipifs). 12042 */ 12043 phyint_flags_modified = B_TRUE; 12044 12045 if (ill_v4 != NULL) { 12046 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 12047 IRE_CACHE, ill_stq_cache_delete, 12048 (char *)ill_v4, ill_v4); 12049 illgrp_reset_schednext(ill_v4); 12050 } 12051 if (ill_v6 != NULL) { 12052 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 12053 IRE_CACHE, ill_stq_cache_delete, 12054 (char *)ill_v6, ill_v6); 12055 illgrp_reset_schednext(ill_v6); 12056 } 12057 } 12058 12059 /* 12060 * If ILLF_ROUTER changes, we need to change the ip forwarding 12061 * status of the interface and, if the interface is part of an IPMP 12062 * group, all other interfaces that are part of the same IPMP 12063 * group. 
12064 */ 12065 if ((turn_on | turn_off) & ILLF_ROUTER) { 12066 (void) ill_forward_set(q, mp, ((turn_on & ILLF_ROUTER) != 0), 12067 (caddr_t)ill); 12068 } 12069 12070 /* 12071 * If the interface is not UP and we are not going to 12072 * bring it UP, record the flags and return. When the 12073 * interface comes UP later, the right actions will be 12074 * taken. 12075 */ 12076 if (!(ipif->ipif_flags & IPIF_UP) && 12077 !(turn_on & IPIF_UP)) { 12078 /* Record new flags in their respective places. */ 12079 mutex_enter(&ill->ill_lock); 12080 mutex_enter(&ill->ill_phyint->phyint_lock); 12081 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 12082 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 12083 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 12084 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 12085 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 12086 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 12087 mutex_exit(&ill->ill_lock); 12088 mutex_exit(&ill->ill_phyint->phyint_lock); 12089 12090 /* 12091 * We do the broadcast and nomination here rather 12092 * than waiting for a FAILOVER/FAILBACK to happen. In 12093 * the case of FAILBACK from INACTIVE standby to the 12094 * interface that has been repaired, PHYI_FAILED has not 12095 * been cleared yet. If there are only two interfaces in 12096 * that group, all we have is a FAILED and INACTIVE 12097 * interface. If we do the nomination soon after a failback, 12098 * the broadcast nomination code would select the 12099 * INACTIVE interface for receiving broadcasts as FAILED is 12100 * not yet cleared. As we don't want STANDBY/INACTIVE to 12101 * receive broadcast packets, we need to redo nomination 12102 * when the FAILED is cleared here. Thus, in general we 12103 * always do the nomination here for FAILED, STANDBY 12104 * and OFFLINE. 
12105 */ 12106 if (((turn_on | turn_off) & 12107 (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { 12108 ip_redo_nomination(phyi); 12109 } 12110 if (phyint_flags_modified) { 12111 if (phyi->phyint_illv4 != NULL) { 12112 ip_rts_ifmsg(phyi->phyint_illv4-> 12113 ill_ipif); 12114 } 12115 if (phyi->phyint_illv6 != NULL) { 12116 ip_rts_ifmsg(phyi->phyint_illv6-> 12117 ill_ipif); 12118 } 12119 } 12120 return (0); 12121 } else if (set_linklocal || zero_source) { 12122 mutex_enter(&ill->ill_lock); 12123 if (set_linklocal) 12124 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 12125 if (zero_source) 12126 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 12127 mutex_exit(&ill->ill_lock); 12128 } 12129 12130 /* 12131 * Disallow IPv6 interfaces coming up that have the unspecified address, 12132 * or point-to-point interfaces with an unspecified destination. We do 12133 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 12134 * have a subnet assigned, which is how in.ndpd currently manages its 12135 * onlink prefix list when no addresses are configured with those 12136 * prefixes. 12137 */ 12138 if (ipif->ipif_isv6 && 12139 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 12140 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 12141 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 12142 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 12143 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 12144 return (EINVAL); 12145 } 12146 12147 /* 12148 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 12149 * from being brought up. 12150 */ 12151 if (!ipif->ipif_isv6 && 12152 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 12153 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 12154 return (EINVAL); 12155 } 12156 12157 /* 12158 * The only flag changes that we currently take specific action on 12159 * is IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, 12160 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and 12161 * IPIF_PREFERRED. 
This is done by bring the ipif down, changing 12162 * the flags and bringing it back up again. 12163 */ 12164 if ((turn_on|turn_off) & 12165 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 12166 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { 12167 /* 12168 * Taking this ipif down, make sure we have 12169 * valid net and subnet bcast ire's for other 12170 * logical interfaces, if we need them. 12171 */ 12172 if (!ipif->ipif_isv6) 12173 ipif_check_bcast_ires(ipif); 12174 12175 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 12176 !(turn_off & IPIF_UP)) { 12177 need_up = B_TRUE; 12178 if (ipif->ipif_flags & IPIF_UP) 12179 ill->ill_logical_down = 1; 12180 turn_on &= ~IPIF_UP; 12181 } 12182 err = ipif_down(ipif, q, mp); 12183 ip1dbg(("ipif_down returns %d err ", err)); 12184 if (err == EINPROGRESS) 12185 return (err); 12186 ipif_down_tail(ipif); 12187 } 12188 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 12189 } 12190 12191 static int 12192 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 12193 boolean_t need_up) 12194 { 12195 ill_t *ill; 12196 phyint_t *phyi; 12197 uint64_t turn_on; 12198 uint64_t turn_off; 12199 uint64_t intf_flags; 12200 boolean_t phyint_flags_modified = B_FALSE; 12201 int err = 0; 12202 boolean_t set_linklocal = B_FALSE; 12203 boolean_t zero_source = B_FALSE; 12204 12205 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 12206 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12207 12208 ASSERT(IAM_WRITER_IPIF(ipif)); 12209 12210 ill = ipif->ipif_ill; 12211 phyi = ill->ill_phyint; 12212 12213 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 12214 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 12215 12216 turn_off = intf_flags & turn_on; 12217 turn_on ^= turn_off; 12218 12219 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 12220 phyint_flags_modified = B_TRUE; 12221 12222 /* 12223 * Now we change the flags. 
Track current value of 12224 * other flags in their respective places. 12225 */ 12226 mutex_enter(&ill->ill_lock); 12227 mutex_enter(&phyi->phyint_lock); 12228 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 12229 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 12230 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 12231 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 12232 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 12233 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 12234 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 12235 set_linklocal = B_TRUE; 12236 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 12237 } 12238 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 12239 zero_source = B_TRUE; 12240 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 12241 } 12242 mutex_exit(&ill->ill_lock); 12243 mutex_exit(&phyi->phyint_lock); 12244 12245 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 12246 ip_redo_nomination(phyi); 12247 12248 if (set_linklocal) 12249 (void) ipif_setlinklocal(ipif); 12250 12251 if (zero_source) 12252 ipif->ipif_v6src_addr = ipv6_all_zeros; 12253 else 12254 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 12255 12256 if (need_up) { 12257 /* 12258 * XXX ipif_up really does not know whether a phyint flags 12259 * was modified or not. So, it sends up information on 12260 * only one routing sockets message. As we don't bring up 12261 * the interface and also set STANDBY/FAILED simultaneously 12262 * it should be okay. 12263 */ 12264 err = ipif_up(ipif, q, mp); 12265 } else { 12266 /* 12267 * Make sure routing socket sees all changes to the flags. 12268 * ipif_up_done* handles this when we use ipif_up. 
12269 */ 12270 if (phyint_flags_modified) { 12271 if (phyi->phyint_illv4 != NULL) { 12272 ip_rts_ifmsg(phyi->phyint_illv4-> 12273 ill_ipif); 12274 } 12275 if (phyi->phyint_illv6 != NULL) { 12276 ip_rts_ifmsg(phyi->phyint_illv6-> 12277 ill_ipif); 12278 } 12279 } else { 12280 ip_rts_ifmsg(ipif); 12281 } 12282 } 12283 return (err); 12284 } 12285 12286 /* 12287 * Restart entry point to restart the flags restart operation after the 12288 * refcounts have dropped to zero. 12289 */ 12290 /* ARGSUSED */ 12291 int 12292 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12293 ip_ioctl_cmd_t *ipip, void *if_req) 12294 { 12295 int err; 12296 struct ifreq *ifr = (struct ifreq *)if_req; 12297 struct lifreq *lifr = (struct lifreq *)if_req; 12298 12299 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 12300 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12301 12302 ipif_down_tail(ipif); 12303 if (ipip->ipi_cmd_type == IF_CMD) { 12304 /* 12305 * Since ip_sioctl_flags expects an int and ifr_flags 12306 * is a short we need to cast ifr_flags into an int 12307 * to avoid having sign extension cause bits to get 12308 * set that should not be. 12309 */ 12310 err = ip_sioctl_flags_tail(ipif, 12311 (uint64_t)(ifr->ifr_flags & 0x0000ffff), 12312 q, mp, B_TRUE); 12313 } else { 12314 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags, 12315 q, mp, B_TRUE); 12316 } 12317 return (err); 12318 } 12319 12320 /* ARGSUSED */ 12321 int 12322 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12323 ip_ioctl_cmd_t *ipip, void *if_req) 12324 { 12325 /* 12326 * Has the flags been set correctly till now ? 
12327 */ 12328 ill_t *ill = ipif->ipif_ill; 12329 phyint_t *phyi = ill->ill_phyint; 12330 12331 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 12332 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12333 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 12334 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 12335 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 12336 12337 /* 12338 * Need a lock since some flags can be set even when there are 12339 * references to the ipif. 12340 */ 12341 mutex_enter(&ill->ill_lock); 12342 if (ipip->ipi_cmd_type == IF_CMD) { 12343 struct ifreq *ifr = (struct ifreq *)if_req; 12344 12345 /* Get interface flags (low 16 only). */ 12346 ifr->ifr_flags = ((ipif->ipif_flags | 12347 ill->ill_flags | phyi->phyint_flags) & 0xffff); 12348 } else { 12349 struct lifreq *lifr = (struct lifreq *)if_req; 12350 12351 /* Get interface flags. */ 12352 lifr->lifr_flags = ipif->ipif_flags | 12353 ill->ill_flags | phyi->phyint_flags; 12354 } 12355 mutex_exit(&ill->ill_lock); 12356 return (0); 12357 } 12358 12359 /* ARGSUSED */ 12360 int 12361 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12362 ip_ioctl_cmd_t *ipip, void *if_req) 12363 { 12364 int mtu; 12365 int ip_min_mtu; 12366 struct ifreq *ifr; 12367 struct lifreq *lifr; 12368 ire_t *ire; 12369 12370 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 12371 ipif->ipif_id, (void *)ipif)); 12372 if (ipip->ipi_cmd_type == IF_CMD) { 12373 ifr = (struct ifreq *)if_req; 12374 mtu = ifr->ifr_metric; 12375 } else { 12376 lifr = (struct lifreq *)if_req; 12377 mtu = lifr->lifr_mtu; 12378 } 12379 12380 if (ipif->ipif_isv6) 12381 ip_min_mtu = IPV6_MIN_MTU; 12382 else 12383 ip_min_mtu = IP_MIN_MTU; 12384 12385 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 12386 return (EINVAL); 12387 12388 /* 12389 * Change the MTU size in all relevant ire's. 12390 * Mtu change Vs. new ire creation - protocol below. 
12391 * First change ipif_mtu and the ire_max_frag of the 12392 * interface ire. Then do an ire walk and change the 12393 * ire_max_frag of all affected ires. During ire_add 12394 * under the bucket lock, set the ire_max_frag of the 12395 * new ire being created from the ipif/ire from which 12396 * it is being derived. If an mtu change happens after 12397 * the ire is added, the new ire will be cleaned up. 12398 * Conversely if the mtu change happens before the ire 12399 * is added, ire_add will see the new value of the mtu. 12400 */ 12401 ipif->ipif_mtu = mtu; 12402 ipif->ipif_flags |= IPIF_FIXEDMTU; 12403 12404 if (ipif->ipif_isv6) 12405 ire = ipif_to_ire_v6(ipif); 12406 else 12407 ire = ipif_to_ire(ipif); 12408 if (ire != NULL) { 12409 ire->ire_max_frag = ipif->ipif_mtu; 12410 ire_refrele(ire); 12411 } 12412 if (ipif->ipif_flags & IPIF_UP) { 12413 if (ipif->ipif_isv6) 12414 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES); 12415 else 12416 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES); 12417 } 12418 /* Update the MTU in SCTP's list */ 12419 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12420 return (0); 12421 } 12422 12423 /* Get interface MTU. */ 12424 /* ARGSUSED */ 12425 int 12426 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12427 ip_ioctl_cmd_t *ipip, void *if_req) 12428 { 12429 struct ifreq *ifr; 12430 struct lifreq *lifr; 12431 12432 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 12433 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12434 if (ipip->ipi_cmd_type == IF_CMD) { 12435 ifr = (struct ifreq *)if_req; 12436 ifr->ifr_metric = ipif->ipif_mtu; 12437 } else { 12438 lifr = (struct lifreq *)if_req; 12439 lifr->lifr_mtu = ipif->ipif_mtu; 12440 } 12441 return (0); 12442 } 12443 12444 /* Set interface broadcast address. 
*/
/*
 * The new address is taken from "sin".  Returns EADDRNOTAVAIL if the ipif
 * is not a broadcast interface, EAFNOSUPPORT if "sin" is not AF_INET, and
 * EINVAL if the ipif is up and no broadcast ire exists for the address.
 */
/* ARGSUSED2 */
int
ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ipaddr_t	new_brd;
	ire_t		*brd_ire;

	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));
	if ((ipif->ipif_flags & IPIF_BROADCAST) == 0)
		return (EADDRNOTAVAIL);

	/* There is no such thing as an IPv6 broadcast address. */
	ASSERT(!(ipif->ipif_isv6));

	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);

	new_brd = sin->sin_addr.s_addr;
	if ((ipif->ipif_flags & IPIF_UP) != 0) {
		/*
		 * The ipif is already up, so the new broadcast address
		 * only makes sense if an IRE for it exists already.
		 * Match on the ill rather than the ipif since these ires
		 * are shared now.  MATCH_IRE_ILL_GROUP is not used: we
		 * want the broadcast ire on this very ill, and each ill
		 * in the group has its own.
		 */
		brd_ire = ire_ctable_lookup(new_brd, 0, IRE_BROADCAST,
		    ipif, ALL_ZONES, NULL,
		    (MATCH_IRE_ILL | MATCH_IRE_TYPE));
		if (brd_ire == NULL)
			return (EINVAL);
		ire_refrele(brd_ire);
	}

	/*
	 * If the broadcast address of this ipif really changes, make
	 * sure the other logical interfaces still have the net and
	 * subnet bcast ire's they need.
	 */
	if (new_brd != ipif->ipif_brd_addr)
		ipif_check_bcast_ires(ipif);
	IN6_IPADDR_TO_V4MAPPED(new_brd, &ipif->ipif_v6brd_addr);
	return (0);
}

/* Get interface broadcast address.
*/ 12498 /* ARGSUSED */ 12499 int 12500 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12501 ip_ioctl_cmd_t *ipip, void *if_req) 12502 { 12503 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 12504 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12505 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12506 return (EADDRNOTAVAIL); 12507 12508 /* IPIF_BROADCAST not possible with IPv6 */ 12509 ASSERT(!ipif->ipif_isv6); 12510 *sin = sin_null; 12511 sin->sin_family = AF_INET; 12512 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 12513 return (0); 12514 } 12515 12516 /* 12517 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 12518 */ 12519 /* ARGSUSED */ 12520 int 12521 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12522 ip_ioctl_cmd_t *ipip, void *if_req) 12523 { 12524 int err = 0; 12525 in6_addr_t v6mask; 12526 12527 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 12528 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12529 12530 ASSERT(IAM_WRITER_IPIF(ipif)); 12531 12532 if (ipif->ipif_isv6) { 12533 sin6_t *sin6; 12534 12535 if (sin->sin_family != AF_INET6) 12536 return (EAFNOSUPPORT); 12537 12538 sin6 = (sin6_t *)sin; 12539 v6mask = sin6->sin6_addr; 12540 } else { 12541 ipaddr_t mask; 12542 12543 if (sin->sin_family != AF_INET) 12544 return (EAFNOSUPPORT); 12545 12546 mask = sin->sin_addr.s_addr; 12547 V4MASK_TO_V6(mask, v6mask); 12548 } 12549 12550 /* 12551 * No big deal if the interface isn't already up, or the mask 12552 * isn't really changing, or this is pt-pt. 
12553 */ 12554 if (!(ipif->ipif_flags & IPIF_UP) || 12555 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12556 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12557 ipif->ipif_v6net_mask = v6mask; 12558 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12559 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12560 ipif->ipif_v6net_mask, 12561 ipif->ipif_v6subnet); 12562 } 12563 return (0); 12564 } 12565 /* 12566 * Make sure we have valid net and subnet broadcast ire's 12567 * for the old netmask, if needed by other logical interfaces. 12568 */ 12569 if (!ipif->ipif_isv6) 12570 ipif_check_bcast_ires(ipif); 12571 12572 err = ipif_logical_down(ipif, q, mp); 12573 if (err == EINPROGRESS) 12574 return (err); 12575 ipif_down_tail(ipif); 12576 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12577 return (err); 12578 } 12579 12580 static int 12581 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12582 { 12583 in6_addr_t v6mask; 12584 int err = 0; 12585 12586 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12587 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12588 12589 if (ipif->ipif_isv6) { 12590 sin6_t *sin6; 12591 12592 sin6 = (sin6_t *)sin; 12593 v6mask = sin6->sin6_addr; 12594 } else { 12595 ipaddr_t mask; 12596 12597 mask = sin->sin_addr.s_addr; 12598 V4MASK_TO_V6(mask, v6mask); 12599 } 12600 12601 ipif->ipif_v6net_mask = v6mask; 12602 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12603 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12604 ipif->ipif_v6subnet); 12605 } 12606 err = ipif_up(ipif, q, mp); 12607 12608 if (err == 0 || err == EINPROGRESS) { 12609 /* 12610 * The interface must be DL_BOUND if this packet has to 12611 * go out on the wire. Since we only go through a logical 12612 * down and are bound with the driver during an internal 12613 * down/up that is satisfied. 12614 */ 12615 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12616 /* Potentially broadcast an address mask reply. 
*/ 12617 ipif_mask_reply(ipif); 12618 } 12619 } 12620 return (err); 12621 } 12622 12623 /* ARGSUSED */ 12624 int 12625 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12626 ip_ioctl_cmd_t *ipip, void *if_req) 12627 { 12628 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12629 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12630 ipif_down_tail(ipif); 12631 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12632 } 12633 12634 /* Get interface net mask. */ 12635 /* ARGSUSED */ 12636 int 12637 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12638 ip_ioctl_cmd_t *ipip, void *if_req) 12639 { 12640 struct lifreq *lifr = (struct lifreq *)if_req; 12641 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12642 12643 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12644 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12645 12646 /* 12647 * net mask can't change since we have a reference to the ipif. 12648 */ 12649 if (ipif->ipif_isv6) { 12650 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12651 *sin6 = sin6_null; 12652 sin6->sin6_family = AF_INET6; 12653 sin6->sin6_addr = ipif->ipif_v6net_mask; 12654 lifr->lifr_addrlen = 12655 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12656 } else { 12657 *sin = sin_null; 12658 sin->sin_family = AF_INET; 12659 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12660 if (ipip->ipi_cmd_type == LIF_CMD) { 12661 lifr->lifr_addrlen = 12662 ip_mask_to_plen(ipif->ipif_net_mask); 12663 } 12664 } 12665 return (0); 12666 } 12667 12668 /* ARGSUSED */ 12669 int 12670 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12671 ip_ioctl_cmd_t *ipip, void *if_req) 12672 { 12673 12674 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12675 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12676 /* 12677 * Set interface metric. We don't use this for 12678 * anything but we keep track of it in case it is 12679 * important to routing applications or such. 
12680 */ 12681 if (ipip->ipi_cmd_type == IF_CMD) { 12682 struct ifreq *ifr; 12683 12684 ifr = (struct ifreq *)if_req; 12685 ipif->ipif_metric = ifr->ifr_metric; 12686 } else { 12687 struct lifreq *lifr; 12688 12689 lifr = (struct lifreq *)if_req; 12690 ipif->ipif_metric = lifr->lifr_metric; 12691 } 12692 return (0); 12693 } 12694 12695 12696 /* ARGSUSED */ 12697 int 12698 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12699 ip_ioctl_cmd_t *ipip, void *if_req) 12700 { 12701 12702 /* Get interface metric. */ 12703 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12704 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12705 if (ipip->ipi_cmd_type == IF_CMD) { 12706 struct ifreq *ifr; 12707 12708 ifr = (struct ifreq *)if_req; 12709 ifr->ifr_metric = ipif->ipif_metric; 12710 } else { 12711 struct lifreq *lifr; 12712 12713 lifr = (struct lifreq *)if_req; 12714 lifr->lifr_metric = ipif->ipif_metric; 12715 } 12716 12717 return (0); 12718 } 12719 12720 /* ARGSUSED */ 12721 int 12722 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12723 ip_ioctl_cmd_t *ipip, void *if_req) 12724 { 12725 12726 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12727 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12728 /* 12729 * Set the muxid returned from I_PLINK. 
12730 */ 12731 if (ipip->ipi_cmd_type == IF_CMD) { 12732 struct ifreq *ifr = (struct ifreq *)if_req; 12733 12734 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12735 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12736 } else { 12737 struct lifreq *lifr = (struct lifreq *)if_req; 12738 12739 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12740 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12741 } 12742 return (0); 12743 } 12744 12745 /* ARGSUSED */ 12746 int 12747 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12748 ip_ioctl_cmd_t *ipip, void *if_req) 12749 { 12750 12751 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12752 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12753 /* 12754 * Get the muxid saved in ill for I_PUNLINK. 12755 */ 12756 if (ipip->ipi_cmd_type == IF_CMD) { 12757 struct ifreq *ifr = (struct ifreq *)if_req; 12758 12759 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12760 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12761 } else { 12762 struct lifreq *lifr = (struct lifreq *)if_req; 12763 12764 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12765 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12766 } 12767 return (0); 12768 } 12769 12770 /* 12771 * Set the subnet prefix. Does not modify the broadcast address. 
12772 */ 12773 /* ARGSUSED */ 12774 int 12775 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12776 ip_ioctl_cmd_t *ipip, void *if_req) 12777 { 12778 int err = 0; 12779 in6_addr_t v6addr; 12780 in6_addr_t v6mask; 12781 boolean_t need_up = B_FALSE; 12782 int addrlen; 12783 12784 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12785 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12786 12787 ASSERT(IAM_WRITER_IPIF(ipif)); 12788 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12789 12790 if (ipif->ipif_isv6) { 12791 sin6_t *sin6; 12792 12793 if (sin->sin_family != AF_INET6) 12794 return (EAFNOSUPPORT); 12795 12796 sin6 = (sin6_t *)sin; 12797 v6addr = sin6->sin6_addr; 12798 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12799 return (EADDRNOTAVAIL); 12800 } else { 12801 ipaddr_t addr; 12802 12803 if (sin->sin_family != AF_INET) 12804 return (EAFNOSUPPORT); 12805 12806 addr = sin->sin_addr.s_addr; 12807 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12808 return (EADDRNOTAVAIL); 12809 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12810 /* Add 96 bits */ 12811 addrlen += IPV6_ABITS - IP_ABITS; 12812 } 12813 12814 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12815 return (EINVAL); 12816 12817 /* Check if bits in the address is set past the mask */ 12818 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12819 return (EINVAL); 12820 12821 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12822 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12823 return (0); /* No change */ 12824 12825 if (ipif->ipif_flags & IPIF_UP) { 12826 /* 12827 * If the interface is already marked up, 12828 * we call ipif_down which will take care 12829 * of ditching any IREs that have been set 12830 * up based on the old interface address. 
12831 */ 12832 err = ipif_logical_down(ipif, q, mp); 12833 if (err == EINPROGRESS) 12834 return (err); 12835 ipif_down_tail(ipif); 12836 need_up = B_TRUE; 12837 } 12838 12839 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12840 return (err); 12841 } 12842 12843 static int 12844 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12845 queue_t *q, mblk_t *mp, boolean_t need_up) 12846 { 12847 ill_t *ill = ipif->ipif_ill; 12848 int err = 0; 12849 12850 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12851 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12852 12853 /* Set the new address. */ 12854 mutex_enter(&ill->ill_lock); 12855 ipif->ipif_v6net_mask = v6mask; 12856 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12857 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12858 ipif->ipif_v6subnet); 12859 } 12860 mutex_exit(&ill->ill_lock); 12861 12862 if (need_up) { 12863 /* 12864 * Now bring the interface back up. If this 12865 * is the only IPIF for the ILL, ipif_up 12866 * will have to re-bind to the device, so 12867 * we may get back EINPROGRESS, in which 12868 * case, this IOCTL will get completed in 12869 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12870 */ 12871 err = ipif_up(ipif, q, mp); 12872 if (err == EINPROGRESS) 12873 return (err); 12874 } 12875 return (err); 12876 } 12877 12878 /* ARGSUSED */ 12879 int 12880 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12881 ip_ioctl_cmd_t *ipip, void *if_req) 12882 { 12883 int addrlen; 12884 in6_addr_t v6addr; 12885 in6_addr_t v6mask; 12886 struct lifreq *lifr = (struct lifreq *)if_req; 12887 12888 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12889 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12890 ipif_down_tail(ipif); 12891 12892 addrlen = lifr->lifr_addrlen; 12893 if (ipif->ipif_isv6) { 12894 sin6_t *sin6; 12895 12896 sin6 = (sin6_t *)sin; 12897 v6addr = sin6->sin6_addr; 12898 } else { 12899 ipaddr_t addr; 12900 12901 addr = sin->sin_addr.s_addr; 12902 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12903 addrlen += IPV6_ABITS - IP_ABITS; 12904 } 12905 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12906 12907 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12908 } 12909 12910 /* ARGSUSED */ 12911 int 12912 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12913 ip_ioctl_cmd_t *ipip, void *if_req) 12914 { 12915 struct lifreq *lifr = (struct lifreq *)if_req; 12916 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12917 12918 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12919 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12920 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12921 12922 if (ipif->ipif_isv6) { 12923 *sin6 = sin6_null; 12924 sin6->sin6_family = AF_INET6; 12925 sin6->sin6_addr = ipif->ipif_v6subnet; 12926 lifr->lifr_addrlen = 12927 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12928 } else { 12929 *sin = sin_null; 12930 sin->sin_family = AF_INET; 12931 sin->sin_addr.s_addr = ipif->ipif_subnet; 12932 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12933 } 12934 return (0); 12935 } 12936 12937 /* 12938 * Set the IPv6 address token. 
12939 */ 12940 /* ARGSUSED */ 12941 int 12942 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12943 ip_ioctl_cmd_t *ipi, void *if_req) 12944 { 12945 ill_t *ill = ipif->ipif_ill; 12946 int err; 12947 in6_addr_t v6addr; 12948 in6_addr_t v6mask; 12949 boolean_t need_up = B_FALSE; 12950 int i; 12951 sin6_t *sin6 = (sin6_t *)sin; 12952 struct lifreq *lifr = (struct lifreq *)if_req; 12953 int addrlen; 12954 12955 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12956 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12957 ASSERT(IAM_WRITER_IPIF(ipif)); 12958 12959 addrlen = lifr->lifr_addrlen; 12960 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12961 if (ipif->ipif_id != 0) 12962 return (EINVAL); 12963 12964 if (!ipif->ipif_isv6) 12965 return (EINVAL); 12966 12967 if (addrlen > IPV6_ABITS) 12968 return (EINVAL); 12969 12970 v6addr = sin6->sin6_addr; 12971 12972 /* 12973 * The length of the token is the length from the end. To get 12974 * the proper mask for this, compute the mask of the bits not 12975 * in the token; ie. the prefix, and then xor to get the mask. 
12976 */ 12977 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12978 return (EINVAL); 12979 for (i = 0; i < 4; i++) { 12980 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12981 } 12982 12983 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12984 ill->ill_token_length == addrlen) 12985 return (0); /* No change */ 12986 12987 if (ipif->ipif_flags & IPIF_UP) { 12988 err = ipif_logical_down(ipif, q, mp); 12989 if (err == EINPROGRESS) 12990 return (err); 12991 ipif_down_tail(ipif); 12992 need_up = B_TRUE; 12993 } 12994 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12995 return (err); 12996 } 12997 12998 static int 12999 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 13000 mblk_t *mp, boolean_t need_up) 13001 { 13002 in6_addr_t v6addr; 13003 in6_addr_t v6mask; 13004 ill_t *ill = ipif->ipif_ill; 13005 int i; 13006 int err = 0; 13007 13008 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 13009 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13010 v6addr = sin6->sin6_addr; 13011 /* 13012 * The length of the token is the length from the end. To get 13013 * the proper mask for this, compute the mask of the bits not 13014 * in the token; ie. the prefix, and then xor to get the mask. 13015 */ 13016 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 13017 for (i = 0; i < 4; i++) 13018 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 13019 13020 mutex_enter(&ill->ill_lock); 13021 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 13022 ill->ill_token_length = addrlen; 13023 mutex_exit(&ill->ill_lock); 13024 13025 if (need_up) { 13026 /* 13027 * Now bring the interface back up. If this 13028 * is the only IPIF for the ILL, ipif_up 13029 * will have to re-bind to the device, so 13030 * we may get back EINPROGRESS, in which 13031 * case, this IOCTL will get completed in 13032 * ip_rput_dlpi when we see the DL_BIND_ACK. 
13033 */ 13034 err = ipif_up(ipif, q, mp); 13035 if (err == EINPROGRESS) 13036 return (err); 13037 } 13038 return (err); 13039 } 13040 13041 /* ARGSUSED */ 13042 int 13043 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13044 ip_ioctl_cmd_t *ipi, void *if_req) 13045 { 13046 ill_t *ill; 13047 sin6_t *sin6 = (sin6_t *)sin; 13048 struct lifreq *lifr = (struct lifreq *)if_req; 13049 13050 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 13051 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13052 if (ipif->ipif_id != 0) 13053 return (EINVAL); 13054 13055 ill = ipif->ipif_ill; 13056 if (!ill->ill_isv6) 13057 return (ENXIO); 13058 13059 *sin6 = sin6_null; 13060 sin6->sin6_family = AF_INET6; 13061 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 13062 sin6->sin6_addr = ill->ill_token; 13063 lifr->lifr_addrlen = ill->ill_token_length; 13064 return (0); 13065 } 13066 13067 /* 13068 * Set (hardware) link specific information that might override 13069 * what was acquired through the DL_INFO_ACK. 13070 * The logic is as follows. 13071 * 13072 * become exclusive 13073 * set CHANGING flag 13074 * change mtu on affected IREs 13075 * clear CHANGING flag 13076 * 13077 * An ire add that occurs before the CHANGING flag is set will have its mtu 13078 * changed by the ip_sioctl_lnkinfo. 13079 * 13080 * During the time the CHANGING flag is set, no new ires will be added to the 13081 * bucket, and ire add will fail (due the CHANGING flag). 13082 * 13083 * An ire add that occurs after the CHANGING flag is set will have the right mtu 13084 * before it is added to the bucket. 13085 * 13086 * Obviously only 1 thread can set the CHANGING flag and we need to become 13087 * exclusive to set the flag. 
13088 */ 13089 /* ARGSUSED */ 13090 int 13091 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13092 ip_ioctl_cmd_t *ipi, void *if_req) 13093 { 13094 ill_t *ill = ipif->ipif_ill; 13095 ipif_t *nipif; 13096 int ip_min_mtu; 13097 boolean_t mtu_walk = B_FALSE; 13098 struct lifreq *lifr = (struct lifreq *)if_req; 13099 lif_ifinfo_req_t *lir; 13100 ire_t *ire; 13101 13102 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 13103 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13104 lir = &lifr->lifr_ifinfo; 13105 ASSERT(IAM_WRITER_IPIF(ipif)); 13106 13107 /* Only allow for logical unit zero i.e. not on "le0:17" */ 13108 if (ipif->ipif_id != 0) 13109 return (EINVAL); 13110 13111 /* Set interface MTU. */ 13112 if (ipif->ipif_isv6) 13113 ip_min_mtu = IPV6_MIN_MTU; 13114 else 13115 ip_min_mtu = IP_MIN_MTU; 13116 13117 /* 13118 * Verify values before we set anything. Allow zero to 13119 * mean unspecified. 13120 */ 13121 if (lir->lir_maxmtu != 0 && 13122 (lir->lir_maxmtu > ill->ill_max_frag || 13123 lir->lir_maxmtu < ip_min_mtu)) 13124 return (EINVAL); 13125 if (lir->lir_reachtime != 0 && 13126 lir->lir_reachtime > ND_MAX_REACHTIME) 13127 return (EINVAL); 13128 if (lir->lir_reachretrans != 0 && 13129 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 13130 return (EINVAL); 13131 13132 mutex_enter(&ill->ill_lock); 13133 ill->ill_state_flags |= ILL_CHANGING; 13134 for (nipif = ill->ill_ipif; nipif != NULL; 13135 nipif = nipif->ipif_next) { 13136 nipif->ipif_state_flags |= IPIF_CHANGING; 13137 } 13138 13139 mutex_exit(&ill->ill_lock); 13140 13141 if (lir->lir_maxmtu != 0) { 13142 ill->ill_max_mtu = lir->lir_maxmtu; 13143 ill->ill_mtu_userspecified = 1; 13144 mtu_walk = B_TRUE; 13145 } 13146 13147 if (lir->lir_reachtime != 0) 13148 ill->ill_reachable_time = lir->lir_reachtime; 13149 13150 if (lir->lir_reachretrans != 0) 13151 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 13152 13153 ill->ill_max_hops = lir->lir_maxhops; 13154 13155 ill->ill_max_buf = 
ND_MAX_Q; 13156 13157 if (mtu_walk) { 13158 /* 13159 * Set the MTU on all ipifs associated with this ill except 13160 * for those whose MTU was fixed via SIOCSLIFMTU. 13161 */ 13162 for (nipif = ill->ill_ipif; nipif != NULL; 13163 nipif = nipif->ipif_next) { 13164 if (nipif->ipif_flags & IPIF_FIXEDMTU) 13165 continue; 13166 13167 nipif->ipif_mtu = ill->ill_max_mtu; 13168 13169 if (!(nipif->ipif_flags & IPIF_UP)) 13170 continue; 13171 13172 if (nipif->ipif_isv6) 13173 ire = ipif_to_ire_v6(nipif); 13174 else 13175 ire = ipif_to_ire(nipif); 13176 if (ire != NULL) { 13177 ire->ire_max_frag = ipif->ipif_mtu; 13178 ire_refrele(ire); 13179 } 13180 if (ill->ill_isv6) { 13181 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 13182 ipif_mtu_change, (char *)nipif, 13183 ill); 13184 } else { 13185 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 13186 ipif_mtu_change, (char *)nipif, 13187 ill); 13188 } 13189 } 13190 } 13191 13192 mutex_enter(&ill->ill_lock); 13193 for (nipif = ill->ill_ipif; nipif != NULL; 13194 nipif = nipif->ipif_next) { 13195 nipif->ipif_state_flags &= ~IPIF_CHANGING; 13196 } 13197 ILL_UNMARK_CHANGING(ill); 13198 mutex_exit(&ill->ill_lock); 13199 13200 return (0); 13201 } 13202 13203 /* ARGSUSED */ 13204 int 13205 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13206 ip_ioctl_cmd_t *ipi, void *if_req) 13207 { 13208 struct lif_ifinfo_req *lir; 13209 ill_t *ill = ipif->ipif_ill; 13210 13211 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 13212 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13213 if (ipif->ipif_id != 0) 13214 return (EINVAL); 13215 13216 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 13217 lir->lir_maxhops = ill->ill_max_hops; 13218 lir->lir_reachtime = ill->ill_reachable_time; 13219 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 13220 lir->lir_maxmtu = ill->ill_max_mtu; 13221 13222 return (0); 13223 } 13224 13225 /* 13226 * Return best guess as to the subnet mask for the specified address. 
13227 * Based on the subnet masks for all the configured interfaces. 13228 * 13229 * We end up returning a zero mask in the case of default, multicast or 13230 * experimental. 13231 */ 13232 static ipaddr_t 13233 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp) 13234 { 13235 ipaddr_t net_mask; 13236 ill_t *ill; 13237 ipif_t *ipif; 13238 ill_walk_context_t ctx; 13239 ipif_t *fallback_ipif = NULL; 13240 13241 net_mask = ip_net_mask(addr); 13242 if (net_mask == 0) { 13243 *ipifp = NULL; 13244 return (0); 13245 } 13246 13247 /* Let's check to see if this is maybe a local subnet route. */ 13248 /* this function only applies to IPv4 interfaces */ 13249 rw_enter(&ill_g_lock, RW_READER); 13250 ill = ILL_START_WALK_V4(&ctx); 13251 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13252 mutex_enter(&ill->ill_lock); 13253 for (ipif = ill->ill_ipif; ipif != NULL; 13254 ipif = ipif->ipif_next) { 13255 if (!IPIF_CAN_LOOKUP(ipif)) 13256 continue; 13257 if (!(ipif->ipif_flags & IPIF_UP)) 13258 continue; 13259 if ((ipif->ipif_subnet & net_mask) == 13260 (addr & net_mask)) { 13261 /* 13262 * Don't trust pt-pt interfaces if there are 13263 * other interfaces. 13264 */ 13265 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13266 if (fallback_ipif == NULL) { 13267 ipif_refhold_locked(ipif); 13268 fallback_ipif = ipif; 13269 } 13270 continue; 13271 } 13272 13273 /* 13274 * Fine. Just assume the same net mask as the 13275 * directly attached subnet interface is using. 13276 */ 13277 ipif_refhold_locked(ipif); 13278 mutex_exit(&ill->ill_lock); 13279 rw_exit(&ill_g_lock); 13280 if (fallback_ipif != NULL) 13281 ipif_refrele(fallback_ipif); 13282 *ipifp = ipif; 13283 return (ipif->ipif_net_mask); 13284 } 13285 } 13286 mutex_exit(&ill->ill_lock); 13287 } 13288 rw_exit(&ill_g_lock); 13289 13290 *ipifp = fallback_ipif; 13291 return ((fallback_ipif != NULL) ? 
13292 fallback_ipif->ipif_net_mask : net_mask); 13293 } 13294 13295 /* 13296 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 13297 */ 13298 static void 13299 ip_wput_ioctl(queue_t *q, mblk_t *mp) 13300 { 13301 IOCP iocp; 13302 ipft_t *ipft; 13303 ipllc_t *ipllc; 13304 mblk_t *mp1; 13305 cred_t *cr; 13306 int error = 0; 13307 conn_t *connp; 13308 13309 ip1dbg(("ip_wput_ioctl")); 13310 iocp = (IOCP)mp->b_rptr; 13311 mp1 = mp->b_cont; 13312 if (mp1 == NULL) { 13313 iocp->ioc_error = EINVAL; 13314 mp->b_datap->db_type = M_IOCNAK; 13315 iocp->ioc_count = 0; 13316 qreply(q, mp); 13317 return; 13318 } 13319 13320 /* 13321 * These IOCTLs provide various control capabilities to 13322 * upstream agents such as ULPs and processes. There 13323 * are currently two such IOCTLs implemented. They 13324 * are used by TCP to provide update information for 13325 * existing IREs and to forcibly delete an IRE for a 13326 * host that is not responding, thereby forcing an 13327 * attempt at a new route. 13328 */ 13329 iocp->ioc_error = EINVAL; 13330 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 13331 goto done; 13332 13333 ipllc = (ipllc_t *)mp1->b_rptr; 13334 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 13335 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 13336 break; 13337 } 13338 /* 13339 * prefer credential from mblk over ioctl; 13340 * see ip_sioctl_copyin_setup 13341 */ 13342 cr = DB_CREDDEF(mp, iocp->ioc_cr); 13343 13344 /* 13345 * Refhold the conn in case the request gets queued up in some lookup 13346 */ 13347 ASSERT(CONN_Q(q)); 13348 connp = Q_TO_CONN(q); 13349 CONN_INC_REF(connp); 13350 if (ipft->ipft_pfi && 13351 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 13352 pullupmsg(mp1, ipft->ipft_min_size))) { 13353 error = (*ipft->ipft_pfi)(q, 13354 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? 
mp : mp1, cr); 13355 } 13356 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 13357 /* 13358 * CONN_OPER_PENDING_DONE happens in the function called 13359 * through ipft_pfi above. 13360 */ 13361 return; 13362 } 13363 13364 CONN_OPER_PENDING_DONE(connp); 13365 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 13366 freemsg(mp); 13367 return; 13368 } 13369 iocp->ioc_error = error; 13370 13371 done: 13372 mp->b_datap->db_type = M_IOCACK; 13373 if (iocp->ioc_error) 13374 iocp->ioc_count = 0; 13375 qreply(q, mp); 13376 } 13377 13378 /* 13379 * Lookup an ipif using the sequence id (ipif_seqid) 13380 */ 13381 ipif_t * 13382 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 13383 { 13384 ipif_t *ipif; 13385 13386 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13387 13388 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13389 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 13390 return (ipif); 13391 } 13392 return (NULL); 13393 } 13394 13395 uint64_t ipif_g_seqid; 13396 13397 /* 13398 * Assign a unique id for the ipif. This is used later when we send 13399 * IRES to ARP for resolution where we initialize ire_ipif_seqid 13400 * to the value pointed by ire_ipif->ipif_seqid. Later when the 13401 * IRE is added, we verify that ipif has not disappeared. 13402 */ 13403 13404 static void 13405 ipif_assign_seqid(ipif_t *ipif) 13406 { 13407 ipif->ipif_seqid = atomic_add_64_nv(&ipif_g_seqid, 1); 13408 } 13409 13410 /* 13411 * Insert the ipif, so that the list of ipifs on the ill will be sorted 13412 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 13413 * be inserted into the first space available in the list. The value of 13414 * ipif_id will then be set to the appropriate value for its position. 
13415 */ 13416 static int 13417 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 13418 { 13419 ill_t *ill; 13420 ipif_t *tipif; 13421 ipif_t **tipifp; 13422 int id; 13423 13424 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13425 IAM_WRITER_IPIF(ipif)); 13426 13427 ill = ipif->ipif_ill; 13428 ASSERT(ill != NULL); 13429 13430 /* 13431 * In the case of lo0:0 we already hold the ill_g_lock. 13432 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13433 * ipif_insert. Another such caller is ipif_move. 13434 */ 13435 if (acquire_g_lock) 13436 rw_enter(&ill_g_lock, RW_WRITER); 13437 if (acquire_ill_lock) 13438 mutex_enter(&ill->ill_lock); 13439 id = ipif->ipif_id; 13440 tipifp = &(ill->ill_ipif); 13441 if (id == -1) { /* need to find a real id */ 13442 id = 0; 13443 while ((tipif = *tipifp) != NULL) { 13444 ASSERT(tipif->ipif_id >= id); 13445 if (tipif->ipif_id != id) 13446 break; /* non-consecutive id */ 13447 id++; 13448 tipifp = &(tipif->ipif_next); 13449 } 13450 /* limit number of logical interfaces */ 13451 if (id >= ip_addrs_per_if) { 13452 if (acquire_ill_lock) 13453 mutex_exit(&ill->ill_lock); 13454 if (acquire_g_lock) 13455 rw_exit(&ill_g_lock); 13456 return (-1); 13457 } 13458 ipif->ipif_id = id; /* assign new id */ 13459 } else if (id < ip_addrs_per_if) { 13460 /* we have a real id; insert ipif in the right place */ 13461 while ((tipif = *tipifp) != NULL) { 13462 ASSERT(tipif->ipif_id != id); 13463 if (tipif->ipif_id > id) 13464 break; /* found correct location */ 13465 tipifp = &(tipif->ipif_next); 13466 } 13467 } else { 13468 if (acquire_ill_lock) 13469 mutex_exit(&ill->ill_lock); 13470 if (acquire_g_lock) 13471 rw_exit(&ill_g_lock); 13472 return (-1); 13473 } 13474 13475 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 13476 13477 ipif->ipif_next = tipif; 13478 *tipifp = ipif; 13479 if (acquire_ill_lock) 13480 mutex_exit(&ill->ill_lock); 13481 if (acquire_g_lock) 13482 rw_exit(&ill_g_lock); 13483 return (0); 
13484 } 13485 13486 /* 13487 * Allocate and initialize a new interface control structure. (Always 13488 * called as writer.) 13489 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 13490 * is not part of the global linked list of ills. ipif_seqid is unique 13491 * in the system and to preserve the uniqueness, it is assigned only 13492 * when ill becomes part of the global list. At that point ill will 13493 * have a name. If it doesn't get assigned here, it will get assigned 13494 * in ipif_set_values() as part of SIOCSLIFNAME processing. 13495 * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set 13496 * the interface flags or any other information from the DL_INFO_ACK for 13497 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 13498 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 13499 * second DL_INFO_ACK comes in from the driver. 13500 */ 13501 static ipif_t * 13502 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize) 13503 { 13504 ipif_t *ipif; 13505 phyint_t *phyi; 13506 13507 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 13508 ill->ill_name, id, (void *)ill)); 13509 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 13510 13511 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 13512 return (NULL); 13513 *ipif = ipif_zero; /* start clean */ 13514 13515 ipif->ipif_ill = ill; 13516 ipif->ipif_id = id; /* could be -1 */ 13517 ipif->ipif_zoneid = GLOBAL_ZONEID; 13518 13519 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 13520 13521 ipif->ipif_refcnt = 0; 13522 ipif->ipif_saved_ire_cnt = 0; 13523 13524 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) { 13525 mi_free(ipif); 13526 return (NULL); 13527 } 13528 /* -1 id should have been replaced by real id */ 13529 id = ipif->ipif_id; 13530 ASSERT(id >= 0); 13531 13532 if (ill->ill_name[0] != '\0') { 13533 ipif_assign_seqid(ipif); 13534 if 
(ill->ill_phyint->phyint_ifindex != 0) 13535 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 13536 } 13537 /* 13538 * Keep a copy of original id in ipif_orig_ipifid. Failback 13539 * will attempt to restore the original id. The SIOCSLIFOINDEX 13540 * ioctl sets ipif_orig_ipifid to zero. 13541 */ 13542 ipif->ipif_orig_ipifid = id; 13543 13544 /* 13545 * We grab the ill_lock and phyint_lock to protect the flag changes. 13546 * The ipif is still not up and can't be looked up until the 13547 * ioctl completes and the IPIF_CHANGING flag is cleared. 13548 */ 13549 mutex_enter(&ill->ill_lock); 13550 mutex_enter(&ill->ill_phyint->phyint_lock); 13551 /* 13552 * Set the running flag when logical interface zero is created. 13553 * For subsequent logical interfaces, a DLPI link down 13554 * notification message may have cleared the running flag to 13555 * indicate the link is down, so we shouldn't just blindly set it. 13556 */ 13557 if (id == 0) 13558 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 13559 ipif->ipif_ire_type = ire_type; 13560 phyi = ill->ill_phyint; 13561 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 13562 13563 if (ipif->ipif_isv6) { 13564 ill->ill_flags |= ILLF_IPV6; 13565 } else { 13566 ipaddr_t inaddr_any = INADDR_ANY; 13567 13568 ill->ill_flags |= ILLF_IPV4; 13569 13570 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 13571 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13572 &ipif->ipif_v6lcl_addr); 13573 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13574 &ipif->ipif_v6src_addr); 13575 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13576 &ipif->ipif_v6subnet); 13577 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13578 &ipif->ipif_v6net_mask); 13579 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13580 &ipif->ipif_v6brd_addr); 13581 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13582 &ipif->ipif_v6pp_dst_addr); 13583 } 13584 13585 /* 13586 * Don't set the interface flags etc. now, will do it in 13587 * ip_ll_subnet_defaults. 
13588 */ 13589 if (!initialize) { 13590 mutex_exit(&ill->ill_lock); 13591 mutex_exit(&ill->ill_phyint->phyint_lock); 13592 return (ipif); 13593 } 13594 ipif->ipif_mtu = ill->ill_max_mtu; 13595 13596 if (ill->ill_bcast_addr_length != 0) { 13597 /* 13598 * Later detect lack of DLPI driver multicast 13599 * capability by catching DL_ENABMULTI errors in 13600 * ip_rput_dlpi. 13601 */ 13602 ill->ill_flags |= ILLF_MULTICAST; 13603 if (!ipif->ipif_isv6) 13604 ipif->ipif_flags |= IPIF_BROADCAST; 13605 } else { 13606 if (ill->ill_net_type != IRE_LOOPBACK) { 13607 if (ipif->ipif_isv6) 13608 /* 13609 * Note: xresolv interfaces will eventually need 13610 * NOARP set here as well, but that will require 13611 * those external resolvers to have some 13612 * knowledge of that flag and act appropriately. 13613 * Not to be changed at present. 13614 */ 13615 ill->ill_flags |= ILLF_NONUD; 13616 else 13617 ill->ill_flags |= ILLF_NOARP; 13618 } 13619 if (ill->ill_phys_addr_length == 0) { 13620 if (ill->ill_media && 13621 ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 13622 ipif->ipif_flags |= IPIF_NOXMIT; 13623 phyi->phyint_flags |= PHYI_VIRTUAL; 13624 } else { 13625 /* pt-pt supports multicast. */ 13626 ill->ill_flags |= ILLF_MULTICAST; 13627 if (ill->ill_net_type == IRE_LOOPBACK) { 13628 phyi->phyint_flags |= 13629 (PHYI_LOOPBACK | PHYI_VIRTUAL); 13630 } else { 13631 ipif->ipif_flags |= IPIF_POINTOPOINT; 13632 } 13633 } 13634 } 13635 } 13636 mutex_exit(&ill->ill_lock); 13637 mutex_exit(&ill->ill_phyint->phyint_lock); 13638 return (ipif); 13639 } 13640 13641 /* 13642 * If appropriate, send a message up to the resolver delete the entry 13643 * for the address of this interface which is going out of business. 13644 * (Always called as writer). 13645 * 13646 * NOTE : We need to check for NULL mps as some of the fields are 13647 * initialized only for some interface types. See ipif_resolver_up() 13648 * for details. 
13649 */ 13650 void 13651 ipif_arp_down(ipif_t *ipif) 13652 { 13653 mblk_t *mp; 13654 ill_t *ill = ipif->ipif_ill; 13655 13656 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13657 ASSERT(IAM_WRITER_IPIF(ipif)); 13658 13659 /* Delete the mapping for the local address */ 13660 mp = ipif->ipif_arp_del_mp; 13661 if (mp != NULL) { 13662 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13663 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13664 putnext(ill->ill_rq, mp); 13665 ipif->ipif_arp_del_mp = NULL; 13666 } 13667 13668 /* 13669 * If this is the last ipif that is going down and there are no 13670 * duplicate addresses we may yet attempt to re-probe, then we need to 13671 * clean up ARP completely. 13672 */ 13673 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { 13674 13675 /* Send up AR_INTERFACE_DOWN message */ 13676 mp = ill->ill_arp_down_mp; 13677 if (mp != NULL) { 13678 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13679 *(unsigned *)mp->b_rptr, ill->ill_name, 13680 ipif->ipif_id)); 13681 putnext(ill->ill_rq, mp); 13682 ill->ill_arp_down_mp = NULL; 13683 } 13684 13685 /* Tell ARP to delete the multicast mappings */ 13686 mp = ill->ill_arp_del_mapping_mp; 13687 if (mp != NULL) { 13688 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13689 *(unsigned *)mp->b_rptr, ill->ill_name, 13690 ipif->ipif_id)); 13691 putnext(ill->ill_rq, mp); 13692 ill->ill_arp_del_mapping_mp = NULL; 13693 } 13694 } 13695 } 13696 13697 /* 13698 * This function sets up the multicast mappings in ARP. When ipif_resolver_up 13699 * calls this function, it passes a non-NULL arp_add_mapping_mp indicating 13700 * that it wants the add_mp allocated in this function to be returned 13701 * wihtout sending it to arp. When ip_rput_dlpi_writer calls this to 13702 * just re-do the multicast, it wants us to send the add_mp to ARP also. 
13703 * ipif_resolver_up does not want us to do the "add" i.e sending to ARP, 13704 * as it does a ipif_arp_down after calling this function - which will 13705 * remove what we add here. 13706 * 13707 * Returns -1 on failures and 0 on success. 13708 */ 13709 int 13710 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) 13711 { 13712 mblk_t *del_mp = NULL; 13713 mblk_t *add_mp = NULL; 13714 mblk_t *mp; 13715 ill_t *ill = ipif->ipif_ill; 13716 phyint_t *phyi = ill->ill_phyint; 13717 ipaddr_t addr, mask, extract_mask = 0; 13718 arma_t *arma; 13719 uint8_t *maddr, *bphys_addr; 13720 uint32_t hw_start; 13721 dl_unitdata_req_t *dlur; 13722 13723 ASSERT(IAM_WRITER_IPIF(ipif)); 13724 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13725 return (0); 13726 13727 /* 13728 * Delete the existing mapping from ARP. Normally ipif_down 13729 * -> ipif_arp_down should send this up to ARP. The only 13730 * reason we would find this when we are switching from 13731 * Multicast to Broadcast where we did not do a down. 13732 */ 13733 mp = ill->ill_arp_del_mapping_mp; 13734 if (mp != NULL) { 13735 ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", 13736 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13737 putnext(ill->ill_rq, mp); 13738 ill->ill_arp_del_mapping_mp = NULL; 13739 } 13740 13741 if (arp_add_mapping_mp != NULL) 13742 *arp_add_mapping_mp = NULL; 13743 13744 /* 13745 * Check that the address is not to long for the constant 13746 * length reserved in the template arma_t. 
13747 */ 13748 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) 13749 return (-1); 13750 13751 /* Add mapping mblk */ 13752 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); 13753 mask = (ipaddr_t)htonl(IN_CLASSD_NET); 13754 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, 13755 (caddr_t)&addr); 13756 if (add_mp == NULL) 13757 return (-1); 13758 arma = (arma_t *)add_mp->b_rptr; 13759 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; 13760 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); 13761 arma->arma_hw_addr_length = ill->ill_phys_addr_length; 13762 13763 /* 13764 * Determine the broadcast address. 13765 */ 13766 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 13767 if (ill->ill_sap_length < 0) 13768 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 13769 else 13770 bphys_addr = (uchar_t *)dlur + 13771 dlur->dl_dest_addr_offset + ill->ill_sap_length; 13772 /* 13773 * Check PHYI_MULTI_BCAST and length of physical 13774 * address to determine if we use the mapping or the 13775 * broadcast address. 13776 */ 13777 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 13778 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 13779 bphys_addr, maddr, &hw_start, &extract_mask)) 13780 phyi->phyint_flags |= PHYI_MULTI_BCAST; 13781 13782 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 13783 (ill->ill_flags & ILLF_MULTICAST)) { 13784 /* Make sure this will not match the "exact" entry. 
*/ 13785 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13786 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13787 (caddr_t)&addr); 13788 if (del_mp == NULL) { 13789 freemsg(add_mp); 13790 return (-1); 13791 } 13792 bcopy(&extract_mask, (char *)arma + 13793 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13794 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13795 /* Use link-layer broadcast address for MULTI_BCAST */ 13796 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13797 ip2dbg(("ipif_arp_setup_multicast: adding" 13798 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13799 } else { 13800 arma->arma_hw_mapping_start = hw_start; 13801 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13802 " ARP setup for %s\n", ill->ill_name)); 13803 } 13804 } else { 13805 freemsg(add_mp); 13806 ASSERT(del_mp == NULL); 13807 /* It is neither MULTICAST nor MULTI_BCAST */ 13808 return (0); 13809 } 13810 ASSERT(add_mp != NULL && del_mp != NULL); 13811 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13812 ill->ill_arp_del_mapping_mp = del_mp; 13813 if (arp_add_mapping_mp != NULL) { 13814 /* The caller just wants the mblks allocated */ 13815 *arp_add_mapping_mp = add_mp; 13816 } else { 13817 /* The caller wants us to send it to arp */ 13818 putnext(ill->ill_rq, add_mp); 13819 } 13820 return (0); 13821 } 13822 13823 /* 13824 * Get the resolver set up for a new interface address. 13825 * (Always called as writer.) 13826 * Called both for IPv4 and IPv6 interfaces, 13827 * though it only sets up the resolver for v6 13828 * if it's an xresolv interface (one using an external resolver). 13829 * Honors ILLF_NOARP. 13830 * The enumerated value res_act is used to tune the behavior. 13831 * If set to Res_act_initial, then we set up all the resolver 13832 * structures for a new interface. 
If set to Res_act_move, then 13833 * we just send an AR_ENTRY_ADD message up to ARP for IPv4 13834 * interfaces; this is called by ip_rput_dlpi_writer() to handle 13835 * asynchronous hardware address change notification. If set to 13836 * Res_act_defend, then we tell ARP that it needs to send a single 13837 * gratuitous message in defense of the address. 13838 * Returns error on failure. 13839 */ 13840 int 13841 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 13842 { 13843 caddr_t addr; 13844 mblk_t *arp_up_mp = NULL; 13845 mblk_t *arp_down_mp = NULL; 13846 mblk_t *arp_add_mp = NULL; 13847 mblk_t *arp_del_mp = NULL; 13848 mblk_t *arp_add_mapping_mp = NULL; 13849 mblk_t *arp_del_mapping_mp = NULL; 13850 ill_t *ill = ipif->ipif_ill; 13851 uchar_t *area_p = NULL; 13852 uchar_t *ared_p = NULL; 13853 int err = ENOMEM; 13854 boolean_t was_dup; 13855 13856 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 13857 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 13858 ASSERT(IAM_WRITER_IPIF(ipif)); 13859 13860 was_dup = B_FALSE; 13861 if (res_act == Res_act_initial) { 13862 ipif->ipif_addr_ready = 0; 13863 /* 13864 * We're bringing an interface up here. There's no way that we 13865 * should need to shut down ARP now. 
13866 */ 13867 mutex_enter(&ill->ill_lock); 13868 if (ipif->ipif_flags & IPIF_DUPLICATE) { 13869 ipif->ipif_flags &= ~IPIF_DUPLICATE; 13870 ill->ill_ipif_dup_count--; 13871 was_dup = B_TRUE; 13872 } 13873 mutex_exit(&ill->ill_lock); 13874 } 13875 if (ipif->ipif_recovery_id != 0) 13876 (void) untimeout(ipif->ipif_recovery_id); 13877 ipif->ipif_recovery_id = 0; 13878 if (ill->ill_net_type != IRE_IF_RESOLVER) { 13879 ipif->ipif_addr_ready = 1; 13880 return (0); 13881 } 13882 /* NDP will set the ipif_addr_ready flag when it's ready */ 13883 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 13884 return (0); 13885 13886 if (ill->ill_isv6) { 13887 /* 13888 * External resolver for IPv6 13889 */ 13890 ASSERT(res_act == Res_act_initial); 13891 if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 13892 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 13893 area_p = (uchar_t *)&ip6_area_template; 13894 ared_p = (uchar_t *)&ip6_ared_template; 13895 } 13896 } else { 13897 /* 13898 * IPv4 arp case. If the ARP stream has already started 13899 * closing, fail this request for ARP bringup. Else 13900 * record the fact that an ARP bringup is pending. 13901 */ 13902 mutex_enter(&ill->ill_lock); 13903 if (ill->ill_arp_closing) { 13904 mutex_exit(&ill->ill_lock); 13905 err = EINVAL; 13906 goto failed; 13907 } else { 13908 if (ill->ill_ipif_up_count == 0 && 13909 ill->ill_ipif_dup_count == 0 && !was_dup) 13910 ill->ill_arp_bringup_pending = 1; 13911 mutex_exit(&ill->ill_lock); 13912 } 13913 if (ipif->ipif_lcl_addr != INADDR_ANY) { 13914 addr = (caddr_t)&ipif->ipif_lcl_addr; 13915 area_p = (uchar_t *)&ip_area_template; 13916 ared_p = (uchar_t *)&ip_ared_template; 13917 } 13918 } 13919 13920 /* 13921 * Add an entry for the local address in ARP only if it 13922 * is not UNNUMBERED and the address is not INADDR_ANY. 13923 */ 13924 if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { 13925 area_t *area; 13926 13927 /* Now ask ARP to publish our address. 
*/ 13928 arp_add_mp = ill_arp_alloc(ill, area_p, addr); 13929 if (arp_add_mp == NULL) 13930 goto failed; 13931 area = (area_t *)arp_add_mp->b_rptr; 13932 if (res_act != Res_act_initial) { 13933 /* 13934 * Copy the new hardware address and length into 13935 * arp_add_mp to be sent to ARP. 13936 */ 13937 area->area_hw_addr_length = ill->ill_phys_addr_length; 13938 bcopy(ill->ill_phys_addr, 13939 ((char *)area + area->area_hw_addr_offset), 13940 area->area_hw_addr_length); 13941 } 13942 13943 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | 13944 ACE_F_MYADDR; 13945 13946 if (res_act == Res_act_defend) { 13947 area->area_flags |= ACE_F_DEFEND; 13948 /* 13949 * If we're just defending our address now, then 13950 * there's no need to set up ARP multicast mappings. 13951 * The publish command is enough. 13952 */ 13953 goto done; 13954 } 13955 13956 if (res_act != Res_act_initial) 13957 goto arp_setup_multicast; 13958 13959 /* 13960 * Allocate an ARP deletion message so we know we can tell ARP 13961 * when the interface goes down. 13962 */ 13963 arp_del_mp = ill_arp_alloc(ill, ared_p, addr); 13964 if (arp_del_mp == NULL) 13965 goto failed; 13966 13967 } else { 13968 if (res_act != Res_act_initial) 13969 goto done; 13970 } 13971 /* 13972 * Need to bring up ARP or setup multicast mapping only 13973 * when the first interface is coming UP. 13974 */ 13975 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 13976 was_dup) { 13977 goto done; 13978 } 13979 13980 /* 13981 * Allocate an ARP down message (to be saved) and an ARP up 13982 * message. 13983 */ 13984 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 13985 if (arp_down_mp == NULL) 13986 goto failed; 13987 13988 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 13989 if (arp_up_mp == NULL) 13990 goto failed; 13991 13992 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13993 goto done; 13994 13995 arp_setup_multicast: 13996 /* 13997 * Setup the multicast mappings. 
This function initializes 13998 * ill_arp_del_mapping_mp also. This does not need to be done for 13999 * IPv6. 14000 */ 14001 if (!ill->ill_isv6) { 14002 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 14003 if (err != 0) 14004 goto failed; 14005 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 14006 ASSERT(arp_add_mapping_mp != NULL); 14007 } 14008 14009 done: 14010 if (arp_del_mp != NULL) { 14011 ASSERT(ipif->ipif_arp_del_mp == NULL); 14012 ipif->ipif_arp_del_mp = arp_del_mp; 14013 } 14014 if (arp_down_mp != NULL) { 14015 ASSERT(ill->ill_arp_down_mp == NULL); 14016 ill->ill_arp_down_mp = arp_down_mp; 14017 } 14018 if (arp_del_mapping_mp != NULL) { 14019 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 14020 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 14021 } 14022 if (arp_up_mp != NULL) { 14023 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 14024 ill->ill_name, ipif->ipif_id)); 14025 putnext(ill->ill_rq, arp_up_mp); 14026 } 14027 if (arp_add_mp != NULL) { 14028 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 14029 ill->ill_name, ipif->ipif_id)); 14030 /* 14031 * If it's an extended ARP implementation, then we'll wait to 14032 * hear that DAD has finished before using the interface. 
14033 */ 14034 if (!ill->ill_arp_extend) 14035 ipif->ipif_addr_ready = 1; 14036 putnext(ill->ill_rq, arp_add_mp); 14037 } else { 14038 ipif->ipif_addr_ready = 1; 14039 } 14040 if (arp_add_mapping_mp != NULL) { 14041 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 14042 ill->ill_name, ipif->ipif_id)); 14043 putnext(ill->ill_rq, arp_add_mapping_mp); 14044 } 14045 if (res_act != Res_act_initial) 14046 return (0); 14047 14048 if (ill->ill_flags & ILLF_NOARP) 14049 err = ill_arp_off(ill); 14050 else 14051 err = ill_arp_on(ill); 14052 if (err != 0) { 14053 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); 14054 freemsg(ipif->ipif_arp_del_mp); 14055 freemsg(ill->ill_arp_down_mp); 14056 freemsg(ill->ill_arp_del_mapping_mp); 14057 ipif->ipif_arp_del_mp = NULL; 14058 ill->ill_arp_down_mp = NULL; 14059 ill->ill_arp_del_mapping_mp = NULL; 14060 return (err); 14061 } 14062 return ((ill->ill_ipif_up_count != 0 || was_dup || 14063 ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); 14064 14065 failed: 14066 ip1dbg(("ipif_resolver_up: FAILED\n")); 14067 freemsg(arp_add_mp); 14068 freemsg(arp_del_mp); 14069 freemsg(arp_add_mapping_mp); 14070 freemsg(arp_up_mp); 14071 freemsg(arp_down_mp); 14072 ill->ill_arp_bringup_pending = 0; 14073 return (err); 14074 } 14075 14076 /* 14077 * This routine restarts IPv4 duplicate address detection (DAD) when a link has 14078 * just gone back up. 14079 */ 14080 static void 14081 ipif_arp_start_dad(ipif_t *ipif) 14082 { 14083 ill_t *ill = ipif->ipif_ill; 14084 mblk_t *arp_add_mp; 14085 area_t *area; 14086 14087 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 14088 (ipif->ipif_flags & IPIF_UNNUMBERED) || 14089 ipif->ipif_lcl_addr == INADDR_ANY || 14090 (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 14091 (char *)&ipif->ipif_lcl_addr)) == NULL) { 14092 /* 14093 * If we can't contact ARP for some reason, that's not really a 14094 * problem. 
Just send out the routing socket notification that 14095 * DAD completion would have done, and continue. 14096 */ 14097 ipif_mask_reply(ipif); 14098 ip_rts_ifmsg(ipif); 14099 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 14100 sctp_update_ipif(ipif, SCTP_IPIF_UP); 14101 ipif->ipif_addr_ready = 1; 14102 return; 14103 } 14104 14105 /* Setting the 'unverified' flag restarts DAD */ 14106 area = (area_t *)arp_add_mp->b_rptr; 14107 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 14108 ACE_F_UNVERIFIED; 14109 putnext(ill->ill_rq, arp_add_mp); 14110 } 14111 14112 static void 14113 ipif_ndp_start_dad(ipif_t *ipif) 14114 { 14115 nce_t *nce; 14116 14117 nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); 14118 if (nce == NULL) 14119 return; 14120 14121 if (!ndp_restart_dad(nce)) { 14122 /* 14123 * If we can't restart DAD for some reason, that's not really a 14124 * problem. Just send out the routing socket notification that 14125 * DAD completion would have done, and continue. 14126 */ 14127 ip_rts_ifmsg(ipif); 14128 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 14129 sctp_update_ipif(ipif, SCTP_IPIF_UP); 14130 ipif->ipif_addr_ready = 1; 14131 } 14132 NCE_REFRELE(nce); 14133 } 14134 14135 /* 14136 * Restart duplicate address detection on all interfaces on the given ill. 14137 * 14138 * This is called when an interface transitions from down to up 14139 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 14140 * 14141 * Note that since the underlying physical link has transitioned, we must cause 14142 * at least one routing socket message to be sent here, either via DAD 14143 * completion or just by default on the first ipif. (If we don't do this, then 14144 * in.mpathd will see long delays when doing link-based failure recovery.) 
14145 */ 14146 void 14147 ill_restart_dad(ill_t *ill, boolean_t went_up) 14148 { 14149 ipif_t *ipif; 14150 14151 if (ill == NULL) 14152 return; 14153 14154 /* 14155 * If layer two doesn't support duplicate address detection, then just 14156 * send the routing socket message now and be done with it. 14157 */ 14158 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 14159 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 14160 ip_rts_ifmsg(ill->ill_ipif); 14161 return; 14162 } 14163 14164 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14165 if (went_up) { 14166 if (ipif->ipif_flags & IPIF_UP) { 14167 if (ill->ill_isv6) 14168 ipif_ndp_start_dad(ipif); 14169 else 14170 ipif_arp_start_dad(ipif); 14171 } else if (ill->ill_isv6 && 14172 (ipif->ipif_flags & IPIF_DUPLICATE)) { 14173 /* 14174 * For IPv4, the ARP module itself will 14175 * automatically start the DAD process when it 14176 * sees DL_NOTE_LINK_UP. We respond to the 14177 * AR_CN_READY at the completion of that task. 14178 * For IPv6, we must kick off the bring-up 14179 * process now. 14180 */ 14181 ndp_do_recovery(ipif); 14182 } else { 14183 /* 14184 * Unfortunately, the first ipif is "special" 14185 * and represents the underlying ill in the 14186 * routing socket messages. Thus, when this 14187 * one ipif is down, we must still notify so 14188 * that the user knows the IFF_RUNNING status 14189 * change. (If the first ipif is up, then 14190 * we'll handle eventual routing socket 14191 * notification via DAD completion.) 14192 */ 14193 if (ipif == ill->ill_ipif) 14194 ip_rts_ifmsg(ill->ill_ipif); 14195 } 14196 } else { 14197 /* 14198 * After link down, we'll need to send a new routing 14199 * message when the link comes back, so clear 14200 * ipif_addr_ready. 14201 */ 14202 ipif->ipif_addr_ready = 0; 14203 } 14204 } 14205 14206 /* 14207 * If we've torn down links, then notify the user right away. 
14208 */ 14209 if (!went_up) 14210 ip_rts_ifmsg(ill->ill_ipif); 14211 } 14212 14213 /* 14214 * Wakeup all threads waiting to enter the ipsq, and sleeping 14215 * on any of the ills in this ipsq. The ill_lock of the ill 14216 * must be held so that waiters don't miss wakeups 14217 */ 14218 static void 14219 ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock) 14220 { 14221 phyint_t *phyint; 14222 14223 phyint = ipsq->ipsq_phyint_list; 14224 while (phyint != NULL) { 14225 if (phyint->phyint_illv4) { 14226 if (!caller_holds_lock) 14227 mutex_enter(&phyint->phyint_illv4->ill_lock); 14228 ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14229 cv_broadcast(&phyint->phyint_illv4->ill_cv); 14230 if (!caller_holds_lock) 14231 mutex_exit(&phyint->phyint_illv4->ill_lock); 14232 } 14233 if (phyint->phyint_illv6) { 14234 if (!caller_holds_lock) 14235 mutex_enter(&phyint->phyint_illv6->ill_lock); 14236 ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14237 cv_broadcast(&phyint->phyint_illv6->ill_cv); 14238 if (!caller_holds_lock) 14239 mutex_exit(&phyint->phyint_illv6->ill_lock); 14240 } 14241 phyint = phyint->phyint_ipsq_next; 14242 } 14243 } 14244 14245 static ipsq_t * 14246 ipsq_create(char *groupname) 14247 { 14248 ipsq_t *ipsq; 14249 14250 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14251 ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 14252 if (ipsq == NULL) { 14253 return (NULL); 14254 } 14255 14256 if (groupname != NULL) 14257 (void) strcpy(ipsq->ipsq_name, groupname); 14258 else 14259 ipsq->ipsq_name[0] = '\0'; 14260 14261 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL); 14262 ipsq->ipsq_flags |= IPSQ_GROUP; 14263 ipsq->ipsq_next = ipsq_g_head; 14264 ipsq_g_head = ipsq; 14265 return (ipsq); 14266 } 14267 14268 /* 14269 * Return an ipsq correspoding to the groupname. If 'create' is true 14270 * allocate a new ipsq if one does not exist. Usually an ipsq is associated 14271 * uniquely with an IPMP group. 
However during IPMP groupname operations, 14272 * multiple IPMP groups may be associated with a single ipsq. But no 14273 * IPMP group can be associated with more than 1 ipsq at any time. 14274 * For example 14275 * Interfaces IPMP grpname ipsq ipsq_name ipsq_refs 14276 * hme1, hme2 mpk17-84 ipsq1 mpk17-84 2 14277 * hme3, hme4 mpk17-85 ipsq2 mpk17-85 2 14278 * 14279 * Now the command ifconfig hme3 group mpk17-84 results in the temporary 14280 * status shown below during the execution of the above command. 14281 * hme1, hme2, hme3, hme4 mpk17-84, mpk17-85 ipsq1 mpk17-84 4 14282 * 14283 * After the completion of the above groupname command we return to the stable 14284 * state shown below. 14285 * hme1, hme2, hme3 mpk17-84 ipsq1 mpk17-84 3 14286 * hme4 mpk17-85 ipsq2 mpk17-85 1 14287 * 14288 * Because of the above, we don't search based on the ipsq_name since that 14289 * would miss the correct ipsq during certain windows as shown above. 14290 * The ipsq_name is only used during split of an ipsq to return the ipsq to its 14291 * natural state. 14292 */ 14293 static ipsq_t * 14294 ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq) 14295 { 14296 ipsq_t *ipsq; 14297 int group_len; 14298 phyint_t *phyint; 14299 14300 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 14301 14302 group_len = strlen(groupname); 14303 ASSERT(group_len != 0); 14304 group_len++; 14305 14306 for (ipsq = ipsq_g_head; ipsq != NULL; ipsq = ipsq->ipsq_next) { 14307 /* 14308 * When an ipsq is being split, and ill_split_ipsq 14309 * calls this function, we exclude it from being considered. 14310 */ 14311 if (ipsq == exclude_ipsq) 14312 continue; 14313 14314 /* 14315 * Compare against the ipsq_name. The groupname change happens 14316 * in 2 phases. The 1st phase merges the from group into 14317 * the to group's ipsq, by calling ill_merge_groups and restarts 14318 * the ioctl. The 2nd phase then locates the ipsq again thru 14319 * ipsq_name. 
At this point the phyint_groupname has not been 14320 * updated. 14321 */ 14322 if ((group_len == strlen(ipsq->ipsq_name) + 1) && 14323 (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { 14324 /* 14325 * Verify that an ipmp groupname is exactly 14326 * part of 1 ipsq and is not found in any other 14327 * ipsq. 14328 */ 14329 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) == 14330 NULL); 14331 return (ipsq); 14332 } 14333 14334 /* 14335 * Comparison against ipsq_name alone is not sufficient. 14336 * In the case when groups are currently being 14337 * merged, the ipsq could hold other IPMP groups temporarily. 14338 * so we walk the phyint list and compare against the 14339 * phyint_groupname as well. 14340 */ 14341 phyint = ipsq->ipsq_phyint_list; 14342 while (phyint != NULL) { 14343 if ((group_len == phyint->phyint_groupname_len) && 14344 (bcmp(phyint->phyint_groupname, groupname, 14345 group_len) == 0)) { 14346 /* 14347 * Verify that an ipmp groupname is exactly 14348 * part of 1 ipsq and is not found in any other 14349 * ipsq. 14350 */ 14351 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq) 14352 == NULL); 14353 return (ipsq); 14354 } 14355 phyint = phyint->phyint_ipsq_next; 14356 } 14357 } 14358 if (create) 14359 ipsq = ipsq_create(groupname); 14360 return (ipsq); 14361 } 14362 14363 static void 14364 ipsq_delete(ipsq_t *ipsq) 14365 { 14366 ipsq_t *nipsq; 14367 ipsq_t *pipsq = NULL; 14368 14369 /* 14370 * We don't hold the ipsq lock, but we are sure no new 14371 * messages can land up, since the ipsq_refs is zero. 14372 * i.e. this ipsq is unnamed and no phyint or phyint group 14373 * is associated with this ipsq. (Lookups are based on ill_name 14374 * or phyint_group_name) 14375 */ 14376 ASSERT(ipsq->ipsq_refs == 0); 14377 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); 14378 ASSERT(ipsq->ipsq_pending_mp == NULL); 14379 if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { 14380 /* 14381 * This is not the ipsq of an IPMP group. 
14382 */ 14383 kmem_free(ipsq, sizeof (ipsq_t)); 14384 return; 14385 } 14386 14387 rw_enter(&ill_g_lock, RW_WRITER); 14388 14389 /* 14390 * Locate the ipsq before we can remove it from 14391 * the singly linked list of ipsq's. 14392 */ 14393 for (nipsq = ipsq_g_head; nipsq != NULL; nipsq = nipsq->ipsq_next) { 14394 if (nipsq == ipsq) { 14395 break; 14396 } 14397 pipsq = nipsq; 14398 } 14399 14400 ASSERT(nipsq == ipsq); 14401 14402 /* unlink ipsq from the list */ 14403 if (pipsq != NULL) 14404 pipsq->ipsq_next = ipsq->ipsq_next; 14405 else 14406 ipsq_g_head = ipsq->ipsq_next; 14407 kmem_free(ipsq, sizeof (ipsq_t)); 14408 rw_exit(&ill_g_lock); 14409 } 14410 14411 static void 14412 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, 14413 queue_t *q) 14414 14415 { 14416 14417 ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); 14418 ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); 14419 ASSERT(old_ipsq->ipsq_pending_ipif == NULL); 14420 ASSERT(old_ipsq->ipsq_pending_mp == NULL); 14421 ASSERT(current_mp != NULL); 14422 14423 ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, 14424 NEW_OP, NULL); 14425 14426 ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && 14427 new_ipsq->ipsq_xopq_mphead != NULL); 14428 14429 /* 14430 * move from old ipsq to the new ipsq. 
14431 */ 14432 new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; 14433 if (old_ipsq->ipsq_xopq_mphead != NULL) 14434 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; 14435 14436 old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; 14437 } 14438 14439 void 14440 ill_group_cleanup(ill_t *ill) 14441 { 14442 ill_t *ill_v4; 14443 ill_t *ill_v6; 14444 ipif_t *ipif; 14445 14446 ill_v4 = ill->ill_phyint->phyint_illv4; 14447 ill_v6 = ill->ill_phyint->phyint_illv6; 14448 14449 if (ill_v4 != NULL) { 14450 mutex_enter(&ill_v4->ill_lock); 14451 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14452 ipif = ipif->ipif_next) { 14453 IPIF_UNMARK_MOVING(ipif); 14454 } 14455 ill_v4->ill_up_ipifs = B_FALSE; 14456 mutex_exit(&ill_v4->ill_lock); 14457 } 14458 14459 if (ill_v6 != NULL) { 14460 mutex_enter(&ill_v6->ill_lock); 14461 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14462 ipif = ipif->ipif_next) { 14463 IPIF_UNMARK_MOVING(ipif); 14464 } 14465 ill_v6->ill_up_ipifs = B_FALSE; 14466 mutex_exit(&ill_v6->ill_lock); 14467 } 14468 } 14469 /* 14470 * This function is called when an ill has had a change in its group status 14471 * to bring up all the ipifs that were up before the change. 14472 */ 14473 int 14474 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 14475 { 14476 ipif_t *ipif; 14477 ill_t *ill_v4; 14478 ill_t *ill_v6; 14479 ill_t *from_ill; 14480 int err = 0; 14481 14482 14483 ASSERT(IAM_WRITER_ILL(ill)); 14484 14485 /* 14486 * Except for ipif_state_flags and ill_state_flags the other 14487 * fields of the ipif/ill that are modified below are protected 14488 * implicitly since we are a writer. We would have tried to down 14489 * even an ipif that was already down, in ill_down_ipifs. So we 14490 * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
14491 */ 14492 ill_v4 = ill->ill_phyint->phyint_illv4; 14493 ill_v6 = ill->ill_phyint->phyint_illv6; 14494 if (ill_v4 != NULL) { 14495 ill_v4->ill_up_ipifs = B_TRUE; 14496 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14497 ipif = ipif->ipif_next) { 14498 mutex_enter(&ill_v4->ill_lock); 14499 ipif->ipif_state_flags &= ~IPIF_CHANGING; 14500 IPIF_UNMARK_MOVING(ipif); 14501 mutex_exit(&ill_v4->ill_lock); 14502 if (ipif->ipif_was_up) { 14503 if (!(ipif->ipif_flags & IPIF_UP)) 14504 err = ipif_up(ipif, q, mp); 14505 ipif->ipif_was_up = B_FALSE; 14506 if (err != 0) { 14507 /* 14508 * Can there be any other error ? 14509 */ 14510 ASSERT(err == EINPROGRESS); 14511 return (err); 14512 } 14513 } 14514 } 14515 mutex_enter(&ill_v4->ill_lock); 14516 ill_v4->ill_state_flags &= ~ILL_CHANGING; 14517 mutex_exit(&ill_v4->ill_lock); 14518 ill_v4->ill_up_ipifs = B_FALSE; 14519 if (ill_v4->ill_move_in_progress) { 14520 ASSERT(ill_v4->ill_move_peer != NULL); 14521 ill_v4->ill_move_in_progress = B_FALSE; 14522 from_ill = ill_v4->ill_move_peer; 14523 from_ill->ill_move_in_progress = B_FALSE; 14524 from_ill->ill_move_peer = NULL; 14525 mutex_enter(&from_ill->ill_lock); 14526 from_ill->ill_state_flags &= ~ILL_CHANGING; 14527 mutex_exit(&from_ill->ill_lock); 14528 if (ill_v6 == NULL) { 14529 if (from_ill->ill_phyint->phyint_flags & 14530 PHYI_STANDBY) { 14531 phyint_inactive(from_ill->ill_phyint); 14532 } 14533 if (ill_v4->ill_phyint->phyint_flags & 14534 PHYI_STANDBY) { 14535 phyint_inactive(ill_v4->ill_phyint); 14536 } 14537 } 14538 ill_v4->ill_move_peer = NULL; 14539 } 14540 } 14541 14542 if (ill_v6 != NULL) { 14543 ill_v6->ill_up_ipifs = B_TRUE; 14544 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14545 ipif = ipif->ipif_next) { 14546 mutex_enter(&ill_v6->ill_lock); 14547 ipif->ipif_state_flags &= ~IPIF_CHANGING; 14548 IPIF_UNMARK_MOVING(ipif); 14549 mutex_exit(&ill_v6->ill_lock); 14550 if (ipif->ipif_was_up) { 14551 if (!(ipif->ipif_flags & IPIF_UP)) 14552 err = ipif_up(ipif, q, mp); 14553 
ipif->ipif_was_up = B_FALSE; 14554 if (err != 0) { 14555 /* 14556 * Can there be any other error ? 14557 */ 14558 ASSERT(err == EINPROGRESS); 14559 return (err); 14560 } 14561 } 14562 } 14563 mutex_enter(&ill_v6->ill_lock); 14564 ill_v6->ill_state_flags &= ~ILL_CHANGING; 14565 mutex_exit(&ill_v6->ill_lock); 14566 ill_v6->ill_up_ipifs = B_FALSE; 14567 if (ill_v6->ill_move_in_progress) { 14568 ASSERT(ill_v6->ill_move_peer != NULL); 14569 ill_v6->ill_move_in_progress = B_FALSE; 14570 from_ill = ill_v6->ill_move_peer; 14571 from_ill->ill_move_in_progress = B_FALSE; 14572 from_ill->ill_move_peer = NULL; 14573 mutex_enter(&from_ill->ill_lock); 14574 from_ill->ill_state_flags &= ~ILL_CHANGING; 14575 mutex_exit(&from_ill->ill_lock); 14576 if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 14577 phyint_inactive(from_ill->ill_phyint); 14578 } 14579 if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) { 14580 phyint_inactive(ill_v6->ill_phyint); 14581 } 14582 ill_v6->ill_move_peer = NULL; 14583 } 14584 } 14585 return (0); 14586 } 14587 14588 /* 14589 * bring down all the approriate ipifs. 14590 */ 14591 /* ARGSUSED */ 14592 static void 14593 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) 14594 { 14595 ipif_t *ipif; 14596 14597 ASSERT(IAM_WRITER_ILL(ill)); 14598 14599 /* 14600 * Except for ipif_state_flags the other fields of the ipif/ill that 14601 * are modified below are protected implicitly since we are a writer 14602 */ 14603 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14604 if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) 14605 continue; 14606 if (index == 0 || index == ipif->ipif_orig_ifindex) { 14607 /* 14608 * We go through the ipif_down logic even if the ipif 14609 * is already down, since routes can be added based 14610 * on down ipifs. Going through ipif_down once again 14611 * will delete any IREs created based on these routes. 
14612 */ 14613 if (ipif->ipif_flags & IPIF_UP) 14614 ipif->ipif_was_up = B_TRUE; 14615 /* 14616 * If called with chk_nofailover true ipif is moving. 14617 */ 14618 mutex_enter(&ill->ill_lock); 14619 if (chk_nofailover) { 14620 ipif->ipif_state_flags |= 14621 IPIF_MOVING | IPIF_CHANGING; 14622 } else { 14623 ipif->ipif_state_flags |= IPIF_CHANGING; 14624 } 14625 mutex_exit(&ill->ill_lock); 14626 /* 14627 * Need to re-create net/subnet bcast ires if 14628 * they are dependent on ipif. 14629 */ 14630 if (!ipif->ipif_isv6) 14631 ipif_check_bcast_ires(ipif); 14632 (void) ipif_logical_down(ipif, NULL, NULL); 14633 ipif_non_duplicate(ipif); 14634 ipif_down_tail(ipif); 14635 /* 14636 * We don't do ipif_multicast_down for IPv4 in 14637 * ipif_down. We need to set this so that 14638 * ipif_multicast_up will join the 14639 * ALLHOSTS_GROUP on to_ill. 14640 */ 14641 ipif->ipif_multicast_up = B_FALSE; 14642 } 14643 } 14644 } 14645 14646 #define IPSQ_INC_REF(ipsq) { \ 14647 ASSERT(RW_WRITE_HELD(&ill_g_lock)); \ 14648 (ipsq)->ipsq_refs++; \ 14649 } 14650 14651 #define IPSQ_DEC_REF(ipsq) { \ 14652 ASSERT(RW_WRITE_HELD(&ill_g_lock)); \ 14653 (ipsq)->ipsq_refs--; \ 14654 if ((ipsq)->ipsq_refs == 0) \ 14655 (ipsq)->ipsq_name[0] = '\0'; \ 14656 } 14657 14658 /* 14659 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14660 * new_ipsq. 14661 */ 14662 static void 14663 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq) 14664 { 14665 phyint_t *phyint; 14666 phyint_t *next_phyint; 14667 14668 /* 14669 * To change the ipsq of an ill, we need to hold the ill_g_lock as 14670 * writer and the ill_lock of the ill in question. Also the dest 14671 * ipsq can't vanish while we hold the ill_g_lock as writer. 
14672 */ 14673 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14674 14675 phyint = cur_ipsq->ipsq_phyint_list; 14676 cur_ipsq->ipsq_phyint_list = NULL; 14677 while (phyint != NULL) { 14678 next_phyint = phyint->phyint_ipsq_next; 14679 IPSQ_DEC_REF(cur_ipsq); 14680 phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; 14681 new_ipsq->ipsq_phyint_list = phyint; 14682 IPSQ_INC_REF(new_ipsq); 14683 phyint->phyint_ipsq = new_ipsq; 14684 phyint = next_phyint; 14685 } 14686 } 14687 14688 #define SPLIT_SUCCESS 0 14689 #define SPLIT_NOT_NEEDED 1 14690 #define SPLIT_FAILED 2 14691 14692 int 14693 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry) 14694 { 14695 ipsq_t *newipsq = NULL; 14696 14697 /* 14698 * Assertions denote pre-requisites for changing the ipsq of 14699 * a phyint 14700 */ 14701 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14702 /* 14703 * <ill-phyint> assocs can't change while ill_g_lock 14704 * is held as writer. See ill_phyint_reinit() 14705 */ 14706 ASSERT(phyint->phyint_illv4 == NULL || 14707 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14708 ASSERT(phyint->phyint_illv6 == NULL || 14709 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14710 14711 if ((phyint->phyint_groupname_len != 14712 (strlen(cur_ipsq->ipsq_name) + 1) || 14713 bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, 14714 phyint->phyint_groupname_len) != 0)) { 14715 /* 14716 * Once we fail in creating a new ipsq due to memory shortage, 14717 * don't attempt to create new ipsq again, based on another 14718 * phyint, since we want all phyints belonging to an IPMP group 14719 * to be in the same ipsq even in the event of mem alloc fails. 
14720 */ 14721 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, 14722 cur_ipsq); 14723 if (newipsq == NULL) { 14724 /* Memory allocation failure */ 14725 return (SPLIT_FAILED); 14726 } else { 14727 /* ipsq_refs protected by ill_g_lock (writer) */ 14728 IPSQ_DEC_REF(cur_ipsq); 14729 phyint->phyint_ipsq = newipsq; 14730 phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; 14731 newipsq->ipsq_phyint_list = phyint; 14732 IPSQ_INC_REF(newipsq); 14733 return (SPLIT_SUCCESS); 14734 } 14735 } 14736 return (SPLIT_NOT_NEEDED); 14737 } 14738 14739 /* 14740 * The ill locks of the phyint and the ill_g_lock (writer) must be held 14741 * to do this split 14742 */ 14743 static int 14744 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq) 14745 { 14746 ipsq_t *newipsq; 14747 14748 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 14749 /* 14750 * <ill-phyint> assocs can't change while ill_g_lock 14751 * is held as writer. See ill_phyint_reinit() 14752 */ 14753 14754 ASSERT(phyint->phyint_illv4 == NULL || 14755 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14756 ASSERT(phyint->phyint_illv6 == NULL || 14757 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14758 14759 if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 14760 phyint->phyint_illv4: phyint->phyint_illv6)) { 14761 /* 14762 * ipsq_init failed due to no memory 14763 * caller will use the same ipsq 14764 */ 14765 return (SPLIT_FAILED); 14766 } 14767 14768 /* ipsq_ref is protected by ill_g_lock (writer) */ 14769 IPSQ_DEC_REF(cur_ipsq); 14770 14771 /* 14772 * This is a new ipsq that is unknown to the world. 
14773 * So we don't need to hold ipsq_lock, 14774 */ 14775 newipsq = phyint->phyint_ipsq; 14776 newipsq->ipsq_writer = NULL; 14777 newipsq->ipsq_reentry_cnt--; 14778 ASSERT(newipsq->ipsq_reentry_cnt == 0); 14779 #ifdef ILL_DEBUG 14780 newipsq->ipsq_depth = 0; 14781 #endif 14782 14783 return (SPLIT_SUCCESS); 14784 } 14785 14786 /* 14787 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14788 * ipsq's representing their individual groups or themselves. Return 14789 * whether split needs to be retried again later. 14790 */ 14791 static boolean_t 14792 ill_split_ipsq(ipsq_t *cur_ipsq) 14793 { 14794 phyint_t *phyint; 14795 phyint_t *next_phyint; 14796 int error; 14797 boolean_t need_retry = B_FALSE; 14798 14799 phyint = cur_ipsq->ipsq_phyint_list; 14800 cur_ipsq->ipsq_phyint_list = NULL; 14801 while (phyint != NULL) { 14802 next_phyint = phyint->phyint_ipsq_next; 14803 /* 14804 * 'created' will tell us whether the callee actually 14805 * created an ipsq. Lack of memory may force the callee 14806 * to return without creating an ipsq. 14807 */ 14808 if (phyint->phyint_groupname == NULL) { 14809 error = ill_split_to_own_ipsq(phyint, cur_ipsq); 14810 } else { 14811 error = ill_split_to_grp_ipsq(phyint, cur_ipsq, 14812 need_retry); 14813 } 14814 14815 switch (error) { 14816 case SPLIT_FAILED: 14817 need_retry = B_TRUE; 14818 /* FALLTHRU */ 14819 case SPLIT_NOT_NEEDED: 14820 /* 14821 * Keep it on the list. 14822 */ 14823 phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; 14824 cur_ipsq->ipsq_phyint_list = phyint; 14825 break; 14826 case SPLIT_SUCCESS: 14827 break; 14828 default: 14829 ASSERT(0); 14830 } 14831 14832 phyint = next_phyint; 14833 } 14834 return (need_retry); 14835 } 14836 14837 /* 14838 * given an ipsq 'ipsq' lock all ills associated with this ipsq. 14839 * and return the ills in the list. This list will be 14840 * needed to unlock all the ills later on by the caller. 
14841 * The <ill-ipsq> associations could change between the 14842 * lock and unlock. Hence the unlock can't traverse the 14843 * ipsq to get the list of ills. 14844 */ 14845 static int 14846 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) 14847 { 14848 int cnt = 0; 14849 phyint_t *phyint; 14850 14851 /* 14852 * The caller holds ill_g_lock to ensure that the ill memberships 14853 * of the ipsq don't change 14854 */ 14855 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 14856 14857 phyint = ipsq->ipsq_phyint_list; 14858 while (phyint != NULL) { 14859 if (phyint->phyint_illv4 != NULL) { 14860 ASSERT(cnt < list_max); 14861 list[cnt++] = phyint->phyint_illv4; 14862 } 14863 if (phyint->phyint_illv6 != NULL) { 14864 ASSERT(cnt < list_max); 14865 list[cnt++] = phyint->phyint_illv6; 14866 } 14867 phyint = phyint->phyint_ipsq_next; 14868 } 14869 ill_lock_ills(list, cnt); 14870 return (cnt); 14871 } 14872 14873 void 14874 ill_lock_ills(ill_t **list, int cnt) 14875 { 14876 int i; 14877 14878 if (cnt > 1) { 14879 boolean_t try_again; 14880 do { 14881 try_again = B_FALSE; 14882 for (i = 0; i < cnt - 1; i++) { 14883 if (list[i] < list[i + 1]) { 14884 ill_t *tmp; 14885 14886 /* swap the elements */ 14887 tmp = list[i]; 14888 list[i] = list[i + 1]; 14889 list[i + 1] = tmp; 14890 try_again = B_TRUE; 14891 } 14892 } 14893 } while (try_again); 14894 } 14895 14896 for (i = 0; i < cnt; i++) { 14897 if (i == 0) { 14898 if (list[i] != NULL) 14899 mutex_enter(&list[i]->ill_lock); 14900 else 14901 return; 14902 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14903 mutex_enter(&list[i]->ill_lock); 14904 } 14905 } 14906 } 14907 14908 void 14909 ill_unlock_ills(ill_t **list, int cnt) 14910 { 14911 int i; 14912 14913 for (i = 0; i < cnt; i++) { 14914 if ((i == 0) && (list[i] != NULL)) { 14915 mutex_exit(&list[i]->ill_lock); 14916 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14917 mutex_exit(&list[i]->ill_lock); 14918 } 14919 } 14920 } 14921 14922 /* 14923 * Merge all the 
ills from 1 ipsq group into another ipsq group.
 * The source ipsq group is specified by the ipsq associated with
 * 'from_ill'. The destination ipsq group is specified by the ipsq
 * associated with 'to_ill' or 'groupname' respectively.
 * Note that ipsq itself does not have a reference count mechanism
 * and functions don't look up an ipsq and pass it around. Instead
 * functions pass around an ill or groupname, and the ipsq is looked
 * up from the ill or groupname and the required operation performed
 * atomically with the lookup on the ipsq.
 *
 * 'mp' and 'q' are handed to ill_move_to_new_ipsq() unchanged.
 *
 * Returns 0 when source and destination already share an ipsq, ENOMEM
 * when the temporary ill list cannot be allocated, and EINPROGRESS once
 * the merge has been carried out.
 */
static int
ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp,
    queue_t *q)
{
	ipsq_t *old_ipsq;
	ipsq_t *new_ipsq;
	ill_t **ill_list;
	int cnt;
	size_t ill_list_size;
	boolean_t became_writer_on_new_sq = B_FALSE;

	/* Exactly 1 of 'to_ill' and groupname can be specified. */
	ASSERT((to_ill != NULL) ^ (groupname != NULL));

	/*
	 * Need to hold ill_g_lock as writer and also the ill_lock to
	 * change the <ill-ipsq> assoc of an ill. Need to hold the
	 * ipsq_lock to prevent new messages from landing on an ipsq.
	 */
	rw_enter(&ill_g_lock, RW_WRITER);

	old_ipsq = from_ill->ill_phyint->phyint_ipsq;
	if (groupname != NULL)
		new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL);
	else {
		new_ipsq = to_ill->ill_phyint->phyint_ipsq;
	}

	ASSERT(old_ipsq != NULL && new_ipsq != NULL);

	/*
	 * both groups are on the same ipsq.
	 */
	if (old_ipsq == new_ipsq) {
		rw_exit(&ill_g_lock);
		return (0);
	}

	/*
	 * Size the scratch list at two slots per ipsq reference:
	 * ill_lock_ipsq_ills() stores up to two ills (v4 and v6) per phyint.
	 * NOTE(review): presumably ipsq_refs counts one reference per
	 * phyint on the ipsq -- confirm against IPSQ_INC_REF users.
	 * KM_NOSLEEP is used since ill_g_lock is held as writer here.
	 */
	cnt = old_ipsq->ipsq_refs << 1;
	ill_list_size = cnt * sizeof (ill_t *);
	ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP);
	if (ill_list == NULL) {
		rw_exit(&ill_g_lock);
		return (ENOMEM);
	}
	/* From here on 'cnt' is the number of ills actually stored. */
	cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt);

	/* Need ipsq lock to enqueue messages on new ipsq or to become writer */
	mutex_enter(&new_ipsq->ipsq_lock);
	if ((new_ipsq->ipsq_writer == NULL &&
	    new_ipsq->ipsq_current_ipif == NULL) ||
	    (new_ipsq->ipsq_writer == curthread)) {
		/* Idle, or already ours: (re)enter as writer. */
		new_ipsq->ipsq_writer = curthread;
		new_ipsq->ipsq_reentry_cnt++;
		became_writer_on_new_sq = B_TRUE;
	}

	/*
	 * We are holding ill_g_lock as writer and all the ill locks of
	 * the old ipsq. So the old_ipsq can't be looked up, and hence no new
	 * message can land up on the old ipsq even though we don't hold the
	 * ipsq_lock of the old_ipsq. Now move all messages to the newipsq.
	 */
	ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q);

	/*
	 * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'.
	 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq>
	 * assocs. till we release the ill_g_lock, and hence it can't vanish.
	 */
	ill_merge_ipsq(old_ipsq, new_ipsq);

	/*
	 * Mark the new ipsq as needing a split since it is currently
	 * being shared by more than 1 IPMP group. The split will
	 * occur at the end of ipsq_exit
	 */
	new_ipsq->ipsq_split = B_TRUE;

	/* Now release all the locks */
	mutex_exit(&new_ipsq->ipsq_lock);
	ill_unlock_ills(ill_list, cnt);
	rw_exit(&ill_g_lock);

	/* Free with the allocation size, not the (possibly smaller) cnt. */
	kmem_free(ill_list, ill_list_size);

	/*
	 * If we succeeded in becoming writer on the new ipsq, then
	 * drain the new ipsq and start processing all enqueued messages
	 * including the current ioctl we are processing which is either
	 * a set groupname or failover/failback.
	 */
	if (became_writer_on_new_sq)
		ipsq_exit(new_ipsq, B_TRUE, B_TRUE);

	/*
	 * syncq has been changed and all the messages have been moved.
	 */
	mutex_enter(&old_ipsq->ipsq_lock);
	old_ipsq->ipsq_current_ipif = NULL;
	old_ipsq->ipsq_current_ioctl = 0;
	mutex_exit(&old_ipsq->ipsq_lock);
	return (EINPROGRESS);
}

/*
 * Delete and add the loopback copy and non-loopback copy of
 * the BROADCAST ire corresponding to ill and addr. Used to
 * group broadcast ires together when ill becomes part of
 * a group.
 *
 * This function is also called when ill is leaving the group
 * so that the ires belonging to the group gets re-grouped.
 *
 * Works in two passes: first every matching ire is cloned and deleted,
 * with the clones kept on a private list; then each clone is handed to
 * ire_add().
 */
static void
ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr)
{
	ire_t *ire, *nire, *nire_next, *ire_head = NULL;
	ire_t **ire_ptpn = &ire_head;	/* tail link of the private list */

	/*
	 * The loopback and non-loopback IREs are inserted in the order in which
	 * they're found, on the basis that they are correctly ordered (loopback
	 * first).
	 */
	for (;;) {
		/* Each pass deletes the ire it found, so this terminates. */
		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
		    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);
		if (ire == NULL)
			break;

		/*
		 * we are passing in KM_SLEEP because it is not easy to
		 * go back to a sane state in case of memory failure.
		 */
		nire = kmem_cache_alloc(ire_cache, KM_SLEEP);
		ASSERT(nire != NULL);
		bzero(nire, sizeof (ire_t));
		/*
		 * Don't use ire_max_frag directly since we don't
		 * hold on to 'ire' until we add the new ire 'nire' and
		 * we don't want the new ire to have a dangling reference
		 * to 'ire'.  The ire_max_frag of a broadcast ire must
		 * be in sync with the ipif_mtu of the associated ipif.
		 * For eg. this happens as a result of SIOCSLIFNAME,
		 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by
		 * the driver. A change in ire_max_frag triggered as
		 * a result of path mtu discovery, or due to an
		 * IP_IOC_IRE_ADVISE_NOREPLY from the transport or due to a
		 * route change -mtu command does not apply to broadcast ires.
		 *
		 * XXX We need a recovery strategy here if ire_init fails
		 */
		if (ire_init(nire,
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ire->ire_mask,
		    (uchar_t *)&ire->ire_src_addr,
		    (uchar_t *)&ire->ire_gateway_addr,
		    (uchar_t *)&ire->ire_in_src_addr,
		    ire->ire_stq == NULL ? &ip_loopback_mtu :
			&ire->ire_ipif->ipif_mtu,
		    (ire->ire_nce != NULL ? ire->ire_nce->nce_fp_mp : NULL),
		    ire->ire_rfq,
		    ire->ire_stq,
		    ire->ire_type,
		    (ire->ire_nce != NULL? ire->ire_nce->nce_res_mp : NULL),
		    ire->ire_ipif,
		    ire->ire_in_ill,
		    ire->ire_cmask,
		    ire->ire_phandle,
		    ire->ire_ihandle,
		    ire->ire_flags,
		    &ire->ire_uinfo,
		    NULL,
		    NULL) == NULL) {
			cmn_err(CE_PANIC, "ire_init() failed");
		}
		ire_delete(ire);
		/* Drop the hold taken by the lookup above. */
		ire_refrele(ire);

		/*
		 * The newly created IREs are inserted at the tail of the list
		 * starting with ire_head. As we've just allocated them no one
		 * knows about them so it's safe.
		 */
		*ire_ptpn = nire;
		ire_ptpn = &nire->ire_next;
	}

	for (nire = ire_head; nire != NULL; nire = nire_next) {
		int error;
		ire_t *oire;
		/* unlink the IRE from our list before calling ire_add() */
		nire_next = nire->ire_next;
		nire->ire_next = NULL;

		/* ire_add adds the ire at the right place in the list */
		oire = nire;
		error = ire_add(&nire, NULL, NULL, NULL, B_FALSE);
		/* NOTE(review): failure is only caught on DEBUG kernels. */
		ASSERT(error == 0);
		ASSERT(oire == nire);
		ire_refrele(nire);	/* Held in ire_add */
	}
}

/*
 * This function is usually called when an ill is inserted in
 * a group and all the ipifs are already UP. As all the ipifs
 * are already UP, the broadcast ires have already been created
 * and been inserted. But, ire_add_v4 would not have grouped properly.
 * We need to re-group for the benefit of ip_wput_ire which
 * expects BROADCAST ires to be grouped properly to avoid sending
 * more than one copy of the broadcast packet per group.
 *
 * NOTE : We don't check for ill_ipif_up_count to be non-zero here
 *	  because when ipif_up_done ends up calling this, ires have
 *	  already been added before illgrp_insert i.e before ill_group
 *	  has been initialized.
15151 */ 15152 static void 15153 ill_group_bcast_for_xmit(ill_t *ill) 15154 { 15155 ill_group_t *illgrp; 15156 ipif_t *ipif; 15157 ipaddr_t addr; 15158 ipaddr_t net_mask; 15159 ipaddr_t subnet_netmask; 15160 15161 illgrp = ill->ill_group; 15162 15163 /* 15164 * This function is called even when an ill is deleted from 15165 * the group. Hence, illgrp could be null. 15166 */ 15167 if (illgrp != NULL && illgrp->illgrp_ill_count == 1) 15168 return; 15169 15170 /* 15171 * Delete all the BROADCAST ires matching this ill and add 15172 * them back. This time, ire_add_v4 should take care of 15173 * grouping them with others because ill is part of the 15174 * group. 15175 */ 15176 ill_bcast_delete_and_add(ill, 0); 15177 ill_bcast_delete_and_add(ill, INADDR_BROADCAST); 15178 15179 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15180 15181 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15182 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15183 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15184 } else { 15185 net_mask = htonl(IN_CLASSA_NET); 15186 } 15187 addr = net_mask & ipif->ipif_subnet; 15188 ill_bcast_delete_and_add(ill, addr); 15189 ill_bcast_delete_and_add(ill, ~net_mask | addr); 15190 15191 subnet_netmask = ipif->ipif_net_mask; 15192 addr = ipif->ipif_subnet; 15193 ill_bcast_delete_and_add(ill, addr); 15194 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); 15195 } 15196 } 15197 15198 /* 15199 * This function is called from illgrp_delete when ill is being deleted 15200 * from the group. 15201 * 15202 * As ill is not there in the group anymore, any address belonging 15203 * to this ill should be cleared of IRE_MARK_NORECV. 
*/
static void
ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr)
{
	ire_t	*ire;
	irb_t	*irb;

	ASSERT(ill->ill_group == NULL);

	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif,
	    ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL);

	if (ire != NULL) {
		/*
		 * IPMP and plumbing operations are serialized on the ipsq, so
		 * no one will insert or delete a broadcast ire under our feet.
		 */
		irb = ire->ire_bucket;
		rw_enter(&irb->irb_lock, RW_READER);
		/*
		 * The lookup's hold is dropped here; the walk below relies
		 * on the bucket lock and the serialization noted above.
		 */
		ire_refrele(ire);

		/* Walk forward from the ire found, while the address matches. */
		for (; ire != NULL; ire = ire->ire_next) {
			if (ire->ire_addr != addr)
				break;
			if (ire_to_ill(ire) != ill)
				continue;

			ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED));
			ire->ire_marks &= ~IRE_MARK_NORECV;
		}
		rw_exit(&irb->irb_lock);
	}
}

/*
 * This function must be called only after the broadcast ires
 * have been grouped together. For a given address addr, nominate
 * only one of the ires whose interface is not FAILED or OFFLINE.
 *
 * This is also called when an ipif goes down, so that we can nominate
 * a different ire with the same address for receiving.
 *
 * The nominated ire has IRE_MARK_NORECV cleared, every other ire of the
 * group for 'addr' has it set.  If the nominated ire is not already the
 * first one of the group in the bucket, it is condemned and a copy of it
 * is inserted at the front of the group's run instead.
 */
static void
ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr)
{
	irb_t *irb;
	ire_t *ire;
	ire_t *ire1;
	ire_t *save_ire;		/* ire held by the initial lookup */
	ire_t **irep = NULL;		/* insertion point: front of the run */
	boolean_t first = B_TRUE;	/* nobody nominated yet */
	ire_t *clear_ire = NULL;	/* the nominated ire */
	ire_t *start_ire = NULL;	/* where to restart for the redo pass */
	ire_t *new_lb_ire;		/* preallocated copy of clear_ire */
	ire_t *new_nlb_ire;		/* preallocated copy of its stq twin */
	boolean_t new_lb_ire_used = B_FALSE;
	boolean_t new_nlb_ire_used = B_FALSE;
	uint64_t match_flags;		/* phyint flags that disqualify */
	uint64_t phyi_flags;
	boolean_t fallback = B_FALSE;	/* saw an INACTIVE, non-FAILED ire */

	ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_TYPE);
	/*
	 * We may not be able to find some ires if a previous
	 * ire_create failed. This happens when an ipif goes
	 * down and we are unable to create BROADCAST ires due
	 * to memory failure. Thus, we have to check for NULL
	 * below. This should handle the case for LOOPBACK,
	 * POINTOPOINT and interfaces with some POINTOPOINT
	 * logicals for which there are no BROADCAST ires.
	 */
	if (ire == NULL)
		return;
	/*
	 * Currently IRE_BROADCASTS are deleted when an ipif
	 * goes down which runs exclusively. Thus, setting
	 * IRE_MARK_NORECV should not race with ire_delete marking
	 * IRE_MARK_CONDEMNED. We grab the lock below just to
	 * be consistent with other parts of the code that walks
	 * a given bucket.
	 */
	save_ire = ire;
	irb = ire->ire_bucket;
	/*
	 * Allocate both replacement ires up front, before the bucket lock
	 * is taken; give up on the nomination if either allocation fails.
	 */
	new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
	if (new_lb_ire == NULL) {
		ire_refrele(ire);
		return;
	}
	new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
	if (new_nlb_ire == NULL) {
		ire_refrele(ire);
		kmem_cache_free(ire_cache, new_lb_ire);
		return;
	}
	IRB_REFHOLD(irb);
	rw_enter(&irb->irb_lock, RW_WRITER);
	/*
	 * Get to the first ire matching the address and the
	 * group. If the address does not match we are done
	 * as we could not find the IRE. If the address matches
	 * we should get to the first one matching the group.
	 */
	while (ire != NULL) {
		if (ire->ire_addr != addr ||
		    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
			break;
		}
		ire = ire->ire_next;
	}
	/* First pass: neither FAILED nor INACTIVE interfaces qualify. */
	match_flags = PHYI_FAILED | PHYI_INACTIVE;
	start_ire = ire;
redo:
	while (ire != NULL && ire->ire_addr == addr &&
	    ire->ire_ipif->ipif_ill->ill_group == illgrp) {
		/*
		 * The first ire for any address within a group
		 * should always be the one with IRE_MARK_NORECV cleared
		 * so that ip_wput_ire can avoid searching for one.
		 * Note down the insertion point which will be used
		 * later.
		 */
		if (first && (irep == NULL))
			irep = ire->ire_ptpn;
		/*
		 * PHYI_FAILED is set when the interface fails.
		 * This interface might have become good, but the
		 * daemon has not yet detected. We should still
		 * not receive on this. PHYI_OFFLINE should never
		 * be picked as this has been offlined and soon
		 * be removed.
		 */
		phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags;
		if (phyi_flags & PHYI_OFFLINE) {
			ire->ire_marks |= IRE_MARK_NORECV;
			ire = ire->ire_next;
			continue;
		}
		if (phyi_flags & match_flags) {
			ire->ire_marks |= IRE_MARK_NORECV;
			ire = ire->ire_next;
			/* INACTIVE but not FAILED: usable on a second pass. */
			if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
			    PHYI_INACTIVE) {
				fallback = B_TRUE;
			}
			continue;
		}
		if (first) {
			/*
			 * We will move this to the front of the list later
			 * on.
			 */
			clear_ire = ire;
			ire->ire_marks &= ~IRE_MARK_NORECV;
		} else {
			ire->ire_marks |= IRE_MARK_NORECV;
		}
		first = B_FALSE;
		ire = ire->ire_next;
	}
	/*
	 * If we never nominated anybody, try nominating at least
	 * an INACTIVE, if we found one. Do it only once though.
	 * (match_flags is narrowed to PHYI_FAILED, so the test below
	 * cannot succeed a second time.)
	 */
	if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) &&
	    fallback) {
		match_flags = PHYI_FAILED;
		ire = start_ire;
		irep = NULL;
		goto redo;
	}
	ire_refrele(save_ire);

	/*
	 * irep non-NULL indicates that we entered the while loop
	 * above. If clear_ire is at the insertion point, we don't
	 * have to do anything. clear_ire will be NULL if all the
	 * interfaces are failed.
	 *
	 * We cannot unlink and reinsert the ire at the right place
	 * in the list since there can be other walkers of this bucket.
	 * Instead we delete and recreate the ire
	 */
	if (clear_ire != NULL && irep != NULL && *irep != clear_ire) {
		ire_t *clear_ire_stq = NULL;
		mblk_t *fp_mp = NULL, *res_mp = NULL;

		bzero(new_lb_ire, sizeof (ire_t));
		if (clear_ire->ire_nce != NULL) {
			fp_mp = clear_ire->ire_nce->nce_fp_mp;
			res_mp = clear_ire->ire_nce->nce_res_mp;
		}
		/* XXX We need a recovery strategy here. */
		if (ire_init(new_lb_ire,
		    (uchar_t *)&clear_ire->ire_addr,
		    (uchar_t *)&clear_ire->ire_mask,
		    (uchar_t *)&clear_ire->ire_src_addr,
		    (uchar_t *)&clear_ire->ire_gateway_addr,
		    (uchar_t *)&clear_ire->ire_in_src_addr,
		    &clear_ire->ire_max_frag,
		    fp_mp,
		    clear_ire->ire_rfq,
		    clear_ire->ire_stq,
		    clear_ire->ire_type,
		    res_mp,
		    clear_ire->ire_ipif,
		    clear_ire->ire_in_ill,
		    clear_ire->ire_cmask,
		    clear_ire->ire_phandle,
		    clear_ire->ire_ihandle,
		    clear_ire->ire_flags,
		    &clear_ire->ire_uinfo,
		    NULL,
		    NULL) == NULL)
			cmn_err(CE_PANIC, "ire_init() failed");
		/*
		 * If the nominated ire has no stq and the very next ire is
		 * its counterpart with an stq (same address, same ill), the
		 * pair is moved together.
		 * NOTE(review): presumably ire_stq == NULL identifies the
		 * loopback copy -- confirm.
		 */
		if (clear_ire->ire_stq == NULL) {
			ire_t *ire_next = clear_ire->ire_next;
			if (ire_next != NULL &&
			    ire_next->ire_stq != NULL &&
			    ire_next->ire_addr == clear_ire->ire_addr &&
			    ire_next->ire_ipif->ipif_ill ==
			    clear_ire->ire_ipif->ipif_ill) {
				clear_ire_stq = ire_next;

				bzero(new_nlb_ire, sizeof (ire_t));
				if (clear_ire_stq->ire_nce != NULL) {
					fp_mp =
					    clear_ire_stq->ire_nce->nce_fp_mp;
					res_mp =
					    clear_ire_stq->ire_nce->nce_res_mp;
				} else {
					fp_mp = res_mp = NULL;
				}
				/* XXX We need a recovery strategy here. */
				if (ire_init(new_nlb_ire,
				    (uchar_t *)&clear_ire_stq->ire_addr,
				    (uchar_t *)&clear_ire_stq->ire_mask,
				    (uchar_t *)&clear_ire_stq->ire_src_addr,
				    (uchar_t *)&clear_ire_stq->ire_gateway_addr,
				    (uchar_t *)&clear_ire_stq->ire_in_src_addr,
				    &clear_ire_stq->ire_max_frag,
				    fp_mp,
				    clear_ire_stq->ire_rfq,
				    clear_ire_stq->ire_stq,
				    clear_ire_stq->ire_type,
				    res_mp,
				    clear_ire_stq->ire_ipif,
				    clear_ire_stq->ire_in_ill,
				    clear_ire_stq->ire_cmask,
				    clear_ire_stq->ire_phandle,
				    clear_ire_stq->ire_ihandle,
				    clear_ire_stq->ire_flags,
				    &clear_ire_stq->ire_uinfo,
				    NULL,
				    NULL) == NULL)
					cmn_err(CE_PANIC, "ire_init() failed");
			}
		}

		/*
		 * Delete the ire. We can't call ire_delete() since
		 * we are holding the bucket lock. We can't release the
		 * bucket lock since we can't allow irep to change. So just
		 * mark it CONDEMNED. The IRB_REFRELE will delete the
		 * ire from the list and do the refrele.
		 */
		clear_ire->ire_marks |= IRE_MARK_CONDEMNED;
		irb->irb_marks |= IRB_MARK_CONDEMNED;

		if (clear_ire_stq != NULL) {
			ire_fastpath_list_delete(
			    (ill_t *)clear_ire_stq->ire_stq->q_ptr,
			    clear_ire_stq);
			clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED;
		}

		/*
		 * Also take care of other fields like ib/ob pkt count
		 * etc. Need to dup them. ditto in ill_bcast_delete_and_add
		 */

		/* Add the new ire's. Insert at *irep */
		new_lb_ire->ire_bucket = clear_ire->ire_bucket;
		ire1 = *irep;
		if (ire1 != NULL)
			ire1->ire_ptpn = &new_lb_ire->ire_next;
		new_lb_ire->ire_next = ire1;
		/* Link the new one in. */
		new_lb_ire->ire_ptpn = irep;
		/* Make the new ire's fields visible before publishing it. */
		membar_producer();
		*irep = new_lb_ire;
		new_lb_ire_used = B_TRUE;
		BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
		new_lb_ire->ire_bucket->irb_ire_cnt++;
		new_lb_ire->ire_ipif->ipif_ire_cnt++;

		if (clear_ire_stq != NULL) {
			new_nlb_ire->ire_bucket = clear_ire->ire_bucket;
			/* The twin goes immediately after new_lb_ire. */
			irep = &new_lb_ire->ire_next;
			/* Add the new ire. Insert at *irep */
			ire1 = *irep;
			if (ire1 != NULL)
				ire1->ire_ptpn = &new_nlb_ire->ire_next;
			new_nlb_ire->ire_next = ire1;
			/* Link the new one in. */
			new_nlb_ire->ire_ptpn = irep;
			membar_producer();
			*irep = new_nlb_ire;
			new_nlb_ire_used = B_TRUE;
			BUMP_IRE_STATS(ire_stats_v4, ire_stats_inserted);
			new_nlb_ire->ire_bucket->irb_ire_cnt++;
			new_nlb_ire->ire_ipif->ipif_ire_cnt++;
			((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++;
		}
	}
	rw_exit(&irb->irb_lock);
	/* Return whichever preallocated ires were not linked in. */
	if (!new_lb_ire_used)
		kmem_cache_free(ire_cache, new_lb_ire);
	if (!new_nlb_ire_used)
		kmem_cache_free(ire_cache, new_nlb_ire);
	IRB_REFRELE(irb);
}

/*
 * Whenever an ipif goes down we have to renominate a different
 * broadcast ire to receive. Whenever an ipif comes up, we need
 * to make sure that we have only one nominated to receive.
 */
static void
ipif_renominate_bcast(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	ipaddr_t subnet_addr;
	ipaddr_t net_addr;
	ipaddr_t net_mask = 0;
	ipaddr_t subnet_netmask;
	ipaddr_t addr;
	ill_group_t *illgrp;

	illgrp = ill->ill_group;
	/*
	 * If this is the last ipif going down, it might take
	 * the ill out of the group. In that case ipif_down ->
	 * illgrp_delete takes care of doing the nomination.
	 * ipif_down does not call for this case.
	 */
	ASSERT(illgrp != NULL);

	/* There could not have been any ires associated with this */
	if (ipif->ipif_subnet == 0)
		return;

	ill_mark_bcast(illgrp, 0);
	ill_mark_bcast(illgrp, INADDR_BROADCAST);

	/* Same mask selection as in ill_group_bcast_for_xmit(). */
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET);
	}
	/* Net-level all-zeros and all-ones broadcast addresses. */
	addr = net_mask & ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	net_addr = ~net_mask | addr;
	ill_mark_bcast(illgrp, net_addr);

	/* Subnet-level all-zeros and all-ones broadcast addresses. */
	subnet_netmask = ipif->ipif_net_mask;
	addr = ipif->ipif_subnet;
	ill_mark_bcast(illgrp, addr);

	subnet_addr = ~subnet_netmask | addr;
	ill_mark_bcast(illgrp, subnet_addr);
}

/*
 * Whenever we form or delete ill groups, we need to nominate one set of
 * BROADCAST ires for receiving in the group.
 *
 * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires
 *    have been added, but ill_ipif_up_count is 0. Thus, we don't assert
 *    for ill_ipif_up_count to be non-zero. This is the only case where
 *    ill_ipif_up_count is zero and we would still find the ires.
 *
 * 2) ip_sioctl_group_name/ifgrp_insert calls this function, at least one
 *    ipif is UP and we just have to do the nomination.
 *
 * 3) When ill_handoff_responsibility calls us, some ill has been removed
 *    from the group. So, we have to do the nomination.
 *
 * Because of (3), there could be just one ill in the group. But we have
 * to nominate still as IRE_MARK_NORECV may have been marked on this.
 * Thus, this function does not optimize when there is only one ill as
 * it is not correct for (3).
15598 */ 15599 static void 15600 ill_nominate_bcast_rcv(ill_group_t *illgrp) 15601 { 15602 ill_t *ill; 15603 ipif_t *ipif; 15604 ipaddr_t subnet_addr; 15605 ipaddr_t prev_subnet_addr = 0; 15606 ipaddr_t net_addr; 15607 ipaddr_t prev_net_addr = 0; 15608 ipaddr_t net_mask = 0; 15609 ipaddr_t subnet_netmask; 15610 ipaddr_t addr; 15611 15612 /* 15613 * When the last memeber is leaving, there is nothing to 15614 * nominate. 15615 */ 15616 if (illgrp->illgrp_ill_count == 0) { 15617 ASSERT(illgrp->illgrp_ill == NULL); 15618 return; 15619 } 15620 15621 ill = illgrp->illgrp_ill; 15622 ASSERT(!ill->ill_isv6); 15623 /* 15624 * We assume that ires with same address and belonging to the 15625 * same group, has been grouped together. Nominating a *single* 15626 * ill in the group for sending and receiving broadcast is done 15627 * by making sure that the first BROADCAST ire (which will be 15628 * the one returned by ire_ctable_lookup for ip_rput and the 15629 * one that will be used in ip_wput_ire) will be the one that 15630 * will not have IRE_MARK_NORECV set. 15631 * 15632 * 1) ip_rput checks and discards packets received on ires marked 15633 * with IRE_MARK_NORECV. Thus, we don't send up duplicate 15634 * broadcast packets. We need to clear IRE_MARK_NORECV on the 15635 * first ire in the group for every broadcast address in the group. 15636 * ip_rput will accept packets only on the first ire i.e only 15637 * one copy of the ill. 15638 * 15639 * 2) ip_wput_ire needs to send out just one copy of the broadcast 15640 * packet for the whole group. It needs to send out on the ill 15641 * whose ire has not been marked with IRE_MARK_NORECV. If it sends 15642 * on the one marked with IRE_MARK_NORECV, ip_rput will accept 15643 * the copy echoed back on other port where the ire is not marked 15644 * with IRE_MARK_NORECV. 
15645 * 15646 * Note that we just need to have the first IRE either loopback or 15647 * non-loopback (either of them may not exist if ire_create failed 15648 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will 15649 * always hit the first one and hence will always accept one copy. 15650 * 15651 * We have a broadcast ire per ill for all the unique prefixes 15652 * hosted on that ill. As we don't have a way of knowing the 15653 * unique prefixes on a given ill and hence in the whole group, 15654 * we just call ill_mark_bcast on all the prefixes that exist 15655 * in the group. For the common case of one prefix, the code 15656 * below optimizes by remebering the last address used for 15657 * markng. In the case of multiple prefixes, this will still 15658 * optimize depending the order of prefixes. 15659 * 15660 * The only unique address across the whole group is 0.0.0.0 and 15661 * 255.255.255.255 and thus we call only once. ill_mark_bcast enables 15662 * the first ire in the bucket for receiving and disables the 15663 * others. 
15664 */ 15665 ill_mark_bcast(illgrp, 0); 15666 ill_mark_bcast(illgrp, INADDR_BROADCAST); 15667 for (; ill != NULL; ill = ill->ill_group_next) { 15668 15669 for (ipif = ill->ill_ipif; ipif != NULL; 15670 ipif = ipif->ipif_next) { 15671 15672 if (!(ipif->ipif_flags & IPIF_UP) || 15673 ipif->ipif_subnet == 0) { 15674 continue; 15675 } 15676 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15677 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15678 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15679 } else { 15680 net_mask = htonl(IN_CLASSA_NET); 15681 } 15682 addr = net_mask & ipif->ipif_subnet; 15683 if (prev_net_addr == 0 || prev_net_addr != addr) { 15684 ill_mark_bcast(illgrp, addr); 15685 net_addr = ~net_mask | addr; 15686 ill_mark_bcast(illgrp, net_addr); 15687 } 15688 prev_net_addr = addr; 15689 15690 subnet_netmask = ipif->ipif_net_mask; 15691 addr = ipif->ipif_subnet; 15692 if (prev_subnet_addr == 0 || 15693 prev_subnet_addr != addr) { 15694 ill_mark_bcast(illgrp, addr); 15695 subnet_addr = ~subnet_netmask | addr; 15696 ill_mark_bcast(illgrp, subnet_addr); 15697 } 15698 prev_subnet_addr = addr; 15699 } 15700 } 15701 } 15702 15703 /* 15704 * This function is called while forming ill groups. 15705 * 15706 * Currently, we handle only allmulti groups. We want to join 15707 * allmulti on only one of the ills in the groups. In future, 15708 * when we have link aggregation, we may have to join normal 15709 * multicast groups on multiple ills as switch does inbound load 15710 * balancing. Following are the functions that calls this 15711 * function : 15712 * 15713 * 1) ill_recover_multicast : Interface is coming back UP. 15714 * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 15715 * will call ill_recover_multicast to recover all the multicast 15716 * groups. We need to make sure that only one member is joined 15717 * in the ill group. 15718 * 15719 * 2) ip_addmulti/ip_addmulti_v6 : ill groups has already been formed. 15720 * Somebody is joining allmulti. 
We need to make sure that only one 15721 * member is joined in the group. 15722 * 15723 * 3) illgrp_insert : If allmulti has already joined, we need to make 15724 * sure that only one member is joined in the group. 15725 * 15726 * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving 15727 * allmulti who we have nominated. We need to pick someother ill. 15728 * 15729 * 5) illgrp_delete : The ill we nominated is leaving the group, 15730 * we need to pick a new ill to join the group. 15731 * 15732 * For (1), (2), (5) - we just have to check whether there is 15733 * a good ill joined in the group. If we could not find any ills 15734 * joined the group, we should join. 15735 * 15736 * For (4), the one that was nominated to receive, left the group. 15737 * There could be nobody joined in the group when this function is 15738 * called. 15739 * 15740 * For (3) - we need to explicitly check whether there are multiple 15741 * ills joined in the group. 15742 * 15743 * For simplicity, we don't differentiate any of the above cases. We 15744 * just leave the group if it is joined on any of them and join on 15745 * the first good ill. 15746 */ 15747 int 15748 ill_nominate_mcast_rcv(ill_group_t *illgrp) 15749 { 15750 ilm_t *ilm; 15751 ill_t *ill; 15752 ill_t *fallback_inactive_ill = NULL; 15753 ill_t *fallback_failed_ill = NULL; 15754 int ret = 0; 15755 15756 /* 15757 * Leave the allmulti on all the ills and start fresh. 15758 */ 15759 for (ill = illgrp->illgrp_ill; ill != NULL; 15760 ill = ill->ill_group_next) { 15761 if (ill->ill_join_allmulti) 15762 (void) ip_leave_allmulti(ill->ill_ipif); 15763 } 15764 15765 /* 15766 * Choose a good ill. Fallback to inactive or failed if 15767 * none available. We need to fallback to FAILED in the 15768 * case where we have 2 interfaces in a group - where 15769 * one of them is failed and another is a good one and 15770 * the good one (not marked inactive) is leaving the group. 
15771 */ 15772 ret = 0; 15773 for (ill = illgrp->illgrp_ill; ill != NULL; 15774 ill = ill->ill_group_next) { 15775 /* Never pick an offline interface */ 15776 if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) 15777 continue; 15778 15779 if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { 15780 fallback_failed_ill = ill; 15781 continue; 15782 } 15783 if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { 15784 fallback_inactive_ill = ill; 15785 continue; 15786 } 15787 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15788 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15789 ret = ip_join_allmulti(ill->ill_ipif); 15790 /* 15791 * ip_join_allmulti can fail because of memory 15792 * failures. So, make sure we join at least 15793 * on one ill. 15794 */ 15795 if (ill->ill_join_allmulti) 15796 return (0); 15797 } 15798 } 15799 } 15800 if (ret != 0) { 15801 /* 15802 * If we tried nominating above and failed to do so, 15803 * return error. We might have tried multiple times. 15804 * But, return the latest error. 15805 */ 15806 return (ret); 15807 } 15808 if ((ill = fallback_inactive_ill) != NULL) { 15809 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15810 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15811 ret = ip_join_allmulti(ill->ill_ipif); 15812 return (ret); 15813 } 15814 } 15815 } else if ((ill = fallback_failed_ill) != NULL) { 15816 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15817 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15818 ret = ip_join_allmulti(ill->ill_ipif); 15819 return (ret); 15820 } 15821 } 15822 } 15823 return (0); 15824 } 15825 15826 /* 15827 * This function is called from illgrp_delete after it is 15828 * deleted from the group to reschedule responsibilities 15829 * to a different ill. 
15830 */ 15831 static void 15832 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 15833 { 15834 ilm_t *ilm; 15835 ipif_t *ipif; 15836 ipaddr_t subnet_addr; 15837 ipaddr_t net_addr; 15838 ipaddr_t net_mask = 0; 15839 ipaddr_t subnet_netmask; 15840 ipaddr_t addr; 15841 15842 ASSERT(ill->ill_group == NULL); 15843 /* 15844 * Broadcast Responsibility: 15845 * 15846 * 1. If this ill has been nominated for receiving broadcast 15847 * packets, we need to find a new one. Before we find a new 15848 * one, we need to re-group the ires that are part of this new 15849 * group (assumed by ill_nominate_bcast_rcv). We do this by 15850 * calling ill_group_bcast_for_xmit(ill) which will do the right 15851 * thing for us. 15852 * 15853 * 2. If this ill was not nominated for receiving broadcast 15854 * packets, we need to clear the IRE_MARK_NORECV flag 15855 * so that we continue to send up broadcast packets. 15856 */ 15857 if (!ill->ill_isv6) { 15858 /* 15859 * Case 1 above : No optimization here. Just redo the 15860 * nomination. 15861 */ 15862 ill_group_bcast_for_xmit(ill); 15863 ill_nominate_bcast_rcv(illgrp); 15864 15865 /* 15866 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 
15867 */ 15868 ill_clear_bcast_mark(ill, 0); 15869 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 15870 15871 for (ipif = ill->ill_ipif; ipif != NULL; 15872 ipif = ipif->ipif_next) { 15873 15874 if (!(ipif->ipif_flags & IPIF_UP) || 15875 ipif->ipif_subnet == 0) { 15876 continue; 15877 } 15878 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15879 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15880 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15881 } else { 15882 net_mask = htonl(IN_CLASSA_NET); 15883 } 15884 addr = net_mask & ipif->ipif_subnet; 15885 ill_clear_bcast_mark(ill, addr); 15886 15887 net_addr = ~net_mask | addr; 15888 ill_clear_bcast_mark(ill, net_addr); 15889 15890 subnet_netmask = ipif->ipif_net_mask; 15891 addr = ipif->ipif_subnet; 15892 ill_clear_bcast_mark(ill, addr); 15893 15894 subnet_addr = ~subnet_netmask | addr; 15895 ill_clear_bcast_mark(ill, subnet_addr); 15896 } 15897 } 15898 15899 /* 15900 * Multicast Responsibility. 15901 * 15902 * If we have joined allmulti on this one, find a new member 15903 * in the group to join allmulti. As this ill is already part 15904 * of allmulti, we don't have to join on this one. 15905 * 15906 * If we have not joined allmulti on this one, there is no 15907 * responsibility to handoff. But we need to take new 15908 * responsibility i.e, join allmulti on this one if we need 15909 * to. 15910 */ 15911 if (ill->ill_join_allmulti) { 15912 (void) ill_nominate_mcast_rcv(illgrp); 15913 } else { 15914 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15915 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15916 (void) ip_join_allmulti(ill->ill_ipif); 15917 break; 15918 } 15919 } 15920 } 15921 15922 /* 15923 * We intentionally do the flushing of IRE_CACHES only matching 15924 * on the ill and not on groups. Note that we are already deleted 15925 * from the group. 
15926 * 15927 * This will make sure that all IRE_CACHES whose stq is pointing 15928 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 15929 * deleted and IRE_CACHES that are not pointing at this ill will 15930 * be left alone. 15931 */ 15932 if (ill->ill_isv6) { 15933 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15934 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15935 } else { 15936 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15937 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15938 } 15939 15940 /* 15941 * Some conn may have cached one of the IREs deleted above. By removing 15942 * the ire reference, we clean up the extra reference to the ill held in 15943 * ire->ire_stq. 15944 */ 15945 ipcl_walk(conn_cleanup_stale_ire, NULL); 15946 15947 /* 15948 * Re-do source address selection for all the members in the 15949 * group, if they borrowed source address from one of the ipifs 15950 * in this ill. 15951 */ 15952 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15953 if (ill->ill_isv6) { 15954 ipif_update_other_ipifs_v6(ipif, illgrp); 15955 } else { 15956 ipif_update_other_ipifs(ipif, illgrp); 15957 } 15958 } 15959 } 15960 15961 /* 15962 * Delete the ill from the group. The caller makes sure that it is 15963 * in a group and it okay to delete from the group. So, we always 15964 * delete here. 15965 */ 15966 static void 15967 illgrp_delete(ill_t *ill) 15968 { 15969 ill_group_t *illgrp; 15970 ill_group_t *tmpg; 15971 ill_t *tmp_ill; 15972 15973 /* 15974 * Reset illgrp_ill_schednext if it was pointing at us. 15975 * We need to do this before we set ill_group to NULL. 15976 */ 15977 rw_enter(&ill_g_lock, RW_WRITER); 15978 mutex_enter(&ill->ill_lock); 15979 15980 illgrp_reset_schednext(ill); 15981 15982 illgrp = ill->ill_group; 15983 15984 /* Delete the ill from illgrp. 
*/ 15985 if (illgrp->illgrp_ill == ill) { 15986 illgrp->illgrp_ill = ill->ill_group_next; 15987 } else { 15988 tmp_ill = illgrp->illgrp_ill; 15989 while (tmp_ill->ill_group_next != ill) { 15990 tmp_ill = tmp_ill->ill_group_next; 15991 ASSERT(tmp_ill != NULL); 15992 } 15993 tmp_ill->ill_group_next = ill->ill_group_next; 15994 } 15995 ill->ill_group = NULL; 15996 ill->ill_group_next = NULL; 15997 15998 illgrp->illgrp_ill_count--; 15999 mutex_exit(&ill->ill_lock); 16000 rw_exit(&ill_g_lock); 16001 16002 /* 16003 * As this ill is leaving the group, we need to hand off 16004 * the responsibilities to the other ills in the group, if 16005 * this ill had some responsibilities. 16006 */ 16007 16008 ill_handoff_responsibility(ill, illgrp); 16009 16010 rw_enter(&ill_g_lock, RW_WRITER); 16011 16012 if (illgrp->illgrp_ill_count == 0) { 16013 16014 ASSERT(illgrp->illgrp_ill == NULL); 16015 if (ill->ill_isv6) { 16016 if (illgrp == illgrp_head_v6) { 16017 illgrp_head_v6 = illgrp->illgrp_next; 16018 } else { 16019 tmpg = illgrp_head_v6; 16020 while (tmpg->illgrp_next != illgrp) { 16021 tmpg = tmpg->illgrp_next; 16022 ASSERT(tmpg != NULL); 16023 } 16024 tmpg->illgrp_next = illgrp->illgrp_next; 16025 } 16026 } else { 16027 if (illgrp == illgrp_head_v4) { 16028 illgrp_head_v4 = illgrp->illgrp_next; 16029 } else { 16030 tmpg = illgrp_head_v4; 16031 while (tmpg->illgrp_next != illgrp) { 16032 tmpg = tmpg->illgrp_next; 16033 ASSERT(tmpg != NULL); 16034 } 16035 tmpg->illgrp_next = illgrp->illgrp_next; 16036 } 16037 } 16038 mutex_destroy(&illgrp->illgrp_lock); 16039 mi_free(illgrp); 16040 } 16041 rw_exit(&ill_g_lock); 16042 16043 /* 16044 * Even though the ill is out of the group its not necessary 16045 * to set ipsq_split as TRUE as the ipifs could be down temporarily 16046 * We will split the ipsq when phyint_groupname is set to NULL. 16047 */ 16048 16049 /* 16050 * Send a routing sockets message if we are deleting from 16051 * groups with names. 
16052 */ 16053 if (ill->ill_phyint->phyint_groupname_len != 0) 16054 ip_rts_ifmsg(ill->ill_ipif); 16055 } 16056 16057 /* 16058 * Re-do source address selection. This is normally called when 16059 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 16060 * ipif comes up. 16061 */ 16062 void 16063 ill_update_source_selection(ill_t *ill) 16064 { 16065 ipif_t *ipif; 16066 16067 ASSERT(IAM_WRITER_ILL(ill)); 16068 16069 if (ill->ill_group != NULL) 16070 ill = ill->ill_group->illgrp_ill; 16071 16072 for (; ill != NULL; ill = ill->ill_group_next) { 16073 for (ipif = ill->ill_ipif; ipif != NULL; 16074 ipif = ipif->ipif_next) { 16075 if (ill->ill_isv6) 16076 ipif_recreate_interface_routes_v6(NULL, ipif); 16077 else 16078 ipif_recreate_interface_routes(NULL, ipif); 16079 } 16080 } 16081 } 16082 16083 /* 16084 * Insert ill in a group headed by illgrp_head. The caller can either 16085 * pass a groupname in which case we search for a group with the 16086 * same name to insert in or pass a group to insert in. This function 16087 * would only search groups with names. 16088 * 16089 * NOTE : The caller should make sure that there is at least one ipif 16090 * UP on this ill so that illgrp_scheduler can pick this ill 16091 * for outbound packets. If ill_ipif_up_count is zero, we have 16092 * already sent a DL_UNBIND to the driver and we don't want to 16093 * send anymore packets. We don't assert for ipif_up_count 16094 * to be greater than zero, because ipif_up_done wants to call 16095 * this function before bumping up the ipif_up_count. See 16096 * ipif_up_done() for details. 
16097 */ 16098 int 16099 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 16100 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 16101 { 16102 ill_group_t *illgrp; 16103 ill_t *prev_ill; 16104 phyint_t *phyi; 16105 16106 ASSERT(ill->ill_group == NULL); 16107 16108 rw_enter(&ill_g_lock, RW_WRITER); 16109 mutex_enter(&ill->ill_lock); 16110 16111 if (groupname != NULL) { 16112 /* 16113 * Look for a group with a matching groupname to insert. 16114 */ 16115 for (illgrp = *illgrp_head; illgrp != NULL; 16116 illgrp = illgrp->illgrp_next) { 16117 16118 ill_t *tmp_ill; 16119 16120 /* 16121 * If we have an ill_group_t in the list which has 16122 * no ill_t assigned then we must be in the process of 16123 * removing this group. We skip this as illgrp_delete() 16124 * will remove it from the list. 16125 */ 16126 if ((tmp_ill = illgrp->illgrp_ill) == NULL) { 16127 ASSERT(illgrp->illgrp_ill_count == 0); 16128 continue; 16129 } 16130 16131 ASSERT(tmp_ill->ill_phyint != NULL); 16132 phyi = tmp_ill->ill_phyint; 16133 /* 16134 * Look at groups which has names only. 16135 */ 16136 if (phyi->phyint_groupname_len == 0) 16137 continue; 16138 /* 16139 * Names are stored in the phyint common to both 16140 * IPv4 and IPv6. 16141 */ 16142 if (mi_strcmp(phyi->phyint_groupname, 16143 groupname) == 0) { 16144 break; 16145 } 16146 } 16147 } else { 16148 /* 16149 * If the caller passes in a NULL "grp_to_insert", we 16150 * allocate one below and insert this singleton. 
16151 */ 16152 illgrp = grp_to_insert; 16153 } 16154 16155 ill->ill_group_next = NULL; 16156 16157 if (illgrp == NULL) { 16158 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); 16159 if (illgrp == NULL) { 16160 return (ENOMEM); 16161 } 16162 illgrp->illgrp_next = *illgrp_head; 16163 *illgrp_head = illgrp; 16164 illgrp->illgrp_ill = ill; 16165 illgrp->illgrp_ill_count = 1; 16166 ill->ill_group = illgrp; 16167 /* 16168 * Used in illgrp_scheduler to protect multiple threads 16169 * from traversing the list. 16170 */ 16171 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); 16172 } else { 16173 ASSERT(ill->ill_net_type == 16174 illgrp->illgrp_ill->ill_net_type); 16175 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); 16176 16177 /* Insert ill at tail of this group */ 16178 prev_ill = illgrp->illgrp_ill; 16179 while (prev_ill->ill_group_next != NULL) 16180 prev_ill = prev_ill->ill_group_next; 16181 prev_ill->ill_group_next = ill; 16182 ill->ill_group = illgrp; 16183 illgrp->illgrp_ill_count++; 16184 /* 16185 * Inherit group properties. Currently only forwarding 16186 * is the property we try to keep the same with all the 16187 * ills. When there are more, we will abstract this into 16188 * a function. 16189 */ 16190 ill->ill_flags &= ~ILLF_ROUTER; 16191 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); 16192 } 16193 mutex_exit(&ill->ill_lock); 16194 rw_exit(&ill_g_lock); 16195 16196 /* 16197 * 1) When ipif_up_done() calls this function, ipif_up_count 16198 * may be zero as it has not yet been bumped. But the ires 16199 * have already been added. So, we do the nomination here 16200 * itself. But, when ip_sioctl_groupname calls this, it checks 16201 * for ill_ipif_up_count != 0. Thus we don't check for 16202 * ill_ipif_up_count here while nominating broadcast ires for 16203 * receive. 
16204 * 16205 * 2) Similarly, we need to call ill_group_bcast_for_xmit here 16206 * to group them properly as ire_add() has already happened 16207 * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert 16208 * case, we need to do it here anyway. 16209 */ 16210 if (!ill->ill_isv6) { 16211 ill_group_bcast_for_xmit(ill); 16212 ill_nominate_bcast_rcv(illgrp); 16213 } 16214 16215 if (!ipif_is_coming_up) { 16216 /* 16217 * When ipif_up_done() calls this function, the multicast 16218 * groups have not been joined yet. So, there is no point in 16219 * nomination. ip_join_allmulti will handle groups when 16220 * ill_recover_multicast is called from ipif_up_done() later. 16221 */ 16222 (void) ill_nominate_mcast_rcv(illgrp); 16223 /* 16224 * ipif_up_done calls ill_update_source_selection 16225 * anyway. Moreover, we don't want to re-create 16226 * interface routes while ipif_up_done() still has reference 16227 * to them. Refer to ipif_up_done() for more details. 16228 */ 16229 ill_update_source_selection(ill); 16230 } 16231 16232 /* 16233 * Send a routing sockets message if we are inserting into 16234 * groups with names. 16235 */ 16236 if (groupname != NULL) 16237 ip_rts_ifmsg(ill->ill_ipif); 16238 return (0); 16239 } 16240 16241 /* 16242 * Return the first phyint matching the groupname. There could 16243 * be more than one when there are ill groups. 16244 * 16245 * Needs work: called only from ip_sioctl_groupname 16246 */ 16247 static phyint_t * 16248 phyint_lookup_group(char *groupname) 16249 { 16250 phyint_t *phyi; 16251 16252 ASSERT(RW_LOCK_HELD(&ill_g_lock)); 16253 /* 16254 * Group names are stored in the phyint - a common structure 16255 * to both IPv4 and IPv6. 
16256 */ 16257 phyi = avl_first(&phyint_g_list.phyint_list_avl_by_index); 16258 for (; phyi != NULL; 16259 phyi = avl_walk(&phyint_g_list.phyint_list_avl_by_index, 16260 phyi, AVL_AFTER)) { 16261 if (phyi->phyint_groupname_len == 0) 16262 continue; 16263 ASSERT(phyi->phyint_groupname != NULL); 16264 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) 16265 return (phyi); 16266 } 16267 return (NULL); 16268 } 16269 16270 16271 16272 /* 16273 * MT notes on creation and deletion of IPMP groups 16274 * 16275 * Creation and deletion of IPMP groups introduce the need to merge or 16276 * split the associated serialization objects i.e the ipsq's. Normally all 16277 * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled 16278 * an ill-pair(v4, v6) i.e. phyint would map to a single ipsq. However during 16279 * the execution of the SIOCSLIFGROUPNAME command the picture changes. There 16280 * is a need to change the <ill-ipsq> association and we have to operate on both 16281 * the source and destination IPMP groups. For eg. attempting to set the 16282 * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to 16283 * handle 2 IPMP groups and 2 ipsqs. All the ills belonging to either of the 16284 * source or destination IPMP group are mapped to a single ipsq for executing 16285 * the SIOCSLIFGROUPNAME command. This is termed as a merge of the ipsq's. 16286 * The <ill-ipsq> mapping is restored back to normal at a later point. This is 16287 * termed as a split of the ipsq. The converse of the merge i.e. a split of the 16288 * ipsq happens while unwinding from ipsq_exit. If at least 1 set groupname 16289 * occurred on the ipsq, then the ipsq_split flag is set. This indicates the 16290 * ipsq has to be examined for redoing the <ill-ipsq> associations. 16291 * 16292 * In the above example the ioctl handling code locates the current ipsq of hme0 16293 * which is ipsq(mpk17-84). 
It then enters the above ipsq immediately or
 * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates
 * the destination ipsq which is ipsq(mpk17-85) and merges the source ipsq into
 * the destination ipsq. If the destination ipsq is not busy, it also enters
 * the destination ipsq exclusively. Now the actual groupname setting operation
 * can proceed. If the destination ipsq is busy, the operation is enqueued
 * on the destination (merged) ipsq and will be handled in the unwind from
 * ipsq_exit.
 *
 * To prevent other threads accessing the ill while the group name change is
 * in progress, we bring down the ipifs which also removes the ill from the
 * group. The group is changed in phyint and when the first ipif on the ill
 * is brought up, the ill is inserted into the right IPMP group by
 * illgrp_insert.
 *
 * ip_sioctl_groupname() handles the set-groupname ioctl. The new name is
 * read from lifr_groupname in the lifreq found at mp->b_cont->b_cont; an
 * empty name removes the interface from its current group.
 *
 * Returns:
 *	EINVAL		ipif is not logical interface 0, the phyint is
 *			PHYI_VIRTUAL, the name is not NUL-terminated within
 *			LIFNAMSIZ bytes, the ill is part of a usesrc group,
 *			or the ill's ill_type/ill_net_type differ from those
 *			of an existing member of the target group.
 *	ENOMEM		the copy of the group name could not be allocated.
 *	EINPROGRESS	an ill was not yet quiescent after bringing its ipifs
 *			down; mp has been queued with ipsq_pending_mp_add().
 *	otherwise	the result of ill_up_ipifs() or ill_merge_groups(),
 *			or 0 when there was nothing to change.
 */
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	int i;
	char *tmp;
	int namelen;
	ill_t *ill = ipif->ipif_ill;
	ill_t *ill_v4, *ill_v6;
	int err = 0;
	phyint_t *phyi;
	phyint_t *phyi_tmp;
	struct lifreq *lifr;
	mblk_t	*mp1;
	char *groupname;
	ipsq_t *ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	lifr = (struct lifreq *)mp1->b_rptr;
	groupname = lifr->lifr_groupname;

	/* Only logical interface 0 may carry the group name. */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	phyi = ill->ill_phyint;
	ASSERT(phyi != NULL);

	if (phyi->phyint_flags & PHYI_VIRTUAL)
		return (EINVAL);

	/* Scan for the terminating NUL, looking at most LIFNAMSIZ bytes. */
	tmp = groupname;
	for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++)
		;

	if (i == LIFNAMSIZ) {
		/* no null termination */
		return (EINVAL);
	}

	/*
	 * Calculate the namelen exclusive of the null
	 * termination character.
	 */
	namelen = tmp - groupname;

	ill_v4 = phyi->phyint_illv4;
	ill_v6 = phyi->phyint_illv6;

	/*
	 * ILL cannot be part of a usesrc group and an IPMP group at the
	 * same time. No need to grab the ill_g_usesrc_lock here, see
	 * synchronization notes in ip.c
	 */
	if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) {
		return (EINVAL);
	}

	/*
	 * mark the ill as changing.
	 * this should queue all new requests on the syncq.
	 */
	GRAB_ILL_LOCKS(ill_v4, ill_v6);

	if (ill_v4 != NULL)
		ill_v4->ill_state_flags |= ILL_CHANGING;
	if (ill_v6 != NULL)
		ill_v6->ill_state_flags |= ILL_CHANGING;
	RELEASE_ILL_LOCKS(ill_v4, ill_v6);

	if (namelen == 0) {
		/*
		 * Null string means remove this interface from the
		 * existing group.
		 */
		if (phyi->phyint_groupname_len == 0) {
			/*
			 * Never was in a group.
			 */
			err = 0;
			goto done;
		}

		/*
		 * IPv4 or IPv6 may be temporarily out of the group when all
		 * the ipifs are down. Thus, we need to check for ill_group to
		 * be non-NULL.
		 */
		if (ill_v4 != NULL && ill_v4->ill_group != NULL) {
			ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
			mutex_enter(&ill_v4->ill_lock);
			if (!ill_is_quiescent(ill_v4)) {
				/*
				 * ipsq_pending_mp_add will not fail since
				 * connp is NULL
				 */
				(void) ipsq_pending_mp_add(NULL,
				    ill_v4->ill_ipif, q, mp, ILL_DOWN);
				mutex_exit(&ill_v4->ill_lock);
				err = EINPROGRESS;
				goto done;
			}
			mutex_exit(&ill_v4->ill_lock);
		}

		if (ill_v6 != NULL && ill_v6->ill_group != NULL) {
			ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
			mutex_enter(&ill_v6->ill_lock);
			if (!ill_is_quiescent(ill_v6)) {
				(void) ipsq_pending_mp_add(NULL,
				    ill_v6->ill_ipif, q, mp, ILL_DOWN);
				mutex_exit(&ill_v6->ill_lock);
				err = EINPROGRESS;
				goto done;
			}
			mutex_exit(&ill_v6->ill_lock);
		}

		/* Both ills are quiescent: drop the stored group name. */
		rw_enter(&ill_g_lock, RW_WRITER);
		GRAB_ILL_LOCKS(ill_v4, ill_v6);
		mutex_enter(&phyi->phyint_lock);
		ASSERT(phyi->phyint_groupname != NULL);
		mi_free(phyi->phyint_groupname);
		phyi->phyint_groupname = NULL;
		phyi->phyint_groupname_len = 0;
		mutex_exit(&phyi->phyint_lock);
		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
		rw_exit(&ill_g_lock);
		err = ill_up_ipifs(ill, q, mp);

		/*
		 * set the split flag so that the ipsq can be split
		 */
		mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
		phyi->phyint_ipsq->ipsq_split = B_TRUE;
		mutex_exit(&phyi->phyint_ipsq->ipsq_lock);

	} else {
		if (phyi->phyint_groupname_len != 0) {
			ASSERT(phyi->phyint_groupname != NULL);
			/* Are we inserting in the same group ? */
			if (mi_strcmp(groupname,
			    phyi->phyint_groupname) == 0) {
				err = 0;
				goto done;
			}
		}

		rw_enter(&ill_g_lock, RW_READER);
		/*
		 * Merge ipsq for the groups.
		 * This check is here as multiple groups/ills might be
		 * sharing the same ipsq.
		 * If we have to merge then the operation is restarted
		 * on the new ipsq.
		 */
		ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL);
		if (phyi->phyint_ipsq != ipsq) {
			rw_exit(&ill_g_lock);
			err = ill_merge_groups(ill, NULL, groupname, mp, q);
			goto done;
		}
		/*
		 * Running exclusive on new ipsq.
		 */

		ASSERT(ipsq != NULL);
		ASSERT(ipsq->ipsq_writer == curthread);

		/*
		 * Check whether the ill_type and ill_net_type matches before
		 * we allocate any memory so that the cleanup is easier.
		 *
		 * We can't group dissimilar ones as we can't load spread
		 * packets across the group because of potential link-level
		 * header differences.
		 *
		 * NOTE(review): both mismatch paths below return directly
		 * instead of going through "done", so ILL_CHANGING set above
		 * is not cleared in this function on those paths. They do set
		 * ipsq_split first; presumably the flag is cleared when the
		 * ipsq is split - confirm.
		 */
		phyi_tmp = phyint_lookup_group(groupname);
		if (phyi_tmp != NULL) {
			if ((ill_v4 != NULL &&
			    phyi_tmp->phyint_illv4 != NULL) &&
			    ((ill_v4->ill_net_type !=
			    phyi_tmp->phyint_illv4->ill_net_type) ||
			    (ill_v4->ill_type !=
			    phyi_tmp->phyint_illv4->ill_type))) {
				mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
				phyi->phyint_ipsq->ipsq_split = B_TRUE;
				mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
				rw_exit(&ill_g_lock);
				return (EINVAL);
			}
			if ((ill_v6 != NULL &&
			    phyi_tmp->phyint_illv6 != NULL) &&
			    ((ill_v6->ill_net_type !=
			    phyi_tmp->phyint_illv6->ill_net_type) ||
			    (ill_v6->ill_type !=
			    phyi_tmp->phyint_illv6->ill_type))) {
				mutex_enter(&phyi->phyint_ipsq->ipsq_lock);
				phyi->phyint_ipsq->ipsq_split = B_TRUE;
				mutex_exit(&phyi->phyint_ipsq->ipsq_lock);
				rw_exit(&ill_g_lock);
				return (EINVAL);
			}
		}

		rw_exit(&ill_g_lock);

		/*
		 * bring down all v4 ipifs.
		 */
		if (ill_v4 != NULL) {
			ill_down_ipifs(ill_v4, mp, 0, B_FALSE);
		}

		/*
		 * bring down all v6 ipifs.
		 */
		if (ill_v6 != NULL) {
			ill_down_ipifs(ill_v6, mp, 0, B_FALSE);
		}

		/*
		 * make sure all ipifs are down and there are no active
		 * references. Call to ipsq_pending_mp_add will not fail
		 * since connp is NULL.
		 */
		if (ill_v4 != NULL) {
			mutex_enter(&ill_v4->ill_lock);
			if (!ill_is_quiescent(ill_v4)) {
				(void) ipsq_pending_mp_add(NULL,
				    ill_v4->ill_ipif, q, mp, ILL_DOWN);
				mutex_exit(&ill_v4->ill_lock);
				err = EINPROGRESS;
				goto done;
			}
			mutex_exit(&ill_v4->ill_lock);
		}

		if (ill_v6 != NULL) {
			mutex_enter(&ill_v6->ill_lock);
			if (!ill_is_quiescent(ill_v6)) {
				(void) ipsq_pending_mp_add(NULL,
				    ill_v6->ill_ipif, q, mp, ILL_DOWN);
				mutex_exit(&ill_v6->ill_lock);
				err = EINPROGRESS;
				goto done;
			}
			mutex_exit(&ill_v6->ill_lock);
		}

		/*
		 * allocate including space for null terminator
		 * before we insert.
		 *
		 * NOTE(review): on allocation failure this returns directly
		 * rather than via "done"; at this point the ipifs have been
		 * brought down and ILL_CHANGING is still set - confirm that
		 * this is recovered elsewhere.
		 */
		tmp = (char *)mi_alloc(namelen + 1, BPRI_MED);
		if (tmp == NULL)
			return (ENOMEM);

		rw_enter(&ill_g_lock, RW_WRITER);
		GRAB_ILL_LOCKS(ill_v4, ill_v6);
		mutex_enter(&phyi->phyint_lock);
		if (phyi->phyint_groupname_len != 0) {
			ASSERT(phyi->phyint_groupname != NULL);
			mi_free(phyi->phyint_groupname);
		}

		/*
		 * setup the new group name.
		 * phyint_groupname_len counts the terminating NUL.
		 */
		phyi->phyint_groupname = tmp;
		bcopy(groupname, phyi->phyint_groupname, namelen + 1);
		phyi->phyint_groupname_len = namelen + 1;
		mutex_exit(&phyi->phyint_lock);
		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
		rw_exit(&ill_g_lock);

		err = ill_up_ipifs(ill, q, mp);
	}

done:
	/*
	 *  normally ILL_CHANGING is cleared in ill_up_ipifs.
	 *  Here it is cleared for every outcome except EINPROGRESS, where
	 *  the ioctl is still pending.
	 */
	if (err != EINPROGRESS) {
		GRAB_ILL_LOCKS(ill_v4, ill_v6);
		if (ill_v4 != NULL)
			ill_v4->ill_state_flags &= ~ILL_CHANGING;
		if (ill_v6 != NULL)
			ill_v6->ill_state_flags &= ~ILL_CHANGING;
		RELEASE_ILL_LOCKS(ill_v4, ill_v6);
	}
	return (err);
}

/*
 * Handle the get-groupname ioctl: copy the group name stored in the
 * ipif's phyint into lifr_groupname of the lifreq at mp->b_cont->b_cont.
 * An empty string is returned when the phyint has no group name.
 * Always returns 0.
 */
/* ARGSUSED */
int
ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	ill_t *ill;
	phyint_t *phyi;
	struct lifreq *lifr;
	mblk_t	*mp1;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	lifr = (struct lifreq *)mp1->b_rptr;
	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;

	lifr->lifr_groupname[0] = '\0';
	/*
	 * ill_group may be null if all the interfaces
	 * are down. But still, the phyint should always
	 * hold the name.
	 */
	if (phyi->phyint_groupname_len != 0) {
		bcopy(phyi->phyint_groupname, lifr->lifr_groupname,
		    phyi->phyint_groupname_len);
	}

	return (0);
}


/*
 * Argument bundle handed to conn_move() through ipcl_walk().
 */
typedef struct conn_move_s {
	ill_t	*cm_from_ill;	/* ill the associations are moved away from */
	ill_t	*cm_to_ill;	/* ill the associations are moved to */
	int	cm_ifindex;	/* 0 matches any original ifindex; */
				/* non-zero restricts the move to it */
} conn_move_t;

/*
 * ipcl_walk function for moving conn_multicast_ill for a given ill.
 * "arg" points to a conn_move_t. Besides conn_multicast_ill, the bound
 * (outgoing/incoming) ill, the xmit_if ill and the ilg_ill of each ilg
 * are switched from cm_from_ill to cm_to_ill where they match.
 */
static void
conn_move(conn_t *connp, caddr_t arg)
{
	conn_move_t *connm;
	int ifindex;
	int i;
	ill_t *from_ill;
	ill_t *to_ill;
	ilg_t *ilg;
	ilm_t *ret_ilm;

	connm = (conn_move_t *)arg;
	ifindex = connm->cm_ifindex;
	from_ill = connm->cm_from_ill;
	to_ill = connm->cm_to_ill;

	/* Change IP_BOUND_IF/IPV6_BOUND_IF associations. */

	/* All multicast fields protected by conn_lock */
	mutex_enter(&connp->conn_lock);
	ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
	if ((connp->conn_outgoing_ill == from_ill) &&
	    (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) {
		connp->conn_outgoing_ill = to_ill;
		connp->conn_incoming_ill = to_ill;
	}

	/* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */

	if ((connp->conn_multicast_ill == from_ill) &&
	    (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) {
		connp->conn_multicast_ill = connm->cm_to_ill;
	}

	/* Change IP_XMIT_IF associations */
	if ((connp->conn_xmit_if_ill == from_ill) &&
	    (ifindex == 0 || connp->conn_orig_xmit_ifindex == ifindex)) {
		connp->conn_xmit_if_ill = to_ill;
	}
	/*
	 * Change the ilg_ill to point to the new one. This assumes
	 * ilm_move_v6 has moved the ilms to new_ill and the driver
	 * has been told to receive packets on this interface.
	 * ilm_move_v6 FAILBACKS all the ilms successfully always.
	 * But when doing a FAILOVER, it might fail with ENOMEM and so
	 * some ilms may not have moved. We check to see whether
	 * the ilms have moved to to_ill. We can't check on from_ill
	 * as in the process of moving, we could have split an ilm
	 * in to two - which has the same orig_ifindex and v6group.
	 *
	 * For IPv4, ilg_ipif moves implicitly. The code below really
	 * does not do anything for IPv4 as ilg_ill is NULL for IPv4.
	 */
	for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
		ilg = &connp->conn_ilg[i];
		if ((ilg->ilg_ill == from_ill) &&
		    (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) {
			/* ifindex != 0 indicates failback */
			if (ifindex != 0) {
				connp->conn_ilg[i].ilg_ill = to_ill;
				continue;
			}

			/* Failover: move only if the ilm reached to_ill. */
			ret_ilm = ilm_lookup_ill_index_v6(to_ill,
			    &ilg->ilg_v6group, ilg->ilg_orig_ifindex,
			    connp->conn_zoneid);

			if (ret_ilm != NULL)
				connp->conn_ilg[i].ilg_ill = to_ill;
		}
	}
	mutex_exit(&connp->conn_lock);
}

/*
 * Run conn_move() over every conn via ipcl_walk() so that associations
 * with from_ill are switched to to_ill. See conn_move_t for the meaning
 * of ifindex.
 */
static void
conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex)
{
	conn_move_t connm;

	connm.cm_from_ill = from_ill;
	connm.cm_to_ill = to_ill;
	connm.cm_ifindex = ifindex;

	ipcl_walk(conn_move, (caddr_t)&connm);
}

/*
 * ilm has been moved from from_ill to to_ill.
 * Send DL_DISABMULTI_REQ to ill and DL_ENABMULTI_REQ on to_ill.
 * appropriately.
 *
 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because
 *	  the code there de-references ipif_ill to get the ill to
 *	  send multicast requests. It does not work as ipif is on its
 *	  move and already moved when this function is called.
 *	  Thus, we need to use from_ill and to_ill send down multicast
 *	  requests.
 */
static void
ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill)
{
	ipif_t *ipif;
	ilm_t *ilm;

	/*
	 * See whether we need to send down DL_ENABMULTI_REQ on
	 * to_ill as ilm has just been added.
	 */
	ASSERT(IAM_WRITER_ILL(to_ill));
	ASSERT(IAM_WRITER_ILL(from_ill));

	ILM_WALKER_HOLD(to_ill);
	for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {

		/* Only ilms that were just moved and are not deleted. */
		if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED))
			continue;
		/*
		 * no locks held, ill/ipif cannot disappear as long
		 * as we are writer.
		 */
		ipif = to_ill->ill_ipif;
		/*
		 * No need to hold any lock as we are the writer and this
		 * can only be changed by a writer.
		 */
		ilm->ilm_is_new = B_FALSE;

		if (to_ill->ill_net_type != IRE_IF_RESOLVER ||
		    ipif->ipif_flags & IPIF_POINTOPOINT) {
			ip1dbg(("ilm_send_multicast_reqs: to_ill not "
			    "resolver\n"));
			continue;		/* Must be IRE_IF_NORESOLVER */
		}


		if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "to_ill MULTI_BCAST\n"));
			goto from;
		}

		if (to_ill->ill_isv6)
			mld_joingroup(ilm);
		else
			igmp_joingroup(ilm);

		if (to_ill->ill_ipif_up_count == 0) {
			/*
			 * Nobody there. All multicast addresses will be
			 * re-joined when we get the DL_BIND_ACK bringing the
			 * interface up.
			 */
			ilm->ilm_notify_driver = B_FALSE;
			ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n"));
			goto from;
		}

		/*
		 * For allmulti address, we want to join on only one interface.
		 * Checking for ilm_numentries_v6 is not correct as you may
		 * find an ilm with zero address on to_ill, but we may not
		 * have nominated to_ill for receiving. Thus, if we have
		 * nominated from_ill (ill_join_allmulti is set), nominate
		 * only if to_ill is not already nominated (to_ill normally
		 * should not have been nominated if "from_ill" has already
		 * been nominated. As we don't prevent failovers from happening
		 * across groups, we don't assert).
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
			/*
			 * There is no need to hold ill locks as we are
			 * writer on both ills and when ill_join_allmulti
			 * is changed the thread is always a writer.
			 */
			if (from_ill->ill_join_allmulti &&
			    !to_ill->ill_join_allmulti) {
				(void) ip_join_allmulti(to_ill->ill_ipif);
			}
		} else if (ilm->ilm_notify_driver) {

			/*
			 * This is a newly moved ilm so we need to tell the
			 * driver about the new group. There can be more than
			 * one ilm's for the same group in the list each with a
			 * different orig_ifindex. We have to inform the driver
			 * once. In ilm_move_v[4,6] we only set the flag
			 * ilm_notify_driver for the first ilm.
			 */

			(void) ip_ll_send_enabmulti_req(to_ill,
			    &ilm->ilm_v6addr);
		}

		ilm->ilm_notify_driver = B_FALSE;

		/*
		 * See whether we need to send down DL_DISABMULTI_REQ on
		 * from_ill as ilm has just been removed.
		 */
from:
		ipif = from_ill->ill_ipif;
		if (from_ill->ill_net_type != IRE_IF_RESOLVER ||
		    ipif->ipif_flags & IPIF_POINTOPOINT) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "from_ill not resolver\n"));
			continue;		/* Must be IRE_IF_NORESOLVER */
		}

		if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
			ip1dbg(("ilm_send_multicast_reqs: "
			    "from_ill MULTI_BCAST\n"));
			continue;
		}

		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
			if (from_ill->ill_join_allmulti)
			    (void) ip_leave_allmulti(from_ill->ill_ipif);
		} else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) {
			/* No ilm for this group is left on from_ill. */
			(void) ip_ll_send_disabmulti_req(from_ill,
			    &ilm->ilm_v6addr);
		}
	}
	ILM_WALKER_RELE(to_ill);
}

/*
 * This function is called when all multicast memberships needs
 * to be moved from "from_ill" to "to_ill" for IPv6.
This function is 16873 * called only once unlike the IPv4 counterpart where it is called after 16874 * every logical interface is moved. The reason is due to multicast 16875 * memberships are joined using an interface address in IPv4 while in 16876 * IPv6, interface index is used. 16877 */ 16878 static void 16879 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) 16880 { 16881 ilm_t *ilm; 16882 ilm_t *ilm_next; 16883 ilm_t *new_ilm; 16884 ilm_t **ilmp; 16885 int count; 16886 char buf[INET6_ADDRSTRLEN]; 16887 in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; 16888 16889 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16890 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16891 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 16892 16893 if (ifindex == 0) { 16894 /* 16895 * Form the solicited node mcast address which is used later, 16895 * by OR-ing the last 32-bit word of ipif 0's local address 16895 * into the solicited-node multicast prefix. 16896 */ 16897 ipif_t *ipif; 16898 16899 ipif = from_ill->ill_ipif; 16900 ASSERT(ipif->ipif_id == 0); 16901 16902 ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 16903 } 16904 /* ilmp tracks the link that points at the current ilm so that the ilm can be unlinked in place */ 16905 ilmp = &from_ill->ill_ilm; 16906 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 16907 ilm_next = ilm->ilm_next; 16908 16909 if (ilm->ilm_flags & ILM_DELETED) { 16910 ilmp = &ilm->ilm_next; 16911 continue; 16912 } 16913 16914 new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, 16915 ilm->ilm_orig_ifindex, ilm->ilm_zoneid); 16916 ASSERT(ilm->ilm_orig_ifindex != 0); 16917 if (ilm->ilm_orig_ifindex == ifindex) { 16918 /* 16919 * We are failing back multicast memberships. 16920 * If the same ilm exists in to_ill, it means somebody 16921 * has joined the same group there e.g. ff02::1 16922 * is joined within the kernel when the interfaces 16923 * came UP. 
16924 */ 16925 ASSERT(ilm->ilm_ipif == NULL); 16926 if (new_ilm != NULL) { 16927 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16928 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16929 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16930 new_ilm->ilm_is_new = B_TRUE; 16931 } 16932 } else { 16933 /* 16934 * check if we can just move the ilm 16935 */ 16936 if (from_ill->ill_ilm_walker_cnt != 0) { 16937 /* 16938 * We have walkers we cannot move 16939 * the ilm, so allocate a new ilm, 16940 * this (old) ilm will be marked 16941 * ILM_DELETED at the end of the loop 16942 * and will be freed when the 16943 * last walker exits. 16944 */ 16945 new_ilm = (ilm_t *)mi_zalloc 16946 (sizeof (ilm_t)); 16947 if (new_ilm == NULL) { 16948 ip0dbg(("ilm_move_v6: " 16949 "FAILBACK of IPv6" 16950 " multicast address %s : " 16951 "from %s to" 16952 " %s failed : ENOMEM \n", 16953 inet_ntop(AF_INET6, 16954 &ilm->ilm_v6addr, buf, 16955 sizeof (buf)), 16956 from_ill->ill_name, 16957 to_ill->ill_name)); 16958 16959 ilmp = &ilm->ilm_next; 16960 continue; 16961 } 16962 *new_ilm = *ilm; 16963 /* 16964 * we don't want new_ilm linked to 16965 * ilm's filter list. 16966 */ 16967 new_ilm->ilm_filter = NULL; 16968 } else { 16969 /* 16970 * No walkers we can move the ilm. 16971 * lets take it out of the list. 16972 */ 16973 *ilmp = ilm->ilm_next; 16974 ilm->ilm_next = NULL; 16975 new_ilm = ilm; 16976 } 16977 16978 /* 16979 * if this is the first ilm for the group 16980 * set ilm_notify_driver so that we notify the 16981 * driver in ilm_send_multicast_reqs. 16982 */ 16983 if (ilm_lookup_ill_v6(to_ill, 16984 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16985 new_ilm->ilm_notify_driver = B_TRUE; 16986 16987 new_ilm->ilm_ill = to_ill; 16988 /* Add to the to_ill's list */ 16989 new_ilm->ilm_next = to_ill->ill_ilm; 16990 to_ill->ill_ilm = new_ilm; 16991 /* 16992 * set the flag so that mld_joingroup is 16993 * called in ilm_send_multicast_reqs(). 
16994 */ 16995 new_ilm->ilm_is_new = B_TRUE; 16996 } 16997 goto bottom; 16998 } else if (ifindex != 0) { 16999 /* 17000 * If this is FAILBACK (ifindex != 0) and the ifindex 17001 * has not matched above, look at the next ilm. 17002 */ 17003 ilmp = &ilm->ilm_next; 17004 continue; 17005 } 17006 /* 17007 * If we are here, it means ifindex is 0. Failover 17008 * everything. 17009 * 17010 * We need to handle solicited node mcast address 17011 * and all_nodes mcast address differently as they 17012 * are joined within the kernel (ipif_multicast_up) 17013 * and potentially from the userland. We are called 17014 * after the ipifs of from_ill has been moved. 17015 * If we still find ilms on ill with solicited node 17016 * mcast address or all_nodes mcast address, it must 17017 * belong to the UP interface that has not moved e.g. 17018 * ipif_id 0 with the link local prefix does not move. 17019 * We join this on the new ill accounting for all the 17020 * userland memberships so that applications don't 17021 * see any failure. 17022 * 17023 * We need to make sure that we account only for the 17024 * solicited node and all node multicast addresses 17025 * that was brought UP on these. In the case of 17026 * a failover from A to B, we might have ilms belonging 17027 * to A (ilm_orig_ifindex pointing at A) on B accounting 17028 * for the membership from the userland. If we are failing 17029 * over from B to C now, we will find the ones belonging 17030 * to A on B. These don't account for the ill_ipif_up_count. 17031 * They just move from B to C. The check below on 17032 * ilm_orig_ifindex ensures that. 
17033 */ 17034 if ((ilm->ilm_orig_ifindex == 17035 from_ill->ill_phyint->phyint_ifindex) && 17036 (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || 17037 IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, 17038 &ilm->ilm_v6addr))) { 17039 ASSERT(ilm->ilm_refcnt > 0); 17040 count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; 17041 /* 17042 * For indentation reasons, we are not using a 17043 * "else" here. 17044 */ 17045 if (count == 0) { 17046 ilmp = &ilm->ilm_next; 17047 continue; 17048 } 17049 ilm->ilm_refcnt -= count; 17050 if (new_ilm != NULL) { 17051 /* 17052 * Can find one with the same 17053 * ilm_orig_ifindex, if we are failing 17054 * over to a STANDBY. This happens 17055 * when somebody wants to join a group 17056 * on a STANDBY interface and we 17057 * internally join on a different one. 17058 * If we had joined on from_ill then, a 17059 * failover now will find a new ilm 17060 * with this index. 17061 */ 17062 ip1dbg(("ilm_move_v6: FAILOVER, found" 17063 " new ilm on %s, group address %s\n", 17064 to_ill->ill_name, 17065 inet_ntop(AF_INET6, 17066 &ilm->ilm_v6addr, buf, 17067 sizeof (buf)))); 17068 new_ilm->ilm_refcnt += count; 17069 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 17070 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 17071 new_ilm->ilm_is_new = B_TRUE; 17072 } 17073 } else { 17074 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 17075 if (new_ilm == NULL) { 17076 ip0dbg(("ilm_move_v6: FAILOVER of IPv6" 17077 " multicast address %s : from %s to" 17078 " %s failed : ENOMEM \n", 17079 inet_ntop(AF_INET6, 17080 &ilm->ilm_v6addr, buf, 17081 sizeof (buf)), from_ill->ill_name, 17082 to_ill->ill_name)); 17083 ilmp = &ilm->ilm_next; 17084 continue; 17085 } 17086 *new_ilm = *ilm; 17087 new_ilm->ilm_filter = NULL; 17088 new_ilm->ilm_refcnt = count; 17089 new_ilm->ilm_timer = INFINITY; 17090 new_ilm->ilm_rtx.rtx_timer = INFINITY; 17091 new_ilm->ilm_is_new = B_TRUE; 17092 /* 17093 * If the to_ill has not joined this 17094 * group we need to tell the driver in 17095 * 
ilm_send_multicast_reqs. 17096 */ 17097 if (ilm_lookup_ill_v6(to_ill, 17098 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 17099 new_ilm->ilm_notify_driver = B_TRUE; 17100 17101 new_ilm->ilm_ill = to_ill; 17102 /* Add to the to_ill's list */ 17103 new_ilm->ilm_next = to_ill->ill_ilm; 17104 to_ill->ill_ilm = new_ilm; 17105 ASSERT(new_ilm->ilm_ipif == NULL); 17106 } 17107 if (ilm->ilm_refcnt == 0) { 17108 goto bottom; 17109 } else { 17110 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17111 CLEAR_SLIST(new_ilm->ilm_filter); 17112 ilmp = &ilm->ilm_next; 17113 } 17114 continue; 17115 } else { 17116 /* 17117 * ifindex = 0 means, move everything pointing at 17118 * from_ill. We are doing this because ill has 17119 * either FAILED or became INACTIVE. 17120 * 17121 * As we would like to move things later back to 17122 * from_ill, we want to retain the identity of this 17123 * ilm. Thus, we don't blindly increment the reference 17124 * count on the ilms matching the address alone. We 17125 * need to match on the ilm_orig_index also. new_ilm 17126 * was obtained by matching ilm_orig_index also. 17127 */ 17128 if (new_ilm != NULL) { 17129 /* 17130 * This is possible only if a previous restore 17131 * was incomplete i.e restore to 17132 * ilm_orig_ifindex left some ilms because 17133 * of some failures. Thus when we are failing 17134 * again, we might find our old friends there. 
17135 */ 17136 ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" 17137 " on %s, group address %s\n", 17138 to_ill->ill_name, 17139 inet_ntop(AF_INET6, 17140 &ilm->ilm_v6addr, buf, 17141 sizeof (buf)))); 17142 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 17143 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 17144 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 17145 new_ilm->ilm_is_new = B_TRUE; 17146 } 17147 } else { 17148 if (from_ill->ill_ilm_walker_cnt != 0) { 17149 new_ilm = (ilm_t *) 17150 mi_zalloc(sizeof (ilm_t)); 17151 if (new_ilm == NULL) { 17152 ip0dbg(("ilm_move_v6: " 17153 "FAILOVER of IPv6" 17154 " multicast address %s : " 17155 "from %s to" 17156 " %s failed : ENOMEM \n", 17157 inet_ntop(AF_INET6, 17158 &ilm->ilm_v6addr, buf, 17159 sizeof (buf)), 17160 from_ill->ill_name, 17161 to_ill->ill_name)); 17162 17163 ilmp = &ilm->ilm_next; 17164 continue; 17165 } 17166 *new_ilm = *ilm; 17167 new_ilm->ilm_filter = NULL; 17168 } else { 17169 *ilmp = ilm->ilm_next; 17170 new_ilm = ilm; 17171 } 17172 /* 17173 * If the to_ill has not joined this 17174 * group we need to tell the driver in 17175 * ilm_send_multicast_reqs. 17176 */ 17177 if (ilm_lookup_ill_v6(to_ill, 17178 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 17179 new_ilm->ilm_notify_driver = B_TRUE; 17180 17181 /* Add to the to_ill's list */ 17182 new_ilm->ilm_next = to_ill->ill_ilm; 17183 to_ill->ill_ilm = new_ilm; 17184 ASSERT(ilm->ilm_ipif == NULL); 17185 new_ilm->ilm_ill = to_ill; 17186 new_ilm->ilm_is_new = B_TRUE; 17187 } 17188 17189 } 17190 17191 bottom: 17192 /* 17193 * Revert multicast filter state to (EXCLUDE, NULL). 17194 * new_ilm->ilm_is_new should already be set if needed. 17195 */ 17196 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17197 CLEAR_SLIST(new_ilm->ilm_filter); 17198 /* 17199 * We allocated/got a new ilm, free the old one. 
17200 */ 17201 if (new_ilm != ilm) { 17202 if (from_ill->ill_ilm_walker_cnt == 0) { 17203 *ilmp = ilm->ilm_next; 17204 ilm->ilm_next = NULL; 17205 FREE_SLIST(ilm->ilm_filter); 17206 FREE_SLIST(ilm->ilm_pendsrcs); 17207 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 17208 FREE_SLIST(ilm->ilm_rtx.rtx_block); 17209 mi_free((char *)ilm); 17210 } else { 17211 ilm->ilm_flags |= ILM_DELETED; 17212 from_ill->ill_ilm_cleanup_reqd = 1; 17213 ilmp = &ilm->ilm_next; 17214 } 17215 } 17216 } 17217 } 17218 17219 /* 17220 * Move all the multicast memberships to to_ill. Called when 17221 * an ipif moves from "from_ill" to "to_ill". This function is slightly 17222 * different from IPv6 counterpart as multicast memberships are associated 17223 * with ills in IPv6. This function is called after every ipif is moved 17224 * unlike IPv6, where it is moved only once. 17225 */ 17226 static void 17227 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) 17228 { 17229 ilm_t *ilm; 17230 ilm_t *ilm_next; 17231 ilm_t *new_ilm; 17232 ilm_t **ilmp; 17233 17234 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 17235 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 17236 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 17237 17238 ilmp = &from_ill->ill_ilm; 17239 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 17240 ilm_next = ilm->ilm_next; 17241 17242 if (ilm->ilm_flags & ILM_DELETED) { 17243 ilmp = &ilm->ilm_next; 17244 continue; 17245 } 17246 17247 ASSERT(ilm->ilm_ipif != NULL); 17248 17249 if (ilm->ilm_ipif != ipif) { 17250 ilmp = &ilm->ilm_next; 17251 continue; 17252 } 17253 17254 if (V4_PART_OF_V6(ilm->ilm_v6addr) == 17255 htonl(INADDR_ALLHOSTS_GROUP)) { 17256 /* 17257 * We joined this in ipif_multicast_up 17258 * and we never did an ipif_multicast_down 17259 * for IPv4. If nobody else from the userland 17260 * has reference, we free the ilm, and later 17261 * when this ipif comes up on the new ill, 17262 * we will join this again. 
17263 */ 17264 if (--ilm->ilm_refcnt == 0) 17265 goto delete_ilm; 17266 17267 new_ilm = ilm_lookup_ipif(ipif, 17268 V4_PART_OF_V6(ilm->ilm_v6addr)); 17269 if (new_ilm != NULL) { 17270 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 17271 /* 17272 * We still need to deal with the from_ill. 17273 */ 17274 new_ilm->ilm_is_new = B_TRUE; 17275 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17276 CLEAR_SLIST(new_ilm->ilm_filter); 17277 goto delete_ilm; 17278 } 17279 /* 17280 * If we could not find one e.g. ipif is 17281 * still down on to_ill, we add this ilm 17282 * on to_ill to preserve the reference 17283 * count. 17284 */ 17285 } 17286 /* 17287 * When ipifs move, ilms always move with it 17288 * to the NEW ill. Thus we should never be 17289 * able to find ilm till we really move it here. 17290 */ 17291 ASSERT(ilm_lookup_ipif(ipif, 17292 V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); 17293 17294 if (from_ill->ill_ilm_walker_cnt != 0) { /* walkers present: the old ilm cannot be unlinked, so move a copy instead */ 17295 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 17296 if (new_ilm == NULL) { 17297 char buf[INET6_ADDRSTRLEN]; /* The IPv4 group lives in the V4_PART_OF_V6 word of ilm_v6addr; pass that word to inet_ntop(AF_INET), not the start of the in6_addr, otherwise the leading bytes of the 16-byte address are printed instead of the group. */ 17298 ip0dbg(("ilm_move_v4: FAILBACK of IPv4" 17299 " multicast address %s : " 17300 "from %s to" 17301 " %s failed : ENOMEM \n", 17302 inet_ntop(AF_INET, 17303 &V4_PART_OF_V6(ilm->ilm_v6addr), buf, 17304 sizeof (buf)), 17305 from_ill->ill_name, 17306 to_ill->ill_name)); 17307 17308 ilmp = &ilm->ilm_next; 17309 continue; 17310 } 17311 *new_ilm = *ilm; 17312 /* We don't want new_ilm linked to ilm's filter list */ 17313 new_ilm->ilm_filter = NULL; 17314 } else { 17315 /* Remove from the list */ 17316 *ilmp = ilm->ilm_next; 17317 new_ilm = ilm; 17318 } 17319 17320 /* 17321 * If we have never joined this group on the to_ill 17322 * make sure we tell the driver. 
17323 */ 17324 if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, 17325 ALL_ZONES) == NULL) 17326 new_ilm->ilm_notify_driver = B_TRUE; 17327 17328 /* Add to the to_ill's list */ 17329 new_ilm->ilm_next = to_ill->ill_ilm; 17330 to_ill->ill_ilm = new_ilm; 17331 new_ilm->ilm_is_new = B_TRUE; 17332 17333 /* 17334 * Revert multicast filter state to (EXCLUDE, NULL) 17335 */ 17336 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17337 CLEAR_SLIST(new_ilm->ilm_filter); 17338 17339 /* 17340 * Delete only if we have allocated a new ilm. 17341 */ 17342 if (new_ilm != ilm) { 17343 delete_ilm: 17344 if (from_ill->ill_ilm_walker_cnt == 0) { 17345 /* Remove from the list */ 17346 *ilmp = ilm->ilm_next; 17347 ilm->ilm_next = NULL; 17348 FREE_SLIST(ilm->ilm_filter); 17349 FREE_SLIST(ilm->ilm_pendsrcs); 17350 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 17351 FREE_SLIST(ilm->ilm_rtx.rtx_block); 17352 mi_free((char *)ilm); 17353 } else { /* walkers present: only mark the ilm deleted and flag from_ill for cleanup; ilmp steps past it because it stays linked */ 17354 ilm->ilm_flags |= ILM_DELETED; 17355 from_ill->ill_ilm_cleanup_reqd = 1; 17356 ilmp = &ilm->ilm_next; 17357 } 17358 } 17359 } 17360 } 17361 17362 static uint_t 17363 ipif_get_id(ill_t *ill, uint_t id) 17364 { 17365 uint_t unit; 17366 ipif_t *tipif; 17367 boolean_t found = B_FALSE; 17368 17369 /* 17370 * During failback, we want to go back to the same id 17371 * instead of the smallest id so that the original 17372 * configuration is maintained. id is non-zero in that 17373 * case. 17374 */ 17375 if (id != 0) { 17376 /* 17377 * While failing back, if we still have an ipif with 17378 * MAX_ADDRS_PER_IF, it means this will be replaced 17379 * as soon as we return from this function. It was 17380 * to set to MAX_ADDRS_PER_IF by the caller so that 17381 * we can choose the smallest id. Thus we return zero 17382 * in that case ignoring the hint. 
17383 */ 17384 if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) 17385 return (0); 17386 for (tipif = ill->ill_ipif; tipif != NULL; 17387 tipif = tipif->ipif_next) { 17388 if (tipif->ipif_id == id) { 17389 found = B_TRUE; 17390 break; 17391 } 17392 } 17393 /* 17394 * If no ipif on this ill uses the hinted id, reuse it. 17395 * Otherwise somebody already plumbed another logical with 17395 * that id, and we fall through to pick the lowest unused one. 17396 */ 17397 if (!found) 17398 return (id); 17399 } /* Scan ids 0..ip_addrs_per_if for the lowest one that no ipif on this ill uses; if every id in that range is taken, unit ends up one past ip_addrs_per_if. */ 17400 for (unit = 0; unit <= ip_addrs_per_if; unit++) { 17401 found = B_FALSE; 17402 for (tipif = ill->ill_ipif; tipif != NULL; 17403 tipif = tipif->ipif_next) { 17404 if (tipif->ipif_id == unit) { 17405 found = B_TRUE; 17406 break; 17407 } 17408 } 17409 if (!found) 17410 break; 17411 } 17412 return (unit); 17413 } 17414 17415 /* ARGSUSED */ 17416 static int 17417 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, 17418 ipif_t **rep_ipif_ptr) 17419 { 17420 ill_t *from_ill; 17421 ipif_t *rep_ipif; 17422 ipif_t **ipifp; 17423 uint_t unit; 17424 int err = 0; 17425 ipif_t *to_ipif; 17426 struct iocblk *iocp; 17427 boolean_t failback_cmd; 17428 boolean_t remove_ipif; 17429 int rc; 17430 17431 ASSERT(IAM_WRITER_ILL(to_ill)); 17432 ASSERT(IAM_WRITER_IPIF(ipif)); 17433 17434 iocp = (struct iocblk *)mp->b_rptr; 17435 failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); 17436 remove_ipif = B_FALSE; 17437 17438 from_ill = ipif->ipif_ill; 17439 17440 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 17441 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 17442 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 17443 17444 /* 17445 * Don't move LINK LOCAL addresses as they are tied to 17446 * physical interface. 
17447 */ 17448 if (from_ill->ill_isv6 && 17449 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { 17450 ipif->ipif_was_up = B_FALSE; 17451 IPIF_UNMARK_MOVING(ipif); 17452 return (0); 17453 } 17454 17455 /* 17456 * We set the ipif_id to maximum so that the search for 17457 * ipif_id will pick the lowest number i.e 0 in the 17458 * following 2 cases : 17459 * 17460 * 1) We have a replacement ipif at the head of to_ill. 17461 * We can't remove it yet as we can exceed ip_addrs_per_if 17462 * on to_ill and hence the MOVE might fail. We want to 17463 * remove it only if we could move the ipif. Thus, by 17464 * setting it to the MAX value, we make the search in 17465 * ipif_get_id return the zeroth id. 17466 * 17467 * 2) When DR pulls out the NIC and re-plumbs the interface, 17468 * we might just have a zero address plumbed on the ipif 17469 * with zero id in the case of IPv4. We remove that while 17470 * doing the failback. We want to remove it only if we 17471 * could move the ipif. Thus, by setting it to the MAX 17472 * value, we make the search in ipif_get_id return the 17473 * zeroth id. 17474 * 17475 * Both (1) and (2) are done only when we are moving 17476 * an ipif (either due to failover/failback) which originally 17477 * belonged to this interface i.e the ipif_orig_ifindex is 17478 * the same as to_ill's ifindex. This is needed so that 17479 * FAILOVER from A -> B ( A failed) followed by FAILOVER 17480 * from B -> A (B is being removed from the group) and 17481 * FAILBACK from A -> B restores the original configuration. 17482 * Without the check for orig_ifindex, the second FAILOVER 17483 * could make the ipif belonging to B replace the A's zeroth 17484 * ipif and the subsequent failback re-creating the replacement 17485 * ipif again. 17486 * 17487 * NOTE : We created the replacement ipif when we did a 17488 * FAILOVER (See below). We could check for FAILBACK and 17489 * then look for replacement ipif to be removed. 
But we don't 17490 * want to do that because we want to allow the possibility 17491 * of a FAILOVER from A -> B (which creates the replacement ipif), 17492 * followed by a *FAILOVER* from B -> A instead of a FAILBACK 17493 * from B -> A. 17494 */ 17495 to_ipif = to_ill->ill_ipif; 17496 if ((to_ill->ill_phyint->phyint_ifindex == 17497 ipif->ipif_orig_ifindex) && 17498 IPIF_REPL_CHECK(to_ipif, failback_cmd)) { 17499 ASSERT(to_ipif->ipif_id == 0); 17500 remove_ipif = B_TRUE; 17501 to_ipif->ipif_id = MAX_ADDRS_PER_IF; 17502 } 17503 /* 17504 * Find the lowest logical unit number on the to_ill. 17505 * If we are failing back, try to get the original id 17506 * rather than the lowest one so that the original 17507 * configuration is maintained. 17508 * 17509 * XXX need a better scheme for this. 17510 */ 17511 if (failback_cmd) { 17512 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); 17513 } else { 17514 unit = ipif_get_id(to_ill, 0); 17515 } 17516 17517 /* Reset back to zero in case we fail below */ 17518 if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) 17519 to_ipif->ipif_id = 0; 17520 17521 if (unit == ip_addrs_per_if) { 17522 ipif->ipif_was_up = B_FALSE; 17523 IPIF_UNMARK_MOVING(ipif); 17524 return (EINVAL); 17525 } 17526 17527 /* 17528 * ipif is ready to move from "from_ill" to "to_ill". 17529 * 17530 * 1) If we are moving ipif with id zero, create a 17531 * replacement ipif for this ipif on from_ill. If this fails 17532 * fail the MOVE operation. 17533 * 17534 * 2) Remove the replacement ipif on to_ill if any. 17535 * We could remove the replacement ipif when we are moving 17536 * the ipif with id zero. But what if somebody already 17537 * unplumbed it ? Thus we always remove it if it is present. 17538 * We want to do it only if we are sure we are going to 17539 * move the ipif to to_ill which is why there are no 17540 * returns due to error till ipif is linked to to_ill. 17541 * Note that the first ipif that we failback will always 17542 * be zero if it is present. 
17543 */ 17544 if (ipif->ipif_id == 0) { 17545 ipaddr_t inaddr_any = INADDR_ANY; 17546 17547 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); 17548 if (rep_ipif == NULL) { 17549 ipif->ipif_was_up = B_FALSE; 17550 IPIF_UNMARK_MOVING(ipif); 17551 return (ENOMEM); 17552 } 17553 *rep_ipif = ipif_zero; 17554 /* 17555 * Before we put the ipif on the list, store the addresses 17556 * as mapped addresses as some of the ioctls e.g SIOCGIFADDR 17557 * assumes so. This logic is not any different from what 17558 * ipif_allocate does. 17559 */ 17560 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17561 &rep_ipif->ipif_v6lcl_addr); 17562 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17563 &rep_ipif->ipif_v6src_addr); 17564 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17565 &rep_ipif->ipif_v6subnet); 17566 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17567 &rep_ipif->ipif_v6net_mask); 17568 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17569 &rep_ipif->ipif_v6brd_addr); 17570 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17571 &rep_ipif->ipif_v6pp_dst_addr); 17572 /* 17573 * We mark IPIF_NOFAILOVER so that this can never 17574 * move. 17575 */ 17576 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; 17577 rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; 17578 rep_ipif->ipif_replace_zero = B_TRUE; 17579 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, 17580 MUTEX_DEFAULT, NULL); 17581 rep_ipif->ipif_id = 0; 17582 rep_ipif->ipif_ire_type = ipif->ipif_ire_type; 17583 rep_ipif->ipif_ill = from_ill; 17584 rep_ipif->ipif_orig_ifindex = 17585 from_ill->ill_phyint->phyint_ifindex; 17586 /* Insert at head */ 17587 rep_ipif->ipif_next = from_ill->ill_ipif; 17588 from_ill->ill_ipif = rep_ipif; 17589 /* 17590 * We don't really care to let apps know about 17591 * this interface. 17592 */ 17593 } 17594 17595 if (remove_ipif) { 17596 /* 17597 * We set to a max value above for this case to get 17598 * id zero. ASSERT that we did get one. 
17599 */ 17600 ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); 17601 rep_ipif = to_ipif; 17602 to_ill->ill_ipif = rep_ipif->ipif_next; 17603 rep_ipif->ipif_next = NULL; 17604 /* 17605 * If some apps scanned and found this interface, 17606 * it is time to let them know, so that they can 17607 * delete it. 17608 */ 17609 17610 *rep_ipif_ptr = rep_ipif; 17611 } 17612 17613 /* Get it out of the ILL interface list. */ 17614 ipifp = &ipif->ipif_ill->ill_ipif; 17615 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 17616 if (*ipifp == ipif) { 17617 *ipifp = ipif->ipif_next; 17618 break; 17619 } 17620 } 17621 17622 /* Assign the new ill */ 17623 ipif->ipif_ill = to_ill; 17624 ipif->ipif_id = unit; 17625 /* id has already been checked */ 17626 rc = ipif_insert(ipif, B_FALSE, B_FALSE); 17627 ASSERT(rc == 0); 17628 /* Let SCTP update its list */ 17629 sctp_move_ipif(ipif, from_ill, to_ill); 17630 /* 17631 * Handle the failover and failback of ipif_t between 17632 * ill_t that have differing maximum mtu values. 17633 */ 17634 if (ipif->ipif_mtu > to_ill->ill_max_mtu) { 17635 if (ipif->ipif_saved_mtu == 0) { 17636 /* 17637 * As this ipif_t is moving to an ill_t 17638 * that has a lower ill_max_mtu, its 17639 * ipif_mtu needs to be saved so it can 17640 * be restored during failback or during 17641 * failover to an ill_t which has a 17642 * higher ill_max_mtu. 17643 */ 17644 ipif->ipif_saved_mtu = ipif->ipif_mtu; 17645 ipif->ipif_mtu = to_ill->ill_max_mtu; 17646 } else { 17647 /* 17648 * The ipif_t is, once again, moving to 17649 * an ill_t that has a lower maximum mtu 17650 * value. 17651 */ 17652 ipif->ipif_mtu = to_ill->ill_max_mtu; 17653 } 17654 } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && 17655 ipif->ipif_saved_mtu != 0) { 17656 /* 17657 * The mtu of this ipif_t had to be reduced 17658 * during an earlier failover; this is an 17659 * opportunity for it to be increased (either as 17660 * part of another failover or a failback). 
17661 */ 17662 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { 17663 ipif->ipif_mtu = ipif->ipif_saved_mtu; 17664 ipif->ipif_saved_mtu = 0; 17665 } else { 17666 ipif->ipif_mtu = to_ill->ill_max_mtu; 17667 } 17668 } 17669 17670 /* 17671 * We preserve all the other fields of the ipif including 17672 * ipif_saved_ire_mp. The routes that are saved here will 17673 * be recreated on the new interface and back on the old 17674 * interface when we move back. 17675 */ 17676 ASSERT(ipif->ipif_arp_del_mp == NULL); 17677 17678 return (err); 17679 } 17680 17681 static int 17682 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, 17683 int ifindex, ipif_t **rep_ipif_ptr) 17684 { 17685 ipif_t *mipif; 17686 ipif_t *ipif_next; 17687 int err; 17688 17689 /* 17690 * We don't really try to MOVE back things if some of the 17691 * operations fail. The daemon will take care of moving again 17692 * later on. 17693 */ 17694 for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { 17695 ipif_next = mipif->ipif_next; 17696 if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && 17697 (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { 17698 17699 err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); 17700 17701 /* 17702 * When the MOVE fails, it is the job of the 17703 * application to take care of this properly 17704 * i.e try again if it is ENOMEM. 17705 */ 17706 if (mipif->ipif_ill != from_ill) { 17707 /* 17708 * ipif has moved. 17709 * 17710 * Move the multicast memberships associated 17711 * with this ipif to the new ill. For IPv6, we 17712 * do it once after all the ipifs are moved 17713 * (in ill_move) as they are not associated 17714 * with ipifs. 17715 * 17716 * We need to move the ilms as the ipif has 17717 * already been moved to a new ill even 17718 * in the case of errors. Neither 17719 * ilm_free(ipif) will find the ilm 17720 * when somebody unplumbs this ipif nor 17721 * ilm_delete(ilm) will be able to find the 17722 * ilm, if we don't move now. 
17723 */ 17724 if (!from_ill->ill_isv6) 17725 ilm_move_v4(from_ill, to_ill, mipif); 17726 } 17727 /* stop at the first failed move; ipifs moved so far stay on to_ill */ 17728 if (err != 0) 17729 return (err); 17730 } 17731 } 17732 return (0); 17733 } 17734 17735 static int 17736 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) 17737 { 17738 int ifindex; 17739 int err; 17740 struct iocblk *iocp; 17741 ipif_t *ipif; 17742 ipif_t *rep_ipif_ptr = NULL; 17743 ipif_t *from_ipif = NULL; 17744 boolean_t check_rep_if = B_FALSE; 17745 17746 iocp = (struct iocblk *)mp->b_rptr; 17747 if (iocp->ioc_cmd == SIOCLIFFAILOVER) { 17748 /* 17749 * Move everything pointing at from_ill to to_ill. 17750 * We achieve this by passing in 0 as ifindex. 17751 */ 17752 ifindex = 0; 17753 } else { 17754 /* 17755 * Move everything pointing at from_ill whose original 17756 * ifindex of connp, ipif, ilm points at to_ill->ill_index. 17757 * We achieve this by passing in ifindex rather than 0. 17758 * Multicast vifs, ilgs move implicitly because ipifs move. 17759 */ 17760 ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); 17761 ifindex = to_ill->ill_phyint->phyint_ifindex; 17762 } 17763 17764 /* 17765 * Determine if there is at least one ipif that would move from 17766 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement 17767 * ipif (if it exists) on the to_ill would be consumed as a result of 17768 * the move, in which case we need to quiesce the replacement ipif also. 
17769 */ 17770 for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; 17771 from_ipif = from_ipif->ipif_next) { 17772 if (((ifindex == 0) || 17773 (ifindex == from_ipif->ipif_orig_ifindex)) && 17774 !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { 17775 check_rep_if = B_TRUE; 17776 break; 17777 } 17778 } 17779 17780 17781 ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); 17782 17783 GRAB_ILL_LOCKS(from_ill, to_ill); /* if from_ill is not yet quiescent, queue mp as the pending message and return EINPROGRESS */ 17784 if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { 17785 (void) ipsq_pending_mp_add(NULL, ipif, q, 17786 mp, ILL_MOVE_OK); 17787 RELEASE_ILL_LOCKS(from_ill, to_ill); 17788 return (EINPROGRESS); 17789 } 17790 17791 /* Check if the replacement ipif is quiescent to delete */ 17792 if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, 17793 (iocp->ioc_cmd == SIOCLIFFAILBACK))) { 17794 to_ill->ill_ipif->ipif_state_flags |= 17795 IPIF_MOVING | IPIF_CHANGING; 17796 if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { 17797 (void) ipsq_pending_mp_add(NULL, ipif, q, 17798 mp, ILL_MOVE_OK); 17799 RELEASE_ILL_LOCKS(from_ill, to_ill); 17800 return (EINPROGRESS); 17801 } 17802 } 17803 RELEASE_ILL_LOCKS(from_ill, to_ill); 17804 17805 ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); /* re-take the ill locks under ill_g_lock held as writer, which ipif_move and ilm_move_v6 assert */ 17806 rw_enter(&ill_g_lock, RW_WRITER); 17807 GRAB_ILL_LOCKS(from_ill, to_ill); 17808 err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); 17809 17810 /* ilm_move is done inside ipif_move for IPv4 */ 17811 if (err == 0 && from_ill->ill_isv6) 17812 ilm_move_v6(from_ill, to_ill, ifindex); 17813 17814 RELEASE_ILL_LOCKS(from_ill, to_ill); 17815 rw_exit(&ill_g_lock); 17816 17817 /* 17818 * send rts messages and multicast messages. 
17819 */ 17820 if (rep_ipif_ptr != NULL) { 17821 if (rep_ipif_ptr->ipif_recovery_id != 0) { 17822 (void) untimeout(rep_ipif_ptr->ipif_recovery_id); 17823 rep_ipif_ptr->ipif_recovery_id = 0; 17824 } 17825 ip_rts_ifmsg(rep_ipif_ptr); 17826 ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); 17827 IPIF_TRACE_CLEANUP(rep_ipif_ptr); /* NOTE(review): replacement ipifs built in ipif_move get mutex_init() on ipif_saved_ire_lock; no mutex_destroy() is visible before this free - confirm whether one is needed */ 17828 mi_free(rep_ipif_ptr); 17829 } 17830 /* conns are walked even when err != 0 */ 17831 conn_move_ill(from_ill, to_ill, ifindex); 17832 17833 return (err); 17834 } 17835 17836 /* 17837 * Used to extract arguments for FAILOVER/FAILBACK ioctls. 17838 * Also checks for the validity of the arguments. 17839 * Note: We are already exclusive inside the from group. 17840 * It is up to the caller to release refcnt on the to_ill's. 17841 */ 17842 static int 17843 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, 17844 ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) 17845 { 17846 int dst_index; 17847 ipif_t *ipif_v4, *ipif_v6; 17848 struct lifreq *lifr; 17849 mblk_t *mp1; 17850 boolean_t exists; 17851 sin_t *sin; 17852 int err = 0; 17853 17854 if ((mp1 = mp->b_cont) == NULL) 17855 return (EPROTO); 17856 17857 if ((mp1 = mp1->b_cont) == NULL) 17858 return (EPROTO); 17859 17860 lifr = (struct lifreq *)mp1->b_rptr; 17861 sin = (sin_t *)&lifr->lifr_addr; 17862 17863 /* 17864 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 17865 * specific operations. 17866 */ 17867 if (sin->sin_family != AF_UNSPEC) 17868 return (EINVAL); 17869 17870 /* 17871 * Get ipif with id 0. We are writer on the from ill. So we can pass 17872 * NULLs for the last 4 args and we know the lookup won't fail 17873 * with EINPROGRESS. 
17874 */ 17875 ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, 17876 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, 17877 ALL_ZONES, NULL, NULL, NULL, NULL); 17878 ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, 17879 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, 17880 ALL_ZONES, NULL, NULL, NULL, NULL); 17881 17882 if (ipif_v4 == NULL && ipif_v6 == NULL) 17883 return (ENXIO); 17884 17885 if (ipif_v4 != NULL) { 17886 ASSERT(ipif_v4->ipif_refcnt != 0); 17887 if (ipif_v4->ipif_id != 0) { 17888 err = EINVAL; 17889 goto done; 17890 } 17891 17892 ASSERT(IAM_WRITER_IPIF(ipif_v4)); 17893 *ill_from_v4 = ipif_v4->ipif_ill; 17894 } 17895 17896 if (ipif_v6 != NULL) { 17897 ASSERT(ipif_v6->ipif_refcnt != 0); 17898 if (ipif_v6->ipif_id != 0) { 17899 err = EINVAL; 17900 goto done; 17901 } 17902 17903 ASSERT(IAM_WRITER_IPIF(ipif_v6)); 17904 *ill_from_v6 = ipif_v6->ipif_ill; 17905 } 17906 17907 err = 0; 17908 dst_index = lifr->lifr_movetoindex; 17909 *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, 17910 q, mp, ip_process_ioctl, &err); 17911 if (err != 0) { 17912 /* 17913 * There could be only v6. 17914 */ 17915 if (err != ENXIO) 17916 goto done; 17917 err = 0; 17918 } 17919 17920 *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, 17921 q, mp, ip_process_ioctl, &err); 17922 if (err != 0) { 17923 if (err != ENXIO) 17924 goto done; 17925 if (*ill_to_v4 == NULL) { 17926 err = ENXIO; 17927 goto done; 17928 } 17929 err = 0; 17930 } 17931 17932 /* 17933 * If we have something to MOVE i.e "from" not NULL, 17934 * "to" should be non-NULL. 17935 */ 17936 if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || 17937 (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { 17938 err = EINVAL; 17939 } 17940 17941 done: 17942 if (ipif_v4 != NULL) 17943 ipif_refrele(ipif_v4); 17944 if (ipif_v6 != NULL) 17945 ipif_refrele(ipif_v6); 17946 return (err); 17947 } 17948 17949 /* 17950 * FAILOVER and FAILBACK are modelled as MOVE operations. 
17951 * 17952 * We don't check whether the MOVE is within the same group or 17953 * not, because this ioctl can be used as a generic mechanism 17954 * to failover from interface A to B, though things will function 17955 * only if they are really part of the same group. Moreover, 17956 * all ipifs may be down and hence temporarily out of the group. 17957 * 17958 * ipif's that need to be moved are first brought down; V4 ipifs are brought 17959 * down first and then V6. For each we wait for the ipif's to become quiescent. 17960 * Bringing down the ipifs ensures that all ires pointing to these ipifs's 17961 * have been deleted and there are no active references. Once quiescent the 17962 * ipif's are moved and brought up on the new ill. 17963 * 17964 * Normally the source ill and destination ill belong to the same IPMP group 17965 * and hence the same ipsq_t. In the event they don't belong to the same 17966 * same group the two ipsq's are first merged into one ipsq - that of the 17967 * to_ill. The multicast memberships on the source and destination ill cannot 17968 * change during the move operation since multicast joins/leaves also have to 17969 * execute on the same ipsq and are hence serialized. 17970 */ 17971 /* ARGSUSED */ 17972 int 17973 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17974 ip_ioctl_cmd_t *ipip, void *ifreq) 17975 { 17976 ill_t *ill_to_v4 = NULL; 17977 ill_t *ill_to_v6 = NULL; 17978 ill_t *ill_from_v4 = NULL; 17979 ill_t *ill_from_v6 = NULL; 17980 int err = 0; 17981 17982 /* 17983 * setup from and to ill's, we can get EINPROGRESS only for 17984 * to_ill's. 17985 */ 17986 err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, 17987 &ill_to_v4, &ill_to_v6); 17988 17989 if (err != 0) { 17990 ip0dbg(("ip_sioctl_move: extract args failed\n")); 17991 goto done; 17992 } 17993 17994 /* 17995 * nothing to do. 
17996 */ 17997 if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { 17998 goto done; 17999 } 18000 18001 /* 18002 * nothing to do. 18003 */ 18004 if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { 18005 goto done; 18006 } 18007 18008 /* 18009 * Mark the ill as changing. 18010 * ILL_CHANGING flag is cleared when the ipif's are brought up 18011 * in ill_up_ipifs in case of error they are cleared below. 18012 */ 18013 18014 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 18015 if (ill_from_v4 != NULL) 18016 ill_from_v4->ill_state_flags |= ILL_CHANGING; 18017 if (ill_from_v6 != NULL) 18018 ill_from_v6->ill_state_flags |= ILL_CHANGING; 18019 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 18020 18021 /* 18022 * Make sure that both src and dst are 18023 * in the same syncq group. If not make it happen. 18024 * We are not holding any locks because we are the writer 18025 * on the from_ipsq and we will hold locks in ill_merge_groups 18026 * to protect to_ipsq against changing. 18027 */ 18028 if (ill_from_v4 != NULL) { 18029 if (ill_from_v4->ill_phyint->phyint_ipsq != 18030 ill_to_v4->ill_phyint->phyint_ipsq) { 18031 err = ill_merge_groups(ill_from_v4, ill_to_v4, 18032 NULL, mp, q); 18033 goto err_ret; 18034 18035 } 18036 ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); 18037 } else { 18038 18039 if (ill_from_v6->ill_phyint->phyint_ipsq != 18040 ill_to_v6->ill_phyint->phyint_ipsq) { 18041 err = ill_merge_groups(ill_from_v6, ill_to_v6, 18042 NULL, mp, q); 18043 goto err_ret; 18044 18045 } 18046 ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); 18047 } 18048 18049 /* 18050 * Now that the ipsq's have been merged and we are the writer 18051 * lets mark to_ill as changing as well. 
18052 */ 18053 18054 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 18055 if (ill_to_v4 != NULL) 18056 ill_to_v4->ill_state_flags |= ILL_CHANGING; 18057 if (ill_to_v6 != NULL) 18058 ill_to_v6->ill_state_flags |= ILL_CHANGING; 18059 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 18060 18061 /* 18062 * Its ok for us to proceed with the move even if 18063 * ill_pending_mp is non null on one of the from ill's as the reply 18064 * should not be looking at the ipif, it should only care about the 18065 * ill itself. 18066 */ 18067 18068 /* 18069 * lets move ipv4 first. 18070 */ 18071 if (ill_from_v4 != NULL) { 18072 ASSERT(IAM_WRITER_ILL(ill_to_v4)); 18073 ill_from_v4->ill_move_in_progress = B_TRUE; 18074 ill_to_v4->ill_move_in_progress = B_TRUE; 18075 ill_to_v4->ill_move_peer = ill_from_v4; 18076 ill_from_v4->ill_move_peer = ill_to_v4; 18077 err = ill_move(ill_from_v4, ill_to_v4, q, mp); 18078 } 18079 18080 /* 18081 * Now lets move ipv6. 18082 */ 18083 if (err == 0 && ill_from_v6 != NULL) { 18084 ASSERT(IAM_WRITER_ILL(ill_to_v6)); 18085 ill_from_v6->ill_move_in_progress = B_TRUE; 18086 ill_to_v6->ill_move_in_progress = B_TRUE; 18087 ill_to_v6->ill_move_peer = ill_from_v6; 18088 ill_from_v6->ill_move_peer = ill_to_v6; 18089 err = ill_move(ill_from_v6, ill_to_v6, q, mp); 18090 } 18091 18092 err_ret: 18093 /* 18094 * EINPROGRESS means we are waiting for the ipif's that need to be 18095 * moved to become quiescent. 18096 */ 18097 if (err == EINPROGRESS) { 18098 goto done; 18099 } 18100 18101 /* 18102 * if err is set ill_up_ipifs will not be called 18103 * lets clear the flags. 18104 */ 18105 18106 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 18107 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 18108 /* 18109 * Some of the clearing may be redundant. But it is simple 18110 * not making any extra checks. 
18111 */ 18112 if (ill_from_v6 != NULL) { 18113 ill_from_v6->ill_move_in_progress = B_FALSE; 18114 ill_from_v6->ill_move_peer = NULL; 18115 ill_from_v6->ill_state_flags &= ~ILL_CHANGING; 18116 } 18117 if (ill_from_v4 != NULL) { 18118 ill_from_v4->ill_move_in_progress = B_FALSE; 18119 ill_from_v4->ill_move_peer = NULL; 18120 ill_from_v4->ill_state_flags &= ~ILL_CHANGING; 18121 } 18122 if (ill_to_v6 != NULL) { 18123 ill_to_v6->ill_move_in_progress = B_FALSE; 18124 ill_to_v6->ill_move_peer = NULL; 18125 ill_to_v6->ill_state_flags &= ~ILL_CHANGING; 18126 } 18127 if (ill_to_v4 != NULL) { 18128 ill_to_v4->ill_move_in_progress = B_FALSE; 18129 ill_to_v4->ill_move_peer = NULL; 18130 ill_to_v4->ill_state_flags &= ~ILL_CHANGING; 18131 } 18132 18133 /* 18134 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. 18135 * Do this always to maintain proper state i.e even in case of errors. 18136 * As phyint_inactive looks at both v4 and v6 interfaces, 18137 * we need not call on both v4 and v6 interfaces. 18138 */ 18139 if (ill_from_v4 != NULL) { 18140 if ((ill_from_v4->ill_phyint->phyint_flags & 18141 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 18142 phyint_inactive(ill_from_v4->ill_phyint); 18143 } 18144 } else if (ill_from_v6 != NULL) { 18145 if ((ill_from_v6->ill_phyint->phyint_flags & 18146 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 18147 phyint_inactive(ill_from_v6->ill_phyint); 18148 } 18149 } 18150 18151 if (ill_to_v4 != NULL) { 18152 if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { 18153 ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 18154 } 18155 } else if (ill_to_v6 != NULL) { 18156 if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { 18157 ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 18158 } 18159 } 18160 18161 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 18162 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 18163 18164 no_err: 18165 /* 18166 * lets bring the interfaces up on the to_ill. 
18167 */ 18168 if (err == 0) { 18169 err = ill_up_ipifs(ill_to_v4 == NULL ? ill_to_v6:ill_to_v4, 18170 q, mp); 18171 } 18172 18173 if (err == 0) { 18174 if (ill_from_v4 != NULL && ill_to_v4 != NULL) 18175 ilm_send_multicast_reqs(ill_from_v4, ill_to_v4); 18176 18177 if (ill_from_v6 != NULL && ill_to_v6 != NULL) 18178 ilm_send_multicast_reqs(ill_from_v6, ill_to_v6); 18179 } 18180 done: 18181 18182 if (ill_to_v4 != NULL) { 18183 ill_refrele(ill_to_v4); 18184 } 18185 if (ill_to_v6 != NULL) { 18186 ill_refrele(ill_to_v6); 18187 } 18188 18189 return (err); 18190 } 18191 18192 static void 18193 ill_dl_down(ill_t *ill) 18194 { 18195 /* 18196 * The ill is down; unbind but stay attached since we're still 18197 * associated with a PPA. If we have negotiated DLPI capabilites 18198 * with the data link service provider (IDS_OK) then reset them. 18199 * The interval between unbinding and rebinding is potentially 18200 * unbounded hence we cannot assume things will be the same. 18201 * The DLPI capabilities will be probed again when the data link 18202 * is brought up. 18203 */ 18204 mblk_t *mp = ill->ill_unbind_mp; 18205 hook_nic_event_t *info; 18206 18207 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); 18208 18209 ill->ill_unbind_mp = NULL; 18210 if (mp != NULL) { 18211 ip1dbg(("ill_dl_down: %s (%u) for %s\n", 18212 dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 18213 ill->ill_name)); 18214 mutex_enter(&ill->ill_lock); 18215 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS; 18216 mutex_exit(&ill->ill_lock); 18217 if (ill->ill_dlpi_capab_state == IDS_OK) 18218 ill_capability_reset(ill); 18219 ill_dlpi_send(ill, mp); 18220 } 18221 18222 /* 18223 * Toss all of our multicast memberships. We could keep them, but 18224 * then we'd have to do bookkeeping of any joins and leaves performed 18225 * by the application while the the interface is down (we can't just 18226 * issue them because arp cannot currently process AR_ENTRY_SQUERY's 18227 * on a downed interface). 
18228 */ 18229 ill_leave_multicast(ill); 18230 18231 mutex_enter(&ill->ill_lock); 18232 18233 ill->ill_dl_up = 0; 18234 18235 if ((info = ill->ill_nic_event_info) != NULL) { 18236 ip2dbg(("ill_dl_down:unexpected nic event %d attached for %s\n", 18237 info->hne_event, ill->ill_name)); 18238 if (info->hne_data != NULL) 18239 kmem_free(info->hne_data, info->hne_datalen); 18240 kmem_free(info, sizeof (hook_nic_event_t)); 18241 } 18242 18243 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 18244 if (info != NULL) { 18245 info->hne_nic = ill->ill_phyint->phyint_ifindex; 18246 info->hne_lif = 0; 18247 info->hne_event = NE_DOWN; 18248 info->hne_data = NULL; 18249 info->hne_datalen = 0; 18250 info->hne_family = ill->ill_isv6 ? ipv6 : ipv4; 18251 } else 18252 ip2dbg(("ill_dl_down: could not attach DOWN nic event " 18253 "information for %s (ENOMEM)\n", ill->ill_name)); 18254 18255 ill->ill_nic_event_info = info; 18256 18257 mutex_exit(&ill->ill_lock); 18258 } 18259 18260 void 18261 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 18262 { 18263 union DL_primitives *dlp; 18264 t_uscalar_t prim; 18265 18266 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 18267 18268 dlp = (union DL_primitives *)mp->b_rptr; 18269 prim = dlp->dl_primitive; 18270 18271 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 18272 dlpi_prim_str(prim), prim, ill->ill_name)); 18273 18274 switch (prim) { 18275 case DL_PHYS_ADDR_REQ: 18276 { 18277 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 18278 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 18279 break; 18280 } 18281 case DL_BIND_REQ: 18282 mutex_enter(&ill->ill_lock); 18283 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 18284 mutex_exit(&ill->ill_lock); 18285 break; 18286 } 18287 18288 /* 18289 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 18290 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 18291 * we only wait for the ACK of the DL_UNBIND_REQ. 
18292 */ 18293 mutex_enter(&ill->ill_lock); 18294 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 18295 (prim == DL_UNBIND_REQ)) { 18296 ill->ill_dlpi_pending = prim; 18297 } 18298 mutex_exit(&ill->ill_lock); 18299 18300 /* 18301 * Some drivers send M_FLUSH up to IP as part of unbind 18302 * request. When this M_FLUSH is sent back to the driver, 18303 * this can go after we send the detach request if the 18304 * M_FLUSH ends up in IP's syncq. To avoid that, we reply 18305 * to the M_FLUSH in ip_rput and locally generate another 18306 * M_FLUSH for the correctness. This will get freed in 18307 * ip_wput_nondata. 18308 */ 18309 if (prim == DL_UNBIND_REQ) 18310 (void) putnextctl1(ill->ill_rq, M_FLUSH, FLUSHRW); 18311 18312 putnext(ill->ill_wq, mp); 18313 } 18314 18315 /* 18316 * Send a DLPI control message to the driver but make sure there 18317 * is only one outstanding message. Uses ill_dlpi_pending to tell 18318 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 18319 * when an ACK or a NAK is received to process the next queued message. 18320 * 18321 * We don't protect ill_dlpi_pending with any lock. This is okay as 18322 * every place where its accessed, ip is exclusive while accessing 18323 * ill_dlpi_pending except when this function is called from ill_init() 18324 */ 18325 void 18326 ill_dlpi_send(ill_t *ill, mblk_t *mp) 18327 { 18328 mblk_t **mpp; 18329 18330 ASSERT(IAM_WRITER_ILL(ill)); 18331 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 18332 18333 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 18334 /* Must queue message. 
Tail insertion */ 18335 mpp = &ill->ill_dlpi_deferred; 18336 while (*mpp != NULL) 18337 mpp = &((*mpp)->b_next); 18338 18339 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 18340 ill->ill_name)); 18341 18342 *mpp = mp; 18343 return; 18344 } 18345 18346 ill_dlpi_dispatch(ill, mp); 18347 } 18348 18349 /* 18350 * Called when an DLPI control message has been acked or nacked to 18351 * send down the next queued message (if any). 18352 */ 18353 void 18354 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 18355 { 18356 mblk_t *mp; 18357 18358 ASSERT(IAM_WRITER_ILL(ill)); 18359 18360 ASSERT(prim != DL_PRIM_INVAL); 18361 if (ill->ill_dlpi_pending != prim) { 18362 if (ill->ill_dlpi_pending == DL_PRIM_INVAL) { 18363 (void) mi_strlog(ill->ill_rq, 1, 18364 SL_CONSOLE|SL_ERROR|SL_TRACE, 18365 "ill_dlpi_done: unsolicited ack for %s from %s\n", 18366 dlpi_prim_str(prim), ill->ill_name); 18367 } else { 18368 (void) mi_strlog(ill->ill_rq, 1, 18369 SL_CONSOLE|SL_ERROR|SL_TRACE, 18370 "ill_dlpi_done: unexpected ack for %s from %s " 18371 "(expecting ack for %s)\n", 18372 dlpi_prim_str(prim), ill->ill_name, 18373 dlpi_prim_str(ill->ill_dlpi_pending)); 18374 } 18375 return; 18376 } 18377 18378 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 18379 dlpi_prim_str(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 18380 18381 if ((mp = ill->ill_dlpi_deferred) == NULL) { 18382 mutex_enter(&ill->ill_lock); 18383 ill->ill_dlpi_pending = DL_PRIM_INVAL; 18384 cv_signal(&ill->ill_cv); 18385 mutex_exit(&ill->ill_lock); 18386 return; 18387 } 18388 18389 ill->ill_dlpi_deferred = mp->b_next; 18390 mp->b_next = NULL; 18391 18392 ill_dlpi_dispatch(ill, mp); 18393 } 18394 18395 void 18396 conn_delete_ire(conn_t *connp, caddr_t arg) 18397 { 18398 ipif_t *ipif = (ipif_t *)arg; 18399 ire_t *ire; 18400 18401 /* 18402 * Look at the cached ires on conns which has pointers to ipifs. 18403 * We just call ire_refrele which clears up the reference 18404 * to ire. Called when a conn closes. 
Also called from ipif_free 18405 * to cleanup indirect references to the stale ipif via the cached ire. 18406 */ 18407 mutex_enter(&connp->conn_lock); 18408 ire = connp->conn_ire_cache; 18409 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { 18410 connp->conn_ire_cache = NULL; 18411 mutex_exit(&connp->conn_lock); 18412 IRE_REFRELE_NOTR(ire); 18413 return; 18414 } 18415 mutex_exit(&connp->conn_lock); 18416 18417 } 18418 18419 /* 18420 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number 18421 * of IREs. Those IREs may have been previously cached in the conn structure. 18422 * This ipcl_walk() walker function releases all references to such IREs based 18423 * on the condemned flag. 18424 */ 18425 /* ARGSUSED */ 18426 void 18427 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) 18428 { 18429 ire_t *ire; 18430 18431 mutex_enter(&connp->conn_lock); 18432 ire = connp->conn_ire_cache; 18433 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { 18434 connp->conn_ire_cache = NULL; 18435 mutex_exit(&connp->conn_lock); 18436 IRE_REFRELE_NOTR(ire); 18437 return; 18438 } 18439 mutex_exit(&connp->conn_lock); 18440 } 18441 18442 /* 18443 * Take down a specific interface, but don't lose any information about it. 18444 * Also delete interface from its interface group (ifgrp). 18445 * (Always called as writer.) 18446 * This function goes through the down sequence even if the interface is 18447 * already down. There are 2 reasons. 18448 * a. Currently we permit interface routes that depend on down interfaces 18449 * to be added. This behaviour itself is questionable. However it appears 18450 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 18451 * time. We go thru the cleanup in order to remove these routes. 18452 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 18453 * DL_ERROR_ACK in response to the the DL_BIND request. The interface is 18454 * down, but we need to cleanup i.e. 
do ill_dl_down and
 *    ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
 *
 * IP-MT notes:
 *
 * Model of reference to interfaces.
 *
 * The following members in ipif_t track references to the ipif.
 *	int	ipif_refcnt;	Active reference count
 *	uint_t	ipif_ire_cnt;	Number of ire's referencing this ipif
 * The following members in ill_t track references to the ill.
 *	int	ill_refcnt;	active refcnt
 *	uint_t	ill_ire_cnt;	Number of ires referencing ill
 *	uint_t	ill_nce_cnt;	Number of nces referencing ill
 *
 * Reference to an ipif or ill can be obtained in any of the following ways.
 *
 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
 * Pointers to ipif / ill from other data structures viz ire and conn.
 * Implicit reference to the ipif / ill by holding a reference to the ire.
 *
 * The ipif/ill lookup functions return a reference held ipif / ill.
 * ipif_refcnt and ill_refcnt track the reference counts respectively.
 * This is a purely dynamic reference count associated with threads holding
 * references to the ipif / ill. Pointers from other structures do not
 * count towards this reference count.
 *
 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the
 * ipif/ill. This is incremented whenever a new ire is created referencing the
 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is
 * actually added to the ire hash table. The count is decremented in
 * ire_inactive where the ire is destroyed.
 *
 * nce's reference ill's thru nce_ill and the count of nce's associated with
 * an ill is recorded in ill_nce_cnt. This is incremented atomically in
 * ndp_add() where the nce is actually added to the table. Similarly it is
 * decremented in ndp_inactive where the nce is destroyed.
 *
 * Flow of ioctls involving interface down/up
 *
 * The following is the sequence of an attempt to set some critical flags on an
 * up interface.
 * ip_sioctl_flags
 * ipif_down
 * wait for ipif to be quiescent
 * ipif_down_tail
 * ip_sioctl_flags_tail
 *
 * All set ioctls that involve down/up sequence would have a skeleton similar
 * to the above. All the *tail functions are called after the refcounts have
 * dropped to the appropriate values.
 *
 * The mechanism to quiesce an ipif is as follows.
 *
 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
 * on the ipif. Callers either pass a flag requesting wait or the lookup
 * functions will return NULL.
 *
 * Delete all ires referencing this ipif
 *
 * Any thread attempting to do an ipif_refhold on an ipif that has been
 * obtained thru a cached pointer will first make sure that
 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
 * increment the refcount.
 *
 * The above guarantees that the ipif refcount will eventually come down to
 * zero and the ipif will quiesce, once all threads that currently hold a
 * reference to the ipif refrelease the ipif. The ipif is quiescent after the
 * ipif_refcount has dropped to zero and all ire's associated with this ipif
 * have also been ire_inactive'd. i.e. when ipif_ire_cnt and ipif_refcnt both
 * drop to zero.
 *
 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
 *
 * Threads trying to lookup an ipif or ill can pass a flag requesting
 * wait and restart if the ipif / ill cannot be looked up currently.
 * For example, bind and route operations (e.g. route add / delete) cannot
 * return failure if the ipif is currently undergoing an exclusive operation,
 * and hence pass the flag. The mblk is then enqueued in the ipsq and the
 * operation is restarted by ipsq_exit() when the currently exclusive ioctl
 * completes.
 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
 * change while the ill_lock is held. Before dropping the ill_lock we acquire
 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
 * until we release the ipsq_lock, even though the ill/ipif state flags
 * can change after we drop the ill_lock.
 *
 * An attempt to send out a packet using an ipif that is currently
 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
 * operation and restart it later when the exclusive condition on the ipif ends.
 * This is an example of not passing the wait flag to the lookup functions. For
 * example an attempt to refhold and use conn->conn_multicast_ipif and send
 * out a multicast packet on that ipif will fail while the ipif is
 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
 * currently IPIF_CHANGING will also fail.
18549 */ 18550 int 18551 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 18552 { 18553 ill_t *ill = ipif->ipif_ill; 18554 phyint_t *phyi; 18555 conn_t *connp; 18556 boolean_t success; 18557 boolean_t ipif_was_up = B_FALSE; 18558 18559 ASSERT(IAM_WRITER_IPIF(ipif)); 18560 18561 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 18562 18563 if (ipif->ipif_flags & IPIF_UP) { 18564 mutex_enter(&ill->ill_lock); 18565 ipif->ipif_flags &= ~IPIF_UP; 18566 ASSERT(ill->ill_ipif_up_count > 0); 18567 --ill->ill_ipif_up_count; 18568 mutex_exit(&ill->ill_lock); 18569 ipif_was_up = B_TRUE; 18570 /* Update status in SCTP's list */ 18571 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 18572 } 18573 18574 /* 18575 * Blow away v6 memberships we established in ipif_multicast_up(); the 18576 * v4 ones are left alone (as is the ipif_multicast_up flag, so we 18577 * know not to rejoin when the interface is brought back up). 18578 */ 18579 if (ipif->ipif_isv6) 18580 ipif_multicast_down(ipif); 18581 /* 18582 * Remove from the mapping for __sin6_src_id. We insert only 18583 * when the address is not INADDR_ANY. As IPv4 addresses are 18584 * stored as mapped addresses, we need to check for mapped 18585 * INADDR_ANY also. 18586 */ 18587 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 18588 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 18589 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 18590 int err; 18591 18592 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 18593 ipif->ipif_zoneid); 18594 if (err != 0) { 18595 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 18596 } 18597 } 18598 18599 /* 18600 * Before we delete the ill from the group (if any), we need 18601 * to make sure that we delete all the routes dependent on 18602 * this and also any ipifs dependent on this ipif for 18603 * source address. We need to do before we delete from 18604 * the group because 18605 * 18606 * 1) ipif_down_delete_ire de-references ill->ill_group. 
18607 * 18608 * 2) ipif_update_other_ipifs needs to walk the whole group 18609 * for re-doing source address selection. Note that 18610 * ipif_select_source[_v6] called from 18611 * ipif_update_other_ipifs[_v6] will not pick this ipif 18612 * because we have already marked down here i.e cleared 18613 * IPIF_UP. 18614 */ 18615 if (ipif->ipif_isv6) 18616 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES); 18617 else 18618 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES); 18619 18620 /* 18621 * Need to add these also to be saved and restored when the 18622 * ipif is brought down and up 18623 */ 18624 mutex_enter(&ire_mrtun_lock); 18625 if (ire_mrtun_count != 0) { 18626 mutex_exit(&ire_mrtun_lock); 18627 ire_walk_ill_mrtun(0, 0, ipif_down_delete_ire, 18628 (char *)ipif, NULL); 18629 } else { 18630 mutex_exit(&ire_mrtun_lock); 18631 } 18632 18633 mutex_enter(&ire_srcif_table_lock); 18634 if (ire_srcif_table_count > 0) { 18635 mutex_exit(&ire_srcif_table_lock); 18636 ire_walk_srcif_table_v4(ipif_down_delete_ire, (char *)ipif); 18637 } else { 18638 mutex_exit(&ire_srcif_table_lock); 18639 } 18640 18641 /* 18642 * Cleaning up the conn_ire_cache or conns must be done only after the 18643 * ires have been deleted above. Otherwise a thread could end up 18644 * caching an ire in a conn after we have finished the cleanup of the 18645 * conn. The caching is done after making sure that the ire is not yet 18646 * condemned. Also documented in the block comment above ip_output 18647 */ 18648 ipcl_walk(conn_cleanup_stale_ire, NULL); 18649 /* Also, delete the ires cached in SCTP */ 18650 sctp_ire_cache_flush(ipif); 18651 18652 /* Resolve any IPsec/IKE NAT-T instances that depend on this ipif. */ 18653 nattymod_clean_ipif(ipif); 18654 18655 /* 18656 * Update any other ipifs which have used "our" local address as 18657 * a source address. This entails removing and recreating IRE_INTERFACE 18658 * entries for such ipifs. 
18659 */ 18660 if (ipif->ipif_isv6) 18661 ipif_update_other_ipifs_v6(ipif, ill->ill_group); 18662 else 18663 ipif_update_other_ipifs(ipif, ill->ill_group); 18664 18665 if (ipif_was_up) { 18666 /* 18667 * Check whether it is last ipif to leave this group. 18668 * If this is the last ipif to leave, we should remove 18669 * this ill from the group as ipif_select_source will not 18670 * be able to find any useful ipifs if this ill is selected 18671 * for load balancing. 18672 * 18673 * For nameless groups, we should call ifgrp_delete if this 18674 * belongs to some group. As this ipif is going down, we may 18675 * need to reconstruct groups. 18676 */ 18677 phyi = ill->ill_phyint; 18678 /* 18679 * If the phyint_groupname_len is 0, it may or may not 18680 * be in the nameless group. If the phyint_groupname_len is 18681 * not 0, then this ill should be part of some group. 18682 * As we always insert this ill in the group if 18683 * phyint_groupname_len is not zero when the first ipif 18684 * comes up (in ipif_up_done), it should be in a group 18685 * when the namelen is not 0. 18686 * 18687 * NOTE : When we delete the ill from the group,it will 18688 * blow away all the IRE_CACHES pointing either at this ipif or 18689 * ill_wq (illgrp_cache_delete does this). Thus, no IRES 18690 * should be pointing at this ill. 18691 */ 18692 ASSERT(phyi->phyint_groupname_len == 0 || 18693 (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); 18694 18695 if (phyi->phyint_groupname_len != 0) { 18696 if (ill->ill_ipif_up_count == 0) 18697 illgrp_delete(ill); 18698 } 18699 18700 /* 18701 * If we have deleted some of the broadcast ires associated 18702 * with this ipif, we need to re-nominate somebody else if 18703 * the ires that we deleted were the nominated ones. 18704 */ 18705 if (ill->ill_group != NULL && !ill->ill_isv6) 18706 ipif_renominate_bcast(ipif); 18707 } 18708 18709 /* 18710 * neighbor-discovery or arp entries for this interface. 
18711 */ 18712 ipif_ndp_down(ipif); 18713 18714 /* 18715 * If mp is NULL the caller will wait for the appropriate refcnt. 18716 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down 18717 * and ill_delete -> ipif_free -> ipif_down 18718 */ 18719 if (mp == NULL) { 18720 ASSERT(q == NULL); 18721 return (0); 18722 } 18723 18724 if (CONN_Q(q)) { 18725 connp = Q_TO_CONN(q); 18726 mutex_enter(&connp->conn_lock); 18727 } else { 18728 connp = NULL; 18729 } 18730 mutex_enter(&ill->ill_lock); 18731 /* 18732 * Are there any ire's pointing to this ipif that are still active ? 18733 * If this is the last ipif going down, are there any ire's pointing 18734 * to this ill that are still active ? 18735 */ 18736 if (ipif_is_quiescent(ipif)) { 18737 mutex_exit(&ill->ill_lock); 18738 if (connp != NULL) 18739 mutex_exit(&connp->conn_lock); 18740 return (0); 18741 } 18742 18743 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 18744 ill->ill_name, (void *)ill)); 18745 /* 18746 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 18747 * drops down, the operation will be restarted by ipif_ill_refrele_tail 18748 * which in turn is called by the last refrele on the ipif/ill/ire. 18749 */ 18750 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 18751 if (!success) { 18752 /* The conn is closing. So just return */ 18753 ASSERT(connp != NULL); 18754 mutex_exit(&ill->ill_lock); 18755 mutex_exit(&connp->conn_lock); 18756 return (EINTR); 18757 } 18758 18759 mutex_exit(&ill->ill_lock); 18760 if (connp != NULL) 18761 mutex_exit(&connp->conn_lock); 18762 return (EINPROGRESS); 18763 } 18764 18765 void 18766 ipif_down_tail(ipif_t *ipif) 18767 { 18768 ill_t *ill = ipif->ipif_ill; 18769 18770 /* 18771 * Skip any loopback interface (null wq). 18772 * If this is the last logical interface on the ill 18773 * have ill_dl_down tell the driver we are gone (unbind) 18774 * Note that lun 0 can ipif_down even though 18775 * there are other logical units that are up. 
18776 * This occurs e.g. when we change a "significant" IFF_ flag. 18777 */ 18778 if (ill->ill_wq != NULL && !ill->ill_logical_down && 18779 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 18780 ill->ill_dl_up) { 18781 ill_dl_down(ill); 18782 } 18783 ill->ill_logical_down = 0; 18784 18785 /* 18786 * Have to be after removing the routes in ipif_down_delete_ire. 18787 */ 18788 if (ipif->ipif_isv6) { 18789 if (ill->ill_flags & ILLF_XRESOLV) 18790 ipif_arp_down(ipif); 18791 } else { 18792 ipif_arp_down(ipif); 18793 } 18794 18795 ip_rts_ifmsg(ipif); 18796 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); 18797 } 18798 18799 /* 18800 * Bring interface logically down without bringing the physical interface 18801 * down e.g. when the netmask is changed. This avoids long lasting link 18802 * negotiations between an ethernet interface and a certain switches. 18803 */ 18804 static int 18805 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 18806 { 18807 /* 18808 * The ill_logical_down flag is a transient flag. It is set here 18809 * and is cleared once the down has completed in ipif_down_tail. 18810 * This flag does not indicate whether the ill stream is in the 18811 * DL_BOUND state with the driver. Instead this flag is used by 18812 * ipif_down_tail to determine whether to DL_UNBIND the stream with 18813 * the driver. The state of the ill stream i.e. whether it is 18814 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 18815 */ 18816 ipif->ipif_ill->ill_logical_down = 1; 18817 return (ipif_down(ipif, q, mp)); 18818 } 18819 18820 /* 18821 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 18822 * If the usesrc client ILL is already part of a usesrc group or not, 18823 * in either case a ire_stq with the matching usesrc client ILL will 18824 * locate the IRE's that need to be deleted. We want IREs to be created 18825 * with the new source address. 
18826 */ 18827 static void 18828 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 18829 { 18830 ill_t *ucill = (ill_t *)ill_arg; 18831 18832 ASSERT(IAM_WRITER_ILL(ucill)); 18833 18834 if (ire->ire_stq == NULL) 18835 return; 18836 18837 if ((ire->ire_type == IRE_CACHE) && 18838 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 18839 ire_delete(ire); 18840 } 18841 18842 /* 18843 * ire_walk routine to delete every IRE dependent on the interface 18844 * address that is going down. (Always called as writer.) 18845 * Works for both v4 and v6. 18846 * In addition for checking for ire_ipif matches it also checks for 18847 * IRE_CACHE entries which have the same source address as the 18848 * disappearing ipif since ipif_select_source might have picked 18849 * that source. Note that ipif_down/ipif_update_other_ipifs takes 18850 * care of any IRE_INTERFACE with the disappearing source address. 18851 */ 18852 static void 18853 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 18854 { 18855 ipif_t *ipif = (ipif_t *)ipif_arg; 18856 ill_t *ire_ill; 18857 ill_t *ipif_ill; 18858 18859 ASSERT(IAM_WRITER_IPIF(ipif)); 18860 if (ire->ire_ipif == NULL) 18861 return; 18862 18863 /* 18864 * For IPv4, we derive source addresses for an IRE from ipif's 18865 * belonging to the same IPMP group as the IRE's outgoing 18866 * interface. If an IRE's outgoing interface isn't in the 18867 * same IPMP group as a particular ipif, then that ipif 18868 * couldn't have been used as a source address for this IRE. 18869 * 18870 * For IPv6, source addresses are only restricted to the IPMP group 18871 * if the IRE is for a link-local address or a multicast address. 18872 * Otherwise, source addresses for an IRE can be chosen from 18873 * interfaces other than the the outgoing interface for that IRE. 18874 * 18875 * For source address selection details, see ipif_select_source() 18876 * and ipif_select_source_v6(). 
18877 */ 18878 if (ire->ire_ipversion == IPV4_VERSION || 18879 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || 18880 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 18881 ire_ill = ire->ire_ipif->ipif_ill; 18882 ipif_ill = ipif->ipif_ill; 18883 18884 if (ire_ill->ill_group != ipif_ill->ill_group) { 18885 return; 18886 } 18887 } 18888 18889 18890 if (ire->ire_ipif != ipif) { 18891 /* 18892 * Look for a matching source address. 18893 */ 18894 if (ire->ire_type != IRE_CACHE) 18895 return; 18896 if (ipif->ipif_flags & IPIF_NOLOCAL) 18897 return; 18898 18899 if (ire->ire_ipversion == IPV4_VERSION) { 18900 if (ire->ire_src_addr != ipif->ipif_src_addr) 18901 return; 18902 } else { 18903 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 18904 &ipif->ipif_v6lcl_addr)) 18905 return; 18906 } 18907 ire_delete(ire); 18908 return; 18909 } 18910 /* 18911 * ire_delete() will do an ire_flush_cache which will delete 18912 * all ire_ipif matches 18913 */ 18914 ire_delete(ire); 18915 } 18916 18917 /* 18918 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 18919 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 18920 * 2) when an interface is brought up or down (on that ill). 18921 * This ensures that the IRE_CACHE entries don't retain stale source 18922 * address selection results. 18923 */ 18924 void 18925 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 18926 { 18927 ill_t *ill = (ill_t *)ill_arg; 18928 ill_t *ipif_ill; 18929 18930 ASSERT(IAM_WRITER_ILL(ill)); 18931 /* 18932 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18933 * Hence this should be IRE_CACHE. 18934 */ 18935 ASSERT(ire->ire_type == IRE_CACHE); 18936 18937 /* 18938 * We are called for IRE_CACHES whose ire_ipif matches ill. 18939 * We are only interested in IRE_CACHES that has borrowed 18940 * the source address from ill_arg e.g. ipif_up_done[_v6] 18941 * for which we need to look at ire_ipif->ipif_ill match 18942 * with ill. 
18943 */ 18944 ASSERT(ire->ire_ipif != NULL); 18945 ipif_ill = ire->ire_ipif->ipif_ill; 18946 if (ipif_ill == ill || (ill->ill_group != NULL && 18947 ipif_ill->ill_group == ill->ill_group)) { 18948 ire_delete(ire); 18949 } 18950 } 18951 18952 /* 18953 * Delete all the ire whose stq references ill_arg. 18954 */ 18955 static void 18956 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 18957 { 18958 ill_t *ill = (ill_t *)ill_arg; 18959 ill_t *ire_ill; 18960 18961 ASSERT(IAM_WRITER_ILL(ill)); 18962 /* 18963 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18964 * Hence this should be IRE_CACHE. 18965 */ 18966 ASSERT(ire->ire_type == IRE_CACHE); 18967 18968 /* 18969 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18970 * matches ill. We are only interested in IRE_CACHES that 18971 * has ire_stq->q_ptr pointing at ill_arg. Thus we do the 18972 * filtering here. 18973 */ 18974 ire_ill = (ill_t *)ire->ire_stq->q_ptr; 18975 18976 if (ire_ill == ill) 18977 ire_delete(ire); 18978 } 18979 18980 /* 18981 * This is called when an ill leaves the group. We want to delete 18982 * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is 18983 * pointing at ill. 18984 */ 18985 static void 18986 illgrp_cache_delete(ire_t *ire, char *ill_arg) 18987 { 18988 ill_t *ill = (ill_t *)ill_arg; 18989 18990 ASSERT(IAM_WRITER_ILL(ill)); 18991 ASSERT(ill->ill_group == NULL); 18992 /* 18993 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18994 * Hence this should be IRE_CACHE. 18995 */ 18996 ASSERT(ire->ire_type == IRE_CACHE); 18997 /* 18998 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18999 * matches ill. We are interested in both. 19000 */ 19001 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || 19002 (ire->ire_ipif->ipif_ill == ill)); 19003 19004 ire_delete(ire); 19005 } 19006 19007 /* 19008 * Initiate deallocate of an IPIF. Always called as writer. Called by 19009 * ill_delete or ip_sioctl_removeif. 
 */
static void
ipif_free(ipif_t *ipif)
{
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Cancel any pending recovery timeout before tearing down. */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	/* Remove conn references */
	reset_conn_ipif(ipif);

	/*
	 * Make sure we have valid net and subnet broadcast ire's for the
	 * other ipif's which share them with this ipif.
	 */
	if (!ipif->ipif_isv6)
		ipif_check_bcast_ires(ipif);

	/*
	 * Take down the interface. We can be called either from ill_delete
	 * or from ip_sioctl_removeif.  No queue/mblk is passed, so ipif_down
	 * returns without queueing a pending message and the caller waits
	 * for the refcnt to drop; the return value is deliberately ignored.
	 */
	(void) ipif_down(ipif, NULL, NULL);

	/*
	 * Now that the interface is down, there's no chance it can still
	 * become a duplicate.  Cancel any timer that may have been set while
	 * tearing down.
	 */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	rw_enter(&ill_g_lock, RW_WRITER);
	/* Remove pointers to this ipif in the multicast routing tables */
	reset_mrt_vif_ipif(ipif);
	rw_exit(&ill_g_lock);
}

/*
 * Final stage of freeing an ipif: release the saved-IRE list, drop the
 * multicast memberships, unlink the ipif from its ill's list and free the
 * memory.
 *
 * Warning: this is not the only function that calls mi_free on an ipif_t.  See
 * also ill_move().
 */
static void
ipif_free_tail(ipif_t *ipif)
{
	mblk_t	*mp;
	ipif_t	**ipifp;

	/*
	 * Free state for additional IRE_IF_[NO]RESOLVER ire's.  The list is
	 * detached under ipif_saved_ire_lock and freed after dropping it.
	 */
	mutex_enter(&ipif->ipif_saved_ire_lock);
	mp = ipif->ipif_saved_ire_mp;
	ipif->ipif_saved_ire_mp = NULL;
	mutex_exit(&ipif->ipif_saved_ire_lock);
	freemsg(mp);

	/*
	 * Need to hold both ill_g_lock and ill_lock while
	 * inserting or removing an ipif from the linked list
	 * of ipifs hanging off the ill.
	 */
	rw_enter(&ill_g_lock, RW_WRITER);
	/*
	 * Remove all multicast memberships on the interface now.
	 * This removes IPv4 multicast memberships joined within
	 * the kernel as ipif_down does not do ipif_multicast_down
	 * for IPv4. IPv6 is not handled here as the multicast memberships
	 * are based on ill and not on ipif.
	 */
	ilm_free(ipif);

	/*
	 * Since we held the ill_g_lock while doing the ilm_free above,
	 * we can assert the ilms were really deleted and not just marked
	 * ILM_DELETED.
	 */
	ASSERT(ilm_walk_ipif(ipif) == 0);


	IPIF_TRACE_CLEANUP(ipif);

	/* Ask SCTP to take it out of its list */
	sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);

	mutex_enter(&ipif->ipif_ill->ill_lock);
	/*
	 * Get it out of the ILL interface list.  ipifp walks the "next"
	 * link pointers themselves so that the matching link can be
	 * rewritten in place without tracking a previous element.
	 */
	ipifp = &ipif->ipif_ill->ill_ipif;
	for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
		if (*ipifp == ipif) {
			*ipifp = ipif->ipif_next;
			break;
		}
	}

	mutex_exit(&ipif->ipif_ill->ill_lock);
	rw_exit(&ill_g_lock);

	mutex_destroy(&ipif->ipif_saved_ire_lock);

	/* The ipif must be fully down with no recovery timer outstanding. */
	ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
	ASSERT(ipif->ipif_recovery_id == 0);

	/* Free the memory. */
	mi_free((char *)ipif);
}

/*
 * Returns an ipif name in the form "ill_name:unit" if ipif_id is not zero,
 * "ill_name" otherwise.
19123 */ 19124 char * 19125 ipif_get_name(const ipif_t *ipif, char *buf, int len) 19126 { 19127 char lbuf[32]; 19128 char *name; 19129 size_t name_len; 19130 19131 buf[0] = '\0'; 19132 if (!ipif) 19133 return (buf); 19134 name = ipif->ipif_ill->ill_name; 19135 name_len = ipif->ipif_ill->ill_name_length; 19136 if (ipif->ipif_id != 0) { 19137 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 19138 ipif->ipif_id); 19139 name = lbuf; 19140 name_len = mi_strlen(name) + 1; 19141 } 19142 len -= 1; 19143 buf[len] = '\0'; 19144 len = MIN(len, name_len); 19145 bcopy(name, buf, len); 19146 return (buf); 19147 } 19148 19149 /* 19150 * Find an IPIF based on the name passed in. Names can be of the 19151 * form <phys> (e.g., le0), <phys>:<#> (e.g., le0:1), 19152 * The <phys> string can have forms like <dev><#> (e.g., le0), 19153 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3). 19154 * When there is no colon, the implied unit id is zero. <phys> must 19155 * correspond to the name of an ILL. (May be called as writer.) 19156 */ 19157 static ipif_t * 19158 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 19159 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 19160 mblk_t *mp, ipsq_func_t func, int *error) 19161 { 19162 char *cp; 19163 char *endp; 19164 long id; 19165 ill_t *ill; 19166 ipif_t *ipif; 19167 uint_t ire_type; 19168 boolean_t did_alloc = B_FALSE; 19169 ipsq_t *ipsq; 19170 19171 if (error != NULL) 19172 *error = 0; 19173 19174 /* 19175 * If the caller wants to us to create the ipif, make sure we have a 19176 * valid zoneid 19177 */ 19178 ASSERT(!do_alloc || zoneid != ALL_ZONES); 19179 19180 if (namelen == 0) { 19181 if (error != NULL) 19182 *error = ENXIO; 19183 return (NULL); 19184 } 19185 19186 *exists = B_FALSE; 19187 /* Look for a colon in the name. 
*/ 19188 endp = &name[namelen]; 19189 for (cp = endp; --cp > name; ) { 19190 if (*cp == IPIF_SEPARATOR_CHAR) 19191 break; 19192 } 19193 19194 if (*cp == IPIF_SEPARATOR_CHAR) { 19195 /* 19196 * Reject any non-decimal aliases for logical 19197 * interfaces. Aliases with leading zeroes 19198 * are also rejected as they introduce ambiguity 19199 * in the naming of the interfaces. 19200 * In order to confirm with existing semantics, 19201 * and to not break any programs/script relying 19202 * on that behaviour, if<0>:0 is considered to be 19203 * a valid interface. 19204 * 19205 * If alias has two or more digits and the first 19206 * is zero, fail. 19207 */ 19208 if (&cp[2] < endp && cp[1] == '0') 19209 return (NULL); 19210 } 19211 19212 if (cp <= name) { 19213 cp = endp; 19214 } else { 19215 *cp = '\0'; 19216 } 19217 19218 /* 19219 * Look up the ILL, based on the portion of the name 19220 * before the slash. ill_lookup_on_name returns a held ill. 19221 * Temporary to check whether ill exists already. If so 19222 * ill_lookup_on_name will clear it. 19223 */ 19224 ill = ill_lookup_on_name(name, do_alloc, isv6, 19225 q, mp, func, error, &did_alloc); 19226 if (cp != endp) 19227 *cp = IPIF_SEPARATOR_CHAR; 19228 if (ill == NULL) 19229 return (NULL); 19230 19231 /* Establish the unit number in the name. */ 19232 id = 0; 19233 if (cp < endp && *endp == '\0') { 19234 /* If there was a colon, the unit number follows. */ 19235 cp++; 19236 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 19237 ill_refrele(ill); 19238 if (error != NULL) 19239 *error = ENXIO; 19240 return (NULL); 19241 } 19242 } 19243 19244 GRAB_CONN_LOCK(q); 19245 mutex_enter(&ill->ill_lock); 19246 /* Now see if there is an IPIF with this unit number. 
*/ 19247 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19248 if (ipif->ipif_id == id) { 19249 if (zoneid != ALL_ZONES && 19250 zoneid != ipif->ipif_zoneid && 19251 ipif->ipif_zoneid != ALL_ZONES) { 19252 mutex_exit(&ill->ill_lock); 19253 RELEASE_CONN_LOCK(q); 19254 ill_refrele(ill); 19255 if (error != NULL) 19256 *error = ENXIO; 19257 return (NULL); 19258 } 19259 /* 19260 * The block comment at the start of ipif_down 19261 * explains the use of the macros used below 19262 */ 19263 if (IPIF_CAN_LOOKUP(ipif)) { 19264 ipif_refhold_locked(ipif); 19265 mutex_exit(&ill->ill_lock); 19266 if (!did_alloc) 19267 *exists = B_TRUE; 19268 /* 19269 * Drop locks before calling ill_refrele 19270 * since it can potentially call into 19271 * ipif_ill_refrele_tail which can end up 19272 * in trying to acquire any lock. 19273 */ 19274 RELEASE_CONN_LOCK(q); 19275 ill_refrele(ill); 19276 return (ipif); 19277 } else if (IPIF_CAN_WAIT(ipif, q)) { 19278 ipsq = ill->ill_phyint->phyint_ipsq; 19279 mutex_enter(&ipsq->ipsq_lock); 19280 mutex_exit(&ill->ill_lock); 19281 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 19282 mutex_exit(&ipsq->ipsq_lock); 19283 RELEASE_CONN_LOCK(q); 19284 ill_refrele(ill); 19285 *error = EINPROGRESS; 19286 return (NULL); 19287 } 19288 } 19289 } 19290 RELEASE_CONN_LOCK(q); 19291 19292 if (!do_alloc) { 19293 mutex_exit(&ill->ill_lock); 19294 ill_refrele(ill); 19295 if (error != NULL) 19296 *error = ENXIO; 19297 return (NULL); 19298 } 19299 19300 /* 19301 * If none found, atomically allocate and return a new one. 19302 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 19303 * to support "receive only" use of lo0:1 etc. as is still done 19304 * below as an initial guess. 19305 * However, this is now likely to be overriden later in ipif_up_done() 19306 * when we know for sure what address has been configured on the 19307 * interface, since we might have more than one loopback interface 19308 * with a loopback address, e.g. 
in the case of zones, and all the 19309 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 19310 */ 19311 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 19312 ire_type = IRE_LOOPBACK; 19313 else 19314 ire_type = IRE_LOCAL; 19315 ipif = ipif_allocate(ill, id, ire_type, B_TRUE); 19316 if (ipif != NULL) 19317 ipif_refhold_locked(ipif); 19318 else if (error != NULL) 19319 *error = ENOMEM; 19320 mutex_exit(&ill->ill_lock); 19321 ill_refrele(ill); 19322 return (ipif); 19323 } 19324 19325 /* 19326 * This routine is called whenever a new address comes up on an ipif. If 19327 * we are configured to respond to address mask requests, then we are supposed 19328 * to broadcast an address mask reply at this time. This routine is also 19329 * called if we are already up, but a netmask change is made. This is legal 19330 * but might not make the system manager very popular. (May be called 19331 * as writer.) 19332 */ 19333 void 19334 ipif_mask_reply(ipif_t *ipif) 19335 { 19336 icmph_t *icmph; 19337 ipha_t *ipha; 19338 mblk_t *mp; 19339 19340 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 19341 19342 if (!ip_respond_to_address_mask_broadcast) 19343 return; 19344 19345 /* ICMP mask reply is IPv4 only */ 19346 ASSERT(!ipif->ipif_isv6); 19347 /* ICMP mask reply is not for a loopback interface */ 19348 ASSERT(ipif->ipif_ill->ill_wq != NULL); 19349 19350 mp = allocb(REPLY_LEN, BPRI_HI); 19351 if (mp == NULL) 19352 return; 19353 mp->b_wptr = mp->b_rptr + REPLY_LEN; 19354 19355 ipha = (ipha_t *)mp->b_rptr; 19356 bzero(ipha, REPLY_LEN); 19357 *ipha = icmp_ipha; 19358 ipha->ipha_ttl = ip_broadcast_ttl; 19359 ipha->ipha_src = ipif->ipif_src_addr; 19360 ipha->ipha_dst = ipif->ipif_brd_addr; 19361 ipha->ipha_length = htons(REPLY_LEN); 19362 ipha->ipha_ident = 0; 19363 19364 icmph = (icmph_t *)&ipha[1]; 19365 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 19366 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 19367 icmph->icmph_checksum = 
IP_CSUM(mp, sizeof (ipha_t), 0); 19368 if (icmph->icmph_checksum == 0) 19369 icmph->icmph_checksum = 0xffff; 19370 19371 put(ipif->ipif_wq, mp); 19372 19373 #undef REPLY_LEN 19374 } 19375 19376 /* 19377 * When the mtu in the ipif changes, we call this routine through ire_walk 19378 * to update all the relevant IREs. 19379 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19380 */ 19381 static void 19382 ipif_mtu_change(ire_t *ire, char *ipif_arg) 19383 { 19384 ipif_t *ipif = (ipif_t *)ipif_arg; 19385 19386 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 19387 return; 19388 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 19389 } 19390 19391 /* 19392 * When the mtu in the ill changes, we call this routine through ire_walk 19393 * to update all the relevant IREs. 19394 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19395 */ 19396 void 19397 ill_mtu_change(ire_t *ire, char *ill_arg) 19398 { 19399 ill_t *ill = (ill_t *)ill_arg; 19400 19401 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 19402 return; 19403 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 19404 } 19405 19406 /* 19407 * Join the ipif specific multicast groups. 19408 * Must be called after a mapping has been set up in the resolver. (Always 19409 * called as writer.) 
19410 */ 19411 void 19412 ipif_multicast_up(ipif_t *ipif) 19413 { 19414 int err, index; 19415 ill_t *ill; 19416 19417 ASSERT(IAM_WRITER_IPIF(ipif)); 19418 19419 ill = ipif->ipif_ill; 19420 index = ill->ill_phyint->phyint_ifindex; 19421 19422 ip1dbg(("ipif_multicast_up\n")); 19423 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 19424 return; 19425 19426 if (ipif->ipif_isv6) { 19427 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 19428 return; 19429 19430 /* Join the all hosts multicast address */ 19431 ip1dbg(("ipif_multicast_up - addmulti\n")); 19432 /* 19433 * Passing B_TRUE means we have to join the multicast 19434 * membership on this interface even though this is 19435 * FAILED. If we join on a different one in the group, 19436 * we will not be able to delete the membership later 19437 * as we currently don't track where we join when we 19438 * join within the kernel unlike applications where 19439 * we have ilg/ilg_orig_index. See ip_addmulti_v6 19440 * for more on this. 
19441 */ 19442 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, 19443 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19444 if (err != 0) { 19445 ip0dbg(("ipif_multicast_up: " 19446 "all_hosts_mcast failed %d\n", 19447 err)); 19448 return; 19449 } 19450 /* 19451 * Enable multicast for the solicited node multicast address 19452 */ 19453 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19454 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19455 19456 ipv6_multi.s6_addr32[3] |= 19457 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19458 19459 err = ip_addmulti_v6(&ipv6_multi, ill, index, 19460 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, 19461 NULL); 19462 if (err != 0) { 19463 ip0dbg(("ipif_multicast_up: solicited MC" 19464 " failed %d\n", err)); 19465 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, 19466 ill, ill->ill_phyint->phyint_ifindex, 19467 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19468 return; 19469 } 19470 } 19471 } else { 19472 if (ipif->ipif_lcl_addr == INADDR_ANY) 19473 return; 19474 19475 /* Join the all hosts multicast address */ 19476 ip1dbg(("ipif_multicast_up - addmulti\n")); 19477 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 19478 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19479 if (err) { 19480 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 19481 return; 19482 } 19483 } 19484 ipif->ipif_multicast_up = 1; 19485 } 19486 19487 /* 19488 * Blow away any IPv6 multicast groups that we joined in ipif_multicast_up(); 19489 * any explicit memberships are blown away in ill_leave_multicast() when the 19490 * ill is brought down. 19491 */ 19492 static void 19493 ipif_multicast_down(ipif_t *ipif) 19494 { 19495 int err; 19496 19497 ASSERT(IAM_WRITER_IPIF(ipif)); 19498 19499 ip1dbg(("ipif_multicast_down\n")); 19500 if (!ipif->ipif_multicast_up) 19501 return; 19502 19503 ASSERT(ipif->ipif_isv6); 19504 19505 ip1dbg(("ipif_multicast_down - delmulti\n")); 19506 19507 /* 19508 * Leave the all hosts multicast address. 
Similar to ip_addmulti_v6, 19509 * we should look for ilms on this ill rather than the ones that have 19510 * been failed over here. They are here temporarily. As 19511 * ipif_multicast_up has joined on this ill, we should delete only 19512 * from this ill. 19513 */ 19514 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 19515 ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, 19516 B_TRUE, B_TRUE); 19517 if (err != 0) { 19518 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", 19519 err)); 19520 } 19521 /* 19522 * Disable multicast for the solicited node multicast address 19523 */ 19524 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19525 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19526 19527 ipv6_multi.s6_addr32[3] |= 19528 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19529 19530 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 19531 ipif->ipif_ill->ill_phyint->phyint_ifindex, 19532 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19533 19534 if (err != 0) { 19535 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 19536 err)); 19537 } 19538 } 19539 19540 ipif->ipif_multicast_up = 0; 19541 } 19542 19543 /* 19544 * Used when an interface comes up to recreate any extra routes on this 19545 * interface. 
19546 */ 19547 static ire_t ** 19548 ipif_recover_ire(ipif_t *ipif) 19549 { 19550 mblk_t *mp; 19551 ire_t **ipif_saved_irep; 19552 ire_t **irep; 19553 19554 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 19555 ipif->ipif_id)); 19556 19557 mutex_enter(&ipif->ipif_saved_ire_lock); 19558 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 19559 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 19560 if (ipif_saved_irep == NULL) { 19561 mutex_exit(&ipif->ipif_saved_ire_lock); 19562 return (NULL); 19563 } 19564 19565 irep = ipif_saved_irep; 19566 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 19567 ire_t *ire; 19568 queue_t *rfq; 19569 queue_t *stq; 19570 ifrt_t *ifrt; 19571 uchar_t *src_addr; 19572 uchar_t *gateway_addr; 19573 mblk_t *resolver_mp; 19574 ushort_t type; 19575 19576 /* 19577 * When the ire was initially created and then added in 19578 * ip_rt_add(), it was created either using ipif->ipif_net_type 19579 * in the case of a traditional interface route, or as one of 19580 * the IRE_OFFSUBNET types (with the exception of 19581 * IRE_HOST types ire which is created by icmp_redirect() and 19582 * which we don't need to save or recover). In the case where 19583 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 19584 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 19585 * to satisfy software like GateD and Sun Cluster which creates 19586 * routes using the the loopback interface's address as a 19587 * gateway. 
19588 * 19589 * As ifrt->ifrt_type reflects the already updated ire_type and 19590 * since ire_create() expects that IRE_IF_NORESOLVER will have 19591 * a valid nce_res_mp field (which doesn't make sense for a 19592 * IRE_LOOPBACK), ire_create() will be called in the same way 19593 * here as in ip_rt_add(), namely using ipif->ipif_net_type when 19594 * the route looks like a traditional interface route (where 19595 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 19596 * the saved ifrt->ifrt_type. This means that in the case where 19597 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 19598 * ire_create() will be an IRE_LOOPBACK, it will then be turned 19599 * into an IRE_IF_NORESOLVER and then added by ire_add(). 19600 */ 19601 ifrt = (ifrt_t *)mp->b_rptr; 19602 if (ifrt->ifrt_type & IRE_INTERFACE) { 19603 rfq = NULL; 19604 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 19605 ? ipif->ipif_rq : ipif->ipif_wq; 19606 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19607 ? (uint8_t *)&ifrt->ifrt_src_addr 19608 : (uint8_t *)&ipif->ipif_src_addr; 19609 gateway_addr = NULL; 19610 resolver_mp = ipif->ipif_resolver_mp; 19611 type = ipif->ipif_net_type; 19612 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 19613 /* Recover multiroute broadcast IRE. */ 19614 rfq = ipif->ipif_rq; 19615 stq = ipif->ipif_wq; 19616 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19617 ? (uint8_t *)&ifrt->ifrt_src_addr 19618 : (uint8_t *)&ipif->ipif_src_addr; 19619 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19620 resolver_mp = ipif->ipif_bcast_mp; 19621 type = ifrt->ifrt_type; 19622 } else { 19623 rfq = NULL; 19624 stq = NULL; 19625 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19626 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 19627 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19628 resolver_mp = NULL; 19629 type = ifrt->ifrt_type; 19630 } 19631 19632 /* 19633 * Create a copy of the IRE with the saved address and netmask. 
19634 */ 19635 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 19636 "0x%x/0x%x\n", 19637 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 19638 ntohl(ifrt->ifrt_addr), 19639 ntohl(ifrt->ifrt_mask))); 19640 ire = ire_create( 19641 (uint8_t *)&ifrt->ifrt_addr, 19642 (uint8_t *)&ifrt->ifrt_mask, 19643 src_addr, 19644 gateway_addr, 19645 NULL, 19646 &ifrt->ifrt_max_frag, 19647 NULL, 19648 rfq, 19649 stq, 19650 type, 19651 resolver_mp, 19652 ipif, 19653 NULL, 19654 0, 19655 0, 19656 0, 19657 ifrt->ifrt_flags, 19658 &ifrt->ifrt_iulp_info, 19659 NULL, 19660 NULL); 19661 19662 if (ire == NULL) { 19663 mutex_exit(&ipif->ipif_saved_ire_lock); 19664 kmem_free(ipif_saved_irep, 19665 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 19666 return (NULL); 19667 } 19668 19669 /* 19670 * Some software (for example, GateD and Sun Cluster) attempts 19671 * to create (what amount to) IRE_PREFIX routes with the 19672 * loopback address as the gateway. This is primarily done to 19673 * set up prefixes with the RTF_REJECT flag set (for example, 19674 * when generating aggregate routes.) 19675 * 19676 * If the IRE type (as defined by ipif->ipif_net_type) is 19677 * IRE_LOOPBACK, then we map the request into a 19678 * IRE_IF_NORESOLVER. 19679 */ 19680 if (ipif->ipif_net_type == IRE_LOOPBACK) 19681 ire->ire_type = IRE_IF_NORESOLVER; 19682 /* 19683 * ire held by ire_add, will be refreled' towards the 19684 * the end of ipif_up_done 19685 */ 19686 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 19687 *irep = ire; 19688 irep++; 19689 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 19690 } 19691 mutex_exit(&ipif->ipif_saved_ire_lock); 19692 return (ipif_saved_irep); 19693 } 19694 19695 /* 19696 * Used to set the netmask and broadcast address to default values when the 19697 * interface is brought up. (Always called as writer.) 
19698 */ 19699 static void 19700 ipif_set_default(ipif_t *ipif) 19701 { 19702 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19703 19704 if (!ipif->ipif_isv6) { 19705 /* 19706 * Interface holds an IPv4 address. Default 19707 * mask is the natural netmask. 19708 */ 19709 if (!ipif->ipif_net_mask) { 19710 ipaddr_t v4mask; 19711 19712 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 19713 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 19714 } 19715 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19716 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19717 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19718 } else { 19719 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19720 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19721 } 19722 /* 19723 * NOTE: SunOS 4.X does this even if the broadcast address 19724 * has been already set thus we do the same here. 19725 */ 19726 if (ipif->ipif_flags & IPIF_BROADCAST) { 19727 ipaddr_t v4addr; 19728 19729 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 19730 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 19731 } 19732 } else { 19733 /* 19734 * Interface holds an IPv6-only address. Default 19735 * mask is all-ones. 19736 */ 19737 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 19738 ipif->ipif_v6net_mask = ipv6_all_ones; 19739 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19740 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19741 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19742 } else { 19743 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19744 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19745 } 19746 } 19747 } 19748 19749 /* 19750 * Return 0 if this address can be used as local address without causing 19751 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 19752 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 19753 * Special checks are needed to allow the same IPv6 link-local address 19754 * on different ills. 19755 * TODO: allowing the same site-local address on different ill's. 
19756 */ 19757 int 19758 ip_addr_availability_check(ipif_t *new_ipif) 19759 { 19760 in6_addr_t our_v6addr; 19761 ill_t *ill; 19762 ipif_t *ipif; 19763 ill_walk_context_t ctx; 19764 19765 ASSERT(IAM_WRITER_IPIF(new_ipif)); 19766 ASSERT(MUTEX_HELD(&ip_addr_avail_lock)); 19767 ASSERT(RW_READ_HELD(&ill_g_lock)); 19768 19769 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 19770 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 19771 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 19772 return (0); 19773 19774 our_v6addr = new_ipif->ipif_v6lcl_addr; 19775 19776 if (new_ipif->ipif_isv6) 19777 ill = ILL_START_WALK_V6(&ctx); 19778 else 19779 ill = ILL_START_WALK_V4(&ctx); 19780 19781 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19782 for (ipif = ill->ill_ipif; ipif != NULL; 19783 ipif = ipif->ipif_next) { 19784 if ((ipif == new_ipif) || 19785 !(ipif->ipif_flags & IPIF_UP) || 19786 (ipif->ipif_flags & IPIF_UNNUMBERED)) 19787 continue; 19788 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 19789 &our_v6addr)) { 19790 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 19791 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 19792 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 19793 ipif->ipif_flags |= IPIF_UNNUMBERED; 19794 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && 19795 new_ipif->ipif_ill != ill) 19796 continue; 19797 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && 19798 new_ipif->ipif_ill != ill) 19799 continue; 19800 else if (new_ipif->ipif_zoneid != 19801 ipif->ipif_zoneid && 19802 ipif->ipif_zoneid != ALL_ZONES && 19803 (ill->ill_phyint->phyint_flags & 19804 PHYI_LOOPBACK)) 19805 continue; 19806 else if (new_ipif->ipif_ill == ill) 19807 return (EADDRINUSE); 19808 else 19809 return (EADDRNOTAVAIL); 19810 } 19811 } 19812 } 19813 19814 return (0); 19815 } 19816 19817 /* 19818 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 19819 * IREs for the ipif. 
19820 * When the routine returns EINPROGRESS then mp has been consumed and 19821 * the ioctl will be acked from ip_rput_dlpi. 19822 */ 19823 static int 19824 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 19825 { 19826 ill_t *ill = ipif->ipif_ill; 19827 boolean_t isv6 = ipif->ipif_isv6; 19828 int err = 0; 19829 boolean_t success; 19830 19831 ASSERT(IAM_WRITER_IPIF(ipif)); 19832 19833 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 19834 19835 /* Shouldn't get here if it is already up. */ 19836 if (ipif->ipif_flags & IPIF_UP) 19837 return (EALREADY); 19838 19839 /* Skip arp/ndp for any loopback interface. */ 19840 if (ill->ill_wq != NULL) { 19841 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 19842 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19843 19844 if (!ill->ill_dl_up) { 19845 /* 19846 * ill_dl_up is not yet set. i.e. we are yet to 19847 * DL_BIND with the driver and this is the first 19848 * logical interface on the ill to become "up". 19849 * Tell the driver to get going (via DL_BIND_REQ). 19850 * Note that changing "significant" IFF_ flags 19851 * address/netmask etc cause a down/up dance, but 19852 * does not cause an unbind (DL_UNBIND) with the driver 19853 */ 19854 return (ill_dl_up(ill, ipif, mp, q)); 19855 } 19856 19857 /* 19858 * ipif_resolver_up may end up sending an 19859 * AR_INTERFACE_UP message to ARP, which would, in 19860 * turn send a DLPI message to the driver. ioctls are 19861 * serialized and so we cannot send more than one 19862 * interface up message at a time. If ipif_resolver_up 19863 * does send an interface up message to ARP, we get 19864 * EINPROGRESS and we will complete in ip_arp_done. 
19865 */ 19866 19867 ASSERT(connp != NULL || !CONN_Q(q)); 19868 ASSERT(ipsq->ipsq_pending_mp == NULL); 19869 if (connp != NULL) 19870 mutex_enter(&connp->conn_lock); 19871 mutex_enter(&ill->ill_lock); 19872 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19873 mutex_exit(&ill->ill_lock); 19874 if (connp != NULL) 19875 mutex_exit(&connp->conn_lock); 19876 if (!success) 19877 return (EINTR); 19878 19879 /* 19880 * Crank up IPv6 neighbor discovery 19881 * Unlike ARP, this should complete when 19882 * ipif_ndp_up returns. However, for 19883 * ILLF_XRESOLV interfaces we also send a 19884 * AR_INTERFACE_UP to the external resolver. 19885 * That ioctl will complete in ip_rput. 19886 */ 19887 if (isv6) { 19888 err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr); 19889 if (err != 0) { 19890 if (err != EINPROGRESS) 19891 mp = ipsq_pending_mp_get(ipsq, &connp); 19892 return (err); 19893 } 19894 } 19895 /* Now, ARP */ 19896 err = ipif_resolver_up(ipif, Res_act_initial); 19897 if (err == EINPROGRESS) { 19898 /* We will complete it in ip_arp_done */ 19899 return (err); 19900 } 19901 mp = ipsq_pending_mp_get(ipsq, &connp); 19902 ASSERT(mp != NULL); 19903 if (err != 0) 19904 return (err); 19905 } else { 19906 /* 19907 * Interfaces without underlying hardware don't do duplicate 19908 * address detection. 19909 */ 19910 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 19911 ipif->ipif_addr_ready = 1; 19912 } 19913 return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 19914 } 19915 19916 /* 19917 * Perform a bind for the physical device. 19918 * When the routine returns EINPROGRESS then mp has been consumed and 19919 * the ioctl will be acked from ip_rput_dlpi. 19920 * Allocate an unbind message and save it until ipif_down. 
19921 */ 19922 static int 19923 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 19924 { 19925 mblk_t *areq_mp = NULL; 19926 mblk_t *bind_mp = NULL; 19927 mblk_t *unbind_mp = NULL; 19928 conn_t *connp; 19929 boolean_t success; 19930 19931 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 19932 ASSERT(IAM_WRITER_ILL(ill)); 19933 19934 ASSERT(mp != NULL); 19935 19936 /* Create a resolver cookie for ARP */ 19937 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 19938 areq_t *areq; 19939 uint16_t sap_addr; 19940 19941 areq_mp = ill_arp_alloc(ill, 19942 (uchar_t *)&ip_areq_template, 0); 19943 if (areq_mp == NULL) { 19944 return (ENOMEM); 19945 } 19946 freemsg(ill->ill_resolver_mp); 19947 ill->ill_resolver_mp = areq_mp; 19948 areq = (areq_t *)areq_mp->b_rptr; 19949 sap_addr = ill->ill_sap; 19950 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 19951 /* 19952 * Wait till we call ill_pending_mp_add to determine 19953 * the success before we free the ill_resolver_mp and 19954 * attach areq_mp in it's place. 19955 */ 19956 } 19957 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 19958 DL_BIND_REQ); 19959 if (bind_mp == NULL) 19960 goto bad; 19961 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 19962 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 19963 19964 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 19965 if (unbind_mp == NULL) 19966 goto bad; 19967 19968 /* 19969 * Record state needed to complete this operation when the 19970 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 
19971 */ 19972 ASSERT(WR(q)->q_next == NULL); 19973 connp = Q_TO_CONN(q); 19974 19975 mutex_enter(&connp->conn_lock); 19976 mutex_enter(&ipif->ipif_ill->ill_lock); 19977 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19978 mutex_exit(&ipif->ipif_ill->ill_lock); 19979 mutex_exit(&connp->conn_lock); 19980 if (!success) 19981 goto bad; 19982 19983 /* 19984 * Save the unbind message for ill_dl_down(); it will be consumed when 19985 * the interface goes down. 19986 */ 19987 ASSERT(ill->ill_unbind_mp == NULL); 19988 ill->ill_unbind_mp = unbind_mp; 19989 19990 ill_dlpi_send(ill, bind_mp); 19991 /* Send down link-layer capabilities probe if not already done. */ 19992 ill_capability_probe(ill); 19993 19994 /* 19995 * Sysid used to rely on the fact that netboots set domainname 19996 * and the like. Now that miniroot boots aren't strictly netboots 19997 * and miniroot network configuration is driven from userland 19998 * these things still need to be set. This situation can be detected 19999 * by comparing the interface being configured here to the one 20000 * dhcack was set to reference by the boot loader. Once sysid is 20001 * converted to use dhcp_ipc_getinfo() this call can go away. 20002 */ 20003 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && (dhcack != NULL) && 20004 (strcmp(ill->ill_name, dhcack) == 0) && 20005 (strlen(srpc_domain) == 0)) { 20006 if (dhcpinit() != 0) 20007 cmn_err(CE_WARN, "no cached dhcp response"); 20008 } 20009 20010 /* 20011 * This operation will complete in ip_rput_dlpi with either 20012 * a DL_BIND_ACK or DL_ERROR_ACK. 20013 */ 20014 return (EINPROGRESS); 20015 bad: 20016 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 20017 /* 20018 * We don't have to check for possible removal from illgrp 20019 * as we have not yet inserted in illgrp. For groups 20020 * without names, this ipif is still not UP and hence 20021 * this could not have possibly had any influence in forming 20022 * groups. 
20023 */ 20024 20025 freemsg(bind_mp); 20026 freemsg(unbind_mp); 20027 return (ENOMEM); 20028 } 20029 20030 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 20031 20032 /* 20033 * DLPI and ARP is up. 20034 * Create all the IREs associated with an interface bring up multicast. 20035 * Set the interface flag and finish other initialization 20036 * that potentially had to be differed to after DL_BIND_ACK. 20037 */ 20038 int 20039 ipif_up_done(ipif_t *ipif) 20040 { 20041 ire_t *ire_array[20]; 20042 ire_t **irep = ire_array; 20043 ire_t **irep1; 20044 ipaddr_t net_mask = 0; 20045 ipaddr_t subnet_mask, route_mask; 20046 ill_t *ill = ipif->ipif_ill; 20047 queue_t *stq; 20048 ipif_t *src_ipif; 20049 ipif_t *tmp_ipif; 20050 boolean_t flush_ire_cache = B_TRUE; 20051 int err = 0; 20052 phyint_t *phyi; 20053 ire_t **ipif_saved_irep = NULL; 20054 int ipif_saved_ire_cnt; 20055 int cnt; 20056 boolean_t src_ipif_held = B_FALSE; 20057 boolean_t ire_added = B_FALSE; 20058 boolean_t loopback = B_FALSE; 20059 20060 ip1dbg(("ipif_up_done(%s:%u)\n", 20061 ipif->ipif_ill->ill_name, ipif->ipif_id)); 20062 /* Check if this is a loopback interface */ 20063 if (ipif->ipif_ill->ill_wq == NULL) 20064 loopback = B_TRUE; 20065 20066 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 20067 /* 20068 * If all other interfaces for this ill are down or DEPRECATED, 20069 * or otherwise unsuitable for source address selection, remove 20070 * any IRE_CACHE entries for this ill to make sure source 20071 * address selection gets to take this new ipif into account. 
20072 * No need to hold ill_lock while traversing the ipif list since 20073 * we are writer 20074 */ 20075 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 20076 tmp_ipif = tmp_ipif->ipif_next) { 20077 if (((tmp_ipif->ipif_flags & 20078 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 20079 !(tmp_ipif->ipif_flags & IPIF_UP)) || 20080 (tmp_ipif == ipif)) 20081 continue; 20082 /* first useable pre-existing interface */ 20083 flush_ire_cache = B_FALSE; 20084 break; 20085 } 20086 if (flush_ire_cache) 20087 ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, 20088 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 20089 20090 /* 20091 * Figure out which way the send-to queue should go. Only 20092 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 20093 * should show up here. 20094 */ 20095 switch (ill->ill_net_type) { 20096 case IRE_IF_RESOLVER: 20097 stq = ill->ill_rq; 20098 break; 20099 case IRE_IF_NORESOLVER: 20100 case IRE_LOOPBACK: 20101 stq = ill->ill_wq; 20102 break; 20103 default: 20104 return (EINVAL); 20105 } 20106 20107 if (ill->ill_phyint->phyint_flags & PHYI_LOOPBACK) { 20108 /* 20109 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 20110 * ipif_lookup_on_name(), but in the case of zones we can have 20111 * several loopback addresses on lo0. So all the interfaces with 20112 * loopback addresses need to be marked IRE_LOOPBACK. 20113 */ 20114 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 20115 htonl(INADDR_LOOPBACK)) 20116 ipif->ipif_ire_type = IRE_LOOPBACK; 20117 else 20118 ipif->ipif_ire_type = IRE_LOCAL; 20119 } 20120 20121 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { 20122 /* 20123 * Can't use our source address. 
Select a different 20124 * source address for the IRE_INTERFACE and IRE_LOCAL 20125 */ 20126 src_ipif = ipif_select_source(ipif->ipif_ill, 20127 ipif->ipif_subnet, ipif->ipif_zoneid); 20128 if (src_ipif == NULL) 20129 src_ipif = ipif; /* Last resort */ 20130 else 20131 src_ipif_held = B_TRUE; 20132 } else { 20133 src_ipif = ipif; 20134 } 20135 20136 /* Create all the IREs associated with this interface */ 20137 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 20138 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 20139 20140 /* 20141 * If we're on a labeled system then make sure that zone- 20142 * private addresses have proper remote host database entries. 20143 */ 20144 if (is_system_labeled() && 20145 ipif->ipif_ire_type != IRE_LOOPBACK && 20146 !tsol_check_interface_address(ipif)) 20147 return (EINVAL); 20148 20149 /* Register the source address for __sin6_src_id */ 20150 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 20151 ipif->ipif_zoneid); 20152 if (err != 0) { 20153 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 20154 return (err); 20155 } 20156 20157 /* If the interface address is set, create the local IRE. */ 20158 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 20159 (void *)ipif, 20160 ipif->ipif_ire_type, 20161 ntohl(ipif->ipif_lcl_addr))); 20162 *irep++ = ire_create( 20163 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 20164 (uchar_t *)&ip_g_all_ones, /* mask */ 20165 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 20166 NULL, /* no gateway */ 20167 NULL, 20168 &ip_loopback_mtuplus, /* max frag size */ 20169 NULL, 20170 ipif->ipif_rq, /* recv-from queue */ 20171 NULL, /* no send-to queue */ 20172 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 20173 NULL, 20174 ipif, 20175 NULL, 20176 0, 20177 0, 20178 0, 20179 (ipif->ipif_flags & IPIF_PRIVATE) ? 
20180 RTF_PRIVATE : 0, 20181 &ire_uinfo_null, 20182 NULL, 20183 NULL); 20184 } else { 20185 ip1dbg(( 20186 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 20187 ipif->ipif_ire_type, 20188 ntohl(ipif->ipif_lcl_addr), 20189 (uint_t)ipif->ipif_flags)); 20190 } 20191 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 20192 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 20193 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 20194 } else { 20195 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 20196 } 20197 20198 subnet_mask = ipif->ipif_net_mask; 20199 20200 /* 20201 * If mask was not specified, use natural netmask of 20202 * interface address. Also, store this mask back into the 20203 * ipif struct. 20204 */ 20205 if (subnet_mask == 0) { 20206 subnet_mask = net_mask; 20207 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 20208 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 20209 ipif->ipif_v6subnet); 20210 } 20211 20212 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 20213 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 20214 ipif->ipif_subnet != INADDR_ANY) { 20215 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 20216 20217 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 20218 route_mask = IP_HOST_MASK; 20219 } else { 20220 route_mask = subnet_mask; 20221 } 20222 20223 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 20224 "creating if IRE ill_net_type 0x%x for 0x%x\n", 20225 (void *)ipif, (void *)ill, 20226 ill->ill_net_type, 20227 ntohl(ipif->ipif_subnet))); 20228 *irep++ = ire_create( 20229 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 20230 (uchar_t *)&route_mask, /* mask */ 20231 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 20232 NULL, /* no gateway */ 20233 NULL, 20234 &ipif->ipif_mtu, /* max frag */ 20235 NULL, 20236 NULL, /* no recv queue */ 20237 stq, /* send-to queue */ 20238 ill->ill_net_type, /* IF_[NO]RESOLVER */ 20239 ill->ill_resolver_mp, /* xmit header */ 20240 ipif, 20241 NULL, 20242 0, 20243 0, 20244 0, 20245 
(ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 20246 &ire_uinfo_null, 20247 NULL, 20248 NULL); 20249 } 20250 20251 /* 20252 * If the interface address is set, create the broadcast IREs. 20253 * 20254 * ire_create_bcast checks if the proposed new IRE matches 20255 * any existing IRE's with the same physical interface (ILL). 20256 * This should get rid of duplicates. 20257 * ire_create_bcast also check IPIF_NOXMIT and does not create 20258 * any broadcast ires. 20259 */ 20260 if ((ipif->ipif_subnet != INADDR_ANY) && 20261 (ipif->ipif_flags & IPIF_BROADCAST)) { 20262 ipaddr_t addr; 20263 20264 ip1dbg(("ipif_up_done: creating broadcast IRE\n")); 20265 irep = ire_check_and_create_bcast(ipif, 0, irep, 20266 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20267 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, 20268 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20269 20270 /* 20271 * For backward compatibility, we need to create net 20272 * broadcast ire's based on the old "IP address class 20273 * system." The reason is that some old machines only 20274 * respond to these class derived net broadcast. 20275 * 20276 * But we should not create these net broadcast ire's if 20277 * the subnet_mask is shorter than the IP address class based 20278 * derived netmask. Otherwise, we may create a net 20279 * broadcast address which is the same as an IP address 20280 * on the subnet. Then TCP will refuse to talk to that 20281 * address. 20282 * 20283 * Nor do we need IRE_BROADCAST ire's for the interface 20284 * with the netmask as 0xFFFFFFFF, as IRE_LOCAL for that 20285 * interface is already created. Creating these broadcast 20286 * ire's will only create confusion as the "addr" is going 20287 * to be same as that of the IP address of the interface. 
20288 */ 20289 if (net_mask < subnet_mask) { 20290 addr = net_mask & ipif->ipif_subnet; 20291 irep = ire_check_and_create_bcast(ipif, addr, irep, 20292 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20293 irep = ire_check_and_create_bcast(ipif, 20294 ~net_mask | addr, irep, 20295 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20296 } 20297 20298 if (subnet_mask != 0xFFFFFFFF) { 20299 addr = ipif->ipif_subnet; 20300 irep = ire_check_and_create_bcast(ipif, addr, irep, 20301 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20302 irep = ire_check_and_create_bcast(ipif, 20303 ~subnet_mask|addr, irep, 20304 (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20305 } 20306 } 20307 20308 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 20309 20310 /* If an earlier ire_create failed, get out now */ 20311 for (irep1 = irep; irep1 > ire_array; ) { 20312 irep1--; 20313 if (*irep1 == NULL) { 20314 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 20315 err = ENOMEM; 20316 goto bad; 20317 } 20318 } 20319 20320 /* 20321 * Need to atomically check for ip_addr_availablity_check 20322 * under ip_addr_avail_lock, and if it fails got bad, and remove 20323 * from group also.The ill_g_lock is grabbed as reader 20324 * just to make sure no new ills or new ipifs are being added 20325 * to the system while we are checking the uniqueness of addresses. 20326 */ 20327 rw_enter(&ill_g_lock, RW_READER); 20328 mutex_enter(&ip_addr_avail_lock); 20329 /* Mark it up, and increment counters. */ 20330 ipif->ipif_flags |= IPIF_UP; 20331 ill->ill_ipif_up_count++; 20332 err = ip_addr_availability_check(ipif); 20333 mutex_exit(&ip_addr_avail_lock); 20334 rw_exit(&ill_g_lock); 20335 20336 if (err != 0) { 20337 /* 20338 * Our address may already be up on the same ill. In this case, 20339 * the ARP entry for our ipif replaced the one for the other 20340 * ipif. So we don't want to delete it (otherwise the other ipif 20341 * would be unable to send packets). 
20342 * ip_addr_availability_check() identifies this case for us and 20343 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 20344 * which is the expected error code. 20345 */ 20346 if (err == EADDRINUSE) { 20347 freemsg(ipif->ipif_arp_del_mp); 20348 ipif->ipif_arp_del_mp = NULL; 20349 err = EADDRNOTAVAIL; 20350 } 20351 ill->ill_ipif_up_count--; 20352 ipif->ipif_flags &= ~IPIF_UP; 20353 goto bad; 20354 } 20355 20356 /* 20357 * Add in all newly created IREs. ire_create_bcast() has 20358 * already checked for duplicates of the IRE_BROADCAST type. 20359 * We want to add before we call ifgrp_insert which wants 20360 * to know whether IRE_IF_RESOLVER exists or not. 20361 * 20362 * NOTE : We refrele the ire though we may branch to "bad" 20363 * later on where we do ire_delete. This is okay 20364 * because nobody can delete it as we are running 20365 * exclusively. 20366 */ 20367 for (irep1 = irep; irep1 > ire_array; ) { 20368 irep1--; 20369 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 20370 /* 20371 * refheld by ire_add. refele towards the end of the func 20372 */ 20373 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 20374 } 20375 ire_added = B_TRUE; 20376 /* 20377 * Form groups if possible. 20378 * 20379 * If we are supposed to be in a ill_group with a name, insert it 20380 * now as we know that at least one ipif is UP. Otherwise form 20381 * nameless groups. 20382 * 20383 * If ip_enable_group_ifs is set and ipif address is not 0, insert 20384 * this ipif into the appropriate interface group, or create a 20385 * new one. If this is already in a nameless group, we try to form 20386 * a bigger group looking at other ills potentially sharing this 20387 * ipif's prefix. 
20388 */ 20389 phyi = ill->ill_phyint; 20390 if (phyi->phyint_groupname_len != 0) { 20391 ASSERT(phyi->phyint_groupname != NULL); 20392 if (ill->ill_ipif_up_count == 1) { 20393 ASSERT(ill->ill_group == NULL); 20394 err = illgrp_insert(&illgrp_head_v4, ill, 20395 phyi->phyint_groupname, NULL, B_TRUE); 20396 if (err != 0) { 20397 ip1dbg(("ipif_up_done: illgrp allocation " 20398 "failed, error %d\n", err)); 20399 goto bad; 20400 } 20401 } 20402 ASSERT(ill->ill_group != NULL); 20403 } 20404 20405 /* 20406 * When this is part of group, we need to make sure that 20407 * any broadcast ires created because of this ipif coming 20408 * UP gets marked/cleared with IRE_MARK_NORECV appropriately 20409 * so that we don't receive duplicate broadcast packets. 20410 */ 20411 if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) 20412 ipif_renominate_bcast(ipif); 20413 20414 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 20415 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 20416 ipif_saved_irep = ipif_recover_ire(ipif); 20417 20418 if (!loopback) { 20419 /* 20420 * If the broadcast address has been set, make sure it makes 20421 * sense based on the interface address. 20422 * Only match on ill since we are sharing broadcast addresses. 20423 */ 20424 if ((ipif->ipif_brd_addr != INADDR_ANY) && 20425 (ipif->ipif_flags & IPIF_BROADCAST)) { 20426 ire_t *ire; 20427 20428 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 20429 IRE_BROADCAST, ipif, ALL_ZONES, 20430 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL)); 20431 20432 if (ire == NULL) { 20433 /* 20434 * If there isn't a matching broadcast IRE, 20435 * revert to the default for this netmask. 
20436 */ 20437 ipif->ipif_v6brd_addr = ipv6_all_zeros; 20438 mutex_enter(&ipif->ipif_ill->ill_lock); 20439 ipif_set_default(ipif); 20440 mutex_exit(&ipif->ipif_ill->ill_lock); 20441 } else { 20442 ire_refrele(ire); 20443 } 20444 } 20445 20446 } 20447 20448 /* This is the first interface on this ill */ 20449 if (ipif->ipif_ipif_up_count == 1 && !loopback) { 20450 /* 20451 * Need to recover all multicast memberships in the driver. 20452 * This had to be deferred until we had attached. 20453 */ 20454 ill_recover_multicast(ill); 20455 } 20456 /* Join the allhosts multicast address */ 20457 ipif_multicast_up(ipif); 20458 20459 if (!loopback) { 20460 /* 20461 * See whether anybody else would benefit from the 20462 * new ipif that we added. We call this always rather 20463 * than while adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST 20464 * ipif is for the benefit of illgrp_insert (done above) 20465 * which does not do source address selection as it does 20466 * not want to re-create interface routes that we are 20467 * having reference to it here. 20468 */ 20469 ill_update_source_selection(ill); 20470 } 20471 20472 for (irep1 = irep; irep1 > ire_array; ) { 20473 irep1--; 20474 if (*irep1 != NULL) { 20475 /* was held in ire_add */ 20476 ire_refrele(*irep1); 20477 } 20478 } 20479 20480 cnt = ipif_saved_ire_cnt; 20481 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 20482 if (*irep1 != NULL) { 20483 /* was held in ire_add */ 20484 ire_refrele(*irep1); 20485 } 20486 } 20487 20488 if (!loopback && ipif->ipif_addr_ready) { 20489 /* Broadcast an address mask reply. */ 20490 ipif_mask_reply(ipif); 20491 } 20492 if (ipif_saved_irep != NULL) { 20493 kmem_free(ipif_saved_irep, 20494 ipif_saved_ire_cnt * sizeof (ire_t *)); 20495 } 20496 if (src_ipif_held) 20497 ipif_refrele(src_ipif); 20498 20499 /* 20500 * This had to be deferred until we had bound. Tell routing sockets and 20501 * others that this interface is up if it looks like the address has 20502 * been validated. 
Otherwise, if it isn't ready yet, wait for 20503 * duplicate address detection to do its thing. 20504 */ 20505 if (ipif->ipif_addr_ready) { 20506 ip_rts_ifmsg(ipif); 20507 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 20508 /* Let SCTP update the status for this ipif */ 20509 sctp_update_ipif(ipif, SCTP_IPIF_UP); 20510 } 20511 return (0); 20512 20513 bad: 20514 ip1dbg(("ipif_up_done: FAILED \n")); 20515 /* 20516 * We don't have to bother removing from ill groups because 20517 * 20518 * 1) For groups with names, we insert only when the first ipif 20519 * comes up. In that case if it fails, it will not be in any 20520 * group. So, we need not try to remove for that case. 20521 * 20522 * 2) For groups without names, either we tried to insert ipif_ill 20523 * in a group as singleton or found some other group to become 20524 * a bigger group. For the former, if it fails we don't have 20525 * anything to do as ipif_ill is not in the group and for the 20526 * latter, there are no failures in illgrp_insert/illgrp_delete 20527 * (ENOMEM can't occur for this. Check ifgrp_insert). 20528 */ 20529 while (irep > ire_array) { 20530 irep--; 20531 if (*irep != NULL) { 20532 ire_delete(*irep); 20533 if (ire_added) 20534 ire_refrele(*irep); 20535 } 20536 } 20537 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid); 20538 20539 if (ipif_saved_irep != NULL) { 20540 kmem_free(ipif_saved_irep, 20541 ipif_saved_ire_cnt * sizeof (ire_t *)); 20542 } 20543 if (src_ipif_held) 20544 ipif_refrele(src_ipif); 20545 20546 ipif_arp_down(ipif); 20547 return (err); 20548 } 20549 20550 /* 20551 * Turn off the ARP with the ILLF_NOARP flag. 
20552 */ 20553 static int 20554 ill_arp_off(ill_t *ill) 20555 { 20556 mblk_t *arp_off_mp = NULL; 20557 mblk_t *arp_on_mp = NULL; 20558 20559 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 20560 20561 ASSERT(IAM_WRITER_ILL(ill)); 20562 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20563 20564 /* 20565 * If the on message is still around we've already done 20566 * an arp_off without doing an arp_on thus there is no 20567 * work needed. 20568 */ 20569 if (ill->ill_arp_on_mp != NULL) 20570 return (0); 20571 20572 /* 20573 * Allocate an ARP on message (to be saved) and an ARP off message 20574 */ 20575 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 20576 if (!arp_off_mp) 20577 return (ENOMEM); 20578 20579 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 20580 if (!arp_on_mp) 20581 goto failed; 20582 20583 ASSERT(ill->ill_arp_on_mp == NULL); 20584 ill->ill_arp_on_mp = arp_on_mp; 20585 20586 /* Send an AR_INTERFACE_OFF request */ 20587 putnext(ill->ill_rq, arp_off_mp); 20588 return (0); 20589 failed: 20590 20591 if (arp_off_mp) 20592 freemsg(arp_off_mp); 20593 return (ENOMEM); 20594 } 20595 20596 /* 20597 * Turn on ARP by turning off the ILLF_NOARP flag. 20598 */ 20599 static int 20600 ill_arp_on(ill_t *ill) 20601 { 20602 mblk_t *mp; 20603 20604 ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); 20605 20606 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20607 20608 ASSERT(IAM_WRITER_ILL(ill)); 20609 /* 20610 * Send an AR_INTERFACE_ON request if we have already done 20611 * an arp_off (which allocated the message). 20612 */ 20613 if (ill->ill_arp_on_mp != NULL) { 20614 mp = ill->ill_arp_on_mp; 20615 ill->ill_arp_on_mp = NULL; 20616 putnext(ill->ill_rq, mp); 20617 } 20618 return (0); 20619 } 20620 20621 /* 20622 * Called after either deleting ill from the group or when setting 20623 * FAILED or STANDBY on the interface. 
20624 */ 20625 static void 20626 illgrp_reset_schednext(ill_t *ill) 20627 { 20628 ill_group_t *illgrp; 20629 ill_t *save_ill; 20630 20631 ASSERT(IAM_WRITER_ILL(ill)); 20632 /* 20633 * When called from illgrp_delete, ill_group will be non-NULL. 20634 * But when called from ip_sioctl_flags, it could be NULL if 20635 * somebody is setting FAILED/INACTIVE on some interface which 20636 * is not part of a group. 20637 */ 20638 illgrp = ill->ill_group; 20639 if (illgrp == NULL) 20640 return; 20641 if (illgrp->illgrp_ill_schednext != ill) 20642 return; 20643 20644 illgrp->illgrp_ill_schednext = NULL; 20645 save_ill = ill; 20646 /* 20647 * Choose a good ill to be the next one for 20648 * outbound traffic. As the flags FAILED/STANDBY is 20649 * not yet marked when called from ip_sioctl_flags, 20650 * we check for ill separately. 20651 */ 20652 for (ill = illgrp->illgrp_ill; ill != NULL; 20653 ill = ill->ill_group_next) { 20654 if ((ill != save_ill) && 20655 !(ill->ill_phyint->phyint_flags & 20656 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { 20657 illgrp->illgrp_ill_schednext = ill; 20658 return; 20659 } 20660 } 20661 } 20662 20663 /* 20664 * Given an ill, find the next ill in the group to be scheduled. 20665 * (This should be called by ip_newroute() before ire_create().) 20666 * The passed in ill may be pulled out of the group, after we have picked 20667 * up a different outgoing ill from the same group. However ire add will 20668 * atomically check this. 20669 */ 20670 ill_t * 20671 illgrp_scheduler(ill_t *ill) 20672 { 20673 ill_t *retill; 20674 ill_group_t *illgrp; 20675 int illcnt; 20676 int i; 20677 uint64_t flags; 20678 20679 /* 20680 * We don't use a lock to check for the ill_group. If this ill 20681 * is currently being inserted we may end up just returning this 20682 * ill itself. That is ok. 
20683 */ 20684 if (ill->ill_group == NULL) { 20685 ill_refhold(ill); 20686 return (ill); 20687 } 20688 20689 /* 20690 * Grab the ill_g_lock as reader to make sure we are dealing with 20691 * a set of stable ills. No ill can be added or deleted or change 20692 * group while we hold the reader lock. 20693 */ 20694 rw_enter(&ill_g_lock, RW_READER); 20695 if ((illgrp = ill->ill_group) == NULL) { 20696 rw_exit(&ill_g_lock); 20697 ill_refhold(ill); 20698 return (ill); 20699 } 20700 20701 illcnt = illgrp->illgrp_ill_count; 20702 mutex_enter(&illgrp->illgrp_lock); 20703 retill = illgrp->illgrp_ill_schednext; 20704 20705 if (retill == NULL) 20706 retill = illgrp->illgrp_ill; 20707 20708 /* 20709 * We do a circular search beginning at illgrp_ill_schednext 20710 * or illgrp_ill. We don't check the flags against the ill lock 20711 * since it can change anytime. The ire creation will be atomic 20712 * and will fail if the ill is FAILED or OFFLINE. 20713 */ 20714 for (i = 0; i < illcnt; i++) { 20715 flags = retill->ill_phyint->phyint_flags; 20716 20717 if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 20718 ILL_CAN_LOOKUP(retill)) { 20719 illgrp->illgrp_ill_schednext = retill->ill_group_next; 20720 ill_refhold(retill); 20721 break; 20722 } 20723 retill = retill->ill_group_next; 20724 if (retill == NULL) 20725 retill = illgrp->illgrp_ill; 20726 } 20727 mutex_exit(&illgrp->illgrp_lock); 20728 rw_exit(&ill_g_lock); 20729 20730 return (i == illcnt ? NULL : retill); 20731 } 20732 20733 /* 20734 * Checks for availbility of a usable source address (if there is one) when the 20735 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 20736 * this selection is done regardless of the destination. 
20737 */ 20738 boolean_t 20739 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 20740 { 20741 uint_t ifindex; 20742 ipif_t *ipif = NULL; 20743 ill_t *uill; 20744 boolean_t isv6; 20745 20746 ASSERT(ill != NULL); 20747 20748 isv6 = ill->ill_isv6; 20749 ifindex = ill->ill_usesrc_ifindex; 20750 if (ifindex != 0) { 20751 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 20752 NULL); 20753 if (uill == NULL) 20754 return (NULL); 20755 mutex_enter(&uill->ill_lock); 20756 for (ipif = uill->ill_ipif; ipif != NULL; 20757 ipif = ipif->ipif_next) { 20758 if (!IPIF_CAN_LOOKUP(ipif)) 20759 continue; 20760 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20761 continue; 20762 if (!(ipif->ipif_flags & IPIF_UP)) 20763 continue; 20764 if (ipif->ipif_zoneid != zoneid) 20765 continue; 20766 if ((isv6 && 20767 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 20768 (ipif->ipif_lcl_addr == INADDR_ANY)) 20769 continue; 20770 mutex_exit(&uill->ill_lock); 20771 ill_refrele(uill); 20772 return (B_TRUE); 20773 } 20774 mutex_exit(&uill->ill_lock); 20775 ill_refrele(uill); 20776 } 20777 return (B_FALSE); 20778 } 20779 20780 /* 20781 * Determine the best source address given a destination address and an ill. 20782 * Prefers non-deprecated over deprecated but will return a deprecated 20783 * address if there is no other choice. If there is a usable source address 20784 * on the interface pointed to by ill_usesrc_ifindex then that is given 20785 * first preference. 20786 * 20787 * Returns NULL if there is no suitable source address for the ill. 20788 * This only occurs when there is no valid source address for the ill. 
20789 */ 20790 ipif_t * 20791 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 20792 { 20793 ipif_t *ipif; 20794 ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ 20795 ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; 20796 int index = 0; 20797 boolean_t wrapped = B_FALSE; 20798 boolean_t same_subnet_only = B_FALSE; 20799 boolean_t ipif_same_found, ipif_other_found; 20800 boolean_t specific_found; 20801 ill_t *till, *usill = NULL; 20802 tsol_tpc_t *src_rhtp, *dst_rhtp; 20803 20804 if (ill->ill_usesrc_ifindex != 0) { 20805 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, B_FALSE, 20806 NULL, NULL, NULL, NULL); 20807 if (usill != NULL) 20808 ill = usill; /* Select source from usesrc ILL */ 20809 else 20810 return (NULL); 20811 } 20812 20813 /* 20814 * If we're dealing with an unlabeled destination on a labeled system, 20815 * make sure that we ignore source addresses that are incompatible with 20816 * the destination's default label. That destination's default label 20817 * must dominate the minimum label on the source address. 20818 */ 20819 dst_rhtp = NULL; 20820 if (is_system_labeled()) { 20821 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 20822 if (dst_rhtp == NULL) 20823 return (NULL); 20824 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 20825 TPC_RELE(dst_rhtp); 20826 dst_rhtp = NULL; 20827 } 20828 } 20829 20830 /* 20831 * Holds the ill_g_lock as reader. This makes sure that no ipif/ill 20832 * can be deleted. But an ipif/ill can get CONDEMNED any time. 20833 * After selecting the right ipif, under ill_lock make sure ipif is 20834 * not condemned, and increment refcnt. If ipif is CONDEMNED, 20835 * we retry. Inside the loop we still need to check for CONDEMNED, 20836 * but not under a lock. 
20837 */ 20838 rw_enter(&ill_g_lock, RW_READER); 20839 20840 retry: 20841 till = ill; 20842 ipif_arr[0] = NULL; 20843 20844 if (till->ill_group != NULL) 20845 till = till->ill_group->illgrp_ill; 20846 20847 /* 20848 * Choose one good source address from each ill across the group. 20849 * If possible choose a source address in the same subnet as 20850 * the destination address. 20851 * 20852 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE 20853 * This is okay because of the following. 20854 * 20855 * If PHYI_FAILED is set and we still have non-deprecated 20856 * addresses, it means the addresses have not yet been 20857 * failed over to a different interface. We potentially 20858 * select them to create IRE_CACHES, which will be later 20859 * flushed when the addresses move over. 20860 * 20861 * If PHYI_INACTIVE is set and we still have non-deprecated 20862 * addresses, it means either the user has configured them 20863 * or PHYI_INACTIVE has not been cleared after the addresses 20864 * been moved over. For the former, in.mpathd does a failover 20865 * when the interface becomes INACTIVE and hence we should 20866 * not find them. Once INACTIVE is set, we don't allow them 20867 * to create logical interfaces anymore. For the latter, a 20868 * flush will happen when INACTIVE is cleared which will 20869 * flush the IRE_CACHES. 20870 * 20871 * If PHYI_OFFLINE is set, all the addresses will be failed 20872 * over soon. We potentially select them to create IRE_CACHEs, 20873 * which will be later flushed when the addresses move over. 20874 * 20875 * NOTE : As ipif_select_source is called to borrow source address 20876 * for an ipif that is part of a group, source address selection 20877 * will be re-done whenever the group changes i.e either an 20878 * insertion/deletion in the group. 20879 * 20880 * Fill ipif_arr[] with source addresses, using these rules: 20881 * 20882 * 1. 
At most one source address from a given ill ends up 20883 * in ipif_arr[] -- that is, at most one of the ipif's 20884 * associated with a given ill ends up in ipif_arr[]. 20885 * 20886 * 2. If there is at least one non-deprecated ipif in the 20887 * IPMP group with a source address on the same subnet as 20888 * our destination, then fill ipif_arr[] only with 20889 * source addresses on the same subnet as our destination. 20890 * Note that because of (1), only the first 20891 * non-deprecated ipif found with a source address 20892 * matching the destination ends up in ipif_arr[]. 20893 * 20894 * 3. Otherwise, fill ipif_arr[] with non-deprecated source 20895 * addresses not in the same subnet as our destination. 20896 * Again, because of (1), only the first off-subnet source 20897 * address will be chosen. 20898 * 20899 * 4. If there are no non-deprecated ipifs, then just use 20900 * the source address associated with the last deprecated 20901 * one we find that happens to be on the same subnet, 20902 * otherwise the first one not in the same subnet. 20903 */ 20904 specific_found = B_FALSE; 20905 for (; till != NULL; till = till->ill_group_next) { 20906 ipif_same_found = B_FALSE; 20907 ipif_other_found = B_FALSE; 20908 for (ipif = till->ill_ipif; ipif != NULL; 20909 ipif = ipif->ipif_next) { 20910 if (!IPIF_CAN_LOOKUP(ipif)) 20911 continue; 20912 /* Always skip NOLOCAL and ANYCAST interfaces */ 20913 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20914 continue; 20915 if (!(ipif->ipif_flags & IPIF_UP) || 20916 !ipif->ipif_addr_ready) 20917 continue; 20918 if (ipif->ipif_zoneid != zoneid && 20919 ipif->ipif_zoneid != ALL_ZONES) 20920 continue; 20921 /* 20922 * Interfaces with 0.0.0.0 address are allowed to be UP, 20923 * but are not valid as source addresses. 20924 */ 20925 if (ipif->ipif_lcl_addr == INADDR_ANY) 20926 continue; 20927 20928 /* 20929 * Check compatibility of local address for 20930 * destination's default label if we're on a labeled 20931 * system. 
Incompatible addresses can't be used at 20932 * all. 20933 */ 20934 if (dst_rhtp != NULL) { 20935 boolean_t incompat; 20936 20937 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 20938 IPV4_VERSION, B_FALSE); 20939 if (src_rhtp == NULL) 20940 continue; 20941 incompat = 20942 src_rhtp->tpc_tp.host_type != SUN_CIPSO || 20943 src_rhtp->tpc_tp.tp_doi != 20944 dst_rhtp->tpc_tp.tp_doi || 20945 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 20946 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 20947 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 20948 src_rhtp->tpc_tp.tp_sl_set_cipso)); 20949 TPC_RELE(src_rhtp); 20950 if (incompat) 20951 continue; 20952 } 20953 20954 /* 20955 * We prefer not to use all all-zones addresses, if we 20956 * can avoid it, as they pose problems with unlabeled 20957 * destinations. 20958 */ 20959 if (ipif->ipif_zoneid != ALL_ZONES) { 20960 if (!specific_found && 20961 (!same_subnet_only || 20962 (ipif->ipif_net_mask & dst) == 20963 ipif->ipif_subnet)) { 20964 index = 0; 20965 specific_found = B_TRUE; 20966 ipif_other_found = B_FALSE; 20967 } 20968 } else { 20969 if (specific_found) 20970 continue; 20971 } 20972 if (ipif->ipif_flags & IPIF_DEPRECATED) { 20973 if (ipif_dep == NULL || 20974 (ipif->ipif_net_mask & dst) == 20975 ipif->ipif_subnet) 20976 ipif_dep = ipif; 20977 continue; 20978 } 20979 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 20980 /* found a source address in the same subnet */ 20981 if (!same_subnet_only) { 20982 same_subnet_only = B_TRUE; 20983 index = 0; 20984 } 20985 ipif_same_found = B_TRUE; 20986 } else { 20987 if (same_subnet_only || ipif_other_found) 20988 continue; 20989 ipif_other_found = B_TRUE; 20990 } 20991 ipif_arr[index++] = ipif; 20992 if (index == MAX_IPIF_SELECT_SOURCE) { 20993 wrapped = B_TRUE; 20994 index = 0; 20995 } 20996 if (ipif_same_found) 20997 break; 20998 } 20999 } 21000 21001 if (ipif_arr[0] == NULL) { 21002 ipif = ipif_dep; 21003 } else { 21004 if (wrapped) 21005 index = MAX_IPIF_SELECT_SOURCE; 21006 ipif = 
ipif_arr[ipif_rand() % index]; 21007 ASSERT(ipif != NULL); 21008 } 21009 21010 if (ipif != NULL) { 21011 mutex_enter(&ipif->ipif_ill->ill_lock); 21012 if (!IPIF_CAN_LOOKUP(ipif)) { 21013 mutex_exit(&ipif->ipif_ill->ill_lock); 21014 goto retry; 21015 } 21016 ipif_refhold_locked(ipif); 21017 mutex_exit(&ipif->ipif_ill->ill_lock); 21018 } 21019 21020 rw_exit(&ill_g_lock); 21021 if (usill != NULL) 21022 ill_refrele(usill); 21023 if (dst_rhtp != NULL) 21024 TPC_RELE(dst_rhtp); 21025 21026 #ifdef DEBUG 21027 if (ipif == NULL) { 21028 char buf1[INET6_ADDRSTRLEN]; 21029 21030 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 21031 ill->ill_name, 21032 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 21033 } else { 21034 char buf1[INET6_ADDRSTRLEN]; 21035 char buf2[INET6_ADDRSTRLEN]; 21036 21037 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 21038 ipif->ipif_ill->ill_name, 21039 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 21040 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 21041 buf2, sizeof (buf2)))); 21042 } 21043 #endif /* DEBUG */ 21044 return (ipif); 21045 } 21046 21047 21048 /* 21049 * If old_ipif is not NULL, see if ipif was derived from old 21050 * ipif and if so, recreate the interface route by re-doing 21051 * source address selection. This happens when ipif_down -> 21052 * ipif_update_other_ipifs calls us. 21053 * 21054 * If old_ipif is NULL, just redo the source address selection 21055 * if needed. This happens when illgrp_insert or ipif_up_done 21056 * calls us. 
21057 */ 21058 static void 21059 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 21060 { 21061 ire_t *ire; 21062 ire_t *ipif_ire; 21063 queue_t *stq; 21064 ipif_t *nipif; 21065 ill_t *ill; 21066 boolean_t need_rele = B_FALSE; 21067 21068 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 21069 ASSERT(IAM_WRITER_IPIF(ipif)); 21070 21071 ill = ipif->ipif_ill; 21072 if (!(ipif->ipif_flags & 21073 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 21074 /* 21075 * Can't possibly have borrowed the source 21076 * from old_ipif. 21077 */ 21078 return; 21079 } 21080 21081 /* 21082 * Is there any work to be done? No work if the address 21083 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 21084 * ipif_select_source() does not borrow addresses from 21085 * NOLOCAL and ANYCAST interfaces). 21086 */ 21087 if ((old_ipif != NULL) && 21088 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 21089 (old_ipif->ipif_ill->ill_wq == NULL) || 21090 (old_ipif->ipif_flags & 21091 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 21092 return; 21093 } 21094 21095 /* 21096 * Perform the same checks as when creating the 21097 * IRE_INTERFACE in ipif_up_done. 21098 */ 21099 if (!(ipif->ipif_flags & IPIF_UP)) 21100 return; 21101 21102 if ((ipif->ipif_flags & IPIF_NOXMIT) || 21103 (ipif->ipif_subnet == INADDR_ANY)) 21104 return; 21105 21106 ipif_ire = ipif_to_ire(ipif); 21107 if (ipif_ire == NULL) 21108 return; 21109 21110 /* 21111 * We know that ipif uses some other source for its 21112 * IRE_INTERFACE. Is it using the source of this 21113 * old_ipif? 21114 */ 21115 if (old_ipif != NULL && 21116 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 21117 ire_refrele(ipif_ire); 21118 return; 21119 } 21120 if (ip_debug > 2) { 21121 /* ip1dbg */ 21122 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 21123 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 21124 } 21125 21126 stq = ipif_ire->ire_stq; 21127 21128 /* 21129 * Can't use our source address. 
Select a different 21130 * source address for the IRE_INTERFACE. 21131 */ 21132 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 21133 if (nipif == NULL) { 21134 /* Last resort - all ipif's have IPIF_NOLOCAL */ 21135 nipif = ipif; 21136 } else { 21137 need_rele = B_TRUE; 21138 } 21139 21140 ire = ire_create( 21141 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 21142 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 21143 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 21144 NULL, /* no gateway */ 21145 NULL, 21146 &ipif->ipif_mtu, /* max frag */ 21147 NULL, /* fast path header */ 21148 NULL, /* no recv from queue */ 21149 stq, /* send-to queue */ 21150 ill->ill_net_type, /* IF_[NO]RESOLVER */ 21151 ill->ill_resolver_mp, /* xmit header */ 21152 ipif, 21153 NULL, 21154 0, 21155 0, 21156 0, 21157 0, 21158 &ire_uinfo_null, 21159 NULL, 21160 NULL); 21161 21162 if (ire != NULL) { 21163 ire_t *ret_ire; 21164 int error; 21165 21166 /* 21167 * We don't need ipif_ire anymore. We need to delete 21168 * before we add so that ire_add does not detect 21169 * duplicates. 21170 */ 21171 ire_delete(ipif_ire); 21172 ret_ire = ire; 21173 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); 21174 ASSERT(error == 0); 21175 ASSERT(ire == ret_ire); 21176 /* Held in ire_add */ 21177 ire_refrele(ret_ire); 21178 } 21179 /* 21180 * Either we are falling through from above or could not 21181 * allocate a replacement. 21182 */ 21183 ire_refrele(ipif_ire); 21184 if (need_rele) 21185 ipif_refrele(nipif); 21186 } 21187 21188 /* 21189 * This old_ipif is going away. 21190 * 21191 * Determine if any other ipif's is using our address as 21192 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 21193 * IPIF_DEPRECATED). 21194 * Find the IRE_INTERFACE for such ipifs and recreate them 21195 * to use an different source address following the rules in 21196 * ipif_up_done. 
21197 * 21198 * This function takes an illgrp as an argument so that illgrp_delete 21199 * can call this to update source address even after deleting the 21200 * old_ipif->ipif_ill from the ill group. 21201 */ 21202 static void 21203 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) 21204 { 21205 ipif_t *ipif; 21206 ill_t *ill; 21207 char buf[INET6_ADDRSTRLEN]; 21208 21209 ASSERT(IAM_WRITER_IPIF(old_ipif)); 21210 ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); 21211 21212 ill = old_ipif->ipif_ill; 21213 21214 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", 21215 ill->ill_name, 21216 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, 21217 buf, sizeof (buf)))); 21218 /* 21219 * If this part of a group, look at all ills as ipif_select_source 21220 * borrows source address across all the ills in the group. 21221 */ 21222 if (illgrp != NULL) 21223 ill = illgrp->illgrp_ill; 21224 21225 for (; ill != NULL; ill = ill->ill_group_next) { 21226 for (ipif = ill->ill_ipif; ipif != NULL; 21227 ipif = ipif->ipif_next) { 21228 21229 if (ipif == old_ipif) 21230 continue; 21231 21232 ipif_recreate_interface_routes(old_ipif, ipif); 21233 } 21234 } 21235 } 21236 21237 /* ARGSUSED */ 21238 int 21239 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21240 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21241 { 21242 /* 21243 * ill_phyint_reinit merged the v4 and v6 into a single 21244 * ipsq. Could also have become part of a ipmp group in the 21245 * process, and we might not have been able to complete the 21246 * operation in ipif_set_values, if we could not become 21247 * exclusive. If so restart it here. 
21248 */ 21249 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21250 } 21251 21252 21253 /* ARGSUSED */ 21254 int 21255 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21256 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21257 { 21258 queue_t *q1 = q; 21259 char *cp; 21260 char interf_name[LIFNAMSIZ]; 21261 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 21262 21263 if (!q->q_next) { 21264 ip1dbg(( 21265 "if_unitsel: IF_UNITSEL: no q_next\n")); 21266 return (EINVAL); 21267 } 21268 21269 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 21270 return (EALREADY); 21271 21272 do { 21273 q1 = q1->q_next; 21274 } while (q1->q_next); 21275 cp = q1->q_qinfo->qi_minfo->mi_idname; 21276 (void) sprintf(interf_name, "%s%d", cp, ppa); 21277 21278 /* 21279 * Here we are not going to delay the ioack until after 21280 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 21281 * original ioctl message before sending the requests. 21282 */ 21283 return (ipif_set_values(q, mp, interf_name, &ppa)); 21284 } 21285 21286 /* ARGSUSED */ 21287 int 21288 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21289 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21290 { 21291 return (ENXIO); 21292 } 21293 21294 /* 21295 * Net and subnet broadcast ire's are now specific to the particular 21296 * physical interface (ill) and not to any one locigal interface (ipif). 21297 * However, if a particular logical interface is being taken down, it's 21298 * associated ire's will be taken down as well. Hence, when we go to 21299 * take down or change the local address, broadcast address or netmask 21300 * of a specific logical interface, we must check to make sure that we 21301 * have valid net and subnet broadcast ire's for the other logical 21302 * interfaces which may have been shared with the logical interface 21303 * being brought down or changed. 21304 * 21305 * There is one set of 0.0.0.0 and 255.255.255.255 per ill. 
Usually it 21306 * is tied to the first interface coming UP. If that ipif is going down, 21307 * we need to recreate them on the next valid ipif. 21308 * 21309 * Note: assume that the ipif passed in is still up so that it's IRE 21310 * entries are still valid. 21311 */ 21312 static void 21313 ipif_check_bcast_ires(ipif_t *test_ipif) 21314 { 21315 ipif_t *ipif; 21316 ire_t *test_subnet_ire, *test_net_ire; 21317 ire_t *test_allzero_ire, *test_allone_ire; 21318 ire_t *ire_array[12]; 21319 ire_t **irep = &ire_array[0]; 21320 ire_t **irep1; 21321 21322 ipaddr_t net_addr, subnet_addr, net_mask, subnet_mask; 21323 ipaddr_t test_net_addr, test_subnet_addr; 21324 ipaddr_t test_net_mask, test_subnet_mask; 21325 boolean_t need_net_bcast_ire = B_FALSE; 21326 boolean_t need_subnet_bcast_ire = B_FALSE; 21327 boolean_t allzero_bcast_ire_created = B_FALSE; 21328 boolean_t allone_bcast_ire_created = B_FALSE; 21329 boolean_t net_bcast_ire_created = B_FALSE; 21330 boolean_t subnet_bcast_ire_created = B_FALSE; 21331 21332 ipif_t *backup_ipif_net = (ipif_t *)NULL; 21333 ipif_t *backup_ipif_subnet = (ipif_t *)NULL; 21334 ipif_t *backup_ipif_allzeros = (ipif_t *)NULL; 21335 ipif_t *backup_ipif_allones = (ipif_t *)NULL; 21336 uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; 21337 21338 ASSERT(!test_ipif->ipif_isv6); 21339 ASSERT(IAM_WRITER_IPIF(test_ipif)); 21340 21341 /* 21342 * No broadcast IREs for the LOOPBACK interface 21343 * or others such as point to point and IPIF_NOXMIT. 
21344 */ 21345 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || 21346 (test_ipif->ipif_flags & IPIF_NOXMIT)) 21347 return; 21348 21349 test_allzero_ire = ire_ctable_lookup(0, 0, IRE_BROADCAST, 21350 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 21351 21352 test_allone_ire = ire_ctable_lookup(INADDR_BROADCAST, 0, IRE_BROADCAST, 21353 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 21354 21355 test_net_mask = ip_net_mask(test_ipif->ipif_subnet); 21356 test_subnet_mask = test_ipif->ipif_net_mask; 21357 21358 /* 21359 * If no net mask set, assume the default based on net class. 21360 */ 21361 if (test_subnet_mask == 0) 21362 test_subnet_mask = test_net_mask; 21363 21364 /* 21365 * Check if there is a network broadcast ire associated with this ipif 21366 */ 21367 test_net_addr = test_net_mask & test_ipif->ipif_subnet; 21368 test_net_ire = ire_ctable_lookup(test_net_addr, 0, IRE_BROADCAST, 21369 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 21370 21371 /* 21372 * Check if there is a subnet broadcast IRE associated with this ipif 21373 */ 21374 test_subnet_addr = test_subnet_mask & test_ipif->ipif_subnet; 21375 test_subnet_ire = ire_ctable_lookup(test_subnet_addr, 0, IRE_BROADCAST, 21376 test_ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF)); 21377 21378 /* 21379 * No broadcast ire's associated with this ipif. 21380 */ 21381 if ((test_subnet_ire == NULL) && (test_net_ire == NULL) && 21382 (test_allzero_ire == NULL) && (test_allone_ire == NULL)) { 21383 return; 21384 } 21385 21386 /* 21387 * We have established which bcast ires have to be replaced. 21388 * Next we try to locate ipifs that match there ires. 21389 * The rules are simple: If we find an ipif that matches on the subnet 21390 * address it will also match on the net address, the allzeros and 21391 * allones address. Any ipif that matches only on the net address will 21392 * also match the allzeros and allones addresses. 
21393 * The other criterion is the ipif_flags. We look for non-deprecated 21394 * (and non-anycast and non-nolocal) ipifs as the best choice. 21395 * ipifs with check_flags matching (deprecated, etc) are used only 21396 * if good ipifs are not available. While looping, we save existing 21397 * deprecated ipifs as backup_ipif. 21398 * We loop through all the ipifs for this ill looking for ipifs 21399 * whose broadcast addr match the ipif passed in, but do not have 21400 * their own broadcast ires. For creating 0.0.0.0 and 21401 * 255.255.255.255 we just need an ipif on this ill to create. 21402 */ 21403 for (ipif = test_ipif->ipif_ill->ill_ipif; ipif != NULL; 21404 ipif = ipif->ipif_next) { 21405 21406 ASSERT(!ipif->ipif_isv6); 21407 /* 21408 * Already checked the ipif passed in. 21409 */ 21410 if (ipif == test_ipif) { 21411 continue; 21412 } 21413 21414 /* 21415 * We only need to recreate broadcast ires if another ipif in 21416 * the same zone uses them. The new ires must be created in the 21417 * same zone. 21418 */ 21419 if (ipif->ipif_zoneid != test_ipif->ipif_zoneid) { 21420 continue; 21421 } 21422 21423 /* 21424 * Only interested in logical interfaces with valid local 21425 * addresses or with the ability to broadcast. 21426 */ 21427 if ((ipif->ipif_subnet == 0) || 21428 !(ipif->ipif_flags & IPIF_BROADCAST) || 21429 (ipif->ipif_flags & IPIF_NOXMIT) || 21430 !(ipif->ipif_flags & IPIF_UP)) { 21431 continue; 21432 } 21433 /* 21434 * Check if there is a net broadcast ire for this 21435 * net address. If it turns out that the ipif we are 21436 * about to take down owns this ire, we must make a 21437 * new one because it is potentially going away. 21438 */ 21439 if (test_net_ire && (!net_bcast_ire_created)) { 21440 net_mask = ip_net_mask(ipif->ipif_subnet); 21441 net_addr = net_mask & ipif->ipif_subnet; 21442 if (net_addr == test_net_addr) { 21443 need_net_bcast_ire = B_TRUE; 21444 /* 21445 * Use DEPRECATED ipif only if no good 21446 * ires are available. 
subnet_addr is 21447 * a better match than net_addr. 21448 */ 21449 if ((ipif->ipif_flags & check_flags) && 21450 (backup_ipif_net == NULL)) { 21451 backup_ipif_net = ipif; 21452 } 21453 } 21454 } 21455 /* 21456 * Check if there is a subnet broadcast ire for this 21457 * net address. If it turns out that the ipif we are 21458 * about to take down owns this ire, we must make a 21459 * new one because it is potentially going away. 21460 */ 21461 if (test_subnet_ire && (!subnet_bcast_ire_created)) { 21462 subnet_mask = ipif->ipif_net_mask; 21463 subnet_addr = ipif->ipif_subnet; 21464 if (subnet_addr == test_subnet_addr) { 21465 need_subnet_bcast_ire = B_TRUE; 21466 if ((ipif->ipif_flags & check_flags) && 21467 (backup_ipif_subnet == NULL)) { 21468 backup_ipif_subnet = ipif; 21469 } 21470 } 21471 } 21472 21473 21474 /* Short circuit here if this ipif is deprecated */ 21475 if (ipif->ipif_flags & check_flags) { 21476 if ((test_allzero_ire != NULL) && 21477 (!allzero_bcast_ire_created) && 21478 (backup_ipif_allzeros == NULL)) { 21479 backup_ipif_allzeros = ipif; 21480 } 21481 if ((test_allone_ire != NULL) && 21482 (!allone_bcast_ire_created) && 21483 (backup_ipif_allones == NULL)) { 21484 backup_ipif_allones = ipif; 21485 } 21486 continue; 21487 } 21488 21489 /* 21490 * Found an ipif which has the same broadcast ire as the 21491 * ipif passed in and the ipif passed in "owns" the ire. 21492 * Create new broadcast ire's for this broadcast addr. 
21493 */ 21494 if (need_net_bcast_ire && !net_bcast_ire_created) { 21495 irep = ire_create_bcast(ipif, net_addr, irep); 21496 irep = ire_create_bcast(ipif, 21497 ~net_mask | net_addr, irep); 21498 net_bcast_ire_created = B_TRUE; 21499 } 21500 if (need_subnet_bcast_ire && !subnet_bcast_ire_created) { 21501 irep = ire_create_bcast(ipif, subnet_addr, irep); 21502 irep = ire_create_bcast(ipif, 21503 ~subnet_mask | subnet_addr, irep); 21504 subnet_bcast_ire_created = B_TRUE; 21505 } 21506 if (test_allzero_ire != NULL && !allzero_bcast_ire_created) { 21507 irep = ire_create_bcast(ipif, 0, irep); 21508 allzero_bcast_ire_created = B_TRUE; 21509 } 21510 if (test_allone_ire != NULL && !allone_bcast_ire_created) { 21511 irep = ire_create_bcast(ipif, INADDR_BROADCAST, irep); 21512 allone_bcast_ire_created = B_TRUE; 21513 } 21514 /* 21515 * Once we have created all the appropriate ires, we 21516 * just break out of this loop to add what we have created. 21517 * This has been indented similar to ire_match_args for 21518 * readability. 21519 */ 21520 if (((test_net_ire == NULL) || 21521 (net_bcast_ire_created)) && 21522 ((test_subnet_ire == NULL) || 21523 (subnet_bcast_ire_created)) && 21524 ((test_allzero_ire == NULL) || 21525 (allzero_bcast_ire_created)) && 21526 ((test_allone_ire == NULL) || 21527 (allone_bcast_ire_created))) { 21528 break; 21529 } 21530 } 21531 21532 /* 21533 * Create bcast ires on deprecated ipifs if no non-deprecated ipifs 21534 * exist. 6 pairs of bcast ires are needed. 21535 * Note - the old ires are deleted in ipif_down. 
21536 */ 21537 if (need_net_bcast_ire && !net_bcast_ire_created && backup_ipif_net) { 21538 ipif = backup_ipif_net; 21539 irep = ire_create_bcast(ipif, net_addr, irep); 21540 irep = ire_create_bcast(ipif, ~net_mask | net_addr, irep); 21541 net_bcast_ire_created = B_TRUE; 21542 } 21543 if (need_subnet_bcast_ire && !subnet_bcast_ire_created && 21544 backup_ipif_subnet) { 21545 ipif = backup_ipif_subnet; 21546 irep = ire_create_bcast(ipif, subnet_addr, irep); 21547 irep = ire_create_bcast(ipif, 21548 ~subnet_mask | subnet_addr, irep); 21549 subnet_bcast_ire_created = B_TRUE; 21550 } 21551 if (test_allzero_ire != NULL && !allzero_bcast_ire_created && 21552 backup_ipif_allzeros) { 21553 irep = ire_create_bcast(backup_ipif_allzeros, 0, irep); 21554 allzero_bcast_ire_created = B_TRUE; 21555 } 21556 if (test_allone_ire != NULL && !allone_bcast_ire_created && 21557 backup_ipif_allones) { 21558 irep = ire_create_bcast(backup_ipif_allones, 21559 INADDR_BROADCAST, irep); 21560 allone_bcast_ire_created = B_TRUE; 21561 } 21562 21563 /* 21564 * If we can't create all of them, don't add any of them. 21565 * Code in ip_wput_ire and ire_to_ill assumes that we 21566 * always have a non-loopback copy and loopback copy 21567 * for a given address. 
21568 */ 21569 for (irep1 = irep; irep1 > ire_array; ) { 21570 irep1--; 21571 if (*irep1 == NULL) { 21572 ip0dbg(("ipif_check_bcast_ires: can't create " 21573 "IRE_BROADCAST, memory allocation failure\n")); 21574 while (irep > ire_array) { 21575 irep--; 21576 if (*irep != NULL) 21577 ire_delete(*irep); 21578 } 21579 goto bad; 21580 } 21581 } 21582 for (irep1 = irep; irep1 > ire_array; ) { 21583 int error; 21584 21585 irep1--; 21586 error = ire_add(irep1, NULL, NULL, NULL, B_FALSE); 21587 if (error == 0) { 21588 ire_refrele(*irep1); /* Held in ire_add */ 21589 } 21590 } 21591 bad: 21592 if (test_allzero_ire != NULL) 21593 ire_refrele(test_allzero_ire); 21594 if (test_allone_ire != NULL) 21595 ire_refrele(test_allone_ire); 21596 if (test_net_ire != NULL) 21597 ire_refrele(test_net_ire); 21598 if (test_subnet_ire != NULL) 21599 ire_refrele(test_subnet_ire); 21600 } 21601 21602 /* 21603 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 21604 * from lifr_flags and the name from lifr_name. 21605 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 21606 * since ipif_lookup_on_name uses the _isv6 flags when matching. 21607 * Returns EINPROGRESS when mp has been consumed by queueing it on 21608 * ill_pending_mp and the ioctl will complete in ip_rput. 21609 */ 21610 /* ARGSUSED */ 21611 int 21612 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21613 ip_ioctl_cmd_t *ipip, void *if_req) 21614 { 21615 int err; 21616 ill_t *ill; 21617 struct lifreq *lifr = (struct lifreq *)if_req; 21618 21619 ASSERT(ipif != NULL); 21620 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 21621 ASSERT(q->q_next != NULL); 21622 21623 ill = (ill_t *)q->q_ptr; 21624 /* 21625 * If we are not writer on 'q' then this interface exists already 21626 * and previous lookups (ipif_extract_lifreq_cmn) found this ipif. 
21627 * So return EALREADY 21628 */ 21629 if (ill != ipif->ipif_ill) 21630 return (EALREADY); 21631 21632 if (ill->ill_name[0] != '\0') 21633 return (EALREADY); 21634 21635 /* 21636 * Set all the flags. Allows all kinds of override. Provide some 21637 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST 21638 * unless there is either multicast/broadcast support in the driver 21639 * or it is a pt-pt link. 21640 */ 21641 if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) { 21642 /* Meaningless to IP thus don't allow them to be set. */ 21643 ip1dbg(("ip_setname: EINVAL 1\n")); 21644 return (EINVAL); 21645 } 21646 /* 21647 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the 21648 * ill_bcast_addr_length info. 21649 */ 21650 if (!ill->ill_needs_attach && 21651 ((lifr->lifr_flags & IFF_MULTICAST) && 21652 !(lifr->lifr_flags & IFF_POINTOPOINT) && 21653 ill->ill_bcast_addr_length == 0)) { 21654 /* Link not broadcast/pt-pt capable i.e. no multicast */ 21655 ip1dbg(("ip_setname: EINVAL 2\n")); 21656 return (EINVAL); 21657 } 21658 if ((lifr->lifr_flags & IFF_BROADCAST) && 21659 ((lifr->lifr_flags & IFF_IPV6) || 21660 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 21661 /* Link not broadcast capable or IPv6 i.e. no broadcast */ 21662 ip1dbg(("ip_setname: EINVAL 3\n")); 21663 return (EINVAL); 21664 } 21665 if (lifr->lifr_flags & IFF_UP) { 21666 /* Can only be set with SIOCSLIFFLAGS */ 21667 ip1dbg(("ip_setname: EINVAL 4\n")); 21668 return (EINVAL); 21669 } 21670 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 && 21671 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) { 21672 ip1dbg(("ip_setname: EINVAL 5\n")); 21673 return (EINVAL); 21674 } 21675 /* 21676 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 
21677 */ 21678 if ((lifr->lifr_flags & IFF_XRESOLV) && 21679 !(lifr->lifr_flags & IFF_IPV6) && 21680 !(ipif->ipif_isv6)) { 21681 ip1dbg(("ip_setname: EINVAL 6\n")); 21682 return (EINVAL); 21683 } 21684 21685 /* 21686 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence 21687 * we have all the flags here. So, we assign rather than we OR. 21688 * We can't OR the flags here because we don't want to set 21689 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in 21690 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending 21691 * on lifr_flags value here. 21692 */ 21693 /* 21694 * This ill has not been inserted into the global list. 21695 * So we are still single threaded and don't need any lock 21696 */ 21697 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS & 21698 ~IFF_DUPLICATE; 21699 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; 21700 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; 21701 21702 /* We started off as V4. */ 21703 if (ill->ill_flags & ILLF_IPV6) { 21704 ill->ill_phyint->phyint_illv6 = ill; 21705 ill->ill_phyint->phyint_illv4 = NULL; 21706 } 21707 err = ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa); 21708 return (err); 21709 } 21710 21711 /* ARGSUSED */ 21712 int 21713 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21714 ip_ioctl_cmd_t *ipip, void *if_req) 21715 { 21716 /* 21717 * ill_phyint_reinit merged the v4 and v6 into a single 21718 * ipsq. Could also have become part of a ipmp group in the 21719 * process, and we might not have been able to complete the 21720 * slifname in ipif_set_values, if we could not become 21721 * exclusive. If so restart it here 21722 */ 21723 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21724 } 21725 21726 /* 21727 * Return a pointer to the ipif which matches the index, IP version type and 21728 * zoneid. 
21729 */ 21730 ipif_t * 21731 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 21732 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 21733 { 21734 ill_t *ill; 21735 ipsq_t *ipsq; 21736 phyint_t *phyi; 21737 ipif_t *ipif; 21738 21739 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 21740 (q != NULL && mp != NULL && func != NULL && err != NULL)); 21741 21742 if (err != NULL) 21743 *err = 0; 21744 21745 /* 21746 * Indexes are stored in the phyint - a common structure 21747 * to both IPv4 and IPv6. 21748 */ 21749 21750 rw_enter(&ill_g_lock, RW_READER); 21751 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_index, 21752 (void *) &index, NULL); 21753 if (phyi != NULL) { 21754 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4; 21755 if (ill == NULL) { 21756 rw_exit(&ill_g_lock); 21757 if (err != NULL) 21758 *err = ENXIO; 21759 return (NULL); 21760 } 21761 GRAB_CONN_LOCK(q); 21762 mutex_enter(&ill->ill_lock); 21763 if (ILL_CAN_LOOKUP(ill)) { 21764 for (ipif = ill->ill_ipif; ipif != NULL; 21765 ipif = ipif->ipif_next) { 21766 if (IPIF_CAN_LOOKUP(ipif) && 21767 (zoneid == ALL_ZONES || 21768 zoneid == ipif->ipif_zoneid || 21769 ipif->ipif_zoneid == ALL_ZONES)) { 21770 ipif_refhold_locked(ipif); 21771 mutex_exit(&ill->ill_lock); 21772 RELEASE_CONN_LOCK(q); 21773 rw_exit(&ill_g_lock); 21774 return (ipif); 21775 } 21776 } 21777 } else if (ILL_CAN_WAIT(ill, q)) { 21778 ipsq = ill->ill_phyint->phyint_ipsq; 21779 mutex_enter(&ipsq->ipsq_lock); 21780 rw_exit(&ill_g_lock); 21781 mutex_exit(&ill->ill_lock); 21782 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 21783 mutex_exit(&ipsq->ipsq_lock); 21784 RELEASE_CONN_LOCK(q); 21785 *err = EINPROGRESS; 21786 return (NULL); 21787 } 21788 mutex_exit(&ill->ill_lock); 21789 RELEASE_CONN_LOCK(q); 21790 } 21791 rw_exit(&ill_g_lock); 21792 if (err != NULL) 21793 *err = ENXIO; 21794 return (NULL); 21795 } 21796 21797 typedef struct conn_change_s { 21798 uint_t cc_old_ifindex; 21799 uint_t cc_new_ifindex; 
21800 } conn_change_t; 21801 21802 /* 21803 * ipcl_walk function for changing interface index. 21804 */ 21805 static void 21806 conn_change_ifindex(conn_t *connp, caddr_t arg) 21807 { 21808 conn_change_t *connc; 21809 uint_t old_ifindex; 21810 uint_t new_ifindex; 21811 int i; 21812 ilg_t *ilg; 21813 21814 connc = (conn_change_t *)arg; 21815 old_ifindex = connc->cc_old_ifindex; 21816 new_ifindex = connc->cc_new_ifindex; 21817 21818 if (connp->conn_orig_bound_ifindex == old_ifindex) 21819 connp->conn_orig_bound_ifindex = new_ifindex; 21820 21821 if (connp->conn_orig_multicast_ifindex == old_ifindex) 21822 connp->conn_orig_multicast_ifindex = new_ifindex; 21823 21824 if (connp->conn_orig_xmit_ifindex == old_ifindex) 21825 connp->conn_orig_xmit_ifindex = new_ifindex; 21826 21827 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 21828 ilg = &connp->conn_ilg[i]; 21829 if (ilg->ilg_orig_ifindex == old_ifindex) 21830 ilg->ilg_orig_ifindex = new_ifindex; 21831 } 21832 } 21833 21834 /* 21835 * Walk all the ipifs and ilms on this ill and change the orig_ifindex 21836 * to new_index if it matches the old_index. 21837 * 21838 * Failovers typically happen within a group of ills. But somebody 21839 * can remove an ill from the group after a failover happened. If 21840 * we are setting the ifindex after this, we potentially need to 21841 * look at all the ills rather than just the ones in the group. 21842 * We cut down the work by looking at matching ill_net_types 21843 * and ill_types as we could not possibly grouped them together. 
21844 */ 21845 static void 21846 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 21847 { 21848 ill_t *ill; 21849 ipif_t *ipif; 21850 uint_t old_ifindex; 21851 uint_t new_ifindex; 21852 ilm_t *ilm; 21853 ill_walk_context_t ctx; 21854 21855 old_ifindex = connc->cc_old_ifindex; 21856 new_ifindex = connc->cc_new_ifindex; 21857 21858 rw_enter(&ill_g_lock, RW_READER); 21859 ill = ILL_START_WALK_ALL(&ctx); 21860 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 21861 if ((ill_orig->ill_net_type != ill->ill_net_type) || 21862 (ill_orig->ill_type != ill->ill_type)) { 21863 continue; 21864 } 21865 for (ipif = ill->ill_ipif; ipif != NULL; 21866 ipif = ipif->ipif_next) { 21867 if (ipif->ipif_orig_ifindex == old_ifindex) 21868 ipif->ipif_orig_ifindex = new_ifindex; 21869 } 21870 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 21871 if (ilm->ilm_orig_ifindex == old_ifindex) 21872 ilm->ilm_orig_ifindex = new_ifindex; 21873 } 21874 } 21875 rw_exit(&ill_g_lock); 21876 } 21877 21878 /* 21879 * We first need to ensure that the new index is unique, and 21880 * then carry the change across both v4 and v6 ill representation 21881 * of the physical interface. 21882 */ 21883 /* ARGSUSED */ 21884 int 21885 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21886 ip_ioctl_cmd_t *ipip, void *ifreq) 21887 { 21888 ill_t *ill; 21889 ill_t *ill_other; 21890 phyint_t *phyi; 21891 int old_index; 21892 conn_change_t connc; 21893 struct ifreq *ifr = (struct ifreq *)ifreq; 21894 struct lifreq *lifr = (struct lifreq *)ifreq; 21895 uint_t index; 21896 ill_t *ill_v4; 21897 ill_t *ill_v6; 21898 21899 if (ipip->ipi_cmd_type == IF_CMD) 21900 index = ifr->ifr_index; 21901 else 21902 index = lifr->lifr_index; 21903 21904 /* 21905 * Only allow on physical interface. Also, index zero is illegal. 
21906 * 21907 * Need to check for PHYI_FAILED and PHYI_INACTIVE 21908 * 21909 * 1) If PHYI_FAILED is set, a failover could have happened which 21910 * implies a possible failback might have to happen. As failback 21911 * depends on the old index, we should fail setting the index. 21912 * 21913 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 21914 * any addresses or multicast memberships are failed over to 21915 * a non-STANDBY interface. As failback depends on the old 21916 * index, we should fail setting the index for this case also. 21917 * 21918 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 21919 * Be consistent with PHYI_FAILED and fail the ioctl. 21920 */ 21921 ill = ipif->ipif_ill; 21922 phyi = ill->ill_phyint; 21923 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 21924 ipif->ipif_id != 0 || index == 0) { 21925 return (EINVAL); 21926 } 21927 old_index = phyi->phyint_ifindex; 21928 21929 /* If the index is not changing, no work to do */ 21930 if (old_index == index) 21931 return (0); 21932 21933 /* 21934 * Use ill_lookup_on_ifindex to determine if the 21935 * new index is unused and if so allow the change. 21936 */ 21937 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL); 21938 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL); 21939 if (ill_v6 != NULL || ill_v4 != NULL) { 21940 if (ill_v4 != NULL) 21941 ill_refrele(ill_v4); 21942 if (ill_v6 != NULL) 21943 ill_refrele(ill_v6); 21944 return (EBUSY); 21945 } 21946 21947 /* 21948 * The new index is unused. Set it in the phyint. 21949 * Locate the other ill so that we can send a routing 21950 * sockets message. 
21951 */ 21952 if (ill->ill_isv6) { 21953 ill_other = phyi->phyint_illv4; 21954 } else { 21955 ill_other = phyi->phyint_illv6; 21956 } 21957 21958 phyi->phyint_ifindex = index; 21959 21960 connc.cc_old_ifindex = old_index; 21961 connc.cc_new_ifindex = index; 21962 ip_change_ifindex(ill, &connc); 21963 ipcl_walk(conn_change_ifindex, (caddr_t)&connc); 21964 21965 /* Send the routing sockets message */ 21966 ip_rts_ifmsg(ipif); 21967 if (ill_other != NULL) 21968 ip_rts_ifmsg(ill_other->ill_ipif); 21969 21970 return (0); 21971 } 21972 21973 /* ARGSUSED */ 21974 int 21975 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21976 ip_ioctl_cmd_t *ipip, void *ifreq) 21977 { 21978 struct ifreq *ifr = (struct ifreq *)ifreq; 21979 struct lifreq *lifr = (struct lifreq *)ifreq; 21980 21981 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 21982 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21983 /* Get the interface index */ 21984 if (ipip->ipi_cmd_type == IF_CMD) { 21985 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21986 } else { 21987 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21988 } 21989 return (0); 21990 } 21991 21992 /* ARGSUSED */ 21993 int 21994 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21995 ip_ioctl_cmd_t *ipip, void *ifreq) 21996 { 21997 struct lifreq *lifr = (struct lifreq *)ifreq; 21998 21999 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 22000 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22001 /* Get the interface zone */ 22002 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22003 lifr->lifr_zoneid = ipif->ipif_zoneid; 22004 return (0); 22005 } 22006 22007 /* 22008 * Set the zoneid of an interface. 
22009 */ 22010 /* ARGSUSED */ 22011 int 22012 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22013 ip_ioctl_cmd_t *ipip, void *ifreq) 22014 { 22015 struct lifreq *lifr = (struct lifreq *)ifreq; 22016 int err = 0; 22017 boolean_t need_up = B_FALSE; 22018 zone_t *zptr; 22019 zone_status_t status; 22020 zoneid_t zoneid; 22021 22022 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22023 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 22024 if (!is_system_labeled()) 22025 return (ENOTSUP); 22026 zoneid = GLOBAL_ZONEID; 22027 } 22028 22029 /* cannot assign instance zero to a non-global zone */ 22030 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 22031 return (ENOTSUP); 22032 22033 /* 22034 * Cannot assign to a zone that doesn't exist or is shutting down. In 22035 * the event of a race with the zone shutdown processing, since IP 22036 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 22037 * interface will be cleaned up even if the zone is shut down 22038 * immediately after the status check. If the interface can't be brought 22039 * down right away, and the zone is shut down before the restart 22040 * function is called, we resolve the possible races by rechecking the 22041 * zone status in the restart function. 22042 */ 22043 if ((zptr = zone_find_by_id(zoneid)) == NULL) 22044 return (EINVAL); 22045 status = zone_status_get(zptr); 22046 zone_rele(zptr); 22047 22048 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 22049 return (EINVAL); 22050 22051 if (ipif->ipif_flags & IPIF_UP) { 22052 /* 22053 * If the interface is already marked up, 22054 * we call ipif_down which will take care 22055 * of ditching any IREs that have been set 22056 * up based on the old interface address. 
22057 */ 22058 err = ipif_logical_down(ipif, q, mp); 22059 if (err == EINPROGRESS) 22060 return (err); 22061 ipif_down_tail(ipif); 22062 need_up = B_TRUE; 22063 } 22064 22065 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 22066 return (err); 22067 } 22068 22069 static int 22070 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 22071 queue_t *q, mblk_t *mp, boolean_t need_up) 22072 { 22073 int err = 0; 22074 22075 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 22076 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22077 22078 /* Set the new zone id. */ 22079 ipif->ipif_zoneid = zoneid; 22080 22081 /* Update sctp list */ 22082 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 22083 22084 if (need_up) { 22085 /* 22086 * Now bring the interface back up. If this 22087 * is the only IPIF for the ILL, ipif_up 22088 * will have to re-bind to the device, so 22089 * we may get back EINPROGRESS, in which 22090 * case, this IOCTL will get completed in 22091 * ip_rput_dlpi when we see the DL_BIND_ACK. 
22092 */ 22093 err = ipif_up(ipif, q, mp); 22094 } 22095 return (err); 22096 } 22097 22098 /* ARGSUSED */ 22099 int 22100 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22101 ip_ioctl_cmd_t *ipip, void *if_req) 22102 { 22103 struct lifreq *lifr = (struct lifreq *)if_req; 22104 zoneid_t zoneid; 22105 zone_t *zptr; 22106 zone_status_t status; 22107 22108 ASSERT(ipif->ipif_id != 0); 22109 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22110 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 22111 zoneid = GLOBAL_ZONEID; 22112 22113 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 22114 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22115 22116 /* 22117 * We recheck the zone status to resolve the following race condition: 22118 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 22119 * 2) hme0:1 is up and can't be brought down right away; 22120 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 22121 * 3) zone "myzone" is halted; the zone status switches to 22122 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 22123 * the interfaces to remove - hme0:1 is not returned because it's not 22124 * yet in "myzone", so it won't be removed; 22125 * 4) the restart function for SIOCSLIFZONE is called; without the 22126 * status check here, we would have hme0:1 in "myzone" after it's been 22127 * destroyed. 22128 * Note that if the status check fails, we need to bring the interface 22129 * back to its state prior to ip_sioctl_slifzone(), hence the call to 22130 * ipif_up_done[_v6](). 
22131 */ 22132 status = ZONE_IS_UNINITIALIZED; 22133 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 22134 status = zone_status_get(zptr); 22135 zone_rele(zptr); 22136 } 22137 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 22138 if (ipif->ipif_isv6) { 22139 (void) ipif_up_done_v6(ipif); 22140 } else { 22141 (void) ipif_up_done(ipif); 22142 } 22143 return (EINVAL); 22144 } 22145 22146 ipif_down_tail(ipif); 22147 22148 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 22149 B_TRUE)); 22150 } 22151 22152 /* ARGSUSED */ 22153 int 22154 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22155 ip_ioctl_cmd_t *ipip, void *ifreq) 22156 { 22157 struct lifreq *lifr = ifreq; 22158 22159 ASSERT(q->q_next == NULL); 22160 ASSERT(CONN_Q(q)); 22161 22162 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 22163 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22164 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 22165 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 22166 22167 return (0); 22168 } 22169 22170 22171 /* Find the previous ILL in this usesrc group */ 22172 static ill_t * 22173 ill_prev_usesrc(ill_t *uill) 22174 { 22175 ill_t *ill; 22176 22177 for (ill = uill->ill_usesrc_grp_next; 22178 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 22179 ill = ill->ill_usesrc_grp_next) 22180 /* do nothing */; 22181 return (ill); 22182 } 22183 22184 /* 22185 * Release all members of the usesrc group. This routine is called 22186 * from ill_delete when the interface being unplumbed is the 22187 * group head. 
22188 */ 22189 static void 22190 ill_disband_usesrc_group(ill_t *uill) 22191 { 22192 ill_t *next_ill, *tmp_ill; 22193 ASSERT(RW_WRITE_HELD(&ill_g_usesrc_lock)); 22194 next_ill = uill->ill_usesrc_grp_next; 22195 22196 do { 22197 ASSERT(next_ill != NULL); 22198 tmp_ill = next_ill->ill_usesrc_grp_next; 22199 ASSERT(tmp_ill != NULL); 22200 next_ill->ill_usesrc_grp_next = NULL; 22201 next_ill->ill_usesrc_ifindex = 0; 22202 next_ill = tmp_ill; 22203 } while (next_ill->ill_usesrc_ifindex != 0); 22204 uill->ill_usesrc_grp_next = NULL; 22205 } 22206 22207 /* 22208 * Remove the client usesrc ILL from the list and relink to a new list 22209 */ 22210 int 22211 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 22212 { 22213 ill_t *ill, *tmp_ill; 22214 22215 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 22216 (uill != NULL) && RW_WRITE_HELD(&ill_g_usesrc_lock)); 22217 22218 /* 22219 * Check if the usesrc client ILL passed in is not already 22220 * in use as a usesrc ILL i.e one whose source address is 22221 * in use OR a usesrc ILL is not already in use as a usesrc 22222 * client ILL 22223 */ 22224 if ((ucill->ill_usesrc_ifindex == 0) || 22225 (uill->ill_usesrc_ifindex != 0)) { 22226 return (-1); 22227 } 22228 22229 ill = ill_prev_usesrc(ucill); 22230 ASSERT(ill->ill_usesrc_grp_next != NULL); 22231 22232 /* Remove from the current list */ 22233 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 22234 /* Only two elements in the list */ 22235 ASSERT(ill->ill_usesrc_ifindex == 0); 22236 ill->ill_usesrc_grp_next = NULL; 22237 } else { 22238 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 22239 } 22240 22241 if (ifindex == 0) { 22242 ucill->ill_usesrc_ifindex = 0; 22243 ucill->ill_usesrc_grp_next = NULL; 22244 return (0); 22245 } 22246 22247 ucill->ill_usesrc_ifindex = ifindex; 22248 tmp_ill = uill->ill_usesrc_grp_next; 22249 uill->ill_usesrc_grp_next = ucill; 22250 ucill->ill_usesrc_grp_next = 22251 (tmp_ill != NULL) ? 
tmp_ill : uill; 22252 return (0); 22253 } 22254 22255 /* 22256 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in 22257 * ip.c for locking details. 22258 */ 22259 /* ARGSUSED */ 22260 int 22261 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22262 ip_ioctl_cmd_t *ipip, void *ifreq) 22263 { 22264 struct lifreq *lifr = (struct lifreq *)ifreq; 22265 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, 22266 ill_flag_changed = B_FALSE; 22267 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 22268 int err = 0, ret; 22269 uint_t ifindex; 22270 phyint_t *us_phyint, *us_cli_phyint; 22271 ipsq_t *ipsq = NULL; 22272 22273 ASSERT(IAM_WRITER_IPIF(ipif)); 22274 ASSERT(q->q_next == NULL); 22275 ASSERT(CONN_Q(q)); 22276 22277 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 22278 us_cli_phyint = usesrc_cli_ill->ill_phyint; 22279 22280 ASSERT(us_cli_phyint != NULL); 22281 22282 /* 22283 * If the client ILL is being used for IPMP, abort. 22284 * Note, this can be done before ipsq_try_enter since we are already 22285 * exclusive on this ILL 22286 */ 22287 if ((us_cli_phyint->phyint_groupname != NULL) || 22288 (us_cli_phyint->phyint_flags & PHYI_STANDBY)) { 22289 return (EINVAL); 22290 } 22291 22292 ifindex = lifr->lifr_index; 22293 if (ifindex == 0) { 22294 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 22295 /* non usesrc group interface, nothing to reset */ 22296 return (0); 22297 } 22298 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 22299 /* valid reset request */ 22300 reset_flg = B_TRUE; 22301 } 22302 22303 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, 22304 ip_process_ioctl, &err); 22305 22306 if (usesrc_ill == NULL) { 22307 return (err); 22308 } 22309 22310 /* 22311 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP 22312 * group nor can either of the interfaces be used for standy. 
So 22313 * to guarantee mutual exclusion with ip_sioctl_flags (which sets 22314 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname) 22315 * we need to be exclusive on the ipsq belonging to the usesrc_ill. 22316 * We are already exlusive on this ipsq i.e ipsq corresponding to 22317 * the usesrc_cli_ill 22318 */ 22319 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 22320 NEW_OP, B_TRUE); 22321 if (ipsq == NULL) { 22322 err = EINPROGRESS; 22323 /* Operation enqueued on the ipsq of the usesrc ILL */ 22324 goto done; 22325 } 22326 22327 /* Check if the usesrc_ill is used for IPMP */ 22328 us_phyint = usesrc_ill->ill_phyint; 22329 if ((us_phyint->phyint_groupname != NULL) || 22330 (us_phyint->phyint_flags & PHYI_STANDBY)) { 22331 err = EINVAL; 22332 goto done; 22333 } 22334 22335 /* 22336 * If the client is already in use as a usesrc_ill or a usesrc_ill is 22337 * already a client then return EINVAL 22338 */ 22339 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 22340 err = EINVAL; 22341 goto done; 22342 } 22343 22344 /* 22345 * If the ill_usesrc_ifindex field is already set to what it needs to 22346 * be then this is a duplicate operation. 22347 */ 22348 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 22349 err = 0; 22350 goto done; 22351 } 22352 22353 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 22354 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 22355 usesrc_ill->ill_isv6)); 22356 22357 /* 22358 * The next step ensures that no new ires will be created referencing 22359 * the client ill, until the ILL_CHANGING flag is cleared. Then 22360 * we go through an ire walk deleting all ire caches that reference 22361 * the client ill. New ires referencing the client ill that are added 22362 * to the ire table before the ILL_CHANGING flag is set, will be 22363 * cleaned up by the ire walk below. 
Attempt to add new ires referencing 22364 * the client ill while the ILL_CHANGING flag is set will be failed 22365 * during the ire_add in ire_atomic_start. ire_atomic_start atomically 22366 * checks (under the ill_g_usesrc_lock) that the ire being added 22367 * is not stale, i.e the ire_stq and ire_ipif are consistent and 22368 * belong to the same usesrc group. 22369 */ 22370 mutex_enter(&usesrc_cli_ill->ill_lock); 22371 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; 22372 mutex_exit(&usesrc_cli_ill->ill_lock); 22373 ill_flag_changed = B_TRUE; 22374 22375 if (ipif->ipif_isv6) 22376 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 22377 ALL_ZONES); 22378 else 22379 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 22380 ALL_ZONES); 22381 22382 /* 22383 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 22384 * and the ill_usesrc_ifindex fields 22385 */ 22386 rw_enter(&ill_g_usesrc_lock, RW_WRITER); 22387 22388 if (reset_flg) { 22389 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 22390 if (ret != 0) { 22391 err = EINVAL; 22392 } 22393 rw_exit(&ill_g_usesrc_lock); 22394 goto done; 22395 } 22396 22397 /* 22398 * Four possibilities to consider: 22399 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 22400 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 22401 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 22402 * 4. 
Both are part of their respective usesrc groups 22403 */ 22404 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 22405 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 22406 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 22407 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 22408 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 22409 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 22410 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 22411 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 22412 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 22413 /* Insert at head of list */ 22414 usesrc_cli_ill->ill_usesrc_grp_next = 22415 usesrc_ill->ill_usesrc_grp_next; 22416 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 22417 } else { 22418 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 22419 ifindex); 22420 if (ret != 0) 22421 err = EINVAL; 22422 } 22423 rw_exit(&ill_g_usesrc_lock); 22424 22425 done: 22426 if (ill_flag_changed) { 22427 mutex_enter(&usesrc_cli_ill->ill_lock); 22428 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; 22429 mutex_exit(&usesrc_cli_ill->ill_lock); 22430 } 22431 if (ipsq != NULL) 22432 ipsq_exit(ipsq, B_TRUE, B_TRUE); 22433 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 22434 ill_refrele(usesrc_ill); 22435 return (err); 22436 } 22437 22438 /* 22439 * comparison function used by avl. 22440 */ 22441 static int 22442 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 22443 { 22444 22445 uint_t index; 22446 22447 ASSERT(phyip != NULL && index_ptr != NULL); 22448 22449 index = *((uint_t *)index_ptr); 22450 /* 22451 * let the phyint with the lowest index be on top. 22452 */ 22453 if (((phyint_t *)phyip)->phyint_ifindex < index) 22454 return (1); 22455 if (((phyint_t *)phyip)->phyint_ifindex > index) 22456 return (-1); 22457 return (0); 22458 } 22459 22460 /* 22461 * comparison function used by avl. 
22462 */ 22463 static int 22464 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 22465 { 22466 ill_t *ill; 22467 int res = 0; 22468 22469 ASSERT(phyip != NULL && name_ptr != NULL); 22470 22471 if (((phyint_t *)phyip)->phyint_illv4) 22472 ill = ((phyint_t *)phyip)->phyint_illv4; 22473 else 22474 ill = ((phyint_t *)phyip)->phyint_illv6; 22475 ASSERT(ill != NULL); 22476 22477 res = strcmp(ill->ill_name, (char *)name_ptr); 22478 if (res > 0) 22479 return (1); 22480 else if (res < 0) 22481 return (-1); 22482 return (0); 22483 } 22484 /* 22485 * This function is called from ill_delete when the ill is being 22486 * unplumbed. We remove the reference from the phyint and we also 22487 * free the phyint when there are no more references to it. 22488 */ 22489 static void 22490 ill_phyint_free(ill_t *ill) 22491 { 22492 phyint_t *phyi; 22493 phyint_t *next_phyint; 22494 ipsq_t *cur_ipsq; 22495 22496 ASSERT(ill->ill_phyint != NULL); 22497 22498 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 22499 phyi = ill->ill_phyint; 22500 ill->ill_phyint = NULL; 22501 /* 22502 * ill_init allocates a phyint always to store the copy 22503 * of flags relevant to phyint. At that point in time, we could 22504 * not assign the name and hence phyint_illv4/v6 could not be 22505 * initialized. Later in ipif_set_values, we assign the name to 22506 * the ill, at which point in time we assign phyint_illv4/v6. 22507 * Thus we don't rely on phyint_illv6 to be initialized always. 22508 */ 22509 if (ill->ill_flags & ILLF_IPV6) { 22510 phyi->phyint_illv6 = NULL; 22511 } else { 22512 phyi->phyint_illv4 = NULL; 22513 } 22514 /* 22515 * ipif_down removes it from the group when the last ipif goes 22516 * down. 22517 */ 22518 ASSERT(ill->ill_group == NULL); 22519 22520 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 22521 return; 22522 22523 /* 22524 * Make sure this phyint was put in the list. 
22525 */ 22526 if (phyi->phyint_ifindex > 0) { 22527 avl_remove(&phyint_g_list.phyint_list_avl_by_index, 22528 phyi); 22529 avl_remove(&phyint_g_list.phyint_list_avl_by_name, 22530 phyi); 22531 } 22532 /* 22533 * remove phyint from the ipsq list. 22534 */ 22535 cur_ipsq = phyi->phyint_ipsq; 22536 if (phyi == cur_ipsq->ipsq_phyint_list) { 22537 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 22538 } else { 22539 next_phyint = cur_ipsq->ipsq_phyint_list; 22540 while (next_phyint != NULL) { 22541 if (next_phyint->phyint_ipsq_next == phyi) { 22542 next_phyint->phyint_ipsq_next = 22543 phyi->phyint_ipsq_next; 22544 break; 22545 } 22546 next_phyint = next_phyint->phyint_ipsq_next; 22547 } 22548 ASSERT(next_phyint != NULL); 22549 } 22550 IPSQ_DEC_REF(cur_ipsq); 22551 22552 if (phyi->phyint_groupname_len != 0) { 22553 ASSERT(phyi->phyint_groupname != NULL); 22554 mi_free(phyi->phyint_groupname); 22555 } 22556 mi_free(phyi); 22557 } 22558 22559 /* 22560 * Attach the ill to the phyint structure which can be shared by both 22561 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 22562 * function is called from ipif_set_values and ill_lookup_on_name (for 22563 * loopback) where we know the name of the ill. We lookup the ill and if 22564 * there is one present already with the name use that phyint. Otherwise 22565 * reuse the one allocated by ill_init. 
22566 */ 22567 static void 22568 ill_phyint_reinit(ill_t *ill) 22569 { 22570 boolean_t isv6 = ill->ill_isv6; 22571 phyint_t *phyi_old; 22572 phyint_t *phyi; 22573 avl_index_t where = 0; 22574 ill_t *ill_other = NULL; 22575 ipsq_t *ipsq; 22576 22577 ASSERT(RW_WRITE_HELD(&ill_g_lock)); 22578 22579 phyi_old = ill->ill_phyint; 22580 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 22581 phyi_old->phyint_illv6 == NULL)); 22582 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 22583 phyi_old->phyint_illv4 == NULL)); 22584 ASSERT(phyi_old->phyint_ifindex == 0); 22585 22586 phyi = avl_find(&phyint_g_list.phyint_list_avl_by_name, 22587 ill->ill_name, &where); 22588 22589 /* 22590 * 1. We grabbed the ill_g_lock before inserting this ill into 22591 * the global list of ills. So no other thread could have located 22592 * this ill and hence the ipsq of this ill is guaranteed to be empty. 22593 * 2. Now locate the other protocol instance of this ill. 22594 * 3. Now grab both ill locks in the right order, and the phyint lock of 22595 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 22596 * of neither ill can change. 22597 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 22598 * other ill. 22599 * 5. Release all locks. 22600 */ 22601 22602 /* 22603 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 22604 * we are initializing IPv4. 22605 */ 22606 if (phyi != NULL) { 22607 ill_other = (isv6) ? phyi->phyint_illv4 : 22608 phyi->phyint_illv6; 22609 ASSERT(ill_other->ill_phyint != NULL); 22610 ASSERT((isv6 && !ill_other->ill_isv6) || 22611 (!isv6 && ill_other->ill_isv6)); 22612 GRAB_ILL_LOCKS(ill, ill_other); 22613 /* 22614 * We are potentially throwing away phyint_flags which 22615 * could be different from the one that we obtain from 22616 * ill_other->ill_phyint. But it is okay as we are assuming 22617 * that the state maintained within IP is correct. 
22618 */ 22619 mutex_enter(&phyi->phyint_lock); 22620 if (isv6) { 22621 ASSERT(phyi->phyint_illv6 == NULL); 22622 phyi->phyint_illv6 = ill; 22623 } else { 22624 ASSERT(phyi->phyint_illv4 == NULL); 22625 phyi->phyint_illv4 = ill; 22626 } 22627 /* 22628 * This is a new ill, currently undergoing SLIFNAME 22629 * So we could not have joined an IPMP group until now. 22630 */ 22631 ASSERT(phyi_old->phyint_ipsq_next == NULL && 22632 phyi_old->phyint_groupname == NULL); 22633 22634 /* 22635 * This phyi_old is going away. Decref ipsq_refs and 22636 * assert it is zero. The ipsq itself will be freed in 22637 * ipsq_exit 22638 */ 22639 ipsq = phyi_old->phyint_ipsq; 22640 IPSQ_DEC_REF(ipsq); 22641 ASSERT(ipsq->ipsq_refs == 0); 22642 /* Get the singleton phyint out of the ipsq list */ 22643 ASSERT(phyi_old->phyint_ipsq_next == NULL); 22644 ipsq->ipsq_phyint_list = NULL; 22645 phyi_old->phyint_illv4 = NULL; 22646 phyi_old->phyint_illv6 = NULL; 22647 mi_free(phyi_old); 22648 } else { 22649 mutex_enter(&ill->ill_lock); 22650 /* 22651 * We don't need to acquire any lock, since 22652 * the ill is not yet visible globally and we 22653 * have not yet released the ill_g_lock. 22654 */ 22655 phyi = phyi_old; 22656 mutex_enter(&phyi->phyint_lock); 22657 /* XXX We need a recovery strategy here. */ 22658 if (!phyint_assign_ifindex(phyi)) 22659 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 22660 22661 avl_insert(&phyint_g_list.phyint_list_avl_by_name, 22662 (void *)phyi, where); 22663 22664 (void) avl_find(&phyint_g_list.phyint_list_avl_by_index, 22665 &phyi->phyint_ifindex, &where); 22666 avl_insert(&phyint_g_list.phyint_list_avl_by_index, 22667 (void *)phyi, where); 22668 } 22669 22670 /* 22671 * Reassigning ill_phyint automatically reassigns the ipsq also. 22672 * pending mp is not affected because that is per ill basis. 22673 */ 22674 ill->ill_phyint = phyi; 22675 22676 /* 22677 * Keep the index on ipif_orig_index to be used by FAILOVER. 
22678 * We do this here as when the first ipif was allocated, 22679 * ipif_allocate does not know the right interface index. 22680 */ 22681 22682 ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; 22683 /* 22684 * Now that the phyint's ifindex has been assigned, complete the 22685 * remaining 22686 */ 22687 22688 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 22689 if (ill->ill_isv6) { 22690 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 22691 ill->ill_phyint->phyint_ifindex; 22692 } 22693 22694 /* 22695 * Generate an event within the hooks framework to indicate that 22696 * a new interface has just been added to IP. For this event to 22697 * be generated, the network interface must, at least, have an 22698 * ifindex assigned to it. 22699 * 22700 * This needs to be run inside the ill_g_lock perimeter to ensure 22701 * that the ordering of delivered events to listeners matches the 22702 * order of them in the kernel. 22703 * 22704 * This function could be called from ill_lookup_on_name. In that case 22705 * the interface is loopback "lo", which will not generate a NIC event. 22706 */ 22707 if (ill->ill_name_length <= 2 || 22708 ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { 22709 hook_nic_event_t *info; 22710 if ((info = ill->ill_nic_event_info) != NULL) { 22711 ip2dbg(("ill_phyint_reinit: unexpected nic event %d " 22712 "attached for %s\n", info->hne_event, 22713 ill->ill_name)); 22714 if (info->hne_data != NULL) 22715 kmem_free(info->hne_data, info->hne_datalen); 22716 kmem_free(info, sizeof (hook_nic_event_t)); 22717 } 22718 22719 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 22720 if (info != NULL) { 22721 info->hne_nic = ill->ill_phyint->phyint_ifindex; 22722 info->hne_lif = 0; 22723 info->hne_event = NE_PLUMB; 22724 info->hne_family = ill->ill_isv6 ? 
ipv6 : ipv4; 22725 info->hne_data = kmem_alloc(ill->ill_name_length, 22726 KM_NOSLEEP); 22727 if (info->hne_data != NULL) { 22728 info->hne_datalen = ill->ill_name_length; 22729 bcopy(ill->ill_name, info->hne_data, 22730 info->hne_datalen); 22731 } else { 22732 ip2dbg(("ill_phyint_reinit: could not attach " 22733 "ill_name information for PLUMB nic event " 22734 "of %s (ENOMEM)\n", ill->ill_name)); 22735 kmem_free(info, sizeof (hook_nic_event_t)); 22736 } 22737 } else 22738 ip2dbg(("ill_phyint_reinit: could not attach PLUMB nic " 22739 "event information for %s (ENOMEM)\n", 22740 ill->ill_name)); 22741 22742 ill->ill_nic_event_info = info; 22743 } 22744 22745 RELEASE_ILL_LOCKS(ill, ill_other); 22746 mutex_exit(&phyi->phyint_lock); 22747 } 22748 22749 /* 22750 * Notify any downstream modules of the name of this interface. 22751 * An M_IOCTL is used even though we don't expect a successful reply. 22752 * Any reply message from the driver (presumably an M_IOCNAK) will 22753 * eventually get discarded somewhere upstream. The message format is 22754 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 22755 * to IP. 
22756 */ 22757 static void 22758 ip_ifname_notify(ill_t *ill, queue_t *q) 22759 { 22760 mblk_t *mp1, *mp2; 22761 struct iocblk *iocp; 22762 struct lifreq *lifr; 22763 22764 mp1 = mkiocb(SIOCSLIFNAME); 22765 if (mp1 == NULL) 22766 return; 22767 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 22768 if (mp2 == NULL) { 22769 freeb(mp1); 22770 return; 22771 } 22772 22773 mp1->b_cont = mp2; 22774 iocp = (struct iocblk *)mp1->b_rptr; 22775 iocp->ioc_count = sizeof (struct lifreq); 22776 22777 lifr = (struct lifreq *)mp2->b_rptr; 22778 mp2->b_wptr += sizeof (struct lifreq); 22779 bzero(lifr, sizeof (struct lifreq)); 22780 22781 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 22782 lifr->lifr_ppa = ill->ill_ppa; 22783 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 22784 22785 putnext(q, mp1); 22786 } 22787 22788 static boolean_t ip_trash_timer_started = B_FALSE; 22789 22790 static int 22791 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 22792 { 22793 int err; 22794 22795 /* Set the obsolete NDD per-interface forwarding name. */ 22796 err = ill_set_ndd_name(ill); 22797 if (err != 0) { 22798 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 22799 err); 22800 } 22801 22802 /* Tell downstream modules where they are. */ 22803 ip_ifname_notify(ill, q); 22804 22805 /* 22806 * ill_dl_phys returns EINPROGRESS in the usual case. 22807 * Error cases are ENOMEM ... 22808 */ 22809 err = ill_dl_phys(ill, ipif, mp, q); 22810 22811 /* 22812 * If there is no IRE expiration timer running, get one started. 22813 * igmp and mld timers will be triggered by the first multicast 22814 */ 22815 if (!ip_trash_timer_started) { 22816 /* 22817 * acquire the lock and check again. 
22818 */ 22819 mutex_enter(&ip_trash_timer_lock); 22820 if (!ip_trash_timer_started) { 22821 ip_ire_expire_id = timeout(ip_trash_timer_expire, NULL, 22822 MSEC_TO_TICK(ip_timer_interval)); 22823 ip_trash_timer_started = B_TRUE; 22824 } 22825 mutex_exit(&ip_trash_timer_lock); 22826 } 22827 22828 if (ill->ill_isv6) { 22829 mutex_enter(&mld_slowtimeout_lock); 22830 if (mld_slowtimeout_id == 0) { 22831 mld_slowtimeout_id = timeout(mld_slowtimo, NULL, 22832 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 22833 } 22834 mutex_exit(&mld_slowtimeout_lock); 22835 } else { 22836 mutex_enter(&igmp_slowtimeout_lock); 22837 if (igmp_slowtimeout_id == 0) { 22838 igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL, 22839 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 22840 } 22841 mutex_exit(&igmp_slowtimeout_lock); 22842 } 22843 22844 return (err); 22845 } 22846 22847 /* 22848 * Common routine for ppa and ifname setting. Should be called exclusive. 22849 * 22850 * Returns EINPROGRESS when mp has been consumed by queueing it on 22851 * ill_pending_mp and the ioctl will complete in ip_rput. 22852 * 22853 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 22854 * the new name and new ppa in lifr_name and lifr_ppa respectively. 22855 * For SLIFNAME, we pass these values back to the userland. 
22856 */ 22857 static int 22858 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 22859 { 22860 ill_t *ill; 22861 ipif_t *ipif; 22862 ipsq_t *ipsq; 22863 char *ppa_ptr; 22864 char *old_ptr; 22865 char old_char; 22866 int error; 22867 22868 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 22869 ASSERT(q->q_next != NULL); 22870 ASSERT(interf_name != NULL); 22871 22872 ill = (ill_t *)q->q_ptr; 22873 22874 ASSERT(ill->ill_name[0] == '\0'); 22875 ASSERT(IAM_WRITER_ILL(ill)); 22876 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 22877 ASSERT(ill->ill_ppa == UINT_MAX); 22878 22879 /* The ppa is sent down by ifconfig or is chosen */ 22880 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 22881 return (EINVAL); 22882 } 22883 22884 /* 22885 * make sure ppa passed in is same as ppa in the name. 22886 * This check is not made when ppa == UINT_MAX in that case ppa 22887 * in the name could be anything. System will choose a ppa and 22888 * update new_ppa_ptr and inter_name to contain the choosen ppa. 22889 */ 22890 if (*new_ppa_ptr != UINT_MAX) { 22891 /* stoi changes the pointer */ 22892 old_ptr = ppa_ptr; 22893 /* 22894 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 22895 * (they don't have an externally visible ppa). We assign one 22896 * here so that we can manage the interface. Note that in 22897 * the past this value was always 0 for DLPI 1 drivers. 22898 */ 22899 if (*new_ppa_ptr == 0) 22900 *new_ppa_ptr = stoi(&old_ptr); 22901 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 22902 return (EINVAL); 22903 } 22904 /* 22905 * terminate string before ppa 22906 * save char at that location. 22907 */ 22908 old_char = ppa_ptr[0]; 22909 ppa_ptr[0] = '\0'; 22910 22911 ill->ill_ppa = *new_ppa_ptr; 22912 /* 22913 * Finish as much work now as possible before calling ill_glist_insert 22914 * which makes the ill globally visible and also merges it with the 22915 * other protocol instance of this phyint. 
The remaining work is 22916 * done after entering the ipsq which may happen sometime later. 22917 * ill_set_ndd_name occurs after the ill has been made globally visible. 22918 */ 22919 ipif = ill->ill_ipif; 22920 22921 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 22922 ipif_assign_seqid(ipif); 22923 22924 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 22925 ill->ill_flags |= ILLF_IPV4; 22926 22927 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 22928 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 22929 22930 if (ill->ill_flags & ILLF_IPV6) { 22931 22932 ill->ill_isv6 = B_TRUE; 22933 if (ill->ill_rq != NULL) { 22934 ill->ill_rq->q_qinfo = &rinit_ipv6; 22935 ill->ill_wq->q_qinfo = &winit_ipv6; 22936 } 22937 22938 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 22939 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 22940 ipif->ipif_v6src_addr = ipv6_all_zeros; 22941 ipif->ipif_v6subnet = ipv6_all_zeros; 22942 ipif->ipif_v6net_mask = ipv6_all_zeros; 22943 ipif->ipif_v6brd_addr = ipv6_all_zeros; 22944 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 22945 /* 22946 * point-to-point or Non-mulicast capable 22947 * interfaces won't do NUD unless explicitly 22948 * configured to do so. 22949 */ 22950 if (ipif->ipif_flags & IPIF_POINTOPOINT || 22951 !(ill->ill_flags & ILLF_MULTICAST)) { 22952 ill->ill_flags |= ILLF_NONUD; 22953 } 22954 /* Make sure IPv4 specific flag is not set on IPv6 if */ 22955 if (ill->ill_flags & ILLF_NOARP) { 22956 /* 22957 * Note: xresolv interfaces will eventually need 22958 * NOARP set here as well, but that will require 22959 * those external resolvers to have some 22960 * knowledge of that flag and act appropriately. 22961 * Not to be changed at present. 22962 */ 22963 ill->ill_flags &= ~ILLF_NOARP; 22964 } 22965 /* 22966 * Set the ILLF_ROUTER flag according to the global 22967 * IPv6 forwarding policy. 
22968 */ 22969 if (ipv6_forward != 0) 22970 ill->ill_flags |= ILLF_ROUTER; 22971 } else if (ill->ill_flags & ILLF_IPV4) { 22972 ill->ill_isv6 = B_FALSE; 22973 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 22974 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 22975 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 22976 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 22977 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 22978 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 22979 /* 22980 * Set the ILLF_ROUTER flag according to the global 22981 * IPv4 forwarding policy. 22982 */ 22983 if (ip_g_forward != 0) 22984 ill->ill_flags |= ILLF_ROUTER; 22985 } 22986 22987 ASSERT(ill->ill_phyint != NULL); 22988 22989 /* 22990 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 22991 * be completed in ill_glist_insert -> ill_phyint_reinit 22992 */ 22993 if (!ill_allocate_mibs(ill)) 22994 return (ENOMEM); 22995 22996 /* 22997 * Pick a default sap until we get the DL_INFO_ACK back from 22998 * the driver. 22999 */ 23000 if (ill->ill_sap == 0) { 23001 if (ill->ill_isv6) 23002 ill->ill_sap = IP6_DL_SAP; 23003 else 23004 ill->ill_sap = IP_DL_SAP; 23005 } 23006 23007 ill->ill_ifname_pending = 1; 23008 ill->ill_ifname_pending_err = 0; 23009 23010 ill_refhold(ill); 23011 rw_enter(&ill_g_lock, RW_WRITER); 23012 if ((error = ill_glist_insert(ill, interf_name, 23013 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 23014 ill->ill_ppa = UINT_MAX; 23015 ill->ill_name[0] = '\0'; 23016 /* 23017 * undo null termination done above. 23018 */ 23019 ppa_ptr[0] = old_char; 23020 rw_exit(&ill_g_lock); 23021 ill_refrele(ill); 23022 return (error); 23023 } 23024 23025 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 23026 23027 /* 23028 * When we return the buffer pointed to by interf_name should contain 23029 * the same name as in ill_name. 
23030 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 23031 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 23032 * so copy full name and update the ppa ptr. 23033 * When ppa passed in != UINT_MAX all values are correct just undo 23034 * null termination, this saves a bcopy. 23035 */ 23036 if (*new_ppa_ptr == UINT_MAX) { 23037 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 23038 *new_ppa_ptr = ill->ill_ppa; 23039 } else { 23040 /* 23041 * undo null termination done above. 23042 */ 23043 ppa_ptr[0] = old_char; 23044 } 23045 23046 /* Let SCTP know about this ILL */ 23047 sctp_update_ill(ill, SCTP_ILL_INSERT); 23048 23049 /* and also about the first ipif */ 23050 sctp_update_ipif(ipif, SCTP_IPIF_INSERT); 23051 23052 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP, 23053 B_TRUE); 23054 23055 rw_exit(&ill_g_lock); 23056 ill_refrele(ill); 23057 if (ipsq == NULL) 23058 return (EINPROGRESS); 23059 23060 /* 23061 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 23062 */ 23063 if (ipsq->ipsq_current_ipif == NULL) 23064 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 23065 else 23066 ASSERT(ipsq->ipsq_current_ipif == ipif); 23067 23068 error = ipif_set_values_tail(ill, ipif, mp, q); 23069 ipsq_exit(ipsq, B_TRUE, B_TRUE); 23070 if (error != 0 && error != EINPROGRESS) { 23071 /* 23072 * restore previous values 23073 */ 23074 ill->ill_isv6 = B_FALSE; 23075 } 23076 return (error); 23077 } 23078 23079 23080 extern void (*ip_cleanup_func)(void); 23081 23082 void 23083 ipif_init(void) 23084 { 23085 hrtime_t hrt; 23086 int i; 23087 23088 /* 23089 * Can't call drv_getparm here as it is too early in the boot. 23090 * As we use ipif_src_random just for picking a different 23091 * source address everytime, this need not be really random. 
23092 */ 23093 hrt = gethrtime(); 23094 ipif_src_random = ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff); 23095 23096 for (i = 0; i < MAX_G_HEADS; i++) { 23097 ill_g_heads[i].ill_g_list_head = (ill_if_t *)&ill_g_heads[i]; 23098 ill_g_heads[i].ill_g_list_tail = (ill_if_t *)&ill_g_heads[i]; 23099 } 23100 23101 avl_create(&phyint_g_list.phyint_list_avl_by_index, 23102 ill_phyint_compare_index, 23103 sizeof (phyint_t), 23104 offsetof(struct phyint, phyint_avl_by_index)); 23105 avl_create(&phyint_g_list.phyint_list_avl_by_name, 23106 ill_phyint_compare_name, 23107 sizeof (phyint_t), 23108 offsetof(struct phyint, phyint_avl_by_name)); 23109 23110 ip_cleanup_func = ip_thread_exit; 23111 } 23112 23113 /* 23114 * This is called by ip_rt_add when src_addr value is other than zero. 23115 * src_addr signifies the source address of the incoming packet. For 23116 * reverse tunnel route we need to create a source addr based routing 23117 * table. This routine creates ip_mrtun_table if it's empty and then 23118 * it adds the route entry hashed by source address. It verifies that 23119 * the outgoing interface is always a non-resolver interface (tunnel). 23120 */ 23121 int 23122 ip_mrtun_rt_add(ipaddr_t in_src_addr, int flags, ipif_t *ipif_arg, 23123 ipif_t *src_ipif, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func) 23124 { 23125 ire_t *ire; 23126 ire_t *save_ire; 23127 ipif_t *ipif; 23128 ill_t *in_ill = NULL; 23129 ill_t *out_ill; 23130 queue_t *stq; 23131 mblk_t *dlureq_mp; 23132 int error; 23133 23134 if (ire_arg != NULL) 23135 *ire_arg = NULL; 23136 ASSERT(in_src_addr != INADDR_ANY); 23137 23138 ipif = ipif_arg; 23139 if (ipif != NULL) { 23140 out_ill = ipif->ipif_ill; 23141 } else { 23142 ip1dbg(("ip_mrtun_rt_add: ipif is NULL\n")); 23143 return (EINVAL); 23144 } 23145 23146 if (src_ipif == NULL) { 23147 ip1dbg(("ip_mrtun_rt_add: src_ipif is NULL\n")); 23148 return (EINVAL); 23149 } 23150 in_ill = src_ipif->ipif_ill; 23151 23152 /* 23153 * Check for duplicates. 
We don't need to 23154 * match out_ill, because the uniqueness of 23155 * a route is only dependent on src_addr and 23156 * in_ill. 23157 */ 23158 ire = ire_mrtun_lookup(in_src_addr, in_ill); 23159 if (ire != NULL) { 23160 ire_refrele(ire); 23161 return (EEXIST); 23162 } 23163 if (ipif->ipif_net_type != IRE_IF_NORESOLVER) { 23164 ip2dbg(("ip_mrtun_rt_add: outgoing interface is type %d\n", 23165 ipif->ipif_net_type)); 23166 return (EINVAL); 23167 } 23168 23169 stq = ipif->ipif_wq; 23170 ASSERT(stq != NULL); 23171 23172 /* 23173 * The outgoing interface must be non-resolver 23174 * interface. 23175 */ 23176 dlureq_mp = ill_dlur_gen(NULL, 23177 out_ill->ill_phys_addr_length, out_ill->ill_sap, 23178 out_ill->ill_sap_length); 23179 23180 if (dlureq_mp == NULL) { 23181 ip1dbg(("ip_newroute: dlureq_mp NULL\n")); 23182 return (ENOMEM); 23183 } 23184 23185 /* Create the IRE. */ 23186 23187 ire = ire_create( 23188 NULL, /* Zero dst addr */ 23189 NULL, /* Zero mask */ 23190 NULL, /* Zero gateway addr */ 23191 NULL, /* Zero ipif_src addr */ 23192 (uint8_t *)&in_src_addr, /* in_src-addr */ 23193 &ipif->ipif_mtu, 23194 NULL, 23195 NULL, /* rfq */ 23196 stq, 23197 IRE_MIPRTUN, 23198 dlureq_mp, 23199 ipif, 23200 in_ill, 23201 0, 23202 0, 23203 0, 23204 flags, 23205 &ire_uinfo_null, 23206 NULL, 23207 NULL); 23208 23209 if (ire == NULL) { 23210 freeb(dlureq_mp); 23211 return (ENOMEM); 23212 } 23213 ip2dbg(("ip_mrtun_rt_add: mrtun route is created with type %d\n", 23214 ire->ire_type)); 23215 save_ire = ire; 23216 ASSERT(save_ire != NULL); 23217 error = ire_add_mrtun(&ire, q, mp, func); 23218 /* 23219 * If ire_add_mrtun() failed, the ire passed in was freed 23220 * so there is no need to do so here. 
23221 */ 23222 if (error != 0) { 23223 return (error); 23224 } 23225 23226 /* Duplicate check */ 23227 if (ire != save_ire) { 23228 /* route already exists by now */ 23229 ire_refrele(ire); 23230 return (EEXIST); 23231 } 23232 23233 if (ire_arg != NULL) { 23234 /* 23235 * Store the ire that was just added. the caller 23236 * ip_rts_request responsible for doing ire_refrele() 23237 * on it. 23238 */ 23239 *ire_arg = ire; 23240 } else { 23241 ire_refrele(ire); /* held in ire_add_mrtun */ 23242 } 23243 23244 return (0); 23245 } 23246 23247 /* 23248 * It is called by ip_rt_delete() only when mipagent requests to delete 23249 * a reverse tunnel route that was added by ip_mrtun_rt_add() before. 23250 */ 23251 23252 int 23253 ip_mrtun_rt_delete(ipaddr_t in_src_addr, ipif_t *src_ipif) 23254 { 23255 ire_t *ire = NULL; 23256 23257 if (in_src_addr == INADDR_ANY) 23258 return (EINVAL); 23259 if (src_ipif == NULL) 23260 return (EINVAL); 23261 23262 /* search if this route exists in the ip_mrtun_table */ 23263 ire = ire_mrtun_lookup(in_src_addr, src_ipif->ipif_ill); 23264 if (ire == NULL) { 23265 ip2dbg(("ip_mrtun_rt_delete: ire not found\n")); 23266 return (ESRCH); 23267 } 23268 ire_delete(ire); 23269 ire_refrele(ire); 23270 return (0); 23271 } 23272 23273 /* 23274 * Lookup the ipif corresponding to the onlink destination address. For 23275 * point-to-point interfaces, it matches with remote endpoint destination 23276 * address. For point-to-multipoint interfaces it only tries to match the 23277 * destination with the interface's subnet address. The longest, most specific 23278 * match is found to take care of such rare network configurations like - 23279 * le0: 129.146.1.1/16 23280 * le1: 129.146.2.2/24 23281 * It is used only by SO_DONTROUTE at the moment. 
23282 */ 23283 ipif_t * 23284 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid) 23285 { 23286 ipif_t *ipif, *best_ipif; 23287 ill_t *ill; 23288 ill_walk_context_t ctx; 23289 23290 ASSERT(zoneid != ALL_ZONES); 23291 best_ipif = NULL; 23292 23293 rw_enter(&ill_g_lock, RW_READER); 23294 ill = ILL_START_WALK_V4(&ctx); 23295 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 23296 mutex_enter(&ill->ill_lock); 23297 for (ipif = ill->ill_ipif; ipif != NULL; 23298 ipif = ipif->ipif_next) { 23299 if (!IPIF_CAN_LOOKUP(ipif)) 23300 continue; 23301 if (ipif->ipif_zoneid != zoneid && 23302 ipif->ipif_zoneid != ALL_ZONES) 23303 continue; 23304 /* 23305 * Point-to-point case. Look for exact match with 23306 * destination address. 23307 */ 23308 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 23309 if (ipif->ipif_pp_dst_addr == addr) { 23310 ipif_refhold_locked(ipif); 23311 mutex_exit(&ill->ill_lock); 23312 rw_exit(&ill_g_lock); 23313 if (best_ipif != NULL) 23314 ipif_refrele(best_ipif); 23315 return (ipif); 23316 } 23317 } else if (ipif->ipif_subnet == (addr & 23318 ipif->ipif_net_mask)) { 23319 /* 23320 * Point-to-multipoint case. Looping through to 23321 * find the most specific match. If there are 23322 * multiple best match ipif's then prefer ipif's 23323 * that are UP. If there is only one best match 23324 * ipif and it is DOWN we must still return it. 
23325 */ 23326 if ((best_ipif == NULL) || 23327 (ipif->ipif_net_mask > 23328 best_ipif->ipif_net_mask) || 23329 ((ipif->ipif_net_mask == 23330 best_ipif->ipif_net_mask) && 23331 ((ipif->ipif_flags & IPIF_UP) && 23332 (!(best_ipif->ipif_flags & IPIF_UP))))) { 23333 ipif_refhold_locked(ipif); 23334 mutex_exit(&ill->ill_lock); 23335 rw_exit(&ill_g_lock); 23336 if (best_ipif != NULL) 23337 ipif_refrele(best_ipif); 23338 best_ipif = ipif; 23339 rw_enter(&ill_g_lock, RW_READER); 23340 mutex_enter(&ill->ill_lock); 23341 } 23342 } 23343 } 23344 mutex_exit(&ill->ill_lock); 23345 } 23346 rw_exit(&ill_g_lock); 23347 return (best_ipif); 23348 } 23349 23350 23351 /* 23352 * Save enough information so that we can recreate the IRE if 23353 * the interface goes down and then up. 23354 */ 23355 static void 23356 ipif_save_ire(ipif_t *ipif, ire_t *ire) 23357 { 23358 mblk_t *save_mp; 23359 23360 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 23361 if (save_mp != NULL) { 23362 ifrt_t *ifrt; 23363 23364 save_mp->b_wptr += sizeof (ifrt_t); 23365 ifrt = (ifrt_t *)save_mp->b_rptr; 23366 bzero(ifrt, sizeof (ifrt_t)); 23367 ifrt->ifrt_type = ire->ire_type; 23368 ifrt->ifrt_addr = ire->ire_addr; 23369 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 23370 ifrt->ifrt_src_addr = ire->ire_src_addr; 23371 ifrt->ifrt_mask = ire->ire_mask; 23372 ifrt->ifrt_flags = ire->ire_flags; 23373 ifrt->ifrt_max_frag = ire->ire_max_frag; 23374 mutex_enter(&ipif->ipif_saved_ire_lock); 23375 save_mp->b_cont = ipif->ipif_saved_ire_mp; 23376 ipif->ipif_saved_ire_mp = save_mp; 23377 ipif->ipif_saved_ire_cnt++; 23378 mutex_exit(&ipif->ipif_saved_ire_lock); 23379 } 23380 } 23381 23382 23383 static void 23384 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 23385 { 23386 mblk_t **mpp; 23387 mblk_t *mp; 23388 ifrt_t *ifrt; 23389 23390 /* Remove from ipif_saved_ire_mp list if it is there */ 23391 mutex_enter(&ipif->ipif_saved_ire_lock); 23392 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 23393 mpp = &(*mpp)->b_cont) { 
23394 /* 23395 * On a given ipif, the triple of address, gateway and 23396 * mask is unique for each saved IRE (in the case of 23397 * ordinary interface routes, the gateway address is 23398 * all-zeroes). 23399 */ 23400 mp = *mpp; 23401 ifrt = (ifrt_t *)mp->b_rptr; 23402 if (ifrt->ifrt_addr == ire->ire_addr && 23403 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 23404 ifrt->ifrt_mask == ire->ire_mask) { 23405 *mpp = mp->b_cont; 23406 ipif->ipif_saved_ire_cnt--; 23407 freeb(mp); 23408 break; 23409 } 23410 } 23411 mutex_exit(&ipif->ipif_saved_ire_lock); 23412 } 23413 23414 23415 /* 23416 * IP multirouting broadcast routes handling 23417 * Append CGTP broadcast IREs to regular ones created 23418 * at ifconfig time. 23419 */ 23420 static void 23421 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst) 23422 { 23423 ire_t *ire_prim; 23424 23425 ASSERT(ire != NULL); 23426 ASSERT(ire_dst != NULL); 23427 23428 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 23429 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 23430 if (ire_prim != NULL) { 23431 /* 23432 * We are in the special case of broadcasts for 23433 * CGTP. We add an IRE_BROADCAST that holds 23434 * the RTF_MULTIRT flag, the destination 23435 * address of ire_dst and the low level 23436 * info of ire_prim. In other words, CGTP 23437 * broadcast is added to the redundant ipif. 
23438 */ 23439 ipif_t *ipif_prim; 23440 ire_t *bcast_ire; 23441 23442 ipif_prim = ire_prim->ire_ipif; 23443 23444 ip2dbg(("ip_cgtp_filter_bcast_add: " 23445 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 23446 (void *)ire_dst, (void *)ire_prim, 23447 (void *)ipif_prim)); 23448 23449 bcast_ire = ire_create( 23450 (uchar_t *)&ire->ire_addr, 23451 (uchar_t *)&ip_g_all_ones, 23452 (uchar_t *)&ire_dst->ire_src_addr, 23453 (uchar_t *)&ire->ire_gateway_addr, 23454 NULL, 23455 &ipif_prim->ipif_mtu, 23456 NULL, 23457 ipif_prim->ipif_rq, 23458 ipif_prim->ipif_wq, 23459 IRE_BROADCAST, 23460 ipif_prim->ipif_bcast_mp, 23461 ipif_prim, 23462 NULL, 23463 0, 23464 0, 23465 0, 23466 ire->ire_flags, 23467 &ire_uinfo_null, 23468 NULL, 23469 NULL); 23470 23471 if (bcast_ire != NULL) { 23472 23473 if (ire_add(&bcast_ire, NULL, NULL, NULL, 23474 B_FALSE) == 0) { 23475 ip2dbg(("ip_cgtp_filter_bcast_add: " 23476 "added bcast_ire %p\n", 23477 (void *)bcast_ire)); 23478 23479 ipif_save_ire(bcast_ire->ire_ipif, 23480 bcast_ire); 23481 ire_refrele(bcast_ire); 23482 } 23483 } 23484 ire_refrele(ire_prim); 23485 } 23486 } 23487 23488 23489 /* 23490 * IP multirouting broadcast routes handling 23491 * Remove the broadcast ire 23492 */ 23493 static void 23494 ip_cgtp_bcast_delete(ire_t *ire) 23495 { 23496 ire_t *ire_dst; 23497 23498 ASSERT(ire != NULL); 23499 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 23500 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 23501 if (ire_dst != NULL) { 23502 ire_t *ire_prim; 23503 23504 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 23505 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE); 23506 if (ire_prim != NULL) { 23507 ipif_t *ipif_prim; 23508 ire_t *bcast_ire; 23509 23510 ipif_prim = ire_prim->ire_ipif; 23511 23512 ip2dbg(("ip_cgtp_filter_bcast_delete: " 23513 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 23514 (void *)ire_dst, (void *)ire_prim, 23515 (void *)ipif_prim)); 23516 23517 bcast_ire = ire_ctable_lookup(ire->ire_addr, 23518 
ire->ire_gateway_addr, 23519 IRE_BROADCAST, 23520 ipif_prim, ALL_ZONES, 23521 NULL, 23522 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 23523 MATCH_IRE_MASK); 23524 23525 if (bcast_ire != NULL) { 23526 ip2dbg(("ip_cgtp_filter_bcast_delete: " 23527 "looked up bcast_ire %p\n", 23528 (void *)bcast_ire)); 23529 ipif_remove_ire(bcast_ire->ire_ipif, 23530 bcast_ire); 23531 ire_delete(bcast_ire); 23532 } 23533 ire_refrele(ire_prim); 23534 } 23535 ire_refrele(ire_dst); 23536 } 23537 } 23538 23539 /* 23540 * IPsec hardware acceleration capabilities related functions. 23541 */ 23542 23543 /* 23544 * Free a per-ill IPsec capabilities structure. 23545 */ 23546 static void 23547 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 23548 { 23549 if (capab->auth_hw_algs != NULL) 23550 kmem_free(capab->auth_hw_algs, capab->algs_size); 23551 if (capab->encr_hw_algs != NULL) 23552 kmem_free(capab->encr_hw_algs, capab->algs_size); 23553 if (capab->encr_algparm != NULL) 23554 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 23555 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 23556 } 23557 23558 /* 23559 * Allocate a new per-ill IPsec capabilities structure. This structure 23560 * is specific to an IPsec protocol (AH or ESP). It is implemented as 23561 * an array which specifies, for each algorithm, whether this algorithm 23562 * is supported by the ill or not. 
23563 */ 23564 static ill_ipsec_capab_t * 23565 ill_ipsec_capab_alloc(void) 23566 { 23567 ill_ipsec_capab_t *capab; 23568 uint_t nelems; 23569 23570 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 23571 if (capab == NULL) 23572 return (NULL); 23573 23574 /* we need one bit per algorithm */ 23575 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 23576 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 23577 23578 /* allocate memory to store algorithm flags */ 23579 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 23580 if (capab->encr_hw_algs == NULL) 23581 goto nomem; 23582 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 23583 if (capab->auth_hw_algs == NULL) 23584 goto nomem; 23585 /* 23586 * Leave encr_algparm NULL for now since we won't need it half 23587 * the time 23588 */ 23589 return (capab); 23590 23591 nomem: 23592 ill_ipsec_capab_free(capab); 23593 return (NULL); 23594 } 23595 23596 /* 23597 * Resize capability array. Since we're exclusive, this is OK. 23598 */ 23599 static boolean_t 23600 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 23601 { 23602 ipsec_capab_algparm_t *nalp, *oalp; 23603 uint32_t olen, nlen; 23604 23605 oalp = capab->encr_algparm; 23606 olen = capab->encr_algparm_size; 23607 23608 if (oalp != NULL) { 23609 if (algid < capab->encr_algparm_end) 23610 return (B_TRUE); 23611 } 23612 23613 nlen = (algid + 1) * sizeof (*nalp); 23614 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 23615 if (nalp == NULL) 23616 return (B_FALSE); 23617 23618 if (oalp != NULL) { 23619 bcopy(oalp, nalp, olen); 23620 kmem_free(oalp, olen); 23621 } 23622 capab->encr_algparm = nalp; 23623 capab->encr_algparm_size = nlen; 23624 capab->encr_algparm_end = algid + 1; 23625 23626 return (B_TRUE); 23627 } 23628 23629 /* 23630 * Compare the capabilities of the specified ill with the protocol 23631 * and algorithms specified by the SA passed as argument. 
23632 * If they match, returns B_TRUE, B_FALSE if they do not match. 23633 * 23634 * The ill can be passed as a pointer to it, or by specifying its index 23635 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 23636 * 23637 * Called by ipsec_out_is_accelerated() do decide whether an outbound 23638 * packet is eligible for hardware acceleration, and by 23639 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down 23640 * to a particular ill. 23641 */ 23642 boolean_t 23643 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 23644 ipsa_t *sa) 23645 { 23646 boolean_t sa_isv6; 23647 uint_t algid; 23648 struct ill_ipsec_capab_s *cpp; 23649 boolean_t need_refrele = B_FALSE; 23650 23651 if (ill == NULL) { 23652 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 23653 NULL, NULL, NULL); 23654 if (ill == NULL) { 23655 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 23656 return (B_FALSE); 23657 } 23658 need_refrele = B_TRUE; 23659 } 23660 23661 /* 23662 * Use the address length specified by the SA to determine 23663 * if it corresponds to a IPv6 address, and fail the matching 23664 * if the isv6 flag passed as argument does not match. 23665 * Note: this check is used for SADB capability checking before 23666 * sending SA information to an ill. 23667 */ 23668 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 23669 if (sa_isv6 != ill_isv6) 23670 /* protocol mismatch */ 23671 goto done; 23672 23673 /* 23674 * Check if the ill supports the protocol, algorithm(s) and 23675 * key size(s) specified by the SA, and get the pointers to 23676 * the algorithms supported by the ill. 
23677 */ 23678 switch (sa->ipsa_type) { 23679 23680 case SADB_SATYPE_ESP: 23681 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 23682 /* ill does not support ESP acceleration */ 23683 goto done; 23684 cpp = ill->ill_ipsec_capab_esp; 23685 algid = sa->ipsa_auth_alg; 23686 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 23687 goto done; 23688 algid = sa->ipsa_encr_alg; 23689 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 23690 goto done; 23691 if (algid < cpp->encr_algparm_end) { 23692 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 23693 if (sa->ipsa_encrkeybits < alp->minkeylen) 23694 goto done; 23695 if (sa->ipsa_encrkeybits > alp->maxkeylen) 23696 goto done; 23697 } 23698 break; 23699 23700 case SADB_SATYPE_AH: 23701 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 23702 /* ill does not support AH acceleration */ 23703 goto done; 23704 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 23705 ill->ill_ipsec_capab_ah->auth_hw_algs)) 23706 goto done; 23707 break; 23708 } 23709 23710 if (need_refrele) 23711 ill_refrele(ill); 23712 return (B_TRUE); 23713 done: 23714 if (need_refrele) 23715 ill_refrele(ill); 23716 return (B_FALSE); 23717 } 23718 23719 23720 /* 23721 * Add a new ill to the list of IPsec capable ills. 23722 * Called from ill_capability_ipsec_ack() when an ACK was received 23723 * indicating that IPsec hardware processing was enabled for an ill. 23724 * 23725 * ill must point to the ill for which acceleration was enabled. 23726 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
23727 */ 23728 static void 23729 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 23730 { 23731 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 23732 uint_t sa_type; 23733 uint_t ipproto; 23734 23735 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 23736 (dl_cap == DL_CAPAB_IPSEC_ESP)); 23737 23738 switch (dl_cap) { 23739 case DL_CAPAB_IPSEC_AH: 23740 sa_type = SADB_SATYPE_AH; 23741 ills = &ipsec_capab_ills_ah; 23742 ipproto = IPPROTO_AH; 23743 break; 23744 case DL_CAPAB_IPSEC_ESP: 23745 sa_type = SADB_SATYPE_ESP; 23746 ills = &ipsec_capab_ills_esp; 23747 ipproto = IPPROTO_ESP; 23748 break; 23749 } 23750 23751 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 23752 23753 /* 23754 * Add ill index to list of hardware accelerators. If 23755 * already in list, do nothing. 23756 */ 23757 for (cur_ill = *ills; cur_ill != NULL && 23758 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 23759 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 23760 ; 23761 23762 if (cur_ill == NULL) { 23763 /* if this is a new entry for this ill */ 23764 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 23765 if (new_ill == NULL) { 23766 rw_exit(&ipsec_capab_ills_lock); 23767 return; 23768 } 23769 23770 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 23771 new_ill->ill_isv6 = ill->ill_isv6; 23772 new_ill->next = *ills; 23773 *ills = new_ill; 23774 } else if (!sadb_resync) { 23775 /* not resync'ing SADB and an entry exists for this ill */ 23776 rw_exit(&ipsec_capab_ills_lock); 23777 return; 23778 } 23779 23780 rw_exit(&ipsec_capab_ills_lock); 23781 23782 if (ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 23783 /* 23784 * IPsec module for protocol loaded, initiate dump 23785 * of the SADB to this ill. 23786 */ 23787 sadb_ill_download(ill, sa_type); 23788 } 23789 23790 /* 23791 * Remove an ill from the list of IPsec capable ills. 
23792 */ 23793 static void 23794 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 23795 { 23796 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 23797 23798 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 23799 dl_cap == DL_CAPAB_IPSEC_ESP); 23800 23801 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipsec_capab_ills_ah : 23802 &ipsec_capab_ills_esp; 23803 23804 rw_enter(&ipsec_capab_ills_lock, RW_WRITER); 23805 23806 prev_ill = NULL; 23807 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 23808 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 23809 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 23810 ; 23811 if (cur_ill == NULL) { 23812 /* entry not found */ 23813 rw_exit(&ipsec_capab_ills_lock); 23814 return; 23815 } 23816 if (prev_ill == NULL) { 23817 /* entry at front of list */ 23818 *ills = NULL; 23819 } else { 23820 prev_ill->next = cur_ill->next; 23821 } 23822 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 23823 rw_exit(&ipsec_capab_ills_lock); 23824 } 23825 23826 23827 /* 23828 * Handling of DL_CONTROL_REQ messages that must be sent down to 23829 * an ill while having exclusive access. 23830 */ 23831 /* ARGSUSED */ 23832 static void 23833 ill_ipsec_capab_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 23834 { 23835 ill_t *ill = (ill_t *)q->q_ptr; 23836 23837 ill_dlpi_send(ill, mp); 23838 } 23839 23840 23841 /* 23842 * Called by SADB to send a DL_CONTROL_REQ message to every ill 23843 * supporting the specified IPsec protocol acceleration. 23844 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 23845 * We free the mblk and, if sa is non-null, release the held referece. 23846 */ 23847 void 23848 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa) 23849 { 23850 ipsec_capab_ill_t *ici, *cur_ici; 23851 ill_t *ill; 23852 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 23853 23854 ici = (sa_type == SADB_SATYPE_AH) ? 
ipsec_capab_ills_ah : 23855 ipsec_capab_ills_esp; 23856 23857 rw_enter(&ipsec_capab_ills_lock, RW_READER); 23858 23859 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 23860 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 23861 cur_ici->ill_isv6, NULL, NULL, NULL, NULL); 23862 23863 /* 23864 * Handle the case where the ill goes away while the SADB is 23865 * attempting to send messages. If it's going away, it's 23866 * nuking its shadow SADB, so we don't care.. 23867 */ 23868 23869 if (ill == NULL) 23870 continue; 23871 23872 if (sa != NULL) { 23873 /* 23874 * Make sure capabilities match before 23875 * sending SA to ill. 23876 */ 23877 if (!ipsec_capab_match(ill, cur_ici->ill_index, 23878 cur_ici->ill_isv6, sa)) { 23879 ill_refrele(ill); 23880 continue; 23881 } 23882 23883 mutex_enter(&sa->ipsa_lock); 23884 sa->ipsa_flags |= IPSA_F_HW; 23885 mutex_exit(&sa->ipsa_lock); 23886 } 23887 23888 /* 23889 * Copy template message, and add it to the front 23890 * of the mblk ship list. We want to avoid holding 23891 * the ipsec_capab_ills_lock while sending the 23892 * message to the ills. 23893 * 23894 * The b_next and b_prev are temporarily used 23895 * to build a list of mblks to be sent down, and to 23896 * save the ill to which they must be sent. 23897 */ 23898 nmp = copymsg(mp); 23899 if (nmp == NULL) { 23900 ill_refrele(ill); 23901 continue; 23902 } 23903 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 23904 nmp->b_next = mp_ship_list; 23905 mp_ship_list = nmp; 23906 nmp->b_prev = (mblk_t *)ill; 23907 } 23908 23909 rw_exit(&ipsec_capab_ills_lock); 23910 23911 nmp = mp_ship_list; 23912 while (nmp != NULL) { 23913 /* restore the mblk to a sane state */ 23914 next_mp = nmp->b_next; 23915 nmp->b_next = NULL; 23916 ill = (ill_t *)nmp->b_prev; 23917 nmp->b_prev = NULL; 23918 23919 /* 23920 * Ship the mblk to the ill, must be exclusive. Keep the 23921 * reference to the ill as qwriter_ip() does a ill_referele(). 
23922 */ 23923 (void) qwriter_ip(NULL, ill, ill->ill_wq, nmp, 23924 ill_ipsec_capab_send_writer, NEW_OP, B_TRUE); 23925 23926 nmp = next_mp; 23927 } 23928 23929 if (sa != NULL) 23930 IPSA_REFRELE(sa); 23931 freemsg(mp); 23932 } 23933 23934 23935 /* 23936 * Derive an interface id from the link layer address. 23937 * Knows about IEEE 802 and IEEE EUI-64 mappings. 23938 */ 23939 static boolean_t 23940 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23941 { 23942 char *addr; 23943 23944 if (phys_length != ETHERADDRL) 23945 return (B_FALSE); 23946 23947 /* Form EUI-64 like address */ 23948 addr = (char *)&v6addr->s6_addr32[2]; 23949 bcopy((char *)phys_addr, addr, 3); 23950 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 23951 addr[3] = (char)0xff; 23952 addr[4] = (char)0xfe; 23953 bcopy((char *)phys_addr + 3, addr + 5, 3); 23954 return (B_TRUE); 23955 } 23956 23957 /* ARGSUSED */ 23958 static boolean_t 23959 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23960 { 23961 return (B_FALSE); 23962 } 23963 23964 /* ARGSUSED */ 23965 static boolean_t 23966 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 23967 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 23968 { 23969 /* 23970 * Multicast address mappings used over Ethernet/802.X. 23971 * This address is used as a base for mappings. 23972 */ 23973 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 23974 0x00, 0x00, 0x00}; 23975 23976 /* 23977 * Extract low order 32 bits from IPv6 multicast address. 23978 * Or that into the link layer address, starting from the 23979 * second byte. 
23980 */ 23981 *hw_start = 2; 23982 v6_extract_mask->s6_addr32[0] = 0; 23983 v6_extract_mask->s6_addr32[1] = 0; 23984 v6_extract_mask->s6_addr32[2] = 0; 23985 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 23986 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 23987 return (B_TRUE); 23988 } 23989 23990 /* 23991 * Indicate by return value whether multicast is supported. If not, 23992 * this code should not touch/change any parameters. 23993 */ 23994 /* ARGSUSED */ 23995 static boolean_t 23996 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 23997 uint32_t *hw_start, ipaddr_t *extract_mask) 23998 { 23999 /* 24000 * Multicast address mappings used over Ethernet/802.X. 24001 * This address is used as a base for mappings. 24002 */ 24003 static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, 24004 0x00, 0x00, 0x00 }; 24005 24006 if (phys_length != ETHERADDRL) 24007 return (B_FALSE); 24008 24009 *extract_mask = htonl(0x007fffff); 24010 *hw_start = 2; 24011 bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); 24012 return (B_TRUE); 24013 } 24014 24015 /* 24016 * Derive IPoIB interface id from the link layer address. 24017 */ 24018 static boolean_t 24019 ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 24020 { 24021 char *addr; 24022 24023 if (phys_length != 20) 24024 return (B_FALSE); 24025 addr = (char *)&v6addr->s6_addr32[2]; 24026 bcopy(phys_addr + 12, addr, 8); 24027 /* 24028 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 24029 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 24030 * rules. In these cases, the IBA considers these GUIDs to be in 24031 * "Modified EUI-64" format, and thus toggling the u/l bit is not 24032 * required; vendors are required not to assign global EUI-64's 24033 * that differ only in u/l bit values, thus guaranteeing uniqueness 24034 * of the interface identifier. 
Whether the GUID is in modified 24035 * or proper EUI-64 format, the ipv6 identifier must have the u/l 24036 * bit set to 1. 24037 */ 24038 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 24039 return (B_TRUE); 24040 } 24041 24042 /* 24043 * Note on mapping from multicast IP addresses to IPoIB multicast link 24044 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 24045 * The format of an IPoIB multicast address is: 24046 * 24047 * 4 byte QPN Scope Sign. Pkey 24048 * +--------------------------------------------+ 24049 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 24050 * +--------------------------------------------+ 24051 * 24052 * The Scope and Pkey components are properties of the IBA port and 24053 * network interface. They can be ascertained from the broadcast address. 24054 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 24055 */ 24056 24057 static boolean_t 24058 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 24059 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 24060 { 24061 /* 24062 * Base IPoIB IPv6 multicast address used for mappings. 24063 * Does not contain the IBA scope/Pkey values. 24064 */ 24065 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 24066 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 24067 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 24068 24069 /* 24070 * Extract low order 80 bits from IPv6 multicast address. 24071 * Or that into the link layer address, starting from the 24072 * sixth byte. 24073 */ 24074 *hw_start = 6; 24075 bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); 24076 24077 /* 24078 * Now fill in the IBA scope/Pkey values from the broadcast address. 
24079 */ 24080 *(maddr + 5) = *(bphys_addr + 5); 24081 *(maddr + 8) = *(bphys_addr + 8); 24082 *(maddr + 9) = *(bphys_addr + 9); 24083 24084 v6_extract_mask->s6_addr32[0] = 0; 24085 v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); 24086 v6_extract_mask->s6_addr32[2] = 0xffffffffU; 24087 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 24088 return (B_TRUE); 24089 } 24090 24091 static boolean_t 24092 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 24093 uint32_t *hw_start, ipaddr_t *extract_mask) 24094 { 24095 /* 24096 * Base IPoIB IPv4 multicast address used for mappings. 24097 * Does not contain the IBA scope/Pkey values. 24098 */ 24099 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 24100 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 24101 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 24102 24103 if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) 24104 return (B_FALSE); 24105 24106 /* 24107 * Extract low order 28 bits from IPv4 multicast address. 24108 * Or that into the link layer address, starting from the 24109 * sixteenth byte. 24110 */ 24111 *extract_mask = htonl(0x0fffffff); 24112 *hw_start = 16; 24113 bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); 24114 24115 /* 24116 * Now fill in the IBA scope/Pkey values from the broadcast address. 24117 */ 24118 *(maddr + 5) = *(bphys_addr + 5); 24119 *(maddr + 8) = *(bphys_addr + 8); 24120 *(maddr + 9) = *(bphys_addr + 9); 24121 return (B_TRUE); 24122 } 24123 24124 /* 24125 * Returns B_TRUE if an ipif is present in the given zone, matching some flags 24126 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. 24127 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with 24128 * the link-local address is preferred. 
24129 */ 24130 boolean_t 24131 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 24132 { 24133 ipif_t *ipif; 24134 ipif_t *maybe_ipif = NULL; 24135 24136 mutex_enter(&ill->ill_lock); 24137 if (ill->ill_state_flags & ILL_CONDEMNED) { 24138 mutex_exit(&ill->ill_lock); 24139 if (ipifp != NULL) 24140 *ipifp = NULL; 24141 return (B_FALSE); 24142 } 24143 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 24144 if (!IPIF_CAN_LOOKUP(ipif)) 24145 continue; 24146 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 24147 ipif->ipif_zoneid != ALL_ZONES) 24148 continue; 24149 if ((ipif->ipif_flags & flags) != flags) 24150 continue; 24151 24152 if (ipifp == NULL) { 24153 mutex_exit(&ill->ill_lock); 24154 ASSERT(maybe_ipif == NULL); 24155 return (B_TRUE); 24156 } 24157 if (!ill->ill_isv6 || 24158 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { 24159 ipif_refhold_locked(ipif); 24160 mutex_exit(&ill->ill_lock); 24161 *ipifp = ipif; 24162 return (B_TRUE); 24163 } 24164 if (maybe_ipif == NULL) 24165 maybe_ipif = ipif; 24166 } 24167 if (ipifp != NULL) { 24168 if (maybe_ipif != NULL) 24169 ipif_refhold_locked(maybe_ipif); 24170 *ipifp = maybe_ipif; 24171 } 24172 mutex_exit(&ill->ill_lock); 24173 return (maybe_ipif != NULL); 24174 } 24175 24176 /* 24177 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group. 24178 */ 24179 boolean_t 24180 ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 24181 { 24182 ill_t *illg; 24183 24184 /* 24185 * We look at the passed-in ill first without grabbing ill_g_lock. 24186 */ 24187 if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) { 24188 return (B_TRUE); 24189 } 24190 rw_enter(&ill_g_lock, RW_READER); 24191 if (ill->ill_group == NULL) { 24192 /* ill not in a group */ 24193 rw_exit(&ill_g_lock); 24194 return (B_FALSE); 24195 } 24196 24197 /* 24198 * There's no ipif in the zone on ill, however ill is part of an IPMP 24199 * group. 
We need to look for an ipif in the zone on all the ills in the 24200 * group. 24201 */ 24202 illg = ill->ill_group->illgrp_ill; 24203 do { 24204 /* 24205 * We don't call ipif_lookup_zoneid() on ill as we already know 24206 * that it's not there. 24207 */ 24208 if (illg != ill && 24209 ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) { 24210 break; 24211 } 24212 } while ((illg = illg->ill_group_next) != NULL); 24213 rw_exit(&ill_g_lock); 24214 return (illg != NULL); 24215 } 24216 24217 /* 24218 * Check if this ill is only being used to send ICMP probes for IPMP 24219 */ 24220 boolean_t 24221 ill_is_probeonly(ill_t *ill) 24222 { 24223 /* 24224 * Check if the interface is FAILED, or INACTIVE 24225 */ 24226 if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE)) 24227 return (B_TRUE); 24228 24229 return (B_FALSE); 24230 } 24231 24232 /* 24233 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 24234 * If a pointer to an ipif_t is returned then the caller will need to do 24235 * an ill_refrele(). 24236 */ 24237 ipif_t * 24238 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6) 24239 { 24240 ipif_t *ipif; 24241 ill_t *ill; 24242 24243 ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL); 24244 24245 if (ill == NULL) 24246 return (NULL); 24247 24248 mutex_enter(&ill->ill_lock); 24249 if (ill->ill_state_flags & ILL_CONDEMNED) { 24250 mutex_exit(&ill->ill_lock); 24251 ill_refrele(ill); 24252 return (NULL); 24253 } 24254 24255 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 24256 if (!IPIF_CAN_LOOKUP(ipif)) 24257 continue; 24258 if (lifidx == ipif->ipif_id) { 24259 ipif_refhold_locked(ipif); 24260 break; 24261 } 24262 } 24263 24264 mutex_exit(&ill->ill_lock); 24265 ill_refrele(ill); 24266 return (ipif); 24267 } 24268 24269 /* 24270 * Flush the fastpath by deleting any IRE's that are waiting for the fastpath, 24271 * and any IRE's that are using the fastpath. 
There are two exceptions: 24272 * IRE_MIPRTUN and IRE_BROADCAST are difficult to recreate, so instead we just 24273 * nuke their nce_fp_mp's; see ire_fastpath_flush() for details. 24274 */ 24275 void 24276 ill_fastpath_flush(ill_t *ill) 24277 { 24278 if (ill->ill_isv6) { 24279 nce_fastpath_list_dispatch(ill, NULL, NULL); 24280 ndp_walk(ill, (pfi_t)ndp_fastpath_flush, NULL); 24281 } else { 24282 ire_fastpath_list_dispatch(ill, NULL, NULL); 24283 ire_walk_ill_v4(MATCH_IRE_WQ | MATCH_IRE_TYPE, 24284 IRE_CACHE | IRE_BROADCAST, ire_fastpath_flush, NULL, ill); 24285 mutex_enter(&ire_mrtun_lock); 24286 if (ire_mrtun_count != 0) { 24287 mutex_exit(&ire_mrtun_lock); 24288 ire_walk_ill_mrtun(MATCH_IRE_WQ, IRE_MIPRTUN, 24289 ire_fastpath_flush, NULL, ill); 24290 } else { 24291 mutex_exit(&ire_mrtun_lock); 24292 } 24293 } 24294 } 24295 24296 /* 24297 * Set the physical address information for `ill' to the contents of the 24298 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 24299 * asynchronous if `ill' cannot immediately be quiesced -- in which case 24300 * EINPROGRESS will be returned. 24301 */ 24302 int 24303 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 24304 { 24305 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 24306 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 24307 24308 ASSERT(IAM_WRITER_IPSQ(ipsq)); 24309 24310 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 24311 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 24312 /* Changing DL_IPV6_TOKEN is not yet supported */ 24313 return (0); 24314 } 24315 24316 /* 24317 * We need to store up to two copies of `mp' in `ill'. Due to the 24318 * design of ipsq_pending_mp_add(), we can't pass them as separate 24319 * arguments to ill_set_phys_addr_tail(). Instead, chain them 24320 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 
24321 */ 24322 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 24323 freemsg(mp); 24324 return (ENOMEM); 24325 } 24326 24327 ipsq_current_start(ipsq, ill->ill_ipif, 0); 24328 24329 /* 24330 * If we can quiesce the ill, then set the address. If not, then 24331 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 24332 */ 24333 ill_down_ipifs(ill, NULL, 0, B_FALSE); 24334 mutex_enter(&ill->ill_lock); 24335 if (!ill_is_quiescent(ill)) { 24336 /* call cannot fail since `conn_t *' argument is NULL */ 24337 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 24338 mp, ILL_DOWN); 24339 mutex_exit(&ill->ill_lock); 24340 return (EINPROGRESS); 24341 } 24342 mutex_exit(&ill->ill_lock); 24343 24344 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 24345 return (0); 24346 } 24347 24348 /* 24349 * Once the ill associated with `q' has quiesced, set its physical address 24350 * information to the values in `addrmp'. Note that two copies of `addrmp' 24351 * are passed (linked by b_cont), since we sometimes need to save two distinct 24352 * copies in the ill_t, and our context doesn't permit sleeping or allocation 24353 * failure (we'll free the other copy if it's not needed). Since the ill_t 24354 * is quiesced, we know any stale IREs with the old address information have 24355 * already been removed, so we don't need to call ill_fastpath_flush(). 
24356 */ 24357 /* ARGSUSED */ 24358 static void 24359 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 24360 { 24361 ill_t *ill = q->q_ptr; 24362 mblk_t *addrmp2 = unlinkb(addrmp); 24363 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 24364 uint_t addrlen, addroff; 24365 24366 ASSERT(IAM_WRITER_IPSQ(ipsq)); 24367 mutex_enter(&ill->ill_lock); 24368 ASSERT(ill_is_quiescent(ill)); 24369 mutex_exit(&ill->ill_lock); 24370 24371 addroff = dlindp->dl_addr_offset; 24372 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 24373 24374 switch (dlindp->dl_data) { 24375 case DL_IPV6_LINK_LAYER_ADDR: 24376 ill_set_ndmp(ill, addrmp, addroff, addrlen); 24377 freemsg(addrmp2); 24378 break; 24379 24380 case DL_CURR_PHYS_ADDR: 24381 freemsg(ill->ill_phys_addr_mp); 24382 ill->ill_phys_addr = addrmp->b_rptr + addroff; 24383 ill->ill_phys_addr_mp = addrmp; 24384 ill->ill_phys_addr_length = addrlen; 24385 24386 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 24387 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 24388 else 24389 freemsg(addrmp2); 24390 break; 24391 default: 24392 ASSERT(0); 24393 } 24394 24395 /* 24396 * If there are ipifs to bring up, ill_up_ipifs() will return nonzero, 24397 * and ipsq_current_finish() will be called by ip_rput_dlpi_writer() 24398 * or ip_arp_done() when the last ipif is brought up. 24399 */ 24400 if (ill_up_ipifs(ill, q, addrmp) == 0) 24401 ipsq_current_finish(ipsq); 24402 } 24403 24404 /* 24405 * Helper routine for setting the ill_nd_lla fields. 24406 */ 24407 void 24408 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 24409 { 24410 freemsg(ill->ill_nd_lla_mp); 24411 ill->ill_nd_lla = ndmp->b_rptr + addroff; 24412 ill->ill_nd_lla_mp = ndmp; 24413 ill->ill_nd_lla_len = addrlen; 24414 } 24415