/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>

#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>

#include <inet/common.h>   /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li, boolean_t doconsist);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_multicast_down(ipif_t *ipif);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
static int	ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static boolean_t ill_has_usable_ipif(ill_t *);
static int	ill_lock_ipsq_ills(ipsq_t *sq, ill_t **list, int);
static void	ill_nominate_bcast_rcv(ill_group_t *illgrp);
static void	ill_phyint_free(ill_t *ill);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_signal_ipsq_ills(ipsq_t *, boolean_t);
static boolean_t ill_split_ipsq(ipsq_t *cur_sq);
static void	ill_stq_cache_delete(ire_t *, char *);

static boolean_t ip_ether_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_nodef_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ether_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ether_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);
static boolean_t ip_ib_v6intfid(uint_t, uint8_t *, in6_addr_t *);
static boolean_t ip_ib_v6mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    in6_addr_t *);
static boolean_t ip_ib_v4mapinfo(uint_t, uint8_t *, uint8_t *, uint32_t *,
    ipaddr_t *);

static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_proto(ill_t *, int, mblk_t *);
static void	ill_capability_dispatch(ill_t *, mblk_t *,
    dl_capability_sub_t *, boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_mdt_reset(ill_t *, mblk_t **);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_ipsec_reset(ill_t *, mblk_t **);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_hcksum_reset(ill_t *, mblk_t **);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset(ill_t *, mblk_t **);
static void	ill_capability_lso_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_lso_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
static void	ill_capability_dls_reset(ill_t *, mblk_t **);
static void	ill_capability_dls_disable(ill_t *);

static void	illgrp_cache_delete(ire_t *, char *);
static void	illgrp_delete(ill_t *ill);
static void	illgrp_reset_schednext(ill_t *ill);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);

static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * Max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY
 * and the IPsec DOI.
 */
#define	MAX_IPSEC_ALGS	256

#define	BITSPERBYTE	8
#define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))

#define	IPSEC_ALG_ENABLE(algs, algid) \
	((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
	    (1 << ((algid) % BITS(ipsec_capab_elem_t))))

#define	IPSEC_ALG_IS_ENABLED(algid, algs) \
	((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
	    (1 << ((algid) % BITS(ipsec_capab_elem_t))))

typedef uint8_t ipsec_capab_elem_t;

/*
 * Per-algorithm parameters. Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t	minkeylen;
	uint16_t	maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;
	ipsec_capab_elem_t *auth_hw_algs;
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};

/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 */
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IP_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IP_ADDR_LEN
};

static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IPV6_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IPV6_ADDR_LEN
};
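/*
 * A minimal usage sketch for the algorithm bitmaps declared above
 * (hypothetical values; not part of the original code). With an 8-bit
 * ipsec_capab_elem_t, algorithm id 10 lands in element 1, bit 2:
 *
 *	ipsec_capab_elem_t algs[MAX_IPSEC_ALGS / BITSPERBYTE];
 *
 *	bzero(algs, sizeof (algs));
 *	IPSEC_ALG_ENABLE(algs, 10);	// algs[10 / 8] |= 1 << (10 % 8)
 *	ASSERT(IPSEC_ALG_IS_ENABLED(10, algs));
 *
 * Note that the two macros take their arguments in opposite order:
 * IPSEC_ALG_ENABLE(algs, algid) but IPSEC_ALG_IS_ENABLED(algid, algs).
 */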
/*
 * An AR_ENTRY_QUERY template for IPv6 has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an areq
 * is used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t) + (2 * IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,	/* protocol, from arp's perspective */
	sizeof (areq_t),	/* target addr offset */
	IP_ADDR_LEN,		/* target addr_length */
	0,			/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,		/* sender addr length */
	AR_EQ_DEFAULT_XMIT_COUNT,	/* xmit_count */
	AR_EQ_DEFAULT_XMIT_INTERVAL,	/* (re)xmit_interval in milliseconds */
	AR_EQ_DEFAULT_MAX_BUFFERED	/* max # of requests to buffer */
	/* anything else filled in by the code */
};

static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3 * IP_ADDR_LEN + IP_MAX_HW_LEN,
					/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2 * IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3 * IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
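/*
 * A dispatch sketch (an outline only; the actual lookup is done by
 * ip_wput_ioctl(), declared above): an incoming IP ioctl is matched
 * against ip_ioctl_ftbl[] by command, its payload is length-checked
 * against ipft_min_size, and the ipft_pfi handler is invoked. Entries
 * flagged IPFT_F_NO_REPLY get no acknowledgement from IP, while
 * IPFT_F_SELF_REPLY handlers issue their own reply.
 *
 *	ipft_t *ipft;
 *
 *	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi != NULL; ipft++) {
 *		if (ipft->ipft_cmd == ioc_cmd)
 *			break;
 *	}
 *	if (ipft->ipft_pfi == NULL || msgdsize(mp) < ipft->ipft_min_size)
 *		return;		// unknown command or short payload
 */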
/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* Flag descriptors for ip_ipif_report */
static nv_t	ipif_nv_tbl[] = {
	{ IPIF_UP,		"UP" },
	{ IPIF_BROADCAST,	"BROADCAST" },
	{ ILLF_DEBUG,		"DEBUG" },
	{ PHYI_LOOPBACK,	"LOOPBACK" },
	{ IPIF_POINTOPOINT,	"POINTOPOINT" },
	{ ILLF_NOTRAILERS,	"NOTRAILERS" },
	{ PHYI_RUNNING,		"RUNNING" },
	{ ILLF_NOARP,		"NOARP" },
	{ PHYI_PROMISC,		"PROMISC" },
	{ PHYI_ALLMULTI,	"ALLMULTI" },
	{ PHYI_INTELLIGENT,	"INTELLIGENT" },
	{ ILLF_MULTICAST,	"MULTICAST" },
	{ PHYI_MULTI_BCAST,	"MULTI_BCAST" },
	{ IPIF_UNNUMBERED,	"UNNUMBERED" },
	{ IPIF_DHCPRUNNING,	"DHCP" },
	{ IPIF_PRIVATE,		"PRIVATE" },
	{ IPIF_NOXMIT,		"NOXMIT" },
	{ IPIF_NOLOCAL,		"NOLOCAL" },
	{ IPIF_DEPRECATED,	"DEPRECATED" },
	{ IPIF_PREFERRED,	"PREFERRED" },
	{ IPIF_TEMPORARY,	"TEMPORARY" },
	{ IPIF_ADDRCONF,	"ADDRCONF" },
	{ PHYI_VIRTUAL,		"VIRTUAL" },
	{ ILLF_ROUTER,		"ROUTER" },
	{ ILLF_NONUD,		"NONUD" },
	{ IPIF_ANYCAST,		"ANYCAST" },
	{ ILLF_NORTEXCH,	"NORTEXCH" },
	{ ILLF_IPV4,		"IPV4" },
	{ ILLF_IPV6,		"IPV6" },
	{ IPIF_NOFAILOVER,	"NOFAILOVER" },
	{ PHYI_FAILED,		"FAILED" },
	{ PHYI_STANDBY,		"STANDBY" },
	{ PHYI_INACTIVE,	"INACTIVE" },
	{ PHYI_OFFLINE,		"OFFLINE" },
};

static uchar_t	ip_six_byte_all_ones[] =
	{ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_ether_v6intfid },
	{ DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo,
	    ip_ib_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL },
	{ DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static sin6_t	sin6_null;	/* Zero address for quick clears */
static sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
 * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
 * set through platform specific code (Niagara/Ontario).
 */
#define	SOFT_RINGS_ENABLED()	(ip_soft_rings_cnt ? \
	(ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)

#define	ILL_CAPAB_DLS	(ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)

static uint_t
ipif_rand(ip_stack_t *ipst)
{
	ipst->ips_ipif_src_random = ipst->ips_ipif_src_random * 1103515245 +
	    12345;
	return ((ipst->ips_ipif_src_random >> 16) & 0x7fff);
}
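/*
 * ipif_rand() above is the classic ANSI C rand() linear congruential
 * generator (multiplier 1103515245, increment 12345); it returns bits
 * 16..30 of the per-stack seed, i.e. a value in [0, 0x7fff]. As the
 * seed name ips_ipif_src_random suggests, it is meant for cheap
 * tie-breaking such as source address selection and is in no way
 * cryptographically strong.
 */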
/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok; B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per-ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}
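/*
 * A caller sketch (hypothetical; real callers sit in the interface
 * plumbing path): the boolean result is typically mapped to ENOMEM.
 *
 *	if (!ill_allocate_mibs(ill))
 *		return (ENOMEM);
 */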
/*
 * Common code for preparation of ARP commands. Two points to remember:
 *	1) The ill_name is tacked on at the end of the allocated space so
 *	   the template's name_offset field must contain the total space
 *	   to allocate less the name length.
 *
 *	2) The template's name_length field should contain the *template*
 *	   length. We use it as a parameter to bcopy() and then write
 *	   the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 */
mblk_t *
ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr)
{
	arc_t	*arc = (arc_t *)template;
	char	*cp;
	int	len;
	mblk_t	*mp;
	uint_t	name_length = ill->ill_name_length;
	uint_t	template_len = arc->arc_name_length;

	len = arc->arc_name_offset + name_length;
	mp = allocb(len, BPRI_HI);
	if (mp == NULL)
		return (NULL);
	cp = (char *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&cp[len];
	if (template_len)
		bcopy(template, cp, template_len);
	if (len > template_len)
		bzero(&cp[template_len], len - template_len);
	mp->b_datap->db_type = M_PROTO;

	arc = (arc_t *)cp;
	arc->arc_name_length = name_length;
	cp = (char *)arc + arc->arc_name_offset;
	bcopy(ill->ill_name, cp, name_length);

	if (addr) {
		area_t	*area = (area_t *)mp->b_rptr;

		cp = (char *)area + area->area_proto_addr_offset;
		bcopy(addr, cp, area->area_proto_addr_length);
		if (area->area_cmd == AR_ENTRY_ADD) {
			cp = (char *)area;
			len = area->area_proto_addr_length;
			if (area->area_proto_mask_offset)
				cp += area->area_proto_mask_offset;
			else
				cp += area->area_proto_addr_offset + len;
			while (len-- > 0)
				*cp++ = (char)~0;
		}
	}
	return (mp);
}

mblk_t *
ipif_area_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
	return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template,
	    (char *)&ipif->ipif_lcl_addr));
}

mblk_t *
ill_ared_alloc(ill_t *ill, ipaddr_t addr)
{
	return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
	    (char *)&addr));
}
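/*
 * A usage sketch (hypothetical): building the AR_ENTRY_ADD/AR_ENTRY_DELETE
 * message pair for an ipif's local address before handing them to arp.
 * Both allocators return NULL on allocation failure, and the add message
 * gets its proto mask filled with all-ones by ill_arp_alloc().
 *
 *	mblk_t	*area_mp = ipif_area_alloc(ipif);
 *	mblk_t	*ared_mp = ipif_ared_alloc(ipif);
 *
 *	if (area_mp == NULL || ared_mp == NULL) {
 *		freemsg(area_mp);	// freemsg(NULL) is safe
 *		freemsg(ared_mp);
 *		return (ENOMEM);
 *	}
 */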
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as reset_conn_ill
	 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In this case just
	 * clean up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Used only by ill_arp_on and ill_arp_off, which are writers.
	 * So nobody can be using this mp now. Free the mp allocated for
	 * honoring ILLF_NOARP.
	 */
	freemsg(ill->ill_arp_on_mp);
	ill->ill_arp_on_mp = NULL;

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/*
	 * ipif_free -> reset_conn_ipif will remove all multicast
	 * references for IPv4. For IPv6, we need to do it here as
	 * it points only at ills.
	 */
	reset_conn_ill(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t	*ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0 &&
	    ill->ill_arp_down_mp == NULL &&
	    ill->ill_arp_del_mapping_mp == NULL);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);

	/*
	 * Clean up polling and soft ring capabilities
	 */
	if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
		ill_capability_dls_disable(ill);

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_ipsec_capab_ah != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
		ill->ill_ipsec_capab_ah = NULL;
	}

	if (ill->ill_ipsec_capab_esp != NULL) {
		ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
		ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
		ill->ill_ipsec_capab_esp = NULL;
	}

	if (ill->ill_mdt_capab != NULL) {
		kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
		ill->ill_mdt_capab = NULL;
	}

	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dls_capab != NULL) {
		CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
		ill->ill_dls_capab->ill_unbind_conn = NULL;
		kmem_free(ill->ill_dls_capab,
		    sizeof (ill_dls_capab_t) +
		    (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
		ill->ill_dls_capab = NULL;
	}

	ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */
	ASSERT(ilm_walk_ill(ill) == 0);
	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> ill_phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (ill->ill_ndd_name != NULL)
		nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
	rw_exit(&ipst->ips_ip_g_nd_lock);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *	sap_length == 0	==>	no sap
 *	sap_length > 0	==>	sap is at the head of the dlpi address
 *	sap_length < 0	==>	sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}
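/*
 * A layout sketch (hypothetical values) for the address block built by
 * ill_dlur_copy_address(): for Ethernet, with a 6-byte physical address
 * and a 2-byte sap, a sap_length of -2 puts the sap at the tail and a
 * sap_length of 2 puts it at the head:
 *
 *	sap_length == -2:	| 6 bytes phys | 2 bytes sap |
 *	sap_length ==  2:	| 2 bytes sap | 6 bytes phys |
 *
 * So ill_dlur_gen(addr, 6, ETHERTYPE_IP, -2) would build a
 * DL_UNITDATA_REQ whose dl_dest_addr_length is 8, with the sap stored
 * after the physical address.
 */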
/*
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp.
 * Return an error if we already have 1 or more ioctls in progress.
 * This is used only for non-exclusive ioctls. Currently this is used
 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive
 * and thus need to use ipsq_pending_mp_add.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls.
	 */
	ASSERT((add_mp->b_datap->db_type == M_IOCDATA) ||
	    (add_mp->b_datap->db_type == M_IOCTL));

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * Return error if the conn has started closing. The conn
	 * could have finished cleaning up the pending mp list; if so,
	 * we should not add another mp to the list and negate the
	 * cleanup.
	 */
	if (connp->conn_state_flags & CONN_CLOSING)
		return (B_FALSE);
	/*
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the conn's write queue in b_queue; it is used later
	 * to get back the conn when we get a response on the ill queue
	 * from some other module (typically arp).
	 */
	add_mp->b_next = (void *)ill->ill_pending_mp;
	add_mp->b_queue = CONNP_TO_WQ(connp);
	ill->ill_pending_mp = add_mp;
	if (connp != NULL)
		connp->conn_oper_pending_ill = ill;
	return (B_TRUE);
}

/*
 * Retrieve the ill_pending_mp and return it. We have to walk the list
 * of mblks starting at ill_pending_mp, and match based on the ioc_id.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
	mblk_t	*prev = NULL;
	mblk_t	*curr = NULL;
	uint_t	id;
	conn_t	*connp;

	/*
	 * When the conn closes, conn_ioctl_cleanup needs to clean
	 * up the pending mp, but it does not know the ioc_id and
	 * passes in a zero for it.
	 */
	mutex_enter(&ill->ill_lock);
	if (ioc_id != 0)
		*connpp = NULL;

	/* Search the list for the appropriate ioctl based on ioc_id */
	for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
	    prev = curr, curr = curr->b_next) {
		id = ((struct iocblk *)curr->b_rptr)->ioc_id;
		connp = Q_TO_CONN(curr->b_queue);
		/* Match based on the ioc_id or based on the conn */
		if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
			break;
	}

	if (curr != NULL) {
		/* Unlink the mblk from the pending mp list */
		if (prev != NULL) {
			prev->b_next = curr->b_next;
		} else {
			ASSERT(ill->ill_pending_mp == curr);
			ill->ill_pending_mp = curr->b_next;
		}

		/*
		 * conn refcnt must have been bumped up at the start of
		 * the ioctl. So we can safely access the conn.
		 */
		ASSERT(CONN_Q(curr->b_queue));
		*connpp = Q_TO_CONN(curr->b_queue);
		curr->b_next = NULL;
		curr->b_queue = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (curr);
}
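/*
 * A pairing sketch (hypothetical locking context): the add side queues
 * the ioctl mblk on the ill with both locks held, and the response side
 * later recovers the mblk and its conn by the iocblk's ioc_id.
 *
 *	mutex_enter(&ill->ill_lock);
 *	mutex_enter(&connp->conn_lock);
 *	ok = ill_pending_mp_add(ill, connp, mp);
 *	mutex_exit(&connp->conn_lock);
 *	mutex_exit(&ill->ill_lock);
 *	...
 *	mp = ill_pending_mp_get(ill, &connp, ioc->ioc_id);
 */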
/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipsq_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipsq_t	*ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipsq->ipsq_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipsq_current_ipif == ipif'.
	 */
	ASSERT(ipsq->ipsq_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls,
	 * M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) ||
	    (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) ||
	    (DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list; if so,
		 * we should not add another mp to the list and negate the
		 * cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipsq->ipsq_lock);
	ipsq->ipsq_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipsq->ipsq_pending_mp = add_mp;
	ipsq->ipsq_waitfor = waitfor;

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;
	mutex_exit(&ipsq->ipsq_lock);
	return (B_TRUE);
}

/*
 * Retrieve the ipsq_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;

	mutex_enter(&ipsq->ipsq_lock);
	*connpp = NULL;
	if (ipsq->ipsq_pending_mp == NULL) {
		mutex_exit(&ipsq->ipsq_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipsq->ipsq_pending_mp;
	ASSERT(curr != NULL && curr->b_next == NULL);
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_pending_mp = NULL;
	ipsq->ipsq_waitfor = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the
		 * ioctl. So we can safely return a pointer to the conn
		 * to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipsq_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipsq_t	*ipsq;
	queue_t	*q;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	/*
	 * If connp is null, unconditionally clean up the ipsq_pending_mp.
	 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl
	 * even if it is meant for another ill, since we have to enqueue
	 * a new mp now in ipsq_pending_mp to complete the ipif_down.
	 * If connp is non-null we are called from the conn close path.
	 */
	mp = ipsq->ipsq_pending_mp;
	if (mp == NULL || (connp != NULL &&
	    mp->b_queue != CONNP_TO_WQ(connp))) {
		mutex_exit(&ipsq->ipsq_lock);
		return (B_FALSE);
	}
	/* Now remove from the ipsq_pending_mp */
	ipsq->ipsq_pending_mp = NULL;
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	/* If MOVE was in progress, clear the move_in_progress fields also. */
	ill = ipsq->ipsq_pending_ipif->ipif_ill;
	if (ill->ill_move_in_progress) {
		ILL_CLEAR_MOVE(ill);
	} else if (ill->ill_up_ipifs) {
		ill_group_cleanup(ill);
	}

	ipif = ipsq->ipsq_pending_ipif;
	ipsq->ipsq_pending_ipif = NULL;
	ipsq->ipsq_waitfor = 0;
	ipsq->ipsq_current_ipif = NULL;
	ipsq->ipsq_current_ioctl = 0;
	mutex_exit(&ipsq->ipsq_lock);

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		/*
		 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
		 * be just inet_freemsg; we have to restart it, otherwise
		 * the thread will be stuck.
		 */
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * The ill is closing. Cleanup all the pending mps. Called exclusively
 * towards the end of ill_delete. The refcount has gone to 0. So nobody
 * knows this ill, and hence nobody can add an mp to this list.
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
	mblk_t	*mp;
	queue_t	*q;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);
	/*
	 * Every mp on the pending mp list originating from an ioctl
	 * added 1 to the conn refcnt, at the start of the ioctl.
	 * So bump it down now. See comments in ip_wput_nondata()
	 */
	while (ill->ill_pending_mp != NULL) {
		mp = ill->ill_pending_mp;
		ill->ill_pending_mp = mp->b_next;
		mutex_exit(&ill->ill_lock);

		q = mp->b_queue;
		ASSERT(CONN_Q(q));
		mp->b_next = NULL;
		mp->b_prev = NULL;
		mp->b_queue = NULL;
		ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		mutex_enter(&ill->ill_lock);
	}
	ill->ill_pending_ipif = NULL;

	mutex_exit(&ill->ill_lock);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*q;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		q = CONNP_TO_WQ(connp);
	else
		q = ill->ill_wq;

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mps queued on the ipsq_xopq list, if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, only messages
	 * related to this ill are flushed, like M_ERROR or M_HANGUP messages.
	 * Ioctls meant for this ill from conns are not flushed; they will
	 * be processed during ipsq_exit and, failing to find the ill, will
	 * return an error.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (curr->b_queue == q || curr->b_queue == RD(q)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg; we have to
			 * restart it, otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
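/*
 * The unlink-then-process shape of ipsq_xopq_mp_cleanup() above is
 * worth calling out as a pattern (a sketch, not additional interface):
 * matching mblks are moved onto a private tmp_list while ipsq_lock is
 * held, and each ioctl is completed only after the lock is dropped,
 * presumably because ip_ioctl_finish() is too heavy to run under the
 * lock. In outline:
 *
 *	mutex_enter(&lock);
 *	... unlink matching mblks onto tmp_list ...
 *	mutex_exit(&lock);
 *	while (tmp_list != NULL)
 *		... detach the head and complete it without the lock ...
 */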
/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	mblk_t	*curr;
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t	refheld;

	/*
	 * Is any exclusive ioctl pending? If so clean it up. If the
	 * ioctl has not yet started, the mp is pending in the list headed by
	 * ipsq_xopq_head. If the ioctl has started the mp could be present in
	 * ipsq_pending_mp. If the ioctl timed out in the streamhead but
	 * is currently executing now, the mp is not queued anywhere and
	 * conn_oper_pending_ill is null; the conn close will wait
	 * till the conn_ref drops to zero.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	curr = ill_pending_mp_get(ill, &connp, 0);
	if (curr != NULL) {
		mutex_exit(&connp->conn_lock);
		CONN_DEC_REF(connp);
		inet_freemsg(curr);
		return;
	}
	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending now in ipsq_pending_mp. If it is not
			 * found there then check whether this ioctl has
			 * not even started and is in the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
		connp->conn_orig_multicast_ifindex = 0;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_outgoing_pill == ill)
		connp->conn_outgoing_pill = NULL;
	if (connp->conn_nofailover_ill == ill)
		connp->conn_nofailover_ill = NULL;
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
	}
	if (connp->conn_ire_cache != NULL) {
		ire = connp->conn_ire_cache;
		/*
		 * ip_newroute creates IRE_CACHE with ire_stq coming from
		 * interface X and ipif coming from interface Y, if interface
		 * X and Y are part of the same IPMP group. Thus whenever
		 * interface X goes down, remove all references to it by
		 * checking both on ire_ipif and ire_stq.
		 */
		if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
		    (ire->ire_type == IRE_CACHE &&
		    ire->ire_stq == ill->ill_wq)) {
			connp->conn_ire_cache = NULL;
			mutex_exit(&connp->conn_lock);
			ire_refrele_notr(ire);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP, in which case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	(void) ipsq_pending_mp_cleanup(ill, NULL);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, (char *)ill, ipst);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	if (ill->ill_group != NULL) {
		illgrp_delete(ill);
	}
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'. (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/*
	 * ip_newroute creates IRE_CACHE with ire_stq coming from
	 * interface X and ipif coming from interface Y, if interface
	 * X and Y are part of the same IPMP group. Thus whenever interface
	 * X goes down, remove all references to it by checking both
	 * on ire_ipif and ire_stq.
	 */
	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
		ire_delete(ire);
	}
}

/*
 * Remove ire/nce from the fastpath list.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	nce_fastpath_list_dispatch(ill, NULL, NULL);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL) {
		/*
		 * Update all IRE's or NCE's that are waiting for
		 * fastpath update.
		 */
		nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
		mp1 = mp->b_cont;
		freeb(mp);
		mp = mp1;
	} else {
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	}

	freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	putnext(ill->ill_wq, mp);
	return (0);
}
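/*
 * A driving sketch (hypothetical): probing fastpath with the
 * dl_unitdata_req template cached in ill_dlur_mp. EAGAIN denotes a
 * transient allocation failure and can be retried; ENOTSUP means the
 * driver NAKed an earlier probe, so fastpath should not be retried.
 *
 *	switch (ill_fastpath_probe(ill, ill->ill_dlur_mp)) {
 *	case 0:		// probe sent; M_IOCACK lands in ill_fastpath_ack()
 *	case ENOTSUP:	// driver doesn't implement DL_IOC_HDR_INFO
 *	case EAGAIN:	// out of memory; try again later
 *		break;
 *	}
 */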
void
ill_capability_probe(ill_t *ill)
{
	/*
	 * Do so only if capabilities are still unknown.
	 */
	if (ill->ill_dlpi_capab_state != IDS_UNKNOWN)
		return;

	ill->ill_dlpi_capab_state = IDS_INPROGRESS;
	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL);
}

void
ill_capability_reset(ill_t *ill)
{
	mblk_t	*sc_mp = NULL;
	mblk_t	*tmp;

	/*
	 * Note here that we reset the state to UNKNOWN, and later send
	 * down the DL_CAPABILITY_REQ without first setting the state to
	 * INPROGRESS. We do this in order to distinguish the
	 * DL_CAPABILITY_ACK response which may come back in response to
	 * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would
	 * also handle the case where the driver doesn't send us back
	 * a DL_CAPABILITY_ACK in response, since the "probe" routine
	 * requires the state to be in UNKNOWN anyway. In any case, all
	 * features are turned off until the state reaches IDS_OK.
	 */
	ill->ill_dlpi_capab_state = IDS_UNKNOWN;
	ill->ill_capab_reneg = B_FALSE;

	/*
	 * Disable sub-capabilities and request a list of sub-capability
	 * messages which will be sent down to the driver. Each handler
	 * allocates the corresponding dl_capability_sub_t inside an
	 * mblk, and links it to the existing sc_mp mblk, or returns it
	 * as sc_mp if it's the first sub-capability (the passed in
	 * sc_mp is NULL). Upon returning from all capability handlers,
	 * sc_mp will be pulled up, before passing it downstream.
	 */
	ill_capability_mdt_reset(ill, &sc_mp);
	ill_capability_hcksum_reset(ill, &sc_mp);
	ill_capability_zerocopy_reset(ill, &sc_mp);
	ill_capability_ipsec_reset(ill, &sc_mp);
	ill_capability_dls_reset(ill, &sc_mp);
	ill_capability_lso_reset(ill, &sc_mp);

	/* Nothing to send down in order to disable the capabilities? */
	if (sc_mp == NULL)
		return;

	tmp = msgpullup(sc_mp, -1);
	freemsg(sc_mp);
	if ((sc_mp = tmp) == NULL) {
		cmn_err(CE_WARN, "ill_capability_reset: unable to send down "
		    "DL_CAPABILITY_REQ (ENOMEM)\n");
		return;
	}

	ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n"));
	ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp);
}

/*
 * Request or set new-style hardware capabilities supported by the DLS
 * provider.
 */
static void
ill_capability_proto(ill_t *ill, int type, mblk_t *reqp)
{
	mblk_t	*mp;
	dl_capability_req_t *capb;
	size_t	size = 0;
	uint8_t	*ptr;

	if (reqp != NULL)
		size = MBLKL(reqp);

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type);
	if (mp == NULL) {
		freemsg(reqp);
		return;
	}
	ptr = mp->b_rptr;

	capb = (dl_capability_req_t *)ptr;
	ptr += sizeof (dl_capability_req_t);

	if (reqp != NULL) {
		capb->dl_sub_offset = sizeof (dl_capability_req_t);
		capb->dl_sub_length = size;
		bcopy(reqp->b_rptr, ptr, size);
		ptr += size;
		mp->b_cont = reqp->b_cont;
		freeb(reqp);
	}
	ASSERT(ptr == mp->b_wptr);

	ill_dlpi_send(ill, mp);
}

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t *id_ic;
	uint_t sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners, B_TRUE);
}
* (Remember, it's coming from something else in the kernel
1913 * address space)
1914 */
1915
1916 capend = (uint8_t *)(isub + 1) + isub->dl_length;
1917 if (capend > mp->b_wptr) {
1918 cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1919 "malformed sub-capability too long for mblk");
1920 return;
1921 }
1922
1923 mdt_ic = (dl_capab_mdt_t *)(isub + 1);
1924
1925 if (mdt_ic->mdt_version != MDT_VERSION_2) {
1926 cmn_err(CE_CONT, "ill_capability_mdt_ack: "
1927 "unsupported MDT sub-capability (version %d, expected %d)",
1928 mdt_ic->mdt_version, MDT_VERSION_2);
1929 return;
1930 }
1931
1932 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) {
1933 ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
1934 "capability isn't as expected; pass-thru module(s) "
1935 "detected, discarding capability\n"));
1936 return;
1937 }
1938
1939 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
1940
1941 if (*ill_mdt_capab == NULL) {
1942 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
1943 KM_NOSLEEP);
1944
1945 if (*ill_mdt_capab == NULL) {
1946 cmn_err(CE_WARN, "ill_capability_mdt_ack: "
1947 "could not enable MDT version %d "
1948 "for %s (ENOMEM)\n", MDT_VERSION_2,
1949 ill->ill_name);
1950 return;
1951 }
1952 }
1953
1954 ip1dbg(("ill_capability_mdt_ack: interface %s supports "
1955 "MDT version %d (%d bytes leading, %d bytes trailing "
1956 "header spaces, %d max pld bufs, %d span limit)\n",
1957 ill->ill_name, MDT_VERSION_2,
1958 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail,
1959 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit));
1960
1961 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2;
1962 (*ill_mdt_capab)->ill_mdt_on = 1;
1963 /*
1964 * Round the following values up to the nearest 32-bit boundary;
1965 * the ULP may further adjust them to accommodate additional
1966 * protocol headers. We pass these values to the ULP during
1967 * bind time.
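 * (For example, a 14-byte Ethernet header advertised in mdt_hdr_head
 * would be rounded up to 16 here.)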
1968 */ 1969 (*ill_mdt_capab)->ill_mdt_hdr_head = 1970 roundup(mdt_ic->mdt_hdr_head, 4); 1971 (*ill_mdt_capab)->ill_mdt_hdr_tail = 1972 roundup(mdt_ic->mdt_hdr_tail, 4); 1973 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 1974 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 1975 1976 ill->ill_capabilities |= ILL_CAPAB_MDT; 1977 } else { 1978 uint_t size; 1979 uchar_t *rptr; 1980 1981 size = sizeof (dl_capability_req_t) + 1982 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 1983 1984 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1985 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1986 "could not enable MDT for %s (ENOMEM)\n", 1987 ill->ill_name); 1988 return; 1989 } 1990 1991 rptr = nmp->b_rptr; 1992 /* initialize dl_capability_req_t */ 1993 oc = (dl_capability_req_t *)nmp->b_rptr; 1994 oc->dl_sub_offset = sizeof (dl_capability_req_t); 1995 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1996 sizeof (dl_capab_mdt_t); 1997 nmp->b_rptr += sizeof (dl_capability_req_t); 1998 1999 /* initialize dl_capability_sub_t */ 2000 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2001 nmp->b_rptr += sizeof (*isub); 2002 2003 /* initialize dl_capab_mdt_t */ 2004 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2005 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2006 2007 nmp->b_rptr = rptr; 2008 2009 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2010 "to enable MDT version %d\n", ill->ill_name, 2011 MDT_VERSION_2)); 2012 2013 /* set ENABLE flag */ 2014 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2015 2016 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2017 ill_dlpi_send(ill, nmp); 2018 } 2019 } 2020 2021 static void 2022 ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) 2023 { 2024 mblk_t *mp; 2025 dl_capab_mdt_t *mdt_subcap; 2026 dl_capability_sub_t *dl_subcap; 2027 int size; 2028 2029 if (!ILL_MDT_CAPABLE(ill)) 2030 return; 2031 2032 ASSERT(ill->ill_mdt_capab != NULL); 2033 /* 2034 * Clear the capability flag for MDT but retain the ill_mdt_capab 2035 * structure since it's possible that another thread is still 2036 * referring to it. The structure only gets deallocated when 2037 * we destroy the ill. 2038 */ 2039 ill->ill_capabilities &= ~ILL_CAPAB_MDT; 2040 2041 size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2042 2043 mp = allocb(size, BPRI_HI); 2044 if (mp == NULL) { 2045 ip1dbg(("ill_capability_mdt_reset: unable to allocate " 2046 "request to disable MDT\n")); 2047 return; 2048 } 2049 2050 mp->b_wptr = mp->b_rptr + size; 2051 2052 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 2053 dl_subcap->dl_cap = DL_CAPAB_MDT; 2054 dl_subcap->dl_length = sizeof (*mdt_subcap); 2055 2056 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2057 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2058 mdt_subcap->mdt_flags = 0; 2059 mdt_subcap->mdt_hdr_head = 0; 2060 mdt_subcap->mdt_hdr_tail = 0; 2061 2062 if (*sc_mp != NULL) 2063 linkb(*sc_mp, mp); 2064 else 2065 *sc_mp = mp; 2066 } 2067 2068 /* 2069 * Send a DL_NOTIFY_REQ to the specified ill to enable 2070 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2071 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2072 * acceleration. 2073 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 
2074 */ 2075 static boolean_t 2076 ill_enable_promisc_notify(ill_t *ill) 2077 { 2078 mblk_t *mp; 2079 dl_notify_req_t *req; 2080 2081 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2082 2083 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2084 if (mp == NULL) 2085 return (B_FALSE); 2086 2087 req = (dl_notify_req_t *)mp->b_rptr; 2088 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2089 DL_NOTE_PROMISC_OFF_PHYS; 2090 2091 ill_dlpi_send(ill, mp); 2092 2093 return (B_TRUE); 2094 } 2095 2096 2097 /* 2098 * Allocate an IPsec capability request which will be filled by our 2099 * caller to turn on support for one or more algorithms. 2100 */ 2101 static mblk_t * 2102 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2103 { 2104 mblk_t *nmp; 2105 dl_capability_req_t *ocap; 2106 dl_capab_ipsec_t *ocip; 2107 dl_capab_ipsec_t *icip; 2108 uint8_t *ptr; 2109 icip = (dl_capab_ipsec_t *)(isub + 1); 2110 2111 /* 2112 * The first time around, we send a DL_NOTIFY_REQ to enable 2113 * PROMISC_ON/OFF notification from the provider. We need to 2114 * do this before enabling the algorithms to avoid leakage of 2115 * cleartext packets. 2116 */ 2117 2118 if (!ill_enable_promisc_notify(ill)) 2119 return (NULL); 2120 2121 /* 2122 * Allocate new mblk which will contain a new capability 2123 * request to enable the capabilities. 2124 */ 2125 2126 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2127 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2128 if (nmp == NULL) 2129 return (NULL); 2130 2131 ptr = nmp->b_rptr; 2132 2133 /* initialize dl_capability_req_t */ 2134 ocap = (dl_capability_req_t *)ptr; 2135 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2136 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2137 ptr += sizeof (dl_capability_req_t); 2138 2139 /* initialize dl_capability_sub_t */ 2140 bcopy(isub, ptr, sizeof (*isub)); 2141 ptr += sizeof (*isub); 2142 2143 /* initialize dl_capab_ipsec_t */ 2144 ocip = (dl_capab_ipsec_t *)ptr; 2145 bcopy(icip, ocip, sizeof (*icip)); 2146 2147 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2148 return (nmp); 2149 } 2150 2151 /* 2152 * Process an IPsec capability negotiation ack received from a DLS Provider. 2153 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2154 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2155 */ 2156 static void 2157 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2158 { 2159 dl_capab_ipsec_t *icip; 2160 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2161 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ 2162 uint_t cipher, nciphers; 2163 mblk_t *nmp; 2164 uint_t alg_len; 2165 boolean_t need_sadb_dump; 2166 uint_t sub_dl_cap = isub->dl_cap; 2167 ill_ipsec_capab_t **ill_capab; 2168 uint64_t ill_capab_flag; 2169 uint8_t *capend, *ciphend; 2170 boolean_t sadb_resync; 2171 2172 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2173 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2174 2175 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2176 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2177 ill_capab_flag = ILL_CAPAB_AH; 2178 } else { 2179 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2180 ill_capab_flag = ILL_CAPAB_ESP; 2181 } 2182 2183 /* 2184 * If the ill capability structure exists, then this incoming 2185 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2186 * If this is so, then we'd need to resynchronize the SADB 2187 * after re-enabling the offloaded ciphers. 
2188 */ 2189 sadb_resync = (*ill_capab != NULL); 2190 2191 /* 2192 * Note: range checks here are not absolutely sufficient to 2193 * make us robust against malformed messages sent by drivers; 2194 * this is in keeping with the rest of IP's dlpi handling. 2195 * (Remember, it's coming from something else in the kernel 2196 * address space) 2197 */ 2198 2199 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2200 if (capend > mp->b_wptr) { 2201 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2202 "malformed sub-capability too long for mblk"); 2203 return; 2204 } 2205 2206 /* 2207 * There are two types of acks we process here: 2208 * 1. acks in reply to a (first form) generic capability req 2209 * (no ENABLE flag set) 2210 * 2. acks in reply to a ENABLE capability req. 2211 * (ENABLE flag set) 2212 * 2213 * We process the subcapability passed as argument as follows: 2214 * 1 do initializations 2215 * 1.1 initialize nmp = NULL 2216 * 1.2 set need_sadb_dump to B_FALSE 2217 * 2 for each cipher in subcapability: 2218 * 2.1 if ENABLE flag is set: 2219 * 2.1.1 update per-ill ipsec capabilities info 2220 * 2.1.2 set need_sadb_dump to B_TRUE 2221 * 2.2 if ENABLE flag is not set: 2222 * 2.2.1 if nmp is NULL: 2223 * 2.2.1.1 allocate and initialize nmp 2224 * 2.2.1.2 init current pos in nmp 2225 * 2.2.2 copy current cipher to current pos in nmp 2226 * 2.2.3 set ENABLE flag in nmp 2227 * 2.2.4 update current pos 2228 * 3 if nmp is not equal to NULL, send enable request 2229 * 3.1 send capability request 2230 * 4 if need_sadb_dump is B_TRUE 2231 * 4.1 enable promiscuous on/off notifications 2232 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2233 * AH or ESP SA's to interface. 2234 */ 2235 2236 nmp = NULL; 2237 oalg = NULL; 2238 need_sadb_dump = B_FALSE; 2239 icip = (dl_capab_ipsec_t *)(isub + 1); 2240 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2241 2242 nciphers = icip->cip_nciphers; 2243 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2244 2245 if (ciphend > capend) { 2246 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2247 "too many ciphers for sub-capability len"); 2248 return; 2249 } 2250 2251 for (cipher = 0; cipher < nciphers; cipher++) { 2252 alg_len = sizeof (dl_capab_ipsec_alg_t); 2253 2254 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2255 /* 2256 * TBD: when we provide a way to disable capabilities 2257 * from above, need to manage the request-pending state 2258 * and fail if we were not expecting this ACK. 
2259 */ 2260 IPSECHW_DEBUG(IPSECHW_CAPAB, 2261 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2262 2263 /* 2264 * Update IPsec capabilities for this ill 2265 */ 2266 2267 if (*ill_capab == NULL) { 2268 IPSECHW_DEBUG(IPSECHW_CAPAB, 2269 ("ill_capability_ipsec_ack: " 2270 "allocating ipsec_capab for ill\n")); 2271 *ill_capab = ill_ipsec_capab_alloc(); 2272 2273 if (*ill_capab == NULL) { 2274 cmn_err(CE_WARN, 2275 "ill_capability_ipsec_ack: " 2276 "could not enable IPsec Hardware " 2277 "acceleration for %s (ENOMEM)\n", 2278 ill->ill_name); 2279 return; 2280 } 2281 } 2282 2283 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2284 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2285 2286 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2287 cmn_err(CE_WARN, 2288 "ill_capability_ipsec_ack: " 2289 "malformed IPsec algorithm id %d", 2290 ialg->alg_prim); 2291 continue; 2292 } 2293 2294 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2295 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2296 ialg->alg_prim); 2297 } else { 2298 ipsec_capab_algparm_t *alp; 2299 2300 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2301 ialg->alg_prim); 2302 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2303 ialg->alg_prim)) { 2304 cmn_err(CE_WARN, 2305 "ill_capability_ipsec_ack: " 2306 "no space for IPsec alg id %d", 2307 ialg->alg_prim); 2308 continue; 2309 } 2310 alp = &((*ill_capab)->encr_algparm[ 2311 ialg->alg_prim]); 2312 alp->minkeylen = ialg->alg_minbits; 2313 alp->maxkeylen = ialg->alg_maxbits; 2314 } 2315 ill->ill_capabilities |= ill_capab_flag; 2316 /* 2317 * indicate that a capability was enabled, which 2318 * will be used below to kick off a SADB dump 2319 * to the ill. 2320 */ 2321 need_sadb_dump = B_TRUE; 2322 } else { 2323 IPSECHW_DEBUG(IPSECHW_CAPAB, 2324 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2325 ialg->alg_prim)); 2326 2327 if (nmp == NULL) { 2328 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2329 if (nmp == NULL) { 2330 /* 2331 * Sending the PROMISC_ON/OFF 2332 * notification request failed. 2333 * We cannot enable the algorithms 2334 * since the Provider will not 2335 * notify IP of promiscous mode 2336 * changes, which could lead 2337 * to leakage of packets. 2338 */ 2339 cmn_err(CE_WARN, 2340 "ill_capability_ipsec_ack: " 2341 "could not enable IPsec Hardware " 2342 "acceleration for %s (ENOMEM)\n", 2343 ill->ill_name); 2344 return; 2345 } 2346 /* ptr to current output alg specifier */ 2347 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2348 } 2349 2350 /* 2351 * Copy current alg specifier, set ENABLE 2352 * flag, and advance to next output alg. 2353 * For now we enable all IPsec capabilities. 2354 */ 2355 ASSERT(oalg != NULL); 2356 bcopy(ialg, oalg, alg_len); 2357 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2358 nmp->b_wptr += alg_len; 2359 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2360 } 2361 2362 /* move to next input algorithm specifier */ 2363 ialg = (dl_capab_ipsec_alg_t *) 2364 ((char *)ialg + alg_len); 2365 } 2366 2367 if (nmp != NULL) 2368 /* 2369 * nmp points to a DL_CAPABILITY_REQ message to enable 2370 * IPsec hardware acceleration. 2371 */ 2372 ill_dlpi_send(ill, nmp); 2373 2374 if (need_sadb_dump) 2375 /* 2376 * An acknowledgement corresponding to a request to 2377 * enable acceleration was received, notify SADB. 
2378 */ 2379 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2380 } 2381 2382 /* 2383 * Given an mblk with enough space in it, create sub-capability entries for 2384 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2385 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2386 * in preparation for the reset the DL_CAPABILITY_REQ message. 2387 */ 2388 static void 2389 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2390 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2391 { 2392 dl_capab_ipsec_t *oipsec; 2393 dl_capab_ipsec_alg_t *oalg; 2394 dl_capability_sub_t *dl_subcap; 2395 int i, k; 2396 2397 ASSERT(nciphers > 0); 2398 ASSERT(ill_cap != NULL); 2399 ASSERT(mp != NULL); 2400 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2401 2402 /* dl_capability_sub_t for "stype" */ 2403 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2404 dl_subcap->dl_cap = stype; 2405 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2406 mp->b_wptr += sizeof (dl_capability_sub_t); 2407 2408 /* dl_capab_ipsec_t for "stype" */ 2409 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2410 oipsec->cip_version = 1; 2411 oipsec->cip_nciphers = nciphers; 2412 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2413 2414 /* create entries for "stype" AUTH ciphers */ 2415 for (i = 0; i < ill_cap->algs_size; i++) { 2416 for (k = 0; k < BITSPERBYTE; k++) { 2417 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2418 continue; 2419 2420 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2421 bzero((void *)oalg, sizeof (*oalg)); 2422 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2423 oalg->alg_prim = k + (BITSPERBYTE * i); 2424 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2425 } 2426 } 2427 /* create entries for "stype" ENCR ciphers */ 2428 for (i = 0; i < ill_cap->algs_size; i++) { 2429 for (k = 0; k < BITSPERBYTE; k++) { 2430 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2431 continue; 2432 2433 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2434 bzero((void *)oalg, sizeof (*oalg)); 2435 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2436 oalg->alg_prim = k + (BITSPERBYTE * i); 2437 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2438 } 2439 } 2440 } 2441 2442 /* 2443 * Macro to count number of 1s in a byte (8-bit word). The total count is 2444 * accumulated into the passed-in argument (sum). We could use SPARCv9's 2445 * POPC instruction, but our macro is more flexible for an arbitrary length 2446 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2447 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2448 * stays that way, we can reduce the number of iterations required. 
2449 */ 2450 #define COUNT_1S(val, sum) { \ 2451 uint8_t x = val & 0xff; \ 2452 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2453 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2454 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2455 } 2456 2457 /* ARGSUSED */ 2458 static void 2459 ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) 2460 { 2461 mblk_t *mp; 2462 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2463 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2464 uint64_t ill_capabilities = ill->ill_capabilities; 2465 int ah_cnt = 0, esp_cnt = 0; 2466 int ah_len = 0, esp_len = 0; 2467 int i, size = 0; 2468 2469 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2470 return; 2471 2472 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2473 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2474 2475 /* Find out the number of ciphers for AH */ 2476 if (cap_ah != NULL) { 2477 for (i = 0; i < cap_ah->algs_size; i++) { 2478 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2479 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2480 } 2481 if (ah_cnt > 0) { 2482 size += sizeof (dl_capability_sub_t) + 2483 sizeof (dl_capab_ipsec_t); 2484 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2485 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2486 size += ah_len; 2487 } 2488 } 2489 2490 /* Find out the number of ciphers for ESP */ 2491 if (cap_esp != NULL) { 2492 for (i = 0; i < cap_esp->algs_size; i++) { 2493 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2494 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2495 } 2496 if (esp_cnt > 0) { 2497 size += sizeof (dl_capability_sub_t) + 2498 sizeof (dl_capab_ipsec_t); 2499 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2500 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2501 size += esp_len; 2502 } 2503 } 2504 2505 if (size == 0) { 2506 ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " 2507 "there's nothing to reset\n")); 2508 return; 2509 } 2510 2511 mp = allocb(size, BPRI_HI); 2512 if (mp == NULL) { 2513 ip1dbg(("ill_capability_ipsec_reset: unable to allocate " 2514 "request to disable IPSEC Hardware Acceleration\n")); 2515 return; 2516 } 2517 2518 /* 2519 * Clear the capability flags for IPsec HA but retain the ill 2520 * capability structures since it's possible that another thread 2521 * is still referring to them. The structures only get deallocated 2522 * when we destroy the ill. 2523 * 2524 * Various places check the flags to see if the ill is capable of 2525 * hardware acceleration, and by clearing them we ensure that new 2526 * outbound IPsec packets are sent down encrypted. 2527 */ 2528 ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); 2529 2530 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2531 if (ah_cnt > 0) { 2532 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2533 cap_ah, mp); 2534 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2535 } 2536 2537 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2538 if (esp_cnt > 0) { 2539 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2540 cap_esp, mp); 2541 ASSERT(mp->b_rptr + size >= mp->b_wptr); 2542 } 2543 2544 /* 2545 * At this point we've composed a bunch of sub-capabilities to be 2546 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2547 * by the caller. Upon receiving this reset message, the driver 2548 * must stop inbound decryption (by destroying all inbound SAs) 2549 * and let the corresponding packets come in encrypted. 
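 * (ill_capability_reset() links this mblk onto the sc_mp chain and
 * pulls it up into a single DL_CAPABILITY_REQ before sending it
 * downstream.)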
2550 */ 2551 2552 if (*sc_mp != NULL) 2553 linkb(*sc_mp, mp); 2554 else 2555 *sc_mp = mp; 2556 } 2557 2558 static void 2559 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2560 boolean_t encapsulated) 2561 { 2562 boolean_t legacy = B_FALSE; 2563 2564 /* 2565 * If this DL_CAPABILITY_ACK came in as a response to our "reset" 2566 * DL_CAPABILITY_REQ, ignore it during this cycle. We've just 2567 * instructed the driver to disable its advertised capabilities, 2568 * so there's no point in accepting any response at this moment. 2569 */ 2570 if (ill->ill_dlpi_capab_state == IDS_UNKNOWN) 2571 return; 2572 2573 /* 2574 * Note that only the following two sub-capabilities may be 2575 * considered as "legacy", since their original definitions 2576 * do not incorporate the dl_mid_t module ID token, and hence 2577 * may require the use of the wrapper sub-capability. 2578 */ 2579 switch (subp->dl_cap) { 2580 case DL_CAPAB_IPSEC_AH: 2581 case DL_CAPAB_IPSEC_ESP: 2582 legacy = B_TRUE; 2583 break; 2584 } 2585 2586 /* 2587 * For legacy sub-capabilities which don't incorporate a queue_t 2588 * pointer in their structures, discard them if we detect that 2589 * there are intermediate modules in between IP and the driver. 2590 */ 2591 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2592 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2593 "%d discarded; %d module(s) present below IP\n", 2594 subp->dl_cap, ill->ill_lmod_cnt)); 2595 return; 2596 } 2597 2598 switch (subp->dl_cap) { 2599 case DL_CAPAB_IPSEC_AH: 2600 case DL_CAPAB_IPSEC_ESP: 2601 ill_capability_ipsec_ack(ill, mp, subp); 2602 break; 2603 case DL_CAPAB_MDT: 2604 ill_capability_mdt_ack(ill, mp, subp); 2605 break; 2606 case DL_CAPAB_HCKSUM: 2607 ill_capability_hcksum_ack(ill, mp, subp); 2608 break; 2609 case DL_CAPAB_ZEROCOPY: 2610 ill_capability_zerocopy_ack(ill, mp, subp); 2611 break; 2612 case DL_CAPAB_POLL: 2613 if (!SOFT_RINGS_ENABLED()) 2614 ill_capability_dls_ack(ill, mp, subp); 2615 break; 2616 case DL_CAPAB_SOFT_RING: 2617 if (SOFT_RINGS_ENABLED()) 2618 ill_capability_dls_ack(ill, mp, subp); 2619 break; 2620 case DL_CAPAB_LSO: 2621 ill_capability_lso_ack(ill, mp, subp); 2622 break; 2623 default: 2624 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2625 subp->dl_cap)); 2626 } 2627 } 2628 2629 /* 2630 * As part of negotiating polling capability, the driver tells us 2631 * the default (or normal) blanking interval and packet threshold 2632 * (the receive timer fires if blanking interval is reached or 2633 * the packet threshold is reached). 2634 * 2635 * As part of manipulating the polling interval, we always use our 2636 * estimated interval (avg service time * number of packets queued 2637 * on the squeue) but we try to blank for a minimum of 2638 * rr_normal_blank_time * rr_max_blank_ratio. We disable the 2639 * packet threshold during this time. When we are not in polling mode 2640 * we set the blank interval typically lower, rr_normal_pkt_cnt * 2641 * rr_min_blank_ratio but up the packet cnt by a ratio of 2642 * rr_min_pkt_cnt_ratio so that we are still getting chains if 2643 * possible although for a shorter interval. 2644 */ 2645 #define RR_MAX_BLANK_RATIO 20 2646 #define RR_MIN_BLANK_RATIO 10 2647 #define RR_MAX_PKT_CNT_RATIO 3 2648 #define RR_MIN_PKT_CNT_RATIO 3 2649 2650 /* 2651 * These can be tuned via /etc/system. 
2652 */ 2653 int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; 2654 int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; 2655 int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; 2656 int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; 2657 2658 static mac_resource_handle_t 2659 ill_ring_add(void *arg, mac_resource_t *mrp) 2660 { 2661 ill_t *ill = (ill_t *)arg; 2662 mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; 2663 ill_rx_ring_t *rx_ring; 2664 int ip_rx_index; 2665 2666 ASSERT(mrp != NULL); 2667 if (mrp->mr_type != MAC_RX_FIFO) { 2668 return (NULL); 2669 } 2670 ASSERT(ill != NULL); 2671 ASSERT(ill->ill_dls_capab != NULL); 2672 2673 mutex_enter(&ill->ill_lock); 2674 for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { 2675 rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; 2676 ASSERT(rx_ring != NULL); 2677 2678 if (rx_ring->rr_ring_state == ILL_RING_FREE) { 2679 time_t normal_blank_time = 2680 mrfp->mrf_normal_blank_time; 2681 uint_t normal_pkt_cnt = 2682 mrfp->mrf_normal_pkt_count; 2683 2684 bzero(rx_ring, sizeof (ill_rx_ring_t)); 2685 2686 rx_ring->rr_blank = mrfp->mrf_blank; 2687 rx_ring->rr_handle = mrfp->mrf_arg; 2688 rx_ring->rr_ill = ill; 2689 rx_ring->rr_normal_blank_time = normal_blank_time; 2690 rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; 2691 2692 rx_ring->rr_max_blank_time = 2693 normal_blank_time * rr_max_blank_ratio; 2694 rx_ring->rr_min_blank_time = 2695 normal_blank_time * rr_min_blank_ratio; 2696 rx_ring->rr_max_pkt_cnt = 2697 normal_pkt_cnt * rr_max_pkt_cnt_ratio; 2698 rx_ring->rr_min_pkt_cnt = 2699 normal_pkt_cnt * rr_min_pkt_cnt_ratio; 2700 2701 rx_ring->rr_ring_state = ILL_RING_INUSE; 2702 mutex_exit(&ill->ill_lock); 2703 2704 DTRACE_PROBE2(ill__ring__add, (void *), ill, 2705 (int), ip_rx_index); 2706 return ((mac_resource_handle_t)rx_ring); 2707 } 2708 } 2709 2710 /* 2711 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If 2712 * we have devices which can overwhelm this limit, ILL_MAX_RING 2713 * should be made configurable. Meanwhile it cause no panic because 2714 * driver will pass ip_input a NULL handle which will make 2715 * IP allocate the default squeue and Polling mode will not 2716 * be used for this ring. 
2717 */ 2718 cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " 2719 "for %s\n", ILL_MAX_RINGS, ill->ill_name); 2720 2721 mutex_exit(&ill->ill_lock); 2722 return (NULL); 2723 } 2724 2725 static boolean_t 2726 ill_capability_dls_init(ill_t *ill) 2727 { 2728 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2729 conn_t *connp; 2730 size_t sz; 2731 ip_stack_t *ipst = ill->ill_ipst; 2732 2733 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { 2734 if (ill_dls == NULL) { 2735 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2736 "soft_ring enabled for ill=%s (%p) but data " 2737 "structs uninitialized\n", ill->ill_name, 2738 (void *)ill); 2739 } 2740 return (B_TRUE); 2741 } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { 2742 if (ill_dls == NULL) { 2743 cmn_err(CE_PANIC, "ill_capability_dls_init: " 2744 "polling enabled for ill=%s (%p) but data " 2745 "structs uninitialized\n", ill->ill_name, 2746 (void *)ill); 2747 } 2748 return (B_TRUE); 2749 } 2750 2751 if (ill_dls != NULL) { 2752 ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; 2753 /* Soft_Ring or polling is being re-enabled */ 2754 2755 connp = ill_dls->ill_unbind_conn; 2756 ASSERT(rx_ring != NULL); 2757 bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); 2758 bzero((void *)rx_ring, 2759 sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); 2760 ill_dls->ill_ring_tbl = rx_ring; 2761 ill_dls->ill_unbind_conn = connp; 2762 return (B_TRUE); 2763 } 2764 2765 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, 2766 ipst->ips_netstack)) == NULL) 2767 return (B_FALSE); 2768 2769 sz = sizeof (ill_dls_capab_t); 2770 sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; 2771 2772 ill_dls = kmem_zalloc(sz, KM_NOSLEEP); 2773 if (ill_dls == NULL) { 2774 cmn_err(CE_WARN, "ill_capability_dls_init: could not " 2775 "allocate dls_capab for %s (%p)\n", ill->ill_name, 2776 (void *)ill); 2777 CONN_DEC_REF(connp); 2778 return (B_FALSE); 2779 } 2780 2781 /* Allocate space to hold ring table */ 2782 ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; 2783 ill->ill_dls_capab = ill_dls; 2784 ill_dls->ill_unbind_conn = connp; 2785 return (B_TRUE); 2786 } 2787 2788 /* 2789 * ill_capability_dls_disable: disable soft_ring and/or polling 2790 * capability. Since any of the rings might already be in use, need 2791 * to call ip_squeue_clean_all() which gets behind the squeue to disable 2792 * direct calls if necessary. 
2793 */ 2794 static void 2795 ill_capability_dls_disable(ill_t *ill) 2796 { 2797 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 2798 2799 if (ill->ill_capabilities & ILL_CAPAB_DLS) { 2800 ip_squeue_clean_all(ill); 2801 ill_dls->ill_tx = NULL; 2802 ill_dls->ill_tx_handle = NULL; 2803 ill_dls->ill_dls_change_status = NULL; 2804 ill_dls->ill_dls_bind = NULL; 2805 ill_dls->ill_dls_unbind = NULL; 2806 } 2807 2808 ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); 2809 } 2810 2811 static void 2812 ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, 2813 dl_capability_sub_t *isub) 2814 { 2815 uint_t size; 2816 uchar_t *rptr; 2817 dl_capab_dls_t dls, *odls; 2818 ill_dls_capab_t *ill_dls; 2819 mblk_t *nmp = NULL; 2820 dl_capability_req_t *ocap; 2821 uint_t sub_dl_cap = isub->dl_cap; 2822 2823 if (!ill_capability_dls_init(ill)) 2824 return; 2825 ill_dls = ill->ill_dls_capab; 2826 2827 /* Copy locally to get the members aligned */ 2828 bcopy((void *)idls, (void *)&dls, 2829 sizeof (dl_capab_dls_t)); 2830 2831 /* Get the tx function and handle from dld */ 2832 ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; 2833 ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; 2834 2835 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2836 ill_dls->ill_dls_change_status = 2837 (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; 2838 ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; 2839 ill_dls->ill_dls_unbind = 2840 (ip_dls_unbind_t)dls.dls_ring_unbind; 2841 ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; 2842 } 2843 2844 size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + 2845 isub->dl_length; 2846 2847 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2848 cmn_err(CE_WARN, "ill_capability_dls_capable: could " 2849 "not allocate memory for CAPAB_REQ for %s (%p)\n", 2850 ill->ill_name, (void *)ill); 2851 return; 2852 } 2853 2854 /* initialize dl_capability_req_t */ 2855 rptr = nmp->b_rptr; 2856 ocap = (dl_capability_req_t *)rptr; 2857 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2858 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2859 rptr += sizeof (dl_capability_req_t); 2860 2861 /* initialize dl_capability_sub_t */ 2862 bcopy(isub, rptr, sizeof (*isub)); 2863 rptr += sizeof (*isub); 2864 2865 odls = (dl_capab_dls_t *)rptr; 2866 rptr += sizeof (dl_capab_dls_t); 2867 2868 /* initialize dl_capab_dls_t to be sent down */ 2869 dls.dls_rx_handle = (uintptr_t)ill; 2870 dls.dls_rx = (uintptr_t)ip_input; 2871 dls.dls_ring_add = (uintptr_t)ill_ring_add; 2872 2873 if (sub_dl_cap == DL_CAPAB_SOFT_RING) { 2874 dls.dls_ring_cnt = ip_soft_rings_cnt; 2875 dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; 2876 dls.dls_flags = SOFT_RING_ENABLE; 2877 } else { 2878 dls.dls_flags = POLL_ENABLE; 2879 ip1dbg(("ill_capability_dls_capable: asking interface %s " 2880 "to enable polling\n", ill->ill_name)); 2881 } 2882 bcopy((void *)&dls, (void *)odls, 2883 sizeof (dl_capab_dls_t)); 2884 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2885 /* 2886 * nmp points to a DL_CAPABILITY_REQ message to 2887 * enable either soft_ring or polling 2888 */ 2889 ill_dlpi_send(ill, nmp); 2890 } 2891 2892 static void 2893 ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) 2894 { 2895 mblk_t *mp; 2896 dl_capab_dls_t *idls; 2897 dl_capability_sub_t *dl_subcap; 2898 int size; 2899 2900 if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) 2901 return; 2902 2903 ASSERT(ill->ill_dls_capab != NULL); 2904 2905 size = sizeof (*dl_subcap) + sizeof (*idls); 2906 2907 mp = allocb(size, BPRI_HI); 2908 
if (mp == NULL) {
2909 ip1dbg(("ill_capability_dls_reset: unable to allocate "
2910 "request to disable soft_ring\n"));
2911 return;
2912 }
2913
2914 mp->b_wptr = mp->b_rptr + size;
2915
2916 dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
2917 dl_subcap->dl_length = sizeof (*idls);
2918 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2919 dl_subcap->dl_cap = DL_CAPAB_SOFT_RING;
2920 else
2921 dl_subcap->dl_cap = DL_CAPAB_POLL;
2922
2923 idls = (dl_capab_dls_t *)(dl_subcap + 1);
2924 if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
2925 idls->dls_flags = SOFT_RING_DISABLE;
2926 else
2927 idls->dls_flags = POLL_DISABLE;
2928
2929 if (*sc_mp != NULL)
2930 linkb(*sc_mp, mp);
2931 else
2932 *sc_mp = mp;
2933 }
2934
2935 /*
2936 * Process a soft_ring/poll capability negotiation ack received
2937 * from a DLS Provider. isub must point to the sub-capability
2938 * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message.
2939 */
2940 static void
2941 ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
2942 {
2943 dl_capab_dls_t *idls;
2944 uint_t sub_dl_cap = isub->dl_cap;
2945 uint8_t *capend;
2946
2947 ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING ||
2948 sub_dl_cap == DL_CAPAB_POLL);
2949
2950 if (ill->ill_isv6)
2951 return;
2952
2953 /*
2954 * Note: range checks here are not absolutely sufficient to
2955 * make us robust against malformed messages sent by drivers;
2956 * this is in keeping with the rest of IP's dlpi handling.
2957 * (Remember, it's coming from something else in the kernel
2958 * address space)
2959 */
2960 capend = (uint8_t *)(isub + 1) + isub->dl_length;
2961 if (capend > mp->b_wptr) {
2962 cmn_err(CE_WARN, "ill_capability_dls_ack: "
2963 "malformed sub-capability too long for mblk");
2964 return;
2965 }
2966
2967 /*
2968 * There are two types of acks we process here:
2969 * 1. acks in reply to a (first form) generic capability req
2970 * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE)
2971 * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE
2972 * capability req.
2973 */
2974 idls = (dl_capab_dls_t *)(isub + 1);
2975
2976 if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) {
2977 ip1dbg(("ill_capability_dls_ack: mid token for dls "
2978 "capability isn't as expected; pass-thru "
2979 "module(s) detected, discarding capability\n"));
2980 if (ill->ill_capabilities & ILL_CAPAB_DLS) {
2981 /*
2982 * This is a capability renegotiation case.
2983 * The interface had better be unusable at this
2984 * point; otherwise bad things will happen
2985 * if we disable direct calls on a running
2986 * and up interface.
2987 */
2988 ill_capability_dls_disable(ill);
2989 }
2990 return;
2991 }
2992
2993 switch (idls->dls_flags) {
2994 default:
2995 /* Disable if unknown flag */
2996 case SOFT_RING_DISABLE:
2997 case POLL_DISABLE:
2998 ill_capability_dls_disable(ill);
2999 break;
3000 case SOFT_RING_CAPABLE:
3001 case POLL_CAPABLE:
3002 /*
3003 * If the capability was already enabled, it's safe
3004 * to disable it first to get rid of stale information
3005 * and then start enabling it again.
3006 */ 3007 ill_capability_dls_disable(ill); 3008 ill_capability_dls_capable(ill, idls, isub); 3009 break; 3010 case SOFT_RING_ENABLE: 3011 case POLL_ENABLE: 3012 mutex_enter(&ill->ill_lock); 3013 if (sub_dl_cap == DL_CAPAB_SOFT_RING && 3014 !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { 3015 ASSERT(ill->ill_dls_capab != NULL); 3016 ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; 3017 } 3018 if (sub_dl_cap == DL_CAPAB_POLL && 3019 !(ill->ill_capabilities & ILL_CAPAB_POLL)) { 3020 ASSERT(ill->ill_dls_capab != NULL); 3021 ill->ill_capabilities |= ILL_CAPAB_POLL; 3022 ip1dbg(("ill_capability_dls_ack: interface %s " 3023 "has enabled polling\n", ill->ill_name)); 3024 } 3025 mutex_exit(&ill->ill_lock); 3026 break; 3027 } 3028 } 3029 3030 /* 3031 * Process a hardware checksum offload capability negotiation ack received 3032 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) 3033 * of a DL_CAPABILITY_ACK message. 3034 */ 3035 static void 3036 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3037 { 3038 dl_capability_req_t *ocap; 3039 dl_capab_hcksum_t *ihck, *ohck; 3040 ill_hcksum_capab_t **ill_hcksum; 3041 mblk_t *nmp = NULL; 3042 uint_t sub_dl_cap = isub->dl_cap; 3043 uint8_t *capend; 3044 3045 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 3046 3047 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 3048 3049 /* 3050 * Note: range checks here are not absolutely sufficient to 3051 * make us robust against malformed messages sent by drivers; 3052 * this is in keeping with the rest of IP's dlpi handling. 3053 * (Remember, it's coming from something else in the kernel 3054 * address space) 3055 */ 3056 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3057 if (capend > mp->b_wptr) { 3058 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3059 "malformed sub-capability too long for mblk"); 3060 return; 3061 } 3062 3063 /* 3064 * There are two types of acks we process here: 3065 * 1. acks in reply to a (first form) generic capability req 3066 * (no ENABLE flag set) 3067 * 2. acks in reply to a ENABLE capability req. 
3068 * (ENABLE flag set) 3069 */ 3070 ihck = (dl_capab_hcksum_t *)(isub + 1); 3071 3072 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 3073 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 3074 "unsupported hardware checksum " 3075 "sub-capability (version %d, expected %d)", 3076 ihck->hcksum_version, HCKSUM_VERSION_1); 3077 return; 3078 } 3079 3080 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 3081 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 3082 "checksum capability isn't as expected; pass-thru " 3083 "module(s) detected, discarding capability\n")); 3084 return; 3085 } 3086 3087 #define CURR_HCKSUM_CAPAB \ 3088 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 3089 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 3090 3091 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 3092 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 3093 /* do ENABLE processing */ 3094 if (*ill_hcksum == NULL) { 3095 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 3096 KM_NOSLEEP); 3097 3098 if (*ill_hcksum == NULL) { 3099 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3100 "could not enable hcksum version %d " 3101 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 3102 ill->ill_name); 3103 return; 3104 } 3105 } 3106 3107 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 3108 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 3109 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 3110 ip1dbg(("ill_capability_hcksum_ack: interface %s " 3111 "has enabled hardware checksumming\n ", 3112 ill->ill_name)); 3113 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 3114 /* 3115 * Enabling hardware checksum offload 3116 * Currently IP supports {TCP,UDP}/IPv4 3117 * partial and full cksum offload and 3118 * IPv4 header checksum offload. 3119 * Allocate new mblk which will 3120 * contain a new capability request 3121 * to enable hardware checksum offload. 3122 */ 3123 uint_t size; 3124 uchar_t *rptr; 3125 3126 size = sizeof (dl_capability_req_t) + 3127 sizeof (dl_capability_sub_t) + isub->dl_length; 3128 3129 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3130 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 3131 "could not enable hardware cksum for %s (ENOMEM)\n", 3132 ill->ill_name); 3133 return; 3134 } 3135 3136 rptr = nmp->b_rptr; 3137 /* initialize dl_capability_req_t */ 3138 ocap = (dl_capability_req_t *)nmp->b_rptr; 3139 ocap->dl_sub_offset = 3140 sizeof (dl_capability_req_t); 3141 ocap->dl_sub_length = 3142 sizeof (dl_capability_sub_t) + 3143 isub->dl_length; 3144 nmp->b_rptr += sizeof (dl_capability_req_t); 3145 3146 /* initialize dl_capability_sub_t */ 3147 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3148 nmp->b_rptr += sizeof (*isub); 3149 3150 /* initialize dl_capab_hcksum_t */ 3151 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 3152 bcopy(ihck, ohck, sizeof (*ihck)); 3153 3154 nmp->b_rptr = rptr; 3155 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3156 3157 /* Set ENABLE flag */ 3158 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 3159 ohck->hcksum_txflags |= HCKSUM_ENABLE; 3160 3161 /* 3162 * nmp points to a DL_CAPABILITY_REQ message to enable 3163 * hardware checksum acceleration. 
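 * (Only the advertised txflags that IP itself supports survive the
 * CURR_HCKSUM_CAPAB mask applied above, with HCKSUM_ENABLE then set
 * on top.)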
3164 */ 3165 ill_dlpi_send(ill, nmp); 3166 } else { 3167 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 3168 "advertised %x hardware checksum capability flags\n", 3169 ill->ill_name, ihck->hcksum_txflags)); 3170 } 3171 } 3172 3173 static void 3174 ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) 3175 { 3176 mblk_t *mp; 3177 dl_capab_hcksum_t *hck_subcap; 3178 dl_capability_sub_t *dl_subcap; 3179 int size; 3180 3181 if (!ILL_HCKSUM_CAPABLE(ill)) 3182 return; 3183 3184 ASSERT(ill->ill_hcksum_capab != NULL); 3185 /* 3186 * Clear the capability flag for hardware checksum offload but 3187 * retain the ill_hcksum_capab structure since it's possible that 3188 * another thread is still referring to it. The structure only 3189 * gets deallocated when we destroy the ill. 3190 */ 3191 ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; 3192 3193 size = sizeof (*dl_subcap) + sizeof (*hck_subcap); 3194 3195 mp = allocb(size, BPRI_HI); 3196 if (mp == NULL) { 3197 ip1dbg(("ill_capability_hcksum_reset: unable to allocate " 3198 "request to disable hardware checksum offload\n")); 3199 return; 3200 } 3201 3202 mp->b_wptr = mp->b_rptr + size; 3203 3204 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3205 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 3206 dl_subcap->dl_length = sizeof (*hck_subcap); 3207 3208 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 3209 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 3210 hck_subcap->hcksum_txflags = 0; 3211 3212 if (*sc_mp != NULL) 3213 linkb(*sc_mp, mp); 3214 else 3215 *sc_mp = mp; 3216 } 3217 3218 static void 3219 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3220 { 3221 mblk_t *nmp = NULL; 3222 dl_capability_req_t *oc; 3223 dl_capab_zerocopy_t *zc_ic, *zc_oc; 3224 ill_zerocopy_capab_t **ill_zerocopy_capab; 3225 uint_t sub_dl_cap = isub->dl_cap; 3226 uint8_t *capend; 3227 3228 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 3229 3230 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 3231 3232 /* 3233 * Note: range checks here are not absolutely sufficient to 3234 * make us robust against malformed messages sent by drivers; 3235 * this is in keeping with the rest of IP's dlpi handling. 
3236 * (Remember, it's coming from something else in the kernel 3237 * address space) 3238 */ 3239 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3240 if (capend > mp->b_wptr) { 3241 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3242 "malformed sub-capability too long for mblk"); 3243 return; 3244 } 3245 3246 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 3247 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 3248 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 3249 "unsupported ZEROCOPY sub-capability (version %d, " 3250 "expected %d)", zc_ic->zerocopy_version, 3251 ZEROCOPY_VERSION_1); 3252 return; 3253 } 3254 3255 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 3256 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 3257 "capability isn't as expected; pass-thru module(s) " 3258 "detected, discarding capability\n")); 3259 return; 3260 } 3261 3262 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 3263 if (*ill_zerocopy_capab == NULL) { 3264 *ill_zerocopy_capab = 3265 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 3266 KM_NOSLEEP); 3267 3268 if (*ill_zerocopy_capab == NULL) { 3269 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3270 "could not enable Zero-copy version %d " 3271 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 3272 ill->ill_name); 3273 return; 3274 } 3275 } 3276 3277 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 3278 "supports Zero-copy version %d\n", ill->ill_name, 3279 ZEROCOPY_VERSION_1)); 3280 3281 (*ill_zerocopy_capab)->ill_zerocopy_version = 3282 zc_ic->zerocopy_version; 3283 (*ill_zerocopy_capab)->ill_zerocopy_flags = 3284 zc_ic->zerocopy_flags; 3285 3286 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 3287 } else { 3288 uint_t size; 3289 uchar_t *rptr; 3290 3291 size = sizeof (dl_capability_req_t) + 3292 sizeof (dl_capability_sub_t) + 3293 sizeof (dl_capab_zerocopy_t); 3294 3295 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3296 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 3297 "could not enable zerocopy for %s (ENOMEM)\n", 3298 ill->ill_name); 3299 return; 3300 } 3301 3302 rptr = nmp->b_rptr; 3303 /* initialize dl_capability_req_t */ 3304 oc = (dl_capability_req_t *)rptr; 3305 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3306 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3307 sizeof (dl_capab_zerocopy_t); 3308 rptr += sizeof (dl_capability_req_t); 3309 3310 /* initialize dl_capability_sub_t */ 3311 bcopy(isub, rptr, sizeof (*isub)); 3312 rptr += sizeof (*isub); 3313 3314 /* initialize dl_capab_zerocopy_t */ 3315 zc_oc = (dl_capab_zerocopy_t *)rptr; 3316 *zc_oc = *zc_ic; 3317 3318 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 3319 "to enable zero-copy version %d\n", ill->ill_name, 3320 ZEROCOPY_VERSION_1)); 3321 3322 /* set VMSAFE_MEM flag */ 3323 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 3324 3325 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 3326 ill_dlpi_send(ill, nmp); 3327 } 3328 } 3329 3330 static void 3331 ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) 3332 { 3333 mblk_t *mp; 3334 dl_capab_zerocopy_t *zerocopy_subcap; 3335 dl_capability_sub_t *dl_subcap; 3336 int size; 3337 3338 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 3339 return; 3340 3341 ASSERT(ill->ill_zerocopy_capab != NULL); 3342 /* 3343 * Clear the capability flag for Zero-copy but retain the 3344 * ill_zerocopy_capab structure since it's possible that another 3345 * thread is still referring to it. The structure only gets 3346 * deallocated when we destroy the ill. 
3347 */ 3348 ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; 3349 3350 size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 3351 3352 mp = allocb(size, BPRI_HI); 3353 if (mp == NULL) { 3354 ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " 3355 "request to disable Zero-copy\n")); 3356 return; 3357 } 3358 3359 mp->b_wptr = mp->b_rptr + size; 3360 3361 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3362 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 3363 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 3364 3365 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 3366 zerocopy_subcap->zerocopy_version = 3367 ill->ill_zerocopy_capab->ill_zerocopy_version; 3368 zerocopy_subcap->zerocopy_flags = 0; 3369 3370 if (*sc_mp != NULL) 3371 linkb(*sc_mp, mp); 3372 else 3373 *sc_mp = mp; 3374 } 3375 3376 /* 3377 * Process Large Segment Offload capability negotiation ack received from a 3378 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_LSO) of a 3379 * DL_CAPABILITY_ACK message. 3380 */ 3381 static void 3382 ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 3383 { 3384 mblk_t *nmp = NULL; 3385 dl_capability_req_t *oc; 3386 dl_capab_lso_t *lso_ic, *lso_oc; 3387 ill_lso_capab_t **ill_lso_capab; 3388 uint_t sub_dl_cap = isub->dl_cap; 3389 uint8_t *capend; 3390 3391 ASSERT(sub_dl_cap == DL_CAPAB_LSO); 3392 3393 ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab; 3394 3395 /* 3396 * Note: range checks here are not absolutely sufficient to 3397 * make us robust against malformed messages sent by drivers; 3398 * this is in keeping with the rest of IP's dlpi handling. 3399 * (Remember, it's coming from something else in the kernel 3400 * address space) 3401 */ 3402 capend = (uint8_t *)(isub + 1) + isub->dl_length; 3403 if (capend > mp->b_wptr) { 3404 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3405 "malformed sub-capability too long for mblk"); 3406 return; 3407 } 3408 3409 lso_ic = (dl_capab_lso_t *)(isub + 1); 3410 3411 if (lso_ic->lso_version != LSO_VERSION_1) { 3412 cmn_err(CE_CONT, "ill_capability_lso_ack: " 3413 "unsupported LSO sub-capability (version %d, expected %d)", 3414 lso_ic->lso_version, LSO_VERSION_1); 3415 return; 3416 } 3417 3418 if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) { 3419 ip1dbg(("ill_capability_lso_ack: mid token for LSO " 3420 "capability isn't as expected; pass-thru module(s) " 3421 "detected, discarding capability\n")); 3422 return; 3423 } 3424 3425 if ((lso_ic->lso_flags & LSO_TX_ENABLE) && 3426 (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) { 3427 if (*ill_lso_capab == NULL) { 3428 *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 3429 KM_NOSLEEP); 3430 3431 if (*ill_lso_capab == NULL) { 3432 cmn_err(CE_WARN, "ill_capability_lso_ack: " 3433 "could not enable LSO version %d " 3434 "for %s (ENOMEM)\n", LSO_VERSION_1, 3435 ill->ill_name); 3436 return; 3437 } 3438 } 3439 3440 (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version; 3441 (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags; 3442 (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max; 3443 ill->ill_capabilities |= ILL_CAPAB_LSO; 3444 3445 ip1dbg(("ill_capability_lso_ack: interface %s " 3446 "has enabled LSO\n ", ill->ill_name)); 3447 } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) { 3448 uint_t size; 3449 uchar_t *rptr; 3450 3451 size = sizeof (dl_capability_req_t) + 3452 sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t); 3453 3454 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 3455 cmn_err(CE_WARN, "ill_capability_lso_ack: " 
3456 "could not enable LSO for %s (ENOMEM)\n", 3457 ill->ill_name); 3458 return; 3459 } 3460 3461 rptr = nmp->b_rptr; 3462 /* initialize dl_capability_req_t */ 3463 oc = (dl_capability_req_t *)nmp->b_rptr; 3464 oc->dl_sub_offset = sizeof (dl_capability_req_t); 3465 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 3466 sizeof (dl_capab_lso_t); 3467 nmp->b_rptr += sizeof (dl_capability_req_t); 3468 3469 /* initialize dl_capability_sub_t */ 3470 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 3471 nmp->b_rptr += sizeof (*isub); 3472 3473 /* initialize dl_capab_lso_t */ 3474 lso_oc = (dl_capab_lso_t *)nmp->b_rptr; 3475 bcopy(lso_ic, lso_oc, sizeof (*lso_ic)); 3476 3477 nmp->b_rptr = rptr; 3478 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 3479 3480 /* set ENABLE flag */ 3481 lso_oc->lso_flags |= LSO_TX_ENABLE; 3482 3483 /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */ 3484 ill_dlpi_send(ill, nmp); 3485 } else { 3486 ip1dbg(("ill_capability_lso_ack: interface %s has " 3487 "advertised %x LSO capability flags\n", 3488 ill->ill_name, lso_ic->lso_flags)); 3489 } 3490 } 3491 3492 3493 static void 3494 ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp) 3495 { 3496 mblk_t *mp; 3497 dl_capab_lso_t *lso_subcap; 3498 dl_capability_sub_t *dl_subcap; 3499 int size; 3500 3501 if (!(ill->ill_capabilities & ILL_CAPAB_LSO)) 3502 return; 3503 3504 ASSERT(ill->ill_lso_capab != NULL); 3505 /* 3506 * Clear the capability flag for LSO but retain the 3507 * ill_lso_capab structure since it's possible that another 3508 * thread is still referring to it. The structure only gets 3509 * deallocated when we destroy the ill. 3510 */ 3511 ill->ill_capabilities &= ~ILL_CAPAB_LSO; 3512 3513 size = sizeof (*dl_subcap) + sizeof (*lso_subcap); 3514 3515 mp = allocb(size, BPRI_HI); 3516 if (mp == NULL) { 3517 ip1dbg(("ill_capability_lso_reset: unable to allocate " 3518 "request to disable LSO\n")); 3519 return; 3520 } 3521 3522 mp->b_wptr = mp->b_rptr + size; 3523 3524 dl_subcap = (dl_capability_sub_t *)mp->b_rptr; 3525 dl_subcap->dl_cap = DL_CAPAB_LSO; 3526 dl_subcap->dl_length = sizeof (*lso_subcap); 3527 3528 lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1); 3529 lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version; 3530 lso_subcap->lso_flags = 0; 3531 3532 if (*sc_mp != NULL) 3533 linkb(*sc_mp, mp); 3534 else 3535 *sc_mp = mp; 3536 } 3537 3538 /* 3539 * Consume a new-style hardware capabilities negotiation ack. 3540 * Called from ip_rput_dlpi_writer(). 3541 */ 3542 void 3543 ill_capability_ack(ill_t *ill, mblk_t *mp) 3544 { 3545 dl_capability_ack_t *capp; 3546 dl_capability_sub_t *subp, *endp; 3547 3548 if (ill->ill_dlpi_capab_state == IDS_INPROGRESS) 3549 ill->ill_dlpi_capab_state = IDS_OK; 3550 3551 capp = (dl_capability_ack_t *)mp->b_rptr; 3552 3553 if (capp->dl_sub_length == 0) 3554 /* no new-style capabilities */ 3555 return; 3556 3557 /* make sure the driver supplied correct dl_sub_length */ 3558 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3559 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3560 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3561 return; 3562 } 3563 3564 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3565 /* 3566 * There are sub-capabilities. Process the ones we know about. 3567 * Loop until we don't have room for another sub-cap header.. 
3568 */ 3569 for (subp = SC(capp, capp->dl_sub_offset), 3570 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3571 subp <= endp; 3572 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3573 3574 switch (subp->dl_cap) { 3575 case DL_CAPAB_ID_WRAPPER: 3576 ill_capability_id_ack(ill, mp, subp); 3577 break; 3578 default: 3579 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3580 break; 3581 } 3582 } 3583 #undef SC 3584 } 3585 3586 /* 3587 * This routine is called to scan the fragmentation reassembly table for 3588 * the specified ILL for any packets that are starting to smell. 3589 * dead_interval is the maximum time in seconds that will be tolerated. It 3590 * will either be the value specified in ip_g_frag_timeout, or zero if the 3591 * ILL is shutting down and it is time to blow everything off. 3592 * 3593 * It returns the number of seconds (as a time_t) that the next frag timer 3594 * should be scheduled for, 0 meaning that the timer doesn't need to be 3595 * re-started. Note that the method of calculating next_timeout isn't 3596 * entirely accurate since time will flow between the time we grab 3597 * current_time and the time we schedule the next timeout. This isn't a 3598 * big problem since this is the timer for sending an ICMP reassembly time 3599 * exceeded messages, and it doesn't have to be exactly accurate. 3600 * 3601 * This function is 3602 * sometimes called as writer, although this is not required. 3603 */ 3604 time_t 3605 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3606 { 3607 ipfb_t *ipfb; 3608 ipfb_t *endp; 3609 ipf_t *ipf; 3610 ipf_t *ipfnext; 3611 mblk_t *mp; 3612 time_t current_time = gethrestime_sec(); 3613 time_t next_timeout = 0; 3614 uint32_t hdr_length; 3615 mblk_t *send_icmp_head; 3616 mblk_t *send_icmp_head_v6; 3617 zoneid_t zoneid; 3618 ip_stack_t *ipst = ill->ill_ipst; 3619 3620 ipfb = ill->ill_frag_hash_tbl; 3621 if (ipfb == NULL) 3622 return (B_FALSE); 3623 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3624 /* Walk the frag hash table. */ 3625 for (; ipfb < endp; ipfb++) { 3626 send_icmp_head = NULL; 3627 send_icmp_head_v6 = NULL; 3628 mutex_enter(&ipfb->ipfb_lock); 3629 while ((ipf = ipfb->ipfb_ipf) != 0) { 3630 time_t frag_time = current_time - ipf->ipf_timestamp; 3631 time_t frag_timeout; 3632 3633 if (frag_time < dead_interval) { 3634 /* 3635 * There are some outstanding fragments 3636 * that will timeout later. Make note of 3637 * the time so that we can reschedule the 3638 * next timeout appropriately. 3639 */ 3640 frag_timeout = dead_interval - frag_time; 3641 if (next_timeout == 0 || 3642 frag_timeout < next_timeout) { 3643 next_timeout = frag_timeout; 3644 } 3645 break; 3646 } 3647 /* Time's up. Get it out of here. */ 3648 hdr_length = ipf->ipf_nf_hdr_len; 3649 ipfnext = ipf->ipf_hash_next; 3650 if (ipfnext) 3651 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3652 *ipf->ipf_ptphn = ipfnext; 3653 mp = ipf->ipf_mp->b_cont; 3654 for (; mp; mp = mp->b_cont) { 3655 /* Extra points for neatness. */ 3656 IP_REASS_SET_START(mp, 0); 3657 IP_REASS_SET_END(mp, 0); 3658 } 3659 mp = ipf->ipf_mp->b_cont; 3660 ill->ill_frag_count -= ipf->ipf_count; 3661 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3662 ipfb->ipfb_count -= ipf->ipf_count; 3663 ASSERT(ipfb->ipfb_frag_pkts > 0); 3664 ipfb->ipfb_frag_pkts--; 3665 /* 3666 * We do not send any icmp message from here because 3667 * we currently are holding the ipfb_lock for this 3668 * hash chain. 
If we try to send any icmp messages 3669 * from here we may end up via a put back into ip 3670 * trying to get the same lock, causing a recursive 3671 * mutex panic. Instead we build a list and send all 3672 * the icmp messages after we have dropped the lock. 3673 */ 3674 if (ill->ill_isv6) { 3675 if (hdr_length != 0) { 3676 mp->b_next = send_icmp_head_v6; 3677 send_icmp_head_v6 = mp; 3678 } else { 3679 freemsg(mp); 3680 } 3681 } else { 3682 if (hdr_length != 0) { 3683 mp->b_next = send_icmp_head; 3684 send_icmp_head = mp; 3685 } else { 3686 freemsg(mp); 3687 } 3688 } 3689 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3690 freeb(ipf->ipf_mp); 3691 } 3692 mutex_exit(&ipfb->ipfb_lock); 3693 /* 3694 * Now we need to send any icmp messages that we delayed from 3695 * above. 3696 */ 3697 while (send_icmp_head_v6 != NULL) { 3698 ip6_t *ip6h; 3699 3700 mp = send_icmp_head_v6; 3701 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3702 mp->b_next = NULL; 3703 if (mp->b_datap->db_type == M_CTL) 3704 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3705 else 3706 ip6h = (ip6_t *)mp->b_rptr; 3707 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3708 ill, ipst); 3709 if (zoneid == ALL_ZONES) { 3710 freemsg(mp); 3711 } else { 3712 icmp_time_exceeded_v6(ill->ill_wq, mp, 3713 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3714 B_FALSE, zoneid, ipst); 3715 } 3716 } 3717 while (send_icmp_head != NULL) { 3718 ipaddr_t dst; 3719 3720 mp = send_icmp_head; 3721 send_icmp_head = send_icmp_head->b_next; 3722 mp->b_next = NULL; 3723 3724 if (mp->b_datap->db_type == M_CTL) 3725 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3726 else 3727 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3728 3729 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 3730 if (zoneid == ALL_ZONES) { 3731 freemsg(mp); 3732 } else { 3733 icmp_time_exceeded(ill->ill_wq, mp, 3734 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, 3735 ipst); 3736 } 3737 } 3738 } 3739 /* 3740 * A non-dying ILL will use the return value to decide whether to 3741 * restart the frag timer, and for how long. 3742 */ 3743 return (next_timeout); 3744 } 3745 3746 /* 3747 * This routine is called when the approximate count of mblk memory used 3748 * for the specified ILL has exceeded max_count. 3749 */ 3750 void 3751 ill_frag_prune(ill_t *ill, uint_t max_count) 3752 { 3753 ipfb_t *ipfb; 3754 ipf_t *ipf; 3755 size_t count; 3756 3757 /* 3758 * If we are here within ip_min_frag_prune_time msecs of the last 3759 * prune, increment ill_frag_free_num_pkts; that many of the oldest 3760 * packets are then removed from each bucket below. Otherwise the 3761 * count is reset. */ 3762 mutex_enter(&ill->ill_lock); 3763 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3764 (ip_min_frag_prune_time != 0 ? 3765 ip_min_frag_prune_time : msec_per_tick)) { 3766 3767 ill->ill_frag_free_num_pkts++; 3768 3769 } else { 3770 ill->ill_frag_free_num_pkts = 0; 3771 } 3772 ill->ill_last_frag_clean_time = lbolt; 3773 mutex_exit(&ill->ill_lock); 3774 3775 /* 3776 * free ill_frag_free_num_pkts oldest packets from each bucket. 3777 */ 3778 if (ill->ill_frag_free_num_pkts != 0) { 3779 int ix; 3780 3781 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3782 ipfb = &ill->ill_frag_hash_tbl[ix]; 3783 mutex_enter(&ipfb->ipfb_lock); 3784 if (ipfb->ipfb_ipf != NULL) { 3785 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3786 ill->ill_frag_free_num_pkts); 3787 } 3788 mutex_exit(&ipfb->ipfb_lock); 3789 } 3790 } 3791 /* 3792 * While the reassembly list for this ILL is too big, prune a fragment 3793 * queue by age, oldest first.
Note that the per ILL count is 3794 * approximate, while the per frag hash bucket counts are accurate. 3795 */ 3796 while (ill->ill_frag_count > max_count) { 3797 int ix; 3798 ipfb_t *oipfb = NULL; 3799 uint_t oldest = UINT_MAX; 3800 3801 count = 0; 3802 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3803 ipfb = &ill->ill_frag_hash_tbl[ix]; 3804 mutex_enter(&ipfb->ipfb_lock); 3805 ipf = ipfb->ipfb_ipf; 3806 if (ipf != NULL && ipf->ipf_gen < oldest) { 3807 oldest = ipf->ipf_gen; 3808 oipfb = ipfb; 3809 } 3810 count += ipfb->ipfb_count; 3811 mutex_exit(&ipfb->ipfb_lock); 3812 } 3813 /* Refresh the per ILL count */ 3814 ill->ill_frag_count = count; 3815 if (oipfb == NULL) { 3816 ill->ill_frag_count = 0; 3817 break; 3818 } 3819 if (count <= max_count) 3820 return; /* Somebody beat us to it, nothing to do */ 3821 mutex_enter(&oipfb->ipfb_lock); 3822 ipf = oipfb->ipfb_ipf; 3823 if (ipf != NULL) { 3824 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3825 } 3826 mutex_exit(&oipfb->ipfb_lock); 3827 } 3828 } 3829 3830 /* 3831 * free 'free_cnt' fragmented packets starting at ipf. 3832 */ 3833 void 3834 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3835 { 3836 size_t count; 3837 mblk_t *mp; 3838 mblk_t *tmp; 3839 ipf_t **ipfp = ipf->ipf_ptphn; 3840 3841 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3842 ASSERT(ipfp != NULL); 3843 ASSERT(ipf != NULL); 3844 3845 while (ipf != NULL && free_cnt-- > 0) { 3846 count = ipf->ipf_count; 3847 mp = ipf->ipf_mp; 3848 ipf = ipf->ipf_hash_next; 3849 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3850 IP_REASS_SET_START(tmp, 0); 3851 IP_REASS_SET_END(tmp, 0); 3852 } 3853 ill->ill_frag_count -= count; 3854 ASSERT(ipfb->ipfb_count >= count); 3855 ipfb->ipfb_count -= count; 3856 ASSERT(ipfb->ipfb_frag_pkts > 0); 3857 ipfb->ipfb_frag_pkts--; 3858 freemsg(mp); 3859 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3860 } 3861 3862 if (ipf) 3863 ipf->ipf_ptphn = ipfp; 3864 ipfp[0] = ipf; 3865 } 3866 3867 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3868 "obsolete and may be removed in a future release of Solaris. Use " \ 3869 "ifconfig(1M) to manipulate the forwarding status of an interface." 3870 3871 /* 3872 * For obsolete per-interface forwarding configuration; 3873 * called in response to ND_GET. 3874 */ 3875 /* ARGSUSED */ 3876 static int 3877 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3878 { 3879 ill_t *ill = (ill_t *)cp; 3880 3881 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3882 3883 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3884 return (0); 3885 } 3886 3887 /* 3888 * For obsolete per-interface forwarding configuration; 3889 * called in response to ND_SET. 3890 */ 3891 /* ARGSUSED */ 3892 static int 3893 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3894 cred_t *ioc_cr) 3895 { 3896 long value; 3897 int retval; 3898 ip_stack_t *ipst = CONNQ_TO_IPST(q); 3899 3900 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3901 3902 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3903 value < 0 || value > 1) { 3904 return (EINVAL); 3905 } 3906 3907 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3908 retval = ill_forward_set((ill_t *)cp, (value != 0)); 3909 rw_exit(&ipst->ips_ill_g_lock); 3910 return (retval); 3911 } 3912 3913 /* 3914 * Set an ill's ILLF_ROUTER flag appropriately. If the ill is part of an 3915 * IPMP group, make sure all ill's in the group adopt the new policy. Send 3916 * up RTS_IFINFO routing socket messages for each interface whose flags we 3917 * change. 
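 *
 * For reference, the supported way to toggle this from userland is
 * ifconfig(1M); the invocations below are illustrative only:
 *
 *	ifconfig hme0 router		enable forwarding on hme0
 *	ifconfig hme0 -router		disable forwarding on hme0
 *
 * The obsolete ndd path above uses per-interface variable names of
 * the form <if>:ip*_forwarding (see ND_FORWARD_WARNING), e.g.
 *
 *	ndd -set /dev/ip hme0:ip_forwarding 1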
3918 */ 3919 int 3920 ill_forward_set(ill_t *ill, boolean_t enable) 3921 { 3922 ill_group_t *illgrp; 3923 ip_stack_t *ipst = ill->ill_ipst; 3924 3925 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3926 3927 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3928 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 3929 return (0); 3930 3931 if (IS_LOOPBACK(ill)) 3932 return (EINVAL); 3933 3934 /* 3935 * If the ill is in an IPMP group, set the forwarding policy on all 3936 * members of the group to the same value. 3937 */ 3938 illgrp = ill->ill_group; 3939 if (illgrp != NULL) { 3940 ill_t *tmp_ill; 3941 3942 for (tmp_ill = illgrp->illgrp_ill; tmp_ill != NULL; 3943 tmp_ill = tmp_ill->ill_group_next) { 3944 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3945 (enable ? "Enabling" : "Disabling"), 3946 (tmp_ill->ill_isv6 ? "IPv6" : "IPv4"), 3947 tmp_ill->ill_name)); 3948 mutex_enter(&tmp_ill->ill_lock); 3949 if (enable) 3950 tmp_ill->ill_flags |= ILLF_ROUTER; 3951 else 3952 tmp_ill->ill_flags &= ~ILLF_ROUTER; 3953 mutex_exit(&tmp_ill->ill_lock); 3954 if (tmp_ill->ill_isv6) 3955 ill_set_nce_router_flags(tmp_ill, enable); 3956 /* Notify routing socket listeners of this change. */ 3957 ip_rts_ifmsg(tmp_ill->ill_ipif); 3958 } 3959 } else { 3960 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3961 (enable ? "Enabling" : "Disabling"), 3962 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3963 mutex_enter(&ill->ill_lock); 3964 if (enable) 3965 ill->ill_flags |= ILLF_ROUTER; 3966 else 3967 ill->ill_flags &= ~ILLF_ROUTER; 3968 mutex_exit(&ill->ill_lock); 3969 if (ill->ill_isv6) 3970 ill_set_nce_router_flags(ill, enable); 3971 /* Notify routing socket listeners of this change. */ 3972 ip_rts_ifmsg(ill->ill_ipif); 3973 } 3974 3975 return (0); 3976 } 3977 3978 /* 3979 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3980 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3981 * set or clear. 3982 */ 3983 static void 3984 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3985 { 3986 ipif_t *ipif; 3987 nce_t *nce; 3988 3989 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3990 nce = ndp_lookup_v6(ill, &ipif->ipif_v6lcl_addr, B_FALSE); 3991 if (nce != NULL) { 3992 mutex_enter(&nce->nce_lock); 3993 if (enable) 3994 nce->nce_flags |= NCE_F_ISROUTER; 3995 else 3996 nce->nce_flags &= ~NCE_F_ISROUTER; 3997 mutex_exit(&nce->nce_lock); 3998 NCE_REFRELE(nce); 3999 } 4000 } 4001 } 4002 4003 /* 4004 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 4005 * for this ill. Make sure the v6/v4 question has been answered about this 4006 * ill. The creation of this ndd variable is only for backwards compatibility. 4007 * The preferred way to control per-interface IP forwarding is through the 4008 * ILLF_ROUTER interface flag. 4009 */ 4010 static int 4011 ill_set_ndd_name(ill_t *ill) 4012 { 4013 char *suffix; 4014 ip_stack_t *ipst = ill->ill_ipst; 4015 4016 ASSERT(IAM_WRITER_ILL(ill)); 4017 4018 if (ill->ill_isv6) 4019 suffix = ipv6_forward_suffix; 4020 else 4021 suffix = ipv4_forward_suffix; 4022 4023 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 4024 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 4025 /* 4026 * Copies over the '\0'. 4027 * Note that strlen(suffix) is always bounded. 4028 */ 4029 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 4030 strlen(suffix) + 1); 4031 4032 /* 4033 * Use of the nd table requires holding the reader lock. 
4034 * Modifying the nd table thru nd_load/nd_unload requires 4035 * the writer lock. 4036 */ 4037 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 4038 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 4039 nd_ill_forward_set, (caddr_t)ill)) { 4040 /* 4041 * If the nd_load failed, it only meant that it could not 4042 * allocate a new bunch of room for further NDD expansion. 4043 * Because of that, the ill_ndd_name will be set to NULL, and 4044 * this interface is at the mercy of the global ip_forwarding 4045 * variable. 4046 */ 4047 rw_exit(&ipst->ips_ip_g_nd_lock); 4048 ill->ill_ndd_name = NULL; 4049 return (ENOMEM); 4050 } 4051 rw_exit(&ipst->ips_ip_g_nd_lock); 4052 return (0); 4053 } 4054 4055 /* 4056 * Initializes the context structure and returns the first ill in the list. 4057 * Currently start_list and end_list can have the values: 4058 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 4059 * IP_V4_G_HEAD Traverse IPV4 list only. 4060 * IP_V6_G_HEAD Traverse IPV6 list only. 4061 */ 4062 4063 /* 4064 * We don't check for CONDEMNED ills here. Caller must do that if 4065 * necessary under the ill lock. 4066 */ 4067 ill_t * 4068 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 4069 ip_stack_t *ipst) 4070 { 4071 ill_if_t *ifp; 4072 ill_t *ill; 4073 avl_tree_t *avl_tree; 4074 4075 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4076 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 4077 4078 /* 4079 * set up the lists to search 4080 */ 4081 if (end_list != MAX_G_HEADS) { 4082 ctx->ctx_current_list = start_list; 4083 ctx->ctx_last_list = end_list; 4084 } else { 4085 ctx->ctx_last_list = MAX_G_HEADS - 1; 4086 ctx->ctx_current_list = 0; 4087 } 4088 4089 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 4090 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 4091 if (ifp != (ill_if_t *) 4092 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 4093 avl_tree = &ifp->illif_avl_by_ppa; 4094 ill = avl_first(avl_tree); 4095 /* 4096 * ill is guaranteed to be non-NULL, or ifp would not 4097 * have existed. 4098 */ 4099 ASSERT(ill != NULL); 4100 return (ill); 4101 } 4102 ctx->ctx_current_list++; 4103 } 4104 4105 return (NULL); 4106 } 4107 4108 /* 4109 * returns the next ill in the list. ill_first() must have been called 4110 * before calling ill_next() or bad things will happen. 4111 */ 4112 4113 /* 4114 * We don't check for CONDEMNED ills here. Caller must do that if 4115 * necessary under the ill lock. 4116 */ 4117 ill_t * 4118 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 4119 { 4120 ill_if_t *ifp; 4121 ill_t *ill; 4122 ip_stack_t *ipst = lastill->ill_ipst; 4123 4124 ASSERT(lastill->ill_ifptr != (ill_if_t *) 4125 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 4126 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 4127 AVL_AFTER)) != NULL) { 4128 return (ill); 4129 } 4130 4131 /* go to the next ill_ifp in the list. */ 4132 ifp = lastill->ill_ifptr->illif_next; 4133 4134 /* make sure not at end of circular list */ 4135 while (ifp == 4136 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 4137 if (++ctx->ctx_current_list > ctx->ctx_last_list) 4138 return (NULL); 4139 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 4140 } 4141 4142 return (avl_first(&ifp->illif_avl_by_ppa)); 4143 } 4144 4145 /* 4146 * Check interface name for correct format, which is name+ppa. 4147 * name can contain characters and digits; the rightmost digits 4148 * make up the ppa number.
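 * For example (illustrative names only): "e1000g12" is interface type
 * "e1000g" with ppa 12, and the returned pointer is at the '1'; "lo0"
 * has ppa 0; "bge" has no trailing digits and fails; "hme012" fails
 * the octal check described below because its ppa has a leading zero.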
use of octal is not allowed, name must contain 4149 * a ppa, return pointer to the start of ppa. 4150 * In case of error return NULL. 4151 */ 4152 static char * 4153 ill_get_ppa_ptr(char *name) 4154 { 4155 int namelen = mi_strlen(name); 4156 4157 int len = namelen; 4158 4159 name += len; 4160 while (len > 0) { 4161 name--; 4162 if (*name < '0' || *name > '9') 4163 break; 4164 len--; 4165 } 4166 4167 /* empty string, all digits, or no trailing digits */ 4168 if (len == 0 || len == (int)namelen) 4169 return (NULL); 4170 4171 name++; 4172 /* check for attempted use of octal */ 4173 if (*name == '0' && len != (int)namelen - 1) 4174 return (NULL); 4175 return (name); 4176 } 4177 4178 /* 4179 * use avl tree to locate the ill. 4180 */ 4181 static ill_t * 4182 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 4183 ipsq_func_t func, int *error, ip_stack_t *ipst) 4184 { 4185 char *ppa_ptr = NULL; 4186 int len; 4187 uint_t ppa; 4188 ill_t *ill = NULL; 4189 ill_if_t *ifp; 4190 int list; 4191 ipsq_t *ipsq; 4192 4193 if (error != NULL) 4194 *error = 0; 4195 4196 /* 4197 * get ppa ptr 4198 */ 4199 if (isv6) 4200 list = IP_V6_G_HEAD; 4201 else 4202 list = IP_V4_G_HEAD; 4203 4204 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4205 if (error != NULL) 4206 *error = ENXIO; 4207 return (NULL); 4208 } 4209 4210 len = ppa_ptr - name + 1; 4211 4212 ppa = stoi(&ppa_ptr); 4213 4214 ifp = IP_VX_ILL_G_LIST(list, ipst); 4215 4216 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4217 /* 4218 * match is done on len - 1 as the name is not null 4219 * terminated it contains ppa in addition to the interface 4220 * name. 4221 */ 4222 if ((ifp->illif_name_len == len) && 4223 bcmp(ifp->illif_name, name, len - 1) == 0) { 4224 break; 4225 } else { 4226 ifp = ifp->illif_next; 4227 } 4228 } 4229 4230 4231 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4232 /* 4233 * Even the interface type does not exist. 4234 */ 4235 if (error != NULL) 4236 *error = ENXIO; 4237 return (NULL); 4238 } 4239 4240 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4241 if (ill != NULL) { 4242 /* 4243 * The block comment at the start of ipif_down 4244 * explains the use of the macros used below 4245 */ 4246 GRAB_CONN_LOCK(q); 4247 mutex_enter(&ill->ill_lock); 4248 if (ILL_CAN_LOOKUP(ill)) { 4249 ill_refhold_locked(ill); 4250 mutex_exit(&ill->ill_lock); 4251 RELEASE_CONN_LOCK(q); 4252 return (ill); 4253 } else if (ILL_CAN_WAIT(ill, q)) { 4254 ipsq = ill->ill_phyint->phyint_ipsq; 4255 mutex_enter(&ipsq->ipsq_lock); 4256 mutex_exit(&ill->ill_lock); 4257 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4258 mutex_exit(&ipsq->ipsq_lock); 4259 RELEASE_CONN_LOCK(q); 4260 if (error != NULL) 4261 *error = EINPROGRESS; 4262 return (NULL); 4263 } 4264 mutex_exit(&ill->ill_lock); 4265 RELEASE_CONN_LOCK(q); 4266 } 4267 if (error != NULL) 4268 *error = ENXIO; 4269 return (NULL); 4270 } 4271 4272 /* 4273 * comparison function for use with avl. 4274 */ 4275 static int 4276 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4277 { 4278 uint_t ppa; 4279 uint_t ill_ppa; 4280 4281 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4282 4283 ppa = *((uint_t *)ppa_ptr); 4284 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4285 /* 4286 * We want the ill with the lowest ppa to be on the 4287 * top. 4288 */ 4289 if (ill_ppa < ppa) 4290 return (1); 4291 if (ill_ppa > ppa) 4292 return (-1); 4293 return (0); 4294 } 4295 4296 /* 4297 * remove an interface type from the global list. 
4298 */ 4299 static void 4300 ill_delete_interface_type(ill_if_t *interface) 4301 { 4302 ASSERT(interface != NULL); 4303 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4304 4305 avl_destroy(&interface->illif_avl_by_ppa); 4306 if (interface->illif_ppa_arena != NULL) 4307 vmem_destroy(interface->illif_ppa_arena); 4308 4309 remque(interface); 4310 4311 mi_free(interface); 4312 } 4313 4314 /* Defined in ip_netinfo.c */ 4315 extern ddi_taskq_t *eventq_queue_nic; 4316 4317 /* 4318 * remove ill from the global list. 4319 */ 4320 static void 4321 ill_glist_delete(ill_t *ill) 4322 { 4323 char *nicname; 4324 size_t nicnamelen; 4325 hook_nic_event_t *info; 4326 ip_stack_t *ipst; 4327 4328 if (ill == NULL) 4329 return; 4330 ipst = ill->ill_ipst; 4331 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4332 4333 if (ill->ill_name != NULL) { 4334 nicname = kmem_alloc(ill->ill_name_length, KM_NOSLEEP); 4335 if (nicname != NULL) { 4336 bcopy(ill->ill_name, nicname, ill->ill_name_length); 4337 nicnamelen = ill->ill_name_length; 4338 } else { /* alloc failed; event goes out without a name */ nicnamelen = 0; } 4339 } else { 4340 nicname = NULL; 4341 nicnamelen = 0; 4342 } 4343 4344 /* 4345 * If the ill was never inserted into the AVL tree 4346 * we skip the if branch. 4347 */ 4348 if (ill->ill_ifptr != NULL) { 4349 /* 4350 * remove from AVL tree and free ppa number 4351 */ 4352 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4353 4354 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4355 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4356 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4357 } 4358 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4359 ill_delete_interface_type(ill->ill_ifptr); 4360 } 4361 4362 /* 4363 * Indicate ill is no longer in the list. 4364 */ 4365 ill->ill_ifptr = NULL; 4366 ill->ill_name_length = 0; 4367 ill->ill_name[0] = '\0'; 4368 ill->ill_ppa = UINT_MAX; 4369 } 4370 4371 /* 4372 * Run the unplumb hook after the NIC has disappeared from being 4373 * visible so that attempts to revalidate its existence will fail. 4374 * 4375 * This needs to be run inside the ill_g_lock perimeter to ensure 4376 * that the ordering of delivered events to listeners matches their 4377 * order in the kernel. 4378 */ 4379 if ((info = ill->ill_nic_event_info) != NULL) { 4380 if (info->hne_event != NE_DOWN) { 4381 ip2dbg(("ill_glist_delete: unexpected nic event %d " 4382 "attached for %s\n", info->hne_event, 4383 ill->ill_name)); 4384 if (info->hne_data != NULL) 4385 kmem_free(info->hne_data, info->hne_datalen); 4386 kmem_free(info, sizeof (hook_nic_event_t)); 4387 } else { 4388 if (ddi_taskq_dispatch(eventq_queue_nic, 4389 ip_ne_queue_func, (void *)info, DDI_SLEEP) 4390 == DDI_FAILURE) { 4391 ip2dbg(("ill_glist_delete: ddi_taskq_dispatch " 4392 "failed\n")); 4393 if (info->hne_data != NULL) 4394 kmem_free(info->hne_data, 4395 info->hne_datalen); 4396 kmem_free(info, sizeof (hook_nic_event_t)); 4397 } 4398 } 4399 } 4400 4401 /* Generate NE_UNPLUMB event for ill_name. */ 4402 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 4403 if (info != NULL) { 4404 info->hne_nic = ill->ill_phyint->phyint_ifindex; 4405 info->hne_lif = 0; 4406 info->hne_event = NE_UNPLUMB; 4407 info->hne_data = nicname; 4408 info->hne_datalen = nicnamelen; 4409 info->hne_family = ill->ill_isv6 ?
4410 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 4411 } else { 4412 ip2dbg(("ill_glist_delete: could not attach UNPLUMB nic event " 4413 "information for %s (ENOMEM)\n", ill->ill_name)); 4414 if (nicname != NULL) 4415 kmem_free(nicname, nicnamelen); 4416 } 4417 4418 ill->ill_nic_event_info = info; 4419 4420 ill_phyint_free(ill); 4421 rw_exit(&ipst->ips_ill_g_lock); 4422 } 4423 4424 /* 4425 * Allocate a ppa. If the number of plumbed interfaces of this type is 4426 * less than ill_no_arena, do a linear search to find an unused ppa. 4427 * When the number goes beyond ill_no_arena, switch to using an arena. 4428 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4429 * is the return value for an error condition, so allocation starts at one 4430 * and the result is decremented by one. 4431 */ 4432 static int 4433 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4434 { 4435 ill_t *tmp_ill; 4436 uint_t start, end; 4437 int ppa; 4438 4439 if (ifp->illif_ppa_arena == NULL && 4440 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4441 /* 4442 * Create an arena. 4443 */ 4444 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4445 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4446 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4447 /* allocate what has already been assigned */ 4448 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4449 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4450 tmp_ill, AVL_AFTER)) { 4451 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4452 1, /* size */ 4453 1, /* align/quantum */ 4454 0, /* phase */ 4455 0, /* nocross */ 4456 /* minaddr */ 4457 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 4458 /* maxaddr */ 4459 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 4460 VM_NOSLEEP|VM_FIRSTFIT); 4461 if (ppa == 0) { 4462 ip1dbg(("ill_alloc_ppa: ppa allocation" 4463 " failed while switching")); 4464 vmem_destroy(ifp->illif_ppa_arena); 4465 ifp->illif_ppa_arena = NULL; 4466 break; 4467 } 4468 } 4469 } 4470 4471 if (ifp->illif_ppa_arena != NULL) { 4472 if (ill->ill_ppa == UINT_MAX) { 4473 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4474 1, VM_NOSLEEP|VM_FIRSTFIT); 4475 if (ppa == 0) 4476 return (EAGAIN); 4477 ill->ill_ppa = --ppa; 4478 } else { 4479 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4480 1, /* size */ 4481 1, /* align/quantum */ 4482 0, /* phase */ 4483 0, /* nocross */ 4484 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4485 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4486 VM_NOSLEEP|VM_FIRSTFIT); 4487 /* 4488 * Most likely the allocation failed because 4489 * the requested ppa was in use. 4490 */ 4491 if (ppa == 0) 4492 return (EEXIST); 4493 } 4494 return (0); 4495 } 4496 4497 /* 4498 * No arena is in use and not enough (>ill_no_arena) interfaces have 4499 * been plumbed to create one. Do a linear search to get an unused ppa. 4500 */ 4501 if (ill->ill_ppa == UINT_MAX) { 4502 end = UINT_MAX - 1; 4503 start = 0; 4504 } else { 4505 end = start = ill->ill_ppa; 4506 } 4507 4508 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4509 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4510 if (start++ >= end) { 4511 if (ill->ill_ppa == UINT_MAX) 4512 return (EAGAIN); 4513 else 4514 return (EEXIST); 4515 } 4516 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4517 } 4518 ill->ill_ppa = start; 4519 return (0); 4520 } 4521 4522 /* 4523 * Insert ill into the list of configured ills. Once this function completes, 4524 * the ill is globally visible and is available through lookups.
More precisely 4525 * this happens after the caller drops the ill_g_lock. 4526 */ 4527 static int 4528 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4529 { 4530 ill_if_t *ill_interface; 4531 avl_index_t where = 0; 4532 int error; 4533 int name_length; 4534 int index; 4535 boolean_t check_length = B_FALSE; 4536 ip_stack_t *ipst = ill->ill_ipst; 4537 4538 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 4539 4540 name_length = mi_strlen(name) + 1; 4541 4542 if (isv6) 4543 index = IP_V6_G_HEAD; 4544 else 4545 index = IP_V4_G_HEAD; 4546 4547 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 4548 /* 4549 * Search for interface type based on name 4550 */ 4551 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4552 if ((ill_interface->illif_name_len == name_length) && 4553 (strcmp(ill_interface->illif_name, name) == 0)) { 4554 break; 4555 } 4556 ill_interface = ill_interface->illif_next; 4557 } 4558 4559 /* 4560 * Interface type not found, create one. 4561 */ 4562 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4563 4564 ill_g_head_t ghead; 4565 4566 /* 4567 * allocate ill_if_t structure 4568 */ 4569 4570 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4571 if (ill_interface == NULL) { 4572 return (ENOMEM); 4573 } 4574 4575 4576 4577 (void) strcpy(ill_interface->illif_name, name); 4578 ill_interface->illif_name_len = name_length; 4579 4580 avl_create(&ill_interface->illif_avl_by_ppa, 4581 ill_compare_ppa, sizeof (ill_t), 4582 offsetof(struct ill_s, ill_avl_byppa)); 4583 4584 /* 4585 * link the structure at the back to maintain the order 4586 * of configuration for ifconfig output. 4587 */ 4588 ghead = ipst->ips_ill_g_heads[index]; 4589 insque(ill_interface, ghead.ill_g_list_tail); 4590 4591 } 4592 4593 if (ill->ill_ppa == UINT_MAX) 4594 check_length = B_TRUE; 4595 4596 error = ill_alloc_ppa(ill_interface, ill); 4597 if (error != 0) { 4598 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4599 ill_delete_interface_type(ill->ill_ifptr); 4600 return (error); 4601 } 4602 4603 /* 4604 * When the ppa is chosen by the system, check that there is 4605 * enough space to insert the ppa. If a specific ppa was passed in, this 4606 * check is not required as the interface name passed in will have 4607 * the right ppa in it. 4608 */ 4609 if (check_length) { 4610 /* 4611 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4612 */ 4613 char buf[sizeof (uint_t) * 3]; 4614 4615 /* 4616 * convert ppa to string to calculate the amount of space 4617 * required for it in the name. 4618 */ 4619 numtos(ill->ill_ppa, buf); 4620 4621 /* Do we have enough space to insert ppa ?
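 *
 * Illustrative arithmetic: for the name "e1000g" (6 chars) and the
 * largest possible ppa, 4294967294 (10 digits), 6 + 10 + 1 = 17
 * bytes are needed, which fits in LIFNAMSIZ (32); a longer driver
 * name combined with a large system-chosen ppa might not.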
*/ 4622 4623 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4624 /* Free ppa and interface type struct */ 4625 if (ill_interface->illif_ppa_arena != NULL) { 4626 vmem_free(ill_interface->illif_ppa_arena, 4627 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4628 } 4629 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 4630 0) { 4631 ill_delete_interface_type(ill->ill_ifptr); 4632 } 4633 4634 return (EINVAL); 4635 } 4636 } 4637 4638 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4639 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4640 4641 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4642 &where); 4643 ill->ill_ifptr = ill_interface; 4644 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4645 4646 ill_phyint_reinit(ill); 4647 return (0); 4648 } 4649 4650 /* Initialize the per phyint (per IPMP group) ipsq used for serialization */ 4651 static boolean_t 4652 ipsq_init(ill_t *ill) 4653 { 4654 ipsq_t *ipsq; 4655 4656 /* Init the ipsq and implicitly enter as writer */ 4657 ill->ill_phyint->phyint_ipsq = 4658 kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP); 4659 if (ill->ill_phyint->phyint_ipsq == NULL) 4660 return (B_FALSE); 4661 ipsq = ill->ill_phyint->phyint_ipsq; 4662 ipsq->ipsq_phyint_list = ill->ill_phyint; 4663 ill->ill_phyint->phyint_ipsq_next = NULL; 4664 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4665 ipsq->ipsq_refs = 1; 4666 ipsq->ipsq_writer = curthread; 4667 ipsq->ipsq_reentry_cnt = 1; 4668 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 4669 #ifdef DEBUG 4670 ipsq->ipsq_depth = getpcstack((pc_t *)ipsq->ipsq_stack, 4671 IPSQ_STACK_DEPTH); 4672 #endif 4673 (void) strcpy(ipsq->ipsq_name, ill->ill_name); 4674 return (B_TRUE); 4675 } 4676 4677 /* 4678 * ill_init is called by ip_open when a device control stream is opened. 4679 * It does a few initializations, and shoots a DL_INFO_REQ message down 4680 * to the driver. The response is later picked up in ip_rput_dlpi and 4681 * used to set up default mechanisms for talking to the driver. (Always 4682 * called as writer.) 4683 * 4684 * If this function returns an error, ip_open will call ip_close which in 4685 * turn will call ill_delete to clean up any memory allocated here that 4686 * is not yet freed. 4687 */ 4688 int 4689 ill_init(queue_t *q, ill_t *ill) 4690 { 4691 int count; 4692 dl_info_req_t *dlir; 4693 mblk_t *info_mp; 4694 uchar_t *frag_ptr; 4695 4696 /* 4697 * The ill is initialized to zero by mi_alloc*(). In addition 4698 * some fields already contain valid values, initialized in 4699 * ip_open(), before we reach here. 4700 */ 4701 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4702 4703 ill->ill_rq = q; 4704 ill->ill_wq = WR(q); 4705 4706 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4707 BPRI_HI); 4708 if (info_mp == NULL) 4709 return (ENOMEM); 4710 4711 /* 4712 * Allocate sufficient space to contain our fragment hash table and 4713 * the device name.
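 *
 * The single allocation is carved up as follows (illustrative layout,
 * matching the assignments below):
 *
 *	frag_ptr
 *	+------------------------------+------------------------------------+
 *	| ipfb_t hash buckets          | ill_name, followed by room that    |
 *	| (ILL_FRAG_HASH_TBL_SIZE)     | ill_set_ndd_name() later uses for  |
 *	|                              | the <name>:ip*_forwarding variable |
 *	+------------------------------+------------------------------------+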
4714 */ 4715 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4716 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4717 if (frag_ptr == NULL) { 4718 freemsg(info_mp); 4719 return (ENOMEM); 4720 } 4721 ill->ill_frag_ptr = frag_ptr; 4722 ill->ill_frag_free_num_pkts = 0; 4723 ill->ill_last_frag_clean_time = 0; 4724 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4725 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4726 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4727 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4728 NULL, MUTEX_DEFAULT, NULL); 4729 } 4730 4731 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4732 if (ill->ill_phyint == NULL) { 4733 freemsg(info_mp); 4734 mi_free(frag_ptr); 4735 return (ENOMEM); 4736 } 4737 4738 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4739 /* 4740 * For now pretend this is a v4 ill. We need to set phyint_ill* 4741 * at this point for the following reason: if we can't 4742 * enter the ipsq at some point and cv_wait, the writer that 4743 * wakes us up tries to locate us using the list of all phyints 4744 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4745 * If we don't set it now, we risk a missed wakeup. 4746 */ 4747 ill->ill_phyint->phyint_illv4 = ill; 4748 ill->ill_ppa = UINT_MAX; 4749 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4750 4751 if (!ipsq_init(ill)) { 4752 freemsg(info_mp); 4753 mi_free(frag_ptr); 4754 mi_free(ill->ill_phyint); 4755 return (ENOMEM); 4756 } 4757 4758 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4759 4760 4761 /* Frag queue limit stuff */ 4762 ill->ill_frag_count = 0; 4763 ill->ill_ipf_gen = 0; 4764 4765 ill->ill_global_timer = INFINITY; 4766 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4767 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4768 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4769 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4770 4771 /* 4772 * Initialize IPv6 configuration variables. The IP module is always 4773 * opened as an IPv4 module. Instead of tracking down the cases where 4774 * it switches to do ipv6, we'll just initialize the IPv6 configuration 4775 * here for convenience; this has no effect until the ill is set to do 4776 * IPv6. 4777 */ 4778 ill->ill_reachable_time = ND_REACHABLE_TIME; 4779 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4780 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4781 ill->ill_max_buf = ND_MAX_Q; 4782 ill->ill_refcnt = 0; 4783 4784 /* Send down the Info Request to the driver. */ 4785 info_mp->b_datap->db_type = M_PCPROTO; 4786 dlir = (dl_info_req_t *)info_mp->b_rptr; 4787 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4788 dlir->dl_primitive = DL_INFO_REQ; 4789 4790 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4791 4792 qprocson(q); 4793 ill_dlpi_send(ill, info_mp); 4794 4795 return (0); 4796 } 4797 4798 /* 4799 * ill_dls_info 4800 * creates datalink socket info from the device.
4801 */ 4802 int 4803 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4804 { 4805 size_t len; 4806 ill_t *ill = ipif->ipif_ill; 4807 4808 sdl->sdl_family = AF_LINK; 4809 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4810 sdl->sdl_type = ill->ill_type; 4811 ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4812 len = strlen(sdl->sdl_data); 4813 ASSERT(len < 256); 4814 sdl->sdl_nlen = (uchar_t)len; 4815 sdl->sdl_alen = ill->ill_phys_addr_length; 4816 sdl->sdl_slen = 0; 4817 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 4818 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 4819 4820 return (sizeof (struct sockaddr_dl)); 4821 } 4822 4823 /* 4824 * ill_xarp_info 4825 * creates xarp info from the device. 4826 */ 4827 static int 4828 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4829 { 4830 sdl->sdl_family = AF_LINK; 4831 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4832 sdl->sdl_type = ill->ill_type; 4833 ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4834 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4835 sdl->sdl_alen = ill->ill_phys_addr_length; 4836 sdl->sdl_slen = 0; 4837 return (sdl->sdl_nlen); 4838 } 4839 4840 static int 4841 loopback_kstat_update(kstat_t *ksp, int rw) 4842 { 4843 kstat_named_t *kn; 4844 netstackid_t stackid; 4845 netstack_t *ns; 4846 ip_stack_t *ipst; 4847 4848 if (ksp == NULL || ksp->ks_data == NULL) 4849 return (EIO); 4850 4851 if (rw == KSTAT_WRITE) 4852 return (EACCES); 4853 4854 kn = KSTAT_NAMED_PTR(ksp); 4855 stackid = (netstackid_t)(uintptr_t)ksp->ks_private; 4856 4857 ns = netstack_find_by_stackid(stackid); 4858 if (ns == NULL) 4859 return (-1); 4860 4861 ipst = ns->netstack_ip; 4862 if (ipst == NULL) { 4863 netstack_rele(ns); 4864 return (-1); 4865 } 4866 kn[0].value.ui32 = ipst->ips_loopback_packets; 4867 kn[1].value.ui32 = ipst->ips_loopback_packets; 4868 netstack_rele(ns); 4869 return (0); 4870 } 4871 4872 4873 /* 4874 * Has the ifindex been plumbed already? 4875 * Compares both phyint_ifindex and phyint_group_ifindex. 4876 */ 4877 static boolean_t 4878 phyint_exists(uint_t index, ip_stack_t *ipst) 4879 { 4880 phyint_t *phyi; 4881 4882 ASSERT(index != 0); 4883 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4884 /* 4885 * Indexes are stored in the phyint - a common structure 4886 * to both IPv4 and IPv6. 4887 */ 4888 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 4889 for (; phyi != NULL; 4890 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4891 phyi, AVL_AFTER)) { 4892 if (phyi->phyint_ifindex == index || 4893 phyi->phyint_group_ifindex == index) 4894 return (B_TRUE); 4895 } 4896 return (B_FALSE); 4897 } 4898 4899 /* Pick a unique ifindex */ 4900 boolean_t 4901 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 4902 { 4903 uint_t starting_index; 4904 4905 if (!ipst->ips_ill_index_wrap) { 4906 *indexp = ipst->ips_ill_index++; 4907 if (ipst->ips_ill_index == 0) { 4908 /* Reached the uint_t limit. Next time, wrap. */ 4909 ipst->ips_ill_index_wrap = B_TRUE; 4910 } 4911 return (B_TRUE); 4912 } 4913 4914 /* 4915 * Start reusing unused indexes. Note that we hold the ill_g_lock 4916 * at this point and don't want to call any function that attempts 4917 * to get the lock again.
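 *
 * In outline, the wrap-around scan below is (illustrative sketch of
 * the code that follows, not a separate implementation):
 *
 *	starting_index = ips_ill_index++;
 *	while (ips_ill_index != starting_index) {
 *		if (ips_ill_index != 0 &&
 *		    !phyint_exists(ips_ill_index, ipst))
 *			return (ips_ill_index);		found one
 *		ips_ill_index++;
 *	}
 *	return (failure);			every index is in use
 *
 * Index zero is skipped because 0 is never a valid ifindex.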
4918 */ 4919 starting_index = ipst->ips_ill_index++; 4920 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 4921 if (ipst->ips_ill_index != 0 && 4922 !phyint_exists(ipst->ips_ill_index, ipst)) { 4923 /* found unused index - use it */ 4924 *indexp = ipst->ips_ill_index; 4925 return (B_TRUE); 4926 } 4927 } 4928 4929 /* 4930 * all interface indices are in use. 4931 */ 4932 return (B_FALSE); 4933 } 4934 4935 /* 4936 * Assign a unique interface index for the phyint. 4937 */ 4938 static boolean_t 4939 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 4940 { 4941 ASSERT(phyi->phyint_ifindex == 0); 4942 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 4943 } 4944 4945 /* 4946 * Return a pointer to the ill which matches the supplied name. Note that 4947 * the ill name length includes the null termination character. (May be 4948 * called as writer.) 4949 * If do_alloc is set and the interface is "lo0" it will be automatically 4950 * created. Cannot bump up reference on condemned ills. So dup detect can't 4951 * be done using this func. 4952 */ 4953 ill_t * 4954 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4955 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, 4956 ip_stack_t *ipst) 4957 { 4958 ill_t *ill; 4959 ipif_t *ipif; 4960 kstat_named_t *kn; 4961 boolean_t isloopback; 4962 ipsq_t *old_ipsq; 4963 in6_addr_t ov6addr; 4964 4965 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4966 4967 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4968 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4969 rw_exit(&ipst->ips_ill_g_lock); 4970 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4971 return (ill); 4972 4973 /* 4974 * Couldn't find it. Does this happen to be a lookup for the 4975 * loopback device and are we allowed to allocate it? 4976 */ 4977 if (!isloopback || !do_alloc) 4978 return (NULL); 4979 4980 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4981 4982 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4983 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4984 rw_exit(&ipst->ips_ill_g_lock); 4985 return (ill); 4986 } 4987 4988 /* Create the loopback device on demand */ 4989 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4990 sizeof (ipif_loopback_name), BPRI_MED)); 4991 if (ill == NULL) 4992 goto done; 4993 4994 *ill = ill_null; 4995 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4996 ill->ill_ipst = ipst; 4997 netstack_hold(ipst->ips_netstack); 4998 /* 4999 * For exclusive stacks we set the zoneid to zero 5000 * to make IP operate as if in the global zone. 5001 */ 5002 ill->ill_zoneid = GLOBAL_ZONEID; 5003 5004 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 5005 if (ill->ill_phyint == NULL) 5006 goto done; 5007 5008 if (isv6) 5009 ill->ill_phyint->phyint_illv6 = ill; 5010 else 5011 ill->ill_phyint->phyint_illv4 = ill; 5012 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 5013 ill->ill_max_frag = IP_LOOPBACK_MTU; 5014 /* Add room for tcp+ip headers */ 5015 if (isv6) { 5016 ill->ill_isv6 = B_TRUE; 5017 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 5018 } else { 5019 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 5020 } 5021 if (!ill_allocate_mibs(ill)) 5022 goto done; 5023 ill->ill_max_mtu = ill->ill_max_frag; 5024 /* 5025 * ipif_loopback_name can't be pointed at directly because it's used 5026 * by both the ipv4 and ipv6 interfaces.
When the ill is removed 5027 * from the glist, ill_glist_delete() sets the first character of 5028 * ill_name to '\0'. 5029 */ 5030 ill->ill_name = (char *)ill + sizeof (*ill); 5031 (void) strcpy(ill->ill_name, ipif_loopback_name); 5032 ill->ill_name_length = sizeof (ipif_loopback_name); 5033 /* Set ill_name_set for ill_phyint_reinit to work properly */ 5034 5035 ill->ill_global_timer = INFINITY; 5036 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 5037 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 5038 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 5039 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 5040 5041 /* No resolver here. */ 5042 ill->ill_net_type = IRE_LOOPBACK; 5043 5044 /* Initialize the ipsq */ 5045 if (!ipsq_init(ill)) 5046 goto done; 5047 5048 ill->ill_phyint->phyint_ipsq->ipsq_writer = NULL; 5049 ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt--; 5050 ASSERT(ill->ill_phyint->phyint_ipsq->ipsq_reentry_cnt == 0); 5051 #ifdef DEBUG 5052 ill->ill_phyint->phyint_ipsq->ipsq_depth = 0; 5053 #endif 5054 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE); 5055 if (ipif == NULL) 5056 goto done; 5057 5058 ill->ill_flags = ILLF_MULTICAST; 5059 5060 ov6addr = ipif->ipif_v6lcl_addr; 5061 /* Set up default loopback address and mask. */ 5062 if (!isv6) { 5063 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 5064 5065 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 5066 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5067 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 5068 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5069 ipif->ipif_v6subnet); 5070 ill->ill_flags |= ILLF_IPV4; 5071 } else { 5072 ipif->ipif_v6lcl_addr = ipv6_loopback; 5073 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 5074 ipif->ipif_v6net_mask = ipv6_all_ones; 5075 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 5076 ipif->ipif_v6subnet); 5077 ill->ill_flags |= ILLF_IPV6; 5078 } 5079 5080 /* 5081 * Chain us in at the end of the ill list. hold the ill 5082 * before we make it globally visible. 1 for the lookup. 5083 */ 5084 ill->ill_refcnt = 0; 5085 ill_refhold(ill); 5086 5087 ill->ill_frag_count = 0; 5088 ill->ill_frag_free_num_pkts = 0; 5089 ill->ill_last_frag_clean_time = 0; 5090 5091 old_ipsq = ill->ill_phyint->phyint_ipsq; 5092 5093 if (ill_glist_insert(ill, "lo", isv6) != 0) 5094 cmn_err(CE_PANIC, "cannot insert loopback interface"); 5095 5096 /* Let SCTP know so that it can add this to its list */ 5097 sctp_update_ill(ill, SCTP_ILL_INSERT); 5098 5099 /* 5100 * We have already assigned ipif_v6lcl_addr above, but we need to 5101 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 5102 * requires to be after ill_glist_insert() since we need the 5103 * ill_index set. Pass on ipv6_loopback as the old address. 5104 */ 5105 sctp_update_ipif_addr(ipif, ov6addr); 5106 5107 /* 5108 * If the ipsq was changed in ill_phyint_reinit free the old ipsq. 5109 */ 5110 if (old_ipsq != ill->ill_phyint->phyint_ipsq) { 5111 /* Loopback ills aren't in any IPMP group */ 5112 ASSERT(!(old_ipsq->ipsq_flags & IPSQ_GROUP)); 5113 ipsq_delete(old_ipsq); 5114 } 5115 5116 /* 5117 * Delay this till the ipif is allocated as ipif_allocate 5118 * de-references ill_phyint for getting the ifindex. We 5119 * can't do this before ipif_allocate because ill_phyint_reinit 5120 * -> phyint_assign_ifindex expects ipif to be present. 
5121 */ 5122 mutex_enter(&ill->ill_phyint->phyint_lock); 5123 ill->ill_phyint->phyint_flags |= PHYI_LOOPBACK | PHYI_VIRTUAL; 5124 mutex_exit(&ill->ill_phyint->phyint_lock); 5125 5126 if (ipst->ips_loopback_ksp == NULL) { 5127 /* Export loopback interface statistics */ 5128 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 5129 ipif_loopback_name, "net", 5130 KSTAT_TYPE_NAMED, 2, 0, 5131 ipst->ips_netstack->netstack_stackid); 5132 if (ipst->ips_loopback_ksp != NULL) { 5133 ipst->ips_loopback_ksp->ks_update = 5134 loopback_kstat_update; 5135 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 5136 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 5137 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 5138 ipst->ips_loopback_ksp->ks_private = 5139 (void *)(uintptr_t)ipst->ips_netstack-> 5140 netstack_stackid; 5141 kstat_install(ipst->ips_loopback_ksp); 5142 } 5143 } 5144 5145 if (error != NULL) 5146 *error = 0; 5147 *did_alloc = B_TRUE; 5148 rw_exit(&ipst->ips_ill_g_lock); 5149 return (ill); 5150 done: 5151 if (ill != NULL) { 5152 if (ill->ill_phyint != NULL) { 5153 ipsq_t *ipsq; 5154 5155 ipsq = ill->ill_phyint->phyint_ipsq; 5156 if (ipsq != NULL) { 5157 ipsq->ipsq_ipst = NULL; 5158 kmem_free(ipsq, sizeof (ipsq_t)); 5159 } 5160 mi_free(ill->ill_phyint); 5161 } 5162 ill_free_mib(ill); 5163 if (ill->ill_ipst != NULL) 5164 netstack_rele(ill->ill_ipst->ips_netstack); 5165 mi_free(ill); 5166 } 5167 rw_exit(&ipst->ips_ill_g_lock); 5168 if (error != NULL) 5169 *error = ENOMEM; 5170 return (NULL); 5171 } 5172 5173 /* 5174 * For IPP calls - use the ip_stack_t for global stack. 5175 */ 5176 ill_t * 5177 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, 5178 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 5179 { 5180 ip_stack_t *ipst; 5181 ill_t *ill; 5182 5183 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 5184 if (ipst == NULL) { 5185 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 5186 return (NULL); 5187 } 5188 5189 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 5190 netstack_rele(ipst->ips_netstack); 5191 return (ill); 5192 } 5193 5194 /* 5195 * Return a pointer to the ill which matches the index and IP version type. 5196 */ 5197 ill_t * 5198 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 5199 ipsq_func_t func, int *err, ip_stack_t *ipst) 5200 { 5201 ill_t *ill; 5202 ipsq_t *ipsq; 5203 phyint_t *phyi; 5204 5205 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 5206 (q != NULL && mp != NULL && func != NULL && err != NULL)); 5207 5208 if (err != NULL) 5209 *err = 0; 5210 5211 /* 5212 * Indexes are stored in the phyint - a common structure 5213 * to both IPv4 and IPv6. 5214 */ 5215 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5216 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5217 (void *) &index, NULL); 5218 if (phyi != NULL) { 5219 ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; 5220 if (ill != NULL) { 5221 /* 5222 * The block comment at the start of ipif_down 5223 * explains the use of the macros used below. 5224 */ 5225 GRAB_CONN_LOCK(q); 5226 mutex_enter(&ill->ill_lock); 5227 if (ILL_CAN_LOOKUP(ill)) { 5228 ill_refhold_locked(ill); 5229 mutex_exit(&ill->ill_lock); 5230 RELEASE_CONN_LOCK(q); 5231 rw_exit(&ipst->ips_ill_g_lock); 5232 return (ill); 5233 } else if (ILL_CAN_WAIT(ill, q)) { 5234 ipsq = ill->ill_phyint->phyint_ipsq; 5235 mutex_enter(&ipsq->ipsq_lock); 5236 rw_exit(&ipst->ips_ill_g_lock); 5237 mutex_exit(&ill->ill_lock); 5238 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5239 mutex_exit(&ipsq->ipsq_lock); 5240 RELEASE_CONN_LOCK(q); 5241 if (err != NULL) 5242 *err = EINPROGRESS; 5243 return (NULL); 5244 } 5245 RELEASE_CONN_LOCK(q); 5246 mutex_exit(&ill->ill_lock); 5247 } 5248 } 5249 rw_exit(&ipst->ips_ill_g_lock); 5250 if (err != NULL) 5251 *err = ENXIO; 5252 return (NULL); 5253 } 5254 5255 /* 5256 * Return the ifindex next in sequence after the passed in ifindex. 5257 * If there is no next ifindex for the given protocol, return 0. 5258 */ 5259 uint_t 5260 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 5261 { 5262 phyint_t *phyi; 5263 phyint_t *phyi_initial; 5264 uint_t ifindex; 5265 5266 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5267 5268 if (index == 0) { 5269 phyi = avl_first( 5270 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 5271 } else { 5272 phyi = phyi_initial = avl_find( 5273 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5274 (void *) &index, NULL); 5275 } 5276 5277 for (; phyi != NULL; 5278 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5279 phyi, AVL_AFTER)) { 5280 /* 5281 * If we're not returning the first interface in the tree 5282 * and we still haven't moved past the phyint_t that 5283 * corresponds to index, avl_walk needs to be called again. 5284 */ 5285 if (!((index != 0) && (phyi == phyi_initial))) { 5286 if (isv6) { 5287 if ((phyi->phyint_illv6) && 5288 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5289 (phyi->phyint_illv6->ill_isv6 == 1)) 5290 break; 5291 } else { 5292 if ((phyi->phyint_illv4) && 5293 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5294 (phyi->phyint_illv4->ill_isv6 == 0)) 5295 break; 5296 } 5297 } 5298 } 5299 5300 rw_exit(&ipst->ips_ill_g_lock); 5301 5302 if (phyi != NULL) 5303 ifindex = phyi->phyint_ifindex; 5304 else 5305 ifindex = 0; 5306 5307 return (ifindex); 5308 } 5309 5310 5311 /* 5312 * Return the ifindex for the named interface. 5313 * If there is no such interface, return 0. 5314 */ 5315 uint_t 5316 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 5317 { 5318 phyint_t *phyi; 5319 avl_index_t where = 0; 5320 uint_t ifindex; 5321 5322 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5323 5324 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 5325 name, &where)) == NULL) { 5326 rw_exit(&ipst->ips_ill_g_lock); 5327 return (0); 5328 } 5329 5330 ifindex = phyi->phyint_ifindex; 5331 5332 rw_exit(&ipst->ips_ill_g_lock); 5333 5334 return (ifindex); 5335 } 5336 5337 5338 /* 5339 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5340 * that gives a running thread a reference to the ill. This reference must be 5341 * released by the thread when it is done accessing the ill and related 5342 * objects. ill_refcnt cannot be used to account for static references 5343 * such as other structures pointing to an ill.
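 * A typical dynamic hold looks like this (illustrative sketch):
 *
 *	ill = ill_lookup_on_ifindex(index, isv6, NULL, NULL, NULL,
 *	    NULL, ipst);		refholds the ill on success
 *	if (ill != NULL) {
 *		... use the ill ...
 *		ill_refrele(ill);
 *	}
 *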
Callers must generally 5344 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5345 * or be sure that the ill is not being deleted or changing state before 5346 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5347 * ill won't change any of its critical state such as address, netmask etc. 5348 */ 5349 void 5350 ill_refhold(ill_t *ill) 5351 { 5352 mutex_enter(&ill->ill_lock); 5353 ill->ill_refcnt++; 5354 ILL_TRACE_REF(ill); 5355 mutex_exit(&ill->ill_lock); 5356 } 5357 5358 void 5359 ill_refhold_locked(ill_t *ill) 5360 { 5361 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5362 ill->ill_refcnt++; 5363 ILL_TRACE_REF(ill); 5364 } 5365 5366 int 5367 ill_check_and_refhold(ill_t *ill) 5368 { 5369 mutex_enter(&ill->ill_lock); 5370 if (ILL_CAN_LOOKUP(ill)) { 5371 ill_refhold_locked(ill); 5372 mutex_exit(&ill->ill_lock); 5373 return (0); 5374 } 5375 mutex_exit(&ill->ill_lock); 5376 return (ILL_LOOKUP_FAILED); 5377 } 5378 5379 /* 5380 * Must not be called while holding any locks. Otherwise if this is 5381 * the last reference to be released, there is a chance of recursive mutex 5382 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5383 * to restart an ioctl. 5384 */ 5385 void 5386 ill_refrele(ill_t *ill) 5387 { 5388 mutex_enter(&ill->ill_lock); 5389 ASSERT(ill->ill_refcnt != 0); 5390 ill->ill_refcnt--; 5391 ILL_UNTRACE_REF(ill); 5392 if (ill->ill_refcnt != 0) { 5393 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5394 mutex_exit(&ill->ill_lock); 5395 return; 5396 } 5397 5398 /* Drops the ill_lock */ 5399 ipif_ill_refrele_tail(ill); 5400 } 5401 5402 /* 5403 * Obtain a weak reference count on the ill. This reference ensures the 5404 * ill won't be freed, but the ill may change any of its critical state 5405 * such as netmask, address etc. Returns an error if the ill has started 5406 * closing. 5407 */ 5408 boolean_t 5409 ill_waiter_inc(ill_t *ill) 5410 { 5411 mutex_enter(&ill->ill_lock); 5412 if (ill->ill_state_flags & ILL_CONDEMNED) { 5413 mutex_exit(&ill->ill_lock); 5414 return (B_FALSE); 5415 } 5416 ill->ill_waiters++; 5417 mutex_exit(&ill->ill_lock); 5418 return (B_TRUE); 5419 } 5420 5421 void 5422 ill_waiter_dcr(ill_t *ill) 5423 { 5424 mutex_enter(&ill->ill_lock); 5425 ill->ill_waiters--; 5426 if (ill->ill_waiters == 0) 5427 cv_broadcast(&ill->ill_cv); 5428 mutex_exit(&ill->ill_lock); 5429 } 5430 5431 /* 5432 * Named Dispatch routine to produce a formatted report on all ILLs. 5433 * This report is accessed by using the ndd utility to "get" ND variable 5434 * "ip_ill_status". 
5435 */ 5436 /* ARGSUSED */ 5437 int 5438 ip_ill_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5439 { 5440 ill_t *ill; 5441 ill_walk_context_t ctx; 5442 ip_stack_t *ipst; 5443 5444 ipst = CONNQ_TO_IPST(q); 5445 5446 (void) mi_mpprintf(mp, 5447 "ILL " MI_COL_HDRPAD_STR 5448 /* 01234567[89ABCDEF] */ 5449 "rq " MI_COL_HDRPAD_STR 5450 /* 01234567[89ABCDEF] */ 5451 "wq " MI_COL_HDRPAD_STR 5452 /* 01234567[89ABCDEF] */ 5453 "upcnt mxfrg err name"); 5454 /* 12345 12345 123 xxxxxxxx */ 5455 5456 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5457 ill = ILL_START_WALK_ALL(&ctx, ipst); 5458 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5459 (void) mi_mpprintf(mp, 5460 MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR MI_COL_PTRFMT_STR 5461 "%05u %05u %03d %s", 5462 (void *)ill, (void *)ill->ill_rq, (void *)ill->ill_wq, 5463 ill->ill_ipif_up_count, 5464 ill->ill_max_frag, ill->ill_error, ill->ill_name); 5465 } 5466 rw_exit(&ipst->ips_ill_g_lock); 5467 5468 return (0); 5469 } 5470 5471 /* 5472 * Named Dispatch routine to produce a formatted report on all IPIFs. 5473 * This report is accessed by using the ndd utility to "get" ND variable 5474 * "ip_ipif_status". 5475 */ 5476 /* ARGSUSED */ 5477 int 5478 ip_ipif_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) 5479 { 5480 char buf1[INET6_ADDRSTRLEN]; 5481 char buf2[INET6_ADDRSTRLEN]; 5482 char buf3[INET6_ADDRSTRLEN]; 5483 char buf4[INET6_ADDRSTRLEN]; 5484 char buf5[INET6_ADDRSTRLEN]; 5485 char buf6[INET6_ADDRSTRLEN]; 5486 char buf[LIFNAMSIZ]; 5487 ill_t *ill; 5488 ipif_t *ipif; 5489 nv_t *nvp; 5490 uint64_t flags; 5491 zoneid_t zoneid; 5492 ill_walk_context_t ctx; 5493 ip_stack_t *ipst = CONNQ_TO_IPST(q); 5494 5495 (void) mi_mpprintf(mp, 5496 "IPIF metric mtu in/out/forward name zone flags...\n" 5497 "\tlocal address\n" 5498 "\tsrc address\n" 5499 "\tsubnet\n" 5500 "\tmask\n" 5501 "\tbroadcast\n" 5502 "\tp-p-dst"); 5503 5504 ASSERT(q->q_next == NULL); 5505 zoneid = Q_TO_CONN(q)->conn_zoneid; /* IP is a driver */ 5506 5507 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5508 ill = ILL_START_WALK_ALL(&ctx, ipst); 5509 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5510 for (ipif = ill->ill_ipif; ipif != NULL; 5511 ipif = ipif->ipif_next) { 5512 if (zoneid != GLOBAL_ZONEID && 5513 zoneid != ipif->ipif_zoneid && 5514 ipif->ipif_zoneid != ALL_ZONES) 5515 continue; 5516 5517 ipif_get_name(ipif, buf, sizeof (buf)); 5518 (void) mi_mpprintf(mp, 5519 MI_COL_PTRFMT_STR 5520 "%04u %05u %u/%u/%u %s %d", 5521 (void *)ipif, 5522 ipif->ipif_metric, ipif->ipif_mtu, 5523 ipif->ipif_ib_pkt_count, 5524 ipif->ipif_ob_pkt_count, 5525 ipif->ipif_fo_pkt_count, 5526 buf, 5527 ipif->ipif_zoneid); 5528 5529 flags = ipif->ipif_flags | ipif->ipif_ill->ill_flags | 5530 ipif->ipif_ill->ill_phyint->phyint_flags; 5531 5532 /* Tack on text strings for any flags. 
*/ 5533 nvp = ipif_nv_tbl; 5534 for (; nvp < A_END(ipif_nv_tbl); nvp++) { 5535 if (nvp->nv_value & flags) 5536 (void) mi_mpprintf_nr(mp, " %s", 5537 nvp->nv_name); 5538 } 5539 (void) mi_mpprintf(mp, 5540 "\t%s\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s", 5541 inet_ntop(AF_INET6, 5542 &ipif->ipif_v6lcl_addr, buf1, sizeof (buf1)), 5543 inet_ntop(AF_INET6, 5544 &ipif->ipif_v6src_addr, buf2, sizeof (buf2)), 5545 inet_ntop(AF_INET6, 5546 &ipif->ipif_v6subnet, buf3, sizeof (buf3)), 5547 inet_ntop(AF_INET6, 5548 &ipif->ipif_v6net_mask, buf4, sizeof (buf4)), 5549 inet_ntop(AF_INET6, 5550 &ipif->ipif_v6brd_addr, buf5, sizeof (buf5)), 5551 inet_ntop(AF_INET6, 5552 &ipif->ipif_v6pp_dst_addr, buf6, sizeof (buf6))); 5553 } 5554 } 5555 rw_exit(&ipst->ips_ill_g_lock); 5556 return (0); 5557 } 5558 5559 /* 5560 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5561 * driver. We construct best guess defaults for lower level information that 5562 * we need. If an interface is brought up without injection of any overriding 5563 * information from outside, we have to be ready to go with these defaults. 5564 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5565 * we primarily want the dl_provider_style. 5566 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5567 * at which point we assume the other part of the information is valid. 5568 */ 5569 void 5570 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5571 { 5572 uchar_t *brdcst_addr; 5573 uint_t brdcst_addr_length, phys_addr_length; 5574 t_scalar_t sap_length; 5575 dl_info_ack_t *dlia; 5576 ip_m_t *ipm; 5577 dl_qos_cl_sel1_t *sel1; 5578 5579 ASSERT(IAM_WRITER_ILL(ill)); 5580 5581 /* 5582 * Till the ill is fully up ILL_CHANGING will be set and 5583 * the ill is not globally visible. So no need for a lock. 5584 */ 5585 dlia = (dl_info_ack_t *)mp->b_rptr; 5586 ill->ill_mactype = dlia->dl_mac_type; 5587 5588 ipm = ip_m_lookup(dlia->dl_mac_type); 5589 if (ipm == NULL) { 5590 ipm = ip_m_lookup(DL_OTHER); 5591 ASSERT(ipm != NULL); 5592 } 5593 ill->ill_media = ipm; 5594 5595 /* 5596 * When the new DLPI stuff is ready we'll pull lengths 5597 * from dlia. 5598 */ 5599 if (dlia->dl_version == DL_VERSION_2) { 5600 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5601 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5602 brdcst_addr_length); 5603 if (brdcst_addr == NULL) { 5604 brdcst_addr_length = 0; 5605 } 5606 sap_length = dlia->dl_sap_length; 5607 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5608 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5609 brdcst_addr_length, sap_length, phys_addr_length)); 5610 } else { 5611 brdcst_addr_length = 6; 5612 brdcst_addr = ip_six_byte_all_ones; 5613 sap_length = -2; 5614 phys_addr_length = brdcst_addr_length; 5615 } 5616 5617 ill->ill_bcast_addr_length = brdcst_addr_length; 5618 ill->ill_phys_addr_length = phys_addr_length; 5619 ill->ill_sap_length = sap_length; 5620 ill->ill_max_frag = dlia->dl_max_sdu; 5621 ill->ill_max_mtu = ill->ill_max_frag; 5622 5623 ill->ill_type = ipm->ip_m_type; 5624 5625 if (!ill->ill_dlpi_style_set) { 5626 if (dlia->dl_provider_style == DL_STYLE2) 5627 ill->ill_needs_attach = 1; 5628 5629 /* 5630 * Allocate the first ipif on this ill. We don't delay it 5631 * further as ioctl handling assumes at least one ipif to 5632 * be present. 5633 * 5634 * At this point we don't know whether the ill is v4 or v6.
5635 * We will know this when the SIOCSLIFNAME happens and 5636 * the correct value for ill_isv6 will be assigned in 5637 * ipif_set_values(). We need to hold the ill lock and 5638 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5639 * the wakeup. 5640 */ 5641 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5642 dlia->dl_provider_style == DL_STYLE2 ? B_FALSE : B_TRUE); 5643 mutex_enter(&ill->ill_lock); 5644 ASSERT(ill->ill_dlpi_style_set == 0); 5645 ill->ill_dlpi_style_set = 1; 5646 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5647 cv_broadcast(&ill->ill_cv); 5648 mutex_exit(&ill->ill_lock); 5649 freemsg(mp); 5650 return; 5651 } 5652 ASSERT(ill->ill_ipif != NULL); 5653 /* 5654 * We know whether it is IPv4 or IPv6 now, as this is the 5655 * second DL_INFO_ACK we are receiving in response to the 5656 * DL_INFO_REQ sent in ipif_set_values. 5657 */ 5658 if (ill->ill_isv6) 5659 ill->ill_sap = IP6_DL_SAP; 5660 else 5661 ill->ill_sap = IP_DL_SAP; 5662 /* 5663 * Set ipif_mtu which is used to set the IRE's 5664 * ire_max_frag value. The driver could have sent 5665 * a different mtu from what it sent last time. No 5666 * need to call ipif_mtu_change because IREs have 5667 * not yet been created. 5668 */ 5669 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5670 /* 5671 * Clear all the flags that were set based on ill_bcast_addr_length 5672 * and ill_phys_addr_length (in ipif_set_values) as these could have 5673 * changed now and we need to re-evaluate. 5674 */ 5675 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5676 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5677 5678 /* 5679 * Free ill_resolver_mp and ill_bcast_mp as things could have 5680 * changed now. 5681 */ 5682 if (ill->ill_bcast_addr_length == 0) { 5683 if (ill->ill_resolver_mp != NULL) 5684 freemsg(ill->ill_resolver_mp); 5685 if (ill->ill_bcast_mp != NULL) 5686 freemsg(ill->ill_bcast_mp); 5687 if (ill->ill_flags & ILLF_XRESOLV) 5688 ill->ill_net_type = IRE_IF_RESOLVER; 5689 else 5690 ill->ill_net_type = IRE_IF_NORESOLVER; 5691 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5692 ill->ill_phys_addr_length, 5693 ill->ill_sap, 5694 ill->ill_sap_length); 5695 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5696 5697 if (ill->ill_isv6) 5698 /* 5699 * Note: xresolv interfaces will eventually need NOARP 5700 * set here as well, but that will require those 5701 * external resolvers to have some knowledge of 5702 * that flag and act appropriately. Not to be changed 5703 * at present. 5704 */ 5705 ill->ill_flags |= ILLF_NONUD; 5706 else 5707 ill->ill_flags |= ILLF_NOARP; 5708 5709 if (ill->ill_phys_addr_length == 0) { 5710 if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 5711 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5712 ill->ill_phyint->phyint_flags |= PHYI_VIRTUAL; 5713 } else { 5714 /* pt-pt supports multicast. */ 5715 ill->ill_flags |= ILLF_MULTICAST; 5716 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5717 } 5718 } 5719 } else { 5720 ill->ill_net_type = IRE_IF_RESOLVER; 5721 if (ill->ill_bcast_mp != NULL) 5722 freemsg(ill->ill_bcast_mp); 5723 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5724 ill->ill_bcast_addr_length, ill->ill_sap, 5725 ill->ill_sap_length); 5726 /* 5727 * Later detect lack of DLPI driver multicast 5728 * capability by catching DL_ENABMULTI errors in 5729 * ip_rput_dlpi.
5730 */ 5731 ill->ill_flags |= ILLF_MULTICAST; 5732 if (!ill->ill_isv6) 5733 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5734 } 5735 /* By default an interface does not support any CoS marking */ 5736 ill->ill_flags &= ~ILLF_COS_ENABLED; 5737 5738 /* 5739 * If we get QoS information in DL_INFO_ACK, the device supports 5740 * some form of CoS marking, set ILLF_COS_ENABLED. 5741 */ 5742 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5743 dlia->dl_qos_length); 5744 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5745 ill->ill_flags |= ILLF_COS_ENABLED; 5746 } 5747 5748 /* Clear any previous error indication. */ 5749 ill->ill_error = 0; 5750 freemsg(mp); 5751 } 5752 5753 /* 5754 * Perform various checks to verify that an address would make sense as a 5755 * local, remote, or subnet interface address. 5756 */ 5757 static boolean_t 5758 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5759 { 5760 ipaddr_t net_mask; 5761 5762 /* 5763 * Don't allow all zeroes or all ones, but do allow 5764 * an all-ones netmask. 5765 */ 5766 if ((net_mask = ip_net_mask(addr)) == 0) 5767 return (B_FALSE); 5768 /* A given netmask overrides the "guess" netmask */ 5769 if (subnet_mask != 0) 5770 net_mask = subnet_mask; 5771 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5772 (addr == (addr | ~net_mask)))) { 5773 return (B_FALSE); 5774 } 5775 5776 /* 5777 * Even if the netmask is all ones, we do not allow address to be 5778 * 255.255.255.255 5779 */ 5780 if (addr == INADDR_BROADCAST) 5781 return (B_FALSE); 5782 5783 if (CLASSD(addr)) 5784 return (B_FALSE); 5785 5786 return (B_TRUE); 5787 } 5788 5789 #define V6_IPIF_LINKLOCAL(p) \ 5790 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 5791 5792 /* 5793 * Compare two given ipifs and check if the second one is better than 5794 * the first one using the order of preference (not taking deprecated 5795 * into account) specified in ipif_lookup_multicast(). 5796 */ 5797 static boolean_t 5798 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 5799 { 5800 /* Check the least preferred first. */ 5801 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 5802 /* If both ipifs are the same, use the first one. */ 5803 if (IS_LOOPBACK(new_ipif->ipif_ill)) 5804 return (B_FALSE); 5805 else 5806 return (B_TRUE); 5807 } 5808 5809 /* For IPv6, check for link local address. */ 5810 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 5811 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5812 V6_IPIF_LINKLOCAL(new_ipif)) { 5813 /* The second one is equally or less preferred. */ 5814 return (B_FALSE); 5815 } else { 5816 return (B_TRUE); 5817 } 5818 } 5819 5820 /* Then check for point to point interface. */ 5821 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 5822 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5823 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 5824 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 5825 return (B_FALSE); 5826 } else { 5827 return (B_TRUE); 5828 } 5829 } 5830 5831 /* old_ipif is a normal interface, so no need to use the new one. */ 5832 return (B_FALSE); 5833 } 5834 5835 /* 5836 * Find any non-virtual, not condemned, and up multicast capable interface 5837 * given an IP instance and zoneid. Order of preference is: 5838 * 5839 * 1. normal 5840 * 1.1 normal, but deprecated 5841 * 2. point to point 5842 * 2.1 point to point, but deprecated 5843 * 3. link local 5844 * 3.1 link local, but deprecated 5845 * 4. loopback.
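 *
 * As an illustrative sketch only (a paraphrase of the checks in
 * ipif_comp_multi() above, not code from this file), the preference
 * can be modeled as a score where the lower value wins:
 *
 *	score = IS_LOOPBACK(ill) ? 6 :
 *	    (isv6 && V6_IPIF_LINKLOCAL(ipif)) ? 4 :
 *	    (ipif->ipif_flags & IPIF_POINTOPOINT) ? 2 : 0;
 *	if (ipif->ipif_flags & IPIF_DEPRECATED)
 *		score++;	(the "x.1, but deprecated" entries)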
5846 */ 5847 ipif_t * 5848 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 5849 { 5850 ill_t *ill; 5851 ill_walk_context_t ctx; 5852 ipif_t *ipif; 5853 ipif_t *saved_ipif = NULL; 5854 ipif_t *dep_ipif = NULL; 5855 5856 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5857 if (isv6) 5858 ill = ILL_START_WALK_V6(&ctx, ipst); 5859 else 5860 ill = ILL_START_WALK_V4(&ctx, ipst); 5861 5862 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5863 mutex_enter(&ill->ill_lock); 5864 if (IS_VNI(ill) || !ILL_CAN_LOOKUP(ill) || 5865 !(ill->ill_flags & ILLF_MULTICAST)) { 5866 mutex_exit(&ill->ill_lock); 5867 continue; 5868 } 5869 for (ipif = ill->ill_ipif; ipif != NULL; 5870 ipif = ipif->ipif_next) { 5871 if (zoneid != ipif->ipif_zoneid && 5872 zoneid != ALL_ZONES && 5873 ipif->ipif_zoneid != ALL_ZONES) { 5874 continue; 5875 } 5876 if (!(ipif->ipif_flags & IPIF_UP) || 5877 !IPIF_CAN_LOOKUP(ipif)) { 5878 continue; 5879 } 5880 5881 /* 5882 * Found one candidate. If it is deprecated, 5883 * remember it in dep_ipif. If it is not deprecated, 5884 * remember it in saved_ipif. 5885 */ 5886 if (ipif->ipif_flags & IPIF_DEPRECATED) { 5887 if (dep_ipif == NULL) { 5888 dep_ipif = ipif; 5889 } else if (ipif_comp_multi(dep_ipif, ipif, 5890 isv6)) { 5891 /* 5892 * If the previous dep_ipif does not 5893 * belong to the same ill, we've done 5894 * an ipif_refhold() on it. So we need 5895 * to release it. 5896 */ 5897 if (dep_ipif->ipif_ill != ill) 5898 ipif_refrele(dep_ipif); 5899 dep_ipif = ipif; 5900 } 5901 continue; 5902 } 5903 if (saved_ipif == NULL) { 5904 saved_ipif = ipif; 5905 } else { 5906 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 5907 if (saved_ipif->ipif_ill != ill) 5908 ipif_refrele(saved_ipif); 5909 saved_ipif = ipif; 5910 } 5911 } 5912 } 5913 /* 5914 * Before going to the next ill, do an ipif_refhold() on the 5915 * saved ones. 5916 */ 5917 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 5918 ipif_refhold_locked(saved_ipif); 5919 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 5920 ipif_refhold_locked(dep_ipif); 5921 mutex_exit(&ill->ill_lock); 5922 } 5923 rw_exit(&ipst->ips_ill_g_lock); 5924 5925 /* 5926 * If we have only the saved_ipif, return it. But if we have both 5927 * saved_ipif and dep_ipif, check to see which one is better. 5928 */ 5929 if (saved_ipif != NULL) { 5930 if (dep_ipif != NULL) { 5931 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 5932 ipif_refrele(saved_ipif); 5933 return (dep_ipif); 5934 } else { 5935 ipif_refrele(dep_ipif); 5936 return (saved_ipif); 5937 } 5938 } 5939 return (saved_ipif); 5940 } else { 5941 return (dep_ipif); 5942 } 5943 } 5944 5945 /* 5946 * This function is called when an application does not specify an interface 5947 * to be used for multicast traffic (joining a group/sending data). It 5948 * calls ire_lookup_multi() to look for an interface route for the 5949 * specified multicast group. Doing this allows the administrator to add 5950 * prefix routes for multicast to indicate which interface to be used for 5951 * multicast traffic in the above scenario. The route could be for all 5952 * multicast (224.0/4), for a single multicast group (a /32 route) or 5953 * anything in between. If there is no such multicast route, we just find 5954 * any multicast capable interface and return it. The returned ipif 5955 * is refhold'ed.
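 *
 * For illustration (hypothetical administrator commands with a made-up
 * interface address; not taken from this file), such prefix routes
 * might be added with route(1M) along the lines of:
 *
 *	route add -interface 224.0.0.0 -netmask 240.0.0.0 192.0.2.33
 *	route add -interface 239.1.1.1 -netmask 255.255.255.255 192.0.2.33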
5956 */ 5957 ipif_t * 5958 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) 5959 { 5960 ire_t *ire; 5961 ipif_t *ipif; 5962 5963 ire = ire_lookup_multi(group, zoneid, ipst); 5964 if (ire != NULL) { 5965 ipif = ire->ire_ipif; 5966 ipif_refhold(ipif); 5967 ire_refrele(ire); 5968 return (ipif); 5969 } 5970 5971 return (ipif_lookup_multicast(ipst, zoneid, B_FALSE)); 5972 } 5973 5974 /* 5975 * Look for an ipif with the specified interface address and destination. 5976 * The destination address is used only for matching point-to-point interfaces. 5977 */ 5978 ipif_t * 5979 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5980 ipsq_func_t func, int *error, ip_stack_t *ipst) 5981 { 5982 ipif_t *ipif; 5983 ill_t *ill; 5984 ill_walk_context_t ctx; 5985 ipsq_t *ipsq; 5986 5987 if (error != NULL) 5988 *error = 0; 5989 5990 /* 5991 * First match all the point-to-point interfaces 5992 * before looking at non-point-to-point interfaces. 5993 * This is done to avoid returning a non-point-to-point 5994 * ipif instead of an unnumbered point-to-point ipif. 5995 */ 5996 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5997 ill = ILL_START_WALK_V4(&ctx, ipst); 5998 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5999 GRAB_CONN_LOCK(q); 6000 mutex_enter(&ill->ill_lock); 6001 for (ipif = ill->ill_ipif; ipif != NULL; 6002 ipif = ipif->ipif_next) { 6003 /* Allow the ipif to be down */ 6004 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 6005 (ipif->ipif_lcl_addr == if_addr) && 6006 (ipif->ipif_pp_dst_addr == dst)) { 6007 /* 6008 * The block comment at the start of ipif_down 6009 * explains the use of the macros used below 6010 */ 6011 if (IPIF_CAN_LOOKUP(ipif)) { 6012 ipif_refhold_locked(ipif); 6013 mutex_exit(&ill->ill_lock); 6014 RELEASE_CONN_LOCK(q); 6015 rw_exit(&ipst->ips_ill_g_lock); 6016 return (ipif); 6017 } else if (IPIF_CAN_WAIT(ipif, q)) { 6018 ipsq = ill->ill_phyint->phyint_ipsq; 6019 mutex_enter(&ipsq->ipsq_lock); 6020 mutex_exit(&ill->ill_lock); 6021 rw_exit(&ipst->ips_ill_g_lock); 6022 ipsq_enq(ipsq, q, mp, func, NEW_OP, 6023 ill); 6024 mutex_exit(&ipsq->ipsq_lock); 6025 RELEASE_CONN_LOCK(q); 6026 if (error != NULL) 6027 *error = EINPROGRESS; 6028 return (NULL); 6029 } 6030 } 6031 } 6032 mutex_exit(&ill->ill_lock); 6033 RELEASE_CONN_LOCK(q); 6034 } 6035 rw_exit(&ipst->ips_ill_g_lock); 6036 6037 /* lookup the ipif based on interface address */ 6038 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error, 6039 ipst); 6040 ASSERT(ipif == NULL || !ipif->ipif_isv6); 6041 return (ipif); 6042 } 6043 6044 /* 6045 * Look for an ipif with the specified address. For point-point links 6046 * we look for matches on either the destination address or the local 6047 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 6048 * is set. 6049 * Matches on a specific ill if match_ill is set. 6050 */ 6051 ipif_t * 6052 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 6053 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 6054 { 6055 ipif_t *ipif; 6056 ill_t *ill; 6057 boolean_t ptp = B_FALSE; 6058 ipsq_t *ipsq; 6059 ill_walk_context_t ctx; 6060 6061 if (error != NULL) 6062 *error = 0; 6063 6064 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6065 /* 6066 * Repeat twice, first based on local addresses and 6067 * next time for pointopoint.
6068 */ 6069 repeat: 6070 ill = ILL_START_WALK_V4(&ctx, ipst); 6071 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6072 if (match_ill != NULL && ill != match_ill) { 6073 continue; 6074 } 6075 GRAB_CONN_LOCK(q); 6076 mutex_enter(&ill->ill_lock); 6077 for (ipif = ill->ill_ipif; ipif != NULL; 6078 ipif = ipif->ipif_next) { 6079 if (zoneid != ALL_ZONES && 6080 zoneid != ipif->ipif_zoneid && 6081 ipif->ipif_zoneid != ALL_ZONES) 6082 continue; 6083 /* Allow the ipif to be down */ 6084 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 6085 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 6086 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 6087 (ipif->ipif_pp_dst_addr == addr))) { 6088 /* 6089 * The block comment at the start of ipif_down 6090 * explains the use of the macros used below 6091 */ 6092 if (IPIF_CAN_LOOKUP(ipif)) { 6093 ipif_refhold_locked(ipif); 6094 mutex_exit(&ill->ill_lock); 6095 RELEASE_CONN_LOCK(q); 6096 rw_exit(&ipst->ips_ill_g_lock); 6097 return (ipif); 6098 } else if (IPIF_CAN_WAIT(ipif, q)) { 6099 ipsq = ill->ill_phyint->phyint_ipsq; 6100 mutex_enter(&ipsq->ipsq_lock); 6101 mutex_exit(&ill->ill_lock); 6102 rw_exit(&ipst->ips_ill_g_lock); 6103 ipsq_enq(ipsq, q, mp, func, NEW_OP, 6104 ill); 6105 mutex_exit(&ipsq->ipsq_lock); 6106 RELEASE_CONN_LOCK(q); 6107 if (error != NULL) 6108 *error = EINPROGRESS; 6109 return (NULL); 6110 } 6111 } 6112 } 6113 mutex_exit(&ill->ill_lock); 6114 RELEASE_CONN_LOCK(q); 6115 } 6116 6117 /* If we already did the ptp case, then we are done */ 6118 if (ptp) { 6119 rw_exit(&ipst->ips_ill_g_lock); 6120 if (error != NULL) 6121 *error = ENXIO; 6122 return (NULL); 6123 } 6124 ptp = B_TRUE; 6125 goto repeat; 6126 } 6127 6128 /* 6129 * Look for an ipif with the specified address. For point-point links 6130 * we look for matches on either the destination address or the local 6131 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 6132 * is set. 6133 * Matches on a specific ill if match_ill is set. 6134 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 6135 */ 6136 zoneid_t 6137 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 6138 { 6139 zoneid_t zoneid; 6140 ipif_t *ipif; 6141 ill_t *ill; 6142 boolean_t ptp = B_FALSE; 6143 ill_walk_context_t ctx; 6144 6145 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6146 /* 6147 * Repeat twice, first based on local addresses and 6148 * next time for pointopoint. 6149 */ 6150 repeat: 6151 ill = ILL_START_WALK_V4(&ctx, ipst); 6152 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 6153 if (match_ill != NULL && ill != match_ill) { 6154 continue; 6155 } 6156 mutex_enter(&ill->ill_lock); 6157 for (ipif = ill->ill_ipif; ipif != NULL; 6158 ipif = ipif->ipif_next) { 6159 /* Allow the ipif to be down */ 6160 if (((!ptp && (ipif->ipif_lcl_addr == addr) && 6161 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 6162 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 6163 (ipif->ipif_pp_dst_addr == addr))) && 6164 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 6165 zoneid = ipif->ipif_zoneid; 6166 mutex_exit(&ill->ill_lock); 6167 rw_exit(&ipst->ips_ill_g_lock); 6168 /* 6169 * If ipif_zoneid was ALL_ZONES then we have 6170 * a trusted extensions shared IP address. 6171 * In that case GLOBAL_ZONEID works to send.
6172 */ 6173 if (zoneid == ALL_ZONES) 6174 zoneid = GLOBAL_ZONEID; 6175 return (zoneid); 6176 } 6177 } 6178 mutex_exit(&ill->ill_lock); 6179 } 6180 6181 /* If we already did the ptp case, then we are done */ 6182 if (ptp) { 6183 rw_exit(&ipst->ips_ill_g_lock); 6184 return (ALL_ZONES); 6185 } 6186 ptp = B_TRUE; 6187 goto repeat; 6188 } 6189 6190 /* 6191 * Look for an ipif that matches the specified remote address i.e. the 6192 * ipif that would receive the specified packet. 6193 * First look for directly connected interfaces and then do a recursive 6194 * IRE lookup and pick the first ipif corresponding to the source address in the 6195 * ire. 6196 * Returns: held ipif 6197 */ 6198 ipif_t * 6199 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 6200 { 6201 ipif_t *ipif; 6202 ire_t *ire; 6203 ip_stack_t *ipst = ill->ill_ipst; 6204 6205 ASSERT(!ill->ill_isv6); 6206 6207 /* 6208 * Someone could be changing this ipif currently or change it 6209 * after we return this. Thus a few packets could use the old 6210 * values. However structure updates/creates (ire, ilg, ilm etc) 6211 * will atomically be updated or cleaned up with the new value. 6212 * Thus we don't need a lock to check the flags or other attrs below. 6213 */ 6214 mutex_enter(&ill->ill_lock); 6215 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6216 if (!IPIF_CAN_LOOKUP(ipif)) 6217 continue; 6218 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 6219 ipif->ipif_zoneid != ALL_ZONES) 6220 continue; 6221 /* Allow the ipif to be down */ 6222 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 6223 if ((ipif->ipif_pp_dst_addr == addr) || 6224 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 6225 ipif->ipif_lcl_addr == addr)) { 6226 ipif_refhold_locked(ipif); 6227 mutex_exit(&ill->ill_lock); 6228 return (ipif); 6229 } 6230 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 6231 ipif_refhold_locked(ipif); 6232 mutex_exit(&ill->ill_lock); 6233 return (ipif); 6234 } 6235 } 6236 mutex_exit(&ill->ill_lock); 6237 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 6238 NULL, MATCH_IRE_RECURSIVE, ipst); 6239 if (ire != NULL) { 6240 /* 6241 * The callers of this function want to know the 6242 * interface on which they have to send the replies 6243 * back. For IRE_CACHES that have ire_stq and ire_ipif 6244 * derived from different ills, we really don't care 6245 * what we return here. 6246 */ 6247 ipif = ire->ire_ipif; 6248 if (ipif != NULL) { 6249 ipif_refhold(ipif); 6250 ire_refrele(ire); 6251 return (ipif); 6252 } 6253 ire_refrele(ire); 6254 } 6255 /* Pick the first interface */ 6256 ipif = ipif_get_next_ipif(NULL, ill); 6257 return (ipif); 6258 } 6259 6260 /* 6261 * This func does not prevent refcnt from increasing. But if 6262 * the caller has taken steps to that effect, then this func 6263 * can be used to determine whether the ill has become quiescent 6264 */ 6265 boolean_t 6266 ill_is_quiescent(ill_t *ill) 6267 { 6268 ipif_t *ipif; 6269 6270 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6271 6272 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6273 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6274 return (B_FALSE); 6275 } 6276 } 6277 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0 || 6278 ill->ill_nce_cnt != 0) { 6279 return (B_FALSE); 6280 } 6281 return (B_TRUE); 6282 } 6283 6284 /* 6285 * This func does not prevent refcnt from increasing.
But if 6286 * the caller has taken steps to that effect, then this func 6287 * can be used to determine whether the ipif has become quiescent 6288 */ 6289 static boolean_t 6290 ipif_is_quiescent(ipif_t *ipif) 6291 { 6292 ill_t *ill; 6293 6294 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6295 6296 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6297 return (B_FALSE); 6298 } 6299 6300 ill = ipif->ipif_ill; 6301 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 6302 ill->ill_logical_down) { 6303 return (B_TRUE); 6304 } 6305 6306 /* This is the last ipif going down or being deleted on this ill */ 6307 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 6308 return (B_FALSE); 6309 } 6310 6311 return (B_TRUE); 6312 } 6313 6314 /* 6315 * This func does not prevent refcnt from increasing. But if 6316 * the caller has taken steps to that effect, then this func 6317 * can be used to determine whether the ipifs marked with IPIF_MOVING 6318 * have become quiescent and can be moved in a failover/failback. 6319 */ 6320 static ipif_t * 6321 ill_quiescent_to_move(ill_t *ill) 6322 { 6323 ipif_t *ipif; 6324 6325 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6326 6327 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6328 if (ipif->ipif_state_flags & IPIF_MOVING) { 6329 if (ipif->ipif_refcnt != 0 || ipif->ipif_ire_cnt != 0) { 6330 return (ipif); 6331 } 6332 } 6333 } 6334 return (NULL); 6335 } 6336 6337 /* 6338 * The ipif/ill/ire has been refreled. Do the tail processing. 6339 * Determine if the ipif or ill in question has become quiescent and if so 6340 * wake up close and/or restart any queued pending ioctl that is waiting 6341 * for the ipif_down (or ill_down). 6342 */ 6343 void 6344 ipif_ill_refrele_tail(ill_t *ill) 6345 { 6346 mblk_t *mp; 6347 conn_t *connp; 6348 ipsq_t *ipsq; 6349 ipif_t *ipif; 6350 dl_notify_ind_t *dlindp; 6351 6352 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6353 6354 if ((ill->ill_state_flags & ILL_CONDEMNED) && 6355 ill_is_quiescent(ill)) { 6356 /* ill_close may be waiting */ 6357 cv_broadcast(&ill->ill_cv); 6358 } 6359 6360 /* ipsq can't change because ill_lock is held */ 6361 ipsq = ill->ill_phyint->phyint_ipsq; 6362 if (ipsq->ipsq_waitfor == 0) { 6363 /* Not waiting for anything, just return. */ 6364 mutex_exit(&ill->ill_lock); 6365 return; 6366 } 6367 ASSERT(ipsq->ipsq_pending_mp != NULL && 6368 ipsq->ipsq_pending_ipif != NULL); 6369 /* 6370 * ipif->ipif_refcnt must go down to zero for restarting REMOVEIF. 6371 * Last ipif going down needs to down the ill, so ill_ire_cnt must 6372 * be zero for restarting an ioctl that ends up downing the ill. 6373 */ 6374 ipif = ipsq->ipsq_pending_ipif; 6375 if (ipif->ipif_ill != ill) { 6376 /* The ioctl is pending on some other ill. */ 6377 mutex_exit(&ill->ill_lock); 6378 return; 6379 } 6380 6381 switch (ipsq->ipsq_waitfor) { 6382 case IPIF_DOWN: 6383 case IPIF_FREE: 6384 if (!ipif_is_quiescent(ipif)) { 6385 mutex_exit(&ill->ill_lock); 6386 return; 6387 } 6388 break; 6389 6390 case ILL_DOWN: 6391 case ILL_FREE: 6392 /* 6393 * case ILL_FREE arises only for loopback.
Otherwise ill_delete 6394 * waits synchronously in ip_close, and no message is queued in 6395 * ipsq_pending_mp at all in this case. 6396 */ 6397 if (!ill_is_quiescent(ill)) { 6398 mutex_exit(&ill->ill_lock); 6399 return; 6400 } 6401 6402 break; 6403 6404 case ILL_MOVE_OK: 6405 if (ill_quiescent_to_move(ill) != NULL) { 6406 mutex_exit(&ill->ill_lock); 6407 return; 6408 } 6409 6410 break; 6411 default: 6412 cmn_err(CE_PANIC, "ipsq: %p unknown ipsq_waitfor %d\n", 6413 (void *)ipsq, ipsq->ipsq_waitfor); 6414 } 6415 6416 /* 6417 * Incr refcnt for the qwriter_ip call below which 6418 * does a refrele 6419 */ 6420 ill_refhold_locked(ill); 6421 mutex_exit(&ill->ill_lock); 6422 6423 mp = ipsq_pending_mp_get(ipsq, &connp); 6424 ASSERT(mp != NULL); 6425 6426 /* 6427 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 6428 * we can only get here when the current operation decides 6429 * it needs to quiesce via ipsq_pending_mp_add(). 6430 */ 6431 switch (mp->b_datap->db_type) { 6432 case M_PCPROTO: 6433 case M_PROTO: 6434 /* 6435 * For now, only DL_NOTIFY_IND messages can use this facility. 6436 */ 6437 dlindp = (dl_notify_ind_t *)mp->b_rptr; 6438 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 6439 6440 switch (dlindp->dl_notification) { 6441 case DL_NOTE_PHYS_ADDR: 6442 qwriter_ip(ill, ill->ill_rq, mp, 6443 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 6444 return; 6445 default: 6446 ASSERT(0); 6447 } 6448 break; 6449 6450 case M_ERROR: 6451 case M_HANGUP: 6452 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 6453 B_TRUE); 6454 return; 6455 6456 case M_IOCTL: 6457 case M_IOCDATA: 6458 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) : 6459 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 6460 return; 6461 6462 default: 6463 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 6464 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 6465 } 6466 } 6467 6468 #ifdef DEBUG 6469 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 6470 static void 6471 th_trace_rrecord(th_trace_t *th_trace) 6472 { 6473 tr_buf_t *tr_buf; 6474 uint_t lastref; 6475 6476 lastref = th_trace->th_trace_lastref; 6477 lastref++; 6478 if (lastref == TR_BUF_MAX) 6479 lastref = 0; 6480 th_trace->th_trace_lastref = lastref; 6481 tr_buf = &th_trace->th_trbuf[lastref]; 6482 tr_buf->tr_time = lbolt; 6483 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 6484 } 6485 6486 static void 6487 th_trace_free(void *value) 6488 { 6489 th_trace_t *th_trace = value; 6490 6491 ASSERT(th_trace->th_refcnt == 0); 6492 kmem_free(th_trace, sizeof (*th_trace)); 6493 } 6494 6495 /* 6496 * Find or create the per-thread hash table used to track object references. 6497 * The ipst argument is NULL if we shouldn't allocate. 6498 * 6499 * Accesses per-thread data, so there's no need to lock here. 6500 */ 6501 static mod_hash_t * 6502 th_trace_gethash(ip_stack_t *ipst) 6503 { 6504 th_hash_t *thh; 6505 6506 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 6507 mod_hash_t *mh; 6508 char name[256]; 6509 size_t objsize, rshift; 6510 int retv; 6511 6512 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 6513 return (NULL); 6514 (void) snprintf(name, sizeof (name), "th_trace_%p", curthread); 6515 6516 /* 6517 * We use mod_hash_create_extended here rather than the more 6518 * obvious mod_hash_create_ptrhash because the latter has a 6519 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 6520 * block.
6521 */ 6522 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 6523 MAX(sizeof (ire_t), sizeof (nce_t))); 6524 rshift = highbit(objsize); 6525 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 6526 th_trace_free, mod_hash_byptr, (void *)rshift, 6527 mod_hash_ptrkey_cmp, KM_NOSLEEP); 6528 if (mh == NULL) { 6529 kmem_free(thh, sizeof (*thh)); 6530 return (NULL); 6531 } 6532 thh->thh_hash = mh; 6533 thh->thh_ipst = ipst; 6534 /* 6535 * We trace ills, ipifs, ires, and nces. All of these are 6536 * per-IP-stack, so the lock on the thread list is as well. 6537 */ 6538 rw_enter(&ip_thread_rwlock, RW_WRITER); 6539 list_insert_tail(&ip_thread_list, thh); 6540 rw_exit(&ip_thread_rwlock); 6541 retv = tsd_set(ip_thread_data, thh); 6542 ASSERT(retv == 0); 6543 } 6544 return (thh != NULL ? thh->thh_hash : NULL); 6545 } 6546 6547 boolean_t 6548 th_trace_ref(const void *obj, ip_stack_t *ipst) 6549 { 6550 th_trace_t *th_trace; 6551 mod_hash_t *mh; 6552 mod_hash_val_t val; 6553 6554 if ((mh = th_trace_gethash(ipst)) == NULL) 6555 return (B_FALSE); 6556 6557 /* 6558 * Attempt to locate the trace buffer for this obj and thread. 6559 * If it does not exist, then allocate a new trace buffer and 6560 * insert into the hash. 6561 */ 6562 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 6563 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 6564 if (th_trace == NULL) 6565 return (B_FALSE); 6566 6567 th_trace->th_id = curthread; 6568 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 6569 (mod_hash_val_t)th_trace) != 0) { 6570 kmem_free(th_trace, sizeof (th_trace_t)); 6571 return (B_FALSE); 6572 } 6573 } else { 6574 th_trace = (th_trace_t *)val; 6575 } 6576 6577 ASSERT(th_trace->th_refcnt >= 0 && 6578 th_trace->th_refcnt < TR_BUF_MAX - 1); 6579 6580 th_trace->th_refcnt++; 6581 th_trace_rrecord(th_trace); 6582 return (B_TRUE); 6583 } 6584 6585 /* 6586 * For the purpose of tracing a reference release, we assume that global 6587 * tracing is always on and that the same thread that initiated the reference 6588 * hold is releasing it. 6589 */ 6590 void 6591 th_trace_unref(const void *obj) 6592 { 6593 int retv; 6594 mod_hash_t *mh; 6595 th_trace_t *th_trace; 6596 mod_hash_val_t val; 6597 6598 mh = th_trace_gethash(NULL); 6599 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 6600 ASSERT(retv == 0); 6601 th_trace = (th_trace_t *)val; 6602 6603 ASSERT(th_trace->th_refcnt > 0); 6604 th_trace->th_refcnt--; 6605 th_trace_rrecord(th_trace); 6606 } 6607 6608 /* 6609 * If tracing has been disabled, then we assume that the reference counts are 6610 * now useless, and we clear them out before destroying the entries.
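 *
 * As an illustrative pairing (paraphrasing the DEBUG-only wrappers
 * that follow, not new behavior), every hold/release is bracketed as:
 *
 *	mutex_enter(&ill->ill_lock);
 *	ipif->ipif_refcnt++;
 *	IPIF_TRACE_REF(ipif);	(records a stack trace in the ring)
 *	...
 *	ipif->ipif_refcnt--;
 *	IPIF_UNTRACE_REF(ipif);	(records the matching release)
 *	mutex_exit(&ill->ill_lock);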
6611 */ 6612 void 6613 th_trace_cleanup(const void *obj, boolean_t trace_disable) 6614 { 6615 th_hash_t *thh; 6616 mod_hash_t *mh; 6617 mod_hash_val_t val; 6618 th_trace_t *th_trace; 6619 int retv; 6620 6621 rw_enter(&ip_thread_rwlock, RW_READER); 6622 for (thh = list_head(&ip_thread_list); thh != NULL; 6623 thh = list_next(&ip_thread_list, thh)) { 6624 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 6625 &val) == 0) { 6626 th_trace = (th_trace_t *)val; 6627 if (trace_disable) 6628 th_trace->th_refcnt = 0; 6629 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 6630 ASSERT(retv == 0); 6631 } 6632 } 6633 rw_exit(&ip_thread_rwlock); 6634 } 6635 6636 void 6637 ipif_trace_ref(ipif_t *ipif) 6638 { 6639 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6640 6641 if (ipif->ipif_trace_disable) 6642 return; 6643 6644 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 6645 ipif->ipif_trace_disable = B_TRUE; 6646 ipif_trace_cleanup(ipif); 6647 } 6648 } 6649 6650 void 6651 ipif_untrace_ref(ipif_t *ipif) 6652 { 6653 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6654 6655 if (!ipif->ipif_trace_disable) 6656 th_trace_unref(ipif); 6657 } 6658 6659 void 6660 ill_trace_ref(ill_t *ill) 6661 { 6662 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6663 6664 if (ill->ill_trace_disable) 6665 return; 6666 6667 if (!th_trace_ref(ill, ill->ill_ipst)) { 6668 ill->ill_trace_disable = B_TRUE; 6669 ill_trace_cleanup(ill); 6670 } 6671 } 6672 6673 void 6674 ill_untrace_ref(ill_t *ill) 6675 { 6676 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6677 6678 if (!ill->ill_trace_disable) 6679 th_trace_unref(ill); 6680 } 6681 6682 /* 6683 * Called when ipif is unplumbed or when memory alloc fails. Note that on 6684 * failure, ipif_trace_disable is set. 6685 */ 6686 static void 6687 ipif_trace_cleanup(const ipif_t *ipif) 6688 { 6689 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 6690 } 6691 6692 /* 6693 * Called when ill is unplumbed or when memory alloc fails. Note that on 6694 * failure, ill_trace_disable is set. 6695 */ 6696 static void 6697 ill_trace_cleanup(const ill_t *ill) 6698 { 6699 th_trace_cleanup(ill, ill->ill_trace_disable); 6700 } 6701 #endif /* DEBUG */ 6702 6703 void 6704 ipif_refhold_locked(ipif_t *ipif) 6705 { 6706 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6707 ipif->ipif_refcnt++; 6708 IPIF_TRACE_REF(ipif); 6709 } 6710 6711 void 6712 ipif_refhold(ipif_t *ipif) 6713 { 6714 ill_t *ill; 6715 6716 ill = ipif->ipif_ill; 6717 mutex_enter(&ill->ill_lock); 6718 ipif->ipif_refcnt++; 6719 IPIF_TRACE_REF(ipif); 6720 mutex_exit(&ill->ill_lock); 6721 } 6722 6723 /* 6724 * Must not be called while holding any locks. Otherwise if this is 6725 * the last reference to be released there is a chance of recursive mutex 6726 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6727 * to restart an ioctl. 6728 */ 6729 void 6730 ipif_refrele(ipif_t *ipif) 6731 { 6732 ill_t *ill; 6733 6734 ill = ipif->ipif_ill; 6735 6736 mutex_enter(&ill->ill_lock); 6737 ASSERT(ipif->ipif_refcnt != 0); 6738 ipif->ipif_refcnt--; 6739 IPIF_UNTRACE_REF(ipif); 6740 if (ipif->ipif_refcnt != 0) { 6741 mutex_exit(&ill->ill_lock); 6742 return; 6743 } 6744 6745 /* Drops the ill_lock */ 6746 ipif_ill_refrele_tail(ill); 6747 } 6748 6749 ipif_t * 6750 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6751 { 6752 ipif_t *ipif; 6753 6754 mutex_enter(&ill->ill_lock); 6755 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 6756 ipif != NULL; ipif = ipif->ipif_next) { 6757 if (!IPIF_CAN_LOOKUP(ipif)) 6758 continue; 6759 ipif_refhold_locked(ipif); 6760 mutex_exit(&ill->ill_lock); 6761 return (ipif); 6762 } 6763 mutex_exit(&ill->ill_lock); 6764 return (NULL); 6765 } 6766 6767 /* 6768 * TODO: make this table extendible at run time 6769 * Return a pointer to the mac type info for 'mac_type' 6770 */ 6771 static ip_m_t * 6772 ip_m_lookup(t_uscalar_t mac_type) 6773 { 6774 ip_m_t *ipm; 6775 6776 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6777 if (ipm->ip_m_mac_type == mac_type) 6778 return (ipm); 6779 return (NULL); 6780 } 6781 6782 /* 6783 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6784 * ipif_arg is passed in to associate it with the correct interface. 6785 * We may need to restart this operation if the ipif cannot be looked up 6786 * due to an exclusive operation that is currently in progress. The restart 6787 * entry point is specified by 'func'. 6788 */ 6789 int 6790 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6791 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg, 6792 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func, 6793 struct rtsa_s *sp, ip_stack_t *ipst) 6794 { 6795 ire_t *ire; 6796 ire_t *gw_ire = NULL; 6797 ipif_t *ipif = NULL; 6798 boolean_t ipif_refheld = B_FALSE; 6799 uint_t type; 6800 int match_flags = MATCH_IRE_TYPE; 6801 int error; 6802 tsol_gc_t *gc = NULL; 6803 tsol_gcgrp_t *gcgrp = NULL; 6804 boolean_t gcgrp_xtraref = B_FALSE; 6805 6806 ip1dbg(("ip_rt_add:")); 6807 6808 if (ire_arg != NULL) 6809 *ire_arg = NULL; 6810 6811 /* 6812 * If this is the case of RTF_HOST being set, then we set the netmask 6813 * to all ones (regardless of whether one was supplied). 6814 */ 6815 if (flags & RTF_HOST) 6816 mask = IP_HOST_MASK; 6817 6818 /* 6819 * Prevent routes with a zero gateway from being created (since 6820 * interfaces can currently be plumbed and brought up with no assigned 6821 * address). 6822 */ 6823 if (gw_addr == 0) 6824 return (ENETUNREACH); 6825 /* 6826 * Get the ipif, if any, corresponding to the gw_addr 6827 */ 6828 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error, 6829 ipst); 6830 if (ipif != NULL) { 6831 if (IS_VNI(ipif->ipif_ill)) { 6832 ipif_refrele(ipif); 6833 return (EINVAL); 6834 } 6835 ipif_refheld = B_TRUE; 6836 } else if (error == EINPROGRESS) { 6837 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6838 return (EINPROGRESS); 6839 } else { 6840 error = 0; 6841 } 6842 6843 if (ipif != NULL) { 6844 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6845 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6846 } else { 6847 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6848 } 6849 6850 /* 6851 * GateD will attempt to create routes with a loopback interface 6852 * address as the gateway and with RTF_GATEWAY set. We allow 6853 * these routes to be added, but create them as interface routes 6854 * since the gateway is an interface address.
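 *
 * For illustration (a hypothetical request, not from this file), a
 * command such as
 *
 *	route add -net 192.0.2.0 -netmask 255.255.255.0 127.0.0.1
 *
 * arrives with RTF_GATEWAY set, but because 127.0.0.1 is a loopback
 * interface address the route is created below as an interface route
 * with RTF_GATEWAY cleared.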
6855 */ 6856 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6857 flags &= ~RTF_GATEWAY; 6858 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6859 mask == IP_HOST_MASK) { 6860 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6861 ALL_ZONES, NULL, match_flags, ipst); 6862 if (ire != NULL) { 6863 ire_refrele(ire); 6864 if (ipif_refheld) 6865 ipif_refrele(ipif); 6866 return (EEXIST); 6867 } 6868 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x " 6869 "for 0x%x\n", (void *)ipif, 6870 ipif->ipif_ire_type, 6871 ntohl(ipif->ipif_lcl_addr))); 6872 ire = ire_create( 6873 (uchar_t *)&dst_addr, /* dest address */ 6874 (uchar_t *)&mask, /* mask */ 6875 (uchar_t *)&ipif->ipif_src_addr, 6876 NULL, /* no gateway */ 6877 &ipif->ipif_mtu, 6878 NULL, 6879 ipif->ipif_rq, /* recv-from queue */ 6880 NULL, /* no send-to queue */ 6881 ipif->ipif_ire_type, /* LOOPBACK */ 6882 ipif, 6883 0, 6884 0, 6885 0, 6886 (ipif->ipif_flags & IPIF_PRIVATE) ? 6887 RTF_PRIVATE : 0, 6888 &ire_uinfo_null, 6889 NULL, 6890 NULL, 6891 ipst); 6892 6893 if (ire == NULL) { 6894 if (ipif_refheld) 6895 ipif_refrele(ipif); 6896 return (ENOMEM); 6897 } 6898 error = ire_add(&ire, q, mp, func, B_FALSE); 6899 if (error == 0) 6900 goto save_ire; 6901 if (ipif_refheld) 6902 ipif_refrele(ipif); 6903 return (error); 6904 6905 } 6906 } 6907 6908 /* 6909 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6910 * and the gateway address provided is one of the system's interface 6911 * addresses. By using the routing socket interface and supplying an 6912 * RTA_IFP sockaddr with an interface index, an alternate method of 6913 * specifying an interface route to be created is available which uses 6914 * the interface index that specifies the outgoing interface rather than 6915 * the address of an outgoing interface (which may not be able to 6916 * uniquely identify an interface). When coupled with the RTF_GATEWAY 6917 * flag, routes can be specified which not only specify the next-hop to 6918 * be used when routing to a certain prefix, but also which outgoing 6919 * interface should be used. 6920 * 6921 * Previously, interfaces would have unique addresses assigned to them 6922 * and so the address assigned to a particular interface could be used 6923 * to identify a particular interface. One exception to this was the 6924 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6925 * 6926 * With the advent of IPv6 and its link-local addresses, this 6927 * restriction was relaxed and interfaces could share addresses between 6928 * themselves. In fact, typically all of the link-local interfaces on 6929 * an IPv6 node or router will have the same link-local address. In 6930 * order to differentiate between these interfaces, the use of an 6931 * interface index is necessary and this index can be carried inside a 6932 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6933 * of using the interface index, however, is that all of the ipif's that 6934 * are part of an ill have the same index and so the RTA_IFP sockaddr 6935 * cannot be used to differentiate between ipif's (or logical 6936 * interfaces) that belong to the same ill (physical interface).
6937 * 6938 * For example, in the following case involving IPv4 interfaces and 6939 * logical interfaces 6940 * 6941 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6942 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6943 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6944 * 6945 * the ipif's corresponding to each of these interface routes can be 6946 * uniquely identified by the "gateway" (actually interface address). 6947 * 6948 * In this case involving multiple IPv6 default routes to a particular 6949 * link-local gateway, the use of RTA_IFP is necessary to specify which 6950 * default route is of interest: 6951 * 6952 * default fe80::123:4567:89ab:cdef U if0 6953 * default fe80::123:4567:89ab:cdef U if1 6954 */ 6955 6956 /* RTF_GATEWAY not set */ 6957 if (!(flags & RTF_GATEWAY)) { 6958 queue_t *stq; 6959 6960 if (sp != NULL) { 6961 ip2dbg(("ip_rt_add: gateway security attributes " 6962 "cannot be set with interface route\n")); 6963 if (ipif_refheld) 6964 ipif_refrele(ipif); 6965 return (EINVAL); 6966 } 6967 6968 /* 6969 * As the interface index specified with the RTA_IFP sockaddr is 6970 * the same for all ipif's off of an ill, the matching logic 6971 * below uses MATCH_IRE_ILL if such an index was specified. 6972 * This means that routes sharing the same prefix when added 6973 * using a RTA_IFP sockaddr must have distinct interface 6974 * indices (namely, they must be on distinct ill's). 6975 * 6976 * On the other hand, since the gateway address will usually be 6977 * different for each ipif on the system, the matching logic 6978 * uses MATCH_IRE_IPIF in the case of a traditional interface 6979 * route. This means that interface routes for the same prefix 6980 * can be created if they belong to distinct ipif's and if a 6981 * RTA_IFP sockaddr is not present. 6982 */ 6983 if (ipif_arg != NULL) { 6984 if (ipif_refheld) { 6985 ipif_refrele(ipif); 6986 ipif_refheld = B_FALSE; 6987 } 6988 ipif = ipif_arg; 6989 match_flags |= MATCH_IRE_ILL; 6990 } else { 6991 /* 6992 * Check the ipif corresponding to the gw_addr 6993 */ 6994 if (ipif == NULL) 6995 return (ENETUNREACH); 6996 match_flags |= MATCH_IRE_IPIF; 6997 } 6998 ASSERT(ipif != NULL); 6999 7000 /* 7001 * We check for an existing entry at this point. 7002 * 7003 * Since a netmask isn't passed in via the ioctl interface 7004 * (SIOCADDRT), we don't check for a matching netmask in that 7005 * case. 7006 */ 7007 if (!ioctl_msg) 7008 match_flags |= MATCH_IRE_MASK; 7009 ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif, 7010 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 7011 if (ire != NULL) { 7012 ire_refrele(ire); 7013 if (ipif_refheld) 7014 ipif_refrele(ipif); 7015 return (EEXIST); 7016 } 7017 7018 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 7019 ? ipif->ipif_rq : ipif->ipif_wq; 7020 7021 /* 7022 * Create a copy of the IRE_LOOPBACK, 7023 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 7024 * the modified address and netmask. 7025 */ 7026 ire = ire_create( 7027 (uchar_t *)&dst_addr, 7028 (uint8_t *)&mask, 7029 (uint8_t *)&ipif->ipif_src_addr, 7030 NULL, 7031 &ipif->ipif_mtu, 7032 NULL, 7033 NULL, 7034 stq, 7035 ipif->ipif_net_type, 7036 ipif, 7037 0, 7038 0, 7039 0, 7040 flags, 7041 &ire_uinfo_null, 7042 NULL, 7043 NULL, 7044 ipst); 7045 if (ire == NULL) { 7046 if (ipif_refheld) 7047 ipif_refrele(ipif); 7048 return (ENOMEM); 7049 } 7050 7051 /* 7052 * Some software (for example, GateD and Sun Cluster) attempts 7053 * to create (what amount to) IRE_PREFIX routes with the 7054 * loopback address as the gateway. 
This is primarily done to 7055 * set up prefixes with the RTF_REJECT flag set (for example, 7056 * when generating aggregate routes.) 7057 * 7058 * If the IRE type (as defined by ipif->ipif_net_type) is 7059 * IRE_LOOPBACK, then we map the request into an 7060 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 7061 * these interface routes, by definition, can only be that. 7062 * 7063 * Needless to say, the real IRE_LOOPBACK is NOT created by this 7064 * routine, but rather by using ire_create() directly. 7065 * 7066 */ 7067 if (ipif->ipif_net_type == IRE_LOOPBACK) { 7068 ire->ire_type = IRE_IF_NORESOLVER; 7069 ire->ire_flags |= RTF_BLACKHOLE; 7070 } 7071 7072 error = ire_add(&ire, q, mp, func, B_FALSE); 7073 if (error == 0) 7074 goto save_ire; 7075 7076 /* 7077 * In the event of failure, ire_add() will have already 7078 * deleted the ire in question, so there is no need to 7079 * do that here. 7080 */ 7081 if (ipif_refheld) 7082 ipif_refrele(ipif); 7083 return (error); 7084 } 7085 if (ipif_refheld) { 7086 ipif_refrele(ipif); 7087 ipif_refheld = B_FALSE; 7088 } 7089 7090 /* 7091 * Get an interface IRE for the specified gateway. 7092 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 7093 * gateway, it is currently unreachable and we fail the request 7094 * accordingly. 7095 */ 7096 ipif = ipif_arg; 7097 if (ipif_arg != NULL) 7098 match_flags |= MATCH_IRE_ILL; 7099 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 7100 ALL_ZONES, 0, NULL, match_flags, ipst); 7101 if (gw_ire == NULL) 7102 return (ENETUNREACH); 7103 7104 /* 7105 * We create one of three types of IREs as a result of this request 7106 * based on the netmask. A netmask of all ones (which is automatically 7107 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 7108 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 7109 * created. Otherwise, an IRE_PREFIX route is created for the 7110 * destination prefix. 7111 */ 7112 if (mask == IP_HOST_MASK) 7113 type = IRE_HOST; 7114 else if (mask == 0) 7115 type = IRE_DEFAULT; 7116 else 7117 type = IRE_PREFIX; 7118 7119 /* check for a duplicate entry */ 7120 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7121 NULL, ALL_ZONES, 0, NULL, 7122 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); 7123 if (ire != NULL) { 7124 ire_refrele(gw_ire); 7125 ire_refrele(ire); 7126 return (EEXIST); 7127 } 7128 7129 /* Security attribute exists */ 7130 if (sp != NULL) { 7131 tsol_gcgrp_addr_t ga; 7132 7133 /* find or create the gateway credentials group */ 7134 ga.ga_af = AF_INET; 7135 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 7136 7137 /* we hold reference to it upon success */ 7138 gcgrp = gcgrp_lookup(&ga, B_TRUE); 7139 if (gcgrp == NULL) { 7140 ire_refrele(gw_ire); 7141 return (ENOMEM); 7142 } 7143 7144 /* 7145 * Create and add the security attribute to the group; a 7146 * reference to the group is made upon allocating a new 7147 * entry successfully. If it finds an already-existing 7148 * entry for the security attribute in the group, it simply 7149 * returns it and no new reference is made to the group. 7150 */ 7151 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 7152 if (gc == NULL) { 7153 /* release reference held by gcgrp_lookup */ 7154 GCGRP_REFRELE(gcgrp); 7155 ire_refrele(gw_ire); 7156 return (ENOMEM); 7157 } 7158 } 7159 7160 /* Create the IRE. */ 7161 ire = ire_create( 7162 (uchar_t *)&dst_addr, /* dest address */ 7163 (uchar_t *)&mask, /* mask */ 7164 /* src address assigned by the caller?
*/ 7165 (uchar_t *)(((src_addr != INADDR_ANY) && 7166 (flags & RTF_SETSRC)) ? &src_addr : NULL), 7167 (uchar_t *)&gw_addr, /* gateway address */ 7168 &gw_ire->ire_max_frag, 7169 NULL, /* no src nce */ 7170 NULL, /* no recv-from queue */ 7171 NULL, /* no send-to queue */ 7172 (ushort_t)type, /* IRE type */ 7173 ipif_arg, 7174 0, 7175 0, 7176 0, 7177 flags, 7178 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 7179 gc, /* security attribute */ 7180 NULL, 7181 ipst); 7182 7183 /* 7184 * The ire holds a reference to the 'gc' and the 'gc' holds a 7185 * reference to the 'gcgrp'. We can now release the extra reference 7186 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 7187 */ 7188 if (gcgrp_xtraref) 7189 GCGRP_REFRELE(gcgrp); 7190 if (ire == NULL) { 7191 if (gc != NULL) 7192 GC_REFRELE(gc); 7193 ire_refrele(gw_ire); 7194 return (ENOMEM); 7195 } 7196 7197 /* 7198 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 7199 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 7200 */ 7201 7202 /* Add the new IRE. */ 7203 error = ire_add(&ire, q, mp, func, B_FALSE); 7204 if (error != 0) { 7205 /* 7206 * In the event of failure, ire_add() will have already 7207 * deleted the ire in question, so there is no need to 7208 * do that here. 7209 */ 7210 ire_refrele(gw_ire); 7211 return (error); 7212 } 7213 7214 if (flags & RTF_MULTIRT) { 7215 /* 7216 * Invoke the CGTP (multirouting) filtering module 7217 * to add the dst address in the filtering database. 7218 * Replicated inbound packets coming from that address 7219 * will be filtered to discard the duplicates. 7220 * It is not necessary to call the CGTP filter hook 7221 * when the dst address is a broadcast or multicast, 7222 * because an IP source address cannot be a broadcast 7223 * or a multicast. 7224 */ 7225 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 7226 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 7227 if (ire_dst != NULL) { 7228 ip_cgtp_bcast_add(ire, ire_dst, ipst); 7229 ire_refrele(ire_dst); 7230 goto save_ire; 7231 } 7232 if (ipst->ips_ip_cgtp_filter_ops != NULL && 7233 !CLASSD(ire->ire_addr)) { 7234 int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4( 7235 ipst->ips_netstack->netstack_stackid, 7236 ire->ire_addr, 7237 ire->ire_gateway_addr, 7238 ire->ire_src_addr, 7239 gw_ire->ire_src_addr); 7240 if (res != 0) { 7241 ire_refrele(gw_ire); 7242 ire_delete(ire); 7243 return (res); 7244 } 7245 } 7246 } 7247 7248 /* 7249 * Now that the prefix IRE entry has been created, delete any 7250 * existing gateway IRE cache entries as well as any IRE caches 7251 * using the gateway, and force them to be created through 7252 * ip_newroute. 7253 */ 7254 if (gc != NULL) { 7255 ASSERT(gcgrp != NULL); 7256 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst); 7257 } 7258 7259 save_ire: 7260 if (gw_ire != NULL) { 7261 ire_refrele(gw_ire); 7262 } 7263 if (ipif != NULL) { 7264 /* 7265 * Save enough information so that we can recreate the IRE if 7266 * the interface goes down and then up. The metrics associated 7267 * with the route will be saved as well when rts_setmetrics() is 7268 * called after the IRE has been created. In the case where 7269 * memory cannot be allocated, none of this information will be 7270 * saved.
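 *
 * An illustrative summary (assumption flagged: the recovery side is
 * handled by ipif_recover_ire(), elsewhere in this file): ip_rt_add()
 * -> ipif_save_ire() stashes the parameters of this IRE; when the
 * interface is later brought down and back up, the saved list is
 * replayed so that the route reappears without another SIOCADDRT.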
7271 */ 7272 ipif_save_ire(ipif, ire); 7273 } 7274 if (ioctl_msg) 7275 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 7276 if (ire_arg != NULL) { 7277 /* 7278 * Store the ire that was successfully added into where ire_arg 7279 * points to so that callers don't have to look it up 7280 * themselves (but they are responsible for ire_refrele()ing 7281 * the ire when they are finished with it). 7282 */ 7283 *ire_arg = ire; 7284 } else { 7285 ire_refrele(ire); /* Held in ire_add */ 7286 } 7287 if (ipif_refheld) 7288 ipif_refrele(ipif); 7289 return (0); 7290 } 7291 7292 /* 7293 * ip_rt_delete is called to delete an IPv4 route. 7294 * ipif_arg is passed in to associate it with the correct interface. 7295 * We may need to restart this operation if the ipif cannot be looked up 7296 * due to an exclusive operation that is currently in progress. The restart 7297 * entry point is specified by 'func'. 7298 */ 7299 /* ARGSUSED4 */ 7300 int 7301 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 7302 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg, 7303 queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) 7304 { 7305 ire_t *ire = NULL; 7306 ipif_t *ipif; 7307 boolean_t ipif_refheld = B_FALSE; 7308 uint_t type; 7309 uint_t match_flags = MATCH_IRE_TYPE; 7310 int err = 0; 7311 7312 ip1dbg(("ip_rt_delete:")); 7313 /* 7314 * If this is the case of RTF_HOST being set, then we set the netmask 7315 * to all ones. Otherwise, we use the netmask if one was supplied. 7316 */ 7317 if (flags & RTF_HOST) { 7318 mask = IP_HOST_MASK; 7319 match_flags |= MATCH_IRE_MASK; 7320 } else if (rtm_addrs & RTA_NETMASK) { 7321 match_flags |= MATCH_IRE_MASK; 7322 } 7323 7324 /* 7325 * Note that RTF_GATEWAY is never set on a delete, therefore 7326 * we check if the gateway address is one of our interfaces first, 7327 * and fall back on RTF_GATEWAY routes. 7328 * 7329 * This makes it possible to delete an original 7330 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 7331 * 7332 * As the interface index specified with the RTA_IFP sockaddr is the 7333 * same for all ipif's off of an ill, the matching logic below uses 7334 * MATCH_IRE_ILL if such an index was specified. This means a route 7335 * sharing the same prefix and interface index as the route 7336 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 7337 * is specified in the request. 7338 * 7339 * On the other hand, since the gateway address will usually be 7340 * different for each ipif on the system, the matching logic 7341 * uses MATCH_IRE_IPIF in the case of a traditional interface 7342 * route. This means that interface routes for the same prefix can be 7343 * uniquely identified if they belong to distinct ipif's and if a 7344 * RTA_IFP sockaddr is not present. 7345 * 7346 * For more detail on specifying routes by gateway address and by 7347 * interface index, see the comments in ip_rt_add().
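 *
 * Illustrative example (hypothetical interfaces): if hme0 and hme1
 * both carry a route to the same prefix, a delete request carrying an
 * RTA_IFP sockaddr for hme0 matches by interface index (MATCH_IRE_ILL)
 * and may remove hme0's route even though the gateway address supplied
 * would, on its own, also have matched the hme1 route.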
7348 */ 7349 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err, 7350 ipst); 7351 if (ipif != NULL) 7352 ipif_refheld = B_TRUE; 7353 else if (err == EINPROGRESS) 7354 return (err); 7355 else 7356 err = 0; 7357 if (ipif != NULL) { 7358 if (ipif_arg != NULL) { 7359 if (ipif_refheld) { 7360 ipif_refrele(ipif); 7361 ipif_refheld = B_FALSE; 7362 } 7363 ipif = ipif_arg; 7364 match_flags |= MATCH_IRE_ILL; 7365 } else { 7366 match_flags |= MATCH_IRE_IPIF; 7367 } 7368 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 7369 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 7370 ALL_ZONES, NULL, match_flags, ipst); 7371 } 7372 if (ire == NULL) { 7373 ire = ire_ftable_lookup(dst_addr, mask, 0, 7374 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, 7375 match_flags, ipst); 7376 } 7377 } 7378 7379 if (ire == NULL) { 7380 /* 7381 * At this point, the gateway address is not one of our own 7382 * addresses or a matching interface route was not found. We 7383 * set the IRE type to lookup based on whether 7384 * this is a host route, a default route or just a prefix. 7385 * 7386 * If an ipif_arg was passed in, then the lookup is based on an 7387 * interface index so MATCH_IRE_ILL is added to match_flags. 7388 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 7389 * set as the route being looked up is not a traditional 7390 * interface route. 7391 */ 7392 match_flags &= ~MATCH_IRE_IPIF; 7393 match_flags |= MATCH_IRE_GW; 7394 if (ipif_arg != NULL) 7395 match_flags |= MATCH_IRE_ILL; 7396 if (mask == IP_HOST_MASK) 7397 type = IRE_HOST; 7398 else if (mask == 0) 7399 type = IRE_DEFAULT; 7400 else 7401 type = IRE_PREFIX; 7402 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7403 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 7404 } 7405 7406 if (ipif_refheld) 7407 ipif_refrele(ipif); 7408 7409 /* ipif is not refheld anymore */ 7410 if (ire == NULL) 7411 return (ESRCH); 7412 7413 if (ire->ire_flags & RTF_MULTIRT) { 7414 /* 7415 * Invoke the CGTP (multirouting) filtering module 7416 * to remove the dst address from the filtering database. 7417 * Packets coming from that address will no longer be 7418 * filtered to remove duplicates. 7419 */ 7420 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 7421 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 7422 ipst->ips_netstack->netstack_stackid, 7423 ire->ire_addr, ire->ire_gateway_addr); 7424 } 7425 ip_cgtp_bcast_delete(ire, ipst); 7426 } 7427 7428 ipif = ire->ire_ipif; 7429 if (ipif != NULL) 7430 ipif_remove_ire(ipif, ire); 7431 if (ioctl_msg) 7432 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 7433 ire_delete(ire); 7434 ire_refrele(ire); 7435 return (err); 7436 } 7437 7438 /* 7439 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
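 *
 * A minimal userland sketch of the caller side (hypothetical, for
 * illustration only; error handling mostly omitted):
 *
 *	struct rtentry rt;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&rt, sizeof (rt));
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_addr.s_addr =
 *	    inet_addr("192.0.2.0");
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_addr.s_addr =
 *	    inet_addr("192.0.2.1");
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	if (ioctl(s, SIOCADDRT, &rt) < 0)
 *		perror("SIOCADDRT");
 *
 * Since RTF_HOST is not set in this sketch, the netmask is derived
 * below via ip_subnet_mask().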
7440 */ 7441 /* ARGSUSED */ 7442 int 7443 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7444 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7445 { 7446 ipaddr_t dst_addr; 7447 ipaddr_t gw_addr; 7448 ipaddr_t mask; 7449 int error = 0; 7450 mblk_t *mp1; 7451 struct rtentry *rt; 7452 ipif_t *ipif = NULL; 7453 ip_stack_t *ipst; 7454 7455 ASSERT(q->q_next == NULL); 7456 ipst = CONNQ_TO_IPST(q); 7457 7458 ip1dbg(("ip_siocaddrt:")); 7459 /* Existence of mp1 verified in ip_wput_nondata */ 7460 mp1 = mp->b_cont->b_cont; 7461 rt = (struct rtentry *)mp1->b_rptr; 7462 7463 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7464 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7465 7466 /* 7467 * If the RTF_HOST flag is on, this is a request to assign a gateway 7468 * to a particular host address. In this case, we set the netmask to 7469 * all ones for the particular destination address. Otherwise, 7470 * determine the netmask to be used based on dst_addr and the interfaces 7471 * in use. 7472 */ 7473 if (rt->rt_flags & RTF_HOST) { 7474 mask = IP_HOST_MASK; 7475 } else { 7476 /* 7477 * Note that ip_subnet_mask returns a zero mask in the case of 7478 * default (an all-zeroes address). 7479 */ 7480 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7481 } 7482 7483 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7484 B_TRUE, q, mp, ip_process_ioctl, NULL, ipst); 7485 if (ipif != NULL) 7486 ipif_refrele(ipif); 7487 return (error); 7488 } 7489 7490 /* 7491 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 7492 */ 7493 /* ARGSUSED */ 7494 int 7495 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7496 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7497 { 7498 ipaddr_t dst_addr; 7499 ipaddr_t gw_addr; 7500 ipaddr_t mask; 7501 int error; 7502 mblk_t *mp1; 7503 struct rtentry *rt; 7504 ipif_t *ipif = NULL; 7505 ip_stack_t *ipst; 7506 7507 ASSERT(q->q_next == NULL); 7508 ipst = CONNQ_TO_IPST(q); 7509 7510 ip1dbg(("ip_siocdelrt:")); 7511 /* Existence of mp1 verified in ip_wput_nondata */ 7512 mp1 = mp->b_cont->b_cont; 7513 rt = (struct rtentry *)mp1->b_rptr; 7514 7515 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7516 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7517 7518 /* 7519 * If the RTF_HOST flag is on, this is a request to delete a gateway 7520 * to a particular host address. In this case, we set the netmask to 7521 * all ones for the particular destination address. Otherwise, 7522 * determine the netmask to be used based on dst_addr and the interfaces 7523 * in use. 7524 */ 7525 if (rt->rt_flags & RTF_HOST) { 7526 mask = IP_HOST_MASK; 7527 } else { 7528 /* 7529 * Note that ip_subnet_mask returns a zero mask in the case of 7530 * default (an all-zeroes address). 7531 */ 7532 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7533 } 7534 7535 error = ip_rt_delete(dst_addr, mask, gw_addr, 7536 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q, 7537 mp, ip_process_ioctl, ipst); 7538 if (ipif != NULL) 7539 ipif_refrele(ipif); 7540 return (error); 7541 } 7542 7543 /* 7544 * Enqueue the mp onto the ipsq, chained by b_next. 7545 * b_prev stores the function to be executed later, and b_queue the queue 7546 * where this mp originated. 
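 *
 * Illustrative summary (restating the code below, not new behavior):
 *
 *	mp->b_queue = q;	originating queue, replayed on dequeue
 *	mp->b_prev  = func;	continuation run when the mp is dequeued
 *	mp->b_next  = NULL;	FIFO link within the chosen list
 *
 * CUR_OP appends to ipsq_mphead (messages belonging to the operation
 * already in progress); NEW_OP appends to ipsq_xopq_mphead (operations
 * that must wait until the current one completes); see ipsq_dq().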
7547 */ 7548 void 7549 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7550 ill_t *pending_ill) 7551 { 7552 conn_t *connp = NULL; 7553 7554 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7555 ASSERT(func != NULL); 7556 7557 mp->b_queue = q; 7558 mp->b_prev = (void *)func; 7559 mp->b_next = NULL; 7560 7561 switch (type) { 7562 case CUR_OP: 7563 if (ipsq->ipsq_mptail != NULL) { 7564 ASSERT(ipsq->ipsq_mphead != NULL); 7565 ipsq->ipsq_mptail->b_next = mp; 7566 } else { 7567 ASSERT(ipsq->ipsq_mphead == NULL); 7568 ipsq->ipsq_mphead = mp; 7569 } 7570 ipsq->ipsq_mptail = mp; 7571 break; 7572 7573 case NEW_OP: 7574 if (ipsq->ipsq_xopq_mptail != NULL) { 7575 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7576 ipsq->ipsq_xopq_mptail->b_next = mp; 7577 } else { 7578 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7579 ipsq->ipsq_xopq_mphead = mp; 7580 } 7581 ipsq->ipsq_xopq_mptail = mp; 7582 break; 7583 default: 7584 cmn_err(CE_PANIC, "ipsq_enq: unknown type %d\n", type); 7585 } 7586 7587 if (CONN_Q(q) && pending_ill != NULL) { 7588 connp = Q_TO_CONN(q); 7589 7590 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7591 connp->conn_oper_pending_ill = pending_ill; 7592 } 7593 } 7594 7595 /* 7596 * Return the mp at the head of the ipsq. After emptying the ipsq, 7597 * look at the next ioctl, but only if the current ioctl is complete. 7598 * Otherwise return; we will resume when we complete the current ioctl. 7599 * The current ioctl will wait till it gets a response from the 7600 * driver below. 7601 */ 7602 static mblk_t * 7603 ipsq_dq(ipsq_t *ipsq) 7604 { 7605 mblk_t *mp; 7606 7607 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7608 7609 mp = ipsq->ipsq_mphead; 7610 if (mp != NULL) { 7611 ipsq->ipsq_mphead = mp->b_next; 7612 if (ipsq->ipsq_mphead == NULL) 7613 ipsq->ipsq_mptail = NULL; 7614 mp->b_next = NULL; 7615 return (mp); 7616 } 7617 if (ipsq->ipsq_current_ipif != NULL) 7618 return (NULL); 7619 mp = ipsq->ipsq_xopq_mphead; 7620 if (mp != NULL) { 7621 ipsq->ipsq_xopq_mphead = mp->b_next; 7622 if (ipsq->ipsq_xopq_mphead == NULL) 7623 ipsq->ipsq_xopq_mptail = NULL; 7624 mp->b_next = NULL; 7625 return (mp); 7626 } 7627 return (NULL); 7628 } 7629 7630 /* 7631 * Enter the ipsq corresponding to ill, by waiting synchronously till 7632 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7633 * will have to drain completely before ipsq_enter returns success. 7634 * ipsq_current_ipif will be set if some exclusive ioctl is in progress, 7635 * and the ipsq_exit logic will start the next enqueued ioctl after 7636 * completion of the current ioctl. If 'force' is used, we don't wait 7637 * for the enqueued ioctls. This is needed when a conn_close wants to 7638 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7639 * of an ill can also use this option. But we don't use it currently. 7640 */ 7641 #define ENTER_SQ_WAIT_TICKS 100 7642 boolean_t 7643 ipsq_enter(ill_t *ill, boolean_t force) 7644 { 7645 ipsq_t *ipsq; 7646 boolean_t waited_enough = B_FALSE; 7647 7648 /* 7649 * Holding the ill_lock prevents <ill-ipsq> assocs from changing. 7650 * Since the <ill-ipsq> assocs could change while we wait for the 7651 * writer, it is easier to wait on a fixed global rather than try to 7652 * cv_wait on a changing ipsq.
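 *
 * A minimal usage sketch (hypothetical caller; error handling elided):
 *
 *	if (!ipsq_enter(ill, B_FALSE))
 *		return (ENXIO);		the ill is condemned
 *	... operate on the ill as exclusive writer ...
 *	ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE, B_TRUE);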
7653 */ 7654 mutex_enter(&ill->ill_lock); 7655 for (;;) { 7656 if (ill->ill_state_flags & ILL_CONDEMNED) { 7657 mutex_exit(&ill->ill_lock); 7658 return (B_FALSE); 7659 } 7660 7661 ipsq = ill->ill_phyint->phyint_ipsq; 7662 mutex_enter(&ipsq->ipsq_lock); 7663 if (ipsq->ipsq_writer == NULL && 7664 (ipsq->ipsq_current_ipif == NULL || waited_enough)) { 7665 break; 7666 } else if (ipsq->ipsq_writer != NULL) { 7667 mutex_exit(&ipsq->ipsq_lock); 7668 cv_wait(&ill->ill_cv, &ill->ill_lock); 7669 } else { 7670 mutex_exit(&ipsq->ipsq_lock); 7671 if (force) { 7672 (void) cv_timedwait(&ill->ill_cv, 7673 &ill->ill_lock, 7674 lbolt + ENTER_SQ_WAIT_TICKS); 7675 waited_enough = B_TRUE; 7676 continue; 7677 } else { 7678 cv_wait(&ill->ill_cv, &ill->ill_lock); 7679 } 7680 } 7681 } 7682 7683 ASSERT(ipsq->ipsq_mphead == NULL && ipsq->ipsq_mptail == NULL); 7684 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7685 ipsq->ipsq_writer = curthread; 7686 ipsq->ipsq_reentry_cnt++; 7687 #ifdef DEBUG 7688 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, IPSQ_STACK_DEPTH); 7689 #endif 7690 mutex_exit(&ipsq->ipsq_lock); 7691 mutex_exit(&ill->ill_lock); 7692 return (B_TRUE); 7693 } 7694 7695 /* 7696 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7697 * certain critical operations like plumbing (i.e. most set ioctls), 7698 * multicast joins, igmp/mld timers, IPMP operations etc. On a non-IPMP 7699 * system there is 1 ipsq per phyint. On an IPMP system there is 1 ipsq per 7700 * IPMP group. The ipsq serializes exclusive ioctls issued by applications 7701 * on a per ipsq basis in ipsq_xopq_mphead. It also protects against multiple 7702 * threads executing in the ipsq. Responses from the driver pertain to the 7703 * current ioctl (say a DL_BIND_ACK in response to a DL_BIND_REQUEST initiated 7704 * as part of bringing up the interface) and are enqueued in ipsq_mphead. 7705 * 7706 * If a thread does not want to reenter the ipsq when it is already writer, 7707 * it must make sure that neither the specified reentry point, to be called 7708 * later when the ipsq is empty, nor any code path starting from that reentry 7709 * point, ever tries to enter the ipsq again. Otherwise it can lead 7710 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 7711 * When the thread that is currently exclusive finishes, it (ipsq_exit) 7712 * dequeues the requests waiting to become exclusive in ipsq_mphead and calls 7713 * the reentry point. When the list at ipsq_mphead becomes empty ipsq_exit 7714 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 7715 * ioctl if the current ioctl has completed. If the current ioctl is still 7716 * in progress it simply returns. The current ioctl could be waiting for 7717 * a response from another module (arp or the driver), or could be waiting for 7718 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipsq_pending_mp 7719 * and ipsq_pending_ipif are set. ipsq_current_ipif is set throughout the 7720 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 7721 * ipsq_current_ipif is clear, which happens only on ioctl completion. 7722 */ 7723 7724 /* 7725 * Try to enter the ipsq exclusively, corresponding to ipif or ill. (only 1 of 7726 * ipif or ill can be specified). The caller ensures ipif or ill is valid by 7727 * ref-holding it if necessary. If the ipsq cannot be entered, the mp is queued 7728 for completion.
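 *
 * Typical caller shape, as a sketch (the ip_sioctl_* handlers follow
 * this pattern, with ip_process_ioctl as 'func'):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return (EINPROGRESS);	mp was queued; 'func' runs later
 *	... perform the operation as writer ...
 *	ipsq_exit(ipsq, B_TRUE, B_TRUE);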
7729 */ 7730 ipsq_t * 7731 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7732 ipsq_func_t func, int type, boolean_t reentry_ok) 7733 { 7734 ipsq_t *ipsq; 7735 7736 /* Only 1 of ipif or ill can be specified */ 7737 ASSERT((ipif != NULL) ^ (ill != NULL)); 7738 if (ipif != NULL) 7739 ill = ipif->ipif_ill; 7740 7741 /* 7742 * lock ordering ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock 7743 * ipsq of an ill can't change when ill_lock is held. 7744 */ 7745 GRAB_CONN_LOCK(q); 7746 mutex_enter(&ill->ill_lock); 7747 ipsq = ill->ill_phyint->phyint_ipsq; 7748 mutex_enter(&ipsq->ipsq_lock); 7749 7750 /* 7751 * 1. Enter the ipsq if we are already writer and reentry is ok. 7752 * (Note: If the caller does not specify reentry_ok then neither 7753 * 'func' nor any of its callees must ever attempt to enter the ipsq 7754 * again. Otherwise it can lead to an infinite loop.) 7755 * 2. Enter the ipsq if there is no current writer and this attempted 7756 * entry is part of the current ioctl or operation 7757 * 3. Enter the ipsq if there is no current writer and this is a new 7758 * ioctl (or operation) and the ioctl (or operation) queue is 7759 * empty and there is no ioctl (or operation) currently in progress 7760 */ 7761 if ((ipsq->ipsq_writer == NULL && ((type == CUR_OP) || 7762 (type == NEW_OP && ipsq->ipsq_xopq_mphead == NULL && 7763 ipsq->ipsq_current_ipif == NULL))) || 7764 (ipsq->ipsq_writer == curthread && reentry_ok)) { 7765 /* Success. */ 7766 ipsq->ipsq_reentry_cnt++; 7767 ipsq->ipsq_writer = curthread; 7768 mutex_exit(&ipsq->ipsq_lock); 7769 mutex_exit(&ill->ill_lock); 7770 RELEASE_CONN_LOCK(q); 7771 #ifdef DEBUG 7772 ipsq->ipsq_depth = getpcstack(ipsq->ipsq_stack, 7773 IPSQ_STACK_DEPTH); 7774 #endif 7775 return (ipsq); 7776 } 7777 7778 ipsq_enq(ipsq, q, mp, func, type, ill); 7779 7780 mutex_exit(&ipsq->ipsq_lock); 7781 mutex_exit(&ill->ill_lock); 7782 RELEASE_CONN_LOCK(q); 7783 return (NULL); 7784 } 7785 7786 /* 7787 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 7788 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 7789 * cannot be entered, the mp is queued for completion. 7790 */ 7791 void 7792 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7793 boolean_t reentry_ok) 7794 { 7795 ipsq_t *ipsq; 7796 7797 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 7798 7799 /* 7800 * Drop the caller's refhold on the ill. This is safe since we either 7801 * entered the IPSQ (and thus are exclusive), or failed to enter the 7802 * IPSQ, in which case we return without accessing ill anymore. This 7803 * is needed because func needs to see the correct refcount; 7804 * e.g. removeif can work only then. 7805 */ 7806 ill_refrele(ill); 7807 if (ipsq != NULL) { 7808 (*func)(ipsq, q, mp, NULL); 7809 ipsq_exit(ipsq, B_TRUE, B_TRUE); 7810 } 7811 } 7812 7813 /* 7814 * If there are more than ILL_GRP_CNT ills in a group, 7815 * we use kmem alloc'd buffers, else use the stack. 7816 */ 7817 #define ILL_GRP_CNT 14 7818 /* 7819 * Drain the ipsq, if there are messages on it, and then leave the ipsq. 7820 * Called by a thread that is currently exclusive on this ipsq.
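 * Note (illustrative): when entry was nested via reentry_ok, only the
 * outermost ipsq_exit drains and relinquishes the ipsq; an inner call
 * simply decrements ipsq_reentry_cnt and returns, as the code below
 * shows.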
7821 */ 7822 void 7823 ipsq_exit(ipsq_t *ipsq, boolean_t start_igmp_timer, boolean_t start_mld_timer) 7824 { 7825 queue_t *q; 7826 mblk_t *mp; 7827 ipsq_func_t func; 7828 int next; 7829 ill_t **ill_list = NULL; 7830 size_t ill_list_size = 0; 7831 int cnt = 0; 7832 boolean_t need_ipsq_free = B_FALSE; 7833 ip_stack_t *ipst = ipsq->ipsq_ipst; 7834 7835 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7836 mutex_enter(&ipsq->ipsq_lock); 7837 ASSERT(ipsq->ipsq_reentry_cnt >= 1); 7838 if (ipsq->ipsq_reentry_cnt != 1) { 7839 ipsq->ipsq_reentry_cnt--; 7840 mutex_exit(&ipsq->ipsq_lock); 7841 return; 7842 } 7843 7844 mp = ipsq_dq(ipsq); 7845 while (mp != NULL) { 7846 again: 7847 mutex_exit(&ipsq->ipsq_lock); 7848 func = (ipsq_func_t)mp->b_prev; 7849 q = (queue_t *)mp->b_queue; 7850 mp->b_prev = NULL; 7851 mp->b_queue = NULL; 7852 7853 /* 7854 * If 'q' is a conn queue, it is valid, since we did a 7855 * refhold on the connp at the start of the ioctl. 7856 * If 'q' is an ill queue, it is valid, since close of an 7857 * ill will clean up the 'ipsq'. 7858 */ 7859 (*func)(ipsq, q, mp, NULL); 7860 7861 mutex_enter(&ipsq->ipsq_lock); 7862 mp = ipsq_dq(ipsq); 7863 } 7864 7865 mutex_exit(&ipsq->ipsq_lock); 7866 7867 /* 7868 * Need to grab the locks in the right order. Need to 7869 * atomically check (under ipsq_lock) that there are no 7870 * messages before relinquishing the ipsq. Also need to 7871 * atomically wake up waiters on ill_cv while holding ill_lock. 7872 * Holding ill_g_lock ensures that ipsq list of ills is stable. 7873 * If we need to call ill_split_ipsq and change <ill-ipsq> we need 7874 * to grab ill_g_lock as writer. 7875 */ 7876 rw_enter(&ipst->ips_ill_g_lock, 7877 ipsq->ipsq_split ? RW_WRITER : RW_READER); 7878 7879 /* ipsq_refs can't change while ill_g_lock is held as reader */ 7880 if (ipsq->ipsq_refs != 0) { 7881 /* At most 2 ills v4/v6 per phyint */ 7882 cnt = ipsq->ipsq_refs << 1; 7883 ill_list_size = cnt * sizeof (ill_t *); 7884 /* 7885 * If memory allocation fails, we will do the split 7886 * the next time ipsq_exit is called for whatever reason. 7887 * As long as the ipsq_split flag is set the need to 7888 * split is remembered. 7889 */ 7890 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 7891 if (ill_list != NULL) 7892 cnt = ill_lock_ipsq_ills(ipsq, ill_list, cnt); 7893 } 7894 mutex_enter(&ipsq->ipsq_lock); 7895 mp = ipsq_dq(ipsq); 7896 if (mp != NULL) { 7897 /* oops, some message has landed up, we can't get out */ 7898 if (ill_list != NULL) 7899 ill_unlock_ills(ill_list, cnt); 7900 rw_exit(&ipst->ips_ill_g_lock); 7901 if (ill_list != NULL) 7902 kmem_free(ill_list, ill_list_size); 7903 ill_list = NULL; 7904 ill_list_size = 0; 7905 cnt = 0; 7906 goto again; 7907 } 7908 7909 /* 7910 * Split only if no ioctl is pending and if memory alloc succeeded 7911 * above. 7912 */ 7913 if (ipsq->ipsq_split && ipsq->ipsq_current_ipif == NULL && 7914 ill_list != NULL) { 7915 /* 7916 * No new ill can join this ipsq since we are holding the 7917 * ill_g_lock. Hence ill_split_ipsq can safely traverse the 7918 * ipsq. ill_split_ipsq may fail due to memory shortage. 7919 * If so we will retry on the next ipsq_exit. 7920 */ 7921 ipsq->ipsq_split = ill_split_ipsq(ipsq); 7922 } 7923 7924 /* 7925 * We are holding the ipsq lock, hence no new messages can 7926 * land up on the ipsq, and there are no messages currently. 7927 * Now safe to get out. Wake up waiters and relinquish ipsq 7928 * atomically while holding ill locks.
7929 */ 7930 ipsq->ipsq_writer = NULL; 7931 ipsq->ipsq_reentry_cnt--; 7932 ASSERT(ipsq->ipsq_reentry_cnt == 0); 7933 #ifdef DEBUG 7934 ipsq->ipsq_depth = 0; 7935 #endif 7936 mutex_exit(&ipsq->ipsq_lock); 7937 /* 7938 * For IPMP this should wake up all ills in this ipsq. 7939 * We need to hold the ill_lock while waking up waiters to 7940 * avoid missed wakeups. But there is no need to acquire all 7941 * the ill locks and then wakeup. If we have not acquired all 7942 * the locks (due to memory failure above) ill_signal_ipsq_ills 7943 * wakes up ills one at a time after getting the right ill_lock. 7944 */ 7945 ill_signal_ipsq_ills(ipsq, ill_list != NULL); 7946 if (ill_list != NULL) 7947 ill_unlock_ills(ill_list, cnt); 7948 if (ipsq->ipsq_refs == 0) 7949 need_ipsq_free = B_TRUE; 7950 rw_exit(&ipst->ips_ill_g_lock); 7951 if (ill_list != NULL) 7952 kmem_free(ill_list, ill_list_size); 7953 7954 if (need_ipsq_free) { 7955 /* 7956 * Free the ipsq. ipsq_refs can't increase because ipsq can't be 7957 * looked up. ipsq can be looked up only thru ill or phyint 7958 * and there are no ills/phyint on this ipsq. 7959 */ 7960 ipsq_delete(ipsq); 7961 } 7962 /* 7963 * Now start any igmp or mld timers that could not be started 7964 * while inside the ipsq. The timers can't be started while inside 7965 * the ipsq, since igmp_start_timers may need to call untimeout() 7966 * which can't be done while holding a lock, i.e. the ipsq. Otherwise 7967 * there could be a deadlock since the timeout handlers 7968 * mld_timeout_handler / igmp_timeout_handler also synchronously 7969 * wait in ipsq_enter() trying to get the ipsq. 7970 * 7971 * However there is one exception to the above. If this thread is 7972 * itself the igmp/mld timeout handler thread, then we don't want 7973 * to start any new timer until the current handler is done. The 7974 * handler thread passes in B_FALSE for start_igmp/mld_timers, while 7975 * all others pass B_TRUE. 7976 */ 7977 if (start_igmp_timer) { 7978 mutex_enter(&ipst->ips_igmp_timer_lock); 7979 next = ipst->ips_igmp_deferred_next; 7980 ipst->ips_igmp_deferred_next = INFINITY; 7981 mutex_exit(&ipst->ips_igmp_timer_lock); 7982 7983 if (next != INFINITY) 7984 igmp_start_timers(next, ipst); 7985 } 7986 7987 if (start_mld_timer) { 7988 mutex_enter(&ipst->ips_mld_timer_lock); 7989 next = ipst->ips_mld_deferred_next; 7990 ipst->ips_mld_deferred_next = INFINITY; 7991 mutex_exit(&ipst->ips_mld_timer_lock); 7992 7993 if (next != INFINITY) 7994 mld_start_timers(next, ipst); 7995 } 7996 } 7997 7998 /* 7999 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 8000 * and `ioccmd'. 8001 */ 8002 void 8003 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 8004 { 8005 ASSERT(IAM_WRITER_IPSQ(ipsq)); 8006 8007 mutex_enter(&ipsq->ipsq_lock); 8008 ASSERT(ipsq->ipsq_current_ipif == NULL); 8009 ASSERT(ipsq->ipsq_current_ioctl == 0); 8010 ipsq->ipsq_current_ipif = ipif; 8011 ipsq->ipsq_current_ioctl = ioccmd; 8012 mutex_exit(&ipsq->ipsq_lock); 8013 } 8014 8015 /* 8016 * Finish the current exclusive operation on `ipsq'. Note that other 8017 * operations will not be able to proceed until an ipsq_exit() is done. 8018 */ 8019 void 8020 ipsq_current_finish(ipsq_t *ipsq) 8021 { 8022 ipif_t *ipif = ipsq->ipsq_current_ipif; 8023 8024 ASSERT(IAM_WRITER_IPSQ(ipsq)); 8025 8026 /* 8027 * For SIOCLIFREMOVEIF, the ipif has already been blown away 8028 * (but we're careful to never set IPIF_CHANGING in that case).
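 *
 * The expected bracketing, as a sketch (assuming an exclusive operation
 * that has already entered the ipsq as writer):
 *
 *	ipsq_current_start(ipsq, ipif, ioccmd);
 *	... carry out the operation; possibly wait for the driver ...
 *	ipsq_current_finish(ipsq);
 *	ipsq_exit(ipsq, B_TRUE, B_TRUE);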
8029 */ 8030 if (ipsq->ipsq_current_ioctl != SIOCLIFREMOVEIF) { 8031 mutex_enter(&ipif->ipif_ill->ill_lock); 8032 ipif->ipif_state_flags &= ~IPIF_CHANGING; 8033 8034 /* Send any queued event */ 8035 ill_nic_info_dispatch(ipif->ipif_ill); 8036 mutex_exit(&ipif->ipif_ill->ill_lock); 8037 } 8038 8039 mutex_enter(&ipsq->ipsq_lock); 8040 ASSERT(ipsq->ipsq_current_ipif != NULL); 8041 ipsq->ipsq_current_ipif = NULL; 8042 ipsq->ipsq_current_ioctl = 0; 8043 mutex_exit(&ipsq->ipsq_lock); 8044 } 8045 8046 /* 8047 * The ill is closing. Flush all messages on the ipsq that originated 8048 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead 8049 * for this ill since ipsq_enter could not have entered until then. 8050 * New messages can't be queued since the CONDEMNED flag is set. 8051 */ 8052 static void 8053 ipsq_flush(ill_t *ill) 8054 { 8055 queue_t *q; 8056 mblk_t *prev; 8057 mblk_t *mp; 8058 mblk_t *mp_next; 8059 ipsq_t *ipsq; 8060 8061 ASSERT(IAM_WRITER_ILL(ill)); 8062 ipsq = ill->ill_phyint->phyint_ipsq; 8063 /* 8064 * Flush any messages sent up by the driver. 8065 */ 8066 mutex_enter(&ipsq->ipsq_lock); 8067 for (prev = NULL, mp = ipsq->ipsq_mphead; mp != NULL; mp = mp_next) { 8068 mp_next = mp->b_next; 8069 q = mp->b_queue; 8070 if (q == ill->ill_rq || q == ill->ill_wq) { 8071 /* Remove the mp from the ipsq */ 8072 if (prev == NULL) 8073 ipsq->ipsq_mphead = mp->b_next; 8074 else 8075 prev->b_next = mp->b_next; 8076 if (ipsq->ipsq_mptail == mp) { 8077 ASSERT(mp_next == NULL); 8078 ipsq->ipsq_mptail = prev; 8079 } 8080 inet_freemsg(mp); 8081 } else { 8082 prev = mp; 8083 } 8084 } 8085 mutex_exit(&ipsq->ipsq_lock); 8086 (void) ipsq_pending_mp_cleanup(ill, NULL); 8087 ipsq_xopq_mp_cleanup(ill, NULL); 8088 ill_pending_mp_cleanup(ill); 8089 } 8090 8091 /* ARGSUSED */ 8092 int 8093 ip_sioctl_slifoindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8094 ip_ioctl_cmd_t *ipip, void *ifreq) 8095 { 8096 ill_t *ill; 8097 struct lifreq *lifr = (struct lifreq *)ifreq; 8098 boolean_t isv6; 8099 conn_t *connp; 8100 ip_stack_t *ipst; 8101 8102 connp = Q_TO_CONN(q); 8103 ipst = connp->conn_netstack->netstack_ip; 8104 isv6 = connp->conn_af_isv6; 8105 /* 8106 * Set original index. 8107 * Failover and failback move logical interfaces 8108 * from one physical interface to another. The 8109 * original index indicates the parent of a logical 8110 * interface, in other words, the physical interface 8111 * the logical interface will be moved back to on 8112 * failback. 8113 */ 8114 8115 /* 8116 * Don't allow the original index to be changed 8117 * for non-failover addresses, autoconfigured 8118 * addresses, or IPv6 link local addresses. 8119 */ 8120 if (((ipif->ipif_flags & (IPIF_NOFAILOVER | IPIF_ADDRCONF)) != 0) || 8121 (isv6 && IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))) { 8122 return (EINVAL); 8123 } 8124 /* 8125 * The new original index must be in use by some 8126 * physical interface. 8127 */ 8128 ill = ill_lookup_on_ifindex(lifr->lifr_index, isv6, NULL, NULL, 8129 NULL, NULL, ipst); 8130 if (ill == NULL) 8131 return (ENXIO); 8132 ill_refrele(ill); 8133 8134 ipif->ipif_orig_ifindex = lifr->lifr_index; 8135 /* 8136 * When this ipif gets failed back, don't 8137 * preserve the original id, as it is no 8138 * longer applicable. 8139 */ 8140 ipif->ipif_orig_ipifid = 0; 8141 /* 8142 * For IPv4, change the original index of any 8143 * multicast addresses associated with the 8144 * ipif to the new value.
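 * (For reference, a hypothetical caller reaches this handler with
 * something like the following; "hme0:1" and new_ifindex are
 * illustrative values only:
 *	struct lifreq lifr;
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	lifr.lifr_index = new_ifindex;
 *	(void) ioctl(s, SIOCSLIFOINDEX, &lifr);)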
8145 */ 8146 if (!isv6) { 8147 ilm_t *ilm; 8148 8149 mutex_enter(&ipif->ipif_ill->ill_lock); 8150 for (ilm = ipif->ipif_ill->ill_ilm; ilm != NULL; 8151 ilm = ilm->ilm_next) { 8152 if (ilm->ilm_ipif == ipif) { 8153 ilm->ilm_orig_ifindex = lifr->lifr_index; 8154 } 8155 } 8156 mutex_exit(&ipif->ipif_ill->ill_lock); 8157 } 8158 return (0); 8159 } 8160 8161 /* ARGSUSED */ 8162 int 8163 ip_sioctl_get_oindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 8164 ip_ioctl_cmd_t *ipip, void *ifreq) 8165 { 8166 struct lifreq *lifr = (struct lifreq *)ifreq; 8167 8168 /* 8169 * Get the original interface index i.e the one 8170 * before FAILOVER if it ever happened. 8171 */ 8172 lifr->lifr_index = ipif->ipif_orig_ifindex; 8173 return (0); 8174 } 8175 8176 /* 8177 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls; 8178 * refhold and return the associated ipif. 8179 */ 8180 /* ARGSUSED */ 8181 int 8182 ip_extract_tunreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8183 cmd_info_t *ci, ipsq_func_t func) 8184 { 8185 boolean_t exists; 8186 struct iftun_req *ta; 8187 ipif_t *ipif; 8188 ill_t *ill; 8189 boolean_t isv6; 8190 mblk_t *mp1; 8191 int error; 8192 conn_t *connp; 8193 ip_stack_t *ipst; 8194 8195 /* Existence verified in ip_wput_nondata */ 8196 mp1 = mp->b_cont->b_cont; 8197 ta = (struct iftun_req *)mp1->b_rptr; 8198 /* 8199 * Null terminate the string to protect against buffer 8200 * overrun. String was generated by user code and may not 8201 * be trusted. 8202 */ 8203 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 8204 8205 connp = Q_TO_CONN(q); 8206 isv6 = connp->conn_af_isv6; 8207 ipst = connp->conn_netstack->netstack_ip; 8208 8209 /* Disallow implicit create */ 8210 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 8211 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 8212 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error, ipst); 8213 if (ipif == NULL) 8214 return (error); 8215 8216 if (ipif->ipif_id != 0) { 8217 /* 8218 * We really don't want to set/get tunnel parameters 8219 * on virtual tunnel interfaces. Only allow the 8220 * base tunnel to do these. 8221 */ 8222 ipif_refrele(ipif); 8223 return (EINVAL); 8224 } 8225 8226 /* 8227 * Send down to tunnel mod for ioctl processing. 8228 * Will finish ioctl in ip_rput_other(). 8229 */ 8230 ill = ipif->ipif_ill; 8231 if (ill->ill_net_type == IRE_LOOPBACK) { 8232 ipif_refrele(ipif); 8233 return (EOPNOTSUPP); 8234 } 8235 8236 if (ill->ill_wq == NULL) { 8237 ipif_refrele(ipif); 8238 return (ENXIO); 8239 } 8240 /* 8241 * Mark the ioctl as coming from an IPv6 interface for 8242 * tun's convenience. 8243 */ 8244 if (ill->ill_isv6) 8245 ta->ifta_flags |= 0x80000000; 8246 ci->ci_ipif = ipif; 8247 return (0); 8248 } 8249 8250 /* 8251 * Parse an ifreq or lifreq struct coming down ioctls; refhold 8252 * and return the associated ipif. 8253 * Return value: 8254 * Non-zero: An error has occurred. ci may not be filled out. 8255 * zero: ci is filled out with the ioctl cmd in ci.ci_name, and 8256 * a held ipif in ci.ci_ipif.
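 *
 * Caller sketch (illustrative; the ioctl dispatch code does this on
 * behalf of the individual handlers):
 *
 *	if ((err = ip_extract_lifreq(q, mp, ipip, &ci, func)) != 0)
 *		return (err);		err may be EINPROGRESS
 *	... operate on ci.ci_ipif and ci.ci_sin / ci.ci_sin6 ...
 *	ipif_refrele(ci.ci_ipif);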
8257 */ 8258 int 8259 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8260 cmd_info_t *ci, ipsq_func_t func) 8261 { 8262 sin_t *sin; 8263 sin6_t *sin6; 8264 char *name; 8265 struct ifreq *ifr; 8266 struct lifreq *lifr; 8267 ipif_t *ipif = NULL; 8268 ill_t *ill; 8269 conn_t *connp; 8270 boolean_t isv6; 8271 boolean_t exists; 8272 int err; 8273 mblk_t *mp1; 8274 zoneid_t zoneid; 8275 ip_stack_t *ipst; 8276 8277 if (q->q_next != NULL) { 8278 ill = (ill_t *)q->q_ptr; 8279 isv6 = ill->ill_isv6; 8280 connp = NULL; 8281 zoneid = ALL_ZONES; 8282 ipst = ill->ill_ipst; 8283 } else { 8284 ill = NULL; 8285 connp = Q_TO_CONN(q); 8286 isv6 = connp->conn_af_isv6; 8287 zoneid = connp->conn_zoneid; 8288 if (zoneid == GLOBAL_ZONEID) { 8289 /* global zone can access ipifs in all zones */ 8290 zoneid = ALL_ZONES; 8291 } 8292 ipst = connp->conn_netstack->netstack_ip; 8293 } 8294 8295 /* Has been checked in ip_wput_nondata */ 8296 mp1 = mp->b_cont->b_cont; 8297 8298 if (ipip->ipi_cmd_type == IF_CMD) { 8299 /* This is an old style SIOC[GS]IF* command */ 8300 ifr = (struct ifreq *)mp1->b_rptr; 8301 /* 8302 * Null terminate the string to protect against buffer 8303 * overrun. String was generated by user code and may not 8304 * be trusted. 8305 */ 8306 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 8307 sin = (sin_t *)&ifr->ifr_addr; 8308 name = ifr->ifr_name; 8309 ci->ci_sin = sin; 8310 ci->ci_sin6 = NULL; 8311 ci->ci_lifr = (struct lifreq *)ifr; 8312 } else { 8313 /* This is a new style SIOC[GS]LIF* command */ 8314 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 8315 lifr = (struct lifreq *)mp1->b_rptr; 8316 /* 8317 * Null terminate the string to protect against buffer 8318 * overrun. String was generated by user code and may not 8319 * be trusted. 8320 */ 8321 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 8322 name = lifr->lifr_name; 8323 sin = (sin_t *)&lifr->lifr_addr; 8324 sin6 = (sin6_t *)&lifr->lifr_addr; 8325 if (ipip->ipi_cmd == SIOCSLIFGROUPNAME) { 8326 (void) strncpy(ci->ci_groupname, lifr->lifr_groupname, 8327 LIFNAMSIZ); 8328 } 8329 ci->ci_sin = sin; 8330 ci->ci_sin6 = sin6; 8331 ci->ci_lifr = lifr; 8332 } 8333 8334 if (ipip->ipi_cmd == SIOCSLIFNAME) { 8335 /* 8336 * The ioctl will fail if it comes down 8337 * a conn stream. 8338 */ 8339 if (ill == NULL) { 8340 /* 8341 * Not an ill queue; fail the 8342 * ioctl (ENXIO). 8343 */ 8344 return (ENXIO); 8345 } 8346 ipif = ill->ill_ipif; 8347 ipif_refhold(ipif); 8348 } else { 8349 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 8350 &exists, isv6, zoneid, 8351 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err, 8352 ipst); 8353 if (ipif == NULL) { 8354 if (err == EINPROGRESS) 8355 return (err); 8356 if (ipip->ipi_cmd == SIOCLIFFAILOVER || 8357 ipip->ipi_cmd == SIOCLIFFAILBACK) { 8358 /* 8359 * Need to try both v4 and v6 since this 8360 * ioctl can come down either a v4 or v6 8361 * socket. The lifreq.lifr_family passed 8362 * down by this ioctl is AF_UNSPEC. 8363 */ 8364 ipif = ipif_lookup_on_name(name, 8365 mi_strlen(name), B_FALSE, &exists, !isv6, 8366 zoneid, (connp == NULL) ?
q : 8367 CONNP_TO_WQ(connp), mp, func, &err, ipst); 8368 if (err == EINPROGRESS) 8369 return (err); 8370 } 8371 err = 0; /* Ensure we don't use it below */ 8372 } 8373 } 8374 8375 /* 8376 * Old style [GS]IFCMD does not admit an IPv6 ipif 8377 */ 8378 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 8379 ipif_refrele(ipif); 8380 return (ENXIO); 8381 } 8382 8383 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 8384 name[0] == '\0') { 8385 /* 8386 * Handle a SIOC?IF* ioctl with a null name 8387 * during plumb (on the ill queue before the I_PLINK). 8388 */ 8389 ipif = ill->ill_ipif; 8390 ipif_refhold(ipif); 8391 } 8392 8393 if (ipif == NULL) 8394 return (ENXIO); 8395 8396 /* 8397 * Allow only GET operations if this ipif has been created 8398 * temporarily due to a MOVE operation. 8399 */ 8400 if (ipif->ipif_replace_zero && !(ipip->ipi_flags & IPI_REPL)) { 8401 ipif_refrele(ipif); 8402 return (EINVAL); 8403 } 8404 8405 ci->ci_ipif = ipif; 8406 return (0); 8407 } 8408 8409 /* 8410 * Return the total number of ipifs. 8411 */ 8412 static uint_t 8413 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 8414 { 8415 uint_t numifs = 0; 8416 ill_t *ill; 8417 ill_walk_context_t ctx; 8418 ipif_t *ipif; 8419 8420 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8421 ill = ILL_START_WALK_V4(&ctx, ipst); 8422 8423 while (ill != NULL) { 8424 for (ipif = ill->ill_ipif; ipif != NULL; 8425 ipif = ipif->ipif_next) { 8426 if (ipif->ipif_zoneid == zoneid || 8427 ipif->ipif_zoneid == ALL_ZONES) 8428 numifs++; 8429 } 8430 ill = ill_next(&ctx, ill); 8431 } 8432 rw_exit(&ipst->ips_ill_g_lock); 8433 return (numifs); 8434 } 8435 8436 /* 8437 * Return the number of ipifs matching the given family, flags, and zone. 8438 */ 8439 static uint_t 8440 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 8441 { 8442 uint_t numifs = 0; 8443 ill_t *ill; 8444 ipif_t *ipif; 8445 ill_walk_context_t ctx; 8446 8447 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 8448 8449 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8450 if (family == AF_INET) 8451 ill = ILL_START_WALK_V4(&ctx, ipst); 8452 else if (family == AF_INET6) 8453 ill = ILL_START_WALK_V6(&ctx, ipst); 8454 else 8455 ill = ILL_START_WALK_ALL(&ctx, ipst); 8456 8457 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8458 for (ipif = ill->ill_ipif; ipif != NULL; 8459 ipif = ipif->ipif_next) { 8460 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8461 !(lifn_flags & LIFC_NOXMIT)) 8462 continue; 8463 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8464 !(lifn_flags & LIFC_TEMPORARY)) 8465 continue; 8466 if (((ipif->ipif_flags & 8467 (IPIF_NOXMIT|IPIF_NOLOCAL| 8468 IPIF_DEPRECATED)) || 8469 IS_LOOPBACK(ill) || 8470 !(ipif->ipif_flags & IPIF_UP)) && 8471 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8472 continue; 8473 8474 if (zoneid != ipif->ipif_zoneid && 8475 ipif->ipif_zoneid != ALL_ZONES && 8476 (zoneid != GLOBAL_ZONEID || 8477 !(lifn_flags & LIFC_ALLZONES))) 8478 continue; 8479 8480 numifs++; 8481 } 8482 } 8483 rw_exit(&ipst->ips_ill_g_lock); 8484 return (numifs); 8485 } 8486 8487 uint_t 8488 ip_get_lifsrcofnum(ill_t *ill) 8489 { 8490 uint_t numifs = 0; 8491 ill_t *ill_head = ill; 8492 ip_stack_t *ipst = ill->ill_ipst; 8493 8494 /* 8495 * ill_g_usesrc_lock protects ill_usesrc_grp_next; for example, some 8496 * other thread may be trying to relink the ILLs in this usesrc group 8497 * and adjusting the ill_usesrc_grp_next pointers 8498 */ 8499 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8500 if ((ill->ill_usesrc_ifindex == 0) && 8501 (ill->ill_usesrc_grp_next !=
NULL)) { 8502 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8503 ill = ill->ill_usesrc_grp_next) 8504 numifs++; 8505 } 8506 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8507 8508 return (numifs); 8509 } 8510 8511 /* Null values are passed in for ipif, sin, and ifreq */ 8512 /* ARGSUSED */ 8513 int 8514 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8515 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8516 { 8517 int *nump; 8518 conn_t *connp = Q_TO_CONN(q); 8519 8520 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8521 8522 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8523 nump = (int *)mp->b_cont->b_cont->b_rptr; 8524 8525 *nump = ip_get_numifs(connp->conn_zoneid, 8526 connp->conn_netstack->netstack_ip); 8527 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8528 return (0); 8529 } 8530 8531 /* Null values are passed in for ipif, sin, and ifreq */ 8532 /* ARGSUSED */ 8533 int 8534 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8535 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8536 { 8537 struct lifnum *lifn; 8538 mblk_t *mp1; 8539 conn_t *connp = Q_TO_CONN(q); 8540 8541 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8542 8543 /* Existence checked in ip_wput_nondata */ 8544 mp1 = mp->b_cont->b_cont; 8545 8546 lifn = (struct lifnum *)mp1->b_rptr; 8547 switch (lifn->lifn_family) { 8548 case AF_UNSPEC: 8549 case AF_INET: 8550 case AF_INET6: 8551 break; 8552 default: 8553 return (EAFNOSUPPORT); 8554 } 8555 8556 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8557 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 8558 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8559 return (0); 8560 } 8561 8562 /* ARGSUSED */ 8563 int 8564 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8565 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8566 { 8567 STRUCT_HANDLE(ifconf, ifc); 8568 mblk_t *mp1; 8569 struct iocblk *iocp; 8570 struct ifreq *ifr; 8571 ill_walk_context_t ctx; 8572 ill_t *ill; 8573 ipif_t *ipif; 8574 struct sockaddr_in *sin; 8575 int32_t ifclen; 8576 zoneid_t zoneid; 8577 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8578 8579 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8580 8581 ip1dbg(("ip_sioctl_get_ifconf")); 8582 /* Existence verified in ip_wput_nondata */ 8583 mp1 = mp->b_cont->b_cont; 8584 iocp = (struct iocblk *)mp->b_rptr; 8585 zoneid = Q_TO_CONN(q)->conn_zoneid; 8586 8587 /* 8588 * The original SIOCGIFCONF passed in a struct ifconf which specified 8589 * the user buffer address and length into which the list of struct 8590 * ifreqs was to be copied. Since AT&T Streams does not seem to 8591 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8592 * the SIOCGIFCONF operation was redefined to simply provide 8593 * a large output buffer into which we are supposed to jam the ifreq 8594 * array. The same ioctl command code was used, despite the fact that 8595 * both the applications and the kernel code had to change, thus making 8596 * it impossible to support both interfaces. 8597 * 8598 * For reasons not good enough to try to explain, the following 8599 * algorithm is used for deciding what to do with one of these: 8600 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8601 * form with the output buffer coming down as the continuation message. 
8602 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8603 * and we have to copy in the ifconf structure to find out how big the 8604 * output buffer is and where to copy out to. Sure no problem... 8605 * 8606 */ 8607 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8608 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8609 int numifs = 0; 8610 size_t ifc_bufsize; 8611 8612 /* 8613 * Must be (better be!) continuation of a TRANSPARENT 8614 * IOCTL. We just copied in the ifconf structure. 8615 */ 8616 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8617 (struct ifconf *)mp1->b_rptr); 8618 8619 /* 8620 * Allocate a buffer to hold requested information. 8621 * 8622 * If ifc_len is larger than what is needed, we only 8623 * allocate what we will use. 8624 * 8625 * If ifc_len is smaller than what is needed, return 8626 * EINVAL. 8627 * 8628 * XXX: the ill_t structure can have 2 counters, for 8629 * v4 and v6 (not just ill_ipif_up_count) to store the 8630 * number of interfaces for a device, so we don't need 8631 * to count them here... 8632 */ 8633 numifs = ip_get_numifs(zoneid, ipst); 8634 8635 ifclen = STRUCT_FGET(ifc, ifc_len); 8636 ifc_bufsize = numifs * sizeof (struct ifreq); 8637 if (ifc_bufsize > ifclen) { 8638 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8639 /* old behaviour */ 8640 return (EINVAL); 8641 } else { 8642 ifc_bufsize = ifclen; 8643 } 8644 } 8645 8646 mp1 = mi_copyout_alloc(q, mp, 8647 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8648 if (mp1 == NULL) 8649 return (ENOMEM); 8650 8651 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8652 } 8653 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8654 /* 8655 * the SIOCGIFCONF ioctl only knows about 8656 * IPv4 addresses, so don't try to tell 8657 * it about interfaces with IPv6-only 8658 * addresses.
(Last parm 'isv6' is B_FALSE) */ 8659 8660 8661 ifr = (struct ifreq *)mp1->b_rptr; 8662 8663 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8664 ill = ILL_START_WALK_V4(&ctx, ipst); 8665 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8666 for (ipif = ill->ill_ipif; ipif != NULL; 8667 ipif = ipif->ipif_next) { 8668 if (zoneid != ipif->ipif_zoneid && 8669 ipif->ipif_zoneid != ALL_ZONES) 8670 continue; 8671 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8672 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8673 /* old behaviour */ 8674 rw_exit(&ipst->ips_ill_g_lock); 8675 return (EINVAL); 8676 } else { 8677 goto if_copydone; 8678 } 8679 } 8680 ipif_get_name(ipif, ifr->ifr_name, 8681 sizeof (ifr->ifr_name)); 8682 sin = (sin_t *)&ifr->ifr_addr; 8683 *sin = sin_null; 8684 sin->sin_family = AF_INET; 8685 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8686 ifr++; 8687 } 8688 } 8689 if_copydone: 8690 rw_exit(&ipst->ips_ill_g_lock); 8691 mp1->b_wptr = (uchar_t *)ifr; 8692 8693 if (STRUCT_BUF(ifc) != NULL) { 8694 STRUCT_FSET(ifc, ifc_len, 8695 (int)((uchar_t *)ifr - mp1->b_rptr)); 8696 } 8697 return (0); 8698 } 8699 8700 /* 8701 * Get the interfaces using the address hosted on the interface passed in, 8702 * as a source address. 8703 */ 8704 /* ARGSUSED */ 8705 int 8706 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8707 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8708 { 8709 mblk_t *mp1; 8710 ill_t *ill, *ill_head; 8711 ipif_t *ipif, *orig_ipif; 8712 int numlifs = 0; 8713 size_t lifs_bufsize, lifsmaxlen; 8714 struct lifreq *lifr; 8715 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8716 uint_t ifindex; 8717 zoneid_t zoneid; 8718 int err = 0; 8719 boolean_t isv6 = B_FALSE; 8720 struct sockaddr_in *sin; 8721 struct sockaddr_in6 *sin6; 8722 STRUCT_HANDLE(lifsrcof, lifs); 8723 ip_stack_t *ipst; 8724 8725 ipst = CONNQ_TO_IPST(q); 8726 8727 ASSERT(q->q_next == NULL); 8728 8729 zoneid = Q_TO_CONN(q)->conn_zoneid; 8730 8731 /* Existence verified in ip_wput_nondata */ 8732 mp1 = mp->b_cont->b_cont; 8733 8734 /* 8735 * Must be (better be!) continuation of a TRANSPARENT 8736 * IOCTL. We just copied in the lifsrcof structure.
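 * For reference, the two-pass convention a hypothetical userland caller
 * follows (illustrative values; not code from this file):
 *
 *	struct lifsrcof lifs;
 *	lifs.lifs_ifindex = ifindex;
 *	lifs.lifs_maxlen = bufsize;
 *	lifs.lifs_buf = buf;
 *	(void) ioctl(s, SIOCGLIFSRCOF, &lifs);
 *	if (lifs.lifs_len > bufsize)
 *		... retry with a buffer of lifs_len bytes ...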
8737 */ 8738 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8739 (struct lifsrcof *)mp1->b_rptr); 8740 8741 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8742 return (EINVAL); 8743 8744 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8745 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8746 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8747 ip_process_ioctl, &err, ipst); 8748 if (ipif == NULL) { 8749 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8750 ifindex)); 8751 return (err); 8752 } 8753 8754 8755 /* Allocate a buffer to hold requested information */ 8756 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8757 lifs_bufsize = numlifs * sizeof (struct lifreq); 8758 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8759 /* The actual size needed is always returned in lifs_len */ 8760 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8761 8762 /* If the amount we need is more than what is passed in, abort */ 8763 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8764 ipif_refrele(ipif); 8765 return (0); 8766 } 8767 8768 mp1 = mi_copyout_alloc(q, mp, 8769 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8770 if (mp1 == NULL) { 8771 ipif_refrele(ipif); 8772 return (ENOMEM); 8773 } 8774 8775 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8776 bzero(mp1->b_rptr, lifs_bufsize); 8777 8778 lifr = (struct lifreq *)mp1->b_rptr; 8779 8780 ill = ill_head = ipif->ipif_ill; 8781 orig_ipif = ipif; 8782 8783 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8784 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8785 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8786 8787 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8788 for (; (ill != NULL) && (ill != ill_head); 8789 ill = ill->ill_usesrc_grp_next) { 8790 8791 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8792 break; 8793 8794 ipif = ill->ill_ipif; 8795 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 8796 if (ipif->ipif_isv6) { 8797 sin6 = (sin6_t *)&lifr->lifr_addr; 8798 *sin6 = sin6_null; 8799 sin6->sin6_family = AF_INET6; 8800 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8801 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8802 &ipif->ipif_v6net_mask); 8803 } else { 8804 sin = (sin_t *)&lifr->lifr_addr; 8805 *sin = sin_null; 8806 sin->sin_family = AF_INET; 8807 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8808 lifr->lifr_addrlen = ip_mask_to_plen( 8809 ipif->ipif_net_mask); 8810 } 8811 lifr++; 8812 } 8813 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8814 rw_exit(&ipst->ips_ill_g_lock); 8815 ipif_refrele(orig_ipif); 8816 mp1->b_wptr = (uchar_t *)lifr; 8817 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8818 8819 return (0); 8820 } 8821 8822 /* ARGSUSED */ 8823 int 8824 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8825 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8826 { 8827 mblk_t *mp1; 8828 int list; 8829 ill_t *ill; 8830 ipif_t *ipif; 8831 int flags; 8832 int numlifs = 0; 8833 size_t lifc_bufsize; 8834 struct lifreq *lifr; 8835 sa_family_t family; 8836 struct sockaddr_in *sin; 8837 struct sockaddr_in6 *sin6; 8838 ill_walk_context_t ctx; 8839 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8840 int32_t lifclen; 8841 zoneid_t zoneid; 8842 STRUCT_HANDLE(lifconf, lifc); 8843 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8844 8845 ip1dbg(("ip_sioctl_get_lifconf")); 8846 8847 ASSERT(q->q_next == NULL); 8848 8849 zoneid = Q_TO_CONN(q)->conn_zoneid; 8850 8851 /* Existence verified in ip_wput_nondata */ 8852 mp1 = mp->b_cont->b_cont; 8853 8854 /* 8855 * An extended version of SIOCGIFCONF that takes an 8856 * additional address 
family and flags field. 8857 * AF_UNSPEC retrieves both IPv4 and IPv6 interfaces. 8858 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8859 * interfaces are omitted. 8860 * Similarly, IPIF_TEMPORARY interfaces are omitted 8861 * unless LIFC_TEMPORARY is specified. 8862 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8863 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8864 * interfaces that are not IPIF_UP are omitted. LIFC_EXTERNAL_SOURCE 8865 * has priority over LIFC_NOXMIT. 8866 */ 8867 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8868 8869 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8870 return (EINVAL); 8871 8872 /* 8873 * Must be (better be!) continuation of a TRANSPARENT 8874 * IOCTL. We just copied in the lifconf structure. 8875 */ 8876 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8877 8878 family = STRUCT_FGET(lifc, lifc_family); 8879 flags = STRUCT_FGET(lifc, lifc_flags); 8880 8881 switch (family) { 8882 case AF_UNSPEC: 8883 /* 8884 * walk all ILL's. 8885 */ 8886 list = MAX_G_HEADS; 8887 break; 8888 case AF_INET: 8889 /* 8890 * walk only IPV4 ILL's. 8891 */ 8892 list = IP_V4_G_HEAD; 8893 break; 8894 case AF_INET6: 8895 /* 8896 * walk only IPV6 ILL's. 8897 */ 8898 list = IP_V6_G_HEAD; 8899 break; 8900 default: 8901 return (EAFNOSUPPORT); 8902 } 8903 8904 /* 8905 * Allocate a buffer to hold requested information. 8906 * 8907 * If lifc_len is larger than what is needed, we only 8908 * allocate what we will use. 8909 * 8910 * If lifc_len is smaller than what is needed, return 8911 * EINVAL. 8912 */ 8913 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 8914 lifc_bufsize = numlifs * sizeof (struct lifreq); 8915 lifclen = STRUCT_FGET(lifc, lifc_len); 8916 if (lifc_bufsize > lifclen) { 8917 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8918 return (EINVAL); 8919 else 8920 lifc_bufsize = lifclen; 8921 } 8922 8923 mp1 = mi_copyout_alloc(q, mp, 8924 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8925 if (mp1 == NULL) 8926 return (ENOMEM); 8927 8928 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8929 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8930 8931 lifr = (struct lifreq *)mp1->b_rptr; 8932 8933 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8934 ill = ill_first(list, list, &ctx, ipst); 8935 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8936 for (ipif = ill->ill_ipif; ipif != NULL; 8937 ipif = ipif->ipif_next) { 8938 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8939 !(flags & LIFC_NOXMIT)) 8940 continue; 8941 8942 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8943 !(flags & LIFC_TEMPORARY)) 8944 continue; 8945 8946 if (((ipif->ipif_flags & 8947 (IPIF_NOXMIT|IPIF_NOLOCAL| 8948 IPIF_DEPRECATED)) || 8949 IS_LOOPBACK(ill) || 8950 !(ipif->ipif_flags & IPIF_UP)) && 8951 (flags & LIFC_EXTERNAL_SOURCE)) 8952 continue; 8953 8954 if (zoneid != ipif->ipif_zoneid && 8955 ipif->ipif_zoneid != ALL_ZONES && 8956 (zoneid != GLOBAL_ZONEID || 8957 !(flags & LIFC_ALLZONES))) 8958 continue; 8959 8960 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8961 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8962 rw_exit(&ipst->ips_ill_g_lock); 8963 return (EINVAL); 8964 } else { 8965 goto lif_copydone; 8966 } 8967 } 8968 8969 ipif_get_name(ipif, lifr->lifr_name, 8970 sizeof (lifr->lifr_name)); 8971 if (ipif->ipif_isv6) { 8972 sin6 = (sin6_t *)&lifr->lifr_addr; 8973 *sin6 = sin6_null; 8974 sin6->sin6_family = AF_INET6; 8975 sin6->sin6_addr = 8976 ipif->ipif_v6lcl_addr; 8977 lifr->lifr_addrlen = 8978 ip_mask_to_plen_v6( 8979 &ipif->ipif_v6net_mask); 8980 } else { 8981 sin = (sin_t *)&lifr->lifr_addr; 8982 *sin
= sin_null; 8983 sin->sin_family = AF_INET; 8984 sin->sin_addr.s_addr = 8985 ipif->ipif_lcl_addr; 8986 lifr->lifr_addrlen = 8987 ip_mask_to_plen( 8988 ipif->ipif_net_mask); 8989 } 8990 lifr++; 8991 } 8992 } 8993 lif_copydone: 8994 rw_exit(&ipst->ips_ill_g_lock); 8995 8996 mp1->b_wptr = (uchar_t *)lifr; 8997 if (STRUCT_BUF(lifc) != NULL) { 8998 STRUCT_FSET(lifc, lifc_len, 8999 (int)((uchar_t *)lifr - mp1->b_rptr)); 9000 } 9001 return (0); 9002 } 9003 9004 /* ARGSUSED */ 9005 int 9006 ip_sioctl_set_ipmpfailback(ipif_t *dummy_ipif, sin_t *dummy_sin, 9007 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 9008 { 9009 ip_stack_t *ipst; 9010 9011 if (q->q_next == NULL) 9012 ipst = CONNQ_TO_IPST(q); 9013 else 9014 ipst = ILLQ_TO_IPST(q); 9015 9016 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 9017 ipst->ips_ipmp_enable_failback = *(int *)mp->b_cont->b_cont->b_rptr; 9018 return (0); 9019 } 9020 9021 static void 9022 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 9023 { 9024 ip6_asp_t *table; 9025 size_t table_size; 9026 mblk_t *data_mp; 9027 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9028 ip_stack_t *ipst; 9029 9030 if (q->q_next == NULL) 9031 ipst = CONNQ_TO_IPST(q); 9032 else 9033 ipst = ILLQ_TO_IPST(q); 9034 9035 /* These two ioctls are I_STR only */ 9036 if (iocp->ioc_count == TRANSPARENT) { 9037 miocnak(q, mp, 0, EINVAL); 9038 return; 9039 } 9040 9041 data_mp = mp->b_cont; 9042 if (data_mp == NULL) { 9043 /* The user passed us a NULL argument */ 9044 table = NULL; 9045 table_size = iocp->ioc_count; 9046 } else { 9047 /* 9048 * The user provided a table. The stream head 9049 * may have copied in the user data in chunks, 9050 * so make sure everything is pulled up 9051 * properly. 9052 */ 9053 if (MBLKL(data_mp) < iocp->ioc_count) { 9054 mblk_t *new_data_mp; 9055 if ((new_data_mp = msgpullup(data_mp, -1)) == 9056 NULL) { 9057 miocnak(q, mp, 0, ENOMEM); 9058 return; 9059 } 9060 freemsg(data_mp); 9061 data_mp = new_data_mp; 9062 mp->b_cont = data_mp; 9063 } 9064 table = (ip6_asp_t *)data_mp->b_rptr; 9065 table_size = iocp->ioc_count; 9066 } 9067 9068 switch (iocp->ioc_cmd) { 9069 case SIOCGIP6ADDRPOLICY: 9070 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 9071 if (iocp->ioc_rval == -1) 9072 iocp->ioc_error = EINVAL; 9073 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 9074 else if (table != NULL && 9075 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 9076 ip6_asp_t *src = table; 9077 ip6_asp32_t *dst = (void *)table; 9078 int count = table_size / sizeof (ip6_asp_t); 9079 int i; 9080 9081 /* 9082 * We need to do an in-place shrink of the array 9083 * to match the alignment attributes of the 9084 * 32-bit ABI that will be looking at it. 9085 */ 9086 /* LINTED: logical expression always true: op "||" */ 9087 ASSERT(sizeof (*src) > sizeof (*dst)); 9088 for (i = 1; i < count; i++) 9089 bcopy(src + i, dst + i, sizeof (*dst)); 9090 } 9091 #endif 9092 break; 9093 9094 case SIOCSIP6ADDRPOLICY: 9095 ASSERT(mp->b_prev == NULL); 9096 mp->b_prev = (void *)q; 9097 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 9098 /* 9099 * We pass in the datamodel here so that the ip6_asp_replace() 9100 * routine can handle converting from 32-bit to native formats 9101 * where necessary. 9102 * 9103 * A better way to handle this might be to convert the inbound 9104 * data structure here, and hang it off a new 'mp'; thus the 9105 * ip6_asp_replace() logic would always be dealing with native 9106 * format data structures.
9107 * 9108 * (An even simpler way to handle these ioctls is to just 9109 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 9110 * and just recompile everything that depends on it.) 9111 */ 9112 #endif 9113 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 9114 iocp->ioc_flag & IOC_MODELS); 9115 return; 9116 } 9117 9118 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 9119 qreply(q, mp); 9120 } 9121 9122 static void 9123 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 9124 { 9125 mblk_t *data_mp; 9126 struct dstinforeq *dir; 9127 uint8_t *end, *cur; 9128 in6_addr_t *daddr, *saddr; 9129 ipaddr_t v4daddr; 9130 ire_t *ire; 9131 char *slabel, *dlabel; 9132 boolean_t isipv4; 9133 int match_ire; 9134 ill_t *dst_ill; 9135 ipif_t *src_ipif, *ire_ipif; 9136 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 9137 zoneid_t zoneid; 9138 ip_stack_t *ipst = CONNQ_TO_IPST(q); 9139 9140 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9141 zoneid = Q_TO_CONN(q)->conn_zoneid; 9142 9143 /* 9144 * This ioctl is I_STR only, and must have a 9145 * data mblk following the M_IOCTL mblk. 9146 */ 9147 data_mp = mp->b_cont; 9148 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 9149 miocnak(q, mp, 0, EINVAL); 9150 return; 9151 } 9152 9153 if (MBLKL(data_mp) < iocp->ioc_count) { 9154 mblk_t *new_data_mp; 9155 9156 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 9157 miocnak(q, mp, 0, ENOMEM); 9158 return; 9159 } 9160 freemsg(data_mp); 9161 data_mp = new_data_mp; 9162 mp->b_cont = data_mp; 9163 } 9164 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 9165 9166 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 9167 end - cur >= sizeof (struct dstinforeq); 9168 cur += sizeof (struct dstinforeq)) { 9169 dir = (struct dstinforeq *)cur; 9170 daddr = &dir->dir_daddr; 9171 saddr = &dir->dir_saddr; 9172 9173 /* 9174 * ip_addr_scope_v6() and ip6_asp_lookup() handle 9175 * v4 mapped addresses; ire_ftable_lookup[_v6]() 9176 * and ipif_select_source[_v6]() do not. 9177 */ 9178 dir->dir_dscope = ip_addr_scope_v6(daddr); 9179 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 9180 9181 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 9182 if (isipv4) { 9183 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 9184 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 9185 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9186 } else { 9187 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 9188 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9189 } 9190 if (ire == NULL) { 9191 dir->dir_dreachable = 0; 9192 9193 /* move on to next dst addr */ 9194 continue; 9195 } 9196 dir->dir_dreachable = 1; 9197 9198 ire_ipif = ire->ire_ipif; 9199 if (ire_ipif == NULL) 9200 goto next_dst; 9201 9202 /* 9203 * We expect to get back an interface ire or a 9204 * gateway ire cache entry. For both types, the 9205 * output interface is ire_ipif->ipif_ill. 9206 */ 9207 dst_ill = ire_ipif->ipif_ill; 9208 dir->dir_dmactype = dst_ill->ill_mactype; 9209 9210 if (isipv4) { 9211 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 9212 } else { 9213 src_ipif = ipif_select_source_v6(dst_ill, 9214 daddr, RESTRICT_TO_NONE, IPV6_PREFER_SRC_DEFAULT, 9215 zoneid); 9216 } 9217 if (src_ipif == NULL) 9218 goto next_dst; 9219 9220 *saddr = src_ipif->ipif_v6lcl_addr; 9221 dir->dir_sscope = ip_addr_scope_v6(saddr); 9222 slabel = ip6_asp_lookup(saddr, NULL, ipst); 9223 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 9224 dir->dir_sdeprecated = 9225 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 9226 ipif_refrele(src_ipif); 9227 next_dst: 9228 ire_refrele(ire); 9229 } 9230 miocack(q, mp, iocp->ioc_count, 0); 9231 } 9232 9233 9234 /* 9235 * Check if this is an address assigned to this machine. 9236 * Skips interfaces that are down by using ire checks. 9237 * Translates mapped addresses to v4 addresses and then 9238 * treats them as such, returning true if the v4 address 9239 * associated with this mapped address is configured. 9240 * Note: Applications will have to be careful what they do 9241 * with the response; use of mapped addresses limits 9242 * what can be done with the socket, especially with 9243 * respect to socket options and ioctls - neither IPv4 9244 * options nor IPv6 sticky options/ancillary data options 9245 * may be used. 9246 */ 9247 /* ARGSUSED */ 9248 int 9249 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9250 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9251 { 9252 struct sioc_addrreq *sia; 9253 sin_t *sin; 9254 ire_t *ire; 9255 mblk_t *mp1; 9256 zoneid_t zoneid; 9257 ip_stack_t *ipst; 9258 9259 ip1dbg(("ip_sioctl_tmyaddr")); 9260 9261 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9262 zoneid = Q_TO_CONN(q)->conn_zoneid; 9263 ipst = CONNQ_TO_IPST(q); 9264 9265 /* Existence verified in ip_wput_nondata */ 9266 mp1 = mp->b_cont->b_cont; 9267 sia = (struct sioc_addrreq *)mp1->b_rptr; 9268 sin = (sin_t *)&sia->sa_addr; 9269 switch (sin->sin_family) { 9270 case AF_INET6: { 9271 sin6_t *sin6 = (sin6_t *)sin; 9272 9273 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9274 ipaddr_t v4_addr; 9275 9276 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9277 v4_addr); 9278 ire = ire_ctable_lookup(v4_addr, 0, 9279 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9280 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9281 } else { 9282 in6_addr_t v6addr; 9283 9284 v6addr = sin6->sin6_addr; 9285 ire = ire_ctable_lookup_v6(&v6addr, 0, 9286 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9287 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9288 } 9289 break; 9290 } 9291 case AF_INET: { 9292 ipaddr_t v4addr; 9293 9294 v4addr = sin->sin_addr.s_addr; 9295 ire = ire_ctable_lookup(v4addr, 0, 9296 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9297 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9298 break; 9299 } 9300 default: 9301 return (EAFNOSUPPORT); 9302 } 9303 if (ire != NULL) { 9304 sia->sa_res = 1; 9305 ire_refrele(ire); 9306 } else { 9307 sia->sa_res = 0; 9308 } 9309 return (0); 9310 } 9311 9312 /* 9313 * Check if this is an address assigned on-link, i.e. a neighbor, 9314 * and make sure it's reachable from the current zone. 9315 * Returns true for my addresses as well. 9316 * Translates mapped addresses to v4 addresses and then 9317 * treats them as such, returning true if the v4 address 9318 * associated with this mapped address is configured. 9319 * Note: Applications will have to be careful what they do 9320 * with the response; use of mapped addresses limits 9321 * what can be done with the socket, especially with 9322 * respect to socket options and ioctls - neither IPv4 9323 * options nor IPv6 sticky options/ancillary data options 9324 * may be used.
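 *
 * Usage sketch (hypothetical caller of SIOCTONLINK below; SIOCTMYADDR
 * above is analogous):
 *
 *	struct sioc_addrreq sia;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sia.sa_addr;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = addr;
 *	(void) ioctl(s, SIOCTONLINK, &sia);
 *	on return, sia.sa_res is 1 if addr is on-link, else 0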
9325 */ 9326 /* ARGSUSED */ 9327 int 9328 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9329 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9330 { 9331 struct sioc_addrreq *sia; 9332 sin_t *sin; 9333 mblk_t *mp1; 9334 ire_t *ire = NULL; 9335 zoneid_t zoneid; 9336 ip_stack_t *ipst; 9337 9338 ip1dbg(("ip_sioctl_tonlink")); 9339 9340 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9341 zoneid = Q_TO_CONN(q)->conn_zoneid; 9342 ipst = CONNQ_TO_IPST(q); 9343 9344 /* Existence verified in ip_wput_nondata */ 9345 mp1 = mp->b_cont->b_cont; 9346 sia = (struct sioc_addrreq *)mp1->b_rptr; 9347 sin = (sin_t *)&sia->sa_addr; 9348 9349 /* 9350 * Match addresses with a zero gateway field to avoid 9351 * routes going through a router. 9352 * Exclude broadcast and multicast addresses. 9353 */ 9354 switch (sin->sin_family) { 9355 case AF_INET6: { 9356 sin6_t *sin6 = (sin6_t *)sin; 9357 9358 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9359 ipaddr_t v4_addr; 9360 9361 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9362 v4_addr); 9363 if (!CLASSD(v4_addr)) { 9364 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9365 NULL, NULL, zoneid, NULL, 9366 MATCH_IRE_GW, ipst); 9367 } 9368 } else { 9369 in6_addr_t v6addr; 9370 in6_addr_t v6gw; 9371 9372 v6addr = sin6->sin6_addr; 9373 v6gw = ipv6_all_zeros; 9374 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9375 ire = ire_route_lookup_v6(&v6addr, 0, 9376 &v6gw, 0, NULL, NULL, zoneid, 9377 NULL, MATCH_IRE_GW, ipst); 9378 } 9379 } 9380 break; 9381 } 9382 case AF_INET: { 9383 ipaddr_t v4addr; 9384 9385 v4addr = sin->sin_addr.s_addr; 9386 if (!CLASSD(v4addr)) { 9387 ire = ire_route_lookup(v4addr, 0, 0, 0, 9388 NULL, NULL, zoneid, NULL, 9389 MATCH_IRE_GW, ipst); 9390 } 9391 break; 9392 } 9393 default: 9394 return (EAFNOSUPPORT); 9395 } 9396 sia->sa_res = 0; 9397 if (ire != NULL) { 9398 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9399 IRE_LOCAL|IRE_LOOPBACK)) { 9400 sia->sa_res = 1; 9401 } 9402 ire_refrele(ire); 9403 } 9404 return (0); 9405 } 9406 9407 /* 9408 * TBD: implement when kernel maintains a list of site prefixes. 9409 */ 9410 /* ARGSUSED */ 9411 int 9412 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9413 ip_ioctl_cmd_t *ipip, void *ifreq) 9414 { 9415 return (ENXIO); 9416 } 9417 9418 /* ARGSUSED */ 9419 int 9420 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9421 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9422 { 9423 ill_t *ill; 9424 mblk_t *mp1; 9425 conn_t *connp; 9426 boolean_t success; 9427 9428 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 9429 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9430 /* ioctl comes down on a conn */ 9431 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9432 connp = Q_TO_CONN(q); 9433 9434 mp->b_datap->db_type = M_IOCTL; 9435 9436 /* 9437 * Send down a copy. (copymsg does not copy b_next/b_prev). 9438 * The original mp contains contaminated b_next values due to 'mi', 9439 * which is needed to do the mi_copy_done. Unfortunately if we 9440 * send down the original mblk itself and we are popped due to 9441 * an unplumb before the response comes back from the tunnel, 9442 * the streamhead (which does a freemsg) will see this contaminated 9443 * message and the assertion in freemsg about non-null b_next/b_prev 9444 * will panic a DEBUG kernel.
9445 */ 9446 mp1 = copymsg(mp); 9447 if (mp1 == NULL) 9448 return (ENOMEM); 9449 9450 ill = ipif->ipif_ill; 9451 mutex_enter(&connp->conn_lock); 9452 mutex_enter(&ill->ill_lock); 9453 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 9454 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9455 mp, 0); 9456 } else { 9457 success = ill_pending_mp_add(ill, connp, mp); 9458 } 9459 mutex_exit(&ill->ill_lock); 9460 mutex_exit(&connp->conn_lock); 9461 9462 if (success) { 9463 ip1dbg(("sending down tunparam request ")); 9464 putnext(ill->ill_wq, mp1); 9465 return (EINPROGRESS); 9466 } else { 9467 /* The conn has started closing */ 9468 freemsg(mp1); 9469 return (EINTR); 9470 } 9471 } 9472 9473 /* 9474 * ARP IOCTLs. 9475 * How does IP get in the business of fronting ARP configuration/queries? 9476 * Well it's like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9477 * are by tradition passed in through a datagram socket. That lands in IP. 9478 * As it happens, this is just as well since the interface is quite crude in 9479 * that it passes in no information about protocol or hardware types, or 9480 * interface association. After making the protocol assumption, IP is in 9481 * the position to look up the name of the ILL, which ARP will need, and 9482 * format a request that can be handled by ARP. The request is passed up 9483 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9484 * back a response. ARP supports its own set of more general IOCTLs, in 9485 * case anyone is interested. 9486 */ 9487 /* ARGSUSED */ 9488 int 9489 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9490 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9491 { 9492 mblk_t *mp1; 9493 mblk_t *mp2; 9494 mblk_t *pending_mp; 9495 ipaddr_t ipaddr; 9496 area_t *area; 9497 struct iocblk *iocp; 9498 conn_t *connp; 9499 struct arpreq *ar; 9500 struct xarpreq *xar; 9501 int flags, alength; 9502 char *lladdr; 9503 ip_stack_t *ipst; 9504 ill_t *ill = ipif->ipif_ill; 9505 boolean_t if_arp_ioctl = B_FALSE; 9506 9507 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9508 connp = Q_TO_CONN(q); 9509 ipst = connp->conn_netstack->netstack_ip; 9510 9511 if (ipip->ipi_cmd_type == XARP_CMD) { 9512 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9513 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9514 ar = NULL; 9515 9516 flags = xar->xarp_flags; 9517 lladdr = LLADDR(&xar->xarp_ha); 9518 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); 9519 /* 9520 * Validate against user's link layer address length 9521 * input and name and addr length limits. 9522 */ 9523 alength = ill->ill_phys_addr_length; 9524 if (ipip->ipi_cmd == SIOCSXARP) { 9525 if (alength != xar->xarp_ha.sdl_alen || 9526 (alength + xar->xarp_ha.sdl_nlen > 9527 sizeof (xar->xarp_ha.sdl_data))) 9528 return (EINVAL); 9529 } 9530 } else { 9531 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9532 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9533 xar = NULL; 9534 9535 flags = ar->arp_flags; 9536 lladdr = ar->arp_ha.sa_data; 9537 /* 9538 * Theoretically, the sa_family could tell us what link 9539 * layer type this operation is trying to deal with. By 9540 * common usage AF_UNSPEC means ethernet. We'll assume 9541 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9542 * for now. Our new SIOC*XARP ioctls can be used more 9543 * generally. 
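 *
 * Illustration (a minimal userland sketch, not part of the original
 * file): a classic SIOCGARP query issued over a datagram socket, per
 * the Berkeley interface described above; the address literal is
 * hypothetical:
 *
 *	struct arpreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.arp_pa;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&ar, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.168.1.1");
 *	if (ioctl(s, SIOCGARP, &ar) == 0)
 *		printf("flags 0x%x\n", ar.arp_flags);
 *
 * On success ar.arp_ha.sa_data holds the 6-byte hardware address and
 * ar.arp_flags the ATF_* flags translated from the ACE_F_* flags below.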
9544 * 9545 * If the underlying media happens to have a non-6-byte 9546 * address, the arp module will fail set/get, but the del 9547 * operation will succeed. 9548 */ 9549 alength = 6; 9550 if ((ipip->ipi_cmd != SIOCDARP) && 9551 (alength != ill->ill_phys_addr_length)) { 9552 return (EINVAL); 9553 } 9554 } 9555 9556 /* 9557 * We are going to pass up to ARP a packet chain that looks 9558 * like: 9559 * 9560 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9561 * 9562 * Get a copy of the original IOCTL mblk to head the chain, 9563 * to be sent up (in mp1). Also get another copy to store 9564 * in the ill_pending_mp list, for matching the response 9565 * when it comes back from ARP. 9566 */ 9567 mp1 = copyb(mp); 9568 pending_mp = copymsg(mp); 9569 if (mp1 == NULL || pending_mp == NULL) { 9570 if (mp1 != NULL) 9571 freeb(mp1); 9572 if (pending_mp != NULL) 9573 inet_freemsg(pending_mp); 9574 return (ENOMEM); 9575 } 9576 9577 ipaddr = sin->sin_addr.s_addr; 9578 9579 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9580 (caddr_t)&ipaddr); 9581 if (mp2 == NULL) { 9582 freeb(mp1); 9583 inet_freemsg(pending_mp); 9584 return (ENOMEM); 9585 } 9586 /* Put together the chain. */ 9587 mp1->b_cont = mp2; 9588 mp1->b_datap->db_type = M_IOCTL; 9589 mp2->b_cont = mp; 9590 mp2->b_datap->db_type = M_DATA; 9591 9592 iocp = (struct iocblk *)mp1->b_rptr; 9593 9594 /* 9595 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9596 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9597 * cp_private field (or cp_rval on 32-bit systems) in place of the 9598 * ioc_count field; set ioc_count to be correct. 9599 */ 9600 iocp->ioc_count = MBLKL(mp1->b_cont); 9601 9602 /* 9603 * Set the proper command in the ARP message. 9604 * Convert the SIOC{G|S|D}ARP calls into our 9605 * AR_ENTRY_xxx calls. 9606 */ 9607 area = (area_t *)mp2->b_rptr; 9608 switch (iocp->ioc_cmd) { 9609 case SIOCDARP: 9610 case SIOCDXARP: 9611 /* 9612 * We defer deleting the corresponding IRE until 9613 * we return from arp. 9614 */ 9615 area->area_cmd = AR_ENTRY_DELETE; 9616 area->area_proto_mask_offset = 0; 9617 break; 9618 case SIOCGARP: 9619 case SIOCGXARP: 9620 area->area_cmd = AR_ENTRY_SQUERY; 9621 area->area_proto_mask_offset = 0; 9622 break; 9623 case SIOCSARP: 9624 case SIOCSXARP: 9625 /* 9626 * Delete the corresponding ire to make sure IP will 9627 * pick up any change from arp. 9628 */ 9629 if (!if_arp_ioctl) { 9630 (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst); 9631 } else { 9632 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9633 if (ipif != NULL) { 9634 (void) ip_ire_clookup_and_delete(ipaddr, ipif, 9635 ipst); 9636 ipif_refrele(ipif); 9637 } 9638 } 9639 break; 9640 } 9641 iocp->ioc_cmd = area->area_cmd; 9642 9643 /* 9644 * Fill in the rest of the ARP operation fields. 9645 */ 9646 area->area_hw_addr_length = alength; 9647 bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength); 9648 9649 /* Translate the flags. */ 9650 if (flags & ATF_PERM) 9651 area->area_flags |= ACE_F_PERMANENT; 9652 if (flags & ATF_PUBL) 9653 area->area_flags |= ACE_F_PUBLISH; 9654 if (flags & ATF_AUTHORITY) 9655 area->area_flags |= ACE_F_AUTHORITY; 9656 9657 /* 9658 * Before sending 'mp' to ARP, we have to clear the b_next 9659 * and b_prev. Otherwise, if STREAMS encounters such a message 9660 * in freemsg() (because ARP can close at any time), it can cause 9661 * a panic. But mi code needs the b_next and b_prev values of 9662 * mp->b_cont to complete the ioctl.
So we store it here 9663 * in pending_mp->b_cont, and restore it in ip_sioctl_iocack() 9664 * when the response comes down from ARP. 9665 */ 9666 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9667 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9668 mp->b_cont->b_next = NULL; 9669 mp->b_cont->b_prev = NULL; 9670 9671 mutex_enter(&connp->conn_lock); 9672 mutex_enter(&ill->ill_lock); 9673 /* conn has not yet started closing, hence this can't fail */ 9674 VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); 9675 mutex_exit(&ill->ill_lock); 9676 mutex_exit(&connp->conn_lock); 9677 9678 /* 9679 * Up to ARP it goes. The response will come back in ip_wput() as an 9680 * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. 9681 */ 9682 putnext(ill->ill_rq, mp1); 9683 return (EINPROGRESS); 9684 } 9685 9686 /* 9687 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 9688 * the associated sin and refhold and return the associated ipif via `ci'. 9689 */ 9690 int 9691 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 9692 cmd_info_t *ci, ipsq_func_t func) 9693 { 9694 mblk_t *mp1; 9695 int err; 9696 sin_t *sin; 9697 conn_t *connp; 9698 ipif_t *ipif; 9699 ire_t *ire = NULL; 9700 ill_t *ill = NULL; 9701 boolean_t exists; 9702 ip_stack_t *ipst; 9703 struct arpreq *ar; 9704 struct xarpreq *xar; 9705 struct sockaddr_dl *sdl; 9706 9707 /* ioctl comes down on a conn */ 9708 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9709 connp = Q_TO_CONN(q); 9710 if (connp->conn_af_isv6) 9711 return (ENXIO); 9712 9713 ipst = connp->conn_netstack->netstack_ip; 9714 9715 /* Verified in ip_wput_nondata */ 9716 mp1 = mp->b_cont->b_cont; 9717 9718 if (ipip->ipi_cmd_type == XARP_CMD) { 9719 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 9720 xar = (struct xarpreq *)mp1->b_rptr; 9721 sin = (sin_t *)&xar->xarp_pa; 9722 sdl = &xar->xarp_ha; 9723 9724 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 9725 return (ENXIO); 9726 if (sdl->sdl_nlen >= LIFNAMSIZ) 9727 return (EINVAL); 9728 } else { 9729 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 9730 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 9731 ar = (struct arpreq *)mp1->b_rptr; 9732 sin = (sin_t *)&ar->arp_pa; 9733 } 9734 9735 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 9736 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 9737 B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp), 9738 mp, func, &err, ipst); 9739 if (ipif == NULL) 9740 return (err); 9741 if (ipif->ipif_id != 0 || 9742 ipif->ipif_net_type != IRE_IF_RESOLVER) { 9743 ipif_refrele(ipif); 9744 return (ENXIO); 9745 } 9746 } else { 9747 /* 9748 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with sdl_nlen == 9749 * 0: use the IP address to figure out the ill. In the IPMP 9750 * case, a simple forwarding table lookup will return the 9751 * IRE_IF_RESOLVER for the first interface in the group, which 9752 * might not be the interface on which the requested IP 9753 * address was resolved due to the ill selection algorithm 9754 * (see ip_newroute_get_dst_ill()). So we do a cache table 9755 * lookup first: if the IRE cache entry for the IP address is 9756 * still there, it will contain the ill pointer for the right 9757 * interface, so we use that. If the cache entry has been 9758 * flushed, we fall back to the forwarding table lookup. This 9759 * should be rare enough since IRE cache entries have a longer 9760 * life expectancy than ARP cache entries.
9761 */ 9762 ire = ire_cache_lookup(sin->sin_addr.s_addr, ALL_ZONES, NULL, 9763 ipst); 9764 if ((ire == NULL) || (ire->ire_type == IRE_LOOPBACK) || 9765 ((ill = ire_to_ill(ire)) == NULL) || 9766 (ill->ill_net_type != IRE_IF_RESOLVER)) { 9767 if (ire != NULL) 9768 ire_refrele(ire); 9769 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 9770 0, 0, IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, 9771 NULL, MATCH_IRE_TYPE, ipst); 9772 if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { 9773 9774 if (ire != NULL) 9775 ire_refrele(ire); 9776 return (ENXIO); 9777 } 9778 } 9779 ASSERT(ire != NULL && ill != NULL); 9780 ipif = ill->ill_ipif; 9781 ipif_refhold(ipif); 9782 ire_refrele(ire); 9783 } 9784 ci->ci_sin = sin; 9785 ci->ci_ipif = ipif; 9786 return (0); 9787 } 9788 9789 /* 9790 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9791 * atomically set/clear the muxids. Also complete the ioctl by acking or 9792 * naking it. Note that the code is structured such that the link type, 9793 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9794 * its clones use the persistent link, while pppd(1M) and perhaps many 9795 * other daemons may use a non-persistent link. When combined with some 9796 * ill_t states, linking and unlinking lower streams may be used as 9797 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9798 */ 9799 /* ARGSUSED */ 9800 void 9801 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9802 { 9803 mblk_t *mp1, *mp2; 9804 struct linkblk *li; 9805 struct ipmx_s *ipmxp; 9806 ill_t *ill; 9807 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 9808 int err = 0; 9809 boolean_t entered_ipsq = B_FALSE; 9810 boolean_t islink; 9811 ip_stack_t *ipst; 9812 9813 if (CONN_Q(q)) 9814 ipst = CONNQ_TO_IPST(q); 9815 else 9816 ipst = ILLQ_TO_IPST(q); 9817 9818 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 9819 ioccmd == I_LINK || ioccmd == I_UNLINK); 9820 9821 islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9822 9823 mp1 = mp->b_cont; /* This is the linkblk info */ 9824 li = (struct linkblk *)mp1->b_rptr; 9825 9826 /* 9827 * ARP has added this special mblk, and the utility is asking us 9828 * to perform consistency checks, and also atomically set the 9829 * muxid. Ifconfig is an example. It achieves this by using 9830 * /dev/arp as the mux to plink the arp stream, and by pushing arp 9831 * onto the /dev/udp[6] stream for use as the mux when plinking the IP 9832 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9833 * and other comments in this routine for more details. 9834 */ 9835 mp2 = mp1->b_cont; /* This is added by ARP */ 9836 9837 /* 9838 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9839 * ifconfig that didn't push ARP on top of the dummy mux, we won't 9840 * get the special mblk above. For backward compatibility, we 9841 * request ip_sioctl_plink_ipmod() to skip the consistency checks. 9842 * The utility will use SIOCSLIFMUXID to store the muxids. This is 9843 * not atomic, and can leave the streams unplumbable if the utility 9844 * is interrupted before it does the SIOCSLIFMUXID. 9845 */ 9846 if (mp2 == NULL) { 9847 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE); 9848 if (err == EINPROGRESS) 9849 return; 9850 goto done; 9851 } 9852 9853 /* 9854 * This is an I_{P}LINK sent down by ifconfig through the ARP module; 9855 * ARP has appended this last mblk to tell us whether the lower stream 9856 * is an arp-dev stream or an IP module stream.
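 *
 * For orientation (a rough userland sketch, not part of the original
 * file; the device name is hypothetical), the plumbing that creates
 * these links looks roughly like the following, and the returned mux
 * id is what ends up stored in ill_ip_muxid/ill_arp_muxid below:
 *
 *	int dev_fd, mux_fd, muxid;
 *
 *	dev_fd = open("/dev/hme", O_RDWR);
 *	(void) ioctl(dev_fd, I_PUSH, "ip");
 *	mux_fd = open("/dev/udp", O_RDWR);
 *	muxid = ioctl(mux_fd, I_PLINK, dev_fd);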
9857 */ 9858 ipmxp = (struct ipmx_s *)mp2->b_rptr; 9859 if (ipmxp->ipmx_arpdev_stream) { 9860 /* 9861 * The lower stream is the arp-dev stream. 9862 */ 9863 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 9864 q, mp, ip_sioctl_plink, &err, NULL, ipst); 9865 if (ill == NULL) { 9866 if (err == EINPROGRESS) 9867 return; 9868 err = EINVAL; 9869 goto done; 9870 } 9871 9872 if (ipsq == NULL) { 9873 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9874 NEW_OP, B_TRUE); 9875 if (ipsq == NULL) { 9876 ill_refrele(ill); 9877 return; 9878 } 9879 entered_ipsq = B_TRUE; 9880 } 9881 ASSERT(IAM_WRITER_ILL(ill)); 9882 ill_refrele(ill); 9883 9884 /* 9885 * To ensure consistency between IP and ARP, the following 9886 * LIFO scheme is used in plink/punlink (IP first, ARP last). 9887 * This is because the muxids are stored in the IP stream on 9888 * the ill. 9889 * 9890 * I_{P}LINK: ifconfig plinks the IP stream before plinking 9891 * the ARP stream. On an arp-dev stream, IP checks that it is 9892 * not yet plinked, and it also checks that the corresponding 9893 * IP stream is already plinked. 9894 * 9895 * I_{P}UNLINK: ifconfig punlinks the ARP stream before 9896 * punlinking the IP stream. IP does not allow punlink of the 9897 * IP stream unless the arp stream has been punlinked. 9898 */ 9899 if ((islink && 9900 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 9901 (!islink && ill->ill_arp_muxid != li->l_index)) { 9902 err = EINVAL; 9903 goto done; 9904 } 9905 ill->ill_arp_muxid = islink ? li->l_index : 0; 9906 } else { 9907 /* 9908 * The lower stream is probably an IP module stream. Do 9909 * consistency checking. 9910 */ 9911 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE); 9912 if (err == EINPROGRESS) 9913 return; 9914 } 9915 done: 9916 if (err == 0) 9917 miocack(q, mp, 0, 0); 9918 else 9919 miocnak(q, mp, 0, err); 9920 9921 /* Conn was refheld in ip_sioctl_copyin_setup */ 9922 if (CONN_Q(q)) 9923 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 9924 if (entered_ipsq) 9925 ipsq_exit(ipsq, B_TRUE, B_TRUE); 9926 } 9927 9928 /* 9929 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 9930 * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP 9931 * module stream). If `doconsist' is set, then do the extended consistency 9932 * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here. 9933 * Returns zero on success, EINPROGRESS if the operation is still pending, or 9934 * an error code on failure. 9935 */ 9936 static int 9937 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 9938 struct linkblk *li, boolean_t doconsist) 9939 { 9940 ill_t *ill; 9941 queue_t *ipwq, *dwq; 9942 const char *name; 9943 struct qinit *qinfo; 9944 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9945 boolean_t entered_ipsq = B_FALSE; 9946 9947 /* 9948 * Walk the lower stream to verify it's the IP module stream. 9949 * The IP module is identified by its name, wput function, 9950 * and non-NULL q_next. STREAMS ensures that the lower stream 9951 * (li->l_qbot) will not vanish until this ioctl completes. 9952 */ 9953 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 9954 qinfo = ipwq->q_qinfo; 9955 name = qinfo->qi_minfo->mi_idname; 9956 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 9957 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 9958 break; 9959 } 9960 } 9961 9962 /* 9963 * If this isn't an IP module stream, bail.
9964 */ 9965 if (ipwq == NULL) 9966 return (0); 9967 9968 ill = ipwq->q_ptr; 9969 ASSERT(ill != NULL); 9970 9971 if (ipsq == NULL) { 9972 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9973 NEW_OP, B_TRUE); 9974 if (ipsq == NULL) 9975 return (EINPROGRESS); 9976 entered_ipsq = B_TRUE; 9977 } 9978 ASSERT(IAM_WRITER_ILL(ill)); 9979 9980 if (doconsist) { 9981 /* 9982 * Consistency checking requires that I_{P}LINK occurs 9983 * prior to setting ill_ip_muxid, and that I_{P}UNLINK 9984 * occurs prior to clearing ill_arp_muxid. 9985 */ 9986 if ((islink && ill->ill_ip_muxid != 0) || 9987 (!islink && ill->ill_arp_muxid != 0)) { 9988 if (entered_ipsq) 9989 ipsq_exit(ipsq, B_TRUE, B_TRUE); 9990 return (EINVAL); 9991 } 9992 } 9993 9994 /* 9995 * As part of I_{P}LINKing, stash the number of downstream modules and 9996 * the read queue of the module immediately below IP in the ill. 9997 * These are used during the capability negotiation below. 9998 */ 9999 ill->ill_lmod_rq = NULL; 10000 ill->ill_lmod_cnt = 0; 10001 if (islink && ((dwq = ipwq->q_next) != NULL)) { 10002 ill->ill_lmod_rq = RD(dwq); 10003 for (; dwq != NULL; dwq = dwq->q_next) 10004 ill->ill_lmod_cnt++; 10005 } 10006 10007 if (doconsist) 10008 ill->ill_ip_muxid = islink ? li->l_index : 0; 10009 10010 /* 10011 * If there's at least one up ipif on this ill, then we're bound to 10012 * the underlying driver via DLPI. In that case, renegotiate 10013 * capabilities to account for any possible change in modules 10014 * interposed between IP and the driver. 10015 */ 10016 if (ill->ill_ipif_up_count > 0) { 10017 if (islink) 10018 ill_capability_probe(ill); 10019 else 10020 ill_capability_reset(ill); 10021 } 10022 10023 if (entered_ipsq) 10024 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10025 10026 return (0); 10027 } 10028 10029 /* 10030 * Search the ioctl command in the ioctl tables and return a pointer 10031 * to the ioctl command information. The ioctl command tables are 10032 * static and fully populated at compile time. 10033 */ 10034 ip_ioctl_cmd_t * 10035 ip_sioctl_lookup(int ioc_cmd) 10036 { 10037 int index; 10038 ip_ioctl_cmd_t *ipip; 10039 ip_ioctl_cmd_t *ipip_end; 10040 10041 if (ioc_cmd == IPI_DONTCARE) 10042 return (NULL); 10043 10044 /* 10045 * Do a 2 step search. First search the indexed table 10046 * based on the least significant byte of the ioctl cmd. 10047 * If we don't find a match, then search the misc table 10048 * serially. 10049 */ 10050 index = ioc_cmd & 0xFF; 10051 if (index < ip_ndx_ioctl_count) { 10052 ipip = &ip_ndx_ioctl_table[index]; 10053 if (ipip->ipi_cmd == ioc_cmd) { 10054 /* Found a match in the ndx table */ 10055 return (ipip); 10056 } 10057 } 10058 10059 /* Search the misc table */ 10060 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 10061 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 10062 if (ipip->ipi_cmd == ioc_cmd) 10063 /* Found a match in the misc table */ 10064 return (ipip); 10065 } 10066 10067 return (NULL); 10068 } 10069 10070 /* 10071 * Wrapper function for resuming deferred ioctl processing 10072 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 10073 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 10074 */ 10075 /* ARGSUSED */ 10076 void 10077 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 10078 void *dummy_arg) 10079 { 10080 ip_sioctl_copyin_setup(q, mp); 10081 } 10082 10083 /* 10084 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 10085 * that arrives. 
Most of the IOCTLs are "socket" IOCTLs which we handle 10086 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 10087 * We establish here the size of the block to be copied in. mi_copyin 10088 * arranges for this to happen, and processing continues in ip_wput with 10089 * an M_IOCDATA message. 10090 */ 10091 void 10092 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 10093 { 10094 int copyin_size; 10095 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10096 ip_ioctl_cmd_t *ipip; 10097 cred_t *cr; 10098 ip_stack_t *ipst; 10099 10100 if (CONN_Q(q)) 10101 ipst = CONNQ_TO_IPST(q); 10102 else 10103 ipst = ILLQ_TO_IPST(q); 10104 10105 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 10106 if (ipip == NULL) { 10107 /* 10108 * The ioctl is not one we understand or own. 10109 * Pass it along to be processed downstream, 10110 * if this is a module instance of IP, else nak 10111 * the ioctl. 10112 */ 10113 if (q->q_next == NULL) { 10114 goto nak; 10115 } else { 10116 putnext(q, mp); 10117 return; 10118 } 10119 } 10120 10121 /* 10122 * If this is deferred, then we will do all the checks when we 10123 * come back. 10124 */ 10125 if ((iocp->ioc_cmd == SIOCGDSTINFO || 10126 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 10127 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 10128 return; 10129 } 10130 10131 /* 10132 * Only allow a very small subset of IP ioctls on this stream if 10133 * IP is a module and not a driver. Allowing ioctls to be processed 10134 * in this case may cause assert failures or data corruption. 10135 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are among the few 10136 * ioctls allowed on an IP module stream, after which this stream 10137 * normally becomes a multiplexor (at which time the stream head 10138 * will fail all ioctls). 10139 */ 10140 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 10141 if (ipip->ipi_flags & IPI_PASS_DOWN) { 10142 /* 10143 * Pass common Streams ioctls which the IP 10144 * module does not own or consume along to 10145 * be processed downstream. 10146 */ 10147 putnext(q, mp); 10148 return; 10149 } else { 10150 goto nak; 10151 } 10152 } 10153 10154 /* Make sure we have ioctl data to process. */ 10155 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 10156 goto nak; 10157 10158 /* 10159 * Prefer dblk credential over ioctl credential; some synthesized 10160 * ioctls have kcred set because there's no way to crhold() 10161 * a credential in some contexts. (ioc_cr is not crfree()'d by 10162 * the framework; the caller of ioctl needs to hold the reference 10163 * for the duration of the call). 10164 */ 10165 cr = DB_CREDDEF(mp, iocp->ioc_cr); 10166 10167 /* Make sure normal users don't send down privileged ioctls */ 10168 if ((ipip->ipi_flags & IPI_PRIV) && 10169 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 10170 /* We checked the privilege earlier but log it here */ 10171 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 10172 return; 10173 } 10174 10175 /* 10176 * The ioctl command tables can only encode fixed length 10177 * ioctl data. If the length is variable, the table will 10178 * encode the length as zero. Such special cases are handled 10179 * below in the switch. 10180 */ 10181 if (ipip->ipi_copyin_size != 0) { 10182 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 10183 return; 10184 } 10185 10186 switch (iocp->ioc_cmd) { 10187 case O_SIOCGIFCONF: 10188 case SIOCGIFCONF: 10189 /* 10190 * This IOCTL is hilarious. See comments in 10191 * ip_sioctl_get_ifconf for the story.
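 *
 * For reference (a minimal userland sketch, not part of the original
 * file), the classic fixed-size-buffer pattern for this ioctl is:
 *
 *	struct ifconf ifc;
 *	char buf[8192];
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, &ifc) == 0)
 *		printf("%d interfaces\n",
 *		    (int)(ifc.ifc_len / sizeof (struct ifreq)));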
10192 */ 10193 if (iocp->ioc_count == TRANSPARENT) 10194 copyin_size = SIZEOF_STRUCT(ifconf, 10195 iocp->ioc_flag); 10196 else 10197 copyin_size = iocp->ioc_count; 10198 mi_copyin(q, mp, NULL, copyin_size); 10199 return; 10200 10201 case O_SIOCGLIFCONF: 10202 case SIOCGLIFCONF: 10203 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 10204 mi_copyin(q, mp, NULL, copyin_size); 10205 return; 10206 10207 case SIOCGLIFSRCOF: 10208 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 10209 mi_copyin(q, mp, NULL, copyin_size); 10210 return; 10211 case SIOCGIP6ADDRPOLICY: 10212 ip_sioctl_ip6addrpolicy(q, mp); 10213 ip6_asp_table_refrele(ipst); 10214 return; 10215 10216 case SIOCSIP6ADDRPOLICY: 10217 ip_sioctl_ip6addrpolicy(q, mp); 10218 return; 10219 10220 case SIOCGDSTINFO: 10221 ip_sioctl_dstinfo(q, mp); 10222 ip6_asp_table_refrele(ipst); 10223 return; 10224 10225 case I_PLINK: 10226 case I_PUNLINK: 10227 case I_LINK: 10228 case I_UNLINK: 10229 /* 10230 * We treat the non-persistent link similarly to the persistent 10231 * link case, in terms of plumbing/unplumbing, as well as 10232 * for indicating dynamic re-plumbing events. See comments 10233 * in ip_sioctl_plink() for more. 10234 * 10235 * The request can be enqueued in the 'ipsq' while waiting 10236 * to become exclusive. So bump up the conn ref. 10237 */ 10238 if (CONN_Q(q)) 10239 CONN_INC_REF(Q_TO_CONN(q)); 10240 ip_sioctl_plink(NULL, q, mp, NULL); 10241 return; 10242 10243 case ND_GET: 10244 case ND_SET: 10245 /* 10246 * Use of the nd table requires holding the reader lock. 10247 * Modifying the nd table thru nd_load/nd_unload requires 10248 * the writer lock. 10249 */ 10250 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 10251 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 10252 rw_exit(&ipst->ips_ip_g_nd_lock); 10253 10254 if (iocp->ioc_error) 10255 iocp->ioc_count = 0; 10256 mp->b_datap->db_type = M_IOCACK; 10257 qreply(q, mp); 10258 return; 10259 } 10260 rw_exit(&ipst->ips_ip_g_nd_lock); 10261 /* 10262 * We don't understand this subioctl of ND_GET / ND_SET.
* Maybe intended for some driver / module below us. 10264 */ 10265 if (q->q_next) { 10266 putnext(q, mp); 10267 } else { 10268 iocp->ioc_error = ENOENT; 10269 mp->b_datap->db_type = M_IOCNAK; 10270 iocp->ioc_count = 0; 10271 qreply(q, mp); 10272 } 10273 return; 10274 10275 case IP_IOCTL: 10276 ip_wput_ioctl(q, mp); 10277 return; 10278 default: 10279 cmn_err(CE_PANIC, "should not happen "); 10280 } 10281 nak: 10282 if (mp->b_cont != NULL) { 10283 freemsg(mp->b_cont); 10284 mp->b_cont = NULL; 10285 } 10286 iocp->ioc_error = EINVAL; 10287 mp->b_datap->db_type = M_IOCNAK; 10288 iocp->ioc_count = 0; 10289 qreply(q, mp); 10290 } 10291 10292 /* ip_wput hands off ARP IOCTL responses to us */ 10293 void 10294 ip_sioctl_iocack(queue_t *q, mblk_t *mp) 10295 { 10296 struct arpreq *ar; 10297 struct xarpreq *xar; 10298 area_t *area; 10299 mblk_t *area_mp; 10300 struct iocblk *iocp; 10301 mblk_t *orig_ioc_mp, *tmp; 10302 struct iocblk *orig_iocp; 10303 ill_t *ill; 10304 conn_t *connp = NULL; 10305 uint_t ioc_id; 10306 mblk_t *pending_mp; 10307 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 10308 int *flagsp; 10309 char *storage = NULL; 10310 sin_t *sin; 10311 ipaddr_t addr; 10312 int err; 10313 ip_stack_t *ipst; 10314 10315 ill = q->q_ptr; 10316 ASSERT(ill != NULL); 10317 ipst = ill->ill_ipst; 10318 10319 /* 10320 * We should get back from ARP a packet chain that looks like: 10321 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 10322 */ 10323 if (!(area_mp = mp->b_cont) || 10324 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 10325 !(orig_ioc_mp = area_mp->b_cont) || 10326 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 10327 freemsg(mp); 10328 return; 10329 } 10330 10331 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 10332 10333 tmp = (orig_ioc_mp->b_cont)->b_cont; 10334 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 10335 (orig_iocp->ioc_cmd == SIOCSXARP) || 10336 (orig_iocp->ioc_cmd == SIOCDXARP)) { 10337 x_arp_ioctl = B_TRUE; 10338 xar = (struct xarpreq *)tmp->b_rptr; 10339 sin = (sin_t *)&xar->xarp_pa; 10340 flagsp = &xar->xarp_flags; 10341 storage = xar->xarp_ha.sdl_data; 10342 if (xar->xarp_ha.sdl_nlen != 0) 10343 ifx_arp_ioctl = B_TRUE; 10344 } else { 10345 ar = (struct arpreq *)tmp->b_rptr; 10346 sin = (sin_t *)&ar->arp_pa; 10347 flagsp = &ar->arp_flags; 10348 storage = ar->arp_ha.sa_data; 10349 } 10350 10351 iocp = (struct iocblk *)mp->b_rptr; 10352 10353 /* 10354 * Pick out the originating queue based on the ioc_id. 10355 */ 10356 ioc_id = iocp->ioc_id; 10357 pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); 10358 if (pending_mp == NULL) { 10359 ASSERT(connp == NULL); 10360 inet_freemsg(mp); 10361 return; 10362 } 10363 ASSERT(connp != NULL); 10364 q = CONNP_TO_WQ(connp); 10365 10366 /* Uncouple the internally generated IOCTL from the original one */ 10367 area = (area_t *)area_mp->b_rptr; 10368 area_mp->b_cont = NULL; 10369 10370 /* 10371 * Restore the b_next and b_prev used by mi code. This is needed 10372 * to complete the ioctl using mi* functions. We stored them in 10373 * the pending mp prior to sending the request to ARP. 10374 */ 10375 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 10376 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 10377 inet_freemsg(pending_mp); 10378 10379 /* 10380 * We're done if there was an error or if this is not an SIOCG{X}ARP. 10381 * Catch the case where there is an IRE_CACHE but no entry in the 10382 * arp table.
10383 */ 10384 addr = sin->sin_addr.s_addr; 10385 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 10386 ire_t *ire; 10387 dl_unitdata_req_t *dlup; 10388 mblk_t *llmp; 10389 int addr_len; 10390 ill_t *ipsqill = NULL; 10391 10392 if (ifx_arp_ioctl) { 10393 /* 10394 * There's no need to look up the ill, since 10395 * we've already done that when we started 10396 * processing the ioctl and sent the message 10397 * to ARP on that ill. So use the ill that 10398 * is stored in q->q_ptr. 10399 */ 10400 ipsqill = ill; 10401 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10402 ipsqill->ill_ipif, ALL_ZONES, 10403 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 10404 } else { 10405 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10406 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 10407 if (ire != NULL) 10408 ipsqill = ire_to_ill(ire); 10409 } 10410 10411 if ((x_arp_ioctl) && (ipsqill != NULL)) 10412 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10413 10414 if (ire != NULL) { 10415 /* 10416 * Since the ire obtained from the cache table is used for 10417 * mac addr copying below, treat an incomplete ire as if 10418 * we never found it. 10419 */ 10420 if (ire->ire_nce != NULL && 10421 ire->ire_nce->nce_state != ND_REACHABLE) { 10422 ire_refrele(ire); 10423 ire = NULL; 10424 ipsqill = NULL; 10425 goto errack; 10426 } 10427 *flagsp = ATF_INUSE; 10428 llmp = (ire->ire_nce != NULL ? 10429 ire->ire_nce->nce_res_mp : NULL); 10430 if (llmp != NULL && ipsqill != NULL) { 10431 uchar_t *macaddr; 10432 10433 addr_len = ipsqill->ill_phys_addr_length; 10434 if (x_arp_ioctl && ((addr_len + 10435 ipsqill->ill_name_length) > 10436 sizeof (xar->xarp_ha.sdl_data))) { 10437 ire_refrele(ire); 10438 freemsg(mp); 10439 ip_ioctl_finish(q, orig_ioc_mp, 10440 EINVAL, NO_COPYOUT, NULL); 10441 return; 10442 } 10443 *flagsp |= ATF_COM; 10444 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10445 if (ipsqill->ill_sap_length < 0) 10446 macaddr = llmp->b_rptr + 10447 dlup->dl_dest_addr_offset; 10448 else 10449 macaddr = llmp->b_rptr + 10450 dlup->dl_dest_addr_offset + 10451 ipsqill->ill_sap_length; 10452 /* 10453 * For SIOCGARP, MAC address length 10454 * validation has already been done 10455 * before the ioctl was issued to ARP to 10456 * allow it to progress only on 6-byte 10457 * addressable (ethernet-like) media. Thus 10458 * the mac address copying cannot overwrite 10459 * the sa_data area below. 10460 */ 10461 bcopy(macaddr, storage, addr_len); 10462 } 10463 /* Ditch the internal IOCTL. */ 10464 freemsg(mp); 10465 ire_refrele(ire); 10466 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10467 return; 10468 } 10469 } 10470 10471 /* 10472 * Delete the corresponding IRE_CACHE if any. 10473 * Reset the error if there was one (in case there was no entry 10474 * in arp). 10475 */ 10476 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10477 ipif_t *ipintf = NULL; 10478 10479 if (ifx_arp_ioctl) { 10480 /* 10481 * There's no need to look up the ill, since 10482 * we've already done that when we started 10483 * processing the ioctl and sent the message 10484 * to ARP on that ill. So use the ill that 10485 * is stored in q->q_ptr. 10486 */ 10487 ipintf = ill->ill_ipif; 10488 } 10489 if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) { 10490 /* 10491 * The address in "addr" may be an entry for a 10492 * router. If that's true, then any off-net 10493 * IRE_CACHE entries that go through the router 10494 * with address "addr" must be clobbered. Use 10495 * ire_walk to achieve this goal.
10496 */ 10497 if (ifx_arp_ioctl) 10498 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10499 ire_delete_cache_gw, (char *)&addr, ill); 10500 else 10501 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10502 ALL_ZONES, ipst); 10503 iocp->ioc_error = 0; 10504 } 10505 } 10506 errack: 10507 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10508 err = iocp->ioc_error; 10509 freemsg(mp); 10510 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, NULL); 10511 return; 10512 } 10513 10514 /* 10515 * Completion of an SIOCG{X}ARP. Translate the information from 10516 * the area_t into the struct {x}arpreq. 10517 */ 10518 if (x_arp_ioctl) { 10519 storage += ill_xarp_info(&xar->xarp_ha, ill); 10520 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10521 sizeof (xar->xarp_ha.sdl_data)) { 10522 freemsg(mp); 10523 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, 10524 NULL); 10525 return; 10526 } 10527 } 10528 *flagsp = ATF_INUSE; 10529 if (area->area_flags & ACE_F_PERMANENT) 10530 *flagsp |= ATF_PERM; 10531 if (area->area_flags & ACE_F_PUBLISH) 10532 *flagsp |= ATF_PUBL; 10533 if (area->area_flags & ACE_F_AUTHORITY) 10534 *flagsp |= ATF_AUTHORITY; 10535 if (area->area_hw_addr_length != 0) { 10536 *flagsp |= ATF_COM; 10537 /* 10538 * For SIOCGARP, MAC address length validation has 10539 * already been done before the ioctl was issued to ARP 10540 * to allow it to progress only on 6-byte addressable 10541 * (ethernet-like) media. Thus the mac address copying 10542 * cannot overwrite the sa_data area below. 10543 */ 10544 bcopy((char *)area + area->area_hw_addr_offset, 10545 storage, area->area_hw_addr_length); 10546 } 10547 10548 /* Ditch the internal IOCTL. */ 10549 freemsg(mp); 10550 /* Complete the original. */ 10551 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, NULL); 10552 } 10553 10554 /* 10555 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 10556 * interface) create the next available logical interface for this 10557 * physical interface. 10558 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10559 * ipif with the specified name. 10560 * 10561 * If the address family is not AF_UNSPEC then set the address as well. 10562 * 10563 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10564 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer. 10565 * 10566 * Executed as a writer on the ill or ill group. 10567 * So no lock is needed to traverse the ipif chain or examine the 10568 * phyint flags. 10569 */ 10570 /* ARGSUSED */ 10571 int 10572 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10573 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10574 { 10575 mblk_t *mp1; 10576 struct lifreq *lifr; 10577 boolean_t isv6; 10578 boolean_t exists; 10579 char *name; 10580 char *endp; 10581 char *cp; 10582 int namelen; 10583 ipif_t *ipif; 10584 long id; 10585 ipsq_t *ipsq; 10586 ill_t *ill; 10587 sin_t *sin; 10588 int err = 0; 10589 boolean_t found_sep = B_FALSE; 10590 conn_t *connp; 10591 zoneid_t zoneid; 10592 int orig_ifindex = 0; 10593 ip_stack_t *ipst = CONNQ_TO_IPST(q); 10594 10595 ASSERT(q->q_next == NULL); 10596 ip1dbg(("ip_sioctl_addif\n")); 10597 /* Existence of mp1 has been checked in ip_wput_nondata */ 10598 mp1 = mp->b_cont->b_cont; 10599 /* 10600 * Null-terminate the string to protect against buffer 10601 * overrun. The string was generated by user code and may not 10602 * be trusted.
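 *
 * For context (a minimal userland sketch, not part of the original
 * file; the interface name is hypothetical), the caller's side of
 * SIOCLIFADDIF typically looks like the following; on success the
 * kernel writes the created name (e.g. "hme0:1") back into lifr_name:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCLIFADDIF, &lifr) == 0)
 *		printf("created %s\n", lifr.lifr_name);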
10603 */ 10604 lifr = (struct lifreq *)mp1->b_rptr; 10605 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10606 name = lifr->lifr_name; 10607 ASSERT(CONN_Q(q)); 10608 connp = Q_TO_CONN(q); 10609 isv6 = connp->conn_af_isv6; 10610 zoneid = connp->conn_zoneid; 10611 namelen = mi_strlen(name); 10612 if (namelen == 0) 10613 return (EINVAL); 10614 10615 exists = B_FALSE; 10616 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10617 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10618 /* 10619 * Allow creating lo0 using SIOCLIFADDIF. 10620 * There can't be any other writer thread, so we can pass null 10621 * below for the last 4 args to ipif_lookup_on_name. 10622 */ 10623 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 10624 &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); 10625 /* Prevent any further action */ 10626 if (ipif == NULL) { 10627 return (ENOBUFS); 10628 } else if (!exists) { 10629 /* We created the ipif now and as writer */ 10630 ipif_refrele(ipif); 10631 return (0); 10632 } else { 10633 ill = ipif->ipif_ill; 10634 ill_refhold(ill); 10635 ipif_refrele(ipif); 10636 } 10637 } else { 10638 /* Look for a colon in the name. */ 10639 endp = &name[namelen]; 10640 for (cp = endp; --cp > name; ) { 10641 if (*cp == IPIF_SEPARATOR_CHAR) { 10642 found_sep = B_TRUE; 10643 /* 10644 * Reject any non-decimal aliases for plumbing 10645 * of logical interfaces. Aliases with leading 10646 * zeroes are also rejected as they introduce 10647 * ambiguity in the naming of the interfaces. 10648 * Comparing with "0" takes care of all such 10649 * cases. 10650 */ 10651 if ((strncmp("0", cp+1, 1)) == 0) 10652 return (EINVAL); 10653 10654 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10655 id <= 0 || *endp != '\0') { 10656 return (EINVAL); 10657 } 10658 *cp = '\0'; 10659 break; 10660 } 10661 } 10662 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10663 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); 10664 if (found_sep) 10665 *cp = IPIF_SEPARATOR_CHAR; 10666 if (ill == NULL) 10667 return (err); 10668 } 10669 10670 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10671 B_TRUE); 10672 10673 /* 10674 * Release the refhold due to the lookup, now that we are excl 10675 * or are just returning. 10676 */ 10677 ill_refrele(ill); 10678 10679 if (ipsq == NULL) 10680 return (EINPROGRESS); 10681 10682 /* 10683 * If the interface is failed, inactive or offlined, look for a working 10684 * interface in the ill group and create the ipif there. If we can't 10685 * find a good interface, create the ipif anyway so that in.mpathd can 10686 * move it to the first repaired interface. 10687 */ 10688 if ((ill->ill_phyint->phyint_flags & 10689 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10690 ill->ill_phyint->phyint_groupname_len != 0) { 10691 phyint_t *phyi; 10692 char *groupname = ill->ill_phyint->phyint_groupname; 10693 10694 /* 10695 * We're looking for a working interface, but it doesn't matter 10696 * if it's up or down; so instead of following the group lists, 10697 * we look at each physical interface and compare the groupname. 10698 * We're only interested in interfaces with IPv4 (resp. IPv6) 10699 * plumbed when we're adding an IPv4 (resp. IPv6) ipif. 10700 * Otherwise we create the ipif on the failed interface.
10701 */ 10702 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 10703 phyi = avl_first(&ipst->ips_phyint_g_list-> 10704 phyint_list_avl_by_index); 10705 for (; phyi != NULL; 10706 phyi = avl_walk(&ipst->ips_phyint_g_list-> 10707 phyint_list_avl_by_index, 10708 phyi, AVL_AFTER)) { 10709 if (phyi->phyint_groupname_len == 0) 10710 continue; 10711 ASSERT(phyi->phyint_groupname != NULL); 10712 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0 && 10713 !(phyi->phyint_flags & 10714 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 10715 (ill->ill_isv6 ? (phyi->phyint_illv6 != NULL) : 10716 (phyi->phyint_illv4 != NULL))) { 10717 break; 10718 } 10719 } 10720 rw_exit(&ipst->ips_ill_g_lock); 10721 10722 if (phyi != NULL) { 10723 orig_ifindex = ill->ill_phyint->phyint_ifindex; 10724 ill = (ill->ill_isv6 ? phyi->phyint_illv6 : 10725 phyi->phyint_illv4); 10726 } 10727 } 10728 10729 /* 10730 * We are now exclusive on the ipsq, so an ill move will be serialized 10731 * before or after us. 10732 */ 10733 ASSERT(IAM_WRITER_ILL(ill)); 10734 ASSERT(ill->ill_move_in_progress == B_FALSE); 10735 10736 if (found_sep && orig_ifindex == 0) { 10737 /* Now see if there is an IPIF with this unit number. */ 10738 for (ipif = ill->ill_ipif; ipif != NULL; 10739 ipif = ipif->ipif_next) { 10740 if (ipif->ipif_id == id) { 10741 err = EEXIST; 10742 goto done; 10743 } 10744 } 10745 } 10746 10747 /* 10748 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10749 * of lo0. We never come here when we plumb lo0:0. It 10750 * happens in ipif_lookup_on_name. 10751 * The specified unit number is ignored when we create the ipif on a 10752 * different interface. However, we save it in ipif_orig_ipifid below so 10753 * that the ipif fails back to the right position. 10754 */ 10755 if ((ipif = ipif_allocate(ill, (found_sep && orig_ifindex == 0) ? 10756 id : -1, IRE_LOCAL, B_TRUE)) == NULL) { 10757 err = ENOBUFS; 10758 goto done; 10759 } 10760 10761 /* Return created name with ioctl */ 10762 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10763 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10764 ip1dbg(("created %s\n", lifr->lifr_name)); 10765 10766 /* Set address */ 10767 sin = (sin_t *)&lifr->lifr_addr; 10768 if (sin->sin_family != AF_UNSPEC) { 10769 err = ip_sioctl_addr(ipif, sin, q, mp, 10770 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10771 } 10772 10773 /* Set ifindex and unit number for failback */ 10774 if (err == 0 && orig_ifindex != 0) { 10775 ipif->ipif_orig_ifindex = orig_ifindex; 10776 if (found_sep) { 10777 ipif->ipif_orig_ipifid = id; 10778 } 10779 } 10780 10781 done: 10782 ipsq_exit(ipsq, B_TRUE, B_TRUE); 10783 return (err); 10784 } 10785 10786 /* 10787 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10788 * interface) delete it based on the IP address (on this physical interface). 10789 * Otherwise delete it based on the ipif_id. 10790 * Also, special handling to allow a removeif of lo0. 
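 *
 * The userland counterpart is typically the following (a minimal
 * sketch, not part of the original file; the name is hypothetical):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	(void) ioctl(s, SIOCLIFREMOVEIF, &lifr);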
10791 */ 10792 /* ARGSUSED */ 10793 int 10794 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10795 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10796 { 10797 conn_t *connp; 10798 ill_t *ill = ipif->ipif_ill; 10799 boolean_t success; 10800 ip_stack_t *ipst; 10801 10802 ipst = CONNQ_TO_IPST(q); 10803 10804 ASSERT(q->q_next == NULL); 10805 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10806 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10807 ASSERT(IAM_WRITER_IPIF(ipif)); 10808 10809 connp = Q_TO_CONN(q); 10810 /* 10811 * Special case for unplumbing lo0 (the loopback physical interface). 10812 * If unplumbing lo0, the incoming address structure has been 10813 * initialized to all zeros. When unplumbing lo0, all its logical 10814 * interfaces must be removed too. 10815 * 10816 * Note that this interface may be called to remove a specific 10817 * loopback logical interface (e.g., lo0:1). But in that case 10818 * ipif->ipif_id != 0, so the code path for that case is the 10819 * same as any other interface (meaning it skips the code directly 10820 * below). 10821 */ 10822 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10823 if (sin->sin_family == AF_UNSPEC && 10824 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10825 /* 10826 * Mark it condemned. No new refs will be made to the ill. 10827 */ 10828 mutex_enter(&ill->ill_lock); 10829 ill->ill_state_flags |= ILL_CONDEMNED; 10830 for (ipif = ill->ill_ipif; ipif != NULL; 10831 ipif = ipif->ipif_next) { 10832 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10833 } 10834 mutex_exit(&ill->ill_lock); 10835 10836 ipif = ill->ill_ipif; 10837 /* unplumb the loopback interface */ 10838 ill_delete(ill); 10839 mutex_enter(&connp->conn_lock); 10840 mutex_enter(&ill->ill_lock); 10841 ASSERT(ill->ill_group == NULL); 10842 10843 /* Are any references to this ill active */ 10844 if (ill_is_quiescent(ill)) { 10845 mutex_exit(&ill->ill_lock); 10846 mutex_exit(&connp->conn_lock); 10847 ill_delete_tail(ill); 10848 mi_free(ill); 10849 return (0); 10850 } 10851 success = ipsq_pending_mp_add(connp, ipif, 10852 CONNP_TO_WQ(connp), mp, ILL_FREE); 10853 mutex_exit(&connp->conn_lock); 10854 mutex_exit(&ill->ill_lock); 10855 if (success) 10856 return (EINPROGRESS); 10857 else 10858 return (EINTR); 10859 } 10860 } 10861 10862 /* 10863 * We are exclusive on the ipsq, so an ill move will be serialized 10864 * before or after us. 10865 */ 10866 ASSERT(ill->ill_move_in_progress == B_FALSE); 10867 10868 if (ipif->ipif_id == 0) { 10869 /* Find based on address */ 10870 if (ipif->ipif_isv6) { 10871 sin6_t *sin6; 10872 10873 if (sin->sin_family != AF_INET6) 10874 return (EAFNOSUPPORT); 10875 10876 sin6 = (sin6_t *)sin; 10877 /* We are a writer, so we should be able to lookup */ 10878 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10879 ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); 10880 if (ipif == NULL) { 10881 /* 10882 * Maybe the address is on another interface in 10883 * the same IPMP group? We check this below.
10884 */ 10885 ipif = ipif_lookup_addr_v6(&sin6->sin6_addr, 10886 NULL, ALL_ZONES, NULL, NULL, NULL, NULL, 10887 ipst); 10888 } 10889 } else { 10890 ipaddr_t addr; 10891 10892 if (sin->sin_family != AF_INET) 10893 return (EAFNOSUPPORT); 10894 10895 addr = sin->sin_addr.s_addr; 10896 /* We are a writer, so we should be able to lookup */ 10897 ipif = ipif_lookup_addr(addr, ill, ALL_ZONES, NULL, 10898 NULL, NULL, NULL, ipst); 10899 if (ipif == NULL) { 10900 /* 10901 * Maybe the address is on another interface in 10902 * the same IPMP group? We check this below. 10903 */ 10904 ipif = ipif_lookup_addr(addr, NULL, ALL_ZONES, 10905 NULL, NULL, NULL, NULL, ipst); 10906 } 10907 } 10908 if (ipif == NULL) { 10909 return (EADDRNOTAVAIL); 10910 } 10911 /* 10912 * When the address to be removed is hosted on a different 10913 * interface, we check if the interface is in the same IPMP 10914 * group as the specified one; if so we proceed with the 10915 * removal. 10916 * ill->ill_group is NULL when the ill is down, so we have to 10917 * compare the group names instead. 10918 */ 10919 if (ipif->ipif_ill != ill && 10920 (ipif->ipif_ill->ill_phyint->phyint_groupname_len == 0 || 10921 ill->ill_phyint->phyint_groupname_len == 0 || 10922 mi_strcmp(ipif->ipif_ill->ill_phyint->phyint_groupname, 10923 ill->ill_phyint->phyint_groupname) != 0)) { 10924 ipif_refrele(ipif); 10925 return (EADDRNOTAVAIL); 10926 } 10927 10928 /* This is a writer */ 10929 ipif_refrele(ipif); 10930 } 10931 10932 /* 10933 * Cannot delete instance zero since it is tied to the ill. 10934 */ 10935 if (ipif->ipif_id == 0) 10936 return (EBUSY); 10937 10938 mutex_enter(&ill->ill_lock); 10939 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10940 mutex_exit(&ill->ill_lock); 10941 10942 ipif_free(ipif); 10943 10944 mutex_enter(&connp->conn_lock); 10945 mutex_enter(&ill->ill_lock); 10946 10947 /* Are any references to this ipif active */ 10948 if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { 10949 mutex_exit(&ill->ill_lock); 10950 mutex_exit(&connp->conn_lock); 10951 ipif_non_duplicate(ipif); 10952 ipif_down_tail(ipif); 10953 ipif_free_tail(ipif); 10954 return (0); 10955 } 10956 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10957 IPIF_FREE); 10958 mutex_exit(&ill->ill_lock); 10959 mutex_exit(&connp->conn_lock); 10960 if (success) 10961 return (EINPROGRESS); 10962 else 10963 return (EINTR); 10964 } 10965 10966 /* 10967 * Restart the removeif ioctl. The refcnt has gone down to 0. 10968 * The ipif is already condemned. So we can't find it thru lookups. 10969 */ 10970 /* ARGSUSED */ 10971 int 10972 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10973 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10974 { 10975 ill_t *ill = ipif->ipif_ill; 10976 10977 ASSERT(IAM_WRITER_IPIF(ipif)); 10978 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10979 10980 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10981 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10982 10983 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10984 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 10985 ill_delete_tail(ill); 10986 mi_free(ill); 10987 return (0); 10988 } 10989 10990 ipif_non_duplicate(ipif); 10991 ipif_down_tail(ipif); 10992 ipif_free_tail(ipif); 10993 10994 ILL_UNMARK_CHANGING(ill); 10995 return (0); 10996 } 10997 10998 /* 10999 * Set the local interface address. 11000 * Allow an address of all zero when the interface is down.
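 *
 * Userland reaches this through SIOCSLIFADDR, roughly as follows (a
 * minimal sketch, not part of the original file; the name and address
 * are hypothetical):
 *
 *	struct lifreq lifr;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&lifr.lifr_addr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.1");
 *	(void) ioctl(s, SIOCSLIFADDR, &lifr);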
11001 */ 11002 /* ARGSUSED */ 11003 int 11004 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11005 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 11006 { 11007 int err = 0; 11008 in6_addr_t v6addr; 11009 boolean_t need_up = B_FALSE; 11010 11011 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 11012 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11013 11014 ASSERT(IAM_WRITER_IPIF(ipif)); 11015 11016 if (ipif->ipif_isv6) { 11017 sin6_t *sin6; 11018 ill_t *ill; 11019 phyint_t *phyi; 11020 11021 if (sin->sin_family != AF_INET6) 11022 return (EAFNOSUPPORT); 11023 11024 sin6 = (sin6_t *)sin; 11025 v6addr = sin6->sin6_addr; 11026 ill = ipif->ipif_ill; 11027 phyi = ill->ill_phyint; 11028 11029 /* 11030 * Enforce that true multicast interfaces have a link-local 11031 * address for logical unit 0. 11032 */ 11033 if (ipif->ipif_id == 0 && 11034 (ill->ill_flags & ILLF_MULTICAST) && 11035 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 11036 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 11037 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 11038 return (EADDRNOTAVAIL); 11039 } 11040 11041 /* 11042 * Up interfaces shouldn't have the unspecified address 11043 * unless they also have the IPIF_NOLOCAL flag set and 11044 * have a subnet assigned. 11045 */ 11046 if ((ipif->ipif_flags & IPIF_UP) && 11047 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 11048 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 11049 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 11050 return (EADDRNOTAVAIL); 11051 } 11052 11053 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11054 return (EADDRNOTAVAIL); 11055 } else { 11056 ipaddr_t addr; 11057 11058 if (sin->sin_family != AF_INET) 11059 return (EAFNOSUPPORT); 11060 11061 addr = sin->sin_addr.s_addr; 11062 11063 /* Allow 0 as the local address. */ 11064 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11065 return (EADDRNOTAVAIL); 11066 11067 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11068 } 11069 11070 11071 /* 11072 * Even if there is no change, we redo things just to rerun 11073 * ipif_set_default. 11074 */ 11075 if (ipif->ipif_flags & IPIF_UP) { 11076 /* 11077 * When setting a new local address, make sure 11078 * we have net and subnet bcast ire's for 11079 * the old address if we need them. 11080 */ 11081 if (!ipif->ipif_isv6) 11082 ipif_check_bcast_ires(ipif); 11083 /* 11084 * If the interface is already marked up, 11085 * we call ipif_down which will take care 11086 * of ditching any IREs that have been set 11087 * up based on the old interface address. 11088 */ 11089 err = ipif_logical_down(ipif, q, mp); 11090 if (err == EINPROGRESS) 11091 return (err); 11092 ipif_down_tail(ipif); 11093 need_up = B_TRUE; 11094 } 11095 11096 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 11097 return (err); 11098 } 11099 11100 int 11101 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11102 boolean_t need_up) 11103 { 11104 in6_addr_t v6addr; 11105 in6_addr_t ov6addr; 11106 ipaddr_t addr; 11107 sin6_t *sin6; 11108 int sinlen; 11109 int err = 0; 11110 ill_t *ill = ipif->ipif_ill; 11111 boolean_t need_dl_down; 11112 boolean_t need_arp_down; 11113 struct iocblk *iocp; 11114 11115 iocp = (mp != NULL) ?
(struct iocblk *)mp->b_rptr : NULL; 11116 11117 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 11118 ill->ill_name, ipif->ipif_id, (void *)ipif)); 11119 ASSERT(IAM_WRITER_IPIF(ipif)); 11120 11121 /* Must cancel any pending timer before taking the ill_lock */ 11122 if (ipif->ipif_recovery_id != 0) 11123 (void) untimeout(ipif->ipif_recovery_id); 11124 ipif->ipif_recovery_id = 0; 11125 11126 if (ipif->ipif_isv6) { 11127 sin6 = (sin6_t *)sin; 11128 v6addr = sin6->sin6_addr; 11129 sinlen = sizeof (struct sockaddr_in6); 11130 } else { 11131 addr = sin->sin_addr.s_addr; 11132 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11133 sinlen = sizeof (struct sockaddr_in); 11134 } 11135 mutex_enter(&ill->ill_lock); 11136 ov6addr = ipif->ipif_v6lcl_addr; 11137 ipif->ipif_v6lcl_addr = v6addr; 11138 sctp_update_ipif_addr(ipif, ov6addr); 11139 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 11140 ipif->ipif_v6src_addr = ipv6_all_zeros; 11141 } else { 11142 ipif->ipif_v6src_addr = v6addr; 11143 } 11144 ipif->ipif_addr_ready = 0; 11145 11146 /* 11147 * If the interface was previously marked as a duplicate, then since 11148 * we've now got a "new" address, it should no longer be considered a 11149 * duplicate -- even if the "new" address is the same as the old one. 11150 * Note that if all ipifs are down, we may have a pending ARP down 11151 * event to handle. This is because we want to recover from duplicates 11152 * and thus delay tearing down ARP until the duplicates have been 11153 * removed or disabled. 11154 */ 11155 need_dl_down = need_arp_down = B_FALSE; 11156 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11157 need_arp_down = !need_up; 11158 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11159 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11160 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11161 need_dl_down = B_TRUE; 11162 } 11163 } 11164 11165 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 11166 !ill->ill_is_6to4tun) { 11167 queue_t *wqp = ill->ill_wq; 11168 11169 /* 11170 * The local address of this interface is a 6to4 address; 11171 * check whether this interface is in fact a 6to4 tunnel or just 11172 * an interface configured with a 6to4 address. We are only 11173 * interested in the former. 11174 */ 11175 if (wqp != NULL) { 11176 while ((wqp->q_next != NULL) && 11177 (wqp->q_next->q_qinfo != NULL) && 11178 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 11179 11180 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 11181 == TUN6TO4_MODID) { 11182 /* set for use in IP */ 11183 ill->ill_is_6to4tun = 1; 11184 break; 11185 } 11186 wqp = wqp->q_next; 11187 } 11188 } 11189 } 11190 11191 ipif_set_default(ipif); 11192 11193 /* 11194 * When publishing an interface address change event, we only notify 11195 * the event listeners of the new address. It is assumed that if they 11196 * actively care about the addresses assigned, they will have 11197 * already discovered the previous address assigned (if there was one). 11198 * 11199 * Don't attach nic event message for SIOCLIFADDIF ioctl.
11200 */ 11201 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 11202 hook_nic_event_t *info; 11203 if ((info = ipif->ipif_ill->ill_nic_event_info) != NULL) { 11204 ip2dbg(("ip_sioctl_addr_tail: unexpected nic event %d " 11205 "attached for %s\n", info->hne_event, 11206 ill->ill_name)); 11207 if (info->hne_data != NULL) 11208 kmem_free(info->hne_data, info->hne_datalen); 11209 kmem_free(info, sizeof (hook_nic_event_t)); 11210 } 11211 11212 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 11213 if (info != NULL) { 11214 ip_stack_t *ipst = ill->ill_ipst; 11215 11216 info->hne_nic = 11217 ipif->ipif_ill->ill_phyint->phyint_hook_ifindex; 11218 info->hne_lif = MAP_IPIF_ID(ipif->ipif_id); 11219 info->hne_event = NE_ADDRESS_CHANGE; 11220 info->hne_family = ipif->ipif_isv6 ? 11221 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 11222 info->hne_data = kmem_alloc(sinlen, KM_NOSLEEP); 11223 if (info->hne_data != NULL) { 11224 info->hne_datalen = sinlen; 11225 bcopy(sin, info->hne_data, sinlen); 11226 } else { 11227 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11228 "address information for ADDRESS_CHANGE nic" 11229 " event of %s (ENOMEM)\n", 11230 ipif->ipif_ill->ill_name)); 11231 kmem_free(info, sizeof (hook_nic_event_t)); 11232 } 11233 } else 11234 ip2dbg(("ip_sioctl_addr_tail: could not attach " 11235 "ADDRESS_CHANGE nic event information for %s " 11236 "(ENOMEM)\n", ipif->ipif_ill->ill_name)); 11237 11238 ipif->ipif_ill->ill_nic_event_info = info; 11239 } 11240 11241 mutex_exit(&ill->ill_lock); 11242 11243 if (need_up) { 11244 /* 11245 * Now bring the interface back up. If this 11246 * is the only IPIF for the ILL, ipif_up 11247 * will have to re-bind to the device, so 11248 * we may get back EINPROGRESS, in which 11249 * case, this IOCTL will get completed in 11250 * ip_rput_dlpi when we see the DL_BIND_ACK. 11251 */ 11252 err = ipif_up(ipif, q, mp); 11253 } 11254 11255 if (need_dl_down) 11256 ill_dl_down(ill); 11257 if (need_arp_down) 11258 ipif_arp_down(ipif); 11259 11260 return (err); 11261 } 11262 11263 11264 /* 11265 * Restart entry point to restart the address set operation after the 11266 * refcounts have dropped to zero. 11267 */ 11268 /* ARGSUSED */ 11269 int 11270 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11271 ip_ioctl_cmd_t *ipip, void *ifreq) 11272 { 11273 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11274 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11275 ASSERT(IAM_WRITER_IPIF(ipif)); 11276 ipif_down_tail(ipif); 11277 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11278 } 11279 11280 /* ARGSUSED */ 11281 int 11282 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11283 ip_ioctl_cmd_t *ipip, void *if_req) 11284 { 11285 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11286 struct lifreq *lifr = (struct lifreq *)if_req; 11287 11288 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11289 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11290 /* 11291 * The net mask and address can't change since we have a 11292 * reference to the ipif. So no lock is necessary. 
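 *
 * The matching userland query is roughly the following (a minimal
 * sketch, not part of the original file; the name is hypothetical):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFADDR, &lifr) == 0)
 *		printf("prefixlen %d\n", lifr.lifr_addrlen);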
11293 */ 11294 if (ipif->ipif_isv6) { 11295 *sin6 = sin6_null; 11296 sin6->sin6_family = AF_INET6; 11297 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11298 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11299 lifr->lifr_addrlen = 11300 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11301 } else { 11302 *sin = sin_null; 11303 sin->sin_family = AF_INET; 11304 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11305 if (ipip->ipi_cmd_type == LIF_CMD) { 11306 lifr->lifr_addrlen = 11307 ip_mask_to_plen(ipif->ipif_net_mask); 11308 } 11309 } 11310 return (0); 11311 } 11312 11313 /* 11314 * Set the destination address for a pt-pt interface. 11315 */ 11316 /* ARGSUSED */ 11317 int 11318 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11319 ip_ioctl_cmd_t *ipip, void *if_req) 11320 { 11321 int err = 0; 11322 in6_addr_t v6addr; 11323 boolean_t need_up = B_FALSE; 11324 11325 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11326 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11327 ASSERT(IAM_WRITER_IPIF(ipif)); 11328 11329 if (ipif->ipif_isv6) { 11330 sin6_t *sin6; 11331 11332 if (sin->sin_family != AF_INET6) 11333 return (EAFNOSUPPORT); 11334 11335 sin6 = (sin6_t *)sin; 11336 v6addr = sin6->sin6_addr; 11337 11338 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11339 return (EADDRNOTAVAIL); 11340 } else { 11341 ipaddr_t addr; 11342 11343 if (sin->sin_family != AF_INET) 11344 return (EAFNOSUPPORT); 11345 11346 addr = sin->sin_addr.s_addr; 11347 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11348 return (EADDRNOTAVAIL); 11349 11350 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11351 } 11352 11353 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11354 return (0); /* No change */ 11355 11356 if (ipif->ipif_flags & IPIF_UP) { 11357 /* 11358 * If the interface is already marked up, 11359 * we call ipif_down which will take care 11360 * of ditching any IREs that have been set 11361 * up based on the old pp dst address. 11362 */ 11363 err = ipif_logical_down(ipif, q, mp); 11364 if (err == EINPROGRESS) 11365 return (err); 11366 ipif_down_tail(ipif); 11367 need_up = B_TRUE; 11368 } 11369 /* 11370 * could return EINPROGRESS. If so ioctl will complete in 11371 * ip_rput_dlpi_writer 11372 */ 11373 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 11374 return (err); 11375 } 11376 11377 static int 11378 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11379 boolean_t need_up) 11380 { 11381 in6_addr_t v6addr; 11382 ill_t *ill = ipif->ipif_ill; 11383 int err = 0; 11384 boolean_t need_dl_down; 11385 boolean_t need_arp_down; 11386 11387 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 11388 ipif->ipif_id, (void *)ipif)); 11389 11390 /* Must cancel any pending timer before taking the ill_lock */ 11391 if (ipif->ipif_recovery_id != 0) 11392 (void) untimeout(ipif->ipif_recovery_id); 11393 ipif->ipif_recovery_id = 0; 11394 11395 if (ipif->ipif_isv6) { 11396 sin6_t *sin6; 11397 11398 sin6 = (sin6_t *)sin; 11399 v6addr = sin6->sin6_addr; 11400 } else { 11401 ipaddr_t addr; 11402 11403 addr = sin->sin_addr.s_addr; 11404 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11405 } 11406 mutex_enter(&ill->ill_lock); 11407 /* Set point to point destination address. */ 11408 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11409 /* 11410 * Allow this as a means of creating logical 11411 * pt-pt interfaces on top of e.g. an Ethernet. 11412 * XXX Undocumented HACK for testing. 11413 * pt-pt interfaces are created with NUD disabled. 
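 * For example (hypothetical usage, shown only as an illustration): "ifconfig hme0:1 10.0.0.1 destination 10.0.0.2 up" would turn the hme0:1 logical interface into a pt-pt interface even though the underlying hme0 is an Ethernet.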
11414 */ 11415 ipif->ipif_flags |= IPIF_POINTOPOINT; 11416 ipif->ipif_flags &= ~IPIF_BROADCAST; 11417 if (ipif->ipif_isv6) 11418 ill->ill_flags |= ILLF_NONUD; 11419 } 11420 11421 /* 11422 * If the interface was previously marked as a duplicate, then since 11423 * we've now got a "new" address, it should no longer be considered a 11424 * duplicate -- even if the "new" address is the same as the old one. 11425 * Note that if all ipifs are down, we may have a pending ARP down 11426 * event to handle. 11427 */ 11428 need_dl_down = need_arp_down = B_FALSE; 11429 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11430 need_arp_down = !need_up; 11431 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11432 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11433 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11434 need_dl_down = B_TRUE; 11435 } 11436 } 11437 11438 /* Set the new address. */ 11439 ipif->ipif_v6pp_dst_addr = v6addr; 11440 /* Make sure subnet tracks pp_dst */ 11441 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 11442 mutex_exit(&ill->ill_lock); 11443 11444 if (need_up) { 11445 /* 11446 * Now bring the interface back up. If this 11447 * is the only IPIF for the ILL, ipif_up 11448 * will have to re-bind to the device, so 11449 * we may get back EINPROGRESS, in which 11450 * case, this IOCTL will get completed in 11451 * ip_rput_dlpi when we see the DL_BIND_ACK. 11452 */ 11453 err = ipif_up(ipif, q, mp); 11454 } 11455 11456 if (need_dl_down) 11457 ill_dl_down(ill); 11458 11459 if (need_arp_down) 11460 ipif_arp_down(ipif); 11461 return (err); 11462 } 11463 11464 /* 11465 * Restart entry point to restart the dstaddress set operation after the 11466 * refcounts have dropped to zero. 11467 */ 11468 /* ARGSUSED */ 11469 int 11470 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11471 ip_ioctl_cmd_t *ipip, void *ifreq) 11472 { 11473 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 11474 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11475 ipif_down_tail(ipif); 11476 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 11477 } 11478 11479 /* ARGSUSED */ 11480 int 11481 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11482 ip_ioctl_cmd_t *ipip, void *if_req) 11483 { 11484 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11485 11486 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 11487 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11488 /* 11489 * Get point to point destination address. The addresses can't 11490 * change since we hold a reference to the ipif. 11491 */ 11492 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 11493 return (EADDRNOTAVAIL); 11494 11495 if (ipif->ipif_isv6) { 11496 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11497 *sin6 = sin6_null; 11498 sin6->sin6_family = AF_INET6; 11499 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 11500 } else { 11501 *sin = sin_null; 11502 sin->sin_family = AF_INET; 11503 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 11504 } 11505 return (0); 11506 } 11507 11508 /* 11509 * XXX As part of IPMP, make this function return the active/inactive state so 11510 * the caller can set it once atomically instead of via multiple mutex_enter/mutex_exit pairs. 11511 */ 11512 /* 11513 * This function either sets or clears the IFF_INACTIVE flag. 11514 * 11515 * As long as there are some addresses or multicast memberships on the 11516 * IPv4 or IPv6 interface of the "phyi" that do not belong here, we 11517 * will consider it to be ACTIVE (clear IFF_INACTIVE), i.e., the interface 11518 * will be used for outbound packets.
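 * ("Do not belong here" means that ipif_orig_ifindex or ilm_orig_ifindex differs from phyint_ifindex, i.e., an address or membership that failed over onto this phyint from another interface in the group; see the loops below.)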
11519 * 11520 * Caller needs to verify the validity of setting IFF_INACTIVE. 11521 */ 11522 static void 11523 phyint_inactive(phyint_t *phyi) 11524 { 11525 ill_t *ill_v4; 11526 ill_t *ill_v6; 11527 ipif_t *ipif; 11528 ilm_t *ilm; 11529 11530 ill_v4 = phyi->phyint_illv4; 11531 ill_v6 = phyi->phyint_illv6; 11532 11533 /* 11534 * No need for a lock while traversing the list since I am 11535 * a writer. 11536 */ 11537 if (ill_v4 != NULL) { 11538 ASSERT(IAM_WRITER_ILL(ill_v4)); 11539 for (ipif = ill_v4->ill_ipif; ipif != NULL; 11540 ipif = ipif->ipif_next) { 11541 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11542 mutex_enter(&phyi->phyint_lock); 11543 phyi->phyint_flags &= ~PHYI_INACTIVE; 11544 mutex_exit(&phyi->phyint_lock); 11545 return; 11546 } 11547 } 11548 for (ilm = ill_v4->ill_ilm; ilm != NULL; 11549 ilm = ilm->ilm_next) { 11550 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11551 mutex_enter(&phyi->phyint_lock); 11552 phyi->phyint_flags &= ~PHYI_INACTIVE; 11553 mutex_exit(&phyi->phyint_lock); 11554 return; 11555 } 11556 } 11557 } 11558 if (ill_v6 != NULL) { 11559 ill_v6 = phyi->phyint_illv6; 11560 for (ipif = ill_v6->ill_ipif; ipif != NULL; 11561 ipif = ipif->ipif_next) { 11562 if (ipif->ipif_orig_ifindex != phyi->phyint_ifindex) { 11563 mutex_enter(&phyi->phyint_lock); 11564 phyi->phyint_flags &= ~PHYI_INACTIVE; 11565 mutex_exit(&phyi->phyint_lock); 11566 return; 11567 } 11568 } 11569 for (ilm = ill_v6->ill_ilm; ilm != NULL; 11570 ilm = ilm->ilm_next) { 11571 if (ilm->ilm_orig_ifindex != phyi->phyint_ifindex) { 11572 mutex_enter(&phyi->phyint_lock); 11573 phyi->phyint_flags &= ~PHYI_INACTIVE; 11574 mutex_exit(&phyi->phyint_lock); 11575 return; 11576 } 11577 } 11578 } 11579 mutex_enter(&phyi->phyint_lock); 11580 phyi->phyint_flags |= PHYI_INACTIVE; 11581 mutex_exit(&phyi->phyint_lock); 11582 } 11583 11584 /* 11585 * This function is called only when the phyint flags change. Currently 11586 * called from ip_sioctl_flags. We re-do the broadcast nomination so 11587 * that we can select a good ill. 11588 */ 11589 static void 11590 ip_redo_nomination(phyint_t *phyi) 11591 { 11592 ill_t *ill_v4; 11593 11594 ill_v4 = phyi->phyint_illv4; 11595 11596 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 11597 ASSERT(IAM_WRITER_ILL(ill_v4)); 11598 if (ill_v4->ill_group->illgrp_ill_count > 1) 11599 ill_nominate_bcast_rcv(ill_v4->ill_group); 11600 } 11601 } 11602 11603 /* 11604 * Heuristic to check if ill is INACTIVE. 11605 * Checks if ill has an ipif with a usable IP address. 11606 * 11607 * Return values: 11608 * B_TRUE - ill is INACTIVE; has no usable ipif 11609 * B_FALSE - ill is not INACTIVE; ill has at least one usable ipif 11610 */ 11611 static boolean_t 11612 ill_is_inactive(ill_t *ill) 11613 { 11614 ipif_t *ipif; 11615 11616 /* Check whether it is in an IPMP group */ 11617 if (ill->ill_phyint->phyint_groupname == NULL) 11618 return (B_FALSE); 11619 11620 if (ill->ill_ipif_up_count == 0) 11621 return (B_TRUE); 11622 11623 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 11624 uint64_t flags = ipif->ipif_flags; 11625 11626 /* 11627 * This ipif is usable if it is IPIF_UP and not a 11628 * dedicated test address. A dedicated test address 11629 * is marked IPIF_NOFAILOVER *and* IPIF_DEPRECATED 11630 * (note in particular that V6 test addresses are 11631 * link-local data addresses and thus are marked 11632 * IPIF_NOFAILOVER but not IPIF_DEPRECATED).
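 * A sketch of the resulting test, assuming only the flags shown are relevant: IPIF_UP alone is usable; IPIF_UP|IPIF_NOFAILOVER is usable (a v6 test address); IPIF_UP|IPIF_NOFAILOVER|IPIF_DEPRECATED is not usable (a dedicated test address); anything without IPIF_UP is not usable.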
11633 */ 11634 if ((flags & IPIF_UP) && 11635 ((flags & (IPIF_DEPRECATED|IPIF_NOFAILOVER)) != 11636 (IPIF_DEPRECATED|IPIF_NOFAILOVER))) 11637 return (B_FALSE); 11638 } 11639 return (B_TRUE); 11640 } 11641 11642 /* 11643 * Set interface flags. 11644 * Need to do special action for IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, 11645 * IPIF_NOLOCAL, ILLF_NONUD, ILLF_NOARP, IPIF_PRIVATE, IPIF_ANYCAST, 11646 * IPIF_PREFERRED, PHYI_STANDBY, PHYI_FAILED and PHYI_OFFLINE. 11647 * 11648 * NOTE: We really don't enforce that ipif_id zero should be used 11649 * for setting any flags other than IFF_LOGINT_FLAGS. This 11650 * is because applications generally do a SIOCGLIFFLAGS, 11651 * OR in the new flags (those that affect the logical) and do a 11652 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 11653 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 11654 * flags that will be turned on - is correct with respect to 11655 * ipif_id 0. For backward compatibility reasons, this is not done. 11656 */ 11657 /* ARGSUSED */ 11658 int 11659 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11660 ip_ioctl_cmd_t *ipip, void *if_req) 11661 { 11662 uint64_t turn_on; 11663 uint64_t turn_off; 11664 int err; 11665 boolean_t need_up = B_FALSE; 11666 phyint_t *phyi; 11667 ill_t *ill; 11668 uint64_t intf_flags; 11669 boolean_t phyint_flags_modified = B_FALSE; 11670 uint64_t flags; 11671 struct ifreq *ifr; 11672 struct lifreq *lifr; 11673 boolean_t set_linklocal = B_FALSE; 11674 boolean_t zero_source = B_FALSE; 11675 ip_stack_t *ipst; 11676 11677 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 11678 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11679 11680 ASSERT(IAM_WRITER_IPIF(ipif)); 11681 11682 ill = ipif->ipif_ill; 11683 phyi = ill->ill_phyint; 11684 ipst = ill->ill_ipst; 11685 11686 if (ipip->ipi_cmd_type == IF_CMD) { 11687 ifr = (struct ifreq *)if_req; 11688 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 11689 } else { 11690 lifr = (struct lifreq *)if_req; 11691 flags = lifr->lifr_flags; 11692 } 11693 11694 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11695 11696 /* 11697 * Have the flags been set correctly till now? 11698 */ 11699 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11700 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11701 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11702 /* 11703 * Compare the new flags to the old, and partition 11704 * into those coming on and those going off. 11705 * For the 16 bit command keep the bits above bit 16 unchanged. 11706 */ 11707 if (ipip->ipi_cmd == SIOCSIFFLAGS) 11708 flags |= intf_flags & ~0xFFFF; 11709 11710 /* 11711 * First check which bits will change and then which will 11712 * go on and off. 11713 */ 11714 turn_on = (flags ^ intf_flags) & ~IFF_CANTCHANGE; 11715 if (!turn_on) 11716 return (0); /* No change */ 11717 11718 turn_off = intf_flags & turn_on; 11719 turn_on ^= turn_off; 11720 err = 0; 11721 11722 /* 11723 * Don't allow any bits belonging to the logical interface 11724 * to be set or cleared on the replacement ipif that was 11725 * created temporarily during a MOVE. 11726 */ 11727 if (ipif->ipif_replace_zero && 11728 ((turn_on|turn_off) & IFF_LOGINT_FLAGS) != 0) { 11729 return (EINVAL); 11730 } 11731 11732 /* 11733 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 11734 * IPv6 interfaces.
11735 */ 11736 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 11737 return (EINVAL); 11738 11739 /* 11740 * Cannot turn off IFF_NOXMIT on VNI interfaces. 11741 */ 11742 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 11743 return (EINVAL); 11744 11745 /* 11746 * Don't allow the IFF_ROUTER flag to be turned on on loopback 11747 * interfaces. It makes no sense in that context. 11748 */ 11749 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11750 return (EINVAL); 11751 11752 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11753 zero_source = B_TRUE; 11754 11755 /* 11756 * For IPv6 ipif_id 0, don't allow the interface to be up without 11757 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11758 * If the link local address isn't set, and can be set, it will get 11759 * set later on in this function. 11760 */ 11761 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11762 (flags & IFF_UP) && !zero_source && 11763 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11764 if (ipif_cant_setlinklocal(ipif)) 11765 return (EINVAL); 11766 set_linklocal = B_TRUE; 11767 } 11768 11769 /* 11770 * An ILL cannot be part of a usesrc group and an IPMP group at the 11771 * same time. No need to grab ill_g_usesrc_lock here, see 11772 * synchronization notes in ip.c 11773 */ 11774 if (turn_on & PHYI_STANDBY && 11775 ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 11776 return (EINVAL); 11777 } 11778 11779 /* 11780 * If we modify physical interface flags, we'll potentially need to 11781 * send up two routing socket messages for the changes (one for the 11782 * IPv4 ill, and another for the IPv6 ill). Note that here. 11783 */ 11784 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11785 phyint_flags_modified = B_TRUE; 11786 11787 /* 11788 * If we are setting or clearing FAILED or STANDBY or OFFLINE, 11789 * we need to flush the IRE_CACHES belonging to this ill. 11790 * We handle this case here without doing the DOWN/UP dance 11791 * like it is done for other flags. If some other flags are 11792 * being turned on/off with FAILED/STANDBY/OFFLINE, the code 11793 * below will handle it by bringing it down and then 11794 * bringing it UP. 11795 */ 11796 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) { 11797 ill_t *ill_v4, *ill_v6; 11798 11799 ill_v4 = phyi->phyint_illv4; 11800 ill_v6 = phyi->phyint_illv6; 11801 11802 /* 11803 * First set the INACTIVE flag if needed. Then delete the ires. 11804 * ire_add will atomically prevent creating new IRE_CACHEs 11805 * unless hidden flag is set. 11806 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive. 11807 */ 11808 if ((turn_on & PHYI_FAILED) && 11809 ((intf_flags & PHYI_STANDBY) || 11810 !ipst->ips_ipmp_enable_failback)) { 11811 /* Reset PHYI_INACTIVE when PHYI_FAILED is being set */ 11812 phyi->phyint_flags &= ~PHYI_INACTIVE; 11813 } 11814 if ((turn_off & PHYI_FAILED) && 11815 ((intf_flags & PHYI_STANDBY) || 11816 (!ipst->ips_ipmp_enable_failback && 11817 ill_is_inactive(ill)))) { 11818 phyint_inactive(phyi); 11819 } 11820 11821 if (turn_on & PHYI_STANDBY) { 11822 /* 11823 * We implicitly set INACTIVE only when STANDBY is set. 11824 * INACTIVE is also set on a non-STANDBY phyint when the user 11825 * disables FAILBACK via the configuration file.
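 * (FAILBACK is typically disabled with a FAILBACK=no line in /etc/default/mpathd; see in.mpathd(1M).)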
11826 * Do not allow STANDBY to be set on such an INACTIVE 11827 * phyint. 11828 */ 11829 if (phyi->phyint_flags & PHYI_INACTIVE) 11830 return (EINVAL); 11831 if (!(phyi->phyint_flags & PHYI_FAILED)) 11832 phyint_inactive(phyi); 11833 } 11834 if (turn_off & PHYI_STANDBY) { 11835 if (ipst->ips_ipmp_enable_failback) { 11836 /* 11837 * Reset PHYI_INACTIVE. 11838 */ 11839 phyi->phyint_flags &= ~PHYI_INACTIVE; 11840 } else if (ill_is_inactive(ill) && 11841 !(phyi->phyint_flags & PHYI_FAILED)) { 11842 /* 11843 * Need to set INACTIVE when the user sets 11844 * STANDBY on a non-STANDBY phyint and 11845 * later resets STANDBY. 11846 */ 11847 phyint_inactive(phyi); 11848 } 11849 } 11850 /* 11851 * We should always send up a message so that the 11852 * daemons learn of it. Note that the zeroth 11853 * interface can be down and the check below for IPIF_UP 11854 * will not make sense as we are actually setting 11855 * a phyint flag here. We assume that the ipif used 11856 * is always the zeroth ipif. (ip_rts_ifmsg does not 11857 * send up any message for non-zero ipifs). 11858 */ 11859 phyint_flags_modified = B_TRUE; 11860 11861 if (ill_v4 != NULL) { 11862 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11863 IRE_CACHE, ill_stq_cache_delete, 11864 (char *)ill_v4, ill_v4); 11865 illgrp_reset_schednext(ill_v4); 11866 } 11867 if (ill_v6 != NULL) { 11868 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 11869 IRE_CACHE, ill_stq_cache_delete, 11870 (char *)ill_v6, ill_v6); 11871 illgrp_reset_schednext(ill_v6); 11872 } 11873 } 11874 11875 /* 11876 * If ILLF_ROUTER changes, we need to change the ip forwarding 11877 * status of the interface and, if the interface is part of an IPMP 11878 * group, all other interfaces that are part of the same IPMP 11879 * group. 11880 */ 11881 if ((turn_on | turn_off) & ILLF_ROUTER) 11882 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 11883 11884 /* 11885 * If the interface is not UP and we are not going to 11886 * bring it UP, record the flags and return. When the 11887 * interface comes UP later, the right actions will be 11888 * taken. 11889 */ 11890 if (!(ipif->ipif_flags & IPIF_UP) && 11891 !(turn_on & IPIF_UP)) { 11892 /* Record new flags in their respective places. */ 11893 mutex_enter(&ill->ill_lock); 11894 mutex_enter(&ill->ill_phyint->phyint_lock); 11895 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11896 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11897 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11898 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11899 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11900 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11901 mutex_exit(&ill->ill_lock); 11902 mutex_exit(&ill->ill_phyint->phyint_lock); 11903 11904 /* 11905 * We do the broadcast and nomination here rather 11906 * than waiting for a FAILOVER/FAILBACK to happen. In 11907 * the case of FAILBACK from INACTIVE standby to the 11908 * interface that has been repaired, PHYI_FAILED has not 11909 * been cleared yet. If there are only two interfaces in 11910 * that group, all we have is a FAILED and INACTIVE 11911 * interface. If we do the nomination soon after a failback, 11912 * the broadcast nomination code would select the 11913 * INACTIVE interface for receiving broadcasts as FAILED is 11914 * not yet cleared. As we don't want STANDBY/INACTIVE to 11915 * receive broadcast packets, we need to redo nomination 11916 * when the FAILED is cleared here.
Thus, in general we 11917 * always do the nomination here for FAILED, STANDBY 11918 * and OFFLINE. 11919 */ 11920 if (((turn_on | turn_off) & 11921 (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) { 11922 ip_redo_nomination(phyi); 11923 } 11924 if (phyint_flags_modified) { 11925 if (phyi->phyint_illv4 != NULL) { 11926 ip_rts_ifmsg(phyi->phyint_illv4-> 11927 ill_ipif); 11928 } 11929 if (phyi->phyint_illv6 != NULL) { 11930 ip_rts_ifmsg(phyi->phyint_illv6-> 11931 ill_ipif); 11932 } 11933 } 11934 return (0); 11935 } else if (set_linklocal || zero_source) { 11936 mutex_enter(&ill->ill_lock); 11937 if (set_linklocal) 11938 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 11939 if (zero_source) 11940 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 11941 mutex_exit(&ill->ill_lock); 11942 } 11943 11944 /* 11945 * Disallow IPv6 interfaces coming up that have the unspecified address, 11946 * or point-to-point interfaces with an unspecified destination. We do 11947 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 11948 * have a subnet assigned, which is how in.ndpd currently manages its 11949 * onlink prefix list when no addresses are configured with those 11950 * prefixes. 11951 */ 11952 if (ipif->ipif_isv6 && 11953 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 11954 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 11955 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 11956 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11957 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 11958 return (EINVAL); 11959 } 11960 11961 /* 11962 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 11963 * from being brought up. 11964 */ 11965 if (!ipif->ipif_isv6 && 11966 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11967 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 11968 return (EINVAL); 11969 } 11970 11971 /* 11972 * The only flag changes that we currently take specific action on 11973 * are IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, 11974 * ILLF_NOARP, ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, and 11975 * IPIF_PREFERRED. This is done by bringing the ipif down, changing 11976 * the flags and bringing it back up again. 11977 */ 11978 if ((turn_on|turn_off) & 11979 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 11980 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED)) { 11981 /* 11982 * Since we are taking this ipif down, make sure we have 11983 * valid net and subnet bcast ire's for other 11984 * logical interfaces, if we need them.
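 * (Roughly speaking: if this ipif currently owns the net and subnet broadcast IREs and another logical interface on the same subnet is still up, ipif_check_bcast_ires() arranges for those IREs to be recreated on the surviving ipif instead of disappearing with this one.)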
11985 */ 11986 if (!ipif->ipif_isv6) 11987 ipif_check_bcast_ires(ipif); 11988 11989 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11990 !(turn_off & IPIF_UP)) { 11991 need_up = B_TRUE; 11992 if (ipif->ipif_flags & IPIF_UP) 11993 ill->ill_logical_down = 1; 11994 turn_on &= ~IPIF_UP; 11995 } 11996 err = ipif_down(ipif, q, mp); 11997 ip1dbg(("ipif_down returns %d err ", err)); 11998 if (err == EINPROGRESS) 11999 return (err); 12000 ipif_down_tail(ipif); 12001 } 12002 return (ip_sioctl_flags_tail(ipif, flags, q, mp, need_up)); 12003 } 12004 12005 static int 12006 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp, 12007 boolean_t need_up) 12008 { 12009 ill_t *ill; 12010 phyint_t *phyi; 12011 uint64_t turn_on; 12012 uint64_t turn_off; 12013 uint64_t intf_flags; 12014 boolean_t phyint_flags_modified = B_FALSE; 12015 int err = 0; 12016 boolean_t set_linklocal = B_FALSE; 12017 boolean_t zero_source = B_FALSE; 12018 12019 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 12020 ipif->ipif_ill->ill_name, ipif->ipif_id)); 12021 12022 ASSERT(IAM_WRITER_IPIF(ipif)); 12023 12024 ill = ipif->ipif_ill; 12025 phyi = ill->ill_phyint; 12026 12027 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 12028 turn_on = (flags ^ intf_flags) & ~(IFF_CANTCHANGE | IFF_UP); 12029 12030 turn_off = intf_flags & turn_on; 12031 turn_on ^= turn_off; 12032 12033 if ((turn_on|turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE)) 12034 phyint_flags_modified = B_TRUE; 12035 12036 /* 12037 * Now we change the flags. Track current value of 12038 * other flags in their respective places. 12039 */ 12040 mutex_enter(&ill->ill_lock); 12041 mutex_enter(&phyi->phyint_lock); 12042 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 12043 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 12044 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 12045 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 12046 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 12047 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 12048 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 12049 set_linklocal = B_TRUE; 12050 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 12051 } 12052 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 12053 zero_source = B_TRUE; 12054 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 12055 } 12056 mutex_exit(&ill->ill_lock); 12057 mutex_exit(&phyi->phyint_lock); 12058 12059 if (((turn_on | turn_off) & (PHYI_FAILED|PHYI_STANDBY|PHYI_OFFLINE))) 12060 ip_redo_nomination(phyi); 12061 12062 if (set_linklocal) 12063 (void) ipif_setlinklocal(ipif); 12064 12065 if (zero_source) 12066 ipif->ipif_v6src_addr = ipv6_all_zeros; 12067 else 12068 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 12069 12070 if (need_up) { 12071 /* 12072 * XXX ipif_up really does not know whether the phyint flags 12073 * were modified or not, so it sends up information in 12074 * only one routing socket message. As we don't bring up 12075 * the interface and also set STANDBY/FAILED simultaneously, 12076 * it should be okay. 12077 */ 12078 err = ipif_up(ipif, q, mp); 12079 } else { 12080 /* 12081 * Make sure routing socket sees all changes to the flags. 12082 * ipif_up_done* handles this when we use ipif_up.
12083 */ 12084 if (phyint_flags_modified) { 12085 if (phyi->phyint_illv4 != NULL) { 12086 ip_rts_ifmsg(phyi->phyint_illv4-> 12087 ill_ipif); 12088 } 12089 if (phyi->phyint_illv6 != NULL) { 12090 ip_rts_ifmsg(phyi->phyint_illv6-> 12091 ill_ipif); 12092 } 12093 } else { 12094 ip_rts_ifmsg(ipif); 12095 } 12096 /* 12097 * Update the flags in SCTP's IPIF list; ipif_up() will do 12098 * this in the need_up case. 12099 */ 12100 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12101 } 12102 return (err); 12103 } 12104 12105 /* 12106 * Restart entry point to restart the flags set operation after the 12107 * refcounts have dropped to zero. 12108 */ 12109 /* ARGSUSED */ 12110 int 12111 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12112 ip_ioctl_cmd_t *ipip, void *if_req) 12113 { 12114 int err; 12115 struct ifreq *ifr = (struct ifreq *)if_req; 12116 struct lifreq *lifr = (struct lifreq *)if_req; 12117 12118 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 12119 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12120 12121 ipif_down_tail(ipif); 12122 if (ipip->ipi_cmd_type == IF_CMD) { 12123 /* 12124 * Since ip_sioctl_flags expects an int and ifr_flags 12125 * is a short, we need to cast ifr_flags into an int 12126 * to avoid having sign extension cause bits to get 12127 * set that should not be. 12128 */ 12129 err = ip_sioctl_flags_tail(ipif, 12130 (uint64_t)(ifr->ifr_flags & 0x0000ffff), 12131 q, mp, B_TRUE); 12132 } else { 12133 err = ip_sioctl_flags_tail(ipif, lifr->lifr_flags, 12134 q, mp, B_TRUE); 12135 } 12136 return (err); 12137 } 12138 12139 /* 12140 * Can operate on either a module or a driver queue. 12141 */ 12142 /* ARGSUSED */ 12143 int 12144 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12145 ip_ioctl_cmd_t *ipip, void *if_req) 12146 { 12147 /* 12148 * Have the flags been set correctly till now? 12149 */ 12150 ill_t *ill = ipif->ipif_ill; 12151 phyint_t *phyi = ill->ill_phyint; 12152 12153 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 12154 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12155 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 12156 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 12157 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 12158 12159 /* 12160 * Need a lock since some flags can be set even when there are 12161 * references to the ipif. 12162 */ 12163 mutex_enter(&ill->ill_lock); 12164 if (ipip->ipi_cmd_type == IF_CMD) { 12165 struct ifreq *ifr = (struct ifreq *)if_req; 12166 12167 /* Get interface flags (low 16 only). */ 12168 ifr->ifr_flags = ((ipif->ipif_flags | 12169 ill->ill_flags | phyi->phyint_flags) & 0xffff); 12170 } else { 12171 struct lifreq *lifr = (struct lifreq *)if_req; 12172 12173 /* Get interface flags.
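 * (lifr_flags is 64 bits wide, so the full flag set is returned here, unlike the 16-bit ifr_flags case above.)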
*/ 12174 lifr->lifr_flags = ipif->ipif_flags | 12175 ill->ill_flags | phyi->phyint_flags; 12176 } 12177 mutex_exit(&ill->ill_lock); 12178 return (0); 12179 } 12180 12181 /* ARGSUSED */ 12182 int 12183 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12184 ip_ioctl_cmd_t *ipip, void *if_req) 12185 { 12186 int mtu; 12187 int ip_min_mtu; 12188 struct ifreq *ifr; 12189 struct lifreq *lifr; 12190 ire_t *ire; 12191 ip_stack_t *ipst; 12192 12193 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 12194 ipif->ipif_id, (void *)ipif)); 12195 if (ipip->ipi_cmd_type == IF_CMD) { 12196 ifr = (struct ifreq *)if_req; 12197 mtu = ifr->ifr_metric; 12198 } else { 12199 lifr = (struct lifreq *)if_req; 12200 mtu = lifr->lifr_mtu; 12201 } 12202 12203 if (ipif->ipif_isv6) 12204 ip_min_mtu = IPV6_MIN_MTU; 12205 else 12206 ip_min_mtu = IP_MIN_MTU; 12207 12208 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 12209 return (EINVAL); 12210 12211 /* 12212 * Change the MTU size in all relevant ire's. 12213 * Mtu change Vs. new ire creation - protocol below. 12214 * First change ipif_mtu and the ire_max_frag of the 12215 * interface ire. Then do an ire walk and change the 12216 * ire_max_frag of all affected ires. During ire_add 12217 * under the bucket lock, set the ire_max_frag of the 12218 * new ire being created from the ipif/ire from which 12219 * it is being derived. If an mtu change happens after 12220 * the ire is added, the new ire will be cleaned up. 12221 * Conversely if the mtu change happens before the ire 12222 * is added, ire_add will see the new value of the mtu. 12223 */ 12224 ipif->ipif_mtu = mtu; 12225 ipif->ipif_flags |= IPIF_FIXEDMTU; 12226 12227 if (ipif->ipif_isv6) 12228 ire = ipif_to_ire_v6(ipif); 12229 else 12230 ire = ipif_to_ire(ipif); 12231 if (ire != NULL) { 12232 ire->ire_max_frag = ipif->ipif_mtu; 12233 ire_refrele(ire); 12234 } 12235 ipst = ipif->ipif_ill->ill_ipst; 12236 if (ipif->ipif_flags & IPIF_UP) { 12237 if (ipif->ipif_isv6) 12238 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES, 12239 ipst); 12240 else 12241 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES, 12242 ipst); 12243 } 12244 /* Update the MTU in SCTP's list */ 12245 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 12246 return (0); 12247 } 12248 12249 /* Get interface MTU. */ 12250 /* ARGSUSED */ 12251 int 12252 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12253 ip_ioctl_cmd_t *ipip, void *if_req) 12254 { 12255 struct ifreq *ifr; 12256 struct lifreq *lifr; 12257 12258 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 12259 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12260 if (ipip->ipi_cmd_type == IF_CMD) { 12261 ifr = (struct ifreq *)if_req; 12262 ifr->ifr_metric = ipif->ipif_mtu; 12263 } else { 12264 lifr = (struct lifreq *)if_req; 12265 lifr->lifr_mtu = ipif->ipif_mtu; 12266 } 12267 return (0); 12268 } 12269 12270 /* Set interface broadcast address. 
*/ 12271 /* ARGSUSED2 */ 12272 int 12273 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12274 ip_ioctl_cmd_t *ipip, void *if_req) 12275 { 12276 ipaddr_t addr; 12277 ire_t *ire; 12278 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12279 12280 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, 12281 ipif->ipif_id)); 12282 12283 ASSERT(IAM_WRITER_IPIF(ipif)); 12284 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12285 return (EADDRNOTAVAIL); 12286 12287 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 12288 12289 if (sin->sin_family != AF_INET) 12290 return (EAFNOSUPPORT); 12291 12292 addr = sin->sin_addr.s_addr; 12293 if (ipif->ipif_flags & IPIF_UP) { 12294 /* 12295 * If we are already up, make sure the new 12296 * broadcast address makes sense. If it does, 12297 * there should be an IRE for it already. 12298 * Don't match on ipif, only on the ill 12299 * since we are sharing these now. Don't use 12300 * MATCH_IRE_ILL_GROUP as we are looking for 12301 * the broadcast ire on this ill and each ill 12302 * in the group has its own broadcast ire. 12303 */ 12304 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, 12305 ipif, ALL_ZONES, NULL, 12306 (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst); 12307 if (ire == NULL) { 12308 return (EINVAL); 12309 } else { 12310 ire_refrele(ire); 12311 } 12312 } 12313 /* 12314 * Changing the broadcast addr for this ipif. 12315 * Make sure we have valid net and subnet bcast 12316 * ire's for other logical interfaces, if needed. 12317 */ 12318 if (addr != ipif->ipif_brd_addr) 12319 ipif_check_bcast_ires(ipif); 12320 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 12321 return (0); 12322 } 12323 12324 /* Get interface broadcast address. */ 12325 /* ARGSUSED */ 12326 int 12327 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12328 ip_ioctl_cmd_t *ipip, void *if_req) 12329 { 12330 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 12331 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12332 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12333 return (EADDRNOTAVAIL); 12334 12335 /* IPIF_BROADCAST not possible with IPv6 */ 12336 ASSERT(!ipif->ipif_isv6); 12337 *sin = sin_null; 12338 sin->sin_family = AF_INET; 12339 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 12340 return (0); 12341 } 12342 12343 /* 12344 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 12345 */ 12346 /* ARGSUSED */ 12347 int 12348 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12349 ip_ioctl_cmd_t *ipip, void *if_req) 12350 { 12351 int err = 0; 12352 in6_addr_t v6mask; 12353 12354 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 12355 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12356 12357 ASSERT(IAM_WRITER_IPIF(ipif)); 12358 12359 if (ipif->ipif_isv6) { 12360 sin6_t *sin6; 12361 12362 if (sin->sin_family != AF_INET6) 12363 return (EAFNOSUPPORT); 12364 12365 sin6 = (sin6_t *)sin; 12366 v6mask = sin6->sin6_addr; 12367 } else { 12368 ipaddr_t mask; 12369 12370 if (sin->sin_family != AF_INET) 12371 return (EAFNOSUPPORT); 12372 12373 mask = sin->sin_addr.s_addr; 12374 V4MASK_TO_V6(mask, v6mask); 12375 } 12376 12377 /* 12378 * No big deal if the interface isn't already up, or the mask 12379 * isn't really changing, or this is pt-pt. 
12380 */ 12381 if (!(ipif->ipif_flags & IPIF_UP) || 12382 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12383 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12384 ipif->ipif_v6net_mask = v6mask; 12385 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12386 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12387 ipif->ipif_v6net_mask, 12388 ipif->ipif_v6subnet); 12389 } 12390 return (0); 12391 } 12392 /* 12393 * Make sure we have valid net and subnet broadcast ire's 12394 * for the old netmask, if needed by other logical interfaces. 12395 */ 12396 if (!ipif->ipif_isv6) 12397 ipif_check_bcast_ires(ipif); 12398 12399 err = ipif_logical_down(ipif, q, mp); 12400 if (err == EINPROGRESS) 12401 return (err); 12402 ipif_down_tail(ipif); 12403 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12404 return (err); 12405 } 12406 12407 static int 12408 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12409 { 12410 in6_addr_t v6mask; 12411 int err = 0; 12412 12413 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12414 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12415 12416 if (ipif->ipif_isv6) { 12417 sin6_t *sin6; 12418 12419 sin6 = (sin6_t *)sin; 12420 v6mask = sin6->sin6_addr; 12421 } else { 12422 ipaddr_t mask; 12423 12424 mask = sin->sin_addr.s_addr; 12425 V4MASK_TO_V6(mask, v6mask); 12426 } 12427 12428 ipif->ipif_v6net_mask = v6mask; 12429 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12430 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12431 ipif->ipif_v6subnet); 12432 } 12433 err = ipif_up(ipif, q, mp); 12434 12435 if (err == 0 || err == EINPROGRESS) { 12436 /* 12437 * The interface must be DL_BOUND if this packet has to 12438 * go out on the wire. Since we only go through a logical 12439 * down and are bound with the driver during an internal 12440 * down/up that is satisfied. 12441 */ 12442 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12443 /* Potentially broadcast an address mask reply. */ 12444 ipif_mask_reply(ipif); 12445 } 12446 } 12447 return (err); 12448 } 12449 12450 /* ARGSUSED */ 12451 int 12452 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12453 ip_ioctl_cmd_t *ipip, void *if_req) 12454 { 12455 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12456 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12457 ipif_down_tail(ipif); 12458 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12459 } 12460 12461 /* Get interface net mask. */ 12462 /* ARGSUSED */ 12463 int 12464 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12465 ip_ioctl_cmd_t *ipip, void *if_req) 12466 { 12467 struct lifreq *lifr = (struct lifreq *)if_req; 12468 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12469 12470 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12471 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12472 12473 /* 12474 * net mask can't change since we have a reference to the ipif. 
12475 */ 12476 if (ipif->ipif_isv6) { 12477 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12478 *sin6 = sin6_null; 12479 sin6->sin6_family = AF_INET6; 12480 sin6->sin6_addr = ipif->ipif_v6net_mask; 12481 lifr->lifr_addrlen = 12482 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12483 } else { 12484 *sin = sin_null; 12485 sin->sin_family = AF_INET; 12486 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12487 if (ipip->ipi_cmd_type == LIF_CMD) { 12488 lifr->lifr_addrlen = 12489 ip_mask_to_plen(ipif->ipif_net_mask); 12490 } 12491 } 12492 return (0); 12493 } 12494 12495 /* ARGSUSED */ 12496 int 12497 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12498 ip_ioctl_cmd_t *ipip, void *if_req) 12499 { 12500 12501 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12502 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12503 /* 12504 * Set interface metric. We don't use this for 12505 * anything but we keep track of it in case it is 12506 * important to routing applications or such. 12507 */ 12508 if (ipip->ipi_cmd_type == IF_CMD) { 12509 struct ifreq *ifr; 12510 12511 ifr = (struct ifreq *)if_req; 12512 ipif->ipif_metric = ifr->ifr_metric; 12513 } else { 12514 struct lifreq *lifr; 12515 12516 lifr = (struct lifreq *)if_req; 12517 ipif->ipif_metric = lifr->lifr_metric; 12518 } 12519 return (0); 12520 } 12521 12522 12523 /* ARGSUSED */ 12524 int 12525 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12526 ip_ioctl_cmd_t *ipip, void *if_req) 12527 { 12528 12529 /* Get interface metric. */ 12530 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12531 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12532 if (ipip->ipi_cmd_type == IF_CMD) { 12533 struct ifreq *ifr; 12534 12535 ifr = (struct ifreq *)if_req; 12536 ifr->ifr_metric = ipif->ipif_metric; 12537 } else { 12538 struct lifreq *lifr; 12539 12540 lifr = (struct lifreq *)if_req; 12541 lifr->lifr_metric = ipif->ipif_metric; 12542 } 12543 12544 return (0); 12545 } 12546 12547 /* ARGSUSED */ 12548 int 12549 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12550 ip_ioctl_cmd_t *ipip, void *if_req) 12551 { 12552 12553 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12554 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12555 /* 12556 * Set the muxid returned from I_PLINK. 12557 */ 12558 if (ipip->ipi_cmd_type == IF_CMD) { 12559 struct ifreq *ifr = (struct ifreq *)if_req; 12560 12561 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12562 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12563 } else { 12564 struct lifreq *lifr = (struct lifreq *)if_req; 12565 12566 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12567 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12568 } 12569 return (0); 12570 } 12571 12572 /* ARGSUSED */ 12573 int 12574 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12575 ip_ioctl_cmd_t *ipip, void *if_req) 12576 { 12577 12578 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12579 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12580 /* 12581 * Get the muxid saved in ill for I_PUNLINK. 
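 * (The muxids were saved by ip_sioctl_muxid() above at plumb time; the unplumbing process, typically ifconfig, reads them back here so it knows which lower streams to I_PUNLINK.)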
12582 */ 12583 if (ipip->ipi_cmd_type == IF_CMD) { 12584 struct ifreq *ifr = (struct ifreq *)if_req; 12585 12586 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12587 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12588 } else { 12589 struct lifreq *lifr = (struct lifreq *)if_req; 12590 12591 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12592 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12593 } 12594 return (0); 12595 } 12596 12597 /* 12598 * Set the subnet prefix. Does not modify the broadcast address. 12599 */ 12600 /* ARGSUSED */ 12601 int 12602 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12603 ip_ioctl_cmd_t *ipip, void *if_req) 12604 { 12605 int err = 0; 12606 in6_addr_t v6addr; 12607 in6_addr_t v6mask; 12608 boolean_t need_up = B_FALSE; 12609 int addrlen; 12610 12611 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12612 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12613 12614 ASSERT(IAM_WRITER_IPIF(ipif)); 12615 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12616 12617 if (ipif->ipif_isv6) { 12618 sin6_t *sin6; 12619 12620 if (sin->sin_family != AF_INET6) 12621 return (EAFNOSUPPORT); 12622 12623 sin6 = (sin6_t *)sin; 12624 v6addr = sin6->sin6_addr; 12625 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12626 return (EADDRNOTAVAIL); 12627 } else { 12628 ipaddr_t addr; 12629 12630 if (sin->sin_family != AF_INET) 12631 return (EAFNOSUPPORT); 12632 12633 addr = sin->sin_addr.s_addr; 12634 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12635 return (EADDRNOTAVAIL); 12636 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12637 /* Add 96 bits */ 12638 addrlen += IPV6_ABITS - IP_ABITS; 12639 } 12640 12641 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12642 return (EINVAL); 12643 12644 /* Check if bits in the address are set past the mask */ 12645 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12646 return (EINVAL); 12647 12648 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12649 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12650 return (0); /* No change */ 12651 12652 if (ipif->ipif_flags & IPIF_UP) { 12653 /* 12654 * If the interface is already marked up, 12655 * we call ipif_down which will take care 12656 * of ditching any IREs that have been set 12657 * up based on the old interface address. 12658 */ 12659 err = ipif_logical_down(ipif, q, mp); 12660 if (err == EINPROGRESS) 12661 return (err); 12662 ipif_down_tail(ipif); 12663 need_up = B_TRUE; 12664 } 12665 12666 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12667 return (err); 12668 } 12669 12670 static int 12671 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12672 queue_t *q, mblk_t *mp, boolean_t need_up) 12673 { 12674 ill_t *ill = ipif->ipif_ill; 12675 int err = 0; 12676 12677 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12678 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12679 12680 /* Set the new address. */ 12681 mutex_enter(&ill->ill_lock); 12682 ipif->ipif_v6net_mask = v6mask; 12683 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12684 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12685 ipif->ipif_v6subnet); 12686 } 12687 mutex_exit(&ill->ill_lock); 12688 12689 if (need_up) { 12690 /* 12691 * Now bring the interface back up. If this 12692 * is the only IPIF for the ILL, ipif_up 12693 * will have to re-bind to the device, so 12694 * we may get back EINPROGRESS, in which 12695 * case, this IOCTL will get completed in 12696 * ip_rput_dlpi when we see the DL_BIND_ACK.
12697 */ 12698 err = ipif_up(ipif, q, mp); 12699 if (err == EINPROGRESS) 12700 return (err); 12701 } 12702 return (err); 12703 } 12704 12705 /* ARGSUSED */ 12706 int 12707 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12708 ip_ioctl_cmd_t *ipip, void *if_req) 12709 { 12710 int addrlen; 12711 in6_addr_t v6addr; 12712 in6_addr_t v6mask; 12713 struct lifreq *lifr = (struct lifreq *)if_req; 12714 12715 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12716 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12717 ipif_down_tail(ipif); 12718 12719 addrlen = lifr->lifr_addrlen; 12720 if (ipif->ipif_isv6) { 12721 sin6_t *sin6; 12722 12723 sin6 = (sin6_t *)sin; 12724 v6addr = sin6->sin6_addr; 12725 } else { 12726 ipaddr_t addr; 12727 12728 addr = sin->sin_addr.s_addr; 12729 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12730 addrlen += IPV6_ABITS - IP_ABITS; 12731 } 12732 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12733 12734 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12735 } 12736 12737 /* ARGSUSED */ 12738 int 12739 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12740 ip_ioctl_cmd_t *ipip, void *if_req) 12741 { 12742 struct lifreq *lifr = (struct lifreq *)if_req; 12743 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12744 12745 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12746 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12747 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12748 12749 if (ipif->ipif_isv6) { 12750 *sin6 = sin6_null; 12751 sin6->sin6_family = AF_INET6; 12752 sin6->sin6_addr = ipif->ipif_v6subnet; 12753 lifr->lifr_addrlen = 12754 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12755 } else { 12756 *sin = sin_null; 12757 sin->sin_family = AF_INET; 12758 sin->sin_addr.s_addr = ipif->ipif_subnet; 12759 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12760 } 12761 return (0); 12762 } 12763 12764 /* 12765 * Set the IPv6 address token. 12766 */ 12767 /* ARGSUSED */ 12768 int 12769 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12770 ip_ioctl_cmd_t *ipi, void *if_req) 12771 { 12772 ill_t *ill = ipif->ipif_ill; 12773 int err; 12774 in6_addr_t v6addr; 12775 in6_addr_t v6mask; 12776 boolean_t need_up = B_FALSE; 12777 int i; 12778 sin6_t *sin6 = (sin6_t *)sin; 12779 struct lifreq *lifr = (struct lifreq *)if_req; 12780 int addrlen; 12781 12782 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12783 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12784 ASSERT(IAM_WRITER_IPIF(ipif)); 12785 12786 addrlen = lifr->lifr_addrlen; 12787 /* Only allow for logical unit zero, i.e., not on "le0:17" */ 12788 if (ipif->ipif_id != 0) 12789 return (EINVAL); 12790 12791 if (!ipif->ipif_isv6) 12792 return (EINVAL); 12793 12794 if (addrlen > IPV6_ABITS) 12795 return (EINVAL); 12796 12797 v6addr = sin6->sin6_addr; 12798 12799 /* 12800 * The length of the token is the length from the end. To get 12801 * the proper mask for this, compute the mask of the bits not 12802 * in the token, i.e., the prefix, and then xor to get the mask.
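 * For example, with addrlen = 64 the prefix mask computed below is ffff:ffff:ffff:ffff:: and the xor that follows turns it into the token mask ::ffff:ffff:ffff:ffff, i.e., the token occupies the low 64 bits of the address.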
12803 */ 12804 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12805 return (EINVAL); 12806 for (i = 0; i < 4; i++) { 12807 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12808 } 12809 12810 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12811 ill->ill_token_length == addrlen) 12812 return (0); /* No change */ 12813 12814 if (ipif->ipif_flags & IPIF_UP) { 12815 err = ipif_logical_down(ipif, q, mp); 12816 if (err == EINPROGRESS) 12817 return (err); 12818 ipif_down_tail(ipif); 12819 need_up = B_TRUE; 12820 } 12821 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12822 return (err); 12823 } 12824 12825 static int 12826 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12827 mblk_t *mp, boolean_t need_up) 12828 { 12829 in6_addr_t v6addr; 12830 in6_addr_t v6mask; 12831 ill_t *ill = ipif->ipif_ill; 12832 int i; 12833 int err = 0; 12834 12835 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12836 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12837 v6addr = sin6->sin6_addr; 12838 /* 12839 * The length of the token is the length from the end. To get 12840 * the proper mask for this, compute the mask of the bits not 12841 * in the token, i.e., the prefix, and then xor to get the mask. 12842 */ 12843 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12844 for (i = 0; i < 4; i++) 12845 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12846 12847 mutex_enter(&ill->ill_lock); 12848 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12849 ill->ill_token_length = addrlen; 12850 mutex_exit(&ill->ill_lock); 12851 12852 if (need_up) { 12853 /* 12854 * Now bring the interface back up. If this 12855 * is the only IPIF for the ILL, ipif_up 12856 * will have to re-bind to the device, so 12857 * we may get back EINPROGRESS, in which 12858 * case, this IOCTL will get completed in 12859 * ip_rput_dlpi when we see the DL_BIND_ACK. 12860 */ 12861 err = ipif_up(ipif, q, mp); 12862 if (err == EINPROGRESS) 12863 return (err); 12864 } 12865 return (err); 12866 } 12867 12868 /* ARGSUSED */ 12869 int 12870 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12871 ip_ioctl_cmd_t *ipi, void *if_req) 12872 { 12873 ill_t *ill; 12874 sin6_t *sin6 = (sin6_t *)sin; 12875 struct lifreq *lifr = (struct lifreq *)if_req; 12876 12877 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 12878 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12879 if (ipif->ipif_id != 0) 12880 return (EINVAL); 12881 12882 ill = ipif->ipif_ill; 12883 if (!ill->ill_isv6) 12884 return (ENXIO); 12885 12886 *sin6 = sin6_null; 12887 sin6->sin6_family = AF_INET6; 12888 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 12889 sin6->sin6_addr = ill->ill_token; 12890 lifr->lifr_addrlen = ill->ill_token_length; 12891 return (0); 12892 } 12893 12894 /* 12895 * Set (hardware) link specific information that might override 12896 * what was acquired through the DL_INFO_ACK. 12897 * The logic is as follows. 12898 * 12899 * become exclusive 12900 * set CHANGING flag 12901 * change mtu on affected IREs 12902 * clear CHANGING flag 12903 * 12904 * An ire add that occurs before the CHANGING flag is set will have its mtu 12905 * changed by ip_sioctl_lnkinfo. 12906 * 12907 * During the time the CHANGING flag is set, no new ires will be added to the 12908 * bucket, and ire add will fail (due to the CHANGING flag). 12909 * 12910 * An ire add that occurs after the CHANGING flag is set will have the right mtu 12911 * before it is added to the bucket.
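 * In all three cases, therefore, no ire can remain in a bucket with a stale max_frag.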
12912 * 12913 * Obviously only 1 thread can set the CHANGING flag and we need to become 12914 * exclusive to set the flag. 12915 */ 12916 /* ARGSUSED */ 12917 int 12918 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12919 ip_ioctl_cmd_t *ipi, void *if_req) 12920 { 12921 ill_t *ill = ipif->ipif_ill; 12922 ipif_t *nipif; 12923 int ip_min_mtu; 12924 boolean_t mtu_walk = B_FALSE; 12925 struct lifreq *lifr = (struct lifreq *)if_req; 12926 lif_ifinfo_req_t *lir; 12927 ire_t *ire; 12928 12929 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 12930 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12931 lir = &lifr->lifr_ifinfo; 12932 ASSERT(IAM_WRITER_IPIF(ipif)); 12933 12934 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12935 if (ipif->ipif_id != 0) 12936 return (EINVAL); 12937 12938 /* Set interface MTU. */ 12939 if (ipif->ipif_isv6) 12940 ip_min_mtu = IPV6_MIN_MTU; 12941 else 12942 ip_min_mtu = IP_MIN_MTU; 12943 12944 /* 12945 * Verify values before we set anything. Allow zero to 12946 * mean unspecified. 12947 */ 12948 if (lir->lir_maxmtu != 0 && 12949 (lir->lir_maxmtu > ill->ill_max_frag || 12950 lir->lir_maxmtu < ip_min_mtu)) 12951 return (EINVAL); 12952 if (lir->lir_reachtime != 0 && 12953 lir->lir_reachtime > ND_MAX_REACHTIME) 12954 return (EINVAL); 12955 if (lir->lir_reachretrans != 0 && 12956 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12957 return (EINVAL); 12958 12959 mutex_enter(&ill->ill_lock); 12960 ill->ill_state_flags |= ILL_CHANGING; 12961 for (nipif = ill->ill_ipif; nipif != NULL; 12962 nipif = nipif->ipif_next) { 12963 nipif->ipif_state_flags |= IPIF_CHANGING; 12964 } 12965 12966 mutex_exit(&ill->ill_lock); 12967 12968 if (lir->lir_maxmtu != 0) { 12969 ill->ill_max_mtu = lir->lir_maxmtu; 12970 ill->ill_mtu_userspecified = 1; 12971 mtu_walk = B_TRUE; 12972 } 12973 12974 if (lir->lir_reachtime != 0) 12975 ill->ill_reachable_time = lir->lir_reachtime; 12976 12977 if (lir->lir_reachretrans != 0) 12978 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12979 12980 ill->ill_max_hops = lir->lir_maxhops; 12981 12982 ill->ill_max_buf = ND_MAX_Q; 12983 12984 if (mtu_walk) { 12985 /* 12986 * Set the MTU on all ipifs associated with this ill except 12987 * for those whose MTU was fixed via SIOCSLIFMTU. 
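 * (IPIF_FIXEDMTU is set by ip_sioctl_mtu() above when an explicit per-ipif MTU is configured; such ipifs keep their administratively set value.)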
12988 */ 12989 for (nipif = ill->ill_ipif; nipif != NULL; 12990 nipif = nipif->ipif_next) { 12991 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12992 continue; 12993 12994 nipif->ipif_mtu = ill->ill_max_mtu; 12995 12996 if (!(nipif->ipif_flags & IPIF_UP)) 12997 continue; 12998 12999 if (nipif->ipif_isv6) 13000 ire = ipif_to_ire_v6(nipif); 13001 else 13002 ire = ipif_to_ire(nipif); 13003 if (ire != NULL) { 13004 ire->ire_max_frag = nipif->ipif_mtu; /* per-ipif value set above */ 13005 ire_refrele(ire); 13006 } 13007 if (ill->ill_isv6) { 13008 ire_walk_ill_v6(MATCH_IRE_ILL, 0, 13009 ipif_mtu_change, (char *)nipif, 13010 ill); 13011 } else { 13012 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 13013 ipif_mtu_change, (char *)nipif, 13014 ill); 13015 } 13016 } 13017 } 13018 13019 mutex_enter(&ill->ill_lock); 13020 for (nipif = ill->ill_ipif; nipif != NULL; 13021 nipif = nipif->ipif_next) { 13022 nipif->ipif_state_flags &= ~IPIF_CHANGING; 13023 } 13024 ILL_UNMARK_CHANGING(ill); 13025 mutex_exit(&ill->ill_lock); 13026 13027 return (0); 13028 } 13029 13030 /* ARGSUSED */ 13031 int 13032 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13033 ip_ioctl_cmd_t *ipi, void *if_req) 13034 { 13035 struct lif_ifinfo_req *lir; 13036 ill_t *ill = ipif->ipif_ill; 13037 13038 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 13039 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 13040 if (ipif->ipif_id != 0) 13041 return (EINVAL); 13042 13043 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 13044 lir->lir_maxhops = ill->ill_max_hops; 13045 lir->lir_reachtime = ill->ill_reachable_time; 13046 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 13047 lir->lir_maxmtu = ill->ill_max_mtu; 13048 13049 return (0); 13050 } 13051 13052 /* 13053 * Return best guess as to the subnet mask for the specified address. 13054 * Based on the subnet masks for all the configured interfaces. 13055 * 13056 * We end up returning a zero mask in the case of default, multicast or 13057 * experimental addresses. 13058 */ 13059 static ipaddr_t 13060 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 13061 { 13062 ipaddr_t net_mask; 13063 ill_t *ill; 13064 ipif_t *ipif; 13065 ill_walk_context_t ctx; 13066 ipif_t *fallback_ipif = NULL; 13067 13068 net_mask = ip_net_mask(addr); 13069 if (net_mask == 0) { 13070 *ipifp = NULL; 13071 return (0); 13072 } 13073 13074 /* Let's check to see if this is maybe a local subnet route. */ 13075 /* This function only applies to IPv4 interfaces. */ 13076 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 13077 ill = ILL_START_WALK_V4(&ctx, ipst); 13078 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 13079 mutex_enter(&ill->ill_lock); 13080 for (ipif = ill->ill_ipif; ipif != NULL; 13081 ipif = ipif->ipif_next) { 13082 if (!IPIF_CAN_LOOKUP(ipif)) 13083 continue; 13084 if (!(ipif->ipif_flags & IPIF_UP)) 13085 continue; 13086 if ((ipif->ipif_subnet & net_mask) == 13087 (addr & net_mask)) { 13088 /* 13089 * Don't trust pt-pt interfaces if there are 13090 * other interfaces. 13091 */ 13092 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 13093 if (fallback_ipif == NULL) { 13094 ipif_refhold_locked(ipif); 13095 fallback_ipif = ipif; 13096 } 13097 continue; 13098 } 13099 13100 /* 13101 * Fine. Just assume the same net mask as the 13102 * directly attached subnet interface is using.
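 * For example, if addr is 10.1.2.3 and an UP interface is configured as 10.1.2.1 with netmask 255.255.255.0, that interface's 255.255.255.0 mask is returned rather than the classful 255.0.0.0 default computed by ip_net_mask().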
13103 */ 13104 ipif_refhold_locked(ipif); 13105 mutex_exit(&ill->ill_lock); 13106 rw_exit(&ipst->ips_ill_g_lock); 13107 if (fallback_ipif != NULL) 13108 ipif_refrele(fallback_ipif); 13109 *ipifp = ipif; 13110 return (ipif->ipif_net_mask); 13111 } 13112 } 13113 mutex_exit(&ill->ill_lock); 13114 } 13115 rw_exit(&ipst->ips_ill_g_lock); 13116 13117 *ipifp = fallback_ipif; 13118 return ((fallback_ipif != NULL) ? 13119 fallback_ipif->ipif_net_mask : net_mask); 13120 } 13121 13122 /* 13123 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 13124 */ 13125 static void 13126 ip_wput_ioctl(queue_t *q, mblk_t *mp) 13127 { 13128 IOCP iocp; 13129 ipft_t *ipft; 13130 ipllc_t *ipllc; 13131 mblk_t *mp1; 13132 cred_t *cr; 13133 int error = 0; 13134 conn_t *connp; 13135 13136 ip1dbg(("ip_wput_ioctl")); 13137 iocp = (IOCP)mp->b_rptr; 13138 mp1 = mp->b_cont; 13139 if (mp1 == NULL) { 13140 iocp->ioc_error = EINVAL; 13141 mp->b_datap->db_type = M_IOCNAK; 13142 iocp->ioc_count = 0; 13143 qreply(q, mp); 13144 return; 13145 } 13146 13147 /* 13148 * These IOCTLs provide various control capabilities to 13149 * upstream agents such as ULPs and processes. There 13150 * are currently two such IOCTLs implemented. They 13151 * are used by TCP to provide update information for 13152 * existing IREs and to forcibly delete an IRE for a 13153 * host that is not responding, thereby forcing an 13154 * attempt at a new route. 13155 */ 13156 iocp->ioc_error = EINVAL; 13157 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 13158 goto done; 13159 13160 ipllc = (ipllc_t *)mp1->b_rptr; 13161 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 13162 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 13163 break; 13164 } 13165 /* 13166 * prefer credential from mblk over ioctl; 13167 * see ip_sioctl_copyin_setup 13168 */ 13169 cr = DB_CREDDEF(mp, iocp->ioc_cr); 13170 13171 /* 13172 * Refhold the conn in case the request gets queued up in some lookup 13173 */ 13174 ASSERT(CONN_Q(q)); 13175 connp = Q_TO_CONN(q); 13176 CONN_INC_REF(connp); 13177 if (ipft->ipft_pfi && 13178 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 13179 pullupmsg(mp1, ipft->ipft_min_size))) { 13180 error = (*ipft->ipft_pfi)(q, 13181 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 13182 } 13183 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 13184 /* 13185 * CONN_OPER_PENDING_DONE happens in the function called 13186 * through ipft_pfi above. 13187 */ 13188 return; 13189 } 13190 13191 CONN_OPER_PENDING_DONE(connp); 13192 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 13193 freemsg(mp); 13194 return; 13195 } 13196 iocp->ioc_error = error; 13197 13198 done: 13199 mp->b_datap->db_type = M_IOCACK; 13200 if (iocp->ioc_error) 13201 iocp->ioc_count = 0; 13202 qreply(q, mp); 13203 } 13204 13205 /* 13206 * Lookup an ipif using the sequence id (ipif_seqid) 13207 */ 13208 ipif_t * 13209 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 13210 { 13211 ipif_t *ipif; 13212 13213 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13214 13215 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13216 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 13217 return (ipif); 13218 } 13219 return (NULL); 13220 } 13221 13222 /* 13223 * Assign a unique id for the ipif. This is used later when we send 13224 * IRES to ARP for resolution where we initialize ire_ipif_seqid 13225 * to the value pointed by ire_ipif->ipif_seqid. Later when the 13226 * IRE is added, we verify that ipif has not disappeared. 
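/*
 * Illustrative aside, not part of the original source: ipif_insert(), just
 * below, keeps the per-ill ipif list sorted by ipif_id, and an id of -1
 * requests the first unused id. A userland sketch of that first-gap scan
 * over a sorted singly linked list; the sketch_* names are hypothetical.
 */
#include <stddef.h>

struct sketch_node {
	int			sn_id;
	struct sketch_node	*sn_next;
};

/* Insert 'n' keeping the list sorted; sn_id == -1 takes the first gap. */
int
sketch_insert(struct sketch_node **head, struct sketch_node *n, int max_ids)
{
	struct sketch_node **pp = head;
	struct sketch_node *cur;
	int id = n->sn_id;

	if (id == -1) {
		id = 0;
		while ((cur = *pp) != NULL && cur->sn_id == id) {
			id++;			/* consecutive, keep going */
			pp = &cur->sn_next;
		}
		if (id >= max_ids)
			return (-1);		/* all ids in use */
		n->sn_id = id;			/* assign the gap we found */
	} else {
		while ((cur = *pp) != NULL && cur->sn_id < id)
			pp = &cur->sn_next;
		if (cur != NULL && cur->sn_id == id)
			return (-1);		/* duplicate id */
	}
	n->sn_next = *pp;
	*pp = n;
	return (0);
}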
13227 */ 13228 13229 static void 13230 ipif_assign_seqid(ipif_t *ipif) 13231 { 13232 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13233 13234 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 13235 } 13236 13237 /* 13238 * Insert the ipif, so that the list of ipifs on the ill will be sorted 13239 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 13240 * be inserted into the first space available in the list. The value of 13241 * ipif_id will then be set to the appropriate value for its position. 13242 */ 13243 static int 13244 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock, boolean_t acquire_ill_lock) 13245 { 13246 ill_t *ill; 13247 ipif_t *tipif; 13248 ipif_t **tipifp; 13249 int id; 13250 ip_stack_t *ipst; 13251 13252 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13253 IAM_WRITER_IPIF(ipif)); 13254 13255 ill = ipif->ipif_ill; 13256 ASSERT(ill != NULL); 13257 ipst = ill->ill_ipst; 13258 13259 /* 13260 * In the case of lo0:0 we already hold the ill_g_lock. 13261 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13262 * ipif_insert. Another such caller is ipif_move. 13263 */ 13264 if (acquire_g_lock) 13265 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13266 if (acquire_ill_lock) 13267 mutex_enter(&ill->ill_lock); 13268 id = ipif->ipif_id; 13269 tipifp = &(ill->ill_ipif); 13270 if (id == -1) { /* need to find a real id */ 13271 id = 0; 13272 while ((tipif = *tipifp) != NULL) { 13273 ASSERT(tipif->ipif_id >= id); 13274 if (tipif->ipif_id != id) 13275 break; /* non-consecutive id */ 13276 id++; 13277 tipifp = &(tipif->ipif_next); 13278 } 13279 /* limit number of logical interfaces */ 13280 if (id >= ipst->ips_ip_addrs_per_if) { 13281 if (acquire_ill_lock) 13282 mutex_exit(&ill->ill_lock); 13283 if (acquire_g_lock) 13284 rw_exit(&ipst->ips_ill_g_lock); 13285 return (-1); 13286 } 13287 ipif->ipif_id = id; /* assign new id */ 13288 } else if (id < ipst->ips_ip_addrs_per_if) { 13289 /* we have a real id; insert ipif in the right place */ 13290 while ((tipif = *tipifp) != NULL) { 13291 ASSERT(tipif->ipif_id != id); 13292 if (tipif->ipif_id > id) 13293 break; /* found correct location */ 13294 tipifp = &(tipif->ipif_next); 13295 } 13296 } else { 13297 if (acquire_ill_lock) 13298 mutex_exit(&ill->ill_lock); 13299 if (acquire_g_lock) 13300 rw_exit(&ipst->ips_ill_g_lock); 13301 return (-1); 13302 } 13303 13304 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 13305 13306 ipif->ipif_next = tipif; 13307 *tipifp = ipif; 13308 if (acquire_ill_lock) 13309 mutex_exit(&ill->ill_lock); 13310 if (acquire_g_lock) 13311 rw_exit(&ipst->ips_ill_g_lock); 13312 return (0); 13313 } 13314 13315 static void 13316 ipif_remove(ipif_t *ipif, boolean_t acquire_ill_lock) 13317 { 13318 ipif_t **ipifp; 13319 ill_t *ill = ipif->ipif_ill; 13320 13321 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 13322 if (acquire_ill_lock) 13323 mutex_enter(&ill->ill_lock); 13324 else 13325 ASSERT(MUTEX_HELD(&ill->ill_lock)); 13326 13327 ipifp = &ill->ill_ipif; 13328 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 13329 if (*ipifp == ipif) { 13330 *ipifp = ipif->ipif_next; 13331 break; 13332 } 13333 } 13334 13335 if (acquire_ill_lock) 13336 mutex_exit(&ill->ill_lock); 13337 } 13338 13339 /* 13340 * Allocate and initialize a new interface control structure. (Always 13341 * called as writer.) 13342 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 13343 * is not part of the global linked list of ills. 
ipif_seqid is unique
 * in the system and to preserve the uniqueness, it is assigned only
 * when ill becomes part of the global list. At that point ill will
 * have a name. If it doesn't get assigned here, it will get assigned
 * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
 * the interface flags or any other information from the DL_INFO_ACK for
 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
 * second DL_INFO_ACK comes in from the driver.
 */
static ipif_t *
ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize)
{
	ipif_t	*ipif;
	phyint_t *phyi;

	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
	    ill->ill_name, id, (void *)ill));
	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));

	if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL)
		return (NULL);
	*ipif = ipif_zero;	/* start clean */

	ipif->ipif_ill = ill;
	ipif->ipif_id = id;	/* could be -1 */
	/*
	 * Inherit the zoneid from the ill; for the shared stack instance
	 * this is always the global zone
	 */
	ipif->ipif_zoneid = ill->ill_zoneid;

	mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);

	ipif->ipif_refcnt = 0;
	ipif->ipif_saved_ire_cnt = 0;

	if (ipif_insert(ipif, ire_type != IRE_LOOPBACK, B_TRUE)) {
		mi_free(ipif);
		return (NULL);
	}
	/* -1 id should have been replaced by real id */
	id = ipif->ipif_id;
	ASSERT(id >= 0);

	if (ill->ill_name[0] != '\0')
		ipif_assign_seqid(ipif);

	/*
	 * Keep a copy of original id in ipif_orig_ipifid. Failback
	 * will attempt to restore the original id. The SIOCSLIFOINDEX
	 * ioctl sets ipif_orig_ipifid to zero.
	 */
	ipif->ipif_orig_ipifid = id;

	/*
	 * We grab the ill_lock and phyint_lock to protect the flag changes.
	 * The ipif is still not up and can't be looked up until the
	 * ioctl completes and the IPIF_CHANGING flag is cleared.
	 */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&ill->ill_phyint->phyint_lock);
	/*
	 * Set the running flag when logical interface zero is created.
	 * For subsequent logical interfaces, a DLPI link down
	 * notification message may have cleared the running flag to
	 * indicate the link is down, so we shouldn't just blindly set it.
13411 */ 13412 if (id == 0) 13413 ill->ill_phyint->phyint_flags |= PHYI_RUNNING; 13414 ipif->ipif_ire_type = ire_type; 13415 phyi = ill->ill_phyint; 13416 ipif->ipif_orig_ifindex = phyi->phyint_ifindex; 13417 13418 if (ipif->ipif_isv6) { 13419 ill->ill_flags |= ILLF_IPV6; 13420 } else { 13421 ipaddr_t inaddr_any = INADDR_ANY; 13422 13423 ill->ill_flags |= ILLF_IPV4; 13424 13425 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 13426 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13427 &ipif->ipif_v6lcl_addr); 13428 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13429 &ipif->ipif_v6src_addr); 13430 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13431 &ipif->ipif_v6subnet); 13432 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13433 &ipif->ipif_v6net_mask); 13434 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13435 &ipif->ipif_v6brd_addr); 13436 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13437 &ipif->ipif_v6pp_dst_addr); 13438 } 13439 13440 /* 13441 * Don't set the interface flags etc. now, will do it in 13442 * ip_ll_subnet_defaults. 13443 */ 13444 if (!initialize) { 13445 mutex_exit(&ill->ill_lock); 13446 mutex_exit(&ill->ill_phyint->phyint_lock); 13447 return (ipif); 13448 } 13449 ipif->ipif_mtu = ill->ill_max_mtu; 13450 13451 if (ill->ill_bcast_addr_length != 0) { 13452 /* 13453 * Later detect lack of DLPI driver multicast 13454 * capability by catching DL_ENABMULTI errors in 13455 * ip_rput_dlpi. 13456 */ 13457 ill->ill_flags |= ILLF_MULTICAST; 13458 if (!ipif->ipif_isv6) 13459 ipif->ipif_flags |= IPIF_BROADCAST; 13460 } else { 13461 if (ill->ill_net_type != IRE_LOOPBACK) { 13462 if (ipif->ipif_isv6) 13463 /* 13464 * Note: xresolv interfaces will eventually need 13465 * NOARP set here as well, but that will require 13466 * those external resolvers to have some 13467 * knowledge of that flag and act appropriately. 13468 * Not to be changed at present. 13469 */ 13470 ill->ill_flags |= ILLF_NONUD; 13471 else 13472 ill->ill_flags |= ILLF_NOARP; 13473 } 13474 if (ill->ill_phys_addr_length == 0) { 13475 if (ill->ill_media && 13476 ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) { 13477 ipif->ipif_flags |= IPIF_NOXMIT; 13478 phyi->phyint_flags |= PHYI_VIRTUAL; 13479 } else { 13480 /* pt-pt supports multicast. */ 13481 ill->ill_flags |= ILLF_MULTICAST; 13482 if (ill->ill_net_type == IRE_LOOPBACK) { 13483 phyi->phyint_flags |= 13484 (PHYI_LOOPBACK | PHYI_VIRTUAL); 13485 } else { 13486 ipif->ipif_flags |= IPIF_POINTOPOINT; 13487 } 13488 } 13489 } 13490 } 13491 mutex_exit(&ill->ill_lock); 13492 mutex_exit(&ill->ill_phyint->phyint_lock); 13493 return (ipif); 13494 } 13495 13496 /* 13497 * If appropriate, send a message up to the resolver delete the entry 13498 * for the address of this interface which is going out of business. 13499 * (Always called as writer). 13500 * 13501 * NOTE : We need to check for NULL mps as some of the fields are 13502 * initialized only for some interface types. See ipif_resolver_up() 13503 * for details. 
 */
void
ipif_arp_down(ipif_t *ipif)
{
	mblk_t	*mp;
	ill_t	*ill = ipif->ipif_ill;

	ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Delete the mapping for the local address */
	mp = ipif->ipif_arp_del_mp;
	if (mp != NULL) {
		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, mp);
		ipif->ipif_arp_del_mp = NULL;
	}

	/*
	 * If this is the last ipif that is going down and there are no
	 * duplicate addresses we may yet attempt to re-probe, then we need to
	 * clean up ARP completely.
	 */
	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {

		/* Send up AR_INTERFACE_DOWN message */
		mp = ill->ill_arp_down_mp;
		if (mp != NULL) {
			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
			    *(unsigned *)mp->b_rptr, ill->ill_name,
			    ipif->ipif_id));
			putnext(ill->ill_rq, mp);
			ill->ill_arp_down_mp = NULL;
		}

		/* Tell ARP to delete the multicast mappings */
		mp = ill->ill_arp_del_mapping_mp;
		if (mp != NULL) {
			ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
			    *(unsigned *)mp->b_rptr, ill->ill_name,
			    ipif->ipif_id));
			putnext(ill->ill_rq, mp);
			ill->ill_arp_del_mapping_mp = NULL;
		}
	}
}

/*
 * This function sets up the multicast mappings in ARP. When ipif_resolver_up
 * calls this function, it passes a non-NULL arp_add_mapping_mp indicating
 * that it wants the add_mp allocated in this function to be returned
 * without sending it to ARP. When ip_rput_dlpi_writer calls this to
 * just re-do the multicast, it wants us to send the add_mp to ARP also.
 * ipif_resolver_up does not want us to do the "add" i.e. sending to ARP,
 * as it does an ipif_arp_down after calling this function - which will
 * remove what we add here.
 *
 * Returns -1 on failures and 0 on success.
 */
int
ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
{
	mblk_t	*del_mp = NULL;
	mblk_t	*add_mp = NULL;
	mblk_t	*mp;
	ill_t	*ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;
	ipaddr_t addr, mask, extract_mask = 0;
	arma_t	*arma;
	uint8_t	*maddr, *bphys_addr;
	uint32_t hw_start;
	dl_unitdata_req_t *dlur;

	ASSERT(IAM_WRITER_IPIF(ipif));
	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		return (0);

	/*
	 * Delete the existing mapping from ARP. Normally ipif_down
	 * -> ipif_arp_down should send this up to ARP. The only
	 * reason we would find this is when we are switching from
	 * Multicast to Broadcast where we did not do a down.
	 */
	mp = ill->ill_arp_del_mapping_mp;
	if (mp != NULL) {
		ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n",
		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, mp);
		ill->ill_arp_del_mapping_mp = NULL;
	}

	if (arp_add_mapping_mp != NULL)
		*arp_add_mapping_mp = NULL;

	/*
	 * Check that the address is not too long for the constant
	 * length reserved in the template arma_t.
13602 */ 13603 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) 13604 return (-1); 13605 13606 /* Add mapping mblk */ 13607 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); 13608 mask = (ipaddr_t)htonl(IN_CLASSD_NET); 13609 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, 13610 (caddr_t)&addr); 13611 if (add_mp == NULL) 13612 return (-1); 13613 arma = (arma_t *)add_mp->b_rptr; 13614 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; 13615 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); 13616 arma->arma_hw_addr_length = ill->ill_phys_addr_length; 13617 13618 /* 13619 * Determine the broadcast address. 13620 */ 13621 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 13622 if (ill->ill_sap_length < 0) 13623 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 13624 else 13625 bphys_addr = (uchar_t *)dlur + 13626 dlur->dl_dest_addr_offset + ill->ill_sap_length; 13627 /* 13628 * Check PHYI_MULTI_BCAST and length of physical 13629 * address to determine if we use the mapping or the 13630 * broadcast address. 13631 */ 13632 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 13633 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 13634 bphys_addr, maddr, &hw_start, &extract_mask)) 13635 phyi->phyint_flags |= PHYI_MULTI_BCAST; 13636 13637 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 13638 (ill->ill_flags & ILLF_MULTICAST)) { 13639 /* Make sure this will not match the "exact" entry. */ 13640 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13641 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13642 (caddr_t)&addr); 13643 if (del_mp == NULL) { 13644 freemsg(add_mp); 13645 return (-1); 13646 } 13647 bcopy(&extract_mask, (char *)arma + 13648 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13649 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13650 /* Use link-layer broadcast address for MULTI_BCAST */ 13651 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13652 ip2dbg(("ipif_arp_setup_multicast: adding" 13653 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13654 } else { 13655 arma->arma_hw_mapping_start = hw_start; 13656 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13657 " ARP setup for %s\n", ill->ill_name)); 13658 } 13659 } else { 13660 freemsg(add_mp); 13661 ASSERT(del_mp == NULL); 13662 /* It is neither MULTICAST nor MULTI_BCAST */ 13663 return (0); 13664 } 13665 ASSERT(add_mp != NULL && del_mp != NULL); 13666 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13667 ill->ill_arp_del_mapping_mp = del_mp; 13668 if (arp_add_mapping_mp != NULL) { 13669 /* The caller just wants the mblks allocated */ 13670 *arp_add_mapping_mp = add_mp; 13671 } else { 13672 /* The caller wants us to send it to arp */ 13673 putnext(ill->ill_rq, add_mp); 13674 } 13675 return (0); 13676 } 13677 13678 /* 13679 * Get the resolver set up for a new interface address. 13680 * (Always called as writer.) 13681 * Called both for IPv4 and IPv6 interfaces, 13682 * though it only sets up the resolver for v6 13683 * if it's an xresolv interface (one using an external resolver). 13684 * Honors ILLF_NOARP. 13685 * The enumerated value res_act is used to tune the behavior. 13686 * If set to Res_act_initial, then we set up all the resolver 13687 * structures for a new interface. If set to Res_act_move, then 13688 * we just send an AR_ENTRY_ADD message up to ARP for IPv4 13689 * interfaces; this is called by ip_rput_dlpi_writer() to handle 13690 * asynchronous hardware address change notification. 
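/*
 * Illustrative aside, not part of the original source: on Ethernet-like
 * media the mapping that ipif_arp_setup_multicast() above installs (via the
 * arma extract mask) is the classic RFC 1112 one - the low 23 bits of the
 * IPv4 group address are folded into the 01:00:5e:00:00:00 prefix. A
 * standalone sketch; sketch_mcast_to_ether is a hypothetical name.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

void
sketch_mcast_to_ether(uint32_t group, uint8_t mac[6])	/* host byte order */
{
	mac[0] = 0x01;
	mac[1] = 0x00;
	mac[2] = 0x5e;
	mac[3] = (group >> 16) & 0x7f;	/* only 23 bits are extracted */
	mac[4] = (group >> 8) & 0xff;
	mac[5] = group & 0xff;
}

int
main(void)
{
	uint8_t mac[6];
	int i;

	sketch_mcast_to_ether(ntohl(inet_addr("224.10.8.5")), mac);
	for (i = 0; i < 6; i++)
		printf("%02x%c", mac[i], (i < 5) ? ':' : '\n');
	/* prints 01:00:5e:0a:08:05 */
	return (0);
}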
If set to 13691 * Res_act_defend, then we tell ARP that it needs to send a single 13692 * gratuitous message in defense of the address. 13693 * Returns error on failure. 13694 */ 13695 int 13696 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 13697 { 13698 caddr_t addr; 13699 mblk_t *arp_up_mp = NULL; 13700 mblk_t *arp_down_mp = NULL; 13701 mblk_t *arp_add_mp = NULL; 13702 mblk_t *arp_del_mp = NULL; 13703 mblk_t *arp_add_mapping_mp = NULL; 13704 mblk_t *arp_del_mapping_mp = NULL; 13705 ill_t *ill = ipif->ipif_ill; 13706 uchar_t *area_p = NULL; 13707 uchar_t *ared_p = NULL; 13708 int err = ENOMEM; 13709 boolean_t was_dup; 13710 13711 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 13712 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 13713 ASSERT(IAM_WRITER_IPIF(ipif)); 13714 13715 was_dup = B_FALSE; 13716 if (res_act == Res_act_initial) { 13717 ipif->ipif_addr_ready = 0; 13718 /* 13719 * We're bringing an interface up here. There's no way that we 13720 * should need to shut down ARP now. 13721 */ 13722 mutex_enter(&ill->ill_lock); 13723 if (ipif->ipif_flags & IPIF_DUPLICATE) { 13724 ipif->ipif_flags &= ~IPIF_DUPLICATE; 13725 ill->ill_ipif_dup_count--; 13726 was_dup = B_TRUE; 13727 } 13728 mutex_exit(&ill->ill_lock); 13729 } 13730 if (ipif->ipif_recovery_id != 0) 13731 (void) untimeout(ipif->ipif_recovery_id); 13732 ipif->ipif_recovery_id = 0; 13733 if (ill->ill_net_type != IRE_IF_RESOLVER) { 13734 ipif->ipif_addr_ready = 1; 13735 return (0); 13736 } 13737 /* NDP will set the ipif_addr_ready flag when it's ready */ 13738 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 13739 return (0); 13740 13741 if (ill->ill_isv6) { 13742 /* 13743 * External resolver for IPv6 13744 */ 13745 ASSERT(res_act == Res_act_initial); 13746 if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 13747 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 13748 area_p = (uchar_t *)&ip6_area_template; 13749 ared_p = (uchar_t *)&ip6_ared_template; 13750 } 13751 } else { 13752 /* 13753 * IPv4 arp case. If the ARP stream has already started 13754 * closing, fail this request for ARP bringup. Else 13755 * record the fact that an ARP bringup is pending. 13756 */ 13757 mutex_enter(&ill->ill_lock); 13758 if (ill->ill_arp_closing) { 13759 mutex_exit(&ill->ill_lock); 13760 err = EINVAL; 13761 goto failed; 13762 } else { 13763 if (ill->ill_ipif_up_count == 0 && 13764 ill->ill_ipif_dup_count == 0 && !was_dup) 13765 ill->ill_arp_bringup_pending = 1; 13766 mutex_exit(&ill->ill_lock); 13767 } 13768 if (ipif->ipif_lcl_addr != INADDR_ANY) { 13769 addr = (caddr_t)&ipif->ipif_lcl_addr; 13770 area_p = (uchar_t *)&ip_area_template; 13771 ared_p = (uchar_t *)&ip_ared_template; 13772 } 13773 } 13774 13775 /* 13776 * Add an entry for the local address in ARP only if it 13777 * is not UNNUMBERED and the address is not INADDR_ANY. 13778 */ 13779 if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { 13780 area_t *area; 13781 13782 /* Now ask ARP to publish our address. */ 13783 arp_add_mp = ill_arp_alloc(ill, area_p, addr); 13784 if (arp_add_mp == NULL) 13785 goto failed; 13786 area = (area_t *)arp_add_mp->b_rptr; 13787 if (res_act != Res_act_initial) { 13788 /* 13789 * Copy the new hardware address and length into 13790 * arp_add_mp to be sent to ARP. 
13791 */ 13792 area->area_hw_addr_length = ill->ill_phys_addr_length; 13793 bcopy(ill->ill_phys_addr, 13794 ((char *)area + area->area_hw_addr_offset), 13795 area->area_hw_addr_length); 13796 } 13797 13798 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | 13799 ACE_F_MYADDR; 13800 13801 if (res_act == Res_act_defend) { 13802 area->area_flags |= ACE_F_DEFEND; 13803 /* 13804 * If we're just defending our address now, then 13805 * there's no need to set up ARP multicast mappings. 13806 * The publish command is enough. 13807 */ 13808 goto done; 13809 } 13810 13811 if (res_act != Res_act_initial) 13812 goto arp_setup_multicast; 13813 13814 /* 13815 * Allocate an ARP deletion message so we know we can tell ARP 13816 * when the interface goes down. 13817 */ 13818 arp_del_mp = ill_arp_alloc(ill, ared_p, addr); 13819 if (arp_del_mp == NULL) 13820 goto failed; 13821 13822 } else { 13823 if (res_act != Res_act_initial) 13824 goto done; 13825 } 13826 /* 13827 * Need to bring up ARP or setup multicast mapping only 13828 * when the first interface is coming UP. 13829 */ 13830 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 13831 was_dup) { 13832 goto done; 13833 } 13834 13835 /* 13836 * Allocate an ARP down message (to be saved) and an ARP up 13837 * message. 13838 */ 13839 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 13840 if (arp_down_mp == NULL) 13841 goto failed; 13842 13843 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 13844 if (arp_up_mp == NULL) 13845 goto failed; 13846 13847 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13848 goto done; 13849 13850 arp_setup_multicast: 13851 /* 13852 * Setup the multicast mappings. This function initializes 13853 * ill_arp_del_mapping_mp also. This does not need to be done for 13854 * IPv6. 13855 */ 13856 if (!ill->ill_isv6) { 13857 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 13858 if (err != 0) 13859 goto failed; 13860 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 13861 ASSERT(arp_add_mapping_mp != NULL); 13862 } 13863 13864 done: 13865 if (arp_del_mp != NULL) { 13866 ASSERT(ipif->ipif_arp_del_mp == NULL); 13867 ipif->ipif_arp_del_mp = arp_del_mp; 13868 } 13869 if (arp_down_mp != NULL) { 13870 ASSERT(ill->ill_arp_down_mp == NULL); 13871 ill->ill_arp_down_mp = arp_down_mp; 13872 } 13873 if (arp_del_mapping_mp != NULL) { 13874 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13875 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 13876 } 13877 if (arp_up_mp != NULL) { 13878 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 13879 ill->ill_name, ipif->ipif_id)); 13880 putnext(ill->ill_rq, arp_up_mp); 13881 } 13882 if (arp_add_mp != NULL) { 13883 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 13884 ill->ill_name, ipif->ipif_id)); 13885 /* 13886 * If it's an extended ARP implementation, then we'll wait to 13887 * hear that DAD has finished before using the interface. 
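/*
 * Illustrative aside, not part of the original source: ipif_resolver_up()
 * pre-allocates every ARP message it might need and, on any failure, frees
 * whatever was obtained so far under one "failed:" label; freemsg(NULL)
 * being a no-op keeps the unwind unconditional. The same idiom in
 * standalone C, where free(NULL) plays that role; names are hypothetical.
 */
#include <stdlib.h>

int
sketch_bringup(void **out_a, void **out_b)
{
	void *a = NULL, *b = NULL;

	if ((a = malloc(64)) == NULL)
		goto failed;
	if ((b = malloc(64)) == NULL)
		goto failed;

	*out_a = a;
	*out_b = b;
	return (0);

failed:
	free(a);	/* free(NULL) is a no-op, like freemsg(NULL) */
	free(b);
	return (-1);
}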
13888 */ 13889 if (!ill->ill_arp_extend) 13890 ipif->ipif_addr_ready = 1; 13891 putnext(ill->ill_rq, arp_add_mp); 13892 } else { 13893 ipif->ipif_addr_ready = 1; 13894 } 13895 if (arp_add_mapping_mp != NULL) { 13896 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 13897 ill->ill_name, ipif->ipif_id)); 13898 putnext(ill->ill_rq, arp_add_mapping_mp); 13899 } 13900 if (res_act != Res_act_initial) 13901 return (0); 13902 13903 if (ill->ill_flags & ILLF_NOARP) 13904 err = ill_arp_off(ill); 13905 else 13906 err = ill_arp_on(ill); 13907 if (err != 0) { 13908 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); 13909 freemsg(ipif->ipif_arp_del_mp); 13910 freemsg(ill->ill_arp_down_mp); 13911 freemsg(ill->ill_arp_del_mapping_mp); 13912 ipif->ipif_arp_del_mp = NULL; 13913 ill->ill_arp_down_mp = NULL; 13914 ill->ill_arp_del_mapping_mp = NULL; 13915 return (err); 13916 } 13917 return ((ill->ill_ipif_up_count != 0 || was_dup || 13918 ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); 13919 13920 failed: 13921 ip1dbg(("ipif_resolver_up: FAILED\n")); 13922 freemsg(arp_add_mp); 13923 freemsg(arp_del_mp); 13924 freemsg(arp_add_mapping_mp); 13925 freemsg(arp_up_mp); 13926 freemsg(arp_down_mp); 13927 ill->ill_arp_bringup_pending = 0; 13928 return (err); 13929 } 13930 13931 /* 13932 * This routine restarts IPv4 duplicate address detection (DAD) when a link has 13933 * just gone back up. 13934 */ 13935 static void 13936 ipif_arp_start_dad(ipif_t *ipif) 13937 { 13938 ill_t *ill = ipif->ipif_ill; 13939 mblk_t *arp_add_mp; 13940 area_t *area; 13941 13942 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 13943 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13944 ipif->ipif_lcl_addr == INADDR_ANY || 13945 (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 13946 (char *)&ipif->ipif_lcl_addr)) == NULL) { 13947 /* 13948 * If we can't contact ARP for some reason, that's not really a 13949 * problem. Just send out the routing socket notification that 13950 * DAD completion would have done, and continue. 13951 */ 13952 ipif_mask_reply(ipif); 13953 ip_rts_ifmsg(ipif); 13954 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 13955 sctp_update_ipif(ipif, SCTP_IPIF_UP); 13956 ipif->ipif_addr_ready = 1; 13957 return; 13958 } 13959 13960 /* Setting the 'unverified' flag restarts DAD */ 13961 area = (area_t *)arp_add_mp->b_rptr; 13962 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | 13963 ACE_F_UNVERIFIED; 13964 putnext(ill->ill_rq, arp_add_mp); 13965 } 13966 13967 static void 13968 ipif_ndp_start_dad(ipif_t *ipif) 13969 { 13970 nce_t *nce; 13971 13972 nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); 13973 if (nce == NULL) 13974 return; 13975 13976 if (!ndp_restart_dad(nce)) { 13977 /* 13978 * If we can't restart DAD for some reason, that's not really a 13979 * problem. Just send out the routing socket notification that 13980 * DAD completion would have done, and continue. 13981 */ 13982 ip_rts_ifmsg(ipif); 13983 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 13984 sctp_update_ipif(ipif, SCTP_IPIF_UP); 13985 ipif->ipif_addr_ready = 1; 13986 } 13987 NCE_REFRELE(nce); 13988 } 13989 13990 /* 13991 * Restart duplicate address detection on all interfaces on the given ill. 13992 * 13993 * This is called when an interface transitions from down to up 13994 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 
13995 * 13996 * Note that since the underlying physical link has transitioned, we must cause 13997 * at least one routing socket message to be sent here, either via DAD 13998 * completion or just by default on the first ipif. (If we don't do this, then 13999 * in.mpathd will see long delays when doing link-based failure recovery.) 14000 */ 14001 void 14002 ill_restart_dad(ill_t *ill, boolean_t went_up) 14003 { 14004 ipif_t *ipif; 14005 14006 if (ill == NULL) 14007 return; 14008 14009 /* 14010 * If layer two doesn't support duplicate address detection, then just 14011 * send the routing socket message now and be done with it. 14012 */ 14013 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 14014 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 14015 ip_rts_ifmsg(ill->ill_ipif); 14016 return; 14017 } 14018 14019 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14020 if (went_up) { 14021 if (ipif->ipif_flags & IPIF_UP) { 14022 if (ill->ill_isv6) 14023 ipif_ndp_start_dad(ipif); 14024 else 14025 ipif_arp_start_dad(ipif); 14026 } else if (ill->ill_isv6 && 14027 (ipif->ipif_flags & IPIF_DUPLICATE)) { 14028 /* 14029 * For IPv4, the ARP module itself will 14030 * automatically start the DAD process when it 14031 * sees DL_NOTE_LINK_UP. We respond to the 14032 * AR_CN_READY at the completion of that task. 14033 * For IPv6, we must kick off the bring-up 14034 * process now. 14035 */ 14036 ndp_do_recovery(ipif); 14037 } else { 14038 /* 14039 * Unfortunately, the first ipif is "special" 14040 * and represents the underlying ill in the 14041 * routing socket messages. Thus, when this 14042 * one ipif is down, we must still notify so 14043 * that the user knows the IFF_RUNNING status 14044 * change. (If the first ipif is up, then 14045 * we'll handle eventual routing socket 14046 * notification via DAD completion.) 14047 */ 14048 if (ipif == ill->ill_ipif) 14049 ip_rts_ifmsg(ill->ill_ipif); 14050 } 14051 } else { 14052 /* 14053 * After link down, we'll need to send a new routing 14054 * message when the link comes back, so clear 14055 * ipif_addr_ready. 14056 */ 14057 ipif->ipif_addr_ready = 0; 14058 } 14059 } 14060 14061 /* 14062 * If we've torn down links, then notify the user right away. 14063 */ 14064 if (!went_up) 14065 ip_rts_ifmsg(ill->ill_ipif); 14066 } 14067 14068 /* 14069 * Wakeup all threads waiting to enter the ipsq, and sleeping 14070 * on any of the ills in this ipsq. 
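/*
 * Illustrative aside, not part of the original source: the rule stated in
 * the comment resumed below - broadcast while holding the same lock the
 * waiter sleeps under - is the standard defense against missed wakeups. A
 * POSIX-threads sketch of the safe shape; the sk_* names are hypothetical.
 */
#include <pthread.h>

static pthread_mutex_t	sk_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	sk_cv = PTHREAD_COND_INITIALIZER;
static int		sk_ready = 0;

void
sketch_waiter(void)
{
	pthread_mutex_lock(&sk_lock);
	while (!sk_ready)		/* predicate re-checked on wakeup */
		pthread_cond_wait(&sk_cv, &sk_lock);
	pthread_mutex_unlock(&sk_lock);
}

void
sketch_signaller(void)
{
	pthread_mutex_lock(&sk_lock);	/* hold the lock across the change */
	sk_ready = 1;
	pthread_cond_broadcast(&sk_cv);
	pthread_mutex_unlock(&sk_lock);
}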
The ill_lock of the ill
 * must be held so that waiters don't miss wakeups.
 */
static void
ill_signal_ipsq_ills(ipsq_t *ipsq, boolean_t caller_holds_lock)
{
	phyint_t *phyint;

	phyint = ipsq->ipsq_phyint_list;
	while (phyint != NULL) {
		if (phyint->phyint_illv4) {
			if (!caller_holds_lock)
				mutex_enter(&phyint->phyint_illv4->ill_lock);
			ASSERT(MUTEX_HELD(&phyint->phyint_illv4->ill_lock));
			cv_broadcast(&phyint->phyint_illv4->ill_cv);
			if (!caller_holds_lock)
				mutex_exit(&phyint->phyint_illv4->ill_lock);
		}
		if (phyint->phyint_illv6) {
			if (!caller_holds_lock)
				mutex_enter(&phyint->phyint_illv6->ill_lock);
			ASSERT(MUTEX_HELD(&phyint->phyint_illv6->ill_lock));
			cv_broadcast(&phyint->phyint_illv6->ill_cv);
			if (!caller_holds_lock)
				mutex_exit(&phyint->phyint_illv6->ill_lock);
		}
		phyint = phyint->phyint_ipsq_next;
	}
}

static ipsq_t *
ipsq_create(char *groupname, ip_stack_t *ipst)
{
	ipsq_t	*ipsq;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
	ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP);
	if (ipsq == NULL) {
		return (NULL);
	}

	if (groupname != NULL)
		(void) strcpy(ipsq->ipsq_name, groupname);
	else
		ipsq->ipsq_name[0] = '\0';

	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, NULL);
	ipsq->ipsq_flags |= IPSQ_GROUP;
	ipsq->ipsq_next = ipst->ips_ipsq_g_head;
	ipst->ips_ipsq_g_head = ipsq;
	ipsq->ipsq_ipst = ipst;		/* No netstack_hold */
	return (ipsq);
}

/*
 * Return an ipsq corresponding to the groupname. If 'create' is true,
 * allocate a new ipsq if one does not exist. Usually an ipsq is associated
 * uniquely with an IPMP group. However during IPMP groupname operations,
 * multiple IPMP groups may be associated with a single ipsq. But no
 * IPMP group can be associated with more than 1 ipsq at any time.
 * For example
 *	Interfaces		IPMP grpname	ipsq	ipsq_name	ipsq_refs
 *	hme1, hme2		mpk17-84	ipsq1	mpk17-84	2
 *	hme3, hme4		mpk17-85	ipsq2	mpk17-85	2
 *
 * Now the command ifconfig hme3 group mpk17-84 results in the temporary
 * status shown below during the execution of the above command.
 *	hme1, hme2, hme3, hme4	mpk17-84, mpk17-85	ipsq1	mpk17-84  4
 *
 * After the completion of the above groupname command we return to the stable
 * state shown below.
 *	hme1, hme2, hme3	mpk17-84	ipsq1	mpk17-84	3
 *	hme4			mpk17-85	ipsq2	mpk17-85	1
 *
 * Because of the above, we don't search based on the ipsq_name since that
 * would miss the correct ipsq during certain windows as shown above.
 * The ipsq_name is only used during split of an ipsq to return the ipsq to its
 * natural state.
 */
static ipsq_t *
ip_ipsq_lookup(char *groupname, boolean_t create, ipsq_t *exclude_ipsq,
    ip_stack_t *ipst)
{
	ipsq_t	*ipsq;
	int	group_len;
	phyint_t *phyint;

	ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));

	group_len = strlen(groupname);
	ASSERT(group_len != 0);
	group_len++;

	for (ipsq = ipst->ips_ipsq_g_head;
	    ipsq != NULL;
	    ipsq = ipsq->ipsq_next) {
		/*
		 * When an ipsq is being split, and ill_split_ipsq
		 * calls this function, we exclude it from being considered.
14169 */ 14170 if (ipsq == exclude_ipsq) 14171 continue; 14172 14173 /* 14174 * Compare against the ipsq_name. The groupname change happens 14175 * in 2 phases. The 1st phase merges the from group into 14176 * the to group's ipsq, by calling ill_merge_groups and restarts 14177 * the ioctl. The 2nd phase then locates the ipsq again thru 14178 * ipsq_name. At this point the phyint_groupname has not been 14179 * updated. 14180 */ 14181 if ((group_len == strlen(ipsq->ipsq_name) + 1) && 14182 (bcmp(ipsq->ipsq_name, groupname, group_len) == 0)) { 14183 /* 14184 * Verify that an ipmp groupname is exactly 14185 * part of 1 ipsq and is not found in any other 14186 * ipsq. 14187 */ 14188 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, ipst) == 14189 NULL); 14190 return (ipsq); 14191 } 14192 14193 /* 14194 * Comparison against ipsq_name alone is not sufficient. 14195 * In the case when groups are currently being 14196 * merged, the ipsq could hold other IPMP groups temporarily. 14197 * so we walk the phyint list and compare against the 14198 * phyint_groupname as well. 14199 */ 14200 phyint = ipsq->ipsq_phyint_list; 14201 while (phyint != NULL) { 14202 if ((group_len == phyint->phyint_groupname_len) && 14203 (bcmp(phyint->phyint_groupname, groupname, 14204 group_len) == 0)) { 14205 /* 14206 * Verify that an ipmp groupname is exactly 14207 * part of 1 ipsq and is not found in any other 14208 * ipsq. 14209 */ 14210 ASSERT(ip_ipsq_lookup(groupname, B_FALSE, ipsq, 14211 ipst) == NULL); 14212 return (ipsq); 14213 } 14214 phyint = phyint->phyint_ipsq_next; 14215 } 14216 } 14217 if (create) 14218 ipsq = ipsq_create(groupname, ipst); 14219 return (ipsq); 14220 } 14221 14222 static void 14223 ipsq_delete(ipsq_t *ipsq) 14224 { 14225 ipsq_t *nipsq; 14226 ipsq_t *pipsq = NULL; 14227 ip_stack_t *ipst = ipsq->ipsq_ipst; 14228 14229 /* 14230 * We don't hold the ipsq lock, but we are sure no new 14231 * messages can land up, since the ipsq_refs is zero. 14232 * i.e. this ipsq is unnamed and no phyint or phyint group 14233 * is associated with this ipsq. (Lookups are based on ill_name 14234 * or phyint_groupname) 14235 */ 14236 ASSERT(ipsq->ipsq_refs == 0); 14237 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipsq->ipsq_mphead == NULL); 14238 ASSERT(ipsq->ipsq_pending_mp == NULL); 14239 if (!(ipsq->ipsq_flags & IPSQ_GROUP)) { 14240 /* 14241 * This is not the ipsq of an IPMP group. 14242 */ 14243 ipsq->ipsq_ipst = NULL; 14244 kmem_free(ipsq, sizeof (ipsq_t)); 14245 return; 14246 } 14247 14248 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14249 14250 /* 14251 * Locate the ipsq before we can remove it from 14252 * the singly linked list of ipsq's. 
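/*
 * Illustrative aside, not part of the original source: as the comment above
 * explains, ip_ipsq_lookup() cannot trust the ipsq_name alone while groups
 * are being merged, so it also walks every member phyint's groupname. A
 * standalone sketch of that two-level match; the sketch_* names are
 * hypothetical.
 */
#include <string.h>

struct sketch_member {
	const char		*sm_group;
	struct sketch_member	*sm_next;
};

struct sketch_group {
	const char		*sg_name;
	struct sketch_member	*sg_members;
	struct sketch_group	*sg_next;
};

struct sketch_group *
sketch_lookup(struct sketch_group *head, const char *name)
{
	struct sketch_group *g;
	struct sketch_member *m;

	for (g = head; g != NULL; g = g->sg_next) {
		if (strcmp(g->sg_name, name) == 0)
			return (g);	/* canonical name matches */
		for (m = g->sg_members; m != NULL; m = m->sm_next) {
			if (strcmp(m->sm_group, name) == 0)
				return (g);	/* a member carries the name */
		}
	}
	return (NULL);
}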
14253 */ 14254 for (nipsq = ipst->ips_ipsq_g_head; nipsq != NULL; 14255 nipsq = nipsq->ipsq_next) { 14256 if (nipsq == ipsq) { 14257 break; 14258 } 14259 pipsq = nipsq; 14260 } 14261 14262 ASSERT(nipsq == ipsq); 14263 14264 /* unlink ipsq from the list */ 14265 if (pipsq != NULL) 14266 pipsq->ipsq_next = ipsq->ipsq_next; 14267 else 14268 ipst->ips_ipsq_g_head = ipsq->ipsq_next; 14269 ipsq->ipsq_ipst = NULL; 14270 kmem_free(ipsq, sizeof (ipsq_t)); 14271 rw_exit(&ipst->ips_ill_g_lock); 14272 } 14273 14274 static void 14275 ill_move_to_new_ipsq(ipsq_t *old_ipsq, ipsq_t *new_ipsq, mblk_t *current_mp, 14276 queue_t *q) 14277 { 14278 ASSERT(MUTEX_HELD(&new_ipsq->ipsq_lock)); 14279 ASSERT(old_ipsq->ipsq_mphead == NULL && old_ipsq->ipsq_mptail == NULL); 14280 ASSERT(old_ipsq->ipsq_pending_ipif == NULL); 14281 ASSERT(old_ipsq->ipsq_pending_mp == NULL); 14282 ASSERT(current_mp != NULL); 14283 14284 ipsq_enq(new_ipsq, q, current_mp, (ipsq_func_t)ip_process_ioctl, 14285 NEW_OP, NULL); 14286 14287 ASSERT(new_ipsq->ipsq_xopq_mptail != NULL && 14288 new_ipsq->ipsq_xopq_mphead != NULL); 14289 14290 /* 14291 * move from old ipsq to the new ipsq. 14292 */ 14293 new_ipsq->ipsq_xopq_mptail->b_next = old_ipsq->ipsq_xopq_mphead; 14294 if (old_ipsq->ipsq_xopq_mphead != NULL) 14295 new_ipsq->ipsq_xopq_mptail = old_ipsq->ipsq_xopq_mptail; 14296 14297 old_ipsq->ipsq_xopq_mphead = old_ipsq->ipsq_xopq_mptail = NULL; 14298 } 14299 14300 void 14301 ill_group_cleanup(ill_t *ill) 14302 { 14303 ill_t *ill_v4; 14304 ill_t *ill_v6; 14305 ipif_t *ipif; 14306 14307 ill_v4 = ill->ill_phyint->phyint_illv4; 14308 ill_v6 = ill->ill_phyint->phyint_illv6; 14309 14310 if (ill_v4 != NULL) { 14311 mutex_enter(&ill_v4->ill_lock); 14312 for (ipif = ill_v4->ill_ipif; ipif != NULL; 14313 ipif = ipif->ipif_next) { 14314 IPIF_UNMARK_MOVING(ipif); 14315 } 14316 ill_v4->ill_up_ipifs = B_FALSE; 14317 mutex_exit(&ill_v4->ill_lock); 14318 } 14319 14320 if (ill_v6 != NULL) { 14321 mutex_enter(&ill_v6->ill_lock); 14322 for (ipif = ill_v6->ill_ipif; ipif != NULL; 14323 ipif = ipif->ipif_next) { 14324 IPIF_UNMARK_MOVING(ipif); 14325 } 14326 ill_v6->ill_up_ipifs = B_FALSE; 14327 mutex_exit(&ill_v6->ill_lock); 14328 } 14329 } 14330 /* 14331 * This function is called when an ill has had a change in its group status 14332 * to bring up all the ipifs that were up before the change. 14333 */ 14334 int 14335 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 14336 { 14337 ipif_t *ipif; 14338 ill_t *ill_v4; 14339 ill_t *ill_v6; 14340 ill_t *from_ill; 14341 int err = 0; 14342 14343 14344 ASSERT(IAM_WRITER_ILL(ill)); 14345 14346 /* 14347 * Except for ipif_state_flags and ill_state_flags the other 14348 * fields of the ipif/ill that are modified below are protected 14349 * implicitly since we are a writer. We would have tried to down 14350 * even an ipif that was already down, in ill_down_ipifs. So we 14351 * just blindly clear the IPIF_CHANGING flag here on all ipifs. 
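/*
 * Illustrative aside, not part of the original source: the xopq handoff in
 * ill_move_to_new_ipsq() above is an O(1) splice - link the new tail to the
 * old head and adopt the old tail. The same operation on a plain head/tail
 * queue; the sketch_* names are hypothetical.
 */
#include <stddef.h>

struct sketch_msg {
	struct sketch_msg	*m_next;
};

struct sketch_queue {
	struct sketch_msg	*q_head;
	struct sketch_msg	*q_tail;
};

/* Append everything on 'from' to 'to', leaving 'from' empty. */
void
sketch_splice(struct sketch_queue *to, struct sketch_queue *from)
{
	if (from->q_head == NULL)
		return;
	if (to->q_tail != NULL)
		to->q_tail->m_next = from->q_head;
	else
		to->q_head = from->q_head;
	to->q_tail = from->q_tail;
	from->q_head = from->q_tail = NULL;
}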
 */
	ill_v4 = ill->ill_phyint->phyint_illv4;
	ill_v6 = ill->ill_phyint->phyint_illv6;
	if (ill_v4 != NULL) {
		ill_v4->ill_up_ipifs = B_TRUE;
		for (ipif = ill_v4->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			mutex_enter(&ill_v4->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			IPIF_UNMARK_MOVING(ipif);
			mutex_exit(&ill_v4->ill_lock);
			if (ipif->ipif_was_up) {
				if (!(ipif->ipif_flags & IPIF_UP))
					err = ipif_up(ipif, q, mp);
				ipif->ipif_was_up = B_FALSE;
				if (err != 0) {
					/*
					 * Can there be any other error?
					 */
					ASSERT(err == EINPROGRESS);
					return (err);
				}
			}
		}
		mutex_enter(&ill_v4->ill_lock);
		ill_v4->ill_state_flags &= ~ILL_CHANGING;
		mutex_exit(&ill_v4->ill_lock);
		ill_v4->ill_up_ipifs = B_FALSE;
		if (ill_v4->ill_move_in_progress) {
			ASSERT(ill_v4->ill_move_peer != NULL);
			ill_v4->ill_move_in_progress = B_FALSE;
			from_ill = ill_v4->ill_move_peer;
			from_ill->ill_move_in_progress = B_FALSE;
			from_ill->ill_move_peer = NULL;
			mutex_enter(&from_ill->ill_lock);
			from_ill->ill_state_flags &= ~ILL_CHANGING;
			mutex_exit(&from_ill->ill_lock);
			if (ill_v6 == NULL) {
				if (from_ill->ill_phyint->phyint_flags &
				    PHYI_STANDBY) {
					phyint_inactive(from_ill->ill_phyint);
				}
				if (ill_v4->ill_phyint->phyint_flags &
				    PHYI_STANDBY) {
					phyint_inactive(ill_v4->ill_phyint);
				}
			}
			ill_v4->ill_move_peer = NULL;
		}
	}

	if (ill_v6 != NULL) {
		ill_v6->ill_up_ipifs = B_TRUE;
		for (ipif = ill_v6->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			mutex_enter(&ill_v6->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			IPIF_UNMARK_MOVING(ipif);
			mutex_exit(&ill_v6->ill_lock);
			if (ipif->ipif_was_up) {
				if (!(ipif->ipif_flags & IPIF_UP))
					err = ipif_up(ipif, q, mp);
				ipif->ipif_was_up = B_FALSE;
				if (err != 0) {
					/*
					 * Can there be any other error?
					 */
					ASSERT(err == EINPROGRESS);
					return (err);
				}
			}
		}
		mutex_enter(&ill_v6->ill_lock);
		ill_v6->ill_state_flags &= ~ILL_CHANGING;
		mutex_exit(&ill_v6->ill_lock);
		ill_v6->ill_up_ipifs = B_FALSE;
		if (ill_v6->ill_move_in_progress) {
			ASSERT(ill_v6->ill_move_peer != NULL);
			ill_v6->ill_move_in_progress = B_FALSE;
			from_ill = ill_v6->ill_move_peer;
			from_ill->ill_move_in_progress = B_FALSE;
			from_ill->ill_move_peer = NULL;
			mutex_enter(&from_ill->ill_lock);
			from_ill->ill_state_flags &= ~ILL_CHANGING;
			mutex_exit(&from_ill->ill_lock);
			if (from_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
				phyint_inactive(from_ill->ill_phyint);
			}
			if (ill_v6->ill_phyint->phyint_flags & PHYI_STANDBY) {
				phyint_inactive(ill_v6->ill_phyint);
			}
			ill_v6->ill_move_peer = NULL;
		}
	}
	return (0);
}

/*
 * Bring down all the appropriate ipifs.
14451 */ 14452 /* ARGSUSED */ 14453 static void 14454 ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) 14455 { 14456 ipif_t *ipif; 14457 14458 ASSERT(IAM_WRITER_ILL(ill)); 14459 14460 /* 14461 * Except for ipif_state_flags the other fields of the ipif/ill that 14462 * are modified below are protected implicitly since we are a writer 14463 */ 14464 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 14465 if (chk_nofailover && (ipif->ipif_flags & IPIF_NOFAILOVER)) 14466 continue; 14467 if (index == 0 || index == ipif->ipif_orig_ifindex) { 14468 /* 14469 * We go through the ipif_down logic even if the ipif 14470 * is already down, since routes can be added based 14471 * on down ipifs. Going through ipif_down once again 14472 * will delete any IREs created based on these routes. 14473 */ 14474 if (ipif->ipif_flags & IPIF_UP) 14475 ipif->ipif_was_up = B_TRUE; 14476 /* 14477 * If called with chk_nofailover true ipif is moving. 14478 */ 14479 mutex_enter(&ill->ill_lock); 14480 if (chk_nofailover) { 14481 ipif->ipif_state_flags |= 14482 IPIF_MOVING | IPIF_CHANGING; 14483 } else { 14484 ipif->ipif_state_flags |= IPIF_CHANGING; 14485 } 14486 mutex_exit(&ill->ill_lock); 14487 /* 14488 * Need to re-create net/subnet bcast ires if 14489 * they are dependent on ipif. 14490 */ 14491 if (!ipif->ipif_isv6) 14492 ipif_check_bcast_ires(ipif); 14493 (void) ipif_logical_down(ipif, NULL, NULL); 14494 ipif_non_duplicate(ipif); 14495 ipif_down_tail(ipif); 14496 } 14497 } 14498 } 14499 14500 #define IPSQ_INC_REF(ipsq, ipst) { \ 14501 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ 14502 (ipsq)->ipsq_refs++; \ 14503 } 14504 14505 #define IPSQ_DEC_REF(ipsq, ipst) { \ 14506 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); \ 14507 (ipsq)->ipsq_refs--; \ 14508 if ((ipsq)->ipsq_refs == 0) \ 14509 (ipsq)->ipsq_name[0] = '\0'; \ 14510 } 14511 14512 /* 14513 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14514 * new_ipsq. 14515 */ 14516 static void 14517 ill_merge_ipsq(ipsq_t *cur_ipsq, ipsq_t *new_ipsq, ip_stack_t *ipst) 14518 { 14519 phyint_t *phyint; 14520 phyint_t *next_phyint; 14521 14522 /* 14523 * To change the ipsq of an ill, we need to hold the ill_g_lock as 14524 * writer and the ill_lock of the ill in question. Also the dest 14525 * ipsq can't vanish while we hold the ill_g_lock as writer. 14526 */ 14527 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14528 14529 phyint = cur_ipsq->ipsq_phyint_list; 14530 cur_ipsq->ipsq_phyint_list = NULL; 14531 while (phyint != NULL) { 14532 next_phyint = phyint->phyint_ipsq_next; 14533 IPSQ_DEC_REF(cur_ipsq, ipst); 14534 phyint->phyint_ipsq_next = new_ipsq->ipsq_phyint_list; 14535 new_ipsq->ipsq_phyint_list = phyint; 14536 IPSQ_INC_REF(new_ipsq, ipst); 14537 phyint->phyint_ipsq = new_ipsq; 14538 phyint = next_phyint; 14539 } 14540 } 14541 14542 #define SPLIT_SUCCESS 0 14543 #define SPLIT_NOT_NEEDED 1 14544 #define SPLIT_FAILED 2 14545 14546 int 14547 ill_split_to_grp_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, boolean_t need_retry, 14548 ip_stack_t *ipst) 14549 { 14550 ipsq_t *newipsq = NULL; 14551 14552 /* 14553 * Assertions denote pre-requisites for changing the ipsq of 14554 * a phyint 14555 */ 14556 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14557 /* 14558 * <ill-phyint> assocs can't change while ill_g_lock 14559 * is held as writer. 
See ill_phyint_reinit() 14560 */ 14561 ASSERT(phyint->phyint_illv4 == NULL || 14562 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14563 ASSERT(phyint->phyint_illv6 == NULL || 14564 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14565 14566 if ((phyint->phyint_groupname_len != 14567 (strlen(cur_ipsq->ipsq_name) + 1) || 14568 bcmp(phyint->phyint_groupname, cur_ipsq->ipsq_name, 14569 phyint->phyint_groupname_len) != 0)) { 14570 /* 14571 * Once we fail in creating a new ipsq due to memory shortage, 14572 * don't attempt to create new ipsq again, based on another 14573 * phyint, since we want all phyints belonging to an IPMP group 14574 * to be in the same ipsq even in the event of mem alloc fails. 14575 */ 14576 newipsq = ip_ipsq_lookup(phyint->phyint_groupname, !need_retry, 14577 cur_ipsq, ipst); 14578 if (newipsq == NULL) { 14579 /* Memory allocation failure */ 14580 return (SPLIT_FAILED); 14581 } else { 14582 /* ipsq_refs protected by ill_g_lock (writer) */ 14583 IPSQ_DEC_REF(cur_ipsq, ipst); 14584 phyint->phyint_ipsq = newipsq; 14585 phyint->phyint_ipsq_next = newipsq->ipsq_phyint_list; 14586 newipsq->ipsq_phyint_list = phyint; 14587 IPSQ_INC_REF(newipsq, ipst); 14588 return (SPLIT_SUCCESS); 14589 } 14590 } 14591 return (SPLIT_NOT_NEEDED); 14592 } 14593 14594 /* 14595 * The ill locks of the phyint and the ill_g_lock (writer) must be held 14596 * to do this split 14597 */ 14598 static int 14599 ill_split_to_own_ipsq(phyint_t *phyint, ipsq_t *cur_ipsq, ip_stack_t *ipst) 14600 { 14601 ipsq_t *newipsq; 14602 14603 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 14604 /* 14605 * <ill-phyint> assocs can't change while ill_g_lock 14606 * is held as writer. See ill_phyint_reinit() 14607 */ 14608 14609 ASSERT(phyint->phyint_illv4 == NULL || 14610 MUTEX_HELD(&phyint->phyint_illv4->ill_lock)); 14611 ASSERT(phyint->phyint_illv6 == NULL || 14612 MUTEX_HELD(&phyint->phyint_illv6->ill_lock)); 14613 14614 if (!ipsq_init((phyint->phyint_illv4 != NULL) ? 14615 phyint->phyint_illv4: phyint->phyint_illv6)) { 14616 /* 14617 * ipsq_init failed due to no memory 14618 * caller will use the same ipsq 14619 */ 14620 return (SPLIT_FAILED); 14621 } 14622 14623 /* ipsq_ref is protected by ill_g_lock (writer) */ 14624 IPSQ_DEC_REF(cur_ipsq, ipst); 14625 14626 /* 14627 * This is a new ipsq that is unknown to the world. 14628 * So we don't need to hold ipsq_lock, 14629 */ 14630 newipsq = phyint->phyint_ipsq; 14631 newipsq->ipsq_writer = NULL; 14632 newipsq->ipsq_reentry_cnt--; 14633 ASSERT(newipsq->ipsq_reentry_cnt == 0); 14634 #ifdef DEBUG 14635 newipsq->ipsq_depth = 0; 14636 #endif 14637 14638 return (SPLIT_SUCCESS); 14639 } 14640 14641 /* 14642 * Change the ipsq of all the ill's whose current ipsq is 'cur_ipsq' to 14643 * ipsq's representing their individual groups or themselves. Return 14644 * whether split needs to be retried again later. 14645 */ 14646 static boolean_t 14647 ill_split_ipsq(ipsq_t *cur_ipsq) 14648 { 14649 phyint_t *phyint; 14650 phyint_t *next_phyint; 14651 int error; 14652 boolean_t need_retry = B_FALSE; 14653 ip_stack_t *ipst = cur_ipsq->ipsq_ipst; 14654 14655 phyint = cur_ipsq->ipsq_phyint_list; 14656 cur_ipsq->ipsq_phyint_list = NULL; 14657 while (phyint != NULL) { 14658 next_phyint = phyint->phyint_ipsq_next; 14659 /* 14660 * 'created' will tell us whether the callee actually 14661 * created an ipsq. Lack of memory may force the callee 14662 * to return without creating an ipsq. 
14663 */ 14664 if (phyint->phyint_groupname == NULL) { 14665 error = ill_split_to_own_ipsq(phyint, cur_ipsq, ipst); 14666 } else { 14667 error = ill_split_to_grp_ipsq(phyint, cur_ipsq, 14668 need_retry, ipst); 14669 } 14670 14671 switch (error) { 14672 case SPLIT_FAILED: 14673 need_retry = B_TRUE; 14674 /* FALLTHRU */ 14675 case SPLIT_NOT_NEEDED: 14676 /* 14677 * Keep it on the list. 14678 */ 14679 phyint->phyint_ipsq_next = cur_ipsq->ipsq_phyint_list; 14680 cur_ipsq->ipsq_phyint_list = phyint; 14681 break; 14682 case SPLIT_SUCCESS: 14683 break; 14684 default: 14685 ASSERT(0); 14686 } 14687 14688 phyint = next_phyint; 14689 } 14690 return (need_retry); 14691 } 14692 14693 /* 14694 * given an ipsq 'ipsq' lock all ills associated with this ipsq. 14695 * and return the ills in the list. This list will be 14696 * needed to unlock all the ills later on by the caller. 14697 * The <ill-ipsq> associations could change between the 14698 * lock and unlock. Hence the unlock can't traverse the 14699 * ipsq to get the list of ills. 14700 */ 14701 static int 14702 ill_lock_ipsq_ills(ipsq_t *ipsq, ill_t **list, int list_max) 14703 { 14704 int cnt = 0; 14705 phyint_t *phyint; 14706 ip_stack_t *ipst = ipsq->ipsq_ipst; 14707 14708 /* 14709 * The caller holds ill_g_lock to ensure that the ill memberships 14710 * of the ipsq don't change 14711 */ 14712 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 14713 14714 phyint = ipsq->ipsq_phyint_list; 14715 while (phyint != NULL) { 14716 if (phyint->phyint_illv4 != NULL) { 14717 ASSERT(cnt < list_max); 14718 list[cnt++] = phyint->phyint_illv4; 14719 } 14720 if (phyint->phyint_illv6 != NULL) { 14721 ASSERT(cnt < list_max); 14722 list[cnt++] = phyint->phyint_illv6; 14723 } 14724 phyint = phyint->phyint_ipsq_next; 14725 } 14726 ill_lock_ills(list, cnt); 14727 return (cnt); 14728 } 14729 14730 void 14731 ill_lock_ills(ill_t **list, int cnt) 14732 { 14733 int i; 14734 14735 if (cnt > 1) { 14736 boolean_t try_again; 14737 do { 14738 try_again = B_FALSE; 14739 for (i = 0; i < cnt - 1; i++) { 14740 if (list[i] < list[i + 1]) { 14741 ill_t *tmp; 14742 14743 /* swap the elements */ 14744 tmp = list[i]; 14745 list[i] = list[i + 1]; 14746 list[i + 1] = tmp; 14747 try_again = B_TRUE; 14748 } 14749 } 14750 } while (try_again); 14751 } 14752 14753 for (i = 0; i < cnt; i++) { 14754 if (i == 0) { 14755 if (list[i] != NULL) 14756 mutex_enter(&list[i]->ill_lock); 14757 else 14758 return; 14759 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14760 mutex_enter(&list[i]->ill_lock); 14761 } 14762 } 14763 } 14764 14765 void 14766 ill_unlock_ills(ill_t **list, int cnt) 14767 { 14768 int i; 14769 14770 for (i = 0; i < cnt; i++) { 14771 if ((i == 0) && (list[i] != NULL)) { 14772 mutex_exit(&list[i]->ill_lock); 14773 } else if ((list[i-1] != list[i]) && (list[i] != NULL)) { 14774 mutex_exit(&list[i]->ill_lock); 14775 } 14776 } 14777 } 14778 14779 /* 14780 * Merge all the ills from 1 ipsq group into another ipsq group. 14781 * The source ipsq group is specified by the ipsq associated with 14782 * 'from_ill'. The destination ipsq group is specified by the ipsq 14783 * associated with 'to_ill' or 'groupname' respectively. 14784 * Note that ipsq itself does not have a reference count mechanism 14785 * and functions don't look up an ipsq and pass it around. Instead 14786 * functions pass around an ill or groupname, and the ipsq is looked 14787 * up from the ill or groupname and the required operation performed 14788 * atomically with the lookup on the ipsq. 
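/*
 * Illustrative aside, not part of the original source: ill_lock_ills()
 * above avoids deadlock by imposing a total order (the objects' addresses,
 * descending) before acquiring multiple locks, and by skipping duplicate
 * entries. A POSIX-threads sketch of that discipline; the sketch_* names
 * are hypothetical.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

struct sketch_obj {
	pthread_mutex_t	so_lock;
};

static int
sketch_cmp_desc(const void *a, const void *b)
{
	uintptr_t x = (uintptr_t)*(struct sketch_obj * const *)a;
	uintptr_t y = (uintptr_t)*(struct sketch_obj * const *)b;

	return ((x > y) ? -1 : (x < y) ? 1 : 0);
}

void
sketch_lock_all(struct sketch_obj **list, int cnt)
{
	int i;

	/* Sort so every caller acquires the locks in the same order. */
	qsort(list, cnt, sizeof (struct sketch_obj *), sketch_cmp_desc);
	for (i = 0; i < cnt; i++) {
		if (i > 0 && list[i] == list[i - 1])
			continue;	/* same object listed twice */
		pthread_mutex_lock(&list[i]->so_lock);
	}
}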
14789 */ 14790 static int 14791 ill_merge_groups(ill_t *from_ill, ill_t *to_ill, char *groupname, mblk_t *mp, 14792 queue_t *q) 14793 { 14794 ipsq_t *old_ipsq; 14795 ipsq_t *new_ipsq; 14796 ill_t **ill_list; 14797 int cnt; 14798 size_t ill_list_size; 14799 boolean_t became_writer_on_new_sq = B_FALSE; 14800 ip_stack_t *ipst = from_ill->ill_ipst; 14801 14802 ASSERT(to_ill == NULL || ipst == to_ill->ill_ipst); 14803 /* Exactly 1 of 'to_ill' and groupname can be specified. */ 14804 ASSERT((to_ill != NULL) ^ (groupname != NULL)); 14805 14806 /* 14807 * Need to hold ill_g_lock as writer and also the ill_lock to 14808 * change the <ill-ipsq> assoc of an ill. Need to hold the 14809 * ipsq_lock to prevent new messages from landing on an ipsq. 14810 */ 14811 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14812 14813 old_ipsq = from_ill->ill_phyint->phyint_ipsq; 14814 if (groupname != NULL) 14815 new_ipsq = ip_ipsq_lookup(groupname, B_TRUE, NULL, ipst); 14816 else { 14817 new_ipsq = to_ill->ill_phyint->phyint_ipsq; 14818 } 14819 14820 ASSERT(old_ipsq != NULL && new_ipsq != NULL); 14821 14822 /* 14823 * both groups are on the same ipsq. 14824 */ 14825 if (old_ipsq == new_ipsq) { 14826 rw_exit(&ipst->ips_ill_g_lock); 14827 return (0); 14828 } 14829 14830 cnt = old_ipsq->ipsq_refs << 1; 14831 ill_list_size = cnt * sizeof (ill_t *); 14832 ill_list = kmem_zalloc(ill_list_size, KM_NOSLEEP); 14833 if (ill_list == NULL) { 14834 rw_exit(&ipst->ips_ill_g_lock); 14835 return (ENOMEM); 14836 } 14837 cnt = ill_lock_ipsq_ills(old_ipsq, ill_list, cnt); 14838 14839 /* Need ipsq lock to enque messages on new ipsq or to become writer */ 14840 mutex_enter(&new_ipsq->ipsq_lock); 14841 if ((new_ipsq->ipsq_writer == NULL && 14842 new_ipsq->ipsq_current_ipif == NULL) || 14843 (new_ipsq->ipsq_writer == curthread)) { 14844 new_ipsq->ipsq_writer = curthread; 14845 new_ipsq->ipsq_reentry_cnt++; 14846 became_writer_on_new_sq = B_TRUE; 14847 } 14848 14849 /* 14850 * We are holding ill_g_lock as writer and all the ill locks of 14851 * the old ipsq. So the old_ipsq can't be looked up, and hence no new 14852 * message can land up on the old ipsq even though we don't hold the 14853 * ipsq_lock of the old_ipsq. Now move all messages to the newipsq. 14854 */ 14855 ill_move_to_new_ipsq(old_ipsq, new_ipsq, mp, q); 14856 14857 /* 14858 * now change the ipsq of all ills in the 'old_ipsq' to 'new_ipsq'. 14859 * 'new_ipsq' has been looked up, and it can't change its <ill-ipsq> 14860 * assocs. till we release the ill_g_lock, and hence it can't vanish. 14861 */ 14862 ill_merge_ipsq(old_ipsq, new_ipsq, ipst); 14863 14864 /* 14865 * Mark the new ipsq as needing a split since it is currently 14866 * being shared by more than 1 IPMP group. The split will 14867 * occur at the end of ipsq_exit 14868 */ 14869 new_ipsq->ipsq_split = B_TRUE; 14870 14871 /* Now release all the locks */ 14872 mutex_exit(&new_ipsq->ipsq_lock); 14873 ill_unlock_ills(ill_list, cnt); 14874 rw_exit(&ipst->ips_ill_g_lock); 14875 14876 kmem_free(ill_list, ill_list_size); 14877 14878 /* 14879 * If we succeeded in becoming writer on the new ipsq, then 14880 * drain the new ipsq and start processing all enqueued messages 14881 * including the current ioctl we are processing which is either 14882 * a set groupname or failover/failback. 14883 */ 14884 if (became_writer_on_new_sq) 14885 ipsq_exit(new_ipsq, B_TRUE, B_TRUE); 14886 14887 /* 14888 * syncq has been changed and all the messages have been moved. 
14889 */ 14890 mutex_enter(&old_ipsq->ipsq_lock); 14891 old_ipsq->ipsq_current_ipif = NULL; 14892 old_ipsq->ipsq_current_ioctl = 0; 14893 mutex_exit(&old_ipsq->ipsq_lock); 14894 return (EINPROGRESS); 14895 } 14896 14897 /* 14898 * Delete and add the loopback copy and non-loopback copy of 14899 * the BROADCAST ire corresponding to ill and addr. Used to 14900 * group broadcast ires together when ill becomes part of 14901 * a group. 14902 * 14903 * This function is also called when ill is leaving the group 14904 * so that the ires belonging to the group get re-grouped. 14905 */ 14906 static void 14907 ill_bcast_delete_and_add(ill_t *ill, ipaddr_t addr) 14908 { 14909 ire_t *ire, *nire, *nire_next, *ire_head = NULL; 14910 ire_t **ire_ptpn = &ire_head; 14911 ip_stack_t *ipst = ill->ill_ipst; 14912 14913 /* 14914 * The loopback and non-loopback IREs are inserted in the order in which 14915 * they're found, on the basis that they are correctly ordered (loopback 14916 * first). 14917 */ 14918 for (;;) { 14919 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 14920 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 14921 if (ire == NULL) 14922 break; 14923 14924 /* 14925 * We pass in KM_SLEEP because it is not easy to 14926 * go back to a sane state in case of memory failure. 14927 */ 14928 nire = kmem_cache_alloc(ire_cache, KM_SLEEP); 14929 ASSERT(nire != NULL); 14930 bzero(nire, sizeof (ire_t)); 14931 /* 14932 * Don't use ire_max_frag directly since we don't 14933 * hold on to 'ire' until we add the new ire 'nire' and 14934 * we don't want the new ire to have a dangling reference 14935 * to 'ire'. The ire_max_frag of a broadcast ire must 14936 * be in sync with the ipif_mtu of the associated ipif. 14937 * This happens, for example, as a result of SIOCSLIFNAME, 14938 * SIOCSLIFLNKINFO or a DL_NOTE_SDU_SIZE initiated by 14939 * the driver. A change in ire_max_frag triggered as 14940 * a result of path MTU discovery, due to an 14941 * IP_IOC_IRE_ADVISE_NOREPLY from the transport, or due to a 14942 * 'route change -mtu' command does not apply to broadcast ires. 14943 * 14944 * XXX We need a recovery strategy here if ire_init fails 14945 */ 14946 if (ire_init(nire, 14947 (uchar_t *)&ire->ire_addr, 14948 (uchar_t *)&ire->ire_mask, 14949 (uchar_t *)&ire->ire_src_addr, 14950 (uchar_t *)&ire->ire_gateway_addr, 14951 ire->ire_stq == NULL ? &ip_loopback_mtu : 14952 &ire->ire_ipif->ipif_mtu, 14953 ire->ire_nce, 14954 ire->ire_rfq, 14955 ire->ire_stq, 14956 ire->ire_type, 14957 ire->ire_ipif, 14958 ire->ire_cmask, 14959 ire->ire_phandle, 14960 ire->ire_ihandle, 14961 ire->ire_flags, 14962 &ire->ire_uinfo, 14963 NULL, 14964 NULL, 14965 ipst) == NULL) { 14966 cmn_err(CE_PANIC, "ire_init() failed"); 14967 } 14968 ire_delete(ire); 14969 ire_refrele(ire); 14970 14971 /* 14972 * The newly created IREs are inserted at the tail of the list 14973 * starting with ire_head. As we've just allocated them no one 14974 * knows about them so it's safe.
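 * ire_ptpn always points at the ire_next field of the last entry
 * (or at ire_head while the list is empty), so each append below
 * is a constant-time pointer store.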
14975 */ 14976 *ire_ptpn = nire; 14977 ire_ptpn = &nire->ire_next; 14978 } 14979 14980 for (nire = ire_head; nire != NULL; nire = nire_next) { 14981 int error; 14982 ire_t *oire; 14983 /* unlink the IRE from our list before calling ire_add() */ 14984 nire_next = nire->ire_next; 14985 nire->ire_next = NULL; 14986 14987 /* ire_add adds the ire at the right place in the list */ 14988 oire = nire; 14989 error = ire_add(&nire, NULL, NULL, NULL, B_FALSE); 14990 ASSERT(error == 0); 14991 ASSERT(oire == nire); 14992 ire_refrele(nire); /* Held in ire_add */ 14993 } 14994 } 14995 14996 /* 14997 * This function is usually called when an ill is inserted in 14998 * a group and all the ipifs are already UP. As all the ipifs 14999 * are already UP, the broadcast ires have already been created 15000 * and been inserted. But, ire_add_v4 would not have grouped properly. 15001 * We need to re-group for the benefit of ip_wput_ire which 15002 * expects BROADCAST ires to be grouped properly to avoid sending 15003 * more than one copy of the broadcast packet per group. 15004 * 15005 * NOTE : We don't check for ill_ipif_up_count to be non-zero here 15006 * because when ipif_up_done ends up calling this, ires have 15007 * already been added before illgrp_insert i.e before ill_group 15008 * has been initialized. 15009 */ 15010 static void 15011 ill_group_bcast_for_xmit(ill_t *ill) 15012 { 15013 ill_group_t *illgrp; 15014 ipif_t *ipif; 15015 ipaddr_t addr; 15016 ipaddr_t net_mask; 15017 ipaddr_t subnet_netmask; 15018 15019 illgrp = ill->ill_group; 15020 15021 /* 15022 * This function is called even when an ill is deleted from 15023 * the group. Hence, illgrp could be null. 15024 */ 15025 if (illgrp != NULL && illgrp->illgrp_ill_count == 1) 15026 return; 15027 15028 /* 15029 * Delete all the BROADCAST ires matching this ill and add 15030 * them back. This time, ire_add_v4 should take care of 15031 * grouping them with others because ill is part of the 15032 * group. 15033 */ 15034 ill_bcast_delete_and_add(ill, 0); 15035 ill_bcast_delete_and_add(ill, INADDR_BROADCAST); 15036 15037 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15038 15039 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15040 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15041 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15042 } else { 15043 net_mask = htonl(IN_CLASSA_NET); 15044 } 15045 addr = net_mask & ipif->ipif_subnet; 15046 ill_bcast_delete_and_add(ill, addr); 15047 ill_bcast_delete_and_add(ill, ~net_mask | addr); 15048 15049 subnet_netmask = ipif->ipif_net_mask; 15050 addr = ipif->ipif_subnet; 15051 ill_bcast_delete_and_add(ill, addr); 15052 ill_bcast_delete_and_add(ill, ~subnet_netmask | addr); 15053 } 15054 } 15055 15056 /* 15057 * This function is called from illgrp_delete when ill is being deleted 15058 * from the group. 15059 * 15060 * As ill is not there in the group anymore, any address belonging 15061 * to this ill should be cleared of IRE_MARK_NORECV. 15062 */ 15063 static void 15064 ill_clear_bcast_mark(ill_t *ill, ipaddr_t addr) 15065 { 15066 ire_t *ire; 15067 irb_t *irb; 15068 ip_stack_t *ipst = ill->ill_ipst; 15069 15070 ASSERT(ill->ill_group == NULL); 15071 15072 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ill->ill_ipif, 15073 ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 15074 15075 if (ire != NULL) { 15076 /* 15077 * IPMP and plumbing operations are serialized on the ipsq, so 15078 * no one will insert or delete a broadcast ire under our feet. 
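 * We nevertheless take irb_lock as reader below, consistent with
 * the other walkers of this bucket.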
15079 */ 15080 irb = ire->ire_bucket; 15081 rw_enter(&irb->irb_lock, RW_READER); 15082 ire_refrele(ire); 15083 15084 for (; ire != NULL; ire = ire->ire_next) { 15085 if (ire->ire_addr != addr) 15086 break; 15087 if (ire_to_ill(ire) != ill) 15088 continue; 15089 15090 ASSERT(!(ire->ire_marks & IRE_MARK_CONDEMNED)); 15091 ire->ire_marks &= ~IRE_MARK_NORECV; 15092 } 15093 rw_exit(&irb->irb_lock); 15094 } 15095 } 15096 15097 /* 15098 * This function must be called only after the broadcast ires 15099 * have been grouped together. For a given address addr, nominate 15100 * only one of the ires whose interface is not FAILED or OFFLINE. 15101 * 15102 * This is also called when an ipif goes down, so that we can nominate 15103 * a different ire with the same address for receiving. 15104 */ 15105 static void 15106 ill_mark_bcast(ill_group_t *illgrp, ipaddr_t addr, ip_stack_t *ipst) 15107 { 15108 irb_t *irb; 15109 ire_t *ire; 15110 ire_t *ire1; 15111 ire_t *save_ire; 15112 ire_t **irep = NULL; 15113 boolean_t first = B_TRUE; 15114 ire_t *clear_ire = NULL; 15115 ire_t *start_ire = NULL; 15116 ire_t *new_lb_ire; 15117 ire_t *new_nlb_ire; 15118 boolean_t new_lb_ire_used = B_FALSE; 15119 boolean_t new_nlb_ire_used = B_FALSE; 15120 uint64_t match_flags; 15121 uint64_t phyi_flags; 15122 boolean_t fallback = B_FALSE; 15123 uint_t max_frag; 15124 15125 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, NULL, ALL_ZONES, 15126 NULL, MATCH_IRE_TYPE, ipst); 15127 /* 15128 * We may not be able to find some ires if a previous 15129 * ire_create failed. This happens when an ipif goes 15130 * down and we are unable to create BROADCAST ires due 15131 * to memory failure. Thus, we have to check for NULL 15132 * below. This should handle the case for LOOPBACK, 15133 * POINTOPOINT and interfaces with some POINTOPOINT 15134 * logicals for which there are no BROADCAST ires. 15135 */ 15136 if (ire == NULL) 15137 return; 15138 /* 15139 * Currently IRE_BROADCASTS are deleted when an ipif 15140 * goes down which runs exclusively. Thus, setting 15141 * IRE_MARK_RCVD should not race with ire_delete marking 15142 * IRE_MARK_CONDEMNED. We grab the lock below just to 15143 * be consistent with other parts of the code that walks 15144 * a given bucket. 15145 */ 15146 save_ire = ire; 15147 irb = ire->ire_bucket; 15148 new_lb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 15149 if (new_lb_ire == NULL) { 15150 ire_refrele(ire); 15151 return; 15152 } 15153 new_nlb_ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 15154 if (new_nlb_ire == NULL) { 15155 ire_refrele(ire); 15156 kmem_cache_free(ire_cache, new_lb_ire); 15157 return; 15158 } 15159 IRB_REFHOLD(irb); 15160 rw_enter(&irb->irb_lock, RW_WRITER); 15161 /* 15162 * Get to the first ire matching the address and the 15163 * group. If the address does not match we are done 15164 * as we could not find the IRE. If the address matches 15165 * we should get to the first one matching the group. 15166 */ 15167 while (ire != NULL) { 15168 if (ire->ire_addr != addr || 15169 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 15170 break; 15171 } 15172 ire = ire->ire_next; 15173 } 15174 match_flags = PHYI_FAILED | PHYI_INACTIVE; 15175 start_ire = ire; 15176 redo: 15177 while (ire != NULL && ire->ire_addr == addr && 15178 ire->ire_ipif->ipif_ill->ill_group == illgrp) { 15179 /* 15180 * The first ire for any address within a group 15181 * should always be the one with IRE_MARK_NORECV cleared 15182 * so that ip_wput_ire can avoid searching for one. 
15183 * Note down the insertion point which will be used 15184 * later. 15185 */ 15186 if (first && (irep == NULL)) 15187 irep = ire->ire_ptpn; 15188 /* 15189 * PHYI_FAILED is set when the interface fails. 15190 * This interface might have become good, but the 15191 * daemon has not yet detected. We should still 15192 * not receive on this. PHYI_OFFLINE should never 15193 * be picked as this has been offlined and soon 15194 * be removed. 15195 */ 15196 phyi_flags = ire->ire_ipif->ipif_ill->ill_phyint->phyint_flags; 15197 if (phyi_flags & PHYI_OFFLINE) { 15198 ire->ire_marks |= IRE_MARK_NORECV; 15199 ire = ire->ire_next; 15200 continue; 15201 } 15202 if (phyi_flags & match_flags) { 15203 ire->ire_marks |= IRE_MARK_NORECV; 15204 ire = ire->ire_next; 15205 if ((phyi_flags & (PHYI_FAILED | PHYI_INACTIVE)) == 15206 PHYI_INACTIVE) { 15207 fallback = B_TRUE; 15208 } 15209 continue; 15210 } 15211 if (first) { 15212 /* 15213 * We will move this to the front of the list later 15214 * on. 15215 */ 15216 clear_ire = ire; 15217 ire->ire_marks &= ~IRE_MARK_NORECV; 15218 } else { 15219 ire->ire_marks |= IRE_MARK_NORECV; 15220 } 15221 first = B_FALSE; 15222 ire = ire->ire_next; 15223 } 15224 /* 15225 * If we never nominated anybody, try nominating at least 15226 * an INACTIVE, if we found one. Do it only once though. 15227 */ 15228 if (first && (match_flags == (PHYI_FAILED | PHYI_INACTIVE)) && 15229 fallback) { 15230 match_flags = PHYI_FAILED; 15231 ire = start_ire; 15232 irep = NULL; 15233 goto redo; 15234 } 15235 ire_refrele(save_ire); 15236 15237 /* 15238 * irep non-NULL indicates that we entered the while loop 15239 * above. If clear_ire is at the insertion point, we don't 15240 * have to do anything. clear_ire will be NULL if all the 15241 * interfaces are failed. 15242 * 15243 * We cannot unlink and reinsert the ire at the right place 15244 * in the list since there can be other walkers of this bucket. 15245 * Instead we delete and recreate the ire 15246 */ 15247 if (clear_ire != NULL && irep != NULL && *irep != clear_ire) { 15248 ire_t *clear_ire_stq = NULL; 15249 15250 bzero(new_lb_ire, sizeof (ire_t)); 15251 /* XXX We need a recovery strategy here. */ 15252 if (ire_init(new_lb_ire, 15253 (uchar_t *)&clear_ire->ire_addr, 15254 (uchar_t *)&clear_ire->ire_mask, 15255 (uchar_t *)&clear_ire->ire_src_addr, 15256 (uchar_t *)&clear_ire->ire_gateway_addr, 15257 &clear_ire->ire_max_frag, 15258 NULL, /* let ire_nce_init derive the resolver info */ 15259 clear_ire->ire_rfq, 15260 clear_ire->ire_stq, 15261 clear_ire->ire_type, 15262 clear_ire->ire_ipif, 15263 clear_ire->ire_cmask, 15264 clear_ire->ire_phandle, 15265 clear_ire->ire_ihandle, 15266 clear_ire->ire_flags, 15267 &clear_ire->ire_uinfo, 15268 NULL, 15269 NULL, 15270 ipst) == NULL) 15271 cmn_err(CE_PANIC, "ire_init() failed"); 15272 if (clear_ire->ire_stq == NULL) { 15273 ire_t *ire_next = clear_ire->ire_next; 15274 if (ire_next != NULL && 15275 ire_next->ire_stq != NULL && 15276 ire_next->ire_addr == clear_ire->ire_addr && 15277 ire_next->ire_ipif->ipif_ill == 15278 clear_ire->ire_ipif->ipif_ill) { 15279 clear_ire_stq = ire_next; 15280 15281 bzero(new_nlb_ire, sizeof (ire_t)); 15282 /* XXX We need a recovery strategy here. 
*/ 15283 if (ire_init(new_nlb_ire, 15284 (uchar_t *)&clear_ire_stq->ire_addr, 15285 (uchar_t *)&clear_ire_stq->ire_mask, 15286 (uchar_t *)&clear_ire_stq->ire_src_addr, 15287 (uchar_t *)&clear_ire_stq->ire_gateway_addr, 15288 &clear_ire_stq->ire_max_frag, 15289 NULL, 15290 clear_ire_stq->ire_rfq, 15291 clear_ire_stq->ire_stq, 15292 clear_ire_stq->ire_type, 15293 clear_ire_stq->ire_ipif, 15294 clear_ire_stq->ire_cmask, 15295 clear_ire_stq->ire_phandle, 15296 clear_ire_stq->ire_ihandle, 15297 clear_ire_stq->ire_flags, 15298 &clear_ire_stq->ire_uinfo, 15299 NULL, 15300 NULL, 15301 ipst) == NULL) 15302 cmn_err(CE_PANIC, "ire_init() failed"); 15303 } 15304 } 15305 15306 /* 15307 * Delete the ire. We can't call ire_delete() since 15308 * we are holding the bucket lock. We can't release the 15309 * bucket lock since we can't allow irep to change. So just 15310 * mark it CONDEMNED. The IRB_REFRELE will delete the 15311 * ire from the list and do the refrele. 15312 */ 15313 clear_ire->ire_marks |= IRE_MARK_CONDEMNED; 15314 irb->irb_marks |= IRB_MARK_CONDEMNED; 15315 15316 if (clear_ire_stq != NULL && clear_ire_stq->ire_nce != NULL) { 15317 nce_fastpath_list_delete(clear_ire_stq->ire_nce); 15318 clear_ire_stq->ire_marks |= IRE_MARK_CONDEMNED; 15319 } 15320 15321 /* 15322 * Also take care of otherfields like ib/ob pkt count 15323 * etc. Need to dup them. ditto in ill_bcast_delete_and_add 15324 */ 15325 15326 /* Set the max_frag before adding the ire */ 15327 max_frag = *new_lb_ire->ire_max_fragp; 15328 new_lb_ire->ire_max_fragp = NULL; 15329 new_lb_ire->ire_max_frag = max_frag; 15330 15331 /* Add the new ire's. Insert at *irep */ 15332 new_lb_ire->ire_bucket = clear_ire->ire_bucket; 15333 ire1 = *irep; 15334 if (ire1 != NULL) 15335 ire1->ire_ptpn = &new_lb_ire->ire_next; 15336 new_lb_ire->ire_next = ire1; 15337 /* Link the new one in. */ 15338 new_lb_ire->ire_ptpn = irep; 15339 membar_producer(); 15340 *irep = new_lb_ire; 15341 new_lb_ire_used = B_TRUE; 15342 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); 15343 new_lb_ire->ire_bucket->irb_ire_cnt++; 15344 new_lb_ire->ire_ipif->ipif_ire_cnt++; 15345 15346 if (clear_ire_stq != NULL) { 15347 /* Set the max_frag before adding the ire */ 15348 max_frag = *new_nlb_ire->ire_max_fragp; 15349 new_nlb_ire->ire_max_fragp = NULL; 15350 new_nlb_ire->ire_max_frag = max_frag; 15351 15352 new_nlb_ire->ire_bucket = clear_ire->ire_bucket; 15353 irep = &new_lb_ire->ire_next; 15354 /* Add the new ire. Insert at *irep */ 15355 ire1 = *irep; 15356 if (ire1 != NULL) 15357 ire1->ire_ptpn = &new_nlb_ire->ire_next; 15358 new_nlb_ire->ire_next = ire1; 15359 /* Link the new one in. */ 15360 new_nlb_ire->ire_ptpn = irep; 15361 membar_producer(); 15362 *irep = new_nlb_ire; 15363 new_nlb_ire_used = B_TRUE; 15364 BUMP_IRE_STATS(ipst->ips_ire_stats_v4, 15365 ire_stats_inserted); 15366 new_nlb_ire->ire_bucket->irb_ire_cnt++; 15367 new_nlb_ire->ire_ipif->ipif_ire_cnt++; 15368 ((ill_t *)new_nlb_ire->ire_stq->q_ptr)->ill_ire_cnt++; 15369 } 15370 } 15371 rw_exit(&irb->irb_lock); 15372 if (!new_lb_ire_used) 15373 kmem_cache_free(ire_cache, new_lb_ire); 15374 if (!new_nlb_ire_used) 15375 kmem_cache_free(ire_cache, new_nlb_ire); 15376 IRB_REFRELE(irb); 15377 } 15378 15379 /* 15380 * Whenever an ipif goes down we have to renominate a different 15381 * broadcast ire to receive. Whenever an ipif comes up, we need 15382 * to make sure that we have only one nominated to receive. 
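 *
 * As a worked example (hypothetical addresses): for an ipif with
 * address 10.1.2.3 and netmask 255.255.255.0, the function below
 * renominates the ires for 0.0.0.0, 255.255.255.255, the classful
 * net and its broadcast (10.0.0.0 and 10.255.255.255), and the
 * subnet and its broadcast (10.1.2.0 and 10.1.2.255).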
15383 */ 15384 static void 15385 ipif_renominate_bcast(ipif_t *ipif) 15386 { 15387 ill_t *ill = ipif->ipif_ill; 15388 ipaddr_t subnet_addr; 15389 ipaddr_t net_addr; 15390 ipaddr_t net_mask = 0; 15391 ipaddr_t subnet_netmask; 15392 ipaddr_t addr; 15393 ill_group_t *illgrp; 15394 ip_stack_t *ipst = ill->ill_ipst; 15395 15396 illgrp = ill->ill_group; 15397 /* 15398 * If this is the last ipif going down, it might take 15399 * the ill out of the group. In that case ipif_down -> 15400 * illgrp_delete takes care of doing the nomination. 15401 * ipif_down does not call this function in that case. 15402 */ 15403 ASSERT(illgrp != NULL); 15404 15405 /* There could not have been any ires associated with this ipif */ 15406 if (ipif->ipif_subnet == 0) 15407 return; 15408 15409 ill_mark_bcast(illgrp, 0, ipst); 15410 ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); 15411 15412 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15413 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15414 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15415 } else { 15416 net_mask = htonl(IN_CLASSA_NET); 15417 } 15418 addr = net_mask & ipif->ipif_subnet; 15419 ill_mark_bcast(illgrp, addr, ipst); 15420 15421 net_addr = ~net_mask | addr; 15422 ill_mark_bcast(illgrp, net_addr, ipst); 15423 15424 subnet_netmask = ipif->ipif_net_mask; 15425 addr = ipif->ipif_subnet; 15426 ill_mark_bcast(illgrp, addr, ipst); 15427 15428 subnet_addr = ~subnet_netmask | addr; 15429 ill_mark_bcast(illgrp, subnet_addr, ipst); 15430 } 15431 15432 /* 15433 * Whenever we form or delete ill groups, we need to nominate one set of 15434 * BROADCAST ires for receiving in the group. 15435 * 15436 * 1) When ipif_up_done -> illgrp_insert calls this function, BROADCAST ires 15437 * have been added, but ill_ipif_up_count is 0. Thus, we don't assert 15438 * for ill_ipif_up_count to be non-zero. This is the only case where 15439 * ill_ipif_up_count is zero and we would still find the ires. 15440 * 15441 * 2) When ip_sioctl_groupname/illgrp_insert calls this function, at least one 15442 * ipif is UP and we just have to do the nomination. 15443 * 15444 * 3) When ill_handoff_responsibility calls us, some ill has been removed 15445 * from the group. So, we have to do the nomination. 15446 * 15447 * Because of (3), there could be just one ill in the group. But we have 15448 * to nominate still, as IRE_MARK_NORECV may have been set on this ill. 15449 * Thus, this function does not optimize when there is only one ill as 15450 * it is not correct for (3). 15451 */ 15452 static void 15453 ill_nominate_bcast_rcv(ill_group_t *illgrp) 15454 { 15455 ill_t *ill; 15456 ipif_t *ipif; 15457 ipaddr_t subnet_addr; 15458 ipaddr_t prev_subnet_addr = 0; 15459 ipaddr_t net_addr; 15460 ipaddr_t prev_net_addr = 0; 15461 ipaddr_t net_mask = 0; 15462 ipaddr_t subnet_netmask; 15463 ipaddr_t addr; 15464 ip_stack_t *ipst; 15465 15466 /* 15467 * When the last member is leaving, there is nothing to 15468 * nominate. 15469 */ 15470 if (illgrp->illgrp_ill_count == 0) { 15471 ASSERT(illgrp->illgrp_ill == NULL); 15472 return; 15473 } 15474 15475 ill = illgrp->illgrp_ill; 15476 ASSERT(!ill->ill_isv6); 15477 ipst = ill->ill_ipst; 15478 /* 15479 * We assume that ires with the same address, belonging to the 15480 * same group, have been grouped together.
Nominating a *single* 15481 * ill in the group for sending and receiving broadcast is done 15482 * by making sure that the first BROADCAST ire (which will be 15483 * the one returned by ire_ctable_lookup for ip_rput and the 15484 * one that will be used in ip_wput_ire) will be the one that 15485 * will not have IRE_MARK_NORECV set. 15486 * 15487 * 1) ip_rput checks and discards packets received on ires marked 15488 * with IRE_MARK_NORECV. Thus, we don't send up duplicate 15489 * broadcast packets. We need to clear IRE_MARK_NORECV on the 15490 * first ire in the group for every broadcast address in the group. 15491 * ip_rput will accept packets only on the first ire, i.e., only 15492 * one copy of each broadcast packet. 15493 * 15494 * 2) ip_wput_ire needs to send out just one copy of the broadcast 15495 * packet for the whole group. It needs to send out on the ill 15496 * whose ire has not been marked with IRE_MARK_NORECV. If it sends 15497 * on the one marked with IRE_MARK_NORECV, ip_rput will accept 15498 * the copy echoed back on another port where the ire is not marked 15499 * with IRE_MARK_NORECV. 15500 * 15501 * Note that we just need to have the first IRE either loopback or 15502 * non-loopback (either of them may not exist if ire_create failed 15503 * during ipif_down) with IRE_MARK_NORECV not set. ip_rput will 15504 * always hit the first one and hence will always accept one copy. 15505 * 15506 * We have a broadcast ire per ill for all the unique prefixes 15507 * hosted on that ill. As we don't have a way of knowing the 15508 * unique prefixes on a given ill and hence in the whole group, 15509 * we just call ill_mark_bcast on all the prefixes that exist 15510 * in the group. For the common case of one prefix, the code 15511 * below optimizes by remembering the last address used for 15512 * marking. In the case of multiple prefixes, this will still 15513 * optimize depending on the order of the prefixes. 15514 * 15515 * The only addresses unique across the whole group are 0.0.0.0 and 15516 * 255.255.255.255, and thus we make those calls only once. ill_mark_bcast enables 15517 * the first ire in the bucket for receiving and disables the 15518 * others. 15519 */ 15520 ill_mark_bcast(illgrp, 0, ipst); 15521 ill_mark_bcast(illgrp, INADDR_BROADCAST, ipst); 15522 for (; ill != NULL; ill = ill->ill_group_next) { 15523 15524 for (ipif = ill->ill_ipif; ipif != NULL; 15525 ipif = ipif->ipif_next) { 15526 15527 if (!(ipif->ipif_flags & IPIF_UP) || 15528 ipif->ipif_subnet == 0) { 15529 continue; 15530 } 15531 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15532 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15533 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15534 } else { 15535 net_mask = htonl(IN_CLASSA_NET); 15536 } 15537 addr = net_mask & ipif->ipif_subnet; 15538 if (prev_net_addr == 0 || prev_net_addr != addr) { 15539 ill_mark_bcast(illgrp, addr, ipst); 15540 net_addr = ~net_mask | addr; 15541 ill_mark_bcast(illgrp, net_addr, ipst); 15542 } 15543 prev_net_addr = addr; 15544 15545 subnet_netmask = ipif->ipif_net_mask; 15546 addr = ipif->ipif_subnet; 15547 if (prev_subnet_addr == 0 || 15548 prev_subnet_addr != addr) { 15549 ill_mark_bcast(illgrp, addr, ipst); 15550 subnet_addr = ~subnet_netmask | addr; 15551 ill_mark_bcast(illgrp, subnet_addr, ipst); 15552 } 15553 prev_subnet_addr = addr; 15554 } 15555 } 15556 } 15557 15558 /* 15559 * This function is called while forming ill groups. 15560 * 15561 * Currently, we handle only allmulti groups. We want to join 15562 * allmulti on only one of the ills in the group.
In the future, 15563 * when we have link aggregation, we may have to join normal 15564 * multicast groups on multiple ills, as the switch does inbound load 15565 * balancing. The following are the functions that call this 15566 * function: 15567 * 15568 * 1) ill_recover_multicast : Interface is coming back UP. 15569 * When the first ipif comes back UP, ipif_up_done/ipif_up_done_v6 15570 * will call ill_recover_multicast to recover all the multicast 15571 * groups. We need to make sure that only one member is joined 15572 * in the ill group. 15573 * 15574 * 2) ip_addmulti/ip_addmulti_v6 : ill groups have already been formed. 15575 * Somebody is joining allmulti. We need to make sure that only one 15576 * member is joined in the group. 15577 * 15578 * 3) illgrp_insert : If allmulti has already been joined, we need to make 15579 * sure that only one member is joined in the group. 15580 * 15581 * 4) ip_delmulti/ip_delmulti_v6 : Somebody in the group is leaving 15582 * allmulti whom we have nominated. We need to pick some other ill. 15583 * 15584 * 5) illgrp_delete : The ill we nominated is leaving the group, 15585 * we need to pick a new ill to join the group. 15586 * 15587 * For (1), (2), (5) - we just have to check whether there is 15588 * a good ill joined in the group. If we cannot find any ill 15589 * joined in the group, we should join on one. 15590 * 15591 * For (4), the one that was nominated to receive left the group. 15592 * There could be nobody joined in the group when this function is 15593 * called. 15594 * 15595 * For (3) - we need to explicitly check whether there are multiple 15596 * ills joined in the group. 15597 * 15598 * For simplicity, we don't differentiate any of the above cases. We 15599 * just leave the group if it is joined on any of them and join on 15600 * the first good ill. 15601 */ 15602 int 15603 ill_nominate_mcast_rcv(ill_group_t *illgrp) 15604 { 15605 ilm_t *ilm; 15606 ill_t *ill; 15607 ill_t *fallback_inactive_ill = NULL; 15608 ill_t *fallback_failed_ill = NULL; 15609 int ret = 0; 15610 15611 /* 15612 * Leave the allmulti on all the ills and start fresh. 15613 */ 15614 for (ill = illgrp->illgrp_ill; ill != NULL; 15615 ill = ill->ill_group_next) { 15616 if (ill->ill_join_allmulti) 15617 (void) ip_leave_allmulti(ill->ill_ipif); 15618 } 15619 15620 /* 15621 * Choose a good ill. Fall back to an inactive or failed one if 15622 * none is available. We need to fall back to FAILED in the 15623 * case where we have 2 interfaces in a group - where 15624 * one of them is failed and another is a good one and 15625 * the good one (not marked inactive) is leaving the group. 15626 */ 15627 ret = 0; 15628 for (ill = illgrp->illgrp_ill; ill != NULL; 15629 ill = ill->ill_group_next) { 15630 /* Never pick an offline interface */ 15631 if (ill->ill_phyint->phyint_flags & PHYI_OFFLINE) 15632 continue; 15633 15634 if (ill->ill_phyint->phyint_flags & PHYI_FAILED) { 15635 fallback_failed_ill = ill; 15636 continue; 15637 } 15638 if (ill->ill_phyint->phyint_flags & PHYI_INACTIVE) { 15639 fallback_inactive_ill = ill; 15640 continue; 15641 } 15642 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15643 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15644 ret = ip_join_allmulti(ill->ill_ipif); 15645 /* 15646 * ip_join_allmulti can fail because of memory 15647 * failures. So, make sure we join at least 15648 * on one ill.
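 * ill_join_allmulti is only set once the join has actually
 * succeeded, so the check below tells us whether this ill is
 * now the nominated receiver.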
15649 */ 15650 if (ill->ill_join_allmulti) 15651 return (0); 15652 } 15653 } 15654 } 15655 if (ret != 0) { 15656 /* 15657 * If we tried nominating above and failed to do so, 15658 * return error. We might have tried multiple times. 15659 * But, return the latest error. 15660 */ 15661 return (ret); 15662 } 15663 if ((ill = fallback_inactive_ill) != NULL) { 15664 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15665 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15666 ret = ip_join_allmulti(ill->ill_ipif); 15667 return (ret); 15668 } 15669 } 15670 } else if ((ill = fallback_failed_ill) != NULL) { 15671 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15672 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15673 ret = ip_join_allmulti(ill->ill_ipif); 15674 return (ret); 15675 } 15676 } 15677 } 15678 return (0); 15679 } 15680 15681 /* 15682 * This function is called from illgrp_delete after it is 15683 * deleted from the group to reschedule responsibilities 15684 * to a different ill. 15685 */ 15686 static void 15687 ill_handoff_responsibility(ill_t *ill, ill_group_t *illgrp) 15688 { 15689 ilm_t *ilm; 15690 ipif_t *ipif; 15691 ipaddr_t subnet_addr; 15692 ipaddr_t net_addr; 15693 ipaddr_t net_mask = 0; 15694 ipaddr_t subnet_netmask; 15695 ipaddr_t addr; 15696 ip_stack_t *ipst = ill->ill_ipst; 15697 15698 ASSERT(ill->ill_group == NULL); 15699 /* 15700 * Broadcast Responsibility: 15701 * 15702 * 1. If this ill has been nominated for receiving broadcast 15703 * packets, we need to find a new one. Before we find a new 15704 * one, we need to re-group the ires that are part of this new 15705 * group (assumed by ill_nominate_bcast_rcv). We do this by 15706 * calling ill_group_bcast_for_xmit(ill) which will do the right 15707 * thing for us. 15708 * 15709 * 2. If this ill was not nominated for receiving broadcast 15710 * packets, we need to clear the IRE_MARK_NORECV flag 15711 * so that we continue to send up broadcast packets. 15712 */ 15713 if (!ill->ill_isv6) { 15714 /* 15715 * Case 1 above : No optimization here. Just redo the 15716 * nomination. 15717 */ 15718 ill_group_bcast_for_xmit(ill); 15719 ill_nominate_bcast_rcv(illgrp); 15720 15721 /* 15722 * Case 2 above : Lookup and clear IRE_MARK_NORECV. 15723 */ 15724 ill_clear_bcast_mark(ill, 0); 15725 ill_clear_bcast_mark(ill, INADDR_BROADCAST); 15726 15727 for (ipif = ill->ill_ipif; ipif != NULL; 15728 ipif = ipif->ipif_next) { 15729 15730 if (!(ipif->ipif_flags & IPIF_UP) || 15731 ipif->ipif_subnet == 0) { 15732 continue; 15733 } 15734 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 15735 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 15736 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 15737 } else { 15738 net_mask = htonl(IN_CLASSA_NET); 15739 } 15740 addr = net_mask & ipif->ipif_subnet; 15741 ill_clear_bcast_mark(ill, addr); 15742 15743 net_addr = ~net_mask | addr; 15744 ill_clear_bcast_mark(ill, net_addr); 15745 15746 subnet_netmask = ipif->ipif_net_mask; 15747 addr = ipif->ipif_subnet; 15748 ill_clear_bcast_mark(ill, addr); 15749 15750 subnet_addr = ~subnet_netmask | addr; 15751 ill_clear_bcast_mark(ill, subnet_addr); 15752 } 15753 } 15754 15755 /* 15756 * Multicast Responsibility. 15757 * 15758 * If we have joined allmulti on this one, find a new member 15759 * in the group to join allmulti. As this ill is already part 15760 * of allmulti, we don't have to join on this one. 15761 * 15762 * If we have not joined allmulti on this one, there is no 15763 * responsibility to handoff. 
But we need to take on new 15764 * responsibility, i.e., join allmulti on this one if we need 15765 * to. 15766 */ 15767 if (ill->ill_join_allmulti) { 15768 (void) ill_nominate_mcast_rcv(illgrp); 15769 } else { 15770 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 15771 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 15772 (void) ip_join_allmulti(ill->ill_ipif); 15773 break; 15774 } 15775 } 15776 } 15777 15778 /* 15779 * We intentionally flush IRE_CACHES matching only 15780 * on the ill and not on groups. Note that we have already been deleted 15781 * from the group. 15782 * 15783 * This will make sure that all IRE_CACHES whose stq is pointing 15784 * at ill_wq or ire_ipif->ipif_ill pointing at this ill will get 15785 * deleted and IRE_CACHES that are not pointing at this ill will 15786 * be left alone. 15787 */ 15788 if (ill->ill_isv6) { 15789 ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15790 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15791 } else { 15792 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 15793 IRE_CACHE, illgrp_cache_delete, (char *)ill, ill); 15794 } 15795 15796 /* 15797 * Some conn may have cached one of the IREs deleted above. By removing 15798 * the ire reference, we clean up the extra reference to the ill held in 15799 * ire->ire_stq. 15800 */ 15801 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); 15802 15803 /* 15804 * Re-do source address selection for all the members in the 15805 * group, if they borrowed a source address from one of the ipifs 15806 * in this ill. 15807 */ 15808 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15809 if (ill->ill_isv6) { 15810 ipif_update_other_ipifs_v6(ipif, illgrp); 15811 } else { 15812 ipif_update_other_ipifs(ipif, illgrp); 15813 } 15814 } 15815 } 15816 15817 /* 15818 * Delete the ill from the group. The caller makes sure that it is 15819 * in a group and that it is okay to delete it from the group. So, we always 15820 * delete here. 15821 */ 15822 static void 15823 illgrp_delete(ill_t *ill) 15824 { 15825 ill_group_t *illgrp; 15826 ill_group_t *tmpg; 15827 ill_t *tmp_ill; 15828 ip_stack_t *ipst = ill->ill_ipst; 15829 15830 /* 15831 * Reset illgrp_ill_schednext if it was pointing at us. 15832 * We need to do this before we set ill_group to NULL. 15833 */ 15834 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15835 mutex_enter(&ill->ill_lock); 15836 15837 illgrp_reset_schednext(ill); 15838 15839 illgrp = ill->ill_group; 15840 15841 /* Delete the ill from illgrp. */ 15842 if (illgrp->illgrp_ill == ill) { 15843 illgrp->illgrp_ill = ill->ill_group_next; 15844 } else { 15845 tmp_ill = illgrp->illgrp_ill; 15846 while (tmp_ill->ill_group_next != ill) { 15847 tmp_ill = tmp_ill->ill_group_next; 15848 ASSERT(tmp_ill != NULL); 15849 } 15850 tmp_ill->ill_group_next = ill->ill_group_next; 15851 } 15852 ill->ill_group = NULL; 15853 ill->ill_group_next = NULL; 15854 15855 illgrp->illgrp_ill_count--; 15856 mutex_exit(&ill->ill_lock); 15857 rw_exit(&ipst->ips_ill_g_lock); 15858 15859 /* 15860 * As this ill is leaving the group, we need to hand off 15861 * the responsibilities to the other ills in the group, if 15862 * this ill had some responsibilities.
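 * ill_handoff_responsibility() below renominates the broadcast
 * and multicast receivers and flushes this ill's IRE caches.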
15863 */ 15864 15865 ill_handoff_responsibility(ill, illgrp); 15866 15867 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15868 15869 if (illgrp->illgrp_ill_count == 0) { 15870 15871 ASSERT(illgrp->illgrp_ill == NULL); 15872 if (ill->ill_isv6) { 15873 if (illgrp == ipst->ips_illgrp_head_v6) { 15874 ipst->ips_illgrp_head_v6 = illgrp->illgrp_next; 15875 } else { 15876 tmpg = ipst->ips_illgrp_head_v6; 15877 while (tmpg->illgrp_next != illgrp) { 15878 tmpg = tmpg->illgrp_next; 15879 ASSERT(tmpg != NULL); 15880 } 15881 tmpg->illgrp_next = illgrp->illgrp_next; 15882 } 15883 } else { 15884 if (illgrp == ipst->ips_illgrp_head_v4) { 15885 ipst->ips_illgrp_head_v4 = illgrp->illgrp_next; 15886 } else { 15887 tmpg = ipst->ips_illgrp_head_v4; 15888 while (tmpg->illgrp_next != illgrp) { 15889 tmpg = tmpg->illgrp_next; 15890 ASSERT(tmpg != NULL); 15891 } 15892 tmpg->illgrp_next = illgrp->illgrp_next; 15893 } 15894 } 15895 mutex_destroy(&illgrp->illgrp_lock); 15896 mi_free(illgrp); 15897 } 15898 rw_exit(&ipst->ips_ill_g_lock); 15899 15900 /* 15901 * Even though the ill is out of the group, it's not necessary 15902 * to set ipsq_split to B_TRUE, as the ipifs could be down temporarily. 15903 * We will split the ipsq when phyint_groupname is set to NULL. 15904 */ 15905 15906 /* 15907 * Send a routing sockets message if we are deleting from 15908 * groups with names. 15909 */ 15910 if (ill->ill_phyint->phyint_groupname_len != 0) 15911 ip_rts_ifmsg(ill->ill_ipif); 15912 } 15913 15914 /* 15915 * Re-do source address selection. This is normally called when 15916 * an ill joins the group or when a non-NOLOCAL/DEPRECATED/ANYCAST 15917 * ipif comes up. 15918 */ 15919 void 15920 ill_update_source_selection(ill_t *ill) 15921 { 15922 ipif_t *ipif; 15923 15924 ASSERT(IAM_WRITER_ILL(ill)); 15925 15926 if (ill->ill_group != NULL) 15927 ill = ill->ill_group->illgrp_ill; 15928 15929 for (; ill != NULL; ill = ill->ill_group_next) { 15930 for (ipif = ill->ill_ipif; ipif != NULL; 15931 ipif = ipif->ipif_next) { 15932 if (ill->ill_isv6) 15933 ipif_recreate_interface_routes_v6(NULL, ipif); 15934 else 15935 ipif_recreate_interface_routes(NULL, ipif); 15936 } 15937 } 15938 } 15939 15940 /* 15941 * Insert ill in a group headed by illgrp_head. The caller can either 15942 * pass a groupname, in which case we search for a group with the 15943 * same name to insert in, or pass a group to insert in. This function 15944 * only searches groups with names. 15945 * 15946 * NOTE : The caller should make sure that there is at least one ipif 15947 * UP on this ill so that illgrp_scheduler can pick this ill 15948 * for outbound packets. If ill_ipif_up_count is zero, we have 15949 * already sent a DL_UNBIND to the driver and we don't want to 15950 * send any more packets. We don't assert for ipif_up_count 15951 * to be greater than zero, because ipif_up_done wants to call 15952 * this function before bumping up the ipif_up_count. See 15953 * ipif_up_done() for details. 15954 */ 15955 int 15956 illgrp_insert(ill_group_t **illgrp_head, ill_t *ill, char *groupname, 15957 ill_group_t *grp_to_insert, boolean_t ipif_is_coming_up) 15958 { 15959 ill_group_t *illgrp; 15960 ill_t *prev_ill; 15961 phyint_t *phyi; 15962 ip_stack_t *ipst = ill->ill_ipst; 15963 15964 ASSERT(ill->ill_group == NULL); 15965 15966 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15967 mutex_enter(&ill->ill_lock); 15968 15969 if (groupname != NULL) { 15970 /* 15971 * Look for a group with a matching groupname to insert.
15972 */ 15973 for (illgrp = *illgrp_head; illgrp != NULL; 15974 illgrp = illgrp->illgrp_next) { 15975 15976 ill_t *tmp_ill; 15977 15978 /* 15979 * If we have an ill_group_t in the list which has 15980 * no ill_t assigned then we must be in the process of 15981 * removing this group. We skip this as illgrp_delete() 15982 * will remove it from the list. 15983 */ 15984 if ((tmp_ill = illgrp->illgrp_ill) == NULL) { 15985 ASSERT(illgrp->illgrp_ill_count == 0); 15986 continue; 15987 } 15988 15989 ASSERT(tmp_ill->ill_phyint != NULL); 15990 phyi = tmp_ill->ill_phyint; 15991 /* 15992 * Look at groups which has names only. 15993 */ 15994 if (phyi->phyint_groupname_len == 0) 15995 continue; 15996 /* 15997 * Names are stored in the phyint common to both 15998 * IPv4 and IPv6. 15999 */ 16000 if (mi_strcmp(phyi->phyint_groupname, 16001 groupname) == 0) { 16002 break; 16003 } 16004 } 16005 } else { 16006 /* 16007 * If the caller passes in a NULL "grp_to_insert", we 16008 * allocate one below and insert this singleton. 16009 */ 16010 illgrp = grp_to_insert; 16011 } 16012 16013 ill->ill_group_next = NULL; 16014 16015 if (illgrp == NULL) { 16016 illgrp = (ill_group_t *)mi_zalloc(sizeof (ill_group_t)); 16017 if (illgrp == NULL) { 16018 return (ENOMEM); 16019 } 16020 illgrp->illgrp_next = *illgrp_head; 16021 *illgrp_head = illgrp; 16022 illgrp->illgrp_ill = ill; 16023 illgrp->illgrp_ill_count = 1; 16024 ill->ill_group = illgrp; 16025 /* 16026 * Used in illgrp_scheduler to protect multiple threads 16027 * from traversing the list. 16028 */ 16029 mutex_init(&illgrp->illgrp_lock, NULL, MUTEX_DEFAULT, 0); 16030 } else { 16031 ASSERT(ill->ill_net_type == 16032 illgrp->illgrp_ill->ill_net_type); 16033 ASSERT(ill->ill_type == illgrp->illgrp_ill->ill_type); 16034 16035 /* Insert ill at tail of this group */ 16036 prev_ill = illgrp->illgrp_ill; 16037 while (prev_ill->ill_group_next != NULL) 16038 prev_ill = prev_ill->ill_group_next; 16039 prev_ill->ill_group_next = ill; 16040 ill->ill_group = illgrp; 16041 illgrp->illgrp_ill_count++; 16042 /* 16043 * Inherit group properties. Currently only forwarding 16044 * is the property we try to keep the same with all the 16045 * ills. When there are more, we will abstract this into 16046 * a function. 16047 */ 16048 ill->ill_flags &= ~ILLF_ROUTER; 16049 ill->ill_flags |= (illgrp->illgrp_ill->ill_flags & ILLF_ROUTER); 16050 } 16051 mutex_exit(&ill->ill_lock); 16052 rw_exit(&ipst->ips_ill_g_lock); 16053 16054 /* 16055 * 1) When ipif_up_done() calls this function, ipif_up_count 16056 * may be zero as it has not yet been bumped. But the ires 16057 * have already been added. So, we do the nomination here 16058 * itself. But, when ip_sioctl_groupname calls this, it checks 16059 * for ill_ipif_up_count != 0. Thus we don't check for 16060 * ill_ipif_up_count here while nominating broadcast ires for 16061 * receive. 16062 * 16063 * 2) Similarly, we need to call ill_group_bcast_for_xmit here 16064 * to group them properly as ire_add() has already happened 16065 * in the ipif_up_done() case. For ip_sioctl_groupname/ifgrp_insert 16066 * case, we need to do it here anyway. 16067 */ 16068 if (!ill->ill_isv6) { 16069 ill_group_bcast_for_xmit(ill); 16070 ill_nominate_bcast_rcv(illgrp); 16071 } 16072 16073 if (!ipif_is_coming_up) { 16074 /* 16075 * When ipif_up_done() calls this function, the multicast 16076 * groups have not been joined yet. So, there is no point in 16077 * nomination. ip_join_allmulti will handle groups when 16078 * ill_recover_multicast is called from ipif_up_done() later. 
16079 */ 16080 (void) ill_nominate_mcast_rcv(illgrp); 16081 /* 16082 * ipif_up_done calls ill_update_source_selection 16083 * anyway. Moreover, we don't want to re-create 16084 * interface routes while ipif_up_done() still has reference 16085 * to them. Refer to ipif_up_done() for more details. 16086 */ 16087 ill_update_source_selection(ill); 16088 } 16089 16090 /* 16091 * Send a routing sockets message if we are inserting into 16092 * groups with names. 16093 */ 16094 if (groupname != NULL) 16095 ip_rts_ifmsg(ill->ill_ipif); 16096 return (0); 16097 } 16098 16099 /* 16100 * Return the first phyint matching the groupname. There could 16101 * be more than one when there are ill groups. 16102 * 16103 * If 'usable' is set, then we exclude ones that are marked with any of 16104 * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). 16105 * Needs work: called only from ip_sioctl_groupname and from the ipmp/netinfo 16106 * emulation of ipmp. 16107 */ 16108 phyint_t * 16109 phyint_lookup_group(char *groupname, boolean_t usable, ip_stack_t *ipst) 16110 { 16111 phyint_t *phyi; 16112 16113 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 16114 /* 16115 * Group names are stored in the phyint - a common structure 16116 * to both IPv4 and IPv6. 16117 */ 16118 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 16119 for (; phyi != NULL; 16120 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16121 phyi, AVL_AFTER)) { 16122 if (phyi->phyint_groupname_len == 0) 16123 continue; 16124 /* 16125 * Skip the ones that should not be used since the callers 16126 * sometime use this for sending packets. 16127 */ 16128 if (usable && (phyi->phyint_flags & 16129 (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE))) 16130 continue; 16131 16132 ASSERT(phyi->phyint_groupname != NULL); 16133 if (mi_strcmp(groupname, phyi->phyint_groupname) == 0) 16134 return (phyi); 16135 } 16136 return (NULL); 16137 } 16138 16139 16140 /* 16141 * Return the first usable phyint matching the group index. By 'usable' 16142 * we exclude ones that are marked ununsable with any of 16143 * (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE). 16144 * 16145 * Used only for the ipmp/netinfo emulation of ipmp. 16146 */ 16147 phyint_t * 16148 phyint_lookup_group_ifindex(uint_t group_ifindex, ip_stack_t *ipst) 16149 { 16150 phyint_t *phyi; 16151 16152 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 16153 16154 if (!ipst->ips_ipmp_hook_emulation) 16155 return (NULL); 16156 16157 /* 16158 * Group indicies are stored in the phyint - a common structure 16159 * to both IPv4 and IPv6. 16160 */ 16161 phyi = avl_first(&ipst->ips_phyint_g_list->phyint_list_avl_by_index); 16162 for (; phyi != NULL; 16163 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16164 phyi, AVL_AFTER)) { 16165 /* Ignore the ones that do not have a group */ 16166 if (phyi->phyint_groupname_len == 0) 16167 continue; 16168 16169 ASSERT(phyi->phyint_group_ifindex != 0); 16170 /* 16171 * Skip the ones that should not be used since the callers 16172 * sometime use this for sending packets. 16173 */ 16174 if (phyi->phyint_flags & 16175 (PHYI_FAILED|PHYI_OFFLINE|PHYI_INACTIVE)) 16176 continue; 16177 if (phyi->phyint_group_ifindex == group_ifindex) 16178 return (phyi); 16179 } 16180 return (NULL); 16181 } 16182 16183 16184 /* 16185 * MT notes on creation and deletion of IPMP groups 16186 * 16187 * Creation and deletion of IPMP groups introduce the need to merge or 16188 * split the associated serialization objects i.e the ipsq's. 
Normally all 16189 * the ills in an IPMP group would map to a single ipsq. If IPMP is not enabled, 16190 * an ill pair (v4, v6), i.e., a phyint, would map to a single ipsq. However during 16191 * the execution of the SIOCSLIFGROUPNAME command the picture changes. There 16192 * is a need to change the <ill-ipsq> association and we have to operate on both 16193 * the source and destination IPMP groups. For example, attempting to set the 16194 * groupname of hme0 to mpk17-85 when it already belongs to mpk17-84 has to 16195 * handle two IPMP groups and two ipsqs. All the ills belonging to either the 16196 * source or the destination IPMP group are mapped to a single ipsq for executing 16197 * the SIOCSLIFGROUPNAME command. This is termed a merge of the ipsqs. 16198 * The <ill-ipsq> mapping is restored back to normal at a later point. This is 16199 * termed a split of the ipsq. The converse of the merge, i.e., a split of the 16200 * ipsq, happens while unwinding from ipsq_exit. If at least one set-groupname 16201 * occurred on the ipsq, then the ipsq_split flag is set. This indicates the 16202 * ipsq has to be examined for redoing the <ill-ipsq> associations. 16203 * 16204 * In the above example the ioctl handling code locates the current ipsq of hme0, 16205 * which is ipsq(mpk17-84). It then enters the above ipsq immediately or 16206 * eventually (after queueing the ioctl in ipsq(mpk17-84)). Then it locates 16207 * the destination ipsq, which is ipsq(mpk17-85), and merges the source ipsq into 16208 * the destination ipsq. If the destination ipsq is not busy, it also enters 16209 * the destination ipsq exclusively. Now the actual groupname setting operation 16210 * can proceed. If the destination ipsq is busy, the operation is enqueued 16211 * on the destination (merged) ipsq and will be handled in the unwind from 16212 * ipsq_exit. 16213 * 16214 * To prevent other threads from accessing the ill while the group name change is 16215 * in progress, we bring down the ipifs, which also removes the ill from the 16216 * group. The group is changed in the phyint and when the first ipif on the ill 16217 * is brought up, the ill is inserted into the right IPMP group by 16218 * illgrp_insert. 16219 */ 16220 /* ARGSUSED */ 16221 int 16222 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16223 ip_ioctl_cmd_t *ipip, void *ifreq) 16224 { 16225 int i; 16226 char *tmp; 16227 int namelen; 16228 ill_t *ill = ipif->ipif_ill; 16229 ill_t *ill_v4, *ill_v6; 16230 int err = 0; 16231 phyint_t *phyi; 16232 phyint_t *phyi_tmp; 16233 struct lifreq *lifr; 16234 mblk_t *mp1; 16235 char *groupname; 16236 ipsq_t *ipsq; 16237 ip_stack_t *ipst = ill->ill_ipst; 16238 16239 ASSERT(IAM_WRITER_IPIF(ipif)); 16240 16241 /* Existence verified in ip_wput_nondata */ 16242 mp1 = mp->b_cont->b_cont; 16243 lifr = (struct lifreq *)mp1->b_rptr; 16244 groupname = lifr->lifr_groupname; 16245 16246 if (ipif->ipif_id != 0) 16247 return (EINVAL); 16248 16249 phyi = ill->ill_phyint; 16250 ASSERT(phyi != NULL); 16251 16252 if (phyi->phyint_flags & PHYI_VIRTUAL) 16253 return (EINVAL); 16254 16255 tmp = groupname; 16256 for (i = 0; i < LIFNAMSIZ && *tmp != '\0'; tmp++, i++) 16257 ; 16258 16259 if (i == LIFNAMSIZ) { 16260 /* no null termination */ 16261 return (EINVAL); 16262 } 16263 16264 /* 16265 * Calculate the namelen exclusive of the null 16266 * termination character.
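 * For example, for the groupname "mpk17-84" used in the notes
 * above, 'tmp' ends up pointing at the NUL and namelen is 8.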
16267 */ 16268 namelen = tmp - groupname; 16269 16270 ill_v4 = phyi->phyint_illv4; 16271 ill_v6 = phyi->phyint_illv6; 16272 16273 /* 16274 * An ILL cannot be part of a usesrc group and an IPMP group at the 16275 * same time. No need to grab the ill_g_usesrc_lock here, see 16276 * synchronization notes in ip.c. 16277 */ 16278 if (ipif->ipif_ill->ill_usesrc_grp_next != NULL) { 16279 return (EINVAL); 16280 } 16281 16282 /* 16283 * Mark the ill as changing. 16284 * This should queue all new requests on the syncq. 16285 */ 16286 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16287 16288 if (ill_v4 != NULL) 16289 ill_v4->ill_state_flags |= ILL_CHANGING; 16290 if (ill_v6 != NULL) 16291 ill_v6->ill_state_flags |= ILL_CHANGING; 16292 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16293 16294 if (namelen == 0) { 16295 /* 16296 * Null string means remove this interface from the 16297 * existing group. 16298 */ 16299 if (phyi->phyint_groupname_len == 0) { 16300 /* 16301 * Never was in a group. 16302 */ 16303 err = 0; 16304 goto done; 16305 } 16306 16307 /* 16308 * IPv4 or IPv6 may be temporarily out of the group when all 16309 * the ipifs are down. Thus, we need to check for ill_group to 16310 * be non-NULL. 16311 */ 16312 if (ill_v4 != NULL && ill_v4->ill_group != NULL) { 16313 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16314 mutex_enter(&ill_v4->ill_lock); 16315 if (!ill_is_quiescent(ill_v4)) { 16316 /* 16317 * ipsq_pending_mp_add will not fail since 16318 * connp is NULL 16319 */ 16320 (void) ipsq_pending_mp_add(NULL, 16321 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16322 mutex_exit(&ill_v4->ill_lock); 16323 err = EINPROGRESS; 16324 goto done; 16325 } 16326 mutex_exit(&ill_v4->ill_lock); 16327 } 16328 16329 if (ill_v6 != NULL && ill_v6->ill_group != NULL) { 16330 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16331 mutex_enter(&ill_v6->ill_lock); 16332 if (!ill_is_quiescent(ill_v6)) { 16333 (void) ipsq_pending_mp_add(NULL, 16334 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16335 mutex_exit(&ill_v6->ill_lock); 16336 err = EINPROGRESS; 16337 goto done; 16338 } 16339 mutex_exit(&ill_v6->ill_lock); 16340 } 16341 16342 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16343 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16344 mutex_enter(&phyi->phyint_lock); 16345 ASSERT(phyi->phyint_groupname != NULL); 16346 mi_free(phyi->phyint_groupname); 16347 phyi->phyint_groupname = NULL; 16348 phyi->phyint_groupname_len = 0; 16349 16350 /* Restore the ifindex to be the per-interface one */ 16351 phyi->phyint_group_ifindex = 0; 16352 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 16353 mutex_exit(&phyi->phyint_lock); 16354 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16355 rw_exit(&ipst->ips_ill_g_lock); 16356 err = ill_up_ipifs(ill, q, mp); 16357 16358 /* 16359 * Set the split flag so that the ipsq can be split. 16360 */ 16361 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16362 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16363 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16364 16365 } else { 16366 if (phyi->phyint_groupname_len != 0) { 16367 ASSERT(phyi->phyint_groupname != NULL); 16368 /* Are we inserting into the same group? */ 16369 if (mi_strcmp(groupname, 16370 phyi->phyint_groupname) == 0) { 16371 err = 0; 16372 goto done; 16373 } 16374 } 16375 16376 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16377 /* 16378 * Merge the ipsqs for the groups. 16379 * This check is here as multiple groups/ills might be 16380 * sharing the same ipsq. 16381 * If we have to merge, then the operation is restarted 16382 * on the new ipsq.
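 * (ill_merge_groups() returns EINPROGRESS in that case; the
 * ioctl is re-processed once the merged ipsq is drained.)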
16383 */ 16384 ipsq = ip_ipsq_lookup(groupname, B_FALSE, NULL, ipst); 16385 if (phyi->phyint_ipsq != ipsq) { 16386 rw_exit(&ipst->ips_ill_g_lock); 16387 err = ill_merge_groups(ill, NULL, groupname, mp, q); 16388 goto done; 16389 } 16390 /* 16391 * Running exclusive on new ipsq. 16392 */ 16393 16394 ASSERT(ipsq != NULL); 16395 ASSERT(ipsq->ipsq_writer == curthread); 16396 16397 /* 16398 * Check whether the ill_type and ill_net_type matches before 16399 * we allocate any memory so that the cleanup is easier. 16400 * 16401 * We can't group dissimilar ones as we can't load spread 16402 * packets across the group because of potential link-level 16403 * header differences. 16404 */ 16405 phyi_tmp = phyint_lookup_group(groupname, B_FALSE, ipst); 16406 if (phyi_tmp != NULL) { 16407 if ((ill_v4 != NULL && 16408 phyi_tmp->phyint_illv4 != NULL) && 16409 ((ill_v4->ill_net_type != 16410 phyi_tmp->phyint_illv4->ill_net_type) || 16411 (ill_v4->ill_type != 16412 phyi_tmp->phyint_illv4->ill_type))) { 16413 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16414 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16415 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16416 rw_exit(&ipst->ips_ill_g_lock); 16417 return (EINVAL); 16418 } 16419 if ((ill_v6 != NULL && 16420 phyi_tmp->phyint_illv6 != NULL) && 16421 ((ill_v6->ill_net_type != 16422 phyi_tmp->phyint_illv6->ill_net_type) || 16423 (ill_v6->ill_type != 16424 phyi_tmp->phyint_illv6->ill_type))) { 16425 mutex_enter(&phyi->phyint_ipsq->ipsq_lock); 16426 phyi->phyint_ipsq->ipsq_split = B_TRUE; 16427 mutex_exit(&phyi->phyint_ipsq->ipsq_lock); 16428 rw_exit(&ipst->ips_ill_g_lock); 16429 return (EINVAL); 16430 } 16431 } 16432 16433 rw_exit(&ipst->ips_ill_g_lock); 16434 16435 /* 16436 * bring down all v4 ipifs. 16437 */ 16438 if (ill_v4 != NULL) { 16439 ill_down_ipifs(ill_v4, mp, 0, B_FALSE); 16440 } 16441 16442 /* 16443 * bring down all v6 ipifs. 16444 */ 16445 if (ill_v6 != NULL) { 16446 ill_down_ipifs(ill_v6, mp, 0, B_FALSE); 16447 } 16448 16449 /* 16450 * make sure all ipifs are down and there are no active 16451 * references. Call to ipsq_pending_mp_add will not fail 16452 * since connp is NULL. 16453 */ 16454 if (ill_v4 != NULL) { 16455 mutex_enter(&ill_v4->ill_lock); 16456 if (!ill_is_quiescent(ill_v4)) { 16457 (void) ipsq_pending_mp_add(NULL, 16458 ill_v4->ill_ipif, q, mp, ILL_DOWN); 16459 mutex_exit(&ill_v4->ill_lock); 16460 err = EINPROGRESS; 16461 goto done; 16462 } 16463 mutex_exit(&ill_v4->ill_lock); 16464 } 16465 16466 if (ill_v6 != NULL) { 16467 mutex_enter(&ill_v6->ill_lock); 16468 if (!ill_is_quiescent(ill_v6)) { 16469 (void) ipsq_pending_mp_add(NULL, 16470 ill_v6->ill_ipif, q, mp, ILL_DOWN); 16471 mutex_exit(&ill_v6->ill_lock); 16472 err = EINPROGRESS; 16473 goto done; 16474 } 16475 mutex_exit(&ill_v6->ill_lock); 16476 } 16477 16478 /* 16479 * allocate including space for null terminator 16480 * before we insert. 16481 */ 16482 tmp = (char *)mi_alloc(namelen + 1, BPRI_MED); 16483 if (tmp == NULL) 16484 return (ENOMEM); 16485 16486 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16487 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16488 mutex_enter(&phyi->phyint_lock); 16489 if (phyi->phyint_groupname_len != 0) { 16490 ASSERT(phyi->phyint_groupname != NULL); 16491 mi_free(phyi->phyint_groupname); 16492 } 16493 16494 /* 16495 * setup the new group name. 
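 * namelen + 1 bytes were allocated above, so the bcopy below
 * also copies the NUL terminator.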
16496 */ 16497 phyi->phyint_groupname = tmp; 16498 bcopy(groupname, phyi->phyint_groupname, namelen + 1); 16499 phyi->phyint_groupname_len = namelen + 1; 16500 16501 if (ipst->ips_ipmp_hook_emulation) { 16502 /* 16503 * If the group already exists we use the existing 16504 * group_ifindex, otherwise we pick a new index here. 16505 */ 16506 if (phyi_tmp != NULL) { 16507 phyi->phyint_group_ifindex = 16508 phyi_tmp->phyint_group_ifindex; 16509 } else { 16510 /* XXX We need a recovery strategy here. */ 16511 if (!ip_assign_ifindex( 16512 &phyi->phyint_group_ifindex, ipst)) 16513 cmn_err(CE_PANIC, 16514 "ip_assign_ifindex() failed"); 16515 } 16516 } 16517 /* 16518 * Select whether the netinfo and hook use the per-interface 16519 * or per-group ifindex. 16520 */ 16521 if (ipst->ips_ipmp_hook_emulation) 16522 phyi->phyint_hook_ifindex = phyi->phyint_group_ifindex; 16523 else 16524 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 16525 16526 if (ipst->ips_ipmp_hook_emulation && 16527 phyi_tmp != NULL) { 16528 /* First phyint in group - group PLUMB event */ 16529 ill_nic_info_plumb(ill, B_TRUE); 16530 } 16531 mutex_exit(&phyi->phyint_lock); 16532 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16533 rw_exit(&ipst->ips_ill_g_lock); 16534 16535 err = ill_up_ipifs(ill, q, mp); 16536 } 16537 16538 done: 16539 /* 16540 * normally ILL_CHANGING is cleared in ill_up_ipifs. 16541 */ 16542 if (err != EINPROGRESS) { 16543 GRAB_ILL_LOCKS(ill_v4, ill_v6); 16544 if (ill_v4 != NULL) 16545 ill_v4->ill_state_flags &= ~ILL_CHANGING; 16546 if (ill_v6 != NULL) 16547 ill_v6->ill_state_flags &= ~ILL_CHANGING; 16548 RELEASE_ILL_LOCKS(ill_v4, ill_v6); 16549 } 16550 return (err); 16551 } 16552 16553 /* ARGSUSED */ 16554 int 16555 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 16556 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 16557 { 16558 ill_t *ill; 16559 phyint_t *phyi; 16560 struct lifreq *lifr; 16561 mblk_t *mp1; 16562 16563 /* Existence verified in ip_wput_nondata */ 16564 mp1 = mp->b_cont->b_cont; 16565 lifr = (struct lifreq *)mp1->b_rptr; 16566 ill = ipif->ipif_ill; 16567 phyi = ill->ill_phyint; 16568 16569 lifr->lifr_groupname[0] = '\0'; 16570 /* 16571 * ill_group may be null if all the interfaces 16572 * are down. But still, the phyint should always 16573 * hold the name. 16574 */ 16575 if (phyi->phyint_groupname_len != 0) { 16576 bcopy(phyi->phyint_groupname, lifr->lifr_groupname, 16577 phyi->phyint_groupname_len); 16578 } 16579 16580 return (0); 16581 } 16582 16583 16584 typedef struct conn_move_s { 16585 ill_t *cm_from_ill; 16586 ill_t *cm_to_ill; 16587 int cm_ifindex; 16588 } conn_move_t; 16589 16590 /* 16591 * ipcl_walk function for moving conn_multicast_ill for a given ill. 16592 */ 16593 static void 16594 conn_move(conn_t *connp, caddr_t arg) 16595 { 16596 conn_move_t *connm; 16597 int ifindex; 16598 int i; 16599 ill_t *from_ill; 16600 ill_t *to_ill; 16601 ilg_t *ilg; 16602 ilm_t *ret_ilm; 16603 16604 connm = (conn_move_t *)arg; 16605 ifindex = connm->cm_ifindex; 16606 from_ill = connm->cm_from_ill; 16607 to_ill = connm->cm_to_ill; 16608 16609 /* Change IP_BOUND_IF/IPV6_BOUND_IF associations. 
*/ 16610 16611 /* All multicast fields protected by conn_lock */ 16612 mutex_enter(&connp->conn_lock); 16613 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 16614 if ((connp->conn_outgoing_ill == from_ill) && 16615 (ifindex == 0 || connp->conn_orig_bound_ifindex == ifindex)) { 16616 connp->conn_outgoing_ill = to_ill; 16617 connp->conn_incoming_ill = to_ill; 16618 } 16619 16620 /* Change IP_MULTICAST_IF/IPV6_MULTICAST_IF associations */ 16621 16622 if ((connp->conn_multicast_ill == from_ill) && 16623 (ifindex == 0 || connp->conn_orig_multicast_ifindex == ifindex)) { 16624 connp->conn_multicast_ill = connm->cm_to_ill; 16625 } 16626 16627 /* 16628 * Change the ilg_ill to point to the new one. This assumes 16629 * ilm_move_v6 has moved the ilms to new_ill and the driver 16630 * has been told to receive packets on this interface. 16631 * ilm_move_v6 always FAILBACKs all the ilms successfully. 16632 * But when doing a FAILOVER, it might fail with ENOMEM and so 16633 * some ilms may not have moved. We check to see whether 16634 * the ilms have moved to to_ill. We can't check on from_ill 16635 * as in the process of moving, we could have split an ilm 16636 * into two - which has the same orig_ifindex and v6group. 16637 * 16638 * For IPv4, ilg_ipif moves implicitly. The code below really 16639 * does not do anything for IPv4 as ilg_ill is NULL for IPv4. 16640 */ 16641 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 16642 ilg = &connp->conn_ilg[i]; 16643 if ((ilg->ilg_ill == from_ill) && 16644 (ifindex == 0 || ilg->ilg_orig_ifindex == ifindex)) { 16645 /* ifindex != 0 indicates failback */ 16646 if (ifindex != 0) { 16647 connp->conn_ilg[i].ilg_ill = to_ill; 16648 continue; 16649 } 16650 16651 ret_ilm = ilm_lookup_ill_index_v6(to_ill, 16652 &ilg->ilg_v6group, ilg->ilg_orig_ifindex, 16653 connp->conn_zoneid); 16654 16655 if (ret_ilm != NULL) 16656 connp->conn_ilg[i].ilg_ill = to_ill; 16657 } 16658 } 16659 mutex_exit(&connp->conn_lock); 16660 } 16661 16662 static void 16663 conn_move_ill(ill_t *from_ill, ill_t *to_ill, int ifindex) 16664 { 16665 conn_move_t connm; 16666 ip_stack_t *ipst = from_ill->ill_ipst; 16667 16668 connm.cm_from_ill = from_ill; 16669 connm.cm_to_ill = to_ill; 16670 connm.cm_ifindex = ifindex; 16671 16672 ipcl_walk(conn_move, (caddr_t)&connm, ipst); 16673 } 16674 16675 /* 16676 * ilm has been moved from from_ill to to_ill. 16677 * Send DL_DISABMULTI_REQ on from_ill and DL_ENABMULTI_REQ on to_ill, 16678 * as appropriate. 16679 * 16680 * NOTE : We can't reuse the code in ip_ll_addmulti/delmulti because 16681 * the code there de-references ipif_ill to get the ill to 16682 * send multicast requests. It does not work as ipif is on its 16683 * move and already moved when this function is called. 16684 * Thus, we need to use from_ill and to_ill to send down multicast 16685 * requests. 16686 */ 16687 static void 16688 ilm_send_multicast_reqs(ill_t *from_ill, ill_t *to_ill) 16689 { 16690 ipif_t *ipif; 16691 ilm_t *ilm; 16692 16693 /* 16694 * See whether we need to send down DL_ENABMULTI_REQ on 16695 * to_ill as ilm has just been added. 16696 */ 16697 ASSERT(IAM_WRITER_ILL(to_ill)); 16698 ASSERT(IAM_WRITER_ILL(from_ill)); 16699 16700 ILM_WALKER_HOLD(to_ill); 16701 for (ilm = to_ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 16702 16703 if (!ilm->ilm_is_new || (ilm->ilm_flags & ILM_DELETED)) 16704 continue; 16705 /* 16706 * no locks held, ill/ipif cannot disappear as long 16707 * as we are writer.
16708 */ 16709 ipif = to_ill->ill_ipif; 16710 /* 16711 * No need to hold any lock as we are the writer and this 16712 * can only be changed by a writer. 16713 */ 16714 ilm->ilm_is_new = B_FALSE; 16715 16716 if (to_ill->ill_net_type != IRE_IF_RESOLVER || 16717 ipif->ipif_flags & IPIF_POINTOPOINT) { 16718 ip1dbg(("ilm_send_multicast_reqs: to_ill not " 16719 "resolver\n")); 16720 continue; /* Must be IRE_IF_NORESOLVER */ 16721 } 16722 16723 16724 if (to_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 16725 ip1dbg(("ilm_send_multicast_reqs: " 16726 "to_ill MULTI_BCAST\n")); 16727 goto from; 16728 } 16729 16730 if (to_ill->ill_isv6) 16731 mld_joingroup(ilm); 16732 else 16733 igmp_joingroup(ilm); 16734 16735 if (to_ill->ill_ipif_up_count == 0) { 16736 /* 16737 * Nobody there. All multicast addresses will be 16738 * re-joined when we get the DL_BIND_ACK bringing the 16739 * interface up. 16740 */ 16741 ilm->ilm_notify_driver = B_FALSE; 16742 ip1dbg(("ilm_send_multicast_reqs: to_ill nobody up\n")); 16743 goto from; 16744 } 16745 16746 /* 16747 * For the allmulti address, we want to join on only one interface. 16748 * Checking for ilm_numentries_v6 is not correct as you may 16749 * find an ilm with zero address on to_ill, but we may not 16750 * have nominated to_ill for receiving. Thus, if we have 16751 * nominated from_ill (ill_join_allmulti is set), nominate 16752 * only if to_ill is not already nominated (to_ill normally 16753 * should not have been nominated if "from_ill" has already 16754 * been nominated. As we don't prevent failovers from happening 16755 * across groups, we don't assert). 16756 */ 16757 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 16758 /* 16759 * There is no need to hold ill locks as we are 16760 * writer on both ills and when ill_join_allmulti 16761 * is changed the thread is always a writer. 16762 */ 16763 if (from_ill->ill_join_allmulti && 16764 !to_ill->ill_join_allmulti) { 16765 (void) ip_join_allmulti(to_ill->ill_ipif); 16766 } 16767 } else if (ilm->ilm_notify_driver) { 16768 16769 /* 16770 * This is a newly moved ilm so we need to tell the 16771 * driver about the new group. There can be more than 16772 * one ilm for the same group in the list, each with a 16773 * different orig_ifindex. We have to inform the driver 16774 * only once. In ilm_move_v[4,6] we only set the flag 16775 * ilm_notify_driver for the first ilm. 16776 */ 16777 16778 (void) ip_ll_send_enabmulti_req(to_ill, 16779 &ilm->ilm_v6addr); 16780 } 16781 16782 ilm->ilm_notify_driver = B_FALSE; 16783 16784 /* 16785 * See whether we need to send down DL_DISABMULTI_REQ on 16786 * from_ill as ilm has just been removed.
16787 */ 16788 from: 16789 ipif = from_ill->ill_ipif; 16790 if (from_ill->ill_net_type != IRE_IF_RESOLVER || 16791 ipif->ipif_flags & IPIF_POINTOPOINT) { 16792 ip1dbg(("ilm_send_multicast_reqs: " 16793 "from_ill not resolver\n")); 16794 continue; /* Must be IRE_IF_NORESOLVER */ 16795 } 16796 16797 if (from_ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { 16798 ip1dbg(("ilm_send_multicast_reqs: " 16799 "from_ill MULTI_BCAST\n")); 16800 continue; 16801 } 16802 16803 if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { 16804 if (from_ill->ill_join_allmulti) 16805 (void) ip_leave_allmulti(from_ill->ill_ipif); 16806 } else if (ilm_numentries_v6(from_ill, &ilm->ilm_v6addr) == 0) { 16807 (void) ip_ll_send_disabmulti_req(from_ill, 16808 &ilm->ilm_v6addr); 16809 } 16810 } 16811 ILM_WALKER_RELE(to_ill); 16812 } 16813 16814 /* 16815 * This function is called when all multicast memberships need 16816 * to be moved from "from_ill" to "to_ill" for IPv6. This function is 16817 * called only once, unlike the IPv4 counterpart, where it is called after 16818 * every logical interface is moved. The reason is that multicast 16819 * memberships are joined using an interface address in IPv4, while in 16820 * IPv6 the interface index is used. 16821 */ 16822 static void 16823 ilm_move_v6(ill_t *from_ill, ill_t *to_ill, int ifindex) 16824 { 16825 ilm_t *ilm; 16826 ilm_t *ilm_next; 16827 ilm_t *new_ilm; 16828 ilm_t **ilmp; 16829 int count; 16830 char buf[INET6_ADDRSTRLEN]; 16831 in6_addr_t ipv6_snm = ipv6_solicited_node_mcast; 16832 ip_stack_t *ipst = from_ill->ill_ipst; 16833 16834 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 16835 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 16836 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16837 16838 if (ifindex == 0) { 16839 /* 16840 * Form the solicited node mcast address which is used later. 16841 */ 16842 ipif_t *ipif; 16843 16844 ipif = from_ill->ill_ipif; 16845 ASSERT(ipif->ipif_id == 0); 16846 16847 ipv6_snm.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 16848 } 16849 16850 ilmp = &from_ill->ill_ilm; 16851 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 16852 ilm_next = ilm->ilm_next; 16853 16854 if (ilm->ilm_flags & ILM_DELETED) { 16855 ilmp = &ilm->ilm_next; 16856 continue; 16857 } 16858 16859 new_ilm = ilm_lookup_ill_index_v6(to_ill, &ilm->ilm_v6addr, 16860 ilm->ilm_orig_ifindex, ilm->ilm_zoneid); 16861 ASSERT(ilm->ilm_orig_ifindex != 0); 16862 if (ilm->ilm_orig_ifindex == ifindex) { 16863 /* 16864 * We are failing back multicast memberships. 16865 * If the same ilm exists in to_ill, it means somebody 16866 * has joined the same group there e.g. ff02::1 16867 * is joined within the kernel when the interfaces 16868 * came UP. 16869 */ 16870 ASSERT(ilm->ilm_ipif == NULL); 16871 if (new_ilm != NULL) { 16872 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 16873 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 16874 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 16875 new_ilm->ilm_is_new = B_TRUE; 16876 } 16877 } else { 16878 /* 16879 * check if we can just move the ilm 16880 */ 16881 if (from_ill->ill_ilm_walker_cnt != 0) { 16882 /* 16883 * We have walkers, so we cannot move 16884 * the ilm; allocate a new ilm instead. 16885 * This (old) ilm will be marked 16886 * ILM_DELETED at the end of the loop 16887 * and will be freed when the 16888 * last walker exits.
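 *
 * (A condensed sketch of the walker idiom used here and at the
 * bottom of this loop; it mirrors the cleanup code below:
 *
 *	if (from_ill->ill_ilm_walker_cnt != 0) {
 *		ilm->ilm_flags |= ILM_DELETED;
 *		from_ill->ill_ilm_cleanup_reqd = 1;
 *	} else {
 *		unlink *ilmp and mi_free() the ilm at once
 *	}
 *
 * so a thread that is mid-walk never sees freed memory.)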
16889 */ 16890 new_ilm = (ilm_t *)mi_zalloc 16891 (sizeof (ilm_t)); 16892 if (new_ilm == NULL) { 16893 ip0dbg(("ilm_move_v6: " 16894 "FAILBACK of IPv6" 16895 " multicast address %s : " 16896 "from %s to" 16897 " %s failed : ENOMEM \n", 16898 inet_ntop(AF_INET6, 16899 &ilm->ilm_v6addr, buf, 16900 sizeof (buf)), 16901 from_ill->ill_name, 16902 to_ill->ill_name)); 16903 16904 ilmp = &ilm->ilm_next; 16905 continue; 16906 } 16907 *new_ilm = *ilm; 16908 /* 16909 * we don't want new_ilm linked to 16910 * ilm's filter list. 16911 */ 16912 new_ilm->ilm_filter = NULL; 16913 } else { 16914 /* 16915 * No walkers, so we can move the ilm; 16916 * let's take it out of the list. 16917 */ 16918 *ilmp = ilm->ilm_next; 16919 ilm->ilm_next = NULL; 16920 new_ilm = ilm; 16921 } 16922 16923 /* 16924 * If this is the first ilm for the group, 16925 * set ilm_notify_driver so that we notify the 16926 * driver in ilm_send_multicast_reqs. 16927 */ 16928 if (ilm_lookup_ill_v6(to_ill, 16929 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 16930 new_ilm->ilm_notify_driver = B_TRUE; 16931 16932 new_ilm->ilm_ill = to_ill; 16933 /* Add to the to_ill's list */ 16934 new_ilm->ilm_next = to_ill->ill_ilm; 16935 to_ill->ill_ilm = new_ilm; 16936 /* 16937 * set the flag so that mld_joingroup is 16938 * called in ilm_send_multicast_reqs(). 16939 */ 16940 new_ilm->ilm_is_new = B_TRUE; 16941 } 16942 goto bottom; 16943 } else if (ifindex != 0) { 16944 /* 16945 * If this is FAILBACK (ifindex != 0) and the ifindex 16946 * has not matched above, look at the next ilm. 16947 */ 16948 ilmp = &ilm->ilm_next; 16949 continue; 16950 } 16951 /* 16952 * If we are here, it means ifindex is 0. Failover 16953 * everything. 16954 * 16955 * We need to handle solicited node mcast address 16956 * and all_nodes mcast address differently as they 16957 * are joined within the kernel (ipif_multicast_up) 16958 * and potentially from the userland. We are called 16959 * after the ipifs of from_ill have been moved. 16960 * If we still find ilms on ill with solicited node 16961 * mcast address or all_nodes mcast address, it must 16962 * belong to the UP interface that has not moved e.g. 16963 * ipif_id 0 with the link local prefix does not move. 16964 * We join this on the new ill accounting for all the 16965 * userland memberships so that applications don't 16966 * see any failure. 16967 * 16968 * We need to make sure that we account only for the 16969 * solicited node and all node multicast addresses 16970 * that were brought UP on these. In the case of 16971 * a failover from A to B, we might have ilms belonging 16972 * to A (ilm_orig_ifindex pointing at A) on B accounting 16973 * for the membership from the userland. If we are failing 16974 * over from B to C now, we will find the ones belonging 16975 * to A on B. These don't account for the ill_ipif_up_count. 16976 * They just move from B to C. The check below on 16977 * ilm_orig_ifindex ensures that. 16978 */ 16979 if ((ilm->ilm_orig_ifindex == 16980 from_ill->ill_phyint->phyint_ifindex) && 16981 (IN6_ARE_ADDR_EQUAL(&ipv6_snm, &ilm->ilm_v6addr) || 16982 IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, 16983 &ilm->ilm_v6addr))) { 16984 ASSERT(ilm->ilm_refcnt > 0); 16985 count = ilm->ilm_refcnt - from_ill->ill_ipif_up_count; 16986 /* 16987 * For indentation reasons, we are not using an 16988 * "else" here.
16989 */ 16990 if (count == 0) { 16991 ilmp = &ilm->ilm_next; 16992 continue; 16993 } 16994 ilm->ilm_refcnt -= count; 16995 if (new_ilm != NULL) { 16996 /* 16997 * We can find one with the same 16998 * ilm_orig_ifindex, if we are failing 16999 * over to a STANDBY. This happens 17000 * when somebody wants to join a group 17001 * on a STANDBY interface and we 17002 * internally join on a different one. 17003 * If we had joined on from_ill then, a 17004 * failover now will find a new ilm 17005 * with this index. 17006 */ 17007 ip1dbg(("ilm_move_v6: FAILOVER, found" 17008 " new ilm on %s, group address %s\n", 17009 to_ill->ill_name, 17010 inet_ntop(AF_INET6, 17011 &ilm->ilm_v6addr, buf, 17012 sizeof (buf)))); 17013 new_ilm->ilm_refcnt += count; 17014 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 17015 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 17016 new_ilm->ilm_is_new = B_TRUE; 17017 } 17018 } else { 17019 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 17020 if (new_ilm == NULL) { 17021 ip0dbg(("ilm_move_v6: FAILOVER of IPv6" 17022 " multicast address %s : from %s to" 17023 " %s failed : ENOMEM \n", 17024 inet_ntop(AF_INET6, 17025 &ilm->ilm_v6addr, buf, 17026 sizeof (buf)), from_ill->ill_name, 17027 to_ill->ill_name)); 17028 ilmp = &ilm->ilm_next; 17029 continue; 17030 } 17031 *new_ilm = *ilm; 17032 new_ilm->ilm_filter = NULL; 17033 new_ilm->ilm_refcnt = count; 17034 new_ilm->ilm_timer = INFINITY; 17035 new_ilm->ilm_rtx.rtx_timer = INFINITY; 17036 new_ilm->ilm_is_new = B_TRUE; 17037 /* 17038 * If the to_ill has not joined this 17039 * group we need to tell the driver in 17040 * ilm_send_multicast_reqs. 17041 */ 17042 if (ilm_lookup_ill_v6(to_ill, 17043 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 17044 new_ilm->ilm_notify_driver = B_TRUE; 17045 17046 new_ilm->ilm_ill = to_ill; 17047 /* Add to the to_ill's list */ 17048 new_ilm->ilm_next = to_ill->ill_ilm; 17049 to_ill->ill_ilm = new_ilm; 17050 ASSERT(new_ilm->ilm_ipif == NULL); 17051 } 17052 if (ilm->ilm_refcnt == 0) { 17053 goto bottom; 17054 } else { 17055 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17056 CLEAR_SLIST(new_ilm->ilm_filter); 17057 ilmp = &ilm->ilm_next; 17058 } 17059 continue; 17060 } else { 17061 /* 17062 * ifindex = 0 means move everything pointing at 17063 * from_ill. We are doing this because the ill has 17064 * either FAILED or become INACTIVE. 17065 * 17066 * As we would like to move things later back to 17067 * from_ill, we want to retain the identity of this 17068 * ilm. Thus, we don't blindly increment the reference 17069 * count on the ilms matching the address alone. We 17070 * need to match on the ilm_orig_ifindex also. new_ilm 17071 * was obtained by matching ilm_orig_ifindex also. 17072 */ 17073 if (new_ilm != NULL) { 17074 /* 17075 * This is possible only if a previous restore 17076 * was incomplete i.e. restore to 17077 * ilm_orig_ifindex left some ilms because 17078 * of some failures. Thus when we are failing 17079 * again, we might find our old friends there.
17080 */ 17081 ip1dbg(("ilm_move_v6: FAILOVER, found new ilm" 17082 " on %s, group address %s\n", 17083 to_ill->ill_name, 17084 inet_ntop(AF_INET6, 17085 &ilm->ilm_v6addr, buf, 17086 sizeof (buf)))); 17087 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 17088 if (new_ilm->ilm_fmode != MODE_IS_EXCLUDE || 17089 !SLIST_IS_EMPTY(new_ilm->ilm_filter)) { 17090 new_ilm->ilm_is_new = B_TRUE; 17091 } 17092 } else { 17093 if (from_ill->ill_ilm_walker_cnt != 0) { 17094 new_ilm = (ilm_t *) 17095 mi_zalloc(sizeof (ilm_t)); 17096 if (new_ilm == NULL) { 17097 ip0dbg(("ilm_move_v6: " 17098 "FAILOVER of IPv6" 17099 " multicast address %s : " 17100 "from %s to" 17101 " %s failed : ENOMEM \n", 17102 inet_ntop(AF_INET6, 17103 &ilm->ilm_v6addr, buf, 17104 sizeof (buf)), 17105 from_ill->ill_name, 17106 to_ill->ill_name)); 17107 17108 ilmp = &ilm->ilm_next; 17109 continue; 17110 } 17111 *new_ilm = *ilm; 17112 new_ilm->ilm_filter = NULL; 17113 } else { 17114 *ilmp = ilm->ilm_next; 17115 new_ilm = ilm; 17116 } 17117 /* 17118 * If the to_ill has not joined this 17119 * group we need to tell the driver in 17120 * ilm_send_multicast_reqs. 17121 */ 17122 if (ilm_lookup_ill_v6(to_ill, 17123 &new_ilm->ilm_v6addr, ALL_ZONES) == NULL) 17124 new_ilm->ilm_notify_driver = B_TRUE; 17125 17126 /* Add to the to_ill's list */ 17127 new_ilm->ilm_next = to_ill->ill_ilm; 17128 to_ill->ill_ilm = new_ilm; 17129 ASSERT(ilm->ilm_ipif == NULL); 17130 new_ilm->ilm_ill = to_ill; 17131 new_ilm->ilm_is_new = B_TRUE; 17132 } 17133 17134 } 17135 17136 bottom: 17137 /* 17138 * Revert multicast filter state to (EXCLUDE, NULL). 17139 * new_ilm->ilm_is_new should already be set if needed. 17140 */ 17141 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17142 CLEAR_SLIST(new_ilm->ilm_filter); 17143 /* 17144 * We allocated/got a new ilm; free the old one. 17145 */ 17146 if (new_ilm != ilm) { 17147 if (from_ill->ill_ilm_walker_cnt == 0) { 17148 *ilmp = ilm->ilm_next; 17149 ilm->ilm_next = NULL; 17150 FREE_SLIST(ilm->ilm_filter); 17151 FREE_SLIST(ilm->ilm_pendsrcs); 17152 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 17153 FREE_SLIST(ilm->ilm_rtx.rtx_block); 17154 mi_free((char *)ilm); 17155 } else { 17156 ilm->ilm_flags |= ILM_DELETED; 17157 from_ill->ill_ilm_cleanup_reqd = 1; 17158 ilmp = &ilm->ilm_next; 17159 } 17160 } 17161 } 17162 } 17163 17164 /* 17165 * Move all the multicast memberships to to_ill. Called when 17166 * an ipif moves from "from_ill" to "to_ill". This function is slightly 17167 * different from its IPv6 counterpart, as multicast memberships are 17168 * associated with ills in IPv6. This function is called after every ipif 17169 * is moved, unlike IPv6, where it is called only once.
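 *
 * (A hedged user-level illustration of why the keying differs; the
 * sockets s4/s6 and values shown are hypothetical:
 *
 *	struct ip_mreq mreq;
 *	mreq.imr_multiaddr = group;
 *	mreq.imr_interface = ipif_address;	v4: keyed by ipif address
 *	(void) setsockopt(s4, IPPROTO_IP, IP_ADD_MEMBERSHIP,
 *	    &mreq, sizeof (mreq));
 *
 *	struct ipv6_mreq mreq6;
 *	mreq6.ipv6mr_multiaddr = group6;
 *	mreq6.ipv6mr_interface = ifindex;	v6: keyed by ifindex
 *	(void) setsockopt(s6, IPPROTO_IPV6, IPV6_JOIN_GROUP,
 *	    &mreq6, sizeof (mreq6));
 *
 * so v4 membership state naturally follows each ipif, while v6 state
 * follows the ill as a whole.)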
17170 */ 17171 static void 17172 ilm_move_v4(ill_t *from_ill, ill_t *to_ill, ipif_t *ipif) 17173 { 17174 ilm_t *ilm; 17175 ilm_t *ilm_next; 17176 ilm_t *new_ilm; 17177 ilm_t **ilmp; 17178 ip_stack_t *ipst = from_ill->ill_ipst; 17179 17180 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 17181 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 17182 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 17183 17184 ilmp = &from_ill->ill_ilm; 17185 for (ilm = from_ill->ill_ilm; ilm != NULL; ilm = ilm_next) { 17186 ilm_next = ilm->ilm_next; 17187 17188 if (ilm->ilm_flags & ILM_DELETED) { 17189 ilmp = &ilm->ilm_next; 17190 continue; 17191 } 17192 17193 ASSERT(ilm->ilm_ipif != NULL); 17194 17195 if (ilm->ilm_ipif != ipif) { 17196 ilmp = &ilm->ilm_next; 17197 continue; 17198 } 17199 17200 if (V4_PART_OF_V6(ilm->ilm_v6addr) == 17201 htonl(INADDR_ALLHOSTS_GROUP)) { 17202 new_ilm = ilm_lookup_ipif(ipif, 17203 V4_PART_OF_V6(ilm->ilm_v6addr)); 17204 if (new_ilm != NULL) { 17205 new_ilm->ilm_refcnt += ilm->ilm_refcnt; 17206 /* 17207 * We still need to deal with the from_ill. 17208 */ 17209 new_ilm->ilm_is_new = B_TRUE; 17210 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17211 CLEAR_SLIST(new_ilm->ilm_filter); 17212 goto delete_ilm; 17213 } 17214 /* 17215 * If we could not find one, e.g. the ipif is 17216 * still down on to_ill, we add this ilm 17217 * on the new ill to preserve the reference 17218 * count. 17219 */ 17220 } 17221 /* 17222 * When ipifs move, ilms always move with them 17223 * to the NEW ill. Thus we should never be 17224 * able to find the ilm till we really move it here. 17225 */ 17226 ASSERT(ilm_lookup_ipif(ipif, 17227 V4_PART_OF_V6(ilm->ilm_v6addr)) == NULL); 17228 17229 if (from_ill->ill_ilm_walker_cnt != 0) { 17230 new_ilm = (ilm_t *)mi_zalloc(sizeof (ilm_t)); 17231 if (new_ilm == NULL) { 17232 char buf[INET6_ADDRSTRLEN]; 17233 ip0dbg(("ilm_move_v4: FAILBACK of IPv4" 17234 " multicast address %s : " 17235 "from %s to" 17236 " %s failed : ENOMEM \n", 17237 inet_ntop(AF_INET, 17238 &ilm->ilm_v6addr, buf, 17239 sizeof (buf)), 17240 from_ill->ill_name, 17241 to_ill->ill_name)); 17242 17243 ilmp = &ilm->ilm_next; 17244 continue; 17245 } 17246 *new_ilm = *ilm; 17247 /* We don't want new_ilm linked to ilm's filter list */ 17248 new_ilm->ilm_filter = NULL; 17249 } else { 17250 /* Remove from the list */ 17251 *ilmp = ilm->ilm_next; 17252 new_ilm = ilm; 17253 } 17254 17255 /* 17256 * If we have never joined this group on the to_ill, 17257 * make sure we tell the driver. 17258 */ 17259 if (ilm_lookup_ill_v6(to_ill, &new_ilm->ilm_v6addr, 17260 ALL_ZONES) == NULL) 17261 new_ilm->ilm_notify_driver = B_TRUE; 17262 17263 /* Add to the to_ill's list */ 17264 new_ilm->ilm_next = to_ill->ill_ilm; 17265 to_ill->ill_ilm = new_ilm; 17266 new_ilm->ilm_is_new = B_TRUE; 17267 17268 /* 17269 * Revert multicast filter state to (EXCLUDE, NULL) 17270 */ 17271 new_ilm->ilm_fmode = MODE_IS_EXCLUDE; 17272 CLEAR_SLIST(new_ilm->ilm_filter); 17273 17274 /* 17275 * Delete only if we have allocated a new ilm.
17276 */ 17277 if (new_ilm != ilm) { 17278 delete_ilm: 17279 if (from_ill->ill_ilm_walker_cnt == 0) { 17280 /* Remove from the list */ 17281 *ilmp = ilm->ilm_next; 17282 ilm->ilm_next = NULL; 17283 FREE_SLIST(ilm->ilm_filter); 17284 FREE_SLIST(ilm->ilm_pendsrcs); 17285 FREE_SLIST(ilm->ilm_rtx.rtx_allow); 17286 FREE_SLIST(ilm->ilm_rtx.rtx_block); 17287 mi_free((char *)ilm); 17288 } else { 17289 ilm->ilm_flags |= ILM_DELETED; 17290 from_ill->ill_ilm_cleanup_reqd = 1; 17291 ilmp = &ilm->ilm_next; 17292 } 17293 } 17294 } 17295 } 17296 17297 static uint_t 17298 ipif_get_id(ill_t *ill, uint_t id) 17299 { 17300 uint_t unit; 17301 ipif_t *tipif; 17302 boolean_t found = B_FALSE; 17303 ip_stack_t *ipst = ill->ill_ipst; 17304 17305 /* 17306 * During failback, we want to go back to the same id 17307 * instead of the smallest id so that the original 17308 * configuration is maintained. id is non-zero in that 17309 * case. 17310 */ 17311 if (id != 0) { 17312 /* 17313 * While failing back, if we still have an ipif with 17314 * MAX_ADDRS_PER_IF, it means this will be replaced 17315 * as soon as we return from this function. It was 17316 * set to MAX_ADDRS_PER_IF by the caller so that 17317 * we can choose the smallest id. Thus we return zero 17318 * in that case, ignoring the hint. 17319 */ 17320 if (ill->ill_ipif->ipif_id == MAX_ADDRS_PER_IF) 17321 return (0); 17322 for (tipif = ill->ill_ipif; tipif != NULL; 17323 tipif = tipif->ipif_next) { 17324 if (tipif->ipif_id == id) { 17325 found = B_TRUE; 17326 break; 17327 } 17328 } 17329 /* 17330 * If somebody already plumbed another logical interface 17331 * with the same id, we won't be able to find it. 17332 */ 17333 if (!found) 17334 return (id); 17335 } 17336 for (unit = 0; unit <= ipst->ips_ip_addrs_per_if; unit++) { 17337 found = B_FALSE; 17338 for (tipif = ill->ill_ipif; tipif != NULL; 17339 tipif = tipif->ipif_next) { 17340 if (tipif->ipif_id == unit) { 17341 found = B_TRUE; 17342 break; 17343 } 17344 } 17345 if (!found) 17346 break; 17347 } 17348 return (unit); 17349 } 17350 17351 /* ARGSUSED */ 17352 static int 17353 ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, 17354 ipif_t **rep_ipif_ptr) 17355 { 17356 ill_t *from_ill; 17357 ipif_t *rep_ipif; 17358 uint_t unit; 17359 int err = 0; 17360 ipif_t *to_ipif; 17361 struct iocblk *iocp; 17362 boolean_t failback_cmd; 17363 boolean_t remove_ipif; 17364 int rc; 17365 ip_stack_t *ipst; 17366 17367 ASSERT(IAM_WRITER_ILL(to_ill)); 17368 ASSERT(IAM_WRITER_IPIF(ipif)); 17369 17370 iocp = (struct iocblk *)mp->b_rptr; 17371 failback_cmd = (iocp->ioc_cmd == SIOCLIFFAILBACK); 17372 remove_ipif = B_FALSE; 17373 17374 from_ill = ipif->ipif_ill; 17375 ipst = from_ill->ill_ipst; 17376 17377 ASSERT(MUTEX_HELD(&to_ill->ill_lock)); 17378 ASSERT(MUTEX_HELD(&from_ill->ill_lock)); 17379 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 17380 17381 /* 17382 * Don't move LINK LOCAL addresses as they are tied to 17383 * the physical interface. 17384 */ 17385 if (from_ill->ill_isv6 && 17386 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) { 17387 ipif->ipif_was_up = B_FALSE; 17388 IPIF_UNMARK_MOVING(ipif); 17389 return (0); 17390 } 17391 17392 /* 17393 * We set the ipif_id to maximum so that the search for 17394 * ipif_id will pick the lowest number i.e. 0 in the 17395 * following 2 cases: 17396 * 17397 * 1) We have a replacement ipif at the head of to_ill. 17398 * We can't remove it yet as we can exceed ip_addrs_per_if 17399 * on to_ill and hence the MOVE might fail. We want to 17400 * remove it only if we could move the ipif.
Thus, by 17401 * setting it to the MAX value, we make the search in 17402 * ipif_get_id return the zeroth id. 17403 * 17404 * 2) When DR pulls out the NIC and re-plumbs the interface, 17405 * we might just have a zero address plumbed on the ipif 17406 * with zero id in the case of IPv4. We remove that while 17407 * doing the failback. We want to remove it only if we 17408 * could move the ipif. Thus, by setting it to the MAX 17409 * value, we make the search in ipif_get_id return the 17410 * zeroth id. 17411 * 17412 * Both (1) and (2) are done only when we are moving 17413 * an ipif (either due to failover/failback) which originally 17414 * belonged to this interface i.e. the ipif_orig_ifindex is 17415 * the same as to_ill's ifindex. This is needed so that 17416 * FAILOVER from A -> B (A failed) followed by FAILOVER 17417 * from B -> A (B is being removed from the group) and 17418 * FAILBACK from A -> B restores the original configuration. 17419 * Without the check for orig_ifindex, the second FAILOVER 17420 * could make the ipif belonging to B replace A's zeroth 17421 * ipif, and the subsequent failback would re-create the 17422 * replacement ipif again. 17423 * 17424 * NOTE : We created the replacement ipif when we did a 17425 * FAILOVER (See below). We could check for FAILBACK and 17426 * then look for the replacement ipif to be removed. But we don't 17427 * want to do that because we want to allow the possibility 17428 * of a FAILOVER from A -> B (which creates the replacement ipif), 17429 * followed by a *FAILOVER* from B -> A instead of a FAILBACK 17430 * from B -> A. 17431 */ 17432 to_ipif = to_ill->ill_ipif; 17433 if ((to_ill->ill_phyint->phyint_ifindex == 17434 ipif->ipif_orig_ifindex) && 17435 IPIF_REPL_CHECK(to_ipif, failback_cmd)) { 17436 ASSERT(to_ipif->ipif_id == 0); 17437 remove_ipif = B_TRUE; 17438 to_ipif->ipif_id = MAX_ADDRS_PER_IF; 17439 } 17440 /* 17441 * Find the lowest logical unit number on the to_ill. 17442 * If we are failing back, try to get the original id 17443 * rather than the lowest one so that the original 17444 * configuration is maintained. 17445 * 17446 * XXX need a better scheme for this. 17447 */ 17448 if (failback_cmd) { 17449 unit = ipif_get_id(to_ill, ipif->ipif_orig_ipifid); 17450 } else { 17451 unit = ipif_get_id(to_ill, 0); 17452 } 17453 17454 /* Reset back to zero in case we fail below */ 17455 if (to_ipif->ipif_id == MAX_ADDRS_PER_IF) 17456 to_ipif->ipif_id = 0; 17457 17458 if (unit == ipst->ips_ip_addrs_per_if) { 17459 ipif->ipif_was_up = B_FALSE; 17460 IPIF_UNMARK_MOVING(ipif); 17461 return (EINVAL); 17462 } 17463 17464 /* 17465 * ipif is ready to move from "from_ill" to "to_ill". 17466 * 17467 * 1) If we are moving ipif with id zero, create a 17468 * replacement ipif for this ipif on from_ill. If this fails, 17469 * fail the MOVE operation. 17470 * 17471 * 2) Remove the replacement ipif on to_ill if any. 17472 * We could remove the replacement ipif when we are moving 17473 * the ipif with id zero. But what if somebody already 17474 * unplumbed it? Thus we always remove it if it is present. 17475 * We want to do it only if we are sure we are going to 17476 * move the ipif to to_ill, which is why there are no 17477 * returns due to error till ipif is linked to to_ill. 17478 * Note that the first ipif that we failback will always 17479 * be zero if it is present.
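 *
 * (A worked example with made-up names: with hme0 and hme1 in one
 * group, FAILOVER hme0 -> hme1 moves hme0's ipifs to hme1 and leaves
 * an INADDR_ANY replacement ipif behind as hme0:0; a later FAILBACK
 * hme1 -> hme0 consumes that replacement and the moved zeroth ipif
 * gets its original id back, restoring the original configuration.)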
17480 */ 17481 if (ipif->ipif_id == 0) { 17482 ipaddr_t inaddr_any = INADDR_ANY; 17483 17484 rep_ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED); 17485 if (rep_ipif == NULL) { 17486 ipif->ipif_was_up = B_FALSE; 17487 IPIF_UNMARK_MOVING(ipif); 17488 return (ENOMEM); 17489 } 17490 *rep_ipif = ipif_zero; 17491 /* 17492 * Before we put the ipif on the list, store the addresses 17493 * as mapped addresses as some of the ioctls e.g. SIOCGIFADDR 17494 * assume so. This logic is not any different from what 17495 * ipif_allocate does. 17496 */ 17497 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17498 &rep_ipif->ipif_v6lcl_addr); 17499 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17500 &rep_ipif->ipif_v6src_addr); 17501 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17502 &rep_ipif->ipif_v6subnet); 17503 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17504 &rep_ipif->ipif_v6net_mask); 17505 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17506 &rep_ipif->ipif_v6brd_addr); 17507 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 17508 &rep_ipif->ipif_v6pp_dst_addr); 17509 /* 17510 * We mark IPIF_NOFAILOVER so that this can never 17511 * move. 17512 */ 17513 rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; 17514 rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; 17515 rep_ipif->ipif_replace_zero = B_TRUE; 17516 mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, 17517 MUTEX_DEFAULT, NULL); 17518 rep_ipif->ipif_id = 0; 17519 rep_ipif->ipif_ire_type = ipif->ipif_ire_type; 17520 rep_ipif->ipif_ill = from_ill; 17521 rep_ipif->ipif_orig_ifindex = 17522 from_ill->ill_phyint->phyint_ifindex; 17523 /* Insert at head */ 17524 rep_ipif->ipif_next = from_ill->ill_ipif; 17525 from_ill->ill_ipif = rep_ipif; 17526 /* 17527 * We don't really care to let apps know about 17528 * this interface. 17529 */ 17530 } 17531 17532 if (remove_ipif) { 17533 /* 17534 * We set it to the MAX value above for this case to get 17535 * id zero. ASSERT that we did get it. 17536 */ 17537 ASSERT((to_ipif->ipif_id == 0) && (unit == 0)); 17538 rep_ipif = to_ipif; 17539 to_ill->ill_ipif = rep_ipif->ipif_next; 17540 rep_ipif->ipif_next = NULL; 17541 /* 17542 * If some apps scanned and found this interface, 17543 * it is time to let them know, so that they can 17544 * delete it. 17545 */ 17546 17547 *rep_ipif_ptr = rep_ipif; 17548 } 17549 17550 /* Get it out of the ILL interface list. */ 17551 ipif_remove(ipif, B_FALSE); 17552 17553 /* Assign the new ill */ 17554 ipif->ipif_ill = to_ill; 17555 ipif->ipif_id = unit; 17556 /* id has already been checked */ 17557 rc = ipif_insert(ipif, B_FALSE, B_FALSE); 17558 ASSERT(rc == 0); 17559 /* Let SCTP update its list */ 17560 sctp_move_ipif(ipif, from_ill, to_ill); 17561 /* 17562 * Handle the failover and failback of ipif_t between 17563 * ill_t's that have differing maximum mtu values. 17564 */ 17565 if (ipif->ipif_mtu > to_ill->ill_max_mtu) { 17566 if (ipif->ipif_saved_mtu == 0) { 17567 /* 17568 * As this ipif_t is moving to an ill_t 17569 * that has a lower ill_max_mtu, its 17570 * ipif_mtu needs to be saved so it can 17571 * be restored during failback or during 17572 * failover to an ill_t which has a 17573 * higher ill_max_mtu. 17574 */ 17575 ipif->ipif_saved_mtu = ipif->ipif_mtu; 17576 ipif->ipif_mtu = to_ill->ill_max_mtu; 17577 } else { 17578 /* 17579 * The ipif_t is, once again, moving to 17580 * an ill_t that has a lower maximum mtu 17581 * value.
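 *
 * (A worked example with made-up numbers: an ipif with ipif_mtu
 * 9000 moving to an ill whose ill_max_mtu is 1500 gets
 * ipif_saved_mtu = 9000 and ipif_mtu = 1500; a later move to a
 * 9000-capable ill restores ipif_mtu from ipif_saved_mtu and
 * clears the saved value, as coded below.)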
17582 */ 17583 ipif->ipif_mtu = to_ill->ill_max_mtu; 17584 } 17585 } else if (ipif->ipif_mtu < to_ill->ill_max_mtu && 17586 ipif->ipif_saved_mtu != 0) { 17587 /* 17588 * The mtu of this ipif_t had to be reduced 17589 * during an earlier failover; this is an 17590 * opportunity for it to be increased (either as 17591 * part of another failover or a failback). 17592 */ 17593 if (ipif->ipif_saved_mtu <= to_ill->ill_max_mtu) { 17594 ipif->ipif_mtu = ipif->ipif_saved_mtu; 17595 ipif->ipif_saved_mtu = 0; 17596 } else { 17597 ipif->ipif_mtu = to_ill->ill_max_mtu; 17598 } 17599 } 17600 17601 /* 17602 * We preserve all the other fields of the ipif including 17603 * ipif_saved_ire_mp. The routes that are saved here will 17604 * be recreated on the new interface and back on the old 17605 * interface when we move back. 17606 */ 17607 ASSERT(ipif->ipif_arp_del_mp == NULL); 17608 17609 return (err); 17610 } 17611 17612 static int 17613 ipif_move_all(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp, 17614 int ifindex, ipif_t **rep_ipif_ptr) 17615 { 17616 ipif_t *mipif; 17617 ipif_t *ipif_next; 17618 int err; 17619 17620 /* 17621 * We don't really try to MOVE back things if some of the 17622 * operations fail. The daemon will take care of moving again 17623 * later on. 17624 */ 17625 for (mipif = from_ill->ill_ipif; mipif != NULL; mipif = ipif_next) { 17626 ipif_next = mipif->ipif_next; 17627 if (!(mipif->ipif_flags & IPIF_NOFAILOVER) && 17628 (ifindex == 0 || ifindex == mipif->ipif_orig_ifindex)) { 17629 17630 err = ipif_move(mipif, to_ill, q, mp, rep_ipif_ptr); 17631 17632 /* 17633 * When the MOVE fails, it is the job of the 17634 * application to take care of this properly, 17635 * i.e. try again if it is ENOMEM. 17636 */ 17637 if (mipif->ipif_ill != from_ill) { 17638 /* 17639 * ipif has moved. 17640 * 17641 * Move the multicast memberships associated 17642 * with this ipif to the new ill. For IPv6, we 17643 * do it once after all the ipifs are moved 17644 * (in ill_move) as they are not associated 17645 * with ipifs. 17646 * 17647 * We need to move the ilms as the ipif has 17648 * already been moved to a new ill even 17649 * in the case of errors. If we don't move 17650 * them now, neither ilm_free(ipif) will find 17651 * the ilm when somebody unplumbs this ipif, 17652 * nor will ilm_delete(ilm) be able to find 17653 * the ilm. 17654 */ 17655 if (!from_ill->ill_isv6) 17656 ilm_move_v4(from_ill, to_ill, mipif); 17657 } 17658 17659 if (err != 0) 17660 return (err); 17661 } 17662 } 17663 return (0); 17664 } 17665 17666 static int 17667 ill_move(ill_t *from_ill, ill_t *to_ill, queue_t *q, mblk_t *mp) 17668 { 17669 int ifindex; 17670 int err; 17671 struct iocblk *iocp; 17672 ipif_t *ipif; 17673 ipif_t *rep_ipif_ptr = NULL; 17674 ipif_t *from_ipif = NULL; 17675 boolean_t check_rep_if = B_FALSE; 17676 ip_stack_t *ipst = from_ill->ill_ipst; 17677 17678 iocp = (struct iocblk *)mp->b_rptr; 17679 if (iocp->ioc_cmd == SIOCLIFFAILOVER) { 17680 /* 17681 * Move everything pointing at from_ill to to_ill. 17682 * We achieve this by passing in 0 as ifindex. 17683 */ 17684 ifindex = 0; 17685 } else { 17686 /* 17687 * Move everything pointing at from_ill whose original 17688 * ifindex (in connp, ipif, ilm) points at to_ill's ifindex. 17689 * We achieve this by passing in the ifindex rather than 0. 17690 * Multicast vifs, ilgs move implicitly because ipifs move.
17691 */ 17692 ASSERT(iocp->ioc_cmd == SIOCLIFFAILBACK); 17693 ifindex = to_ill->ill_phyint->phyint_ifindex; 17694 } 17695 17696 /* 17697 * Determine if there is at least one ipif that would move from 17698 * 'from_ill' to 'to_ill'. If so, it is possible that the replacement 17699 * ipif (if it exists) on the to_ill would be consumed as a result of 17700 * the move, in which case we need to quiesce the replacement ipif also. 17701 */ 17702 for (from_ipif = from_ill->ill_ipif; from_ipif != NULL; 17703 from_ipif = from_ipif->ipif_next) { 17704 if (((ifindex == 0) || 17705 (ifindex == from_ipif->ipif_orig_ifindex)) && 17706 !(from_ipif->ipif_flags & IPIF_NOFAILOVER)) { 17707 check_rep_if = B_TRUE; 17708 break; 17709 } 17710 } 17711 17712 17713 ill_down_ipifs(from_ill, mp, ifindex, B_TRUE); 17714 17715 GRAB_ILL_LOCKS(from_ill, to_ill); 17716 if ((ipif = ill_quiescent_to_move(from_ill)) != NULL) { 17717 (void) ipsq_pending_mp_add(NULL, ipif, q, 17718 mp, ILL_MOVE_OK); 17719 RELEASE_ILL_LOCKS(from_ill, to_ill); 17720 return (EINPROGRESS); 17721 } 17722 17723 /* Check if the replacement ipif is quiescent to delete */ 17724 if (check_rep_if && IPIF_REPL_CHECK(to_ill->ill_ipif, 17725 (iocp->ioc_cmd == SIOCLIFFAILBACK))) { 17726 to_ill->ill_ipif->ipif_state_flags |= 17727 IPIF_MOVING | IPIF_CHANGING; 17728 if ((ipif = ill_quiescent_to_move(to_ill)) != NULL) { 17729 (void) ipsq_pending_mp_add(NULL, ipif, q, 17730 mp, ILL_MOVE_OK); 17731 RELEASE_ILL_LOCKS(from_ill, to_ill); 17732 return (EINPROGRESS); 17733 } 17734 } 17735 RELEASE_ILL_LOCKS(from_ill, to_ill); 17736 17737 ASSERT(!MUTEX_HELD(&to_ill->ill_lock)); 17738 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 17739 GRAB_ILL_LOCKS(from_ill, to_ill); 17740 err = ipif_move_all(from_ill, to_ill, q, mp, ifindex, &rep_ipif_ptr); 17741 17742 /* ilm_move is done inside ipif_move for IPv4 */ 17743 if (err == 0 && from_ill->ill_isv6) 17744 ilm_move_v6(from_ill, to_ill, ifindex); 17745 17746 RELEASE_ILL_LOCKS(from_ill, to_ill); 17747 rw_exit(&ipst->ips_ill_g_lock); 17748 17749 /* 17750 * send rts messages and multicast messages. 17751 */ 17752 if (rep_ipif_ptr != NULL) { 17753 if (rep_ipif_ptr->ipif_recovery_id != 0) { 17754 (void) untimeout(rep_ipif_ptr->ipif_recovery_id); 17755 rep_ipif_ptr->ipif_recovery_id = 0; 17756 } 17757 ip_rts_ifmsg(rep_ipif_ptr); 17758 ip_rts_newaddrmsg(RTM_DELETE, 0, rep_ipif_ptr); 17759 #ifdef DEBUG 17760 ipif_trace_cleanup(rep_ipif_ptr); 17761 #endif 17762 mi_free(rep_ipif_ptr); 17763 } 17764 17765 conn_move_ill(from_ill, to_ill, ifindex); 17766 17767 return (err); 17768 } 17769 17770 /* 17771 * Used to extract arguments for FAILOVER/FAILBACK ioctls. 17772 * Also checks for the validity of the arguments. 17773 * Note: We are already exclusive inside the from group. 17774 * It is up to the caller to release refcnt on the to_ill's.
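 *
 * (A hedged user-level sketch of the ioctl these arguments come from;
 * the socket s and interface names are hypothetical, and note that
 * lifr_addr must carry AF_UNSPEC, as checked below:
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;
 *	lifr.lifr_movetoindex = if_nametoindex("hme1");
 *	if (ioctl(s, SIOCLIFFAILOVER, (caddr_t)&lifr) < 0)
 *		perror("SIOCLIFFAILOVER");)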
17775 */ 17776 static int 17777 ip_extract_move_args(queue_t *q, mblk_t *mp, ill_t **ill_from_v4, 17778 ill_t **ill_from_v6, ill_t **ill_to_v4, ill_t **ill_to_v6) 17779 { 17780 int dst_index; 17781 ipif_t *ipif_v4, *ipif_v6; 17782 struct lifreq *lifr; 17783 mblk_t *mp1; 17784 boolean_t exists; 17785 sin_t *sin; 17786 int err = 0; 17787 ip_stack_t *ipst; 17788 17789 if (CONN_Q(q)) 17790 ipst = CONNQ_TO_IPST(q); 17791 else 17792 ipst = ILLQ_TO_IPST(q); 17793 17794 17795 if ((mp1 = mp->b_cont) == NULL) 17796 return (EPROTO); 17797 17798 if ((mp1 = mp1->b_cont) == NULL) 17799 return (EPROTO); 17800 17801 lifr = (struct lifreq *)mp1->b_rptr; 17802 sin = (sin_t *)&lifr->lifr_addr; 17803 17804 /* 17805 * We operate on both IPv4 and IPv6. Thus, we don't allow IPv4/IPv6 17806 * specific operations. 17807 */ 17808 if (sin->sin_family != AF_UNSPEC) 17809 return (EINVAL); 17810 17811 /* 17812 * Get ipif with id 0. We are writer on the from ill. So we can pass 17813 * NULLs for the last 4 args and we know the lookup won't fail 17814 * with EINPROGRESS. 17815 */ 17816 ipif_v4 = ipif_lookup_on_name(lifr->lifr_name, 17817 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_FALSE, 17818 ALL_ZONES, NULL, NULL, NULL, NULL, ipst); 17819 ipif_v6 = ipif_lookup_on_name(lifr->lifr_name, 17820 mi_strlen(lifr->lifr_name), B_FALSE, &exists, B_TRUE, 17821 ALL_ZONES, NULL, NULL, NULL, NULL, ipst); 17822 17823 if (ipif_v4 == NULL && ipif_v6 == NULL) 17824 return (ENXIO); 17825 17826 if (ipif_v4 != NULL) { 17827 ASSERT(ipif_v4->ipif_refcnt != 0); 17828 if (ipif_v4->ipif_id != 0) { 17829 err = EINVAL; 17830 goto done; 17831 } 17832 17833 ASSERT(IAM_WRITER_IPIF(ipif_v4)); 17834 *ill_from_v4 = ipif_v4->ipif_ill; 17835 } 17836 17837 if (ipif_v6 != NULL) { 17838 ASSERT(ipif_v6->ipif_refcnt != 0); 17839 if (ipif_v6->ipif_id != 0) { 17840 err = EINVAL; 17841 goto done; 17842 } 17843 17844 ASSERT(IAM_WRITER_IPIF(ipif_v6)); 17845 *ill_from_v6 = ipif_v6->ipif_ill; 17846 } 17847 17848 err = 0; 17849 dst_index = lifr->lifr_movetoindex; 17850 *ill_to_v4 = ill_lookup_on_ifindex(dst_index, B_FALSE, 17851 q, mp, ip_process_ioctl, &err, ipst); 17852 if (err != 0) { 17853 /* 17854 * There could be only v6. 17855 */ 17856 if (err != ENXIO) 17857 goto done; 17858 err = 0; 17859 } 17860 17861 *ill_to_v6 = ill_lookup_on_ifindex(dst_index, B_TRUE, 17862 q, mp, ip_process_ioctl, &err, ipst); 17863 if (err != 0) { 17864 if (err != ENXIO) 17865 goto done; 17866 if (*ill_to_v4 == NULL) { 17867 err = ENXIO; 17868 goto done; 17869 } 17870 err = 0; 17871 } 17872 17873 /* 17874 * If we have something to MOVE, i.e. "from" is not NULL, 17875 * then "to" should be non-NULL. 17876 */ 17877 if ((*ill_from_v4 != NULL && *ill_to_v4 == NULL) || 17878 (*ill_from_v6 != NULL && *ill_to_v6 == NULL)) { 17879 err = EINVAL; 17880 } 17881 17882 done: 17883 if (ipif_v4 != NULL) 17884 ipif_refrele(ipif_v4); 17885 if (ipif_v6 != NULL) 17886 ipif_refrele(ipif_v6); 17887 return (err); 17888 } 17889 17890 /* 17891 * FAILOVER and FAILBACK are modelled as MOVE operations. 17892 * 17893 * We don't check whether the MOVE is within the same group or 17894 * not, because this ioctl can be used as a generic mechanism 17895 * to failover from interface A to B, though things will function 17896 * only if they are really part of the same group. Moreover, 17897 * all ipifs may be down and hence temporarily out of the group. 17898 * 17899 * ipif's that need to be moved are first brought down; V4 ipifs are brought 17900 * down first and then V6. For each we wait for the ipif's to become quiescent.
17901 * Bringing down the ipifs ensures that all ires pointing to these ipifs 17902 * have been deleted and there are no active references. Once quiescent the 17903 * ipif's are moved and brought up on the new ill. 17904 * 17905 * Normally the source ill and destination ill belong to the same IPMP group 17906 * and hence the same ipsq_t. In the event they don't belong to the same 17907 * group the two ipsq's are first merged into one ipsq - that of the 17908 * to_ill. The multicast memberships on the source and destination ill cannot 17909 * change during the move operation since multicast joins/leaves also have to 17910 * execute on the same ipsq and are hence serialized. 17911 */ 17912 /* ARGSUSED */ 17913 int 17914 ip_sioctl_move(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17915 ip_ioctl_cmd_t *ipip, void *ifreq) 17916 { 17917 ill_t *ill_to_v4 = NULL; 17918 ill_t *ill_to_v6 = NULL; 17919 ill_t *ill_from_v4 = NULL; 17920 ill_t *ill_from_v6 = NULL; 17921 int err = 0; 17922 17923 /* 17924 * Set up the from and to ill's; we can get EINPROGRESS only for 17925 * the to_ill's. 17926 */ 17927 err = ip_extract_move_args(q, mp, &ill_from_v4, &ill_from_v6, 17928 &ill_to_v4, &ill_to_v6); 17929 17930 if (err != 0) { 17931 ip0dbg(("ip_sioctl_move: extract args failed\n")); 17932 goto done; 17933 } 17934 17935 /* 17936 * nothing to do. 17937 */ 17938 if ((ill_from_v4 != NULL) && (ill_from_v4 == ill_to_v4)) { 17939 goto done; 17940 } 17941 17942 /* 17943 * nothing to do. 17944 */ 17945 if ((ill_from_v6 != NULL) && (ill_from_v6 == ill_to_v6)) { 17946 goto done; 17947 } 17948 17949 /* 17950 * Mark the ill as changing. 17951 * The ILL_CHANGING flag is cleared when the ipif's are brought up 17952 * in ill_up_ipifs; in case of error they are cleared below. 17953 */ 17954 17955 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 17956 if (ill_from_v4 != NULL) 17957 ill_from_v4->ill_state_flags |= ILL_CHANGING; 17958 if (ill_from_v6 != NULL) 17959 ill_from_v6->ill_state_flags |= ILL_CHANGING; 17960 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 17961 17962 /* 17963 * Make sure that both src and dst are 17964 * in the same syncq group. If not, make it happen. 17965 * We are not holding any locks because we are the writer 17966 * on the from_ipsq and we will hold locks in ill_merge_groups 17967 * to protect to_ipsq against changing. 17968 */ 17969 if (ill_from_v4 != NULL) { 17970 if (ill_from_v4->ill_phyint->phyint_ipsq != 17971 ill_to_v4->ill_phyint->phyint_ipsq) { 17972 err = ill_merge_groups(ill_from_v4, ill_to_v4, 17973 NULL, mp, q); 17974 goto err_ret; 17975 17976 } 17977 ASSERT(!MUTEX_HELD(&ill_to_v4->ill_lock)); 17978 } else { 17979 17980 if (ill_from_v6->ill_phyint->phyint_ipsq != 17981 ill_to_v6->ill_phyint->phyint_ipsq) { 17982 err = ill_merge_groups(ill_from_v6, ill_to_v6, 17983 NULL, mp, q); 17984 goto err_ret; 17985 17986 } 17987 ASSERT(!MUTEX_HELD(&ill_to_v6->ill_lock)); 17988 } 17989 17990 /* 17991 * Now that the ipsq's have been merged and we are the writer, 17992 * let's mark to_ill as changing as well. 17993 */ 17994 17995 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 17996 if (ill_to_v4 != NULL) 17997 ill_to_v4->ill_state_flags |= ILL_CHANGING; 17998 if (ill_to_v6 != NULL) 17999 ill_to_v6->ill_state_flags |= ILL_CHANGING; 18000 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 18001 18002 /* 18003 * It's ok for us to proceed with the move even if 18004 * ill_pending_mp is non-null on one of the from ill's as the reply 18005 * should not be looking at the ipif, it should only care about the 18006 * ill itself.
18007 */ 18008 18009 /* 18010 * lets move ipv4 first. 18011 */ 18012 if (ill_from_v4 != NULL) { 18013 ASSERT(IAM_WRITER_ILL(ill_to_v4)); 18014 ill_from_v4->ill_move_in_progress = B_TRUE; 18015 ill_to_v4->ill_move_in_progress = B_TRUE; 18016 ill_to_v4->ill_move_peer = ill_from_v4; 18017 ill_from_v4->ill_move_peer = ill_to_v4; 18018 err = ill_move(ill_from_v4, ill_to_v4, q, mp); 18019 } 18020 18021 /* 18022 * Now lets move ipv6. 18023 */ 18024 if (err == 0 && ill_from_v6 != NULL) { 18025 ASSERT(IAM_WRITER_ILL(ill_to_v6)); 18026 ill_from_v6->ill_move_in_progress = B_TRUE; 18027 ill_to_v6->ill_move_in_progress = B_TRUE; 18028 ill_to_v6->ill_move_peer = ill_from_v6; 18029 ill_from_v6->ill_move_peer = ill_to_v6; 18030 err = ill_move(ill_from_v6, ill_to_v6, q, mp); 18031 } 18032 18033 err_ret: 18034 /* 18035 * EINPROGRESS means we are waiting for the ipif's that need to be 18036 * moved to become quiescent. 18037 */ 18038 if (err == EINPROGRESS) { 18039 goto done; 18040 } 18041 18042 /* 18043 * if err is set ill_up_ipifs will not be called 18044 * lets clear the flags. 18045 */ 18046 18047 GRAB_ILL_LOCKS(ill_to_v4, ill_to_v6); 18048 GRAB_ILL_LOCKS(ill_from_v4, ill_from_v6); 18049 /* 18050 * Some of the clearing may be redundant. But it is simple 18051 * not making any extra checks. 18052 */ 18053 if (ill_from_v6 != NULL) { 18054 ill_from_v6->ill_move_in_progress = B_FALSE; 18055 ill_from_v6->ill_move_peer = NULL; 18056 ill_from_v6->ill_state_flags &= ~ILL_CHANGING; 18057 } 18058 if (ill_from_v4 != NULL) { 18059 ill_from_v4->ill_move_in_progress = B_FALSE; 18060 ill_from_v4->ill_move_peer = NULL; 18061 ill_from_v4->ill_state_flags &= ~ILL_CHANGING; 18062 } 18063 if (ill_to_v6 != NULL) { 18064 ill_to_v6->ill_move_in_progress = B_FALSE; 18065 ill_to_v6->ill_move_peer = NULL; 18066 ill_to_v6->ill_state_flags &= ~ILL_CHANGING; 18067 } 18068 if (ill_to_v4 != NULL) { 18069 ill_to_v4->ill_move_in_progress = B_FALSE; 18070 ill_to_v4->ill_move_peer = NULL; 18071 ill_to_v4->ill_state_flags &= ~ILL_CHANGING; 18072 } 18073 18074 /* 18075 * Check for setting INACTIVE, if STANDBY is set and FAILED is not set. 18076 * Do this always to maintain proper state i.e even in case of errors. 18077 * As phyint_inactive looks at both v4 and v6 interfaces, 18078 * we need not call on both v4 and v6 interfaces. 18079 */ 18080 if (ill_from_v4 != NULL) { 18081 if ((ill_from_v4->ill_phyint->phyint_flags & 18082 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 18083 phyint_inactive(ill_from_v4->ill_phyint); 18084 } 18085 } else if (ill_from_v6 != NULL) { 18086 if ((ill_from_v6->ill_phyint->phyint_flags & 18087 (PHYI_STANDBY | PHYI_FAILED)) == PHYI_STANDBY) { 18088 phyint_inactive(ill_from_v6->ill_phyint); 18089 } 18090 } 18091 18092 if (ill_to_v4 != NULL) { 18093 if (ill_to_v4->ill_phyint->phyint_flags & PHYI_INACTIVE) { 18094 ill_to_v4->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 18095 } 18096 } else if (ill_to_v6 != NULL) { 18097 if (ill_to_v6->ill_phyint->phyint_flags & PHYI_INACTIVE) { 18098 ill_to_v6->ill_phyint->phyint_flags &= ~PHYI_INACTIVE; 18099 } 18100 } 18101 18102 RELEASE_ILL_LOCKS(ill_to_v4, ill_to_v6); 18103 RELEASE_ILL_LOCKS(ill_from_v4, ill_from_v6); 18104 18105 no_err: 18106 /* 18107 * lets bring the interfaces up on the to_ill. 18108 */ 18109 if (err == 0) { 18110 err = ill_up_ipifs(ill_to_v4 == NULL ? 
ill_to_v6:ill_to_v4, 18111 q, mp); 18112 } 18113 18114 if (err == 0) { 18115 if (ill_from_v4 != NULL && ill_to_v4 != NULL) 18116 ilm_send_multicast_reqs(ill_from_v4, ill_to_v4); 18117 18118 if (ill_from_v6 != NULL && ill_to_v6 != NULL) 18119 ilm_send_multicast_reqs(ill_from_v6, ill_to_v6); 18120 } 18121 done: 18122 18123 if (ill_to_v4 != NULL) { 18124 ill_refrele(ill_to_v4); 18125 } 18126 if (ill_to_v6 != NULL) { 18127 ill_refrele(ill_to_v6); 18128 } 18129 18130 return (err); 18131 } 18132 18133 static void 18134 ill_dl_down(ill_t *ill) 18135 { 18136 /* 18137 * The ill is down; unbind but stay attached since we're still 18138 * associated with a PPA. If we have negotiated DLPI capabilities 18139 * with the data link service provider (IDS_OK) then reset them. 18140 * The interval between unbinding and rebinding is potentially 18141 * unbounded; hence we cannot assume things will be the same. 18142 * The DLPI capabilities will be probed again when the data link 18143 * is brought up. 18144 */ 18145 mblk_t *mp = ill->ill_unbind_mp; 18146 hook_nic_event_t *info; 18147 18148 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); 18149 18150 ill->ill_unbind_mp = NULL; 18151 if (mp != NULL) { 18152 ip1dbg(("ill_dl_down: %s (%u) for %s\n", 18153 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, 18154 ill->ill_name)); 18155 mutex_enter(&ill->ill_lock); 18156 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS; 18157 mutex_exit(&ill->ill_lock); 18158 /* 18159 * Reset the capabilities if the negotiation is done or is 18160 * still in progress. Note that ill_capability_reset() will 18161 * set ill_dlpi_capab_state to IDS_UNKNOWN, so the subsequent 18162 * DL_CAPABILITY_ACK and DL_NOTE_CAPAB_RENEG will be ignored. 18163 * 18164 * Further, reset ill_capab_reneg to be B_FALSE so that the 18165 * subsequent DL_CAPABILITY_ACK can be ignored, to prevent 18166 * the capabilities renegotiation from happening. 18167 */ 18168 if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) 18169 ill_capability_reset(ill); 18170 ill->ill_capab_reneg = B_FALSE; 18171 18172 ill_dlpi_send(ill, mp); 18173 } 18174 18175 /* 18176 * Toss all of our multicast memberships. We could keep them, but 18177 * then we'd have to do bookkeeping of any joins and leaves performed 18178 * by the application while the interface is down (we can't just 18179 * issue them because arp cannot currently process AR_ENTRY_SQUERY's 18180 * on a downed interface). 18181 */ 18182 ill_leave_multicast(ill); 18183 18184 mutex_enter(&ill->ill_lock); 18185 18186 ill->ill_dl_up = 0; 18187 18188 if ((info = ill->ill_nic_event_info) != NULL) { 18189 ip2dbg(("ill_dl_down:unexpected nic event %d attached for %s\n", 18190 info->hne_event, ill->ill_name)); 18191 if (info->hne_data != NULL) 18192 kmem_free(info->hne_data, info->hne_datalen); 18193 kmem_free(info, sizeof (hook_nic_event_t)); 18194 } 18195 18196 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 18197 if (info != NULL) { 18198 ip_stack_t *ipst = ill->ill_ipst; 18199 18200 info->hne_nic = ill->ill_phyint->phyint_hook_ifindex; 18201 info->hne_lif = 0; 18202 info->hne_event = NE_DOWN; 18203 info->hne_data = NULL; 18204 info->hne_datalen = 0; 18205 info->hne_family = ill->ill_isv6 ?
18206 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 18207 } else 18208 ip2dbg(("ill_dl_down: could not attach DOWN nic event " 18209 "information for %s (ENOMEM)\n", ill->ill_name)); 18210 18211 ill->ill_nic_event_info = info; 18212 18213 mutex_exit(&ill->ill_lock); 18214 } 18215 18216 static void 18217 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 18218 { 18219 union DL_primitives *dlp; 18220 t_uscalar_t prim; 18221 18222 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 18223 18224 dlp = (union DL_primitives *)mp->b_rptr; 18225 prim = dlp->dl_primitive; 18226 18227 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 18228 dl_primstr(prim), prim, ill->ill_name)); 18229 18230 switch (prim) { 18231 case DL_PHYS_ADDR_REQ: 18232 { 18233 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 18234 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 18235 break; 18236 } 18237 case DL_BIND_REQ: 18238 mutex_enter(&ill->ill_lock); 18239 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 18240 mutex_exit(&ill->ill_lock); 18241 break; 18242 } 18243 18244 /* 18245 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 18246 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 18247 * we only wait for the ACK of the DL_UNBIND_REQ. 18248 */ 18249 mutex_enter(&ill->ill_lock); 18250 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 18251 (prim == DL_UNBIND_REQ)) { 18252 ill->ill_dlpi_pending = prim; 18253 } 18254 mutex_exit(&ill->ill_lock); 18255 18256 putnext(ill->ill_wq, mp); 18257 } 18258 18259 /* 18260 * Helper function for ill_dlpi_send(). 18261 */ 18262 /* ARGSUSED */ 18263 static void 18264 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 18265 { 18266 ill_dlpi_send((ill_t *)q->q_ptr, mp); 18267 } 18268 18269 /* 18270 * Send a DLPI control message to the driver but make sure there 18271 * is only one outstanding message. Uses ill_dlpi_pending to tell 18272 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 18273 * when an ACK or a NAK is received to process the next queued message. 18274 */ 18275 void 18276 ill_dlpi_send(ill_t *ill, mblk_t *mp) 18277 { 18278 mblk_t **mpp; 18279 18280 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 18281 18282 /* 18283 * To ensure that any DLPI requests for current exclusive operation 18284 * are always completely sent before any DLPI messages for other 18285 * operations, require writer access before enqueuing. 18286 */ 18287 if (!IAM_WRITER_ILL(ill)) { 18288 ill_refhold(ill); 18289 /* qwriter_ip() does the ill_refrele() */ 18290 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 18291 NEW_OP, B_TRUE); 18292 return; 18293 } 18294 18295 mutex_enter(&ill->ill_lock); 18296 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 18297 /* Must queue message. Tail insertion */ 18298 mpp = &ill->ill_dlpi_deferred; 18299 while (*mpp != NULL) 18300 mpp = &((*mpp)->b_next); 18301 18302 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 18303 ill->ill_name)); 18304 18305 *mpp = mp; 18306 mutex_exit(&ill->ill_lock); 18307 return; 18308 } 18309 mutex_exit(&ill->ill_lock); 18310 ill_dlpi_dispatch(ill, mp); 18311 } 18312 18313 /* 18314 * Send all deferred DLPI messages without waiting for their ACKs. 18315 */ 18316 void 18317 ill_dlpi_send_deferred(ill_t *ill) 18318 { 18319 mblk_t *mp, *nextmp; 18320 18321 /* 18322 * Clear ill_dlpi_pending so that the message is not queued in 18323 * ill_dlpi_send(). 
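 *
 * (For contrast, a sketch of the serialized flow that ill_dlpi_send()
 * and ill_dlpi_done() above normally implement:
 *
 *	ill_dlpi_send(ill, mp1);	dispatched; ill_dlpi_pending set
 *	ill_dlpi_send(ill, mp2);	deferred behind mp1
 *	... the ACK/NAK for mp1 arrives ...
 *	ill_dlpi_done(ill, prim1);	dispatches mp2 next
 *
 * Here, instead, every deferred message is pushed out back to back
 * without waiting for the ACKs.)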
18324 */ 18325 mutex_enter(&ill->ill_lock); 18326 ill->ill_dlpi_pending = DL_PRIM_INVAL; 18327 mp = ill->ill_dlpi_deferred; 18328 ill->ill_dlpi_deferred = NULL; 18329 mutex_exit(&ill->ill_lock); 18330 18331 for (; mp != NULL; mp = nextmp) { 18332 nextmp = mp->b_next; 18333 mp->b_next = NULL; 18334 ill_dlpi_send(ill, mp); 18335 } 18336 } 18337 18338 /* 18339 * Check if the DLPI primitive `prim' is pending; print a warning if not. 18340 */ 18341 boolean_t 18342 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 18343 { 18344 t_uscalar_t pending; 18345 18346 mutex_enter(&ill->ill_lock); 18347 if (ill->ill_dlpi_pending == prim) { 18348 mutex_exit(&ill->ill_lock); 18349 return (B_TRUE); 18350 } 18351 18352 /* 18353 * During teardown, ill_dlpi_dispatch() will send DLPI requests 18354 * without waiting, so don't print any warnings in that case. 18355 */ 18356 if (ill->ill_state_flags & ILL_CONDEMNED) { 18357 mutex_exit(&ill->ill_lock); 18358 return (B_FALSE); 18359 } 18360 pending = ill->ill_dlpi_pending; 18361 mutex_exit(&ill->ill_lock); 18362 18363 if (pending == DL_PRIM_INVAL) { 18364 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 18365 "received unsolicited ack for %s on %s\n", 18366 dl_primstr(prim), ill->ill_name); 18367 } else { 18368 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 18369 "received unexpected ack for %s on %s (expecting %s)\n", 18370 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 18371 } 18372 return (B_FALSE); 18373 } 18374 18375 /* 18376 * Called when an DLPI control message has been acked or nacked to 18377 * send down the next queued message (if any). 18378 */ 18379 void 18380 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 18381 { 18382 mblk_t *mp; 18383 18384 ASSERT(IAM_WRITER_ILL(ill)); 18385 mutex_enter(&ill->ill_lock); 18386 18387 ASSERT(prim != DL_PRIM_INVAL); 18388 ASSERT(ill->ill_dlpi_pending == prim); 18389 18390 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 18391 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 18392 18393 if ((mp = ill->ill_dlpi_deferred) == NULL) { 18394 ill->ill_dlpi_pending = DL_PRIM_INVAL; 18395 cv_signal(&ill->ill_cv); 18396 mutex_exit(&ill->ill_lock); 18397 return; 18398 } 18399 18400 ill->ill_dlpi_deferred = mp->b_next; 18401 mp->b_next = NULL; 18402 mutex_exit(&ill->ill_lock); 18403 18404 ill_dlpi_dispatch(ill, mp); 18405 } 18406 18407 void 18408 conn_delete_ire(conn_t *connp, caddr_t arg) 18409 { 18410 ipif_t *ipif = (ipif_t *)arg; 18411 ire_t *ire; 18412 18413 /* 18414 * Look at the cached ires on conns which has pointers to ipifs. 18415 * We just call ire_refrele which clears up the reference 18416 * to ire. Called when a conn closes. Also called from ipif_free 18417 * to cleanup indirect references to the stale ipif via the cached ire. 18418 */ 18419 mutex_enter(&connp->conn_lock); 18420 ire = connp->conn_ire_cache; 18421 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { 18422 connp->conn_ire_cache = NULL; 18423 mutex_exit(&connp->conn_lock); 18424 IRE_REFRELE_NOTR(ire); 18425 return; 18426 } 18427 mutex_exit(&connp->conn_lock); 18428 18429 } 18430 18431 /* 18432 * Some operations (illgrp_delete(), ipif_down()) conditionally delete a number 18433 * of IREs. Those IREs may have been previously cached in the conn structure. 18434 * This ipcl_walk() walker function releases all references to such IREs based 18435 * on the condemned flag. 
18436 */ 18437 /* ARGSUSED */ 18438 void 18439 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) 18440 { 18441 ire_t *ire; 18442 18443 mutex_enter(&connp->conn_lock); 18444 ire = connp->conn_ire_cache; 18445 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { 18446 connp->conn_ire_cache = NULL; 18447 mutex_exit(&connp->conn_lock); 18448 IRE_REFRELE_NOTR(ire); 18449 return; 18450 } 18451 mutex_exit(&connp->conn_lock); 18452 } 18453 18454 /* 18455 * Take down a specific interface, but don't lose any information about it. 18456 * Also delete the interface from its interface group (ifgrp). 18457 * (Always called as writer.) 18458 * This function goes through the down sequence even if the interface is 18459 * already down. There are 2 reasons. 18460 * a. Currently we permit interface routes that depend on down interfaces 18461 * to be added. This behaviour itself is questionable. However it appears 18462 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 18463 * time. We go thru the cleanup in order to remove these routes. 18464 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 18465 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 18466 * down, but we need to clean up, i.e. do ill_dl_down and 18467 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 18468 * 18469 * IP-MT notes: 18470 * 18471 * Model of reference to interfaces. 18472 * 18473 * The following members in ipif_t track references to the ipif. 18474 * int ipif_refcnt; Active reference count 18475 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif 18476 * The following members in ill_t track references to the ill. 18477 * int ill_refcnt; active refcnt 18478 * uint_t ill_ire_cnt; Number of ires referencing ill 18479 * uint_t ill_nce_cnt; Number of nces referencing ill 18480 * 18481 * Reference to an ipif or ill can be obtained in any of the following ways. 18482 * 18483 * Through the ipif_lookup_* / ill_lookup_* lookup functions 18484 * Pointers to ipif / ill from other data structures viz ire and conn. 18485 * Implicit reference to the ipif / ill by holding a reference to the ire. 18486 * 18487 * The ipif/ill lookup functions return a reference held ipif / ill. 18488 * ipif_refcnt and ill_refcnt track the reference counts respectively. 18489 * This is a purely dynamic reference count associated with threads holding 18490 * references to the ipif / ill. Pointers from other structures do not 18491 * count towards this reference count. 18492 * 18493 * ipif_ire_cnt/ill_ire_cnt is the number of ire's associated with the 18494 * ipif/ill. This is incremented whenever a new ire is created referencing the 18495 * ipif/ill. This is done atomically inside ire_add_v[46] where the ire is 18496 * actually added to the ire hash table. The count is decremented in 18497 * ire_inactive where the ire is destroyed. 18498 * 18499 * nce's reference ill's thru nce_ill and the count of nce's associated with 18500 * an ill is recorded in ill_nce_cnt. This is incremented atomically in 18501 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the 18502 * table. Similarly it is decremented in ndp_inactive() where the nce 18503 * is destroyed. 18504 * 18505 * Flow of ioctls involving interface down/up 18506 * 18507 * The following is the sequence of an attempt to set some critical flags on an 18508 * up interface.
18509 * ip_sioctl_flags 18510 * ipif_down 18511 * wait for ipif to be quiescent 18512 * ipif_down_tail 18513 * ip_sioctl_flags_tail 18514 * 18515 * All set ioctls that involve down/up sequence would have a skeleton similar 18516 * to the above. All the *tail functions are called after the refcounts have 18517 * dropped to the appropriate values. 18518 * 18519 * The mechanism to quiesce an ipif is as follows. 18520 * 18521 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed 18522 * on the ipif. Callers either pass a flag requesting wait or the lookup 18523 * functions will return NULL. 18524 * 18525 * Delete all ires referencing this ipif 18526 * 18527 * Any thread attempting to do an ipif_refhold on an ipif that has been 18528 * obtained thru a cached pointer will first make sure that 18529 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then 18530 * increment the refcount. 18531 * 18532 * The above guarantees that the ipif refcount will eventually come down to 18533 * zero and the ipif will quiesce, once all threads that currently hold a 18534 * reference to the ipif refrelease the ipif. The ipif is quiescent after the 18535 * ipif_refcount has dropped to zero and all ire's associated with this ipif 18536 * have also been ire_inactive'd. i.e. when ipif_ire_cnt and ipif_refcnt both 18537 * drop to zero. 18538 * 18539 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval. 18540 * 18541 * Threads trying to look up an ipif or ill can pass a flag requesting 18542 * wait and restart if the ipif / ill cannot be looked up currently. 18543 * E.g. bind and route operations (e.g. route add / delete) cannot return 18544 * failure if the ipif is currently undergoing an exclusive operation, and 18545 * hence pass the flag. The mblk is then enqueued in the ipsq and the operation 18546 * is restarted by ipsq_exit() when the currently exclusive ioctl completes. 18547 * The lookup and enqueue are atomic using the ill_lock and ipsq_lock. The 18548 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 18549 * change while the ill_lock is held. Before dropping the ill_lock we acquire 18550 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 18551 * until we release the ipsq_lock, even though the ill/ipif state flags 18552 * can change after we drop the ill_lock. 18553 * 18554 * An attempt to send out a packet using an ipif that is currently 18555 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this 18556 * operation and restart it later when the exclusive condition on the ipif ends. 18557 * This is an example of not passing the wait flag to the lookup functions. For 18558 * example an attempt to refhold and use conn->conn_multicast_ipif and send 18559 * out a multicast packet on that ipif will fail while the ipif is 18560 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is 18561 * currently IPIF_CHANGING will also fail.
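 *
 * A minimal sketch of the lookup-side convention described above, as
 * used by the lookup functions in this file (error handling and the
 * enqueue-and-wait variant abbreviated):
 *
 *      mutex_enter(&ill->ill_lock);
 *      if (IPIF_CAN_LOOKUP(ipif)) {
 *              ipif_refhold_locked(ipif);      refcount bumped under ill_lock
 *      } else if (IPIF_CAN_WAIT(ipif, q)) {
 *              ... ipsq_enq() the mblk; ipsq_exit() restarts it later ...
 *      }
 *      mutex_exit(&ill->ill_lock);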
18562 */ 18563 int 18564 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 18565 { 18566 ill_t *ill = ipif->ipif_ill; 18567 phyint_t *phyi; 18568 conn_t *connp; 18569 boolean_t success; 18570 boolean_t ipif_was_up = B_FALSE; 18571 ip_stack_t *ipst = ill->ill_ipst; 18572 18573 ASSERT(IAM_WRITER_IPIF(ipif)); 18574 18575 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 18576 18577 if (ipif->ipif_flags & IPIF_UP) { 18578 mutex_enter(&ill->ill_lock); 18579 ipif->ipif_flags &= ~IPIF_UP; 18580 ASSERT(ill->ill_ipif_up_count > 0); 18581 --ill->ill_ipif_up_count; 18582 mutex_exit(&ill->ill_lock); 18583 ipif_was_up = B_TRUE; 18584 /* Update status in SCTP's list */ 18585 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 18586 } 18587 18588 /* 18589 * Blow away memberships we established in ipif_multicast_up(). 18590 */ 18591 ipif_multicast_down(ipif); 18592 18593 /* 18594 * Remove from the mapping for __sin6_src_id. We insert only 18595 * when the address is not INADDR_ANY. As IPv4 addresses are 18596 * stored as mapped addresses, we need to check for mapped 18597 * INADDR_ANY also. 18598 */ 18599 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 18600 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 18601 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 18602 int err; 18603 18604 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 18605 ipif->ipif_zoneid, ipst); 18606 if (err != 0) { 18607 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 18608 } 18609 } 18610 18611 /* 18612 * Before we delete the ill from the group (if any), we need 18613 * to make sure that we delete all the routes dependent on 18614 * this and also any ipifs dependent on this ipif for 18615 * source address. We need to do this before we delete from 18616 * the group because 18617 * 18618 * 1) ipif_down_delete_ire de-references ill->ill_group. 18619 * 18620 * 2) ipif_update_other_ipifs needs to walk the whole group 18621 * for re-doing source address selection. Note that 18622 * ipif_select_source[_v6] called from 18623 * ipif_update_other_ipifs[_v6] will not pick this ipif 18624 * because we have already marked it down here, i.e. cleared 18625 * IPIF_UP. 18626 */ 18627 if (ipif->ipif_isv6) { 18628 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 18629 ipst); 18630 } else { 18631 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 18632 ipst); 18633 } 18634 18635 /* 18636 * Cleaning up the conn_ire_cache of conns must be done only after the 18637 * ires have been deleted above. Otherwise a thread could end up 18638 * caching an ire in a conn after we have finished the cleanup of the 18639 * conn. The caching is done after making sure that the ire is not yet 18640 * condemned. Also documented in the block comment above ip_output. 18641 */ 18642 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); 18643 /* Also, delete the ires cached in SCTP */ 18644 sctp_ire_cache_flush(ipif); 18645 18646 /* 18647 * Update any other ipifs which have used "our" local address as 18648 * a source address. This entails removing and recreating IRE_INTERFACE 18649 * entries for such ipifs. 18650 */ 18651 if (ipif->ipif_isv6) 18652 ipif_update_other_ipifs_v6(ipif, ill->ill_group); 18653 else 18654 ipif_update_other_ipifs(ipif, ill->ill_group); 18655 18656 if (ipif_was_up) { 18657 /* 18658 * Check whether it is the last ipif to leave this group.
18659 * If this is the last ipif to leave, we should remove 18660 * this ill from the group as ipif_select_source will not 18661 * be able to find any useful ipifs if this ill is selected 18662 * for load balancing. 18663 * 18664 * For nameless groups, we should call ifgrp_delete if this 18665 * belongs to some group. As this ipif is going down, we may 18666 * need to reconstruct groups. 18667 */ 18668 phyi = ill->ill_phyint; 18669 /* 18670 * If the phyint_groupname_len is 0, it may or may not 18671 * be in the nameless group. If the phyint_groupname_len is 18672 * not 0, then this ill should be part of some group. 18673 * As we always insert this ill in the group if 18674 * phyint_groupname_len is not zero when the first ipif 18675 * comes up (in ipif_up_done), it should be in a group 18676 * when the namelen is not 0. 18677 * 18678 * NOTE : When we delete the ill from the group, it will 18679 * blow away all the IRE_CACHES pointing either at this ipif or 18680 * ill_wq (illgrp_cache_delete does this). Thus, no IREs 18681 * should be pointing at this ill. 18682 */ 18683 ASSERT(phyi->phyint_groupname_len == 0 || 18684 (phyi->phyint_groupname != NULL && ill->ill_group != NULL)); 18685 18686 if (phyi->phyint_groupname_len != 0) { 18687 if (ill->ill_ipif_up_count == 0) 18688 illgrp_delete(ill); 18689 } 18690 18691 /* 18692 * If we have deleted some of the broadcast ires associated 18693 * with this ipif, we need to re-nominate somebody else if 18694 * the ires that we deleted were the nominated ones. 18695 */ 18696 if (ill->ill_group != NULL && !ill->ill_isv6) 18697 ipif_renominate_bcast(ipif); 18698 } 18699 18700 /* 18701 * Take down the neighbor-discovery or ARP entries for this interface. 18702 */ 18703 ipif_ndp_down(ipif); 18704 18705 /* 18706 * If mp is NULL the caller will wait for the appropriate refcnt. 18707 * E.g. ip_sioctl_removeif -> ipif_free -> ipif_down 18708 * and ill_delete -> ipif_free -> ipif_down 18709 */ 18710 if (mp == NULL) { 18711 ASSERT(q == NULL); 18712 return (0); 18713 } 18714 18715 if (CONN_Q(q)) { 18716 connp = Q_TO_CONN(q); 18717 mutex_enter(&connp->conn_lock); 18718 } else { 18719 connp = NULL; 18720 } 18721 mutex_enter(&ill->ill_lock); 18722 /* 18723 * Are there any ire's pointing to this ipif that are still active ? 18724 * If this is the last ipif going down, are there any ire's pointing 18725 * to this ill that are still active ? 18726 */ 18727 if (ipif_is_quiescent(ipif)) { 18728 mutex_exit(&ill->ill_lock); 18729 if (connp != NULL) 18730 mutex_exit(&connp->conn_lock); 18731 return (0); 18732 } 18733 18734 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p\n", 18735 ill->ill_name, (void *)ill)); 18736 /* 18737 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 18738 * drops down, the operation will be restarted by ipif_ill_refrele_tail 18739 * which in turn is called by the last refrele on the ipif/ill/ire. 18740 */ 18741 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 18742 if (!success) { 18743 /* The conn is closing. So just return */ 18744 ASSERT(connp != NULL); 18745 mutex_exit(&ill->ill_lock); 18746 mutex_exit(&connp->conn_lock); 18747 return (EINTR); 18748 } 18749 18750 mutex_exit(&ill->ill_lock); 18751 if (connp != NULL) 18752 mutex_exit(&connp->conn_lock); 18753 return (EINPROGRESS); 18754 } 18755 18756 void 18757 ipif_down_tail(ipif_t *ipif) 18758 { 18759 ill_t *ill = ipif->ipif_ill; 18760 18761 /* 18762 * Skip any loopback interface (null wq).
18763 * If this is the last logical interface on the ill, 18764 * have ill_dl_down tell the driver we are gone (unbind). 18765 * Note that lun 0 can ipif_down even though 18766 * there are other logical units that are up. 18767 * This occurs e.g. when we change a "significant" IFF_ flag. 18768 */ 18769 if (ill->ill_wq != NULL && !ill->ill_logical_down && 18770 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 18771 ill->ill_dl_up) { 18772 ill_dl_down(ill); 18773 } 18774 ill->ill_logical_down = 0; 18775 18776 /* 18777 * This has to be after removing the routes in ipif_down_delete_ire. 18778 */ 18779 if (ipif->ipif_isv6) { 18780 if (ill->ill_flags & ILLF_XRESOLV) 18781 ipif_arp_down(ipif); 18782 } else { 18783 ipif_arp_down(ipif); 18784 } 18785 18786 ip_rts_ifmsg(ipif); 18787 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif); 18788 } 18789 18790 /* 18791 * Bring the interface logically down without bringing the physical interface 18792 * down, e.g. when the netmask is changed. This avoids long-lasting link 18793 * negotiations between an ethernet interface and certain switches. 18794 */ 18795 static int 18796 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 18797 { 18798 /* 18799 * The ill_logical_down flag is a transient flag. It is set here 18800 * and is cleared once the down has completed in ipif_down_tail. 18801 * This flag does not indicate whether the ill stream is in the 18802 * DL_BOUND state with the driver. Instead this flag is used by 18803 * ipif_down_tail to determine whether to DL_UNBIND the stream with 18804 * the driver. The state of the ill stream i.e. whether it is 18805 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 18806 */ 18807 ipif->ipif_ill->ill_logical_down = 1; 18808 return (ipif_down(ipif, q, mp)); 18809 } 18810 18811 /* 18812 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 18813 * Whether or not the usesrc client ILL is already part of a usesrc group, 18814 * in either case an ire_stq with the matching usesrc client ILL will 18815 * locate the IREs that need to be deleted. We want IREs to be created 18816 * with the new source address. 18817 */ 18818 static void 18819 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 18820 { 18821 ill_t *ucill = (ill_t *)ill_arg; 18822 18823 ASSERT(IAM_WRITER_ILL(ucill)); 18824 18825 if (ire->ire_stq == NULL) 18826 return; 18827 18828 if ((ire->ire_type == IRE_CACHE) && 18829 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 18830 ire_delete(ire); 18831 } 18832 18833 /* 18834 * ire_walk routine to delete every IRE dependent on the interface 18835 * address that is going down. (Always called as writer.) 18836 * Works for both v4 and v6. 18837 * In addition to checking for ire_ipif matches it also checks for 18838 * IRE_CACHE entries which have the same source address as the 18839 * disappearing ipif since ipif_select_source might have picked 18840 * that source. Note that ipif_down/ipif_update_other_ipifs takes 18841 * care of any IRE_INTERFACE with the disappearing source address. 18842 */ 18843 static void 18844 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 18845 { 18846 ipif_t *ipif = (ipif_t *)ipif_arg; 18847 ill_t *ire_ill; 18848 ill_t *ipif_ill; 18849 18850 ASSERT(IAM_WRITER_IPIF(ipif)); 18851 if (ire->ire_ipif == NULL) 18852 return; 18853 18854 /* 18855 * For IPv4, we derive source addresses for an IRE from ipif's 18856 * belonging to the same IPMP group as the IRE's outgoing 18857 * interface.
If an IRE's outgoing interface isn't in the 18858 * same IPMP group as a particular ipif, then that ipif 18859 * couldn't have been used as a source address for this IRE. 18860 * 18861 * For IPv6, source addresses are only restricted to the IPMP group 18862 * if the IRE is for a link-local address or a multicast address. 18863 * Otherwise, source addresses for an IRE can be chosen from 18864 * interfaces other than the outgoing interface for that IRE. 18865 * 18866 * For source address selection details, see ipif_select_source() 18867 * and ipif_select_source_v6(). 18868 */ 18869 if (ire->ire_ipversion == IPV4_VERSION || 18870 IN6_IS_ADDR_LINKLOCAL(&ire->ire_addr_v6) || 18871 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 18872 ire_ill = ire->ire_ipif->ipif_ill; 18873 ipif_ill = ipif->ipif_ill; 18874 18875 if (ire_ill->ill_group != ipif_ill->ill_group) { 18876 return; 18877 } 18878 } 18879 18880 18881 if (ire->ire_ipif != ipif) { 18882 /* 18883 * Look for a matching source address. 18884 */ 18885 if (ire->ire_type != IRE_CACHE) 18886 return; 18887 if (ipif->ipif_flags & IPIF_NOLOCAL) 18888 return; 18889 18890 if (ire->ire_ipversion == IPV4_VERSION) { 18891 if (ire->ire_src_addr != ipif->ipif_src_addr) 18892 return; 18893 } else { 18894 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 18895 &ipif->ipif_v6lcl_addr)) 18896 return; 18897 } 18898 ire_delete(ire); 18899 return; 18900 } 18901 /* 18902 * ire_delete() will do an ire_flush_cache which will delete 18903 * all ire_ipif matches. 18904 */ 18905 ire_delete(ire); 18906 } 18907 18908 /* 18909 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 18910 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 18911 * 2) an interface is brought up or down (on that ill). 18912 * This ensures that the IRE_CACHE entries don't retain stale source 18913 * address selection results. 18914 */ 18915 void 18916 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 18917 { 18918 ill_t *ill = (ill_t *)ill_arg; 18919 ill_t *ipif_ill; 18920 18921 ASSERT(IAM_WRITER_ILL(ill)); 18922 /* 18923 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18924 * Hence this should be IRE_CACHE. 18925 */ 18926 ASSERT(ire->ire_type == IRE_CACHE); 18927 18928 /* 18929 * We are called for IRE_CACHES whose ire_ipif matches ill. 18930 * We are only interested in IRE_CACHES that have borrowed 18931 * the source address from ill_arg e.g. ipif_up_done[_v6] 18932 * for which we need to look at ire_ipif->ipif_ill match 18933 * with ill. 18934 */ 18935 ASSERT(ire->ire_ipif != NULL); 18936 ipif_ill = ire->ire_ipif->ipif_ill; 18937 if (ipif_ill == ill || (ill->ill_group != NULL && 18938 ipif_ill->ill_group == ill->ill_group)) { 18939 ire_delete(ire); 18940 } 18941 } 18942 18943 /* 18944 * Delete all the ires whose stq references ill_arg. 18945 */ 18946 static void 18947 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 18948 { 18949 ill_t *ill = (ill_t *)ill_arg; 18950 ill_t *ire_ill; 18951 18952 ASSERT(IAM_WRITER_ILL(ill)); 18953 /* 18954 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18955 * Hence this should be IRE_CACHE. 18956 */ 18957 ASSERT(ire->ire_type == IRE_CACHE); 18958 18959 /* 18960 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18961 * match ill. We are only interested in IRE_CACHES that 18962 * have ire_stq->q_ptr pointing at ill_arg. Thus we do the 18963 * filtering here.
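 *
 * Such walks are driven by callers with an IRE_CACHE type match, e.g.
 * (a sketch modeled on the ill_ipif_cache_delete() walk started in
 * ipif_up_done() below; the caller chooses the exact match flags):
 *
 *      ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE,
 *          IRE_CACHE, ill_stq_cache_delete, (char *)ill, ill);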
18964 */ 18965 ire_ill = (ill_t *)ire->ire_stq->q_ptr; 18966 18967 if (ire_ill == ill) 18968 ire_delete(ire); 18969 } 18970 18971 /* 18972 * This is called when an ill leaves the group. We want to delete 18973 * all IRE_CACHES whose stq is pointing at ill_wq or ire_ipif is 18974 * pointing at ill. 18975 */ 18976 static void 18977 illgrp_cache_delete(ire_t *ire, char *ill_arg) 18978 { 18979 ill_t *ill = (ill_t *)ill_arg; 18980 18981 ASSERT(IAM_WRITER_ILL(ill)); 18982 ASSERT(ill->ill_group == NULL); 18983 /* 18984 * We use MATCH_IRE_TYPE/IRE_CACHE while calling ire_walk_ill_v4. 18985 * Hence this should be IRE_CACHE. 18986 */ 18987 ASSERT(ire->ire_type == IRE_CACHE); 18988 /* 18989 * We are called for IRE_CACHES whose ire_stq and ire_ipif 18990 * match ill. We are interested in both. 18991 */ 18992 ASSERT((ill == (ill_t *)ire->ire_stq->q_ptr) || 18993 (ire->ire_ipif->ipif_ill == ill)); 18994 18995 ire_delete(ire); 18996 } 18997 18998 /* 18999 * Initiate deallocation of an ipif. Always called as writer. Called by 19000 * ill_delete or ip_sioctl_removeif. 19001 */ 19002 static void 19003 ipif_free(ipif_t *ipif) 19004 { 19005 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19006 19007 ASSERT(IAM_WRITER_IPIF(ipif)); 19008 19009 if (ipif->ipif_recovery_id != 0) 19010 (void) untimeout(ipif->ipif_recovery_id); 19011 ipif->ipif_recovery_id = 0; 19012 19013 /* Remove conn references */ 19014 reset_conn_ipif(ipif); 19015 19016 /* 19017 * Make sure we have valid net and subnet broadcast ire's for the 19018 * other ipif's which share them with this ipif. 19019 */ 19020 if (!ipif->ipif_isv6) 19021 ipif_check_bcast_ires(ipif); 19022 19023 /* 19024 * Take down the interface. We can be called either from ill_delete 19025 * or from ip_sioctl_removeif. 19026 */ 19027 (void) ipif_down(ipif, NULL, NULL); 19028 19029 /* 19030 * Now that the interface is down, there's no chance it can still 19031 * become a duplicate. Cancel any timer that may have been set while 19032 * tearing down. 19033 */ 19034 if (ipif->ipif_recovery_id != 0) 19035 (void) untimeout(ipif->ipif_recovery_id); 19036 ipif->ipif_recovery_id = 0; 19037 19038 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 19039 /* Remove pointers to this ill in the multicast routing tables */ 19040 reset_mrt_vif_ipif(ipif); 19041 rw_exit(&ipst->ips_ill_g_lock); 19042 } 19043 19044 /* 19045 * Warning: this is not the only function that calls mi_free on an ipif_t. See 19046 * also ill_move(). 19047 */ 19048 static void 19049 ipif_free_tail(ipif_t *ipif) 19050 { 19051 mblk_t *mp; 19052 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19053 19054 /* 19055 * Free state for additional IRE_IF_[NO]RESOLVER ire's. 19056 */ 19057 mutex_enter(&ipif->ipif_saved_ire_lock); 19058 mp = ipif->ipif_saved_ire_mp; 19059 ipif->ipif_saved_ire_mp = NULL; 19060 mutex_exit(&ipif->ipif_saved_ire_lock); 19061 freemsg(mp); 19062 19063 /* 19064 * Need to hold both ill_g_lock and ill_lock while 19065 * inserting or removing an ipif from the linked list 19066 * of ipifs hanging off the ill. 19067 */ 19068 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 19069 /* 19070 * Remove all IPv4 multicast memberships on the interface now. 19071 * IPv6 is not handled here as the multicast memberships are 19072 * tied to the ill rather than the ipif. 19073 */ 19074 ilm_free(ipif); 19075 19076 /* 19077 * Since we held the ill_g_lock while doing the ilm_free above, 19078 * we can assert the ilms were really deleted and not just marked 19079 * ILM_DELETED.
19080 */ 19081 ASSERT(ilm_walk_ipif(ipif) == 0); 19082 19083 #ifdef DEBUG 19084 ipif_trace_cleanup(ipif); 19085 #endif 19086 19087 /* Ask SCTP to take it out of its list */ 19088 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 19089 19090 /* Get it out of the ILL interface list. */ 19091 ipif_remove(ipif, B_TRUE); 19092 rw_exit(&ipst->ips_ill_g_lock); 19093 19094 mutex_destroy(&ipif->ipif_saved_ire_lock); 19095 19096 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 19097 ASSERT(ipif->ipif_recovery_id == 0); 19098 19099 /* Free the memory. */ 19100 mi_free(ipif); 19101 } 19102 19103 /* 19104 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 19105 * is zero. 19106 */ 19107 void 19108 ipif_get_name(const ipif_t *ipif, char *buf, int len) 19109 { 19110 char lbuf[LIFNAMSIZ]; 19111 char *name; 19112 size_t name_len; 19113 19114 buf[0] = '\0'; 19115 name = ipif->ipif_ill->ill_name; 19116 name_len = ipif->ipif_ill->ill_name_length; 19117 if (ipif->ipif_id != 0) { 19118 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 19119 ipif->ipif_id); 19120 name = lbuf; 19121 name_len = mi_strlen(name) + 1; 19122 } 19123 len -= 1; 19124 buf[len] = '\0'; 19125 len = MIN(len, name_len); 19126 bcopy(name, buf, len); 19127 } 19128 19129 /* 19130 * Find an IPIF based on the name passed in. Names can be of the 19131 * form <phys> (e.g., le0) or <phys>:<#> (e.g., le0:1). 19132 * The <phys> string can have forms like <dev><#> (e.g., le0), 19133 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3). 19134 * When there is no colon, the implied unit id is zero. <phys> must 19135 * correspond to the name of an ILL. (May be called as writer.) 19136 */ 19137 static ipif_t * 19138 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 19139 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 19140 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 19141 { 19142 char *cp; 19143 char *endp; 19144 long id; 19145 ill_t *ill; 19146 ipif_t *ipif; 19147 uint_t ire_type; 19148 boolean_t did_alloc = B_FALSE; 19149 ipsq_t *ipsq; 19150 19151 if (error != NULL) 19152 *error = 0; 19153 19154 /* 19155 * If the caller wants us to create the ipif, make sure we have a 19156 * valid zoneid. 19157 */ 19158 ASSERT(!do_alloc || zoneid != ALL_ZONES); 19159 19160 if (namelen == 0) { 19161 if (error != NULL) 19162 *error = ENXIO; 19163 return (NULL); 19164 } 19165 19166 *exists = B_FALSE; 19167 /* Look for a colon in the name. */ 19168 endp = &name[namelen]; 19169 for (cp = endp; --cp > name; ) { 19170 if (*cp == IPIF_SEPARATOR_CHAR) 19171 break; 19172 } 19173 19174 if (*cp == IPIF_SEPARATOR_CHAR) { 19175 /* 19176 * Reject any non-decimal aliases for logical 19177 * interfaces. Aliases with leading zeroes 19178 * are also rejected as they introduce ambiguity 19179 * in the naming of the interfaces. 19180 * In order to conform to existing semantics, 19181 * and to not break any programs/scripts relying 19182 * on that behaviour, if<0>:0 is considered to be 19183 * a valid interface. 19184 * 19185 * If the alias has two or more digits and the first 19186 * is zero, fail. 19187 */ 19188 if (&cp[2] < endp && cp[1] == '0') { 19189 if (error != NULL) 19190 *error = EINVAL; 19191 return (NULL); 19192 } 19193 } 19194 19195 if (cp <= name) { 19196 cp = endp; 19197 } else { 19198 *cp = '\0'; 19199 } 19200 19201 /* 19202 * Look up the ILL, based on the portion of the name 19203 * before the colon. ill_lookup_on_name returns a held ill.
19204 * Temporary to check whether the ill exists already. If so, 19205 * ill_lookup_on_name will clear it. 19206 */ 19207 ill = ill_lookup_on_name(name, do_alloc, isv6, 19208 q, mp, func, error, &did_alloc, ipst); 19209 if (cp != endp) 19210 *cp = IPIF_SEPARATOR_CHAR; 19211 if (ill == NULL) 19212 return (NULL); 19213 19214 /* Establish the unit number in the name. */ 19215 id = 0; 19216 if (cp < endp && *endp == '\0') { 19217 /* If there was a colon, the unit number follows. */ 19218 cp++; 19219 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 19220 ill_refrele(ill); 19221 if (error != NULL) 19222 *error = ENXIO; 19223 return (NULL); 19224 } 19225 } 19226 19227 GRAB_CONN_LOCK(q); 19228 mutex_enter(&ill->ill_lock); 19229 /* Now see if there is an IPIF with this unit number. */ 19230 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19231 if (ipif->ipif_id == id) { 19232 if (zoneid != ALL_ZONES && 19233 zoneid != ipif->ipif_zoneid && 19234 ipif->ipif_zoneid != ALL_ZONES) { 19235 mutex_exit(&ill->ill_lock); 19236 RELEASE_CONN_LOCK(q); 19237 ill_refrele(ill); 19238 if (error != NULL) 19239 *error = ENXIO; 19240 return (NULL); 19241 } 19242 /* 19243 * The block comment at the start of ipif_down 19244 * explains the use of the macros used below. 19245 */ 19246 if (IPIF_CAN_LOOKUP(ipif)) { 19247 ipif_refhold_locked(ipif); 19248 mutex_exit(&ill->ill_lock); 19249 if (!did_alloc) 19250 *exists = B_TRUE; 19251 /* 19252 * Drop locks before calling ill_refrele 19253 * since it can potentially call into 19254 * ipif_ill_refrele_tail which can end up 19255 * in trying to acquire any lock. 19256 */ 19257 RELEASE_CONN_LOCK(q); 19258 ill_refrele(ill); 19259 return (ipif); 19260 } else if (IPIF_CAN_WAIT(ipif, q)) { 19261 ipsq = ill->ill_phyint->phyint_ipsq; 19262 mutex_enter(&ipsq->ipsq_lock); 19263 mutex_exit(&ill->ill_lock); 19264 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 19265 mutex_exit(&ipsq->ipsq_lock); 19266 RELEASE_CONN_LOCK(q); 19267 ill_refrele(ill); 19268 if (error != NULL) 19269 *error = EINPROGRESS; 19270 return (NULL); 19271 } 19272 } 19273 } 19274 RELEASE_CONN_LOCK(q); 19275 19276 if (!do_alloc) { 19277 mutex_exit(&ill->ill_lock); 19278 ill_refrele(ill); 19279 if (error != NULL) 19280 *error = ENXIO; 19281 return (NULL); 19282 } 19283 19284 /* 19285 * If none found, atomically allocate and return a new one. 19286 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 19287 * to support "receive only" use of lo0:1 etc. as is still done 19288 * below as an initial guess. 19289 * However, this is now likely to be overridden later in ipif_up_done() 19290 * when we know for sure what address has been configured on the 19291 * interface, since we might have more than one loopback interface 19292 * with a loopback address, e.g. in the case of zones, and all the 19293 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 19294 */ 19295 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 19296 ire_type = IRE_LOOPBACK; 19297 else 19298 ire_type = IRE_LOCAL; 19299 ipif = ipif_allocate(ill, id, ire_type, B_TRUE); 19300 if (ipif != NULL) 19301 ipif_refhold_locked(ipif); 19302 else if (error != NULL) 19303 *error = ENOMEM; 19304 mutex_exit(&ill->ill_lock); 19305 ill_refrele(ill); 19306 return (ipif); 19307 } 19308 19309 /* 19310 * This routine is called whenever a new address comes up on an ipif. If 19311 * we are configured to respond to address mask requests, then we are supposed 19312 * to broadcast an address mask reply at this time.
This routine is also 19313 * called if we are already up, but a netmask change is made. This is legal 19314 * but might not make the system manager very popular. (May be called 19315 * as writer.) 19316 */ 19317 void 19318 ipif_mask_reply(ipif_t *ipif) 19319 { 19320 icmph_t *icmph; 19321 ipha_t *ipha; 19322 mblk_t *mp; 19323 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19324 19325 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 19326 19327 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 19328 return; 19329 19330 /* ICMP mask reply is IPv4 only */ 19331 ASSERT(!ipif->ipif_isv6); 19332 /* ICMP mask reply is not for a loopback interface */ 19333 ASSERT(ipif->ipif_ill->ill_wq != NULL); 19334 19335 mp = allocb(REPLY_LEN, BPRI_HI); 19336 if (mp == NULL) 19337 return; 19338 mp->b_wptr = mp->b_rptr + REPLY_LEN; 19339 19340 ipha = (ipha_t *)mp->b_rptr; 19341 bzero(ipha, REPLY_LEN); 19342 *ipha = icmp_ipha; 19343 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 19344 ipha->ipha_src = ipif->ipif_src_addr; 19345 ipha->ipha_dst = ipif->ipif_brd_addr; 19346 ipha->ipha_length = htons(REPLY_LEN); 19347 ipha->ipha_ident = 0; 19348 19349 icmph = (icmph_t *)&ipha[1]; 19350 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 19351 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 19352 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 19353 19354 put(ipif->ipif_wq, mp); 19355 19356 #undef REPLY_LEN 19357 } 19358 19359 /* 19360 * When the mtu in the ipif changes, we call this routine through ire_walk 19361 * to update all the relevant IREs. 19362 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19363 */ 19364 static void 19365 ipif_mtu_change(ire_t *ire, char *ipif_arg) 19366 { 19367 ipif_t *ipif = (ipif_t *)ipif_arg; 19368 19369 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 19370 return; 19371 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 19372 } 19373 19374 /* 19375 * When the mtu in the ill changes, we call this routine through ire_walk 19376 * to update all the relevant IREs. 19377 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 19378 */ 19379 void 19380 ill_mtu_change(ire_t *ire, char *ill_arg) 19381 { 19382 ill_t *ill = (ill_t *)ill_arg; 19383 19384 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 19385 return; 19386 ire->ire_max_frag = ire->ire_ipif->ipif_mtu; 19387 } 19388 19389 /* 19390 * Join the ipif-specific multicast groups. 19391 * Must be called after a mapping has been set up in the resolver. (Always 19392 * called as writer.) 19393 */ 19394 void 19395 ipif_multicast_up(ipif_t *ipif) 19396 { 19397 int err, index; 19398 ill_t *ill; 19399 19400 ASSERT(IAM_WRITER_IPIF(ipif)); 19401 19402 ill = ipif->ipif_ill; 19403 index = ill->ill_phyint->phyint_ifindex; 19404 19405 ip1dbg(("ipif_multicast_up\n")); 19406 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 19407 return; 19408 19409 if (ipif->ipif_isv6) { 19410 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 19411 return; 19412 19413 /* Join the all hosts multicast address */ 19414 ip1dbg(("ipif_multicast_up - addmulti\n")); 19415 /* 19416 * Passing B_TRUE means we have to join the multicast 19417 * membership on this interface even though this interface is 19418 * FAILED. If we join on a different one in the group, 19419 * we will not be able to delete the membership later 19420 * as we currently don't track where we join when we 19421 * join within the kernel, unlike applications where 19422 * we have ilg/ilg_orig_index.
See ip_addmulti_v6 19423 * for more on this. 19424 */ 19425 err = ip_addmulti_v6(&ipv6_all_hosts_mcast, ill, index, 19426 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19427 if (err != 0) { 19428 ip0dbg(("ipif_multicast_up: " 19429 "all_hosts_mcast failed %d\n", 19430 err)); 19431 return; 19432 } 19433 /* 19434 * Enable multicast for the solicited node multicast address 19435 */ 19436 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19437 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19438 19439 ipv6_multi.s6_addr32[3] |= 19440 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19441 19442 err = ip_addmulti_v6(&ipv6_multi, ill, index, 19443 ipif->ipif_zoneid, ILGSTAT_NONE, MODE_IS_EXCLUDE, 19444 NULL); 19445 if (err != 0) { 19446 ip0dbg(("ipif_multicast_up: solicited MC" 19447 " failed %d\n", err)); 19448 (void) ip_delmulti_v6(&ipv6_all_hosts_mcast, 19449 ill, ill->ill_phyint->phyint_ifindex, 19450 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19451 return; 19452 } 19453 } 19454 } else { 19455 if (ipif->ipif_lcl_addr == INADDR_ANY) 19456 return; 19457 19458 /* Join the all hosts multicast address */ 19459 ip1dbg(("ipif_multicast_up - addmulti\n")); 19460 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 19461 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 19462 if (err) { 19463 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 19464 return; 19465 } 19466 } 19467 ipif->ipif_multicast_up = 1; 19468 } 19469 19470 /* 19471 * Blow away any multicast groups that we joined in ipif_multicast_up(). 19472 * (Explicit memberships are blown away in ill_leave_multicast() when the 19473 * ill is brought down.) 19474 */ 19475 static void 19476 ipif_multicast_down(ipif_t *ipif) 19477 { 19478 int err; 19479 19480 ASSERT(IAM_WRITER_IPIF(ipif)); 19481 19482 ip1dbg(("ipif_multicast_down\n")); 19483 if (!ipif->ipif_multicast_up) 19484 return; 19485 19486 ip1dbg(("ipif_multicast_down - delmulti\n")); 19487 19488 if (!ipif->ipif_isv6) { 19489 err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE, 19490 B_TRUE); 19491 if (err != 0) 19492 ip0dbg(("ipif_multicast_down: failed %d\n", err)); 19493 19494 ipif->ipif_multicast_up = 0; 19495 return; 19496 } 19497 19498 /* 19499 * Leave the all hosts multicast address. Similar to ip_addmulti_v6, 19500 * we should look for ilms on this ill rather than the ones that have 19501 * been failed over here. They are here temporarily. As 19502 * ipif_multicast_up has joined on this ill, we should delete only 19503 * from this ill. 19504 */ 19505 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 19506 ipif->ipif_ill->ill_phyint->phyint_ifindex, ipif->ipif_zoneid, 19507 B_TRUE, B_TRUE); 19508 if (err != 0) { 19509 ip0dbg(("ipif_multicast_down: all_hosts_mcast failed %d\n", 19510 err)); 19511 } 19512 /* 19513 * Disable multicast for the solicited node multicast address 19514 */ 19515 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 19516 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 19517 19518 ipv6_multi.s6_addr32[3] |= 19519 ipif->ipif_v6lcl_addr.s6_addr32[3]; 19520 19521 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 19522 ipif->ipif_ill->ill_phyint->phyint_ifindex, 19523 ipif->ipif_zoneid, B_TRUE, B_TRUE); 19524 19525 if (err != 0) { 19526 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 19527 err)); 19528 } 19529 } 19530 19531 ipif->ipif_multicast_up = 0; 19532 } 19533 19534 /* 19535 * Used when an interface comes up to recreate any extra routes on this 19536 * interface. 
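 *
 * ipif_up_done() below is the expected caller, once the interface's
 * own IREs are in place:
 *
 *      ipif_saved_irep = ipif_recover_ire(ipif);
 *
 * The returned array (sized by ipif_saved_ire_cnt) holds the added,
 * refheld IREs; the caller is expected to refrele them and free the
 * array when it is done with them.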
19537 */ 19538 static ire_t ** 19539 ipif_recover_ire(ipif_t *ipif) 19540 { 19541 mblk_t *mp; 19542 ire_t **ipif_saved_irep; 19543 ire_t **irep; 19544 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 19545 19546 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 19547 ipif->ipif_id)); 19548 19549 mutex_enter(&ipif->ipif_saved_ire_lock); 19550 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 19551 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 19552 if (ipif_saved_irep == NULL) { 19553 mutex_exit(&ipif->ipif_saved_ire_lock); 19554 return (NULL); 19555 } 19556 19557 irep = ipif_saved_irep; 19558 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 19559 ire_t *ire; 19560 queue_t *rfq; 19561 queue_t *stq; 19562 ifrt_t *ifrt; 19563 uchar_t *src_addr; 19564 uchar_t *gateway_addr; 19565 ushort_t type; 19566 19567 /* 19568 * When the ire was initially created and then added in 19569 * ip_rt_add(), it was created either using ipif->ipif_net_type 19570 * in the case of a traditional interface route, or as one of 19571 * the IRE_OFFSUBNET types (with the exception of 19572 * IRE_HOST type ires which are created by icmp_redirect() and 19573 * which we don't need to save or recover). In the case where 19574 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 19575 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 19576 * to satisfy software like GateD and Sun Cluster which create 19577 * routes using the loopback interface's address as a 19578 * gateway. 19579 * 19580 * As ifrt->ifrt_type reflects the already updated ire_type, 19581 * ire_create() will be called in the same way here as 19582 * in ip_rt_add(), namely using ipif->ipif_net_type when 19583 * the route looks like a traditional interface route (where 19584 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 19585 * the saved ifrt->ifrt_type. This means that in the case where 19586 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 19587 * ire_create() will be an IRE_LOOPBACK; it will then be turned 19588 * into an IRE_IF_NORESOLVER and then added by ire_add(). 19589 */ 19590 ifrt = (ifrt_t *)mp->b_rptr; 19591 ASSERT(ifrt->ifrt_type != IRE_CACHE); 19592 if (ifrt->ifrt_type & IRE_INTERFACE) { 19593 rfq = NULL; 19594 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 19595 ? ipif->ipif_rq : ipif->ipif_wq; 19596 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19597 ? (uint8_t *)&ifrt->ifrt_src_addr 19598 : (uint8_t *)&ipif->ipif_src_addr; 19599 gateway_addr = NULL; 19600 type = ipif->ipif_net_type; 19601 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 19602 /* Recover multiroute broadcast IRE. */ 19603 rfq = ipif->ipif_rq; 19604 stq = ipif->ipif_wq; 19605 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19606 ? (uint8_t *)&ifrt->ifrt_src_addr 19607 : (uint8_t *)&ipif->ipif_src_addr; 19608 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19609 type = ifrt->ifrt_type; 19610 } else { 19611 rfq = NULL; 19612 stq = NULL; 19613 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 19614 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 19615 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 19616 type = ifrt->ifrt_type; 19617 } 19618 19619 /* 19620 * Create a copy of the IRE with the saved address and netmask.
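 *
 * Recapping the classification just above (an informal summary, no
 * new logic):
 *
 *      ifrt_type               rfq / stq               gateway_addr
 *      IRE_INTERFACE           NULL / ipif queue       NULL
 *      IRE_BROADCAST           ipif_rq / ipif_wq       ifrt_gateway_addr
 *      other offsubnet         NULL / NULL             ifrt_gateway_addr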
19621 */ 19622 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 19623 "0x%x/0x%x\n", 19624 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 19625 ntohl(ifrt->ifrt_addr), 19626 ntohl(ifrt->ifrt_mask))); 19627 ire = ire_create( 19628 (uint8_t *)&ifrt->ifrt_addr, 19629 (uint8_t *)&ifrt->ifrt_mask, 19630 src_addr, 19631 gateway_addr, 19632 &ifrt->ifrt_max_frag, 19633 NULL, 19634 rfq, 19635 stq, 19636 type, 19637 ipif, 19638 0, 19639 0, 19640 0, 19641 ifrt->ifrt_flags, 19642 &ifrt->ifrt_iulp_info, 19643 NULL, 19644 NULL, 19645 ipst); 19646 19647 if (ire == NULL) { 19648 mutex_exit(&ipif->ipif_saved_ire_lock); 19649 kmem_free(ipif_saved_irep, 19650 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 19651 return (NULL); 19652 } 19653 19654 /* 19655 * Some software (for example, GateD and Sun Cluster) attempts 19656 * to create (what amount to) IRE_PREFIX routes with the 19657 * loopback address as the gateway. This is primarily done to 19658 * set up prefixes with the RTF_REJECT flag set (for example, 19659 * when generating aggregate routes.) 19660 * 19661 * If the IRE type (as defined by ipif->ipif_net_type) is 19662 * IRE_LOOPBACK, then we map the request into an 19663 * IRE_IF_NORESOLVER. 19664 */ 19665 if (ipif->ipif_net_type == IRE_LOOPBACK) 19666 ire->ire_type = IRE_IF_NORESOLVER; 19667 /* 19668 * The ire is held by ire_add; it will be refrele'd towards 19669 * the end of ipif_up_done. 19670 */ 19671 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 19672 *irep = ire; 19673 irep++; 19674 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 19675 } 19676 mutex_exit(&ipif->ipif_saved_ire_lock); 19677 return (ipif_saved_irep); 19678 } 19679 19680 /* 19681 * Used to set the netmask and broadcast address to default values when the 19682 * interface is brought up. (Always called as writer.) 19683 */ 19684 static void 19685 ipif_set_default(ipif_t *ipif) 19686 { 19687 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 19688 19689 if (!ipif->ipif_isv6) { 19690 /* 19691 * Interface holds an IPv4 address. Default 19692 * mask is the natural netmask. 19693 */ 19694 if (!ipif->ipif_net_mask) { 19695 ipaddr_t v4mask; 19696 19697 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 19698 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 19699 } 19700 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19701 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19702 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19703 } else { 19704 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19705 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19706 } 19707 /* 19708 * NOTE: SunOS 4.X does this even if the broadcast address 19709 * has already been set, thus we do the same here. 19710 */ 19711 if (ipif->ipif_flags & IPIF_BROADCAST) { 19712 ipaddr_t v4addr; 19713 19714 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 19715 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 19716 } 19717 } else { 19718 /* 19719 * Interface holds an IPv6-only address. Default 19720 * mask is all-ones. 19721 */ 19722 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 19723 ipif->ipif_v6net_mask = ipv6_all_ones; 19724 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 19725 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 19726 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 19727 } else { 19728 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 19729 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 19730 } 19731 } 19732 } 19733 19734 /* 19735 * Return 0 if this address can be used as a local address without causing 19736 * duplicate address problems.
Otherwise, return EADDRNOTAVAIL if the address 19737 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 19738 * Special checks are needed to allow the same IPv6 link-local address 19739 * on different ills. 19740 * TODO: allow the same site-local address on different ills. 19741 */ 19742 int 19743 ip_addr_availability_check(ipif_t *new_ipif) 19744 { 19745 in6_addr_t our_v6addr; 19746 ill_t *ill; 19747 ipif_t *ipif; 19748 ill_walk_context_t ctx; 19749 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 19750 19751 ASSERT(IAM_WRITER_IPIF(new_ipif)); 19752 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 19753 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 19754 19755 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 19756 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 19757 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 19758 return (0); 19759 19760 our_v6addr = new_ipif->ipif_v6lcl_addr; 19761 19762 if (new_ipif->ipif_isv6) 19763 ill = ILL_START_WALK_V6(&ctx, ipst); 19764 else 19765 ill = ILL_START_WALK_V4(&ctx, ipst); 19766 19767 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 19768 for (ipif = ill->ill_ipif; ipif != NULL; 19769 ipif = ipif->ipif_next) { 19770 if ((ipif == new_ipif) || 19771 !(ipif->ipif_flags & IPIF_UP) || 19772 (ipif->ipif_flags & IPIF_UNNUMBERED)) 19773 continue; 19774 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 19775 &our_v6addr)) { 19776 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 19777 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 19778 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 19779 ipif->ipif_flags |= IPIF_UNNUMBERED; 19780 else if (IN6_IS_ADDR_LINKLOCAL(&our_v6addr) && 19781 new_ipif->ipif_ill != ill) 19782 continue; 19783 else if (IN6_IS_ADDR_SITELOCAL(&our_v6addr) && 19784 new_ipif->ipif_ill != ill) 19785 continue; 19786 else if (new_ipif->ipif_zoneid != 19787 ipif->ipif_zoneid && 19788 ipif->ipif_zoneid != ALL_ZONES && 19789 IS_LOOPBACK(ill)) 19790 continue; 19791 else if (new_ipif->ipif_ill == ill) 19792 return (EADDRINUSE); 19793 else 19794 return (EADDRNOTAVAIL); 19795 } 19796 } 19797 } 19798 19799 return (0); 19800 } 19801 19802 /* 19803 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 19804 * IREs for the ipif. 19805 * When the routine returns EINPROGRESS then mp has been consumed and 19806 * the ioctl will be acked from ip_rput_dlpi. 19807 */ 19808 static int 19809 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 19810 { 19811 ill_t *ill = ipif->ipif_ill; 19812 boolean_t isv6 = ipif->ipif_isv6; 19813 int err = 0; 19814 boolean_t success; 19815 19816 ASSERT(IAM_WRITER_IPIF(ipif)); 19817 19818 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 19819 19820 /* Shouldn't get here if it is already up. */ 19821 if (ipif->ipif_flags & IPIF_UP) 19822 return (EALREADY); 19823 19824 /* Skip arp/ndp for any loopback interface. */ 19825 if (ill->ill_wq != NULL) { 19826 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 19827 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19828 19829 if (!ill->ill_dl_up) { 19830 /* 19831 * ill_dl_up is not yet set, i.e. we have yet to 19832 * DL_BIND with the driver and this is the first 19833 * logical interface on the ill to become "up". 19834 * Tell the driver to get going (via DL_BIND_REQ).
19835 * Note that changing "significant" IFF_ flags or the 19836 * address/netmask etc. causes a down/up dance, but 19837 * does not cause an unbind (DL_UNBIND) with the driver. 19838 */ 19839 return (ill_dl_up(ill, ipif, mp, q)); 19840 } 19841 19842 /* 19843 * ipif_resolver_up may end up sending an 19844 * AR_INTERFACE_UP message to ARP, which would, in 19845 * turn, send a DLPI message to the driver. ioctls are 19846 * serialized and so we cannot send more than one 19847 * interface up message at a time. If ipif_resolver_up 19848 * does send an interface up message to ARP, we get 19849 * EINPROGRESS and we will complete in ip_arp_done. 19850 */ 19851 19852 ASSERT(connp != NULL || !CONN_Q(q)); 19853 ASSERT(ipsq->ipsq_pending_mp == NULL); 19854 if (connp != NULL) 19855 mutex_enter(&connp->conn_lock); 19856 mutex_enter(&ill->ill_lock); 19857 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19858 mutex_exit(&ill->ill_lock); 19859 if (connp != NULL) 19860 mutex_exit(&connp->conn_lock); 19861 if (!success) 19862 return (EINTR); 19863 19864 /* 19865 * Crank up IPv6 neighbor discovery. 19866 * Unlike ARP, this should complete when 19867 * ipif_ndp_up returns. However, for 19868 * ILLF_XRESOLV interfaces we also send an 19869 * AR_INTERFACE_UP to the external resolver. 19870 * That ioctl will complete in ip_rput. 19871 */ 19872 if (isv6) { 19873 err = ipif_ndp_up(ipif); 19874 if (err != 0) { 19875 if (err != EINPROGRESS) 19876 mp = ipsq_pending_mp_get(ipsq, &connp); 19877 return (err); 19878 } 19879 } 19880 /* Now, ARP */ 19881 err = ipif_resolver_up(ipif, Res_act_initial); 19882 if (err == EINPROGRESS) { 19883 /* We will complete it in ip_arp_done */ 19884 return (err); 19885 } 19886 mp = ipsq_pending_mp_get(ipsq, &connp); 19887 ASSERT(mp != NULL); 19888 if (err != 0) 19889 return (err); 19890 } else { 19891 /* 19892 * Interfaces without underlying hardware don't do duplicate 19893 * address detection. 19894 */ 19895 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 19896 ipif->ipif_addr_ready = 1; 19897 } 19898 return (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); 19899 } 19900 19901 /* 19902 * Perform a bind for the physical device. 19903 * When the routine returns EINPROGRESS then mp has been consumed and 19904 * the ioctl will be acked from ip_rput_dlpi. 19905 * Allocate an unbind message and save it until ipif_down.
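 *
 * The overall flow for the first ipif on an ill is thus (a sketch, in
 * the style of the ioctl flow comment above ipif_down):
 *
 *      ipif_up -> ill_dl_up
 *              DL_BIND_REQ -> driver, return EINPROGRESS
 *      ip_rput_dlpi <- DL_BIND_ACK / DL_ERROR_ACK
 *              completes (acks) the pending ioctl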
19906 */ 19907 static int 19908 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 19909 { 19910 areq_t *areq; 19911 mblk_t *areq_mp = NULL; 19912 mblk_t *bind_mp = NULL; 19913 mblk_t *unbind_mp = NULL; 19914 conn_t *connp; 19915 boolean_t success; 19916 uint16_t sap_addr; 19917 19918 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 19919 ASSERT(IAM_WRITER_ILL(ill)); 19920 ASSERT(mp != NULL); 19921 19922 /* Create a resolver cookie for ARP */ 19923 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 19924 areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); 19925 if (areq_mp == NULL) 19926 return (ENOMEM); 19927 19928 freemsg(ill->ill_resolver_mp); 19929 ill->ill_resolver_mp = areq_mp; 19930 areq = (areq_t *)areq_mp->b_rptr; 19931 sap_addr = ill->ill_sap; 19932 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 19933 } 19934 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 19935 DL_BIND_REQ); 19936 if (bind_mp == NULL) 19937 goto bad; 19938 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 19939 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 19940 19941 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 19942 if (unbind_mp == NULL) 19943 goto bad; 19944 19945 /* 19946 * Record state needed to complete this operation when the 19947 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 19948 */ 19949 ASSERT(WR(q)->q_next == NULL); 19950 connp = Q_TO_CONN(q); 19951 19952 mutex_enter(&connp->conn_lock); 19953 mutex_enter(&ipif->ipif_ill->ill_lock); 19954 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 19955 mutex_exit(&ipif->ipif_ill->ill_lock); 19956 mutex_exit(&connp->conn_lock); 19957 if (!success) 19958 goto bad; 19959 19960 /* 19961 * Save the unbind message for ill_dl_down(); it will be consumed when 19962 * the interface goes down. 19963 */ 19964 ASSERT(ill->ill_unbind_mp == NULL); 19965 ill->ill_unbind_mp = unbind_mp; 19966 19967 ill_dlpi_send(ill, bind_mp); 19968 /* Send down link-layer capabilities probe if not already done. */ 19969 ill_capability_probe(ill); 19970 19971 /* 19972 * Sysid used to rely on the fact that netboots set domainname 19973 * and the like. Now that miniroot boots aren't strictly netboots 19974 * and miniroot network configuration is driven from userland, 19975 * these things still need to be set. This situation can be detected 19976 * by comparing the interface being configured here to the one 19977 * dhcifname was set to reference by the boot loader. Once sysid is 19978 * converted to use dhcp_ipc_getinfo() this call can go away. 19979 */ 19980 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 19981 (strcmp(ill->ill_name, dhcifname) == 0) && 19982 (strlen(srpc_domain) == 0)) { 19983 if (dhcpinit() != 0) 19984 cmn_err(CE_WARN, "no cached dhcp response"); 19985 } 19986 19987 /* 19988 * This operation will complete in ip_rput_dlpi with either 19989 * a DL_BIND_ACK or DL_ERROR_ACK. 19990 */ 19991 return (EINPROGRESS); 19992 bad: 19993 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 19994 /* 19995 * We don't have to check for possible removal from illgrp 19996 * as we have not yet inserted in illgrp. For groups 19997 * without names, this ipif is still not UP and hence 19998 * this could not have possibly had any influence in forming 19999 * groups. 20000 */ 20001 20002 freemsg(bind_mp); 20003 freemsg(unbind_mp); 20004 return (ENOMEM); 20005 } 20006 20007 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 20008 20009 /* 20010 * DLPI and ARP are up.
20011 * Create all the IREs associated with an interface, bring up multicast. 20012 * Set the interface flag and finish other initialization 20013 * that potentially had to be deferred until after DL_BIND_ACK. 20014 */ 20015 int 20016 ipif_up_done(ipif_t *ipif) 20017 { 20018 ire_t *ire_array[20]; 20019 ire_t **irep = ire_array; 20020 ire_t **irep1; 20021 ipaddr_t net_mask = 0; 20022 ipaddr_t subnet_mask, route_mask; 20023 ill_t *ill = ipif->ipif_ill; 20024 queue_t *stq; 20025 ipif_t *src_ipif; 20026 ipif_t *tmp_ipif; 20027 boolean_t flush_ire_cache = B_TRUE; 20028 int err = 0; 20029 phyint_t *phyi; 20030 ire_t **ipif_saved_irep = NULL; 20031 int ipif_saved_ire_cnt; 20032 int cnt; 20033 boolean_t src_ipif_held = B_FALSE; 20034 boolean_t ire_added = B_FALSE; 20035 boolean_t loopback = B_FALSE; 20036 ip_stack_t *ipst = ill->ill_ipst; 20037 20038 ip1dbg(("ipif_up_done(%s:%u)\n", 20039 ipif->ipif_ill->ill_name, ipif->ipif_id)); 20040 /* Check if this is a loopback interface */ 20041 if (ipif->ipif_ill->ill_wq == NULL) 20042 loopback = B_TRUE; 20043 20044 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 20045 /* 20046 * If all other interfaces for this ill are down or DEPRECATED, 20047 * or otherwise unsuitable for source address selection, remove 20048 * any IRE_CACHE entries for this ill to make sure source 20049 * address selection gets to take this new ipif into account. 20050 * No need to hold ill_lock while traversing the ipif list since 20051 * we are writer. 20052 */ 20053 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 20054 tmp_ipif = tmp_ipif->ipif_next) { 20055 if (((tmp_ipif->ipif_flags & 20056 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 20057 !(tmp_ipif->ipif_flags & IPIF_UP)) || 20058 (tmp_ipif == ipif)) 20059 continue; 20060 /* first usable pre-existing interface */ 20061 flush_ire_cache = B_FALSE; 20062 break; 20063 } 20064 if (flush_ire_cache) 20065 ire_walk_ill_v4(MATCH_IRE_ILL_GROUP | MATCH_IRE_TYPE, 20066 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 20067 20068 /* 20069 * Figure out which way the send-to queue should go. Only 20070 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 20071 * should show up here. 20072 */ 20073 switch (ill->ill_net_type) { 20074 case IRE_IF_RESOLVER: 20075 stq = ill->ill_rq; 20076 break; 20077 case IRE_IF_NORESOLVER: 20078 case IRE_LOOPBACK: 20079 stq = ill->ill_wq; 20080 break; 20081 default: 20082 return (EINVAL); 20083 } 20084 20085 if (IS_LOOPBACK(ill)) { 20086 /* 20087 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 20088 * ipif_lookup_on_name(), but in the case of zones we can have 20089 * several loopback addresses on lo0. So all the interfaces with 20090 * loopback addresses need to be marked IRE_LOOPBACK. 20091 */ 20092 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 20093 htonl(INADDR_LOOPBACK)) 20094 ipif->ipif_ire_type = IRE_LOOPBACK; 20095 else 20096 ipif->ipif_ire_type = IRE_LOCAL; 20097 } 20098 20099 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)) { 20100 /* 20101 * Can't use our source address.
Select a different 20102 * source address for the IRE_INTERFACE and IRE_LOCAL. 20103 */ 20104 src_ipif = ipif_select_source(ipif->ipif_ill, 20105 ipif->ipif_subnet, ipif->ipif_zoneid); 20106 if (src_ipif == NULL) 20107 src_ipif = ipif; /* Last resort */ 20108 else 20109 src_ipif_held = B_TRUE; 20110 } else { 20111 src_ipif = ipif; 20112 } 20113 20114 /* Create all the IREs associated with this interface */ 20115 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 20116 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 20117 20118 /* 20119 * If we're on a labeled system then make sure that zone- 20120 * private addresses have proper remote host database entries. 20121 */ 20122 if (is_system_labeled() && 20123 ipif->ipif_ire_type != IRE_LOOPBACK && 20124 !tsol_check_interface_address(ipif)) 20125 return (EINVAL); 20126 20127 /* Register the source address for __sin6_src_id */ 20128 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 20129 ipif->ipif_zoneid, ipst); 20130 if (err != 0) { 20131 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 20132 return (err); 20133 } 20134 20135 /* If the interface address is set, create the local IRE. */ 20136 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 20137 (void *)ipif, 20138 ipif->ipif_ire_type, 20139 ntohl(ipif->ipif_lcl_addr))); 20140 *irep++ = ire_create( 20141 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 20142 (uchar_t *)&ip_g_all_ones, /* mask */ 20143 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 20144 NULL, /* no gateway */ 20145 &ip_loopback_mtuplus, /* max frag size */ 20146 NULL, 20147 ipif->ipif_rq, /* recv-from queue */ 20148 NULL, /* no send-to queue */ 20149 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 20150 ipif, 20151 0, 20152 0, 20153 0, 20154 (ipif->ipif_flags & IPIF_PRIVATE) ? 20155 RTF_PRIVATE : 0, 20156 &ire_uinfo_null, 20157 NULL, 20158 NULL, 20159 ipst); 20160 } else { 20161 ip1dbg(( 20162 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 20163 ipif->ipif_ire_type, 20164 ntohl(ipif->ipif_lcl_addr), 20165 (uint_t)ipif->ipif_flags)); 20166 } 20167 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 20168 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 20169 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 20170 } else { 20171 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 20172 } 20173 20174 subnet_mask = ipif->ipif_net_mask; 20175 20176 /* 20177 * If the mask was not specified, use the natural netmask of 20178 * the interface address. Also, store this mask back into the 20179 * ipif struct. 20180 */ 20181 if (subnet_mask == 0) { 20182 subnet_mask = net_mask; 20183 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 20184 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 20185 ipif->ipif_v6subnet); 20186 } 20187 20188 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate.
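 * For a point-to-point ipif this becomes a host route for the peer
 * (route_mask == IP_HOST_MASK below); otherwise it covers ipif_subnet
 * with the configured netmask, e.g. (hypothetical numbers) a subnet
 * IRE for 10.0.0.0 with mask 255.255.255.0.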
*/ 20189 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 20190 ipif->ipif_subnet != INADDR_ANY) { 20191 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 20192 20193 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 20194 route_mask = IP_HOST_MASK; 20195 } else { 20196 route_mask = subnet_mask; 20197 } 20198 20199 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 20200 "creating if IRE ill_net_type 0x%x for 0x%x\n", 20201 (void *)ipif, (void *)ill, 20202 ill->ill_net_type, 20203 ntohl(ipif->ipif_subnet))); 20204 *irep++ = ire_create( 20205 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 20206 (uchar_t *)&route_mask, /* mask */ 20207 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 20208 NULL, /* no gateway */ 20209 &ipif->ipif_mtu, /* max frag */ 20210 NULL, 20211 NULL, /* no recv queue */ 20212 stq, /* send-to queue */ 20213 ill->ill_net_type, /* IF_[NO]RESOLVER */ 20214 ipif, 20215 0, 20216 0, 20217 0, 20218 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 20219 &ire_uinfo_null, 20220 NULL, 20221 NULL, 20222 ipst); 20223 } 20224 20225 /* 20226 * Create any necessary broadcast IREs. 20227 */ 20228 if (ipif->ipif_flags & IPIF_BROADCAST) 20229 irep = ipif_create_bcast_ires(ipif, irep); 20230 20231 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 20232 20233 /* If an earlier ire_create failed, get out now */ 20234 for (irep1 = irep; irep1 > ire_array; ) { 20235 irep1--; 20236 if (*irep1 == NULL) { 20237 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 20238 err = ENOMEM; 20239 goto bad; 20240 } 20241 } 20242 20243 /* 20244 * Need to atomically run ip_addr_availability_check() 20245 * under ip_addr_avail_lock; if it fails, go to "bad" and remove 20246 * the ipif from the group as well. The ill_g_lock is grabbed as reader 20247 * just to make sure no new ills or new ipifs are being added 20248 * to the system while we are checking the uniqueness of addresses. 20249 */ 20250 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 20251 mutex_enter(&ipst->ips_ip_addr_avail_lock); 20252 /* Mark it up, and increment counters. */ 20253 ipif->ipif_flags |= IPIF_UP; 20254 ill->ill_ipif_up_count++; 20255 err = ip_addr_availability_check(ipif); 20256 mutex_exit(&ipst->ips_ip_addr_avail_lock); 20257 rw_exit(&ipst->ips_ill_g_lock); 20258 20259 if (err != 0) { 20260 /* 20261 * Our address may already be up on the same ill. In this case, 20262 * the ARP entry for our ipif replaced the one for the other 20263 * ipif. So we don't want to delete it (otherwise the other ipif 20264 * would be unable to send packets). 20265 * ip_addr_availability_check() identifies this case for us and 20266 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 20267 * which is the expected error code. 20268 */ 20269 if (err == EADDRINUSE) { 20270 freemsg(ipif->ipif_arp_del_mp); 20271 ipif->ipif_arp_del_mp = NULL; 20272 err = EADDRNOTAVAIL; 20273 } 20274 ill->ill_ipif_up_count--; 20275 ipif->ipif_flags &= ~IPIF_UP; 20276 goto bad; 20277 } 20278 20279 /* 20280 * Add in all newly created IREs. ire_create_bcast() has 20281 * already checked for duplicates of the IRE_BROADCAST type. 20282 * We want to add before we call illgrp_insert, which wants 20283 * to know whether IRE_IF_RESOLVER exists or not. 20284 * 20285 * NOTE : We refrele the ire though we may branch to "bad" 20286 * later on where we do ire_delete. This is okay 20287 * because nobody can delete it as we are running 20288 * exclusively. 
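 *
 * (Sketch of the reference flow for each IRE below, using the names
 * in this function: ire_add(irep1, ...) leaves the IRE held; the
 * ire_refrele(*irep1) loop further down drops that hold on success,
 * and the "bad" path does ire_delete() plus, once ire_added is set,
 * ire_refrele() as well, so no reference leaks either way.)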
20289 */ 20290 for (irep1 = irep; irep1 > ire_array; ) { 20291 irep1--; 20292 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 20293 /* 20294 * refheld by ire_add; refrele'd towards the end of the function 20295 */ 20296 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 20297 } 20298 ire_added = B_TRUE; 20299 /* 20300 * Form groups if possible. 20301 * 20302 * If we are supposed to be in an ill_group with a name, insert it 20303 * now as we know that at least one ipif is UP. Otherwise form 20304 * nameless groups. 20305 * 20306 * If ip_enable_group_ifs is set and ipif address is not 0, insert 20307 * this ipif into the appropriate interface group, or create a 20308 * new one. If this is already in a nameless group, we try to form 20309 * a bigger group looking at other ills potentially sharing this 20310 * ipif's prefix. 20311 */ 20312 phyi = ill->ill_phyint; 20313 if (phyi->phyint_groupname_len != 0) { 20314 ASSERT(phyi->phyint_groupname != NULL); 20315 if (ill->ill_ipif_up_count == 1) { 20316 ASSERT(ill->ill_group == NULL); 20317 err = illgrp_insert(&ipst->ips_illgrp_head_v4, ill, 20318 phyi->phyint_groupname, NULL, B_TRUE); 20319 if (err != 0) { 20320 ip1dbg(("ipif_up_done: illgrp allocation " 20321 "failed, error %d\n", err)); 20322 goto bad; 20323 } 20324 } 20325 ASSERT(ill->ill_group != NULL); 20326 } 20327 20328 /* 20329 * When this ipif is part of a group, we need to make sure that 20330 * any broadcast ires created because of this ipif coming 20331 * UP get marked/cleared with IRE_MARK_NORECV appropriately 20332 * so that we don't receive duplicate broadcast packets. 20333 */ 20334 if (ill->ill_group != NULL && ill->ill_ipif_up_count != 0) 20335 ipif_renominate_bcast(ipif); 20336 20337 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 20338 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 20339 ipif_saved_irep = ipif_recover_ire(ipif); 20340 20341 if (!loopback) { 20342 /* 20343 * If the broadcast address has been set, make sure it makes 20344 * sense based on the interface address. 20345 * Only match on ill since we are sharing broadcast addresses. 20346 */ 20347 if ((ipif->ipif_brd_addr != INADDR_ANY) && 20348 (ipif->ipif_flags & IPIF_BROADCAST)) { 20349 ire_t *ire; 20350 20351 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 20352 IRE_BROADCAST, ipif, ALL_ZONES, 20353 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); 20354 20355 if (ire == NULL) { 20356 /* 20357 * If there isn't a matching broadcast IRE, 20358 * revert to the default for this netmask. 20359 */ 20360 ipif->ipif_v6brd_addr = ipv6_all_zeros; 20361 mutex_enter(&ipif->ipif_ill->ill_lock); 20362 ipif_set_default(ipif); 20363 mutex_exit(&ipif->ipif_ill->ill_lock); 20364 } else { 20365 ire_refrele(ire); 20366 } 20367 } 20368 20369 } 20370 20371 /* This is the first interface on this ill */ 20372 if (ill->ill_ipif_up_count == 1 && !loopback) { 20373 /* 20374 * Need to recover all multicast memberships in the driver. 20375 * This had to be deferred until we had attached. 20376 */ 20377 ill_recover_multicast(ill); 20378 } 20379 /* Join the allhosts multicast address */ 20380 ipif_multicast_up(ipif); 20381 20382 if (!loopback) { 20383 /* 20384 * See whether anybody else would benefit from the 20385 * new ipif that we added. 
We always call this rather 20386 * than only when adding a non-IPIF_NOLOCAL/DEPRECATED/ANYCAST 20387 * ipif, for the benefit of illgrp_insert (done above), 20388 * which does not do source address selection itself as it does 20389 * not want to re-create the interface routes that we hold 20390 * references to here. 20391 */ 20392 ill_update_source_selection(ill); 20393 } 20394 20395 for (irep1 = irep; irep1 > ire_array; ) { 20396 irep1--; 20397 if (*irep1 != NULL) { 20398 /* was held in ire_add */ 20399 ire_refrele(*irep1); 20400 } 20401 } 20402 20403 cnt = ipif_saved_ire_cnt; 20404 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 20405 if (*irep1 != NULL) { 20406 /* was held in ire_add */ 20407 ire_refrele(*irep1); 20408 } 20409 } 20410 20411 if (!loopback && ipif->ipif_addr_ready) { 20412 /* Broadcast an address mask reply. */ 20413 ipif_mask_reply(ipif); 20414 } 20415 if (ipif_saved_irep != NULL) { 20416 kmem_free(ipif_saved_irep, 20417 ipif_saved_ire_cnt * sizeof (ire_t *)); 20418 } 20419 if (src_ipif_held) 20420 ipif_refrele(src_ipif); 20421 20422 /* 20423 * This had to be deferred until we had bound. Tell routing sockets and 20424 * others that this interface is up if it looks like the address has 20425 * been validated. Otherwise, if it isn't ready yet, wait for 20426 * duplicate address detection to do its thing. 20427 */ 20428 if (ipif->ipif_addr_ready) { 20429 ip_rts_ifmsg(ipif); 20430 ip_rts_newaddrmsg(RTM_ADD, 0, ipif); 20431 /* Let SCTP update the status for this ipif */ 20432 sctp_update_ipif(ipif, SCTP_IPIF_UP); 20433 } 20434 return (0); 20435 20436 bad: 20437 ip1dbg(("ipif_up_done: FAILED \n")); 20438 /* 20439 * We don't have to bother removing from ill groups because 20440 * 20441 * 1) For groups with names, we insert only when the first ipif 20442 * comes up. In that case if it fails, it will not be in any 20443 * group. So, we need not try to remove for that case. 20444 * 20445 * 2) For groups without names, either we tried to insert ipif_ill 20446 * in a group as singleton or found some other group to become 20447 * a bigger group. For the former, if it fails, we have nothing 20448 * to do, as ipif_ill is not in the group; for the 20449 * latter, there are no failures in illgrp_insert/illgrp_delete 20450 * (ENOMEM can't occur for this. Check illgrp_insert). 20451 */ 20452 while (irep > ire_array) { 20453 irep--; 20454 if (*irep != NULL) { 20455 ire_delete(*irep); 20456 if (ire_added) 20457 ire_refrele(*irep); 20458 } 20459 } 20460 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 20461 20462 if (ipif_saved_irep != NULL) { 20463 kmem_free(ipif_saved_irep, 20464 ipif_saved_ire_cnt * sizeof (ire_t *)); 20465 } 20466 if (src_ipif_held) 20467 ipif_refrele(src_ipif); 20468 20469 ipif_arp_down(ipif); 20470 return (err); 20471 } 20472 20473 /* 20474 * Turn off ARP, as is done when the ILLF_NOARP flag is set. 20475 */ 20476 static int 20477 ill_arp_off(ill_t *ill) 20478 { 20479 mblk_t *arp_off_mp = NULL; 20480 mblk_t *arp_on_mp = NULL; 20481 20482 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 20483 20484 ASSERT(IAM_WRITER_ILL(ill)); 20485 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20486 20487 /* 20488 * If the "on" message is still around, we've already done 20489 * an arp_off without doing an arp_on, so there is no 20490 * work needed. 
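 *
 * (The off and on messages are allocated as a pair below:
 * ill_arp_off() pre-allocates the AR_INTERFACE_ON mblk and parks it
 * in ill_arp_on_mp precisely so that a later ill_arp_on() cannot
 * fail for lack of memory.)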
20491 */ 20492 if (ill->ill_arp_on_mp != NULL) 20493 return (0); 20494 20495 /* 20496 * Allocate an ARP "on" message (to be saved) and an ARP "off" message. 20497 */ 20498 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 20499 if (!arp_off_mp) 20500 return (ENOMEM); 20501 20502 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 20503 if (!arp_on_mp) 20504 goto failed; 20505 20506 ASSERT(ill->ill_arp_on_mp == NULL); 20507 ill->ill_arp_on_mp = arp_on_mp; 20508 20509 /* Send an AR_INTERFACE_OFF request */ 20510 putnext(ill->ill_rq, arp_off_mp); 20511 return (0); 20512 failed: 20513 20514 if (arp_off_mp) 20515 freemsg(arp_off_mp); 20516 return (ENOMEM); 20517 } 20518 20519 /* 20520 * Turn on ARP by turning off the ILLF_NOARP flag. 20521 */ 20522 static int 20523 ill_arp_on(ill_t *ill) 20524 { 20525 mblk_t *mp; 20526 20527 ip1dbg(("ill_arp_on(%s)\n", ill->ill_name)); 20528 20529 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 20530 20531 ASSERT(IAM_WRITER_ILL(ill)); 20532 /* 20533 * Send an AR_INTERFACE_ON request if we have already done 20534 * an arp_off (which allocated the message). 20535 */ 20536 if (ill->ill_arp_on_mp != NULL) { 20537 mp = ill->ill_arp_on_mp; 20538 ill->ill_arp_on_mp = NULL; 20539 putnext(ill->ill_rq, mp); 20540 } 20541 return (0); 20542 } 20543 20544 /* 20545 * Called either after deleting the ill from the group or when setting 20546 * FAILED or STANDBY on the interface. 20547 */ 20548 static void 20549 illgrp_reset_schednext(ill_t *ill) 20550 { 20551 ill_group_t *illgrp; 20552 ill_t *save_ill; 20553 20554 ASSERT(IAM_WRITER_ILL(ill)); 20555 /* 20556 * When called from illgrp_delete, ill_group will be non-NULL. 20557 * But when called from ip_sioctl_flags, it could be NULL if 20558 * somebody is setting FAILED/INACTIVE on some interface which 20559 * is not part of a group. 20560 */ 20561 illgrp = ill->ill_group; 20562 if (illgrp == NULL) 20563 return; 20564 if (illgrp->illgrp_ill_schednext != ill) 20565 return; 20566 20567 illgrp->illgrp_ill_schednext = NULL; 20568 save_ill = ill; 20569 /* 20570 * Choose a good ill to be the next one for 20571 * outbound traffic. As the FAILED/STANDBY flags are 20572 * not yet marked when called from ip_sioctl_flags, 20573 * we check the ill separately. 20574 */ 20575 for (ill = illgrp->illgrp_ill; ill != NULL; 20576 ill = ill->ill_group_next) { 20577 if ((ill != save_ill) && 20578 !(ill->ill_phyint->phyint_flags & 20579 (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE))) { 20580 illgrp->illgrp_ill_schednext = ill; 20581 return; 20582 } 20583 } 20584 } 20585 20586 /* 20587 * Given an ill, find the next ill in the group to be scheduled. 20588 * (This should be called by ip_newroute() before ire_create().) 20589 * The passed-in ill may be pulled out of the group after we have picked 20590 * up a different outgoing ill from the same group. However, ire_add will 20591 * atomically check this. 20592 */ 20593 ill_t * 20594 illgrp_scheduler(ill_t *ill) 20595 { 20596 ill_t *retill; 20597 ill_group_t *illgrp; 20598 int illcnt; 20599 int i; 20600 uint64_t flags; 20601 ip_stack_t *ipst = ill->ill_ipst; 20602 20603 /* 20604 * We don't use a lock to check for the ill_group. If this ill 20605 * is currently being inserted we may end up just returning this 20606 * ill itself. That is ok. 20607 */ 20608 if (ill->ill_group == NULL) { 20609 ill_refhold(ill); 20610 return (ill); 20611 } 20612 20613 /* 20614 * Grab the ill_g_lock as reader to make sure we are dealing with 20615 * a set of stable ills. 
No ill can be added or deleted or change 20616 * group while we hold the reader lock. 20617 */ 20618 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 20619 if ((illgrp = ill->ill_group) == NULL) { 20620 rw_exit(&ipst->ips_ill_g_lock); 20621 ill_refhold(ill); 20622 return (ill); 20623 } 20624 20625 illcnt = illgrp->illgrp_ill_count; 20626 mutex_enter(&illgrp->illgrp_lock); 20627 retill = illgrp->illgrp_ill_schednext; 20628 20629 if (retill == NULL) 20630 retill = illgrp->illgrp_ill; 20631 20632 /* 20633 * We do a circular search beginning at illgrp_ill_schednext 20634 * or illgrp_ill. We don't check the flags against the ill lock 20635 * since it can change anytime. The ire creation will be atomic 20636 * and will fail if the ill is FAILED or OFFLINE. 20637 */ 20638 for (i = 0; i < illcnt; i++) { 20639 flags = retill->ill_phyint->phyint_flags; 20640 20641 if (!(flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) && 20642 ILL_CAN_LOOKUP(retill)) { 20643 illgrp->illgrp_ill_schednext = retill->ill_group_next; 20644 ill_refhold(retill); 20645 break; 20646 } 20647 retill = retill->ill_group_next; 20648 if (retill == NULL) 20649 retill = illgrp->illgrp_ill; 20650 } 20651 mutex_exit(&illgrp->illgrp_lock); 20652 rw_exit(&ipst->ips_ill_g_lock); 20653 20654 return (i == illcnt ? NULL : retill); 20655 } 20656 20657 /* 20658 * Checks for availability of a usable source address (if there is one) when the 20659 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 20660 * this selection is done regardless of the destination. 20661 */ 20662 boolean_t 20663 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 20664 { 20665 uint_t ifindex; 20666 ipif_t *ipif = NULL; 20667 ill_t *uill; 20668 boolean_t isv6; 20669 ip_stack_t *ipst = ill->ill_ipst; 20670 20671 ASSERT(ill != NULL); 20672 20673 isv6 = ill->ill_isv6; 20674 ifindex = ill->ill_usesrc_ifindex; 20675 if (ifindex != 0) { 20676 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 20677 NULL, ipst); 20678 if (uill == NULL) 20679 return (B_FALSE); 20680 mutex_enter(&uill->ill_lock); 20681 for (ipif = uill->ill_ipif; ipif != NULL; 20682 ipif = ipif->ipif_next) { 20683 if (!IPIF_CAN_LOOKUP(ipif)) 20684 continue; 20685 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20686 continue; 20687 if (!(ipif->ipif_flags & IPIF_UP)) 20688 continue; 20689 if (ipif->ipif_zoneid != zoneid) 20690 continue; 20691 if ((isv6 && 20692 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 20693 (ipif->ipif_lcl_addr == INADDR_ANY)) 20694 continue; 20695 mutex_exit(&uill->ill_lock); 20696 ill_refrele(uill); 20697 return (B_TRUE); 20698 } 20699 mutex_exit(&uill->ill_lock); 20700 ill_refrele(uill); 20701 } 20702 return (B_FALSE); 20703 } 20704 20705 /* 20706 * Determine the best source address given a destination address and an ill. 20707 * Prefers non-deprecated over deprecated but will return a deprecated 20708 * address if there is no other choice. If there is a usable source address 20709 * on the interface pointed to by ill_usesrc_ifindex then that is given 20710 * first preference. 20711 * 20712 * Returns NULL if there is no suitable source address for the ill, 20713 * i.e., when the ill has no valid source address at all. 
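 *
 * For illustration (addresses hypothetical): with dst 10.1.2.3 and
 * candidate sources 10.1.2.4/24 (non-deprecated, same subnet),
 * 10.9.9.9/24 (non-deprecated, off subnet) and 10.1.2.5/24
 * (deprecated, same subnet), the non-deprecated same-subnet ipif is
 * chosen; the deprecated one is used only when no non-deprecated
 * candidate exists at all. The returned ipif is refheld, so a caller
 * typically does something like:
 *
 *	if ((sipif = ipif_select_source(ill, dst, zoneid)) != NULL) {
 *		src = sipif->ipif_src_addr;
 *		ipif_refrele(sipif);
 *	}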
20714 */ 20715 ipif_t * 20716 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 20717 { 20718 ipif_t *ipif; 20719 ipif_t *ipif_dep = NULL; /* Fallback to deprecated */ 20720 ipif_t *ipif_arr[MAX_IPIF_SELECT_SOURCE]; 20721 int index = 0; 20722 boolean_t wrapped = B_FALSE; 20723 boolean_t same_subnet_only = B_FALSE; 20724 boolean_t ipif_same_found, ipif_other_found; 20725 boolean_t specific_found; 20726 ill_t *till, *usill = NULL; 20727 tsol_tpc_t *src_rhtp, *dst_rhtp; 20728 ip_stack_t *ipst = ill->ill_ipst; 20729 20730 if (ill->ill_usesrc_ifindex != 0) { 20731 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 20732 B_FALSE, NULL, NULL, NULL, NULL, ipst); 20733 if (usill != NULL) 20734 ill = usill; /* Select source from usesrc ILL */ 20735 else 20736 return (NULL); 20737 } 20738 20739 /* 20740 * If we're dealing with an unlabeled destination on a labeled system, 20741 * make sure that we ignore source addresses that are incompatible with 20742 * the destination's default label. That destination's default label 20743 * must dominate the minimum label on the source address. 20744 */ 20745 dst_rhtp = NULL; 20746 if (is_system_labeled()) { 20747 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 20748 if (dst_rhtp == NULL) 20749 return (NULL); 20750 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 20751 TPC_RELE(dst_rhtp); 20752 dst_rhtp = NULL; 20753 } 20754 } 20755 20756 /* 20757 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 20758 * can be deleted. But an ipif/ill can get CONDEMNED any time. 20759 * After selecting the right ipif, under ill_lock make sure ipif is 20760 * not condemned, and increment refcnt. If ipif is CONDEMNED, 20761 * we retry. Inside the loop we still need to check for CONDEMNED, 20762 * but not under a lock. 20763 */ 20764 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 20765 20766 retry: 20767 till = ill; 20768 ipif_arr[0] = NULL; 20769 20770 if (till->ill_group != NULL) 20771 till = till->ill_group->illgrp_ill; 20772 20773 /* 20774 * Choose one good source address from each ill across the group. 20775 * If possible choose a source address in the same subnet as 20776 * the destination address. 20777 * 20778 * We don't check for PHYI_FAILED or PHYI_INACTIVE or PHYI_OFFLINE. 20779 * This is okay because of the following. 20780 * 20781 * If PHYI_FAILED is set and we still have non-deprecated 20782 * addresses, it means the addresses have not yet been 20783 * failed over to a different interface. We potentially 20784 * select them to create IRE_CACHEs, which will be later 20785 * flushed when the addresses move over. 20786 * 20787 * If PHYI_INACTIVE is set and we still have non-deprecated 20788 * addresses, it means either the user has configured them 20789 * or PHYI_INACTIVE has not been cleared after the addresses 20790 * have been moved over. For the former, in.mpathd does a failover 20791 * when the interface becomes INACTIVE and hence we should 20792 * not find them. Once INACTIVE is set, we don't allow them 20793 * to create logical interfaces anymore. For the latter, a 20794 * flush will happen when INACTIVE is cleared which will 20795 * flush the IRE_CACHEs. 20796 * 20797 * If PHYI_OFFLINE is set, all the addresses will be failed 20798 * over soon. We potentially select them to create IRE_CACHEs, 20799 * which will be later flushed when the addresses move over. 
20800 * 20801 * NOTE : As ipif_select_source is called to borrow a source address 20802 * for an ipif that is part of a group, source address selection 20803 * will be re-done whenever the group changes, i.e., on an 20804 * insertion into or a deletion from the group. 20805 * 20806 * Fill ipif_arr[] with source addresses, using these rules: 20807 * 20808 * 1. At most one source address from a given ill ends up 20809 * in ipif_arr[] -- that is, at most one of the ipifs 20810 * associated with a given ill ends up in ipif_arr[]. 20811 * 20812 * 2. If there is at least one non-deprecated ipif in the 20813 * IPMP group with a source address on the same subnet as 20814 * our destination, then fill ipif_arr[] only with 20815 * source addresses on the same subnet as our destination. 20816 * Note that because of (1), only the first 20817 * non-deprecated ipif found with a source address 20818 * matching the destination ends up in ipif_arr[]. 20819 * 20820 * 3. Otherwise, fill ipif_arr[] with non-deprecated source 20821 * addresses not in the same subnet as our destination. 20822 * Again, because of (1), only the first off-subnet source 20823 * address will be chosen. 20824 * 20825 * 4. If there are no non-deprecated ipifs, then just use 20826 * the source address associated with the last deprecated 20827 * one we find that happens to be on the same subnet, 20828 * otherwise the first one not in the same subnet. 20829 */ 20830 specific_found = B_FALSE; 20831 for (; till != NULL; till = till->ill_group_next) { 20832 ipif_same_found = B_FALSE; 20833 ipif_other_found = B_FALSE; 20834 for (ipif = till->ill_ipif; ipif != NULL; 20835 ipif = ipif->ipif_next) { 20836 if (!IPIF_CAN_LOOKUP(ipif)) 20837 continue; 20838 /* Always skip NOLOCAL and ANYCAST interfaces */ 20839 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 20840 continue; 20841 if (!(ipif->ipif_flags & IPIF_UP) || 20842 !ipif->ipif_addr_ready) 20843 continue; 20844 if (ipif->ipif_zoneid != zoneid && 20845 ipif->ipif_zoneid != ALL_ZONES) 20846 continue; 20847 /* 20848 * Interfaces with 0.0.0.0 address are allowed to be UP, 20849 * but are not valid as source addresses. 20850 */ 20851 if (ipif->ipif_lcl_addr == INADDR_ANY) 20852 continue; 20853 20854 /* 20855 * Check compatibility of local address for 20856 * destination's default label if we're on a labeled 20857 * system. Incompatible addresses can't be used at 20858 * all. 20859 */ 20860 if (dst_rhtp != NULL) { 20861 boolean_t incompat; 20862 20863 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 20864 IPV4_VERSION, B_FALSE); 20865 if (src_rhtp == NULL) 20866 continue; 20867 incompat = 20868 src_rhtp->tpc_tp.host_type != SUN_CIPSO || 20869 src_rhtp->tpc_tp.tp_doi != 20870 dst_rhtp->tpc_tp.tp_doi || 20871 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 20872 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 20873 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 20874 src_rhtp->tpc_tp.tp_sl_set_cipso)); 20875 TPC_RELE(src_rhtp); 20876 if (incompat) 20877 continue; 20878 } 20879 20880 /* 20881 * We prefer not to use all-zones addresses if we 20882 * can avoid it, as they pose problems with unlabeled 20883 * destinations. 
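 *
 * (Concretely: once a zone-specific candidate compatible with the
 * current subnet preference has been seen, specific_found below makes
 * the scan pass over any remaining ALL_ZONES candidates.)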
20884 */ 20885 if (ipif->ipif_zoneid != ALL_ZONES) { 20886 if (!specific_found && 20887 (!same_subnet_only || 20888 (ipif->ipif_net_mask & dst) == 20889 ipif->ipif_subnet)) { 20890 index = 0; 20891 specific_found = B_TRUE; 20892 ipif_other_found = B_FALSE; 20893 } 20894 } else { 20895 if (specific_found) 20896 continue; 20897 } 20898 if (ipif->ipif_flags & IPIF_DEPRECATED) { 20899 if (ipif_dep == NULL || 20900 (ipif->ipif_net_mask & dst) == 20901 ipif->ipif_subnet) 20902 ipif_dep = ipif; 20903 continue; 20904 } 20905 if ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet) { 20906 /* found a source address in the same subnet */ 20907 if (!same_subnet_only) { 20908 same_subnet_only = B_TRUE; 20909 index = 0; 20910 } 20911 ipif_same_found = B_TRUE; 20912 } else { 20913 if (same_subnet_only || ipif_other_found) 20914 continue; 20915 ipif_other_found = B_TRUE; 20916 } 20917 ipif_arr[index++] = ipif; 20918 if (index == MAX_IPIF_SELECT_SOURCE) { 20919 wrapped = B_TRUE; 20920 index = 0; 20921 } 20922 if (ipif_same_found) 20923 break; 20924 } 20925 } 20926 20927 if (ipif_arr[0] == NULL) { 20928 ipif = ipif_dep; 20929 } else { 20930 if (wrapped) 20931 index = MAX_IPIF_SELECT_SOURCE; 20932 ipif = ipif_arr[ipif_rand(ipst) % index]; 20933 ASSERT(ipif != NULL); 20934 } 20935 20936 if (ipif != NULL) { 20937 mutex_enter(&ipif->ipif_ill->ill_lock); 20938 if (!IPIF_CAN_LOOKUP(ipif)) { 20939 mutex_exit(&ipif->ipif_ill->ill_lock); 20940 goto retry; 20941 } 20942 ipif_refhold_locked(ipif); 20943 mutex_exit(&ipif->ipif_ill->ill_lock); 20944 } 20945 20946 rw_exit(&ipst->ips_ill_g_lock); 20947 if (usill != NULL) 20948 ill_refrele(usill); 20949 if (dst_rhtp != NULL) 20950 TPC_RELE(dst_rhtp); 20951 20952 #ifdef DEBUG 20953 if (ipif == NULL) { 20954 char buf1[INET6_ADDRSTRLEN]; 20955 20956 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 20957 ill->ill_name, 20958 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 20959 } else { 20960 char buf1[INET6_ADDRSTRLEN]; 20961 char buf2[INET6_ADDRSTRLEN]; 20962 20963 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 20964 ipif->ipif_ill->ill_name, 20965 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 20966 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 20967 buf2, sizeof (buf2)))); 20968 } 20969 #endif /* DEBUG */ 20970 return (ipif); 20971 } 20972 20973 20974 /* 20975 * If old_ipif is not NULL, see if ipif was derived from old 20976 * ipif and if so, recreate the interface route by re-doing 20977 * source address selection. This happens when ipif_down -> 20978 * ipif_update_other_ipifs calls us. 20979 * 20980 * If old_ipif is NULL, just redo the source address selection 20981 * if needed. This happens when illgrp_insert or ipif_up_done 20982 * calls us. 20983 */ 20984 static void 20985 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 20986 { 20987 ire_t *ire; 20988 ire_t *ipif_ire; 20989 queue_t *stq; 20990 ipif_t *nipif; 20991 ill_t *ill; 20992 boolean_t need_rele = B_FALSE; 20993 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 20994 20995 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 20996 ASSERT(IAM_WRITER_IPIF(ipif)); 20997 20998 ill = ipif->ipif_ill; 20999 if (!(ipif->ipif_flags & 21000 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 21001 /* 21002 * Can't possibly have borrowed the source 21003 * from old_ipif. 21004 */ 21005 return; 21006 } 21007 21008 /* 21009 * Is there any work to be done? 
No work if the address 21010 * is INADDR_ANY, loopback, or NOLOCAL or ANYCAST 21011 * (ipif_select_source() does not borrow addresses from 21012 * NOLOCAL and ANYCAST interfaces). 21013 */ 21014 if ((old_ipif != NULL) && 21015 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 21016 (old_ipif->ipif_ill->ill_wq == NULL) || 21017 (old_ipif->ipif_flags & 21018 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 21019 return; 21020 } 21021 21022 /* 21023 * Perform the same checks as when creating the 21024 * IRE_INTERFACE in ipif_up_done. 21025 */ 21026 if (!(ipif->ipif_flags & IPIF_UP)) 21027 return; 21028 21029 if ((ipif->ipif_flags & IPIF_NOXMIT) || 21030 (ipif->ipif_subnet == INADDR_ANY)) 21031 return; 21032 21033 ipif_ire = ipif_to_ire(ipif); 21034 if (ipif_ire == NULL) 21035 return; 21036 21037 /* 21038 * We know that ipif uses some other source for its 21039 * IRE_INTERFACE. Is it using the source of this 21040 * old_ipif? 21041 */ 21042 if (old_ipif != NULL && 21043 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 21044 ire_refrele(ipif_ire); 21045 return; 21046 } 21047 if (ip_debug > 2) { 21048 /* ip1dbg */ 21049 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 21050 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 21051 } 21052 21053 stq = ipif_ire->ire_stq; 21054 21055 /* 21056 * Can't use our source address. Select a different 21057 * source address for the IRE_INTERFACE. 21058 */ 21059 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 21060 if (nipif == NULL) { 21061 /* Last resort - all ipifs have IPIF_NOLOCAL */ 21062 nipif = ipif; 21063 } else { 21064 need_rele = B_TRUE; 21065 } 21066 21067 ire = ire_create( 21068 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 21069 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 21070 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 21071 NULL, /* no gateway */ 21072 &ipif->ipif_mtu, /* max frag */ 21073 NULL, /* no src nce */ 21074 NULL, /* no recv from queue */ 21075 stq, /* send-to queue */ 21076 ill->ill_net_type, /* IF_[NO]RESOLVER */ 21077 ipif, 21078 0, 21079 0, 21080 0, 21081 0, 21082 &ire_uinfo_null, 21083 NULL, 21084 NULL, 21085 ipst); 21086 21087 if (ire != NULL) { 21088 ire_t *ret_ire; 21089 int error; 21090 21091 /* 21092 * We don't need ipif_ire anymore. We need to delete 21093 * before we add so that ire_add does not detect 21094 * duplicates. 21095 */ 21096 ire_delete(ipif_ire); 21097 ret_ire = ire; 21098 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); 21099 ASSERT(error == 0); 21100 ASSERT(ire == ret_ire); 21101 /* Held in ire_add */ 21102 ire_refrele(ret_ire); 21103 } 21104 /* 21105 * Either we are falling through from above or could not 21106 * allocate a replacement. 21107 */ 21108 ire_refrele(ipif_ire); 21109 if (need_rele) 21110 ipif_refrele(nipif); 21111 } 21112 21113 /* 21114 * This old_ipif is going away. 21115 * 21116 * Determine if any other ipifs are using our address as 21117 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 21118 * IPIF_DEPRECATED). 21119 * Find the IRE_INTERFACE for such ipifs and recreate them 21120 * to use a different source address following the rules in 21121 * ipif_up_done. 21122 * 21123 * This function takes an illgrp as an argument so that illgrp_delete 21124 * can call this to update source addresses even after deleting the 21125 * old_ipif->ipif_ill from the ill group. 
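 *
 * For example (addresses hypothetical): if old_ipif 10.1.2.3 had lent
 * its address to a NOLOCAL ipif's IRE_INTERFACE, that IRE's
 * ire_src_addr equals old_ipif->ipif_lcl_addr, so
 * ipif_recreate_interface_routes() deletes the IRE and re-adds it
 * with a source freshly chosen by ipif_select_source().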
21126 */ 21127 static void 21128 ipif_update_other_ipifs(ipif_t *old_ipif, ill_group_t *illgrp) 21129 { 21130 ipif_t *ipif; 21131 ill_t *ill; 21132 char buf[INET6_ADDRSTRLEN]; 21133 21134 ASSERT(IAM_WRITER_IPIF(old_ipif)); 21135 ASSERT(illgrp == NULL || IAM_WRITER_IPIF(old_ipif)); 21136 21137 ill = old_ipif->ipif_ill; 21138 21139 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", 21140 ill->ill_name, 21141 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, 21142 buf, sizeof (buf)))); 21143 /* 21144 * If this is part of a group, look at all ills, as ipif_select_source 21145 * borrows a source address across all the ills in the group. 21146 */ 21147 if (illgrp != NULL) 21148 ill = illgrp->illgrp_ill; 21149 21150 for (; ill != NULL; ill = ill->ill_group_next) { 21151 for (ipif = ill->ill_ipif; ipif != NULL; 21152 ipif = ipif->ipif_next) { 21153 21154 if (ipif == old_ipif) 21155 continue; 21156 21157 ipif_recreate_interface_routes(old_ipif, ipif); 21158 } 21159 } 21160 } 21161 21162 /* ARGSUSED */ 21163 int 21164 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21165 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21166 { 21167 /* 21168 * ill_phyint_reinit merged the v4 and v6 into a single 21169 * ipsq. Could also have become part of an IPMP group in the 21170 * process, and we might not have been able to complete the 21171 * operation in ipif_set_values, if we could not become 21172 * exclusive. If so, restart it here. 21173 */ 21174 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21175 } 21176 21177 21178 /* 21179 * Can operate on either a module or a driver queue. 21180 * Returns an error if not a module queue. 21181 */ 21182 /* ARGSUSED */ 21183 int 21184 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21185 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21186 { 21187 queue_t *q1 = q; 21188 char *cp; 21189 char interf_name[LIFNAMSIZ]; 21190 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 21191 21192 if (q->q_next == NULL) { 21193 ip1dbg(( 21194 "if_unitsel: IF_UNITSEL: no q_next\n")); 21195 return (EINVAL); 21196 } 21197 21198 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 21199 return (EALREADY); 21200 21201 do { 21202 q1 = q1->q_next; 21203 } while (q1->q_next); 21204 cp = q1->q_qinfo->qi_minfo->mi_idname; 21205 (void) sprintf(interf_name, "%s%d", cp, ppa); 21206 21207 /* 21208 * Here we are not going to delay the ioack until after 21209 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 21210 * original ioctl message before sending the requests. 21211 */ 21212 return (ipif_set_values(q, mp, interf_name, &ppa)); 21213 } 21214 21215 /* ARGSUSED */ 21216 int 21217 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 21218 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 21219 { 21220 return (ENXIO); 21221 } 21222 21223 /* 21224 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 21225 * `irep'. Returns a pointer to the next free `irep' entry (just like 21226 * ire_check_and_create_bcast()). 
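 *
 * A worked example (hypothetical configuration): for an ipif with
 * address 10.1.2.3 and subnet mask 255.255.255.0, the class-derived
 * netmask is 255.0.0.0, which is shorter than the subnet mask, so the
 * candidates are 0.0.0.0, 255.255.255.255, the net pair 10.0.0.0 and
 * 10.255.255.255, and the subnet pair 10.1.2.0 and 10.1.2.255; six in
 * all, subject to the duplicate suppression that
 * ire_check_and_create_bcast() performs.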
21227 */ 21228 static ire_t ** 21229 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 21230 { 21231 ipaddr_t addr; 21232 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 21233 ipaddr_t subnetmask = ipif->ipif_net_mask; 21234 int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; 21235 21236 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 21237 21238 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 21239 21240 if (ipif->ipif_lcl_addr == INADDR_ANY || 21241 (ipif->ipif_flags & IPIF_NOLOCAL)) 21242 netmask = htonl(IN_CLASSA_NET); /* fallback */ 21243 21244 irep = ire_check_and_create_bcast(ipif, 0, irep, flags); 21245 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags); 21246 21247 /* 21248 * For backward compatibility, we create net broadcast IREs based on 21249 * the old "IP address class system", since some old machines only 21250 * respond to these class-derived net broadcasts. However, we must not 21251 * create these net broadcast IREs if the subnetmask is shorter than 21252 * the netmask derived from the IP address class. Otherwise, we may 21253 * create a net broadcast address which is the same as an IP address 21254 * on the subnet -- and then TCP will refuse to talk to that address. 21255 */ 21256 if (netmask < subnetmask) { 21257 addr = netmask & ipif->ipif_subnet; 21258 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 21259 irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep, 21260 flags); 21261 } 21262 21263 /* 21264 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 21265 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 21266 * created. Creating these broadcast IREs will only create confusion 21267 * as `addr' will be the same as the IP address. 21268 */ 21269 if (subnetmask != 0xFFFFFFFF) { 21270 addr = ipif->ipif_subnet; 21271 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 21272 irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr, 21273 irep, flags); 21274 } 21275 21276 return (irep); 21277 } 21278 21279 /* 21280 * Broadcast IRE info structure used in the functions below. Since we 21281 * allocate BCAST_COUNT of them on the stack, keep the bit layout compact. 21282 */ 21283 typedef struct bcast_ireinfo { 21284 uchar_t bi_type; /* BCAST_* value from below */ 21285 uchar_t bi_willdie:1, /* will this IRE be going away? */ 21286 bi_needrep:1, /* do we need to replace it? */ 21287 bi_haverep:1, /* have we replaced it? */ 21288 bi_pad:5; 21289 ipaddr_t bi_addr; /* IRE address */ 21290 ipif_t *bi_backup; /* last-ditch ipif to replace it on */ 21291 } bcast_ireinfo_t; 21292 21293 enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT }; 21294 21295 /* 21296 * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and 21297 * return B_TRUE if it should immediately be used to recreate the IRE. 
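 *
 * For example (a sketch): a dying BCAST_SUBNET IRE for 10.1.2.0 only
 * concerns an ipif whose ipif_subnet is also 10.1.2.0, and a
 * deprecated ipif that needs the IRE is remembered in bi_backup
 * rather than used immediately, in the hope that a non-deprecated
 * ipif turns up later in the walk.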
21298 */ 21299 static boolean_t 21300 ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop) 21301 { 21302 ipaddr_t addr; 21303 21304 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie); 21305 21306 switch (bireinfop->bi_type) { 21307 case BCAST_NET: 21308 addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet); 21309 if (addr != bireinfop->bi_addr) 21310 return (B_FALSE); 21311 break; 21312 case BCAST_SUBNET: 21313 if (ipif->ipif_subnet != bireinfop->bi_addr) 21314 return (B_FALSE); 21315 break; 21316 } 21317 21318 bireinfop->bi_needrep = 1; 21319 if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) { 21320 if (bireinfop->bi_backup == NULL) 21321 bireinfop->bi_backup = ipif; 21322 return (B_FALSE); 21323 } 21324 return (B_TRUE); 21325 } 21326 21327 /* 21328 * Create the broadcast IREs described by `bireinfop' on `ipif', and return 21329 * them just like ire_check_and_create_bcast(). 21330 */ 21331 static ire_t ** 21332 ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep) 21333 { 21334 ipaddr_t mask, addr; 21335 21336 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep); 21337 21338 addr = bireinfop->bi_addr; 21339 irep = ire_create_bcast(ipif, addr, irep); 21340 21341 switch (bireinfop->bi_type) { 21342 case BCAST_NET: 21343 mask = ip_net_mask(ipif->ipif_subnet); 21344 irep = ire_create_bcast(ipif, addr | ~mask, irep); 21345 break; 21346 case BCAST_SUBNET: 21347 mask = ipif->ipif_net_mask; 21348 irep = ire_create_bcast(ipif, addr | ~mask, irep); 21349 break; 21350 } 21351 21352 bireinfop->bi_haverep = 1; 21353 return (irep); 21354 } 21355 21356 /* 21357 * Walk through all of the ipifs on `ill' that will be affected by `test_ipif' 21358 * going away, and determine if any of the broadcast IREs (named by `bireinfo') 21359 * that are going away are still needed. If so, have ipif_create_bcast() 21360 * recreate them (except for the deprecated case, as explained below). 21361 */ 21362 static ire_t ** 21363 ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo, 21364 ire_t **irep) 21365 { 21366 int i; 21367 ipif_t *ipif; 21368 21369 ASSERT(!ill->ill_isv6); 21370 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 21371 /* 21372 * Skip this ipif if it's (a) the one being taken down, (b) 21373 * not in the same zone, or (c) has no valid local address. 21374 */ 21375 if (ipif == test_ipif || 21376 ipif->ipif_zoneid != test_ipif->ipif_zoneid || 21377 ipif->ipif_subnet == 0 || 21378 (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) != 21379 (IPIF_UP|IPIF_BROADCAST)) 21380 continue; 21381 21382 /* 21383 * For each dying IRE that hasn't yet been replaced, see if 21384 * `ipif' needs it and whether the IRE should be recreated on 21385 * `ipif'. If `ipif' is deprecated, ipif_consider_bcast() 21386 * will return B_FALSE even if `ipif' needs the IRE, in the 21387 * hope that we'll later find a needy non-deprecated ipif. 21388 * However, the ipif is recorded in bi_backup for possible 21389 * subsequent use by ipif_check_bcast_ires(). 21390 */ 21391 for (i = 0; i < BCAST_COUNT; i++) { 21392 if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep) 21393 continue; 21394 if (!ipif_consider_bcast(ipif, &bireinfo[i])) 21395 continue; 21396 irep = ipif_create_bcast(ipif, &bireinfo[i], irep); 21397 } 21398 21399 /* 21400 * If we've replaced all of the broadcast IREs that are going 21401 * to be taken down, we know we're done. 
21402 */ 21403 for (i = 0; i < BCAST_COUNT; i++) { 21404 if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep) 21405 break; 21406 } 21407 if (i == BCAST_COUNT) 21408 break; 21409 } 21410 return (irep); 21411 } 21412 21413 /* 21414 * Check if `test_ipif' (which is going away) is associated with any existing 21415 * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were 21416 * using those broadcast IREs. If so, recreate the broadcast IREs on one or 21417 * more of those other ipifs. (The old IREs will be deleted in ipif_down().) 21418 * 21419 * This is necessary because broadcast IREs are shared. In particular, a 21420 * given ill has one set of all-zeroes and all-ones broadcast IREs (for every 21421 * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones, 21422 * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP 21423 * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the 21424 * same zone, they will share the same set of broadcast IREs. 21425 * 21426 * Note: the upper bound of 12 IREs comes from the worst case of replacing all 21427 * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes, 21428 * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones). 21429 */ 21430 static void 21431 ipif_check_bcast_ires(ipif_t *test_ipif) 21432 { 21433 ill_t *ill = test_ipif->ipif_ill; 21434 ire_t *ire, *ire_array[12]; /* see note above */ 21435 ire_t **irep1, **irep = &ire_array[0]; 21436 uint_t i, willdie; 21437 ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet); 21438 bcast_ireinfo_t bireinfo[BCAST_COUNT]; 21439 21440 ASSERT(!test_ipif->ipif_isv6); 21441 ASSERT(IAM_WRITER_IPIF(test_ipif)); 21442 21443 /* 21444 * No broadcast IREs for the LOOPBACK interface 21445 * or others such as point to point and IPIF_NOXMIT. 21446 */ 21447 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || 21448 (test_ipif->ipif_flags & IPIF_NOXMIT)) 21449 return; 21450 21451 bzero(bireinfo, sizeof (bireinfo)); 21452 bireinfo[0].bi_type = BCAST_ALLZEROES; 21453 bireinfo[0].bi_addr = 0; 21454 21455 bireinfo[1].bi_type = BCAST_ALLONES; 21456 bireinfo[1].bi_addr = INADDR_BROADCAST; 21457 21458 bireinfo[2].bi_type = BCAST_NET; 21459 bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask; 21460 21461 if (test_ipif->ipif_net_mask != 0) 21462 mask = test_ipif->ipif_net_mask; 21463 bireinfo[3].bi_type = BCAST_SUBNET; 21464 bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask; 21465 21466 /* 21467 * Figure out what (if any) broadcast IREs will die as a result of 21468 * `test_ipif' going away. If none will die, we're done. 21469 */ 21470 for (i = 0, willdie = 0; i < BCAST_COUNT; i++) { 21471 ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST, 21472 test_ipif, ALL_ZONES, NULL, 21473 (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst); 21474 if (ire != NULL) { 21475 willdie++; 21476 bireinfo[i].bi_willdie = 1; 21477 ire_refrele(ire); 21478 } 21479 } 21480 21481 if (willdie == 0) 21482 return; 21483 21484 /* 21485 * Walk through all the ipifs that will be affected by the dying IREs, 21486 * and recreate the IREs as necessary. 21487 */ 21488 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 21489 21490 /* 21491 * Scan through the set of broadcast IREs and see if there are any 21492 * that we need to replace that have not yet been replaced. If so, 21493 * replace them using the appropriate backup ipif. 
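 *
 * (This is where bi_backup pays off: if the walk above found only
 * deprecated ipifs needing a dying IRE, one of them was recorded
 * rather than used, and it is pressed into service here so that the
 * broadcast IRE is not lost entirely.)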
21494 */ 21495 for (i = 0; i < BCAST_COUNT; i++) { 21496 if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep) 21497 irep = ipif_create_bcast(bireinfo[i].bi_backup, 21498 &bireinfo[i], irep); 21499 } 21500 21501 /* 21502 * If we can't create all of them, don't add any of them. (Code in 21503 * ip_wput_ire() and ire_to_ill() assumes that we always have a 21504 * non-loopback copy and loopback copy for a given address.) 21505 */ 21506 for (irep1 = irep; irep1 > ire_array; ) { 21507 irep1--; 21508 if (*irep1 == NULL) { 21509 ip0dbg(("ipif_check_bcast_ires: can't create " 21510 "IRE_BROADCAST, memory allocation failure\n")); 21511 while (irep > ire_array) { 21512 irep--; 21513 if (*irep != NULL) 21514 ire_delete(*irep); 21515 } 21516 return; 21517 } 21518 } 21519 21520 for (irep1 = irep; irep1 > ire_array; ) { 21521 irep1--; 21522 if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0) 21523 ire_refrele(*irep1); /* Held in ire_add */ 21524 } 21525 } 21526 21527 /* 21528 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 21529 * from lifr_flags and the name from lifr_name. 21530 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 21531 * since ipif_lookup_on_name uses the _isv6 flags when matching. 21532 * Returns EINPROGRESS when mp has been consumed by queueing it on 21533 * ill_pending_mp and the ioctl will complete in ip_rput. 21534 * 21535 * Can operate on either a module or a driver queue. 21536 * Returns an error if not a module queue. 21537 */ 21538 /* ARGSUSED */ 21539 int 21540 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21541 ip_ioctl_cmd_t *ipip, void *if_req) 21542 { 21543 ill_t *ill = q->q_ptr; 21544 phyint_t *phyi; 21545 ip_stack_t *ipst; 21546 struct lifreq *lifr = if_req; 21547 21548 ASSERT(ipif != NULL); 21549 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 21550 21551 if (q->q_next == NULL) { 21552 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 21553 return (EINVAL); 21554 } 21555 21556 /* 21557 * If we are not writer on 'q' then this interface exists already 21558 * and previous lookups (ip_extract_lifreq()) found this ipif -- 21559 * so return EALREADY. 21560 */ 21561 if (ill != ipif->ipif_ill) 21562 return (EALREADY); 21563 21564 if (ill->ill_name[0] != '\0') 21565 return (EALREADY); 21566 21567 /* 21568 * Set all the flags. Allows all kinds of override. Provide some 21569 * sanity checking by not allowing IFF_BROADCAST and IFF_MULTICAST 21570 * unless there is either multicast/broadcast support in the driver 21571 * or it is a pt-pt link. 21572 */ 21573 if (lifr->lifr_flags & (IFF_PROMISC|IFF_ALLMULTI)) { 21574 /* Meaningless to IP thus don't allow them to be set. */ 21575 ip1dbg(("ip_setname: EINVAL 1\n")); 21576 return (EINVAL); 21577 } 21578 21579 /* 21580 * If there's another ill already with the requested name, ensure 21581 * that it's of the same type. Otherwise, ill_phyint_reinit() will 21582 * fuse together two unrelated ills, which will cause chaos. 
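 *
 * (For instance, trying to plumb IPv6 "foo0" over a different DLPI
 * mac type than an already-plumbed IPv4 "foo0" is rejected below by
 * the ip_m_mac_type comparison; the interface name is hypothetical.)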
21583 */ 21584 ipst = ill->ill_ipst; 21585 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 21586 lifr->lifr_name, NULL); 21587 if (phyi != NULL) { 21588 ill_t *ill_mate = phyi->phyint_illv4; 21589 21590 if (ill_mate == NULL) 21591 ill_mate = phyi->phyint_illv6; 21592 ASSERT(ill_mate != NULL); 21593 21594 if (ill_mate->ill_media->ip_m_mac_type != 21595 ill->ill_media->ip_m_mac_type) { 21596 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to " 21597 "use the same ill name on differing media\n")); 21598 return (EINVAL); 21599 } 21600 } 21601 21602 /* 21603 * For a DL_STYLE2 driver (ill_needs_attach), we would not have the 21604 * ill_bcast_addr_length info. 21605 */ 21606 if (!ill->ill_needs_attach && 21607 ((lifr->lifr_flags & IFF_MULTICAST) && 21608 !(lifr->lifr_flags & IFF_POINTOPOINT) && 21609 ill->ill_bcast_addr_length == 0)) { 21610 /* Link not broadcast/pt-pt capable i.e. no multicast */ 21611 ip1dbg(("ip_setname: EINVAL 2\n")); 21612 return (EINVAL); 21613 } 21614 if ((lifr->lifr_flags & IFF_BROADCAST) && 21615 ((lifr->lifr_flags & IFF_IPV6) || 21616 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 21617 /* Link not broadcast capable or IPv6 i.e. no broadcast */ 21618 ip1dbg(("ip_setname: EINVAL 3\n")); 21619 return (EINVAL); 21620 } 21621 if (lifr->lifr_flags & IFF_UP) { 21622 /* Can only be set with SIOCSLIFFLAGS */ 21623 ip1dbg(("ip_setname: EINVAL 4\n")); 21624 return (EINVAL); 21625 } 21626 if ((lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV6 && 21627 (lifr->lifr_flags & (IFF_IPV6|IFF_IPV4)) != IFF_IPV4) { 21628 ip1dbg(("ip_setname: EINVAL 5\n")); 21629 return (EINVAL); 21630 } 21631 /* 21632 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 21633 */ 21634 if ((lifr->lifr_flags & IFF_XRESOLV) && 21635 !(lifr->lifr_flags & IFF_IPV6) && 21636 !(ipif->ipif_isv6)) { 21637 ip1dbg(("ip_setname: EINVAL 6\n")); 21638 return (EINVAL); 21639 } 21640 21641 /* 21642 * The user has done SIOCGLIFFLAGS prior to this ioctl and hence 21643 * we have all the flags here. So we assign rather than OR. 21644 * We can't OR the flags here because we don't want to set 21645 * both IFF_IPV4 and IFF_IPV6. We start off as IFF_IPV4 in 21646 * ipif_allocate and become IFF_IPV4 or IFF_IPV6 here depending 21647 * on the lifr_flags value. 21648 */ 21649 /* 21650 * This ill has not been inserted into the global list. 21651 * So we are still single-threaded and don't need any locks. 21652 */ 21653 ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS & ~IFF_DUPLICATE; 21654 ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; 21655 ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; 21656 21657 /* We started off as V4. */ 21658 if (ill->ill_flags & ILLF_IPV6) { 21659 ill->ill_phyint->phyint_illv6 = ill; 21660 ill->ill_phyint->phyint_illv4 = NULL; 21661 } 21662 21663 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 21664 } 21665 21666 /* ARGSUSED */ 21667 int 21668 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21669 ip_ioctl_cmd_t *ipip, void *if_req) 21670 { 21671 /* 21672 * ill_phyint_reinit merged the v4 and v6 into a single 21673 * ipsq. Could also have become part of an IPMP group in the 21674 * process, and we might not have been able to complete the 21675 * slifname in ipif_set_values, if we could not become 21676 * exclusive. 
If so, restart it here. 21677 */ 21678 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 21679 } 21680 21681 /* 21682 * Return a pointer to the ipif which matches the index, IP version type and 21683 * zoneid. 21684 */ 21685 ipif_t * 21686 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 21687 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) 21688 { 21689 ill_t *ill; 21690 ipif_t *ipif = NULL; 21691 21692 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 21693 (q != NULL && mp != NULL && func != NULL && err != NULL)); 21694 21695 if (err != NULL) 21696 *err = 0; 21697 21698 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 21699 if (ill != NULL) { 21700 mutex_enter(&ill->ill_lock); 21701 for (ipif = ill->ill_ipif; ipif != NULL; 21702 ipif = ipif->ipif_next) { 21703 if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES || 21704 zoneid == ipif->ipif_zoneid || 21705 ipif->ipif_zoneid == ALL_ZONES)) { 21706 ipif_refhold_locked(ipif); 21707 break; 21708 } 21709 } 21710 mutex_exit(&ill->ill_lock); 21711 ill_refrele(ill); 21712 if (ipif == NULL && err != NULL) 21713 *err = ENXIO; 21714 } 21715 return (ipif); 21716 } 21717 21718 typedef struct conn_change_s { 21719 uint_t cc_old_ifindex; 21720 uint_t cc_new_ifindex; 21721 } conn_change_t; 21722 21723 /* 21724 * ipcl_walk function for changing interface index. 21725 */ 21726 static void 21727 conn_change_ifindex(conn_t *connp, caddr_t arg) 21728 { 21729 conn_change_t *connc; 21730 uint_t old_ifindex; 21731 uint_t new_ifindex; 21732 int i; 21733 ilg_t *ilg; 21734 21735 connc = (conn_change_t *)arg; 21736 old_ifindex = connc->cc_old_ifindex; 21737 new_ifindex = connc->cc_new_ifindex; 21738 21739 if (connp->conn_orig_bound_ifindex == old_ifindex) 21740 connp->conn_orig_bound_ifindex = new_ifindex; 21741 21742 if (connp->conn_orig_multicast_ifindex == old_ifindex) 21743 connp->conn_orig_multicast_ifindex = new_ifindex; 21744 21745 for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { 21746 ilg = &connp->conn_ilg[i]; 21747 if (ilg->ilg_orig_ifindex == old_ifindex) 21748 ilg->ilg_orig_ifindex = new_ifindex; 21749 } 21750 } 21751 21752 /* 21753 * Walk all the ipifs and ilms on this ill and change the orig_ifindex 21754 * to the new index if it matches the old index. 21755 * 21756 * Failovers typically happen within a group of ills. But somebody 21757 * can remove an ill from the group after a failover happened. If 21758 * we are setting the ifindex after this, we potentially need to 21759 * look at all the ills rather than just the ones in the group. 21760 * We cut down the work by looking at matching ill_net_types 21761 * and ill_types, as dissimilar ills could not possibly have been grouped together. 
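 *
 * (A minimal sketch of the intended effect, indices hypothetical:
 * after this runs with cc_old_ifindex = 2 and cc_new_ifindex = 5,
 * every ipif_orig_ifindex and ilm_orig_ifindex that read 2 on a
 * compatible ill reads 5 afterwards; conn_change_ifindex() makes the
 * matching change to per-conn state via ipcl_walk().)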
21762 */ 21763 static void 21764 ip_change_ifindex(ill_t *ill_orig, conn_change_t *connc) 21765 { 21766 ill_t *ill; 21767 ipif_t *ipif; 21768 uint_t old_ifindex; 21769 uint_t new_ifindex; 21770 ilm_t *ilm; 21771 ill_walk_context_t ctx; 21772 ip_stack_t *ipst = ill_orig->ill_ipst; 21773 21774 old_ifindex = connc->cc_old_ifindex; 21775 new_ifindex = connc->cc_new_ifindex; 21776 21777 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 21778 ill = ILL_START_WALK_ALL(&ctx, ipst); 21779 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 21780 if ((ill_orig->ill_net_type != ill->ill_net_type) || 21781 (ill_orig->ill_type != ill->ill_type)) { 21782 continue; 21783 } 21784 for (ipif = ill->ill_ipif; ipif != NULL; 21785 ipif = ipif->ipif_next) { 21786 if (ipif->ipif_orig_ifindex == old_ifindex) 21787 ipif->ipif_orig_ifindex = new_ifindex; 21788 } 21789 for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { 21790 if (ilm->ilm_orig_ifindex == old_ifindex) 21791 ilm->ilm_orig_ifindex = new_ifindex; 21792 } 21793 } 21794 rw_exit(&ipst->ips_ill_g_lock); 21795 } 21796 21797 /* 21798 * We first need to ensure that the new index is unique, and 21799 * then carry the change across both v4 and v6 ill representation 21800 * of the physical interface. 21801 */ 21802 /* ARGSUSED */ 21803 int 21804 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21805 ip_ioctl_cmd_t *ipip, void *ifreq) 21806 { 21807 ill_t *ill; 21808 ill_t *ill_other; 21809 phyint_t *phyi; 21810 int old_index; 21811 conn_change_t connc; 21812 struct ifreq *ifr = (struct ifreq *)ifreq; 21813 struct lifreq *lifr = (struct lifreq *)ifreq; 21814 uint_t index; 21815 ill_t *ill_v4; 21816 ill_t *ill_v6; 21817 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 21818 21819 if (ipip->ipi_cmd_type == IF_CMD) 21820 index = ifr->ifr_index; 21821 else 21822 index = lifr->lifr_index; 21823 21824 /* 21825 * Only allow on physical interface. Also, index zero is illegal. 21826 * 21827 * Need to check for PHYI_FAILED and PHYI_INACTIVE 21828 * 21829 * 1) If PHYI_FAILED is set, a failover could have happened which 21830 * implies a possible failback might have to happen. As failback 21831 * depends on the old index, we should fail setting the index. 21832 * 21833 * 2) If PHYI_INACTIVE is set, in.mpathd does a failover so that 21834 * any addresses or multicast memberships are failed over to 21835 * a non-STANDBY interface. As failback depends on the old 21836 * index, we should fail setting the index for this case also. 21837 * 21838 * 3) If PHYI_OFFLINE is set, a possible failover has happened. 21839 * Be consistent with PHYI_FAILED and fail the ioctl. 21840 */ 21841 ill = ipif->ipif_ill; 21842 phyi = ill->ill_phyint; 21843 if ((phyi->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE|PHYI_OFFLINE)) || 21844 ipif->ipif_id != 0 || index == 0) { 21845 return (EINVAL); 21846 } 21847 old_index = phyi->phyint_ifindex; 21848 21849 /* If the index is not changing, no work to do */ 21850 if (old_index == index) 21851 return (0); 21852 21853 /* 21854 * Use ill_lookup_on_ifindex to determine if the 21855 * new index is unused and if so allow the change. 21856 */ 21857 ill_v6 = ill_lookup_on_ifindex(index, B_TRUE, NULL, NULL, NULL, NULL, 21858 ipst); 21859 ill_v4 = ill_lookup_on_ifindex(index, B_FALSE, NULL, NULL, NULL, NULL, 21860 ipst); 21861 if (ill_v6 != NULL || ill_v4 != NULL) { 21862 if (ill_v4 != NULL) 21863 ill_refrele(ill_v4); 21864 if (ill_v6 != NULL) 21865 ill_refrele(ill_v6); 21866 return (EBUSY); 21867 } 21868 21869 /* 21870 * The new index is unused. 
Set it in the phyint. 21871 * Locate the other ill so that we can send a routing 21872 * sockets message. 21873 */ 21874 if (ill->ill_isv6) { 21875 ill_other = phyi->phyint_illv4; 21876 } else { 21877 ill_other = phyi->phyint_illv6; 21878 } 21879 21880 phyi->phyint_ifindex = index; 21881 21882 /* Update SCTP's ILL list */ 21883 sctp_ill_reindex(ill, old_index); 21884 21885 connc.cc_old_ifindex = old_index; 21886 connc.cc_new_ifindex = index; 21887 ip_change_ifindex(ill, &connc); 21888 ipcl_walk(conn_change_ifindex, (caddr_t)&connc, ipst); 21889 21890 /* Send the routing sockets message */ 21891 ip_rts_ifmsg(ipif); 21892 if (ill_other != NULL) 21893 ip_rts_ifmsg(ill_other->ill_ipif); 21894 21895 return (0); 21896 } 21897 21898 /* ARGSUSED */ 21899 int 21900 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21901 ip_ioctl_cmd_t *ipip, void *ifreq) 21902 { 21903 struct ifreq *ifr = (struct ifreq *)ifreq; 21904 struct lifreq *lifr = (struct lifreq *)ifreq; 21905 21906 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 21907 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21908 /* Get the interface index */ 21909 if (ipip->ipi_cmd_type == IF_CMD) { 21910 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21911 } else { 21912 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 21913 } 21914 return (0); 21915 } 21916 21917 /* ARGSUSED */ 21918 int 21919 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21920 ip_ioctl_cmd_t *ipip, void *ifreq) 21921 { 21922 struct lifreq *lifr = (struct lifreq *)ifreq; 21923 21924 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 21925 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 21926 /* Get the interface zone */ 21927 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21928 lifr->lifr_zoneid = ipif->ipif_zoneid; 21929 return (0); 21930 } 21931 21932 /* 21933 * Set the zoneid of an interface. 21934 */ 21935 /* ARGSUSED */ 21936 int 21937 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 21938 ip_ioctl_cmd_t *ipip, void *ifreq) 21939 { 21940 struct lifreq *lifr = (struct lifreq *)ifreq; 21941 int err = 0; 21942 boolean_t need_up = B_FALSE; 21943 zone_t *zptr; 21944 zone_status_t status; 21945 zoneid_t zoneid; 21946 21947 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 21948 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 21949 if (!is_system_labeled()) 21950 return (ENOTSUP); 21951 zoneid = GLOBAL_ZONEID; 21952 } 21953 21954 /* cannot assign instance zero to a non-global zone */ 21955 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 21956 return (ENOTSUP); 21957 21958 /* 21959 * Cannot assign to a zone that doesn't exist or is shutting down. In 21960 * the event of a race with the zone shutdown processing, since IP 21961 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 21962 * interface will be cleaned up even if the zone is shut down 21963 * immediately after the status check. If the interface can't be brought 21964 * down right away, and the zone is shut down before the restart 21965 * function is called, we resolve the possible races by rechecking the 21966 * zone status in the restart function. 
21967 */ 21968 if ((zptr = zone_find_by_id(zoneid)) == NULL) 21969 return (EINVAL); 21970 status = zone_status_get(zptr); 21971 zone_rele(zptr); 21972 21973 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 21974 return (EINVAL); 21975 21976 if (ipif->ipif_flags & IPIF_UP) { 21977 /* 21978 * If the interface is already marked up, 21979 * we call ipif_down which will take care 21980 * of ditching any IREs that have been set 21981 * up based on the old interface address. 21982 */ 21983 err = ipif_logical_down(ipif, q, mp); 21984 if (err == EINPROGRESS) 21985 return (err); 21986 ipif_down_tail(ipif); 21987 need_up = B_TRUE; 21988 } 21989 21990 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 21991 return (err); 21992 } 21993 21994 static int 21995 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 21996 queue_t *q, mblk_t *mp, boolean_t need_up) 21997 { 21998 int err = 0; 21999 ip_stack_t *ipst; 22000 22001 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 22002 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22003 22004 if (CONN_Q(q)) 22005 ipst = CONNQ_TO_IPST(q); 22006 else 22007 ipst = ILLQ_TO_IPST(q); 22008 22009 /* 22010 * For exclusive stacks we don't allow a different zoneid than 22011 * global. 22012 */ 22013 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 22014 zoneid != GLOBAL_ZONEID) 22015 return (EINVAL); 22016 22017 /* Set the new zone id. */ 22018 ipif->ipif_zoneid = zoneid; 22019 22020 /* Update sctp list */ 22021 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 22022 22023 if (need_up) { 22024 /* 22025 * Now bring the interface back up. If this 22026 * is the only IPIF for the ILL, ipif_up 22027 * will have to re-bind to the device, so 22028 * we may get back EINPROGRESS, in which 22029 * case, this IOCTL will get completed in 22030 * ip_rput_dlpi when we see the DL_BIND_ACK. 22031 */ 22032 err = ipif_up(ipif, q, mp); 22033 } 22034 return (err); 22035 } 22036 22037 /* ARGSUSED */ 22038 int 22039 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 22040 ip_ioctl_cmd_t *ipip, void *if_req) 22041 { 22042 struct lifreq *lifr = (struct lifreq *)if_req; 22043 zoneid_t zoneid; 22044 zone_t *zptr; 22045 zone_status_t status; 22046 22047 ASSERT(ipif->ipif_id != 0); 22048 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 22049 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 22050 zoneid = GLOBAL_ZONEID; 22051 22052 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 22053 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 22054 22055 /* 22056 * We recheck the zone status to resolve the following race condition: 22057 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 22058 * 2) hme0:1 is up and can't be brought down right away; 22059 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 22060 * 3) zone "myzone" is halted; the zone status switches to 22061 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 22062 * the interfaces to remove - hme0:1 is not returned because it's not 22063 * yet in "myzone", so it won't be removed; 22064 * 4) the restart function for SIOCSLIFZONE is called; without the 22065 * status check here, we would have hme0:1 in "myzone" after it's been 22066 * destroyed. 22067 * Note that if the status check fails, we need to bring the interface 22068 * back to its state prior to ip_sioctl_slifzone(), hence the call to 22069 * ipif_up_done[_v6](). 
22070  */
22071 	status = ZONE_IS_UNINITIALIZED;
22072 	if ((zptr = zone_find_by_id(zoneid)) != NULL) {
22073 		status = zone_status_get(zptr);
22074 		zone_rele(zptr);
22075 	}
22076 	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
22077 		if (ipif->ipif_isv6) {
22078 			(void) ipif_up_done_v6(ipif);
22079 		} else {
22080 			(void) ipif_up_done(ipif);
22081 		}
22082 		return (EINVAL);
22083 	}
22084 
22085 	ipif_down_tail(ipif);
22086 
22087 	return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
22088 	    B_TRUE));
22089 }
22090 
22091 /* ARGSUSED */
22092 int
22093 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
22094     ip_ioctl_cmd_t *ipip, void *ifreq)
22095 {
22096 	struct lifreq *lifr = ifreq;
22097 
22098 	ASSERT(q->q_next == NULL);
22099 	ASSERT(CONN_Q(q));
22100 
22101 	ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
22102 	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
22103 	lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
22104 	ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
22105 
22106 	return (0);
22107 }
22108 
22109 
22110 /* Find the previous ILL in this usesrc group */
22111 static ill_t *
22112 ill_prev_usesrc(ill_t *uill)
22113 {
22114 	ill_t *ill;
22115 
22116 	for (ill = uill->ill_usesrc_grp_next;
22117 	    ASSERT(ill), ill->ill_usesrc_grp_next != uill;
22118 	    ill = ill->ill_usesrc_grp_next)
22119 		/* do nothing */;
22120 	return (ill);
22121 }
22122 
22123 /*
22124  * Release all members of the usesrc group. This routine is called
22125  * from ill_delete when the interface being unplumbed is the
22126  * group head.
22127  */
22128 static void
22129 ill_disband_usesrc_group(ill_t *uill)
22130 {
22131 	ill_t *next_ill, *tmp_ill;
22132 	ip_stack_t *ipst = uill->ill_ipst;
22133 
22134 	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
22135 	next_ill = uill->ill_usesrc_grp_next;
22136 
22137 	do {
22138 		ASSERT(next_ill != NULL);
22139 		tmp_ill = next_ill->ill_usesrc_grp_next;
22140 		ASSERT(tmp_ill != NULL);
22141 		next_ill->ill_usesrc_grp_next = NULL;
22142 		next_ill->ill_usesrc_ifindex = 0;
22143 		next_ill = tmp_ill;
22144 	} while (next_ill->ill_usesrc_ifindex != 0);
22145 	uill->ill_usesrc_grp_next = NULL;
22146 }
22147 
22148 /*
22149  * Remove the client usesrc ILL from the list and relink it to a new list.
22150  */
22151 int
22152 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
22153 {
22154 	ill_t *ill, *tmp_ill;
22155 	ip_stack_t *ipst = ucill->ill_ipst;
22156 
22157 	ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
22158 	    (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
22159 
22160 	/*
22161 	 * Check that the usesrc client ILL passed in is not already in
22162 	 * use as a usesrc ILL, i.e., one whose source address is in use,
22163 	 * and that the usesrc ILL is not already in use as a usesrc
22164 	 * client ILL.
22165 	 */
22166 	if ((ucill->ill_usesrc_ifindex == 0) ||
22167 	    (uill->ill_usesrc_ifindex != 0)) {
22168 		return (-1);
22169 	}
22170 
22171 	ill = ill_prev_usesrc(ucill);
22172 	ASSERT(ill->ill_usesrc_grp_next != NULL);
22173 
22174 	/* Remove from the current list */
22175 	if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
22176 		/* Only two elements in the list */
22177 		ASSERT(ill->ill_usesrc_ifindex == 0);
22178 		ill->ill_usesrc_grp_next = NULL;
22179 	} else {
22180 		ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
22181 	}
22182 
22183 	if (ifindex == 0) {
22184 		ucill->ill_usesrc_ifindex = 0;
22185 		ucill->ill_usesrc_grp_next = NULL;
22186 		return (0);
22187 	}
22188 
22189 	ucill->ill_usesrc_ifindex = ifindex;
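	/*
	 * The usesrc group is a circular singly-linked list threaded
	 * through ill_usesrc_grp_next, with the usesrc ILL itself
	 * (ill_usesrc_ifindex == 0) as the head. The client is inserted
	 * immediately after the head; e.g., with head H and existing
	 * clients C1 and C2:
	 *
	 *	before:	H -> C1 -> C2 -> H
	 *	after:	H -> ucill -> C1 -> C2 -> H
	 *
	 * The (tmp_ill != NULL) test below covers the case where the
	 * group was just the head, yielding the cycle H -> ucill -> H.
	 */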
22190 	tmp_ill = uill->ill_usesrc_grp_next;
22191 	uill->ill_usesrc_grp_next = ucill;
22192 	ucill->ill_usesrc_grp_next =
22193 	    (tmp_ill != NULL) ? tmp_ill : uill;
22194 	return (0);
22195 }
22196 
22197 /*
22198  * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
22199  * ip.c for locking details.
22200  */
22201 /* ARGSUSED */
22202 int
22203 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
22204     ip_ioctl_cmd_t *ipip, void *ifreq)
22205 {
22206 	struct lifreq *lifr = (struct lifreq *)ifreq;
22207 	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
22208 	    ill_flag_changed = B_FALSE;
22209 	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
22210 	int err = 0, ret;
22211 	uint_t ifindex;
22212 	phyint_t *us_phyint, *us_cli_phyint;
22213 	ipsq_t *ipsq = NULL;
22214 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
22215 
22216 	ASSERT(IAM_WRITER_IPIF(ipif));
22217 	ASSERT(q->q_next == NULL);
22218 	ASSERT(CONN_Q(q));
22219 
22220 	isv6 = (Q_TO_CONN(q))->conn_af_isv6;
22221 	us_cli_phyint = usesrc_cli_ill->ill_phyint;
22222 
22223 	ASSERT(us_cli_phyint != NULL);
22224 
22225 	/*
22226 	 * If the client ILL is being used for IPMP, abort.
22227 	 * Note, this can be done before ipsq_try_enter since we are already
22228 	 * exclusive on this ILL.
22229 	 */
22230 	if ((us_cli_phyint->phyint_groupname != NULL) ||
22231 	    (us_cli_phyint->phyint_flags & PHYI_STANDBY)) {
22232 		return (EINVAL);
22233 	}
22234 
22235 	ifindex = lifr->lifr_index;
22236 	if (ifindex == 0) {
22237 		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
22238 			/* non usesrc group interface, nothing to reset */
22239 			return (0);
22240 		}
22241 		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
22242 		/* valid reset request */
22243 		reset_flg = B_TRUE;
22244 	}
22245 
22246 	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
22247 	    ip_process_ioctl, &err, ipst);
22248 
22249 	if (usesrc_ill == NULL) {
22250 		return (err);
22251 	}
22252 
22253 	/*
22254 	 * The usesrc_cli_ill or the usesrc_ill cannot be part of an IPMP
22255 	 * group nor can either of the interfaces be used for standby. So
22256 	 * to guarantee mutual exclusion with ip_sioctl_flags (which sets
22257 	 * PHYI_STANDBY) and ip_sioctl_groupname (which sets the groupname)
22258 	 * we need to be exclusive on the ipsq belonging to the usesrc_ill.
22259 	 * We are already exclusive on this ipsq, i.e., the ipsq corresponding
22260 	 * to the usesrc_cli_ill.
22261 	 */
22262 	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
22263 	    NEW_OP, B_TRUE);
22264 	if (ipsq == NULL) {
22265 		err = EINPROGRESS;
22266 		/* Operation enqueued on the ipsq of the usesrc ILL */
22267 		goto done;
22268 	}
22269 
22270 	/* Check if the usesrc_ill is used for IPMP */
22271 	us_phyint = usesrc_ill->ill_phyint;
22272 	if ((us_phyint->phyint_groupname != NULL) ||
22273 	    (us_phyint->phyint_flags & PHYI_STANDBY)) {
22274 		err = EINVAL;
22275 		goto done;
22276 	}
22277 
22278 	/*
22279 	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
22280 	 * already a client then return EINVAL
22281 	 */
22282 	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
22283 		err = EINVAL;
22284 		goto done;
22285 	}
22286 
22287 	/*
22288 	 * If the ill_usesrc_ifindex field is already set to what it needs to
22289 	 * be then this is a duplicate operation.
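 *
 * For reference, a typical userland request driving this function is
 * equivalent to "ifconfig vni0 usesrc hme0" (names hypothetical) and
 * might be issued as:
 *
 *	struct lifreq lifr;
 *	(void) strlcpy(lifr.lifr_name, "vni0", sizeof (lifr.lifr_name));
 *	lifr.lifr_index = if_nametoindex("hme0");
 *	(void) ioctl(s, SIOCSLIFUSESRC, (caddr_t)&lifr);
 *
 * with lifr_index == 0 meaning a reset request, as handled above.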
22290  */
22291 	if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
22292 		err = 0;
22293 		goto done;
22294 	}
22295 
22296 	ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
22297 	    " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
22298 	    usesrc_ill->ill_isv6));
22299 
22300 	/*
22301 	 * The next step ensures that no new ires will be created referencing
22302 	 * the client ill, until the ILL_CHANGING flag is cleared. Then
22303 	 * we go through an ire walk deleting all ire caches that reference
22304 	 * the client ill. New ires referencing the client ill that are added
22305 	 * to the ire table before the ILL_CHANGING flag is set, will be
22306 	 * cleaned up by the ire walk below. Attempts to add new ires
22307 	 * referencing the client ill while the ILL_CHANGING flag is set will
22308 	 * fail during the ire_add in ire_atomic_start. ire_atomic_start
22309 	 * atomically checks (under the ill_g_usesrc_lock) that the ire being
22310 	 * added is not stale, i.e., that the ire_stq and ire_ipif are
22311 	 * consistent and belong to the same usesrc group.
22312 	 */
22313 	mutex_enter(&usesrc_cli_ill->ill_lock);
22314 	usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
22315 	mutex_exit(&usesrc_cli_ill->ill_lock);
22316 	ill_flag_changed = B_TRUE;
22317 
22318 	if (ipif->ipif_isv6)
22319 		ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22320 		    ALL_ZONES, ipst);
22321 	else
22322 		ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
22323 		    ALL_ZONES, ipst);
22324 
22325 	/*
22326 	 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
22327 	 * and the ill_usesrc_ifindex fields
22328 	 */
22329 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
22330 
22331 	if (reset_flg) {
22332 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
22333 		if (ret != 0) {
22334 			err = EINVAL;
22335 		}
22336 		rw_exit(&ipst->ips_ill_g_usesrc_lock);
22337 		goto done;
22338 	}
22339 
22340 	/*
22341 	 * Four possibilities to consider:
22342 	 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
22343 	 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
22344 	 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
22345 	 * 4. Both are part of their respective usesrc groups
22346 	 */
22347 	if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
22348 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
22349 		ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
22350 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22351 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22352 		usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
22353 	} else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
22354 	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
22355 		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
22356 		/* Insert at head of list */
22357 		usesrc_cli_ill->ill_usesrc_grp_next =
22358 		    usesrc_ill->ill_usesrc_grp_next;
22359 		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
22360 	} else {
22361 		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
22362 		    ifindex);
22363 		if (ret != 0)
22364 			err = EINVAL;
22365 	}
22366 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
22367 
22368 done:
22369 	if (ill_flag_changed) {
22370 		mutex_enter(&usesrc_cli_ill->ill_lock);
22371 		usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
22372 		mutex_exit(&usesrc_cli_ill->ill_lock);
22373 	}
22374 	if (ipsq != NULL)
22375 		ipsq_exit(ipsq, B_TRUE, B_TRUE);
22376 	/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
22377 	ill_refrele(usesrc_ill);
22378 	return (err);
22379 }
22380 
22381 /*
22382  * comparison function used by avl.
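 * It follows the avl_create() comparator contract of returning exactly
 * -1, 0 or 1, ordering phyints by ascending ifindex. A lookup passes a
 * pointer to the index being sought, as done elsewhere in this file:
 *
 *	(void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
 *	    &phyi->phyint_ifindex, &where);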
22383 */ 22384 static int 22385 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 22386 { 22387 22388 uint_t index; 22389 22390 ASSERT(phyip != NULL && index_ptr != NULL); 22391 22392 index = *((uint_t *)index_ptr); 22393 /* 22394 * let the phyint with the lowest index be on top. 22395 */ 22396 if (((phyint_t *)phyip)->phyint_ifindex < index) 22397 return (1); 22398 if (((phyint_t *)phyip)->phyint_ifindex > index) 22399 return (-1); 22400 return (0); 22401 } 22402 22403 /* 22404 * comparison function used by avl. 22405 */ 22406 static int 22407 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 22408 { 22409 ill_t *ill; 22410 int res = 0; 22411 22412 ASSERT(phyip != NULL && name_ptr != NULL); 22413 22414 if (((phyint_t *)phyip)->phyint_illv4) 22415 ill = ((phyint_t *)phyip)->phyint_illv4; 22416 else 22417 ill = ((phyint_t *)phyip)->phyint_illv6; 22418 ASSERT(ill != NULL); 22419 22420 res = strcmp(ill->ill_name, (char *)name_ptr); 22421 if (res > 0) 22422 return (1); 22423 else if (res < 0) 22424 return (-1); 22425 return (0); 22426 } 22427 /* 22428 * This function is called from ill_delete when the ill is being 22429 * unplumbed. We remove the reference from the phyint and we also 22430 * free the phyint when there are no more references to it. 22431 */ 22432 static void 22433 ill_phyint_free(ill_t *ill) 22434 { 22435 phyint_t *phyi; 22436 phyint_t *next_phyint; 22437 ipsq_t *cur_ipsq; 22438 ip_stack_t *ipst = ill->ill_ipst; 22439 22440 ASSERT(ill->ill_phyint != NULL); 22441 22442 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 22443 phyi = ill->ill_phyint; 22444 ill->ill_phyint = NULL; 22445 /* 22446 * ill_init allocates a phyint always to store the copy 22447 * of flags relevant to phyint. At that point in time, we could 22448 * not assign the name and hence phyint_illv4/v6 could not be 22449 * initialized. Later in ipif_set_values, we assign the name to 22450 * the ill, at which point in time we assign phyint_illv4/v6. 22451 * Thus we don't rely on phyint_illv6 to be initialized always. 22452 */ 22453 if (ill->ill_flags & ILLF_IPV6) { 22454 phyi->phyint_illv6 = NULL; 22455 } else { 22456 phyi->phyint_illv4 = NULL; 22457 } 22458 /* 22459 * ipif_down removes it from the group when the last ipif goes 22460 * down. 22461 */ 22462 ASSERT(ill->ill_group == NULL); 22463 22464 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) 22465 return; 22466 22467 /* 22468 * Make sure this phyint was put in the list. 22469 */ 22470 if (phyi->phyint_ifindex > 0) { 22471 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 22472 phyi); 22473 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 22474 phyi); 22475 } 22476 /* 22477 * remove phyint from the ipsq list. 
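 * The list is singly linked through phyint_ipsq_next, so there are two
 * cases below: the phyint is at the head of ipsq_phyint_list and the
 * head pointer is simply advanced, or it is in the interior and we
 * walk to its predecessor and splice it out:
 *
 *	head:		ipsq_phyint_list = phyi->phyint_ipsq_next;
 *	interior:	prev->phyint_ipsq_next = phyi->phyint_ipsq_next;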
22478 */ 22479 cur_ipsq = phyi->phyint_ipsq; 22480 if (phyi == cur_ipsq->ipsq_phyint_list) { 22481 cur_ipsq->ipsq_phyint_list = phyi->phyint_ipsq_next; 22482 } else { 22483 next_phyint = cur_ipsq->ipsq_phyint_list; 22484 while (next_phyint != NULL) { 22485 if (next_phyint->phyint_ipsq_next == phyi) { 22486 next_phyint->phyint_ipsq_next = 22487 phyi->phyint_ipsq_next; 22488 break; 22489 } 22490 next_phyint = next_phyint->phyint_ipsq_next; 22491 } 22492 ASSERT(next_phyint != NULL); 22493 } 22494 IPSQ_DEC_REF(cur_ipsq, ipst); 22495 22496 if (phyi->phyint_groupname_len != 0) { 22497 ASSERT(phyi->phyint_groupname != NULL); 22498 mi_free(phyi->phyint_groupname); 22499 } 22500 mi_free(phyi); 22501 } 22502 22503 /* 22504 * Attach the ill to the phyint structure which can be shared by both 22505 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 22506 * function is called from ipif_set_values and ill_lookup_on_name (for 22507 * loopback) where we know the name of the ill. We lookup the ill and if 22508 * there is one present already with the name use that phyint. Otherwise 22509 * reuse the one allocated by ill_init. 22510 */ 22511 static void 22512 ill_phyint_reinit(ill_t *ill) 22513 { 22514 boolean_t isv6 = ill->ill_isv6; 22515 phyint_t *phyi_old; 22516 phyint_t *phyi; 22517 avl_index_t where = 0; 22518 ill_t *ill_other = NULL; 22519 ipsq_t *ipsq; 22520 ip_stack_t *ipst = ill->ill_ipst; 22521 22522 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 22523 22524 phyi_old = ill->ill_phyint; 22525 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 22526 phyi_old->phyint_illv6 == NULL)); 22527 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 22528 phyi_old->phyint_illv4 == NULL)); 22529 ASSERT(phyi_old->phyint_ifindex == 0); 22530 22531 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 22532 ill->ill_name, &where); 22533 22534 /* 22535 * 1. We grabbed the ill_g_lock before inserting this ill into 22536 * the global list of ills. So no other thread could have located 22537 * this ill and hence the ipsq of this ill is guaranteed to be empty. 22538 * 2. Now locate the other protocol instance of this ill. 22539 * 3. Now grab both ill locks in the right order, and the phyint lock of 22540 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 22541 * of neither ill can change. 22542 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 22543 * other ill. 22544 * 5. Release all locks. 22545 */ 22546 22547 /* 22548 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 22549 * we are initializing IPv4. 22550 */ 22551 if (phyi != NULL) { 22552 ill_other = (isv6) ? phyi->phyint_illv4 : 22553 phyi->phyint_illv6; 22554 ASSERT(ill_other->ill_phyint != NULL); 22555 ASSERT((isv6 && !ill_other->ill_isv6) || 22556 (!isv6 && ill_other->ill_isv6)); 22557 GRAB_ILL_LOCKS(ill, ill_other); 22558 /* 22559 * We are potentially throwing away phyint_flags which 22560 * could be different from the one that we obtain from 22561 * ill_other->ill_phyint. But it is okay as we are assuming 22562 * that the state maintained within IP is correct. 22563 */ 22564 mutex_enter(&phyi->phyint_lock); 22565 if (isv6) { 22566 ASSERT(phyi->phyint_illv6 == NULL); 22567 phyi->phyint_illv6 = ill; 22568 } else { 22569 ASSERT(phyi->phyint_illv4 == NULL); 22570 phyi->phyint_illv4 = ill; 22571 } 22572 /* 22573 * This is a new ill, currently undergoing SLIFNAME 22574 * So we could not have joined an IPMP group until now. 
22575 */ 22576 ASSERT(phyi_old->phyint_ipsq_next == NULL && 22577 phyi_old->phyint_groupname == NULL); 22578 22579 /* 22580 * This phyi_old is going away. Decref ipsq_refs and 22581 * assert it is zero. The ipsq itself will be freed in 22582 * ipsq_exit 22583 */ 22584 ipsq = phyi_old->phyint_ipsq; 22585 IPSQ_DEC_REF(ipsq, ipst); 22586 ASSERT(ipsq->ipsq_refs == 0); 22587 /* Get the singleton phyint out of the ipsq list */ 22588 ASSERT(phyi_old->phyint_ipsq_next == NULL); 22589 ipsq->ipsq_phyint_list = NULL; 22590 phyi_old->phyint_illv4 = NULL; 22591 phyi_old->phyint_illv6 = NULL; 22592 mi_free(phyi_old); 22593 } else { 22594 mutex_enter(&ill->ill_lock); 22595 /* 22596 * We don't need to acquire any lock, since 22597 * the ill is not yet visible globally and we 22598 * have not yet released the ill_g_lock. 22599 */ 22600 phyi = phyi_old; 22601 mutex_enter(&phyi->phyint_lock); 22602 /* XXX We need a recovery strategy here. */ 22603 if (!phyint_assign_ifindex(phyi, ipst)) 22604 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 22605 22606 /* No IPMP group yet, thus the hook uses the ifindex */ 22607 phyi->phyint_hook_ifindex = phyi->phyint_ifindex; 22608 22609 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 22610 (void *)phyi, where); 22611 22612 (void) avl_find(&ipst->ips_phyint_g_list-> 22613 phyint_list_avl_by_index, 22614 &phyi->phyint_ifindex, &where); 22615 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 22616 (void *)phyi, where); 22617 } 22618 22619 /* 22620 * Reassigning ill_phyint automatically reassigns the ipsq also. 22621 * pending mp is not affected because that is per ill basis. 22622 */ 22623 ill->ill_phyint = phyi; 22624 22625 /* 22626 * Keep the index on ipif_orig_index to be used by FAILOVER. 22627 * We do this here as when the first ipif was allocated, 22628 * ipif_allocate does not know the right interface index. 22629 */ 22630 22631 ill->ill_ipif->ipif_orig_ifindex = ill->ill_phyint->phyint_ifindex; 22632 /* 22633 * Now that the phyint's ifindex has been assigned, complete the 22634 * remaining 22635 */ 22636 22637 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 22638 if (ill->ill_isv6) { 22639 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 22640 ill->ill_phyint->phyint_ifindex; 22641 ill->ill_mcast_type = ipst->ips_mld_max_version; 22642 } else { 22643 ill->ill_mcast_type = ipst->ips_igmp_max_version; 22644 } 22645 22646 /* 22647 * Generate an event within the hooks framework to indicate that 22648 * a new interface has just been added to IP. For this event to 22649 * be generated, the network interface must, at least, have an 22650 * ifindex assigned to it. 22651 * 22652 * This needs to be run inside the ill_g_lock perimeter to ensure 22653 * that the ordering of delivered events to listeners matches the 22654 * order of them in the kernel. 22655 * 22656 * This function could be called from ill_lookup_on_name. In that case 22657 * the interface is loopback "lo", which will not generate a NIC event. 22658 */ 22659 if (ill->ill_name_length <= 2 || 22660 ill->ill_name[0] != 'l' || ill->ill_name[1] != 'o') { 22661 /* 22662 * Generate nic plumb event for ill_name even if 22663 * ipmp_hook_emulation is set. That avoids generating events 22664 * for the ill_names should ipmp_hook_emulation be turned on 22665 * later. 
22666 */ 22667 ill_nic_info_plumb(ill, B_FALSE); 22668 } 22669 RELEASE_ILL_LOCKS(ill, ill_other); 22670 mutex_exit(&phyi->phyint_lock); 22671 } 22672 22673 /* 22674 * Allocate a NE_PLUMB nic info event and store in the ill. 22675 * If 'group' is set we do it for the group name, otherwise the ill name. 22676 * It will be sent when we leave the ipsq. 22677 */ 22678 void 22679 ill_nic_info_plumb(ill_t *ill, boolean_t group) 22680 { 22681 phyint_t *phyi = ill->ill_phyint; 22682 ip_stack_t *ipst = ill->ill_ipst; 22683 hook_nic_event_t *info; 22684 char *name; 22685 int namelen; 22686 22687 ASSERT(MUTEX_HELD(&ill->ill_lock)); 22688 22689 if ((info = ill->ill_nic_event_info) != NULL) { 22690 ip2dbg(("ill_nic_info_plumb: unexpected nic event %d " 22691 "attached for %s\n", info->hne_event, 22692 ill->ill_name)); 22693 if (info->hne_data != NULL) 22694 kmem_free(info->hne_data, info->hne_datalen); 22695 kmem_free(info, sizeof (hook_nic_event_t)); 22696 ill->ill_nic_event_info = NULL; 22697 } 22698 22699 info = kmem_alloc(sizeof (hook_nic_event_t), KM_NOSLEEP); 22700 if (info == NULL) { 22701 ip2dbg(("ill_nic_info_plumb: could not attach PLUMB nic " 22702 "event information for %s (ENOMEM)\n", 22703 ill->ill_name)); 22704 return; 22705 } 22706 22707 if (group) { 22708 ASSERT(phyi->phyint_groupname_len != 0); 22709 namelen = phyi->phyint_groupname_len; 22710 name = phyi->phyint_groupname; 22711 } else { 22712 namelen = ill->ill_name_length; 22713 name = ill->ill_name; 22714 } 22715 22716 info->hne_nic = phyi->phyint_hook_ifindex; 22717 info->hne_lif = 0; 22718 info->hne_event = NE_PLUMB; 22719 info->hne_family = ill->ill_isv6 ? 22720 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 22721 22722 info->hne_data = kmem_alloc(namelen, KM_NOSLEEP); 22723 if (info->hne_data != NULL) { 22724 info->hne_datalen = namelen; 22725 bcopy(name, info->hne_data, info->hne_datalen); 22726 } else { 22727 ip2dbg(("ill_nic_info_plumb: could not attach " 22728 "name information for PLUMB nic event " 22729 "of %s (ENOMEM)\n", name)); 22730 kmem_free(info, sizeof (hook_nic_event_t)); 22731 info = NULL; 22732 } 22733 ill->ill_nic_event_info = info; 22734 } 22735 22736 /* 22737 * Unhook the nic event message from the ill and enqueue it 22738 * into the nic event taskq. 22739 */ 22740 void 22741 ill_nic_info_dispatch(ill_t *ill) 22742 { 22743 hook_nic_event_t *info; 22744 22745 ASSERT(MUTEX_HELD(&ill->ill_lock)); 22746 22747 if ((info = ill->ill_nic_event_info) != NULL) { 22748 if (ddi_taskq_dispatch(eventq_queue_nic, 22749 ip_ne_queue_func, info, DDI_SLEEP) == DDI_FAILURE) { 22750 ip2dbg(("ill_nic_info_dispatch: " 22751 "ddi_taskq_dispatch failed\n")); 22752 if (info->hne_data != NULL) 22753 kmem_free(info->hne_data, info->hne_datalen); 22754 kmem_free(info, sizeof (hook_nic_event_t)); 22755 } 22756 ill->ill_nic_event_info = NULL; 22757 } 22758 } 22759 22760 /* 22761 * Notify any downstream modules of the name of this interface. 22762 * An M_IOCTL is used even though we don't expect a successful reply. 22763 * Any reply message from the driver (presumably an M_IOCNAK) will 22764 * eventually get discarded somewhere upstream. The message format is 22765 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 22766 * to IP. 
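 *
 * Schematically, the two-mblk chain built below is:
 *
 *	mp1: M_IOCTL (struct iocblk, ioc_cmd = SIOCSLIFNAME)
 *	  b_cont -> mp2: data (struct lifreq carrying lifr_name,
 *	                       lifr_ppa and lifr_flags)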
22767  */
22768 static void
22769 ip_ifname_notify(ill_t *ill, queue_t *q)
22770 {
22771 	mblk_t *mp1, *mp2;
22772 	struct iocblk *iocp;
22773 	struct lifreq *lifr;
22774 
22775 	mp1 = mkiocb(SIOCSLIFNAME);
22776 	if (mp1 == NULL)
22777 		return;
22778 	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
22779 	if (mp2 == NULL) {
22780 		freeb(mp1);
22781 		return;
22782 	}
22783 
22784 	mp1->b_cont = mp2;
22785 	iocp = (struct iocblk *)mp1->b_rptr;
22786 	iocp->ioc_count = sizeof (struct lifreq);
22787 
22788 	lifr = (struct lifreq *)mp2->b_rptr;
22789 	mp2->b_wptr += sizeof (struct lifreq);
22790 	bzero(lifr, sizeof (struct lifreq));
22791 
22792 	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
22793 	lifr->lifr_ppa = ill->ill_ppa;
22794 	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
22795 
22796 	putnext(q, mp1);
22797 }
22798 
22799 static int
22800 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
22801 {
22802 	int err;
22803 	ip_stack_t *ipst = ill->ill_ipst;
22804 
22805 	/* Set the obsolete NDD per-interface forwarding name. */
22806 	err = ill_set_ndd_name(ill);
22807 	if (err != 0) {
22808 		cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n",
22809 		    err);
22810 	}
22811 
22812 	/* Tell downstream modules where they are. */
22813 	ip_ifname_notify(ill, q);
22814 
22815 	/*
22816 	 * ill_dl_phys returns EINPROGRESS in the usual case.
22817 	 * Error cases are ENOMEM ...
22818 	 */
22819 	err = ill_dl_phys(ill, ipif, mp, q);
22820 
22821 	/*
22822 	 * If there is no IRE expiration timer running, get one started.
22823 	 * igmp and mld timers will be triggered by the first multicast join.
22824 	 */
22825 	if (ipst->ips_ip_ire_expire_id == 0) {
22826 		/*
22827 		 * acquire the lock and check again.
22828 		 */
22829 		mutex_enter(&ipst->ips_ip_trash_timer_lock);
22830 		if (ipst->ips_ip_ire_expire_id == 0) {
22831 			ipst->ips_ip_ire_expire_id = timeout(
22832 			    ip_trash_timer_expire, ipst,
22833 			    MSEC_TO_TICK(ipst->ips_ip_timer_interval));
22834 		}
22835 		mutex_exit(&ipst->ips_ip_trash_timer_lock);
22836 	}
22837 
22838 	if (ill->ill_isv6) {
22839 		mutex_enter(&ipst->ips_mld_slowtimeout_lock);
22840 		if (ipst->ips_mld_slowtimeout_id == 0) {
22841 			ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
22842 			    (void *)ipst,
22843 			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
22844 		}
22845 		mutex_exit(&ipst->ips_mld_slowtimeout_lock);
22846 	} else {
22847 		mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
22848 		if (ipst->ips_igmp_slowtimeout_id == 0) {
22849 			ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
22850 			    (void *)ipst,
22851 			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
22852 		}
22853 		mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
22854 	}
22855 
22856 	return (err);
22857 }
22858 
22859 /*
22860  * Common routine for ppa and ifname setting. Should be called exclusive.
22861  *
22862  * Returns EINPROGRESS when mp has been consumed by queueing it on
22863  * ill_pending_mp and the ioctl will complete in ip_rput.
22864  *
22865  * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return
22866  * the new name and new ppa in lifr_name and lifr_ppa respectively.
22867  * For SLIFNAME, we pass these values back to the userland.
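 *
 * As a concrete example (hypothetical name): for interf_name "hme1"
 * with a ppa of 1 passed in, ill_get_ppa_ptr() points at the trailing
 * "1" and stoi() must agree with the passed-in value or EINVAL
 * results; with *new_ppa_ptr == UINT_MAX the system instead chooses
 * the ppa and copies the resulting name and ppa back to the caller.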
22868  */
22869 static int
22870 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
22871 {
22872 	ill_t	*ill;
22873 	ipif_t	*ipif;
22874 	ipsq_t	*ipsq;
22875 	char	*ppa_ptr;
22876 	char	*old_ptr;
22877 	char	old_char;
22878 	int	error;
22879 	ip_stack_t *ipst;
22880 
22881 	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
22882 	ASSERT(q->q_next != NULL);
22883 	ASSERT(interf_name != NULL);
22884 
22885 	ill = (ill_t *)q->q_ptr;
22886 	ipst = ill->ill_ipst;
22887 
22888 	ASSERT(ill->ill_ipst != NULL);
22889 	ASSERT(ill->ill_name[0] == '\0');
22890 	ASSERT(IAM_WRITER_ILL(ill));
22891 	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
22892 	ASSERT(ill->ill_ppa == UINT_MAX);
22893 
22894 	/* The ppa is sent down by ifconfig or is chosen */
22895 	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
22896 		return (EINVAL);
22897 	}
22898 
22899 	/*
22900 	 * Make sure the ppa passed in is the same as the ppa in the name.
22901 	 * This check is not made when ppa == UINT_MAX; in that case the ppa
22902 	 * in the name could be anything. The system will choose a ppa and
22903 	 * update new_ppa_ptr and interf_name to contain the chosen ppa.
22904 	 */
22905 	if (*new_ppa_ptr != UINT_MAX) {
22906 		/* stoi changes the pointer */
22907 		old_ptr = ppa_ptr;
22908 		/*
22909 		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
22910 		 * (they don't have an externally visible ppa). We assign one
22911 		 * here so that we can manage the interface. Note that in
22912 		 * the past this value was always 0 for DLPI 1 drivers.
22913 		 */
22914 		if (*new_ppa_ptr == 0)
22915 			*new_ppa_ptr = stoi(&old_ptr);
22916 		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
22917 			return (EINVAL);
22918 	}
22919 	/*
22920 	 * Terminate the string before the ppa and
22921 	 * save the char at that location.
22922 	 */
22923 	old_char = ppa_ptr[0];
22924 	ppa_ptr[0] = '\0';
22925 
22926 	ill->ill_ppa = *new_ppa_ptr;
22927 	/*
22928 	 * Finish as much work now as possible before calling ill_glist_insert
22929 	 * which makes the ill globally visible and also merges it with the
22930 	 * other protocol instance of this phyint. The remaining work is
22931 	 * done after entering the ipsq which may happen sometime later.
22932 	 * ill_set_ndd_name occurs after the ill has been made globally visible.
22933 	 */
22934 	ipif = ill->ill_ipif;
22935 
22936 	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
22937 	ipif_assign_seqid(ipif);
22938 
22939 	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
22940 		ill->ill_flags |= ILLF_IPV4;
22941 
22942 	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
22943 	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
22944 
22945 	if (ill->ill_flags & ILLF_IPV6) {
22946 
22947 		ill->ill_isv6 = B_TRUE;
22948 		if (ill->ill_rq != NULL) {
22949 			ill->ill_rq->q_qinfo = &iprinitv6;
22950 			ill->ill_wq->q_qinfo = &ipwinitv6;
22951 		}
22952 
22953 		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
22954 		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
22955 		ipif->ipif_v6src_addr = ipv6_all_zeros;
22956 		ipif->ipif_v6subnet = ipv6_all_zeros;
22957 		ipif->ipif_v6net_mask = ipv6_all_zeros;
22958 		ipif->ipif_v6brd_addr = ipv6_all_zeros;
22959 		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
22960 		/*
22961 		 * Point-to-point or non-multicast capable
22962 		 * interfaces won't do NUD unless explicitly
22963 		 * configured to do so.
22964 		 */
22965 		if (ipif->ipif_flags & IPIF_POINTOPOINT ||
22966 		    !(ill->ill_flags & ILLF_MULTICAST)) {
22967 			ill->ill_flags |= ILLF_NONUD;
22968 		}
22969 		/* Make sure IPv4 specific flag is not set on IPv6 if */
22970 		if (ill->ill_flags & ILLF_NOARP) {
22971 			/*
22972 			 * Note: xresolv interfaces will eventually need
22973 			 * NOARP set here as well, but that will require
22974 			 * those external resolvers to have some
22975 			 * knowledge of that flag and act appropriately.
22976 			 * Not to be changed at present.
22977 			 */
22978 			ill->ill_flags &= ~ILLF_NOARP;
22979 		}
22980 		/*
22981 		 * Set the ILLF_ROUTER flag according to the global
22982 		 * IPv6 forwarding policy.
22983 		 */
22984 		if (ipst->ips_ipv6_forward != 0)
22985 			ill->ill_flags |= ILLF_ROUTER;
22986 	} else if (ill->ill_flags & ILLF_IPV4) {
22987 		ill->ill_isv6 = B_FALSE;
22988 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
22989 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr);
22990 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
22991 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
22992 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
22993 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
22994 		/*
22995 		 * Set the ILLF_ROUTER flag according to the global
22996 		 * IPv4 forwarding policy.
22997 		 */
22998 		if (ipst->ips_ip_g_forward != 0)
22999 			ill->ill_flags |= ILLF_ROUTER;
23000 	}
23001 
23002 	ASSERT(ill->ill_phyint != NULL);
23003 
23004 	/*
23005 	 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
23006 	 * be completed in ill_glist_insert -> ill_phyint_reinit
23007 	 */
23008 	if (!ill_allocate_mibs(ill))
23009 		return (ENOMEM);
23010 
23011 	/*
23012 	 * Pick a default sap until we get the DL_INFO_ACK back from
23013 	 * the driver.
23014 	 */
23015 	if (ill->ill_sap == 0) {
23016 		if (ill->ill_isv6)
23017 			ill->ill_sap = IP6_DL_SAP;
23018 		else
23019 			ill->ill_sap = IP_DL_SAP;
23020 	}
23021 
23022 	ill->ill_ifname_pending = 1;
23023 	ill->ill_ifname_pending_err = 0;
23024 
23025 	ill_refhold(ill);
23026 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
23027 	if ((error = ill_glist_insert(ill, interf_name,
23028 	    (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
23029 		ill->ill_ppa = UINT_MAX;
23030 		ill->ill_name[0] = '\0';
23031 		/*
23032 		 * undo null termination done above.
23033 		 */
23034 		ppa_ptr[0] = old_char;
23035 		rw_exit(&ipst->ips_ill_g_lock);
23036 		ill_refrele(ill);
23037 		return (error);
23038 	}
23039 
23040 	ASSERT(ill->ill_name_length <= LIFNAMSIZ);
23041 
23042 	/*
23043 	 * When we return, the buffer pointed to by interf_name should contain
23044 	 * the same name as in ill_name.
23045 	 * If a ppa was chosen by the system (ppa passed in was UINT_MAX)
23046 	 * the buffer pointed to by new_ppa_ptr would not contain the right
23047 	 * ppa, so copy the full name and update the ppa ptr.
23048 	 * When the ppa passed in != UINT_MAX all values are correct; just
23049 	 * undo the null termination. This saves a bcopy.
23050 	 */
23051 	if (*new_ppa_ptr == UINT_MAX) {
23052 		bcopy(ill->ill_name, interf_name, ill->ill_name_length);
23053 		*new_ppa_ptr = ill->ill_ppa;
23054 	} else {
23055 		/*
23056 		 * undo null termination done above.
23057 		 */
23058 		ppa_ptr[0] = old_char;
23059 	}
23060 
23061 	/* Let SCTP know about this ILL */
23062 	sctp_update_ill(ill, SCTP_ILL_INSERT);
23063 
23064 	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP,
23065 	    B_TRUE);
23066 
23067 	rw_exit(&ipst->ips_ill_g_lock);
23068 	ill_refrele(ill);
23069 	if (ipsq == NULL)
23070 		return (EINPROGRESS);
23071 
23072 	/*
23073 	 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
23074 	 */
23075 	if (ipsq->ipsq_current_ipif == NULL)
23076 		ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
23077 	else
23078 		ASSERT(ipsq->ipsq_current_ipif == ipif);
23079 
23080 	error = ipif_set_values_tail(ill, ipif, mp, q);
23081 	ipsq_exit(ipsq, B_TRUE, B_TRUE);
23082 	if (error != 0 && error != EINPROGRESS) {
23083 		/*
23084 		 * restore previous values
23085 		 */
23086 		ill->ill_isv6 = B_FALSE;
23087 	}
23088 	return (error);
23089 }
23090 
23091 
23092 void
23093 ipif_init(ip_stack_t *ipst)
23094 {
23095 	hrtime_t hrt;
23096 	int i;
23097 
23098 	/*
23099 	 * Can't call drv_getparm here as it is too early in the boot.
23100 	 * As we use ipif_src_random just for picking a different
23101 	 * source address every time, this need not be really random.
23102 	 */
23103 	hrt = gethrtime();
23104 	ipst->ips_ipif_src_random =
23105 	    ((hrt >> 32) & 0xffffffff) * (hrt & 0xffffffff);
23106 
23107 	for (i = 0; i < MAX_G_HEADS; i++) {
23108 		ipst->ips_ill_g_heads[i].ill_g_list_head =
23109 		    (ill_if_t *)&ipst->ips_ill_g_heads[i];
23110 		ipst->ips_ill_g_heads[i].ill_g_list_tail =
23111 		    (ill_if_t *)&ipst->ips_ill_g_heads[i];
23112 	}
23113 
23114 	avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
23115 	    ill_phyint_compare_index,
23116 	    sizeof (phyint_t),
23117 	    offsetof(struct phyint, phyint_avl_by_index));
23118 	avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
23119 	    ill_phyint_compare_name,
23120 	    sizeof (phyint_t),
23121 	    offsetof(struct phyint, phyint_avl_by_name));
23122 }
23123 
23124 /*
23125  * Lookup the ipif corresponding to the onlink destination address. For
23126  * point-to-point interfaces, it matches with remote endpoint destination
23127  * address. For point-to-multipoint interfaces it only tries to match the
23128  * destination with the interface's subnet address. The longest, most specific
23129  * match is found to take care of such rare network configurations like -
23130  *	le0: 129.146.1.1/16
23131  *	le1: 129.146.2.2/24
23132  * It is used only by SO_DONTROUTE at the moment.
23133  */
23134 ipif_t *
23135 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
23136 {
23137 	ipif_t	*ipif, *best_ipif;
23138 	ill_t	*ill;
23139 	ill_walk_context_t ctx;
23140 
23141 	ASSERT(zoneid != ALL_ZONES);
23142 	best_ipif = NULL;
23143 
23144 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
23145 	ill = ILL_START_WALK_V4(&ctx, ipst);
23146 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
23147 		mutex_enter(&ill->ill_lock);
23148 		for (ipif = ill->ill_ipif; ipif != NULL;
23149 		    ipif = ipif->ipif_next) {
23150 			if (!IPIF_CAN_LOOKUP(ipif))
23151 				continue;
23152 			if (ipif->ipif_zoneid != zoneid &&
23153 			    ipif->ipif_zoneid != ALL_ZONES)
23154 				continue;
23155 			/*
23156 			 * Point-to-point case. Look for exact match with
23157 			 * destination address.
23158 */ 23159 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 23160 if (ipif->ipif_pp_dst_addr == addr) { 23161 ipif_refhold_locked(ipif); 23162 mutex_exit(&ill->ill_lock); 23163 rw_exit(&ipst->ips_ill_g_lock); 23164 if (best_ipif != NULL) 23165 ipif_refrele(best_ipif); 23166 return (ipif); 23167 } 23168 } else if (ipif->ipif_subnet == (addr & 23169 ipif->ipif_net_mask)) { 23170 /* 23171 * Point-to-multipoint case. Looping through to 23172 * find the most specific match. If there are 23173 * multiple best match ipif's then prefer ipif's 23174 * that are UP. If there is only one best match 23175 * ipif and it is DOWN we must still return it. 23176 */ 23177 if ((best_ipif == NULL) || 23178 (ipif->ipif_net_mask > 23179 best_ipif->ipif_net_mask) || 23180 ((ipif->ipif_net_mask == 23181 best_ipif->ipif_net_mask) && 23182 ((ipif->ipif_flags & IPIF_UP) && 23183 (!(best_ipif->ipif_flags & IPIF_UP))))) { 23184 ipif_refhold_locked(ipif); 23185 mutex_exit(&ill->ill_lock); 23186 rw_exit(&ipst->ips_ill_g_lock); 23187 if (best_ipif != NULL) 23188 ipif_refrele(best_ipif); 23189 best_ipif = ipif; 23190 rw_enter(&ipst->ips_ill_g_lock, 23191 RW_READER); 23192 mutex_enter(&ill->ill_lock); 23193 } 23194 } 23195 } 23196 mutex_exit(&ill->ill_lock); 23197 } 23198 rw_exit(&ipst->ips_ill_g_lock); 23199 return (best_ipif); 23200 } 23201 23202 23203 /* 23204 * Save enough information so that we can recreate the IRE if 23205 * the interface goes down and then up. 23206 */ 23207 static void 23208 ipif_save_ire(ipif_t *ipif, ire_t *ire) 23209 { 23210 mblk_t *save_mp; 23211 23212 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 23213 if (save_mp != NULL) { 23214 ifrt_t *ifrt; 23215 23216 save_mp->b_wptr += sizeof (ifrt_t); 23217 ifrt = (ifrt_t *)save_mp->b_rptr; 23218 bzero(ifrt, sizeof (ifrt_t)); 23219 ifrt->ifrt_type = ire->ire_type; 23220 ifrt->ifrt_addr = ire->ire_addr; 23221 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 23222 ifrt->ifrt_src_addr = ire->ire_src_addr; 23223 ifrt->ifrt_mask = ire->ire_mask; 23224 ifrt->ifrt_flags = ire->ire_flags; 23225 ifrt->ifrt_max_frag = ire->ire_max_frag; 23226 mutex_enter(&ipif->ipif_saved_ire_lock); 23227 save_mp->b_cont = ipif->ipif_saved_ire_mp; 23228 ipif->ipif_saved_ire_mp = save_mp; 23229 ipif->ipif_saved_ire_cnt++; 23230 mutex_exit(&ipif->ipif_saved_ire_lock); 23231 } 23232 } 23233 23234 23235 static void 23236 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 23237 { 23238 mblk_t **mpp; 23239 mblk_t *mp; 23240 ifrt_t *ifrt; 23241 23242 /* Remove from ipif_saved_ire_mp list if it is there */ 23243 mutex_enter(&ipif->ipif_saved_ire_lock); 23244 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 23245 mpp = &(*mpp)->b_cont) { 23246 /* 23247 * On a given ipif, the triple of address, gateway and 23248 * mask is unique for each saved IRE (in the case of 23249 * ordinary interface routes, the gateway address is 23250 * all-zeroes). 23251 */ 23252 mp = *mpp; 23253 ifrt = (ifrt_t *)mp->b_rptr; 23254 if (ifrt->ifrt_addr == ire->ire_addr && 23255 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 23256 ifrt->ifrt_mask == ire->ire_mask) { 23257 *mpp = mp->b_cont; 23258 ipif->ipif_saved_ire_cnt--; 23259 freeb(mp); 23260 break; 23261 } 23262 } 23263 mutex_exit(&ipif->ipif_saved_ire_lock); 23264 } 23265 23266 23267 /* 23268 * IP multirouting broadcast routes handling 23269 * Append CGTP broadcast IREs to regular ones created 23270 * at ifconfig time. 
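 *
 * For example (hypothetical addresses): if a multirt route's gateway
 * 10.0.0.1 resolves to an IRE_BROADCAST on the redundant interface, a
 * second IRE_BROADCAST is created below for the route's destination,
 * using the source address of ire_dst and the low-level information
 * (queues, ipif) of that primary broadcast IRE, and it is saved with
 * ipif_save_ire() so it survives the interface going down and up.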
23271 */ 23272 static void 23273 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) 23274 { 23275 ire_t *ire_prim; 23276 23277 ASSERT(ire != NULL); 23278 ASSERT(ire_dst != NULL); 23279 23280 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 23281 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 23282 if (ire_prim != NULL) { 23283 /* 23284 * We are in the special case of broadcasts for 23285 * CGTP. We add an IRE_BROADCAST that holds 23286 * the RTF_MULTIRT flag, the destination 23287 * address of ire_dst and the low level 23288 * info of ire_prim. In other words, CGTP 23289 * broadcast is added to the redundant ipif. 23290 */ 23291 ipif_t *ipif_prim; 23292 ire_t *bcast_ire; 23293 23294 ipif_prim = ire_prim->ire_ipif; 23295 23296 ip2dbg(("ip_cgtp_filter_bcast_add: " 23297 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 23298 (void *)ire_dst, (void *)ire_prim, 23299 (void *)ipif_prim)); 23300 23301 bcast_ire = ire_create( 23302 (uchar_t *)&ire->ire_addr, 23303 (uchar_t *)&ip_g_all_ones, 23304 (uchar_t *)&ire_dst->ire_src_addr, 23305 (uchar_t *)&ire->ire_gateway_addr, 23306 &ipif_prim->ipif_mtu, 23307 NULL, 23308 ipif_prim->ipif_rq, 23309 ipif_prim->ipif_wq, 23310 IRE_BROADCAST, 23311 ipif_prim, 23312 0, 23313 0, 23314 0, 23315 ire->ire_flags, 23316 &ire_uinfo_null, 23317 NULL, 23318 NULL, 23319 ipst); 23320 23321 if (bcast_ire != NULL) { 23322 23323 if (ire_add(&bcast_ire, NULL, NULL, NULL, 23324 B_FALSE) == 0) { 23325 ip2dbg(("ip_cgtp_filter_bcast_add: " 23326 "added bcast_ire %p\n", 23327 (void *)bcast_ire)); 23328 23329 ipif_save_ire(bcast_ire->ire_ipif, 23330 bcast_ire); 23331 ire_refrele(bcast_ire); 23332 } 23333 } 23334 ire_refrele(ire_prim); 23335 } 23336 } 23337 23338 23339 /* 23340 * IP multirouting broadcast routes handling 23341 * Remove the broadcast ire 23342 */ 23343 static void 23344 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 23345 { 23346 ire_t *ire_dst; 23347 23348 ASSERT(ire != NULL); 23349 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 23350 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 23351 if (ire_dst != NULL) { 23352 ire_t *ire_prim; 23353 23354 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 23355 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 23356 if (ire_prim != NULL) { 23357 ipif_t *ipif_prim; 23358 ire_t *bcast_ire; 23359 23360 ipif_prim = ire_prim->ire_ipif; 23361 23362 ip2dbg(("ip_cgtp_filter_bcast_delete: " 23363 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 23364 (void *)ire_dst, (void *)ire_prim, 23365 (void *)ipif_prim)); 23366 23367 bcast_ire = ire_ctable_lookup(ire->ire_addr, 23368 ire->ire_gateway_addr, 23369 IRE_BROADCAST, 23370 ipif_prim, ALL_ZONES, 23371 NULL, 23372 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 23373 MATCH_IRE_MASK, ipst); 23374 23375 if (bcast_ire != NULL) { 23376 ip2dbg(("ip_cgtp_filter_bcast_delete: " 23377 "looked up bcast_ire %p\n", 23378 (void *)bcast_ire)); 23379 ipif_remove_ire(bcast_ire->ire_ipif, 23380 bcast_ire); 23381 ire_delete(bcast_ire); 23382 ire_refrele(bcast_ire); 23383 } 23384 ire_refrele(ire_prim); 23385 } 23386 ire_refrele(ire_dst); 23387 } 23388 } 23389 23390 /* 23391 * IPsec hardware acceleration capabilities related functions. 23392 */ 23393 23394 /* 23395 * Free a per-ill IPsec capabilities structure. 
23396  */
23397 static void
23398 ill_ipsec_capab_free(ill_ipsec_capab_t *capab)
23399 {
23400 	if (capab->auth_hw_algs != NULL)
23401 		kmem_free(capab->auth_hw_algs, capab->algs_size);
23402 	if (capab->encr_hw_algs != NULL)
23403 		kmem_free(capab->encr_hw_algs, capab->algs_size);
23404 	if (capab->encr_algparm != NULL)
23405 		kmem_free(capab->encr_algparm, capab->encr_algparm_size);
23406 	kmem_free(capab, sizeof (ill_ipsec_capab_t));
23407 }
23408 
23409 /*
23410  * Allocate a new per-ill IPsec capabilities structure. This structure
23411  * is specific to an IPsec protocol (AH or ESP). It is implemented as
23412  * an array which specifies, for each algorithm, whether this algorithm
23413  * is supported by the ill or not.
23414  */
23415 static ill_ipsec_capab_t *
23416 ill_ipsec_capab_alloc(void)
23417 {
23418 	ill_ipsec_capab_t *capab;
23419 	uint_t nelems;
23420 
23421 	capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP);
23422 	if (capab == NULL)
23423 		return (NULL);
23424 
23425 	/* we need one bit per algorithm */
23426 	nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t);
23427 	capab->algs_size = nelems * sizeof (ipsec_capab_elem_t);
23428 
23429 	/* allocate memory to store algorithm flags */
23430 	capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
23431 	if (capab->encr_hw_algs == NULL)
23432 		goto nomem;
23433 	capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
23434 	if (capab->auth_hw_algs == NULL)
23435 		goto nomem;
23436 	/*
23437 	 * Leave encr_algparm NULL for now since we won't need it half
23438 	 * the time
23439 	 */
23440 	return (capab);
23441 
23442 nomem:
23443 	ill_ipsec_capab_free(capab);
23444 	return (NULL);
23445 }
23446 
23447 /*
23448  * Resize capability array. Since we're exclusive, this is OK.
23449  */
23450 static boolean_t
23451 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid)
23452 {
23453 	ipsec_capab_algparm_t *nalp, *oalp;
23454 	uint32_t olen, nlen;
23455 
23456 	oalp = capab->encr_algparm;
23457 	olen = capab->encr_algparm_size;
23458 
23459 	if (oalp != NULL) {
23460 		if (algid < capab->encr_algparm_end)
23461 			return (B_TRUE);
23462 	}
23463 
23464 	nlen = (algid + 1) * sizeof (*nalp);
23465 	nalp = kmem_zalloc(nlen, KM_NOSLEEP);
23466 	if (nalp == NULL)
23467 		return (B_FALSE);
23468 
23469 	if (oalp != NULL) {
23470 		bcopy(oalp, nalp, olen);
23471 		kmem_free(oalp, olen);
23472 	}
23473 	capab->encr_algparm = nalp;
23474 	capab->encr_algparm_size = nlen;
23475 	capab->encr_algparm_end = algid + 1;
23476 
23477 	return (B_TRUE);
23478 }
23479 
23480 /*
23481  * Compare the capabilities of the specified ill with the protocol
23482  * and algorithms specified by the SA passed as argument.
23483  * If they match, returns B_TRUE; B_FALSE if they do not match.
23484  *
23485  * The ill can be passed as a pointer to it, or by specifying its index
23486  * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments).
23487  *
23488  * Called by ipsec_out_is_accelerated() to decide whether an outbound
23489  * packet is eligible for hardware acceleration, and by
23490  * ill_ipsec_capab_send_all() to decide whether a SA must be sent down
23491  * to a particular ill.
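 *
 * For example, a caller that only knows the interface index can do:
 *
 *	if (ipsec_capab_match(NULL, ifindex, B_TRUE, sa, ns))
 *		(the SA can be accelerated on that IPv6 ill)
 *
 * in which case the ill is looked up, and released, internally.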
23492 */ 23493 boolean_t 23494 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 23495 ipsa_t *sa, netstack_t *ns) 23496 { 23497 boolean_t sa_isv6; 23498 uint_t algid; 23499 struct ill_ipsec_capab_s *cpp; 23500 boolean_t need_refrele = B_FALSE; 23501 ip_stack_t *ipst = ns->netstack_ip; 23502 23503 if (ill == NULL) { 23504 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 23505 NULL, NULL, NULL, ipst); 23506 if (ill == NULL) { 23507 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 23508 return (B_FALSE); 23509 } 23510 need_refrele = B_TRUE; 23511 } 23512 23513 /* 23514 * Use the address length specified by the SA to determine 23515 * if it corresponds to a IPv6 address, and fail the matching 23516 * if the isv6 flag passed as argument does not match. 23517 * Note: this check is used for SADB capability checking before 23518 * sending SA information to an ill. 23519 */ 23520 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 23521 if (sa_isv6 != ill_isv6) 23522 /* protocol mismatch */ 23523 goto done; 23524 23525 /* 23526 * Check if the ill supports the protocol, algorithm(s) and 23527 * key size(s) specified by the SA, and get the pointers to 23528 * the algorithms supported by the ill. 23529 */ 23530 switch (sa->ipsa_type) { 23531 23532 case SADB_SATYPE_ESP: 23533 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 23534 /* ill does not support ESP acceleration */ 23535 goto done; 23536 cpp = ill->ill_ipsec_capab_esp; 23537 algid = sa->ipsa_auth_alg; 23538 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 23539 goto done; 23540 algid = sa->ipsa_encr_alg; 23541 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 23542 goto done; 23543 if (algid < cpp->encr_algparm_end) { 23544 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 23545 if (sa->ipsa_encrkeybits < alp->minkeylen) 23546 goto done; 23547 if (sa->ipsa_encrkeybits > alp->maxkeylen) 23548 goto done; 23549 } 23550 break; 23551 23552 case SADB_SATYPE_AH: 23553 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 23554 /* ill does not support AH acceleration */ 23555 goto done; 23556 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 23557 ill->ill_ipsec_capab_ah->auth_hw_algs)) 23558 goto done; 23559 break; 23560 } 23561 23562 if (need_refrele) 23563 ill_refrele(ill); 23564 return (B_TRUE); 23565 done: 23566 if (need_refrele) 23567 ill_refrele(ill); 23568 return (B_FALSE); 23569 } 23570 23571 23572 /* 23573 * Add a new ill to the list of IPsec capable ills. 23574 * Called from ill_capability_ipsec_ack() when an ACK was received 23575 * indicating that IPsec hardware processing was enabled for an ill. 23576 * 23577 * ill must point to the ill for which acceleration was enabled. 23578 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
23579  */
23580 static void
23581 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync)
23582 {
23583 	ipsec_capab_ill_t **ills, *cur_ill, *new_ill;
23584 	uint_t sa_type;
23585 	uint_t ipproto;
23586 	ip_stack_t *ipst = ill->ill_ipst;
23587 
23588 	ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) ||
23589 	    (dl_cap == DL_CAPAB_IPSEC_ESP));
23590 
23591 	switch (dl_cap) {
23592 	case DL_CAPAB_IPSEC_AH:
23593 		sa_type = SADB_SATYPE_AH;
23594 		ills = &ipst->ips_ipsec_capab_ills_ah;
23595 		ipproto = IPPROTO_AH;
23596 		break;
23597 	case DL_CAPAB_IPSEC_ESP:
23598 		sa_type = SADB_SATYPE_ESP;
23599 		ills = &ipst->ips_ipsec_capab_ills_esp;
23600 		ipproto = IPPROTO_ESP;
23601 		break;
23602 	}
23603 
23604 	rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);
23605 
23606 	/*
23607 	 * Add ill index to list of hardware accelerators. If
23608 	 * already in list, do nothing.
23609 	 */
23610 	for (cur_ill = *ills; cur_ill != NULL &&
23611 	    (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex ||
23612 	    cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next)
23613 		;
23614 
23615 	if (cur_ill == NULL) {
23616 		/* if this is a new entry for this ill */
23617 		new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP);
23618 		if (new_ill == NULL) {
23619 			rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23620 			return;
23621 		}
23622 
23623 		new_ill->ill_index = ill->ill_phyint->phyint_ifindex;
23624 		new_ill->ill_isv6 = ill->ill_isv6;
23625 		new_ill->next = *ills;
23626 		*ills = new_ill;
23627 	} else if (!sadb_resync) {
23628 		/* not resync'ing SADB and an entry exists for this ill */
23629 		rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23630 		return;
23631 	}
23632 
23633 	rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23634 
23635 	if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL)
23636 		/*
23637 		 * IPsec module for protocol loaded, initiate dump
23638 		 * of the SADB to this ill.
23639 		 */
23640 		sadb_ill_download(ill, sa_type);
23641 }
23642 
23643 /*
23644  * Remove an ill from the list of IPsec capable ills.
23645  */
23646 static void
23647 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap)
23648 {
23649 	ipsec_capab_ill_t **ills, *cur_ill, *prev_ill;
23650 	ip_stack_t *ipst = ill->ill_ipst;
23651 
23652 	ASSERT(dl_cap == DL_CAPAB_IPSEC_AH ||
23653 	    dl_cap == DL_CAPAB_IPSEC_ESP);
23654 
23655 	ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah :
23656 	    &ipst->ips_ipsec_capab_ills_esp;
23657 
23658 	rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);
23659 
23660 	prev_ill = NULL;
23661 	for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index !=
23662 	    ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 !=
23663 	    ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next)
23664 		;
23665 	if (cur_ill == NULL) {
23666 		/* entry not found */
23667 		rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23668 		return;
23669 	}
23670 	if (prev_ill == NULL) {
23671 		/* entry at front of list */
23672 		*ills = NULL;
23673 	} else {
23674 		prev_ill->next = cur_ill->next;
23675 	}
23676 	kmem_free(cur_ill, sizeof (ipsec_capab_ill_t));
23677 	rw_exit(&ipst->ips_ipsec_capab_ills_lock);
23678 }
23679 
23680 /*
23681  * Called by SADB to send a DL_CONTROL_REQ message to every ill
23682  * supporting the specified IPsec protocol acceleration.
23683  * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP.
23684  * We free the mblk and, if sa is non-null, release the held reference.
23685 */ 23686 void 23687 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, 23688 netstack_t *ns) 23689 { 23690 ipsec_capab_ill_t *ici, *cur_ici; 23691 ill_t *ill; 23692 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 23693 ip_stack_t *ipst = ns->netstack_ip; 23694 23695 ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah : 23696 ipst->ips_ipsec_capab_ills_esp; 23697 23698 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER); 23699 23700 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 23701 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 23702 cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst); 23703 23704 /* 23705 * Handle the case where the ill goes away while the SADB is 23706 * attempting to send messages. If it's going away, it's 23707 * nuking its shadow SADB, so we don't care.. 23708 */ 23709 23710 if (ill == NULL) 23711 continue; 23712 23713 if (sa != NULL) { 23714 /* 23715 * Make sure capabilities match before 23716 * sending SA to ill. 23717 */ 23718 if (!ipsec_capab_match(ill, cur_ici->ill_index, 23719 cur_ici->ill_isv6, sa, ipst->ips_netstack)) { 23720 ill_refrele(ill); 23721 continue; 23722 } 23723 23724 mutex_enter(&sa->ipsa_lock); 23725 sa->ipsa_flags |= IPSA_F_HW; 23726 mutex_exit(&sa->ipsa_lock); 23727 } 23728 23729 /* 23730 * Copy template message, and add it to the front 23731 * of the mblk ship list. We want to avoid holding 23732 * the ipsec_capab_ills_lock while sending the 23733 * message to the ills. 23734 * 23735 * The b_next and b_prev are temporarily used 23736 * to build a list of mblks to be sent down, and to 23737 * save the ill to which they must be sent. 23738 */ 23739 nmp = copymsg(mp); 23740 if (nmp == NULL) { 23741 ill_refrele(ill); 23742 continue; 23743 } 23744 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 23745 nmp->b_next = mp_ship_list; 23746 mp_ship_list = nmp; 23747 nmp->b_prev = (mblk_t *)ill; 23748 } 23749 23750 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 23751 23752 for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) { 23753 /* restore the mblk to a sane state */ 23754 next_mp = nmp->b_next; 23755 nmp->b_next = NULL; 23756 ill = (ill_t *)nmp->b_prev; 23757 nmp->b_prev = NULL; 23758 23759 ill_dlpi_send(ill, nmp); 23760 ill_refrele(ill); 23761 } 23762 23763 if (sa != NULL) 23764 IPSA_REFRELE(sa); 23765 freemsg(mp); 23766 } 23767 23768 /* 23769 * Derive an interface id from the link layer address. 23770 * Knows about IEEE 802 and IEEE EUI-64 mappings. 23771 */ 23772 static boolean_t 23773 ip_ether_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23774 { 23775 char *addr; 23776 23777 if (phys_length != ETHERADDRL) 23778 return (B_FALSE); 23779 23780 /* Form EUI-64 like address */ 23781 addr = (char *)&v6addr->s6_addr32[2]; 23782 bcopy((char *)phys_addr, addr, 3); 23783 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 23784 addr[3] = (char)0xff; 23785 addr[4] = (char)0xfe; 23786 bcopy((char *)phys_addr + 3, addr + 5, 3); 23787 return (B_TRUE); 23788 } 23789 23790 /* ARGSUSED */ 23791 static boolean_t 23792 ip_nodef_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr) 23793 { 23794 return (B_FALSE); 23795 } 23796 23797 /* ARGSUSED */ 23798 static boolean_t 23799 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 23800 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 23801 { 23802 /* 23803 * Multicast address mappings used over Ethernet/802.X. 23804 * This address is used as a base for mappings. 
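 *
 * Per RFC 2464, an IPv6 multicast group maps to 33:33 followed by the
 * low-order 32 bits of the group address; e.g. ff02::1:ff12:3456 maps
 * to the Ethernet address 33:33:ff:12:34:56. The hw_start offset and
 * extract mask set below express exactly that rule.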
/* ARGSUSED */
static boolean_t
ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
	    0x00, 0x00, 0x00};

	/*
	 * Extract the low order 32 bits from the IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * second byte.
	 */
	*hw_start = 2;
	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = 0;
	v6_extract_mask->s6_addr32[2] = 0;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
	return (B_TRUE);
}

/*
 * Indicate by return value whether multicast is supported.  If not,
 * this code should not touch/change any parameters.
 */
/* ARGSUSED */
static boolean_t
ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Multicast address mappings used over Ethernet/802.X.
	 * This address is used as a base for mappings.
	 */
	static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
	    0x00, 0x00, 0x00 };

	if (phys_length != ETHERADDRL)
		return (B_FALSE);

	*extract_mask = htonl(0x007fffff);
	*hw_start = 2;
	bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
	return (B_TRUE);
}
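/*
 * Editor's sketch (not part of the original file, compiled out behind a
 * hypothetical guard): the net effect of the extract_mask/hw_start pair
 * that ip_ether_v4mapinfo() hands back -- the low 23 bits of the group
 * address are OR'ed into the 01:00:5e:00:00:00 base starting at byte 2.
 * E.g. group 224.1.2.3 (0xe0010203) maps to MAC 01:00:5e:01:02:03.
 */
#ifdef IP_IF_EXAMPLES
#include <stdint.h>

static void
ex_v4mcast_to_ether(uint32_t group /* host byte order */, uint8_t mac[6])
{
	mac[0] = 0x01;
	mac[1] = 0x00;
	mac[2] = 0x5e;
	mac[3] = (group >> 16) & 0x7f;	/* only the low 23 bits survive */
	mac[4] = (group >> 8) & 0xff;
	mac[5] = group & 0xff;
}
#endif	/* IP_IF_EXAMPLES */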
/*
 * Derive the IPoIB interface id from the link layer address.
 */
static boolean_t
ip_ib_v6intfid(uint_t phys_length, uint8_t *phys_addr, in6_addr_t *v6addr)
{
	char *addr;

	if (phys_length != 20)
		return (B_FALSE);
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy(phys_addr + 12, addr, 8);
	/*
	 * In the IBA 1.1 timeframe, some vendors erroneously set the u/l
	 * bit in the globally assigned EUI-64 GUID to 1, in violation of
	 * IEEE rules.  In these cases, the IBA considers these GUIDs to
	 * be in "Modified EUI-64" format, and thus toggling the u/l bit
	 * is not required; vendors are required not to assign global
	 * EUI-64s that differ only in u/l bit values, thus guaranteeing
	 * uniqueness of the interface identifier.  Whether the GUID is
	 * in modified or proper EUI-64 format, the IPv6 identifier must
	 * have the u/l bit set to 1.
	 */
	addr[0] |= 2;			/* Set Universal/Local bit to 1 */
	return (B_TRUE);
}

/*
 * Note on mapping from multicast IP addresses to IPoIB multicast link
 * addresses.  IPoIB multicast link addresses are based on IBA link
 * addresses.  The format of an IPoIB multicast address is:
 *
 *	4 byte QPN        Scope Sign.   Pkey
 * +--------------------------------------------+
 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
 * +--------------------------------------------+
 *
 * The Scope and Pkey components are properties of the IBA port and
 * network interface.  They can be ascertained from the broadcast
 * address.  The Sign. part is the signature, and is 401B for IPv4
 * and 601B for IPv6.
 */

static boolean_t
ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, in6_addr_t *v6_extract_mask)
{
	/*
	 * Base IPoIB IPv6 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	/*
	 * Extract the low order 80 bits from the IPv6 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixth byte.
	 */
	*hw_start = 6;
	bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);

	v6_extract_mask->s6_addr32[0] = 0;
	v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
	v6_extract_mask->s6_addr32[2] = 0xffffffffU;
	v6_extract_mask->s6_addr32[3] = 0xffffffffU;
	return (B_TRUE);
}

static boolean_t
ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
    uint32_t *hw_start, ipaddr_t *extract_mask)
{
	/*
	 * Base IPoIB IPv4 multicast address used for mappings.
	 * Does not contain the IBA scope/Pkey values.
	 */
	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

	if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
		return (B_FALSE);

	/*
	 * Extract the low order 28 bits from the IPv4 multicast address.
	 * Or that into the link layer address, starting from the
	 * sixteenth byte.
	 */
	*extract_mask = htonl(0x0fffffff);
	*hw_start = 16;
	bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);

	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	*(maddr + 5) = *(bphys_addr + 5);
	*(maddr + 8) = *(bphys_addr + 8);
	*(maddr + 9) = *(bphys_addr + 9);
	return (B_TRUE);
}
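/*
 * Editor's sketch (not part of the original file, compiled out behind a
 * hypothetical guard): the combined effect of ip_ib_v4mapinfo() plus the
 * resolver's application of extract_mask/hw_start.  Bytes 5, 8 and 9
 * (scope and Pkey) come from the broadcast address, and the low 28 bits
 * of the group land in the last four bytes of the 20-byte IPoIB link
 * address.
 */
#ifdef IP_IF_EXAMPLES
#include <stdint.h>
#include <string.h>

static void
ex_v4mcast_to_ipoib(uint32_t group /* host byte order */,
    const uint8_t bcast[20], uint8_t lla[20])
{
	static const uint8_t base[20] = {
		0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x40, 0x1b,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
	};

	(void) memcpy(lla, base, sizeof (base));
	lla[5] = bcast[5];			/* scope */
	lla[8] = bcast[8];			/* Pkey */
	lla[9] = bcast[9];
	lla[16] |= (group >> 24) & 0x0f;	/* low 28 bits of group */
	lla[17] |= (group >> 16) & 0xff;
	lla[18] |= (group >> 8) & 0xff;
	lla[19] |= group & 0xff;
}
#endif	/* IP_IF_EXAMPLES */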
/*
 * Returns B_TRUE if an ipif is present in the given zone, matching some
 * flags (typically IPIF_UP).  If ipifp is non-null, the held ipif is
 * returned there.  This works for both IPv4 and IPv6; if the passed-in
 * ill is v6, the ipif with the link-local address is preferred.
 */
boolean_t
ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ipif_t *ipif;
	ipif_t *maybe_ipif = NULL;

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		if (ipifp != NULL)
			*ipifp = NULL;
		return (B_FALSE);
	}
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;
		if ((ipif->ipif_flags & flags) != flags)
			continue;

		if (ipifp == NULL) {
			mutex_exit(&ill->ill_lock);
			ASSERT(maybe_ipif == NULL);
			return (B_TRUE);
		}
		if (!ill->ill_isv6 ||
		    IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) {
			ipif_refhold_locked(ipif);
			mutex_exit(&ill->ill_lock);
			*ipifp = ipif;
			return (B_TRUE);
		}
		if (maybe_ipif == NULL)
			maybe_ipif = ipif;
	}
	if (ipifp != NULL) {
		if (maybe_ipif != NULL)
			ipif_refhold_locked(maybe_ipif);
		*ipifp = maybe_ipif;
	}
	mutex_exit(&ill->ill_lock);
	return (maybe_ipif != NULL);
}

/*
 * Same as ipif_lookup_zoneid() but looks at all the ills in the same group.
 */
boolean_t
ipif_lookup_zoneid_group(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
{
	ill_t *illg;
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * We look at the passed-in ill first without grabbing ill_g_lock.
	 */
	if (ipif_lookup_zoneid(ill, zoneid, flags, ipifp)) {
		return (B_TRUE);
	}
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (ill->ill_group == NULL) {
		/* ill not in a group */
		rw_exit(&ipst->ips_ill_g_lock);
		return (B_FALSE);
	}

	/*
	 * There's no ipif in the zone on ill; however, ill is part of an
	 * IPMP group, so we need to look for an ipif in the zone on all
	 * the ills in the group.
	 */
	illg = ill->ill_group->illgrp_ill;
	do {
		/*
		 * We don't call ipif_lookup_zoneid() on ill as we already
		 * know that it's not there.
		 */
		if (illg != ill &&
		    ipif_lookup_zoneid(illg, zoneid, flags, ipifp)) {
			break;
		}
	} while ((illg = illg->ill_group_next) != NULL);
	rw_exit(&ipst->ips_ill_g_lock);
	return (illg != NULL);
}

/*
 * Check if this ill is only being used to send ICMP probes for IPMP.
 */
boolean_t
ill_is_probeonly(ill_t *ill)
{
	/*
	 * Check if the interface is FAILED or INACTIVE.
	 */
	if (ill->ill_phyint->phyint_flags & (PHYI_FAILED|PHYI_INACTIVE))
		return (B_TRUE);

	return (B_FALSE);
}
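/*
 * Editor's sketch (not part of the original file, compiled out behind a
 * hypothetical guard): the one-pass "preferred match with remembered
 * fallback" idiom that ipif_lookup_zoneid() implements via maybe_ipif --
 * return the first preferred entry immediately, else the first merely
 * acceptable one seen.  The entry type is made up for illustration.
 */
#ifdef IP_IF_EXAMPLES
typedef struct ex_ent {
	struct ex_ent *next;
	int acceptable;		/* passes the mandatory filters */
	int preferred;		/* e.g. link-local on a v6 ill */
} ex_ent_t;

static ex_ent_t *
ex_lookup_prefer(ex_ent_t *list)
{
	ex_ent_t *ent, *maybe = NULL;

	for (ent = list; ent != NULL; ent = ent->next) {
		if (!ent->acceptable)
			continue;
		if (ent->preferred)
			return (ent);	/* best possible; stop here */
		if (maybe == NULL)
			maybe = ent;	/* remember the first fallback */
	}
	return (maybe);
}
#endif	/* IP_IF_EXAMPLES */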
/*
 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
 * If a pointer to an ipif_t is returned then the caller will need to do
 * an ipif_refrele(); the ill reference taken internally is released
 * before returning.
 *
 * If there is no real interface which matches the ifindex, then it looks
 * for a group that has a matching index.  In the case of a group match
 * the lifidx must be zero.  We don't need to emulate the logical
 * interfaces since IP Filter's use of netinfo doesn't use that.
 */
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
    ip_stack_t *ipst)
{
	ipif_t *ipif;
	ill_t *ill;

	ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
	    ipst);

	if (ill == NULL) {
		/* Fall back to group names only if hook_emulation is set */
		if (!ipst->ips_ipmp_hook_emulation)
			return (NULL);

		if (lifidx != 0)
			return (NULL);
		ill = ill_group_lookup_on_ifindex(ifindex, isv6, ipst);
		if (ill == NULL)
			return (NULL);
	}

	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (lifidx == ipif->ipif_id) {
			ipif_refhold_locked(ipif);
			break;
		}
	}

	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}

/*
 * Flush the fastpath by deleting any nce entries that are waiting for
 * the fastpath.  There is one exception: IRE_BROADCAST entries are
 * difficult to recreate, so instead we just nuke their nce_fp_mp's;
 * see ndp_fastpath_flush() for details.
 */
void
ill_fastpath_flush(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	nce_fastpath_list_dispatch(ill, NULL, NULL);
	ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4),
	    ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE);
}

/*
 * Set the physical address information for `ill' to the contents of the
 * dl_notify_ind_t pointed to by `mp'.  Must be called as writer, and will
 * be asynchronous if `ill' cannot immediately be quiesced -- in which
 * case EINPROGRESS will be returned.
 */
int
ill_set_phys_addr(ill_t *ill, mblk_t *mp)
{
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
	dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
	    dlindp->dl_data != DL_CURR_PHYS_ADDR) {
		/* Changing DL_IPV6_TOKEN is not yet supported */
		return (0);
	}

	/*
	 * We need to store up to two copies of `mp' in `ill'.  Due to the
	 * design of ipsq_pending_mp_add(), we can't pass them as separate
	 * arguments to ill_set_phys_addr_tail().  Instead, chain them
	 * together here, then pull 'em apart in ill_set_phys_addr_tail().
	 */
	if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
		freemsg(mp);
		return (ENOMEM);
	}

	ipsq_current_start(ipsq, ill->ill_ipif, 0);

	/*
	 * If we can quiesce the ill, then set the address.  If not, then
	 * ill_set_phys_addr_tail() will be called from
	 * ipif_ill_refrele_tail().
	 */
	ill_down_ipifs(ill, NULL, 0, B_FALSE);
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (EINPROGRESS);
	}
	mutex_exit(&ill->ill_lock);

	ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
	return (0);
}

/*
 * Once the ill associated with `q' has quiesced, set its physical address
 * information to the values in `addrmp'.  Note that two copies of `addrmp'
 * are passed (linked by b_cont), since we sometimes need to save two
 * distinct copies in the ill_t, and our context doesn't permit sleeping
 * or allocation failure (we'll free the other copy if it's not needed).
 * Since the ill_t is quiesced, we know any stale IREs with the old address
 * information have already been removed, so we don't need to call
 * ill_fastpath_flush().
 */
/* ARGSUSED */
static void
ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
{
	ill_t *ill = q->q_ptr;
	mblk_t *addrmp2 = unlinkb(addrmp);
	dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
	uint_t addrlen, addroff;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	addroff = dlindp->dl_addr_offset;
	addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);

	switch (dlindp->dl_data) {
	case DL_IPV6_LINK_LAYER_ADDR:
		ill_set_ndmp(ill, addrmp, addroff, addrlen);
		freemsg(addrmp2);
		break;

	case DL_CURR_PHYS_ADDR:
		freemsg(ill->ill_phys_addr_mp);
		ill->ill_phys_addr = addrmp->b_rptr + addroff;
		ill->ill_phys_addr_mp = addrmp;
		ill->ill_phys_addr_length = addrlen;

		if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
			ill_set_ndmp(ill, addrmp2, addroff, addrlen);
		else
			freemsg(addrmp2);
		break;
	default:
		ASSERT(0);
	}

	/*
	 * If there are ipifs to bring up, ill_up_ipifs() will return
	 * EINPROGRESS, and ipsq_current_finish() will be called by
	 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is
	 * brought up.
	 */
	if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS)
		ipsq_current_finish(ipsq);
}

/*
 * Helper routine for setting the ill_nd_lla fields.
 */
void
ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
{
	freemsg(ill->ill_nd_lla_mp);
	ill->ill_nd_lla = ndmp->b_rptr + addroff;
	ill->ill_nd_lla_mp = ndmp;
	ill->ill_nd_lla_len = addrlen;
}
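/*
 * Editor's sketch (not part of the original file, compiled out behind a
 * hypothetical guard): the copyb()/b_cont trick ill_set_phys_addr() uses
 * to smuggle two message copies through the single-mblk
 * ipsq_pending_mp_add() interface, and the matching unlinkb() in
 * ill_set_phys_addr_tail() that splits them apart again.  Relies only on
 * copyb(9F), unlinkb(9F) and freemsg(9F).
 */
#ifdef IP_IF_EXAMPLES
static mblk_t *
ex_chain_two_copies(mblk_t *mp)
{
	mblk_t *first;

	/* two independent copies of `mp', linked through b_cont */
	if ((first = copyb(mp)) == NULL ||
	    (first->b_cont = copyb(mp)) == NULL) {
		freemsg(first);		/* safe on NULL or partial chain */
		return (NULL);
	}
	return (first);
}

static mblk_t *
ex_split_two_copies(mblk_t *first)
{
	/* detach and return the second copy; `first' keeps the rest */
	return (unlinkb(first));
}
#endif	/* IP_IF_EXAMPLES */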
major_t IP_MAJ;
#define	IP	"ip"

#define	UDP6DEV	"/devices/pseudo/udp6@0:udp6"
#define	UDPDEV	"/devices/pseudo/udp@0:udp"

/*
 * Issue REMOVEIF ioctls to have the loopback interfaces
 * go away.  Other interfaces are either I_LINKed or I_PLINKed;
 * the former go away when the user-level processes in the zone
 * are killed and the latter are cleaned up by the stream head
 * str_stack_shutdown callback that undoes all I_PLINKs.
 */
void
ip_loopback_cleanup(ip_stack_t *ipst)
{
	int error;
	ldi_handle_t lh = NULL;
	ldi_ident_t li = NULL;
	int rval;
	cred_t *cr;
	struct strioctl iocb;
	struct lifreq lifreq;

	IP_MAJ = ddi_name_to_major(IP);

#ifdef NS_DEBUG
	(void) printf("ip_loopback_cleanup() stackid %d\n",
	    ipst->ips_netstack->netstack_stackid);
#endif

	bzero(&lifreq, sizeof (lifreq));
	(void) strcpy(lifreq.lifr_name, ipif_loopback_name);

	error = ldi_ident_from_major(IP_MAJ, &li);
	if (error) {
#ifdef DEBUG
		printf("ip_loopback_cleanup: lyr ident get failed error %d\n",
		    error);
#endif
		return;
	}

	cr = zone_get_kcred(netstackid_to_zoneid(
	    ipst->ips_netstack->netstack_stackid));
	ASSERT(cr != NULL);
	error = ldi_open_by_name(UDP6DEV, FREAD|FWRITE, cr, &lh, li);
	if (error) {
#ifdef DEBUG
		printf("ip_loopback_cleanup: open of UDP6DEV failed error "
		    "%d\n", error);
#endif
		goto out;
	}
	iocb.ic_cmd = SIOCLIFREMOVEIF;
	iocb.ic_timout = 15;
	iocb.ic_len = sizeof (lifreq);
	iocb.ic_dp = (char *)&lifreq;

	error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval);
	/* LINTED - statement has no consequent */
	if (error) {
#ifdef NS_DEBUG
		printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on "
		    "UDP6 error %d\n", error);
#endif
	}
	(void) ldi_close(lh, FREAD|FWRITE, cr);
	lh = NULL;

	error = ldi_open_by_name(UDPDEV, FREAD|FWRITE, cr, &lh, li);
	if (error) {
#ifdef NS_DEBUG
		printf("ip_loopback_cleanup: open of UDPDEV failed error "
		    "%d\n", error);
#endif
		goto out;
	}

	iocb.ic_cmd = SIOCLIFREMOVEIF;
	iocb.ic_timout = 15;
	iocb.ic_len = sizeof (lifreq);
	iocb.ic_dp = (char *)&lifreq;

	error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval);
	/* LINTED - statement has no consequent */
	if (error) {
#ifdef NS_DEBUG
		printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on "
		    "UDP error %d\n", error);
#endif
	}
	(void) ldi_close(lh, FREAD|FWRITE, cr);
	lh = NULL;

out:
	/* Close layered handles */
	if (lh)
		(void) ldi_close(lh, FREAD|FWRITE, cr);
	if (li)
		ldi_ident_release(li);

	crfree(cr);
}
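/*
 * Editor's sketch (not part of the original file, compiled out behind a
 * hypothetical guard): the same SIOCLIFREMOVEIF request that
 * ip_loopback_cleanup() pushes through LDI, restated as a userland I_STR
 * STREAMS ioctl on a UDP device node.  Device path, interface name and
 * error handling are illustrative only.
 */
#ifdef IP_IF_EXAMPLES
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <stropts.h>
#include <sys/sockio.h>
#include <net/if.h>

static int
ex_remove_lo0(void)
{
	struct strioctl iocb;
	struct lifreq lifr;
	int fd;

	if ((fd = open("/dev/udp", O_RDWR)) < 0)
		return (-1);

	(void) memset(&lifr, 0, sizeof (lifr));
	(void) strlcpy(lifr.lifr_name, "lo0", sizeof (lifr.lifr_name));

	iocb.ic_cmd = SIOCLIFREMOVEIF;
	iocb.ic_timout = 15;
	iocb.ic_len = sizeof (lifr);
	iocb.ic_dp = (char *)&lifr;

	if (ioctl(fd, I_STR, &iocb) < 0) {
		(void) close(fd);
		return (-1);
	}
	(void) close(fd);
	return (0);
}
#endif	/* IP_IF_EXAMPLES */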