/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

typedef struct ip_sock_ar_s {
	union {
		area_t	ip_sock_area;
		ared_t	ip_sock_ared;
		areq_t	ip_sock_areq;
	} ip_sock_ar_u;
	queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li, boolean_t doconsist);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
static void	ipif_update_other_ipifs(ipif_t *old_ipif);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo;
static ip_v6mapinfo_func_t ip_nodef_v6mapinfo;
static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo;
static ip_v4mapinfo_func_t ip_nodef_v4mapinfo;
static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
    boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_ipsec_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static int	ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *,
    int *);
static void	ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_dld_enable(ill_t *);
static void	ill_capability_ack_thr(void *);
static void	ill_capability_lso_enable(ill_t *);
static void	ill_capability_send(ill_t *, mblk_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY
 * and the IPsec DOI
 */
#define	MAX_IPSEC_ALGS	256

#define	BITSPERBYTE	8
#define	BITS(type)	(BITSPERBYTE * (long)sizeof (type))

#define	IPSEC_ALG_ENABLE(algs, algid) \
	((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
	(1 << ((algid) % BITS(ipsec_capab_elem_t))))

#define	IPSEC_ALG_IS_ENABLED(algid, algs) \
	((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
	(1 << ((algid) % BITS(ipsec_capab_elem_t))))

typedef uint8_t ipsec_capab_elem_t;
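
/*
 * For illustration, a minimal sketch (hypothetical values) of how the
 * bitmap macros above behave: with ipsec_capab_elem_t being uint8_t,
 * BITS(ipsec_capab_elem_t) is 8, so algorithm id 10 lands in element 1,
 * bit 2:
 *
 *	ipsec_capab_elem_t algs[MAX_IPSEC_ALGS / BITSPERBYTE];	(32 bytes)
 *
 *	bzero(algs, sizeof (algs));
 *	IPSEC_ALG_ENABLE(algs, 10);		(algs[1] |= 0x04)
 *	if (IPSEC_ALG_IS_ENABLED(10, algs))
 *		...				(true from here on)
 */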

/*
 * Per-algorithm parameters. Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
	uint16_t minkeylen;
	uint16_t maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
	ipsec_capab_elem_t *encr_hw_algs;
	ipsec_capab_elem_t *auth_hw_algs;
	uint32_t algs_size;	/* size of _hw_algs in bytes */
	/* algorithm key lengths */
	ipsec_capab_algparm_t *encr_algparm;
	uint32_t encr_algparm_size;
	uint32_t encr_algparm_end;
};

/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 */
static area_t	ip_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IP_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};
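
/*
 * For reference, the AR_ENTRY_ADD message the template above yields (once
 * ill_arp_alloc() appends the interface name) is laid out as:
 *
 *	offset 0				area_t command header
 *	sizeof (ip_sock_ar_t)			protocol address (IP_ADDR_LEN)
 *	sizeof (ip_sock_ar_t) + IP_ADDR_LEN	protocol mask (IP_ADDR_LEN)
 *	sizeof (ip_sock_ar_t) + 2*IP_ADDR_LEN	hardware address (room for a
 *						struct sockaddr_dl)
 *	area_name_offset			ill_name (appended last)
 *
 * The IPv6 template below is identical in shape, with IPV6_ADDR_LEN
 * addresses and sin6_t-sized hardware address room.
 */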

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support.
 */
static area_t	ip6_area_template = {
	AR_ENTRY_ADD,			/* area_cmd */
	sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
	/* area_name_length temporarily holds this structure length */
	sizeof (area_t),		/* area_name_length */
	IP_ARP_PROTO_TYPE,		/* area_proto */
	sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
	IPV6_ADDR_LEN,			/* area_proto_addr_length */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
	0,				/* area_flags */
	sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
	/* Zero length hw_addr_length means 'use your idea of the address' */
	0				/* area_hw_addr_length */
};

static ared_t	ip_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IP_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IP_ADDR_LEN,
	0
};

static ared_t	ip6_ared_template = {
	AR_ENTRY_DELETE,
	sizeof (ared_t) + IPV6_ADDR_LEN,
	sizeof (ared_t),
	IP_ARP_PROTO_TYPE,
	sizeof (ared_t),
	IPV6_ADDR_LEN,
	0
};

/*
 * A template for an IPv6 AR_ENTRY_QUERY has not been created, as the areq
 * doesn't include an IP address in ill_dl_up() (the only place an areq is
 * used).
 */
static areq_t	ip_areq_template = {
	AR_ENTRY_QUERY,			/* cmd */
	sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
	sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,	/* protocol, from arp's perspective */
	sizeof (areq_t),	/* target addr offset */
	IP_ADDR_LEN,		/* target addr_length */
	0,			/* flags */
	sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
	IP_ADDR_LEN,		/* sender addr length */
	AR_EQ_DEFAULT_XMIT_COUNT,	/* xmit_count */
	AR_EQ_DEFAULT_XMIT_INTERVAL,	/* (re)xmit_interval in milliseconds */
	AR_EQ_DEFAULT_MAX_BUFFERED	/* max # of requests to buffer */
	/* anything else filled in by the code */
};

static arc_t	ip_aru_template = {
	AR_INTERFACE_UP,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
	AR_INTERFACE_DOWN,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
	AR_INTERFACE_ON,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
	AR_INTERFACE_OFF,
	sizeof (arc_t),		/* Name offset */
	sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arma_t	ip_arma_multi_template = {
	AR_MAPPING_ADD,
	sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
	sizeof (arma_t),	/* Name length (set by ill_arp_alloc) */
	IP_ARP_PROTO_TYPE,
	sizeof (arma_t),			/* proto_addr_offset */
	IP_ADDR_LEN,				/* proto_addr_length */
	sizeof (arma_t) + IP_ADDR_LEN,		/* proto_mask_offset */
	sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
	ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
	sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
	IP_MAX_HW_LEN,				/* hw_addr_length */
	0,					/* hw_mapping_start */
};

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
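
/*
 * A minimal sketch (hypothetical; the actual dispatch is done by
 * ip_wput_ioctl(), declared above) of how the table above might be
 * consulted for an incoming command `ioc_cmd' with payload `data_mp':
 *
 *	ipft_t *ipft;
 *
 *	for (ipft = ip_ioctl_ftbl; ipft->ipft_cmd != 0; ipft++) {
 *		if (ipft->ipft_cmd == ioc_cmd)
 *			break;
 *	}
 *	if (ipft->ipft_cmd == 0)			(unknown command)
 *		return (EINVAL);
 *	if (msgdsize(data_mp) < ipft->ipft_min_size)	(payload too short)
 *		return (EPROTO);
 *	error = (*ipft->ipft_pfi)(...);			(invoke the handler)
 *
 * The { 0 } entry terminates the scan, and ipft_flags then governs
 * whether and how the caller replies.
 */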

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t   ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
	    ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
	    ip_nodef_v6mapinfo, ip_ipv6_v6intfid, ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
	    ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static sin6_t	sin6_null;	/* Zero address for quick clears */
static sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns true if ok. False otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per-ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Common code for preparation of ARP commands. Two points to remember:
 * 1) The ill_name is tacked on at the end of the allocated space so
 *    the template's name_offset field must contain the total space
 *    to allocate less the name length.
 *
 * 2) The template's name_length field should contain the *template*
 *    length. We use it as a parameter to bcopy() and then write
 *    the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 */
mblk_t *
ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr)
{
	arc_t	*arc = (arc_t *)template;
	char	*cp;
	int	len;
	mblk_t	*mp;
	uint_t	name_length = ill->ill_name_length;
	uint_t	template_len = arc->arc_name_length;

	len = arc->arc_name_offset + name_length;
	mp = allocb(len, BPRI_HI);
	if (mp == NULL)
		return (NULL);
	cp = (char *)mp->b_rptr;
	mp->b_wptr = (uchar_t *)&cp[len];
	if (template_len)
		bcopy(template, cp, template_len);
	if (len > template_len)
		bzero(&cp[template_len], len - template_len);
	mp->b_datap->db_type = M_PROTO;

	arc = (arc_t *)cp;
	arc->arc_name_length = name_length;
	cp = (char *)arc + arc->arc_name_offset;
	bcopy(ill->ill_name, cp, name_length);

	if (addr) {
		area_t	*area = (area_t *)mp->b_rptr;

		cp = (char *)area + area->area_proto_addr_offset;
		bcopy(addr, cp, area->area_proto_addr_length);
		if (area->area_cmd == AR_ENTRY_ADD) {
			cp = (char *)area;
			len = area->area_proto_addr_length;
			if (area->area_proto_mask_offset)
				cp += area->area_proto_mask_offset;
			else
				cp += area->area_proto_addr_offset + len;
			while (len-- > 0)
				*cp++ = (char)~0;
		}
	}
	return (mp);
}
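
/*
 * A worked example (hypothetical interface name) of the sizing rules
 * above: deleting an IPv4 ARP entry on an ill named "hme0"
 * (ill_name_length == 5, including the NUL) via ip_ared_template gives
 *
 *	len = arc_name_offset + name_length
 *	    = (sizeof (ared_t) + IP_ADDR_LEN) + 5
 *
 * so the message is [ared_t | 4-byte address | "hme0\0"], and the copy's
 * name_length field is rewritten from sizeof (ared_t) to 5.
 */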
653 */ 654 if (IS_IPMP(ill)) { 655 if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { 656 freemsg(mp); 657 return (NULL); 658 } 659 } else { 660 ill_refhold(ill); 661 } 662 663 area = (area_t *)mp->b_rptr; 664 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; 665 area->area_flags |= optflags; 666 area->area_hw_addr_length = ill->ill_phys_addr_length; 667 bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, 668 area->area_hw_addr_length); 669 670 ill_refrele(ill); 671 return (mp); 672 } 673 674 mblk_t * 675 ipif_ared_alloc(ipif_t *ipif) 676 { 677 caddr_t addr; 678 uchar_t *aredp; 679 680 if (ipif->ipif_ill->ill_isv6) { 681 ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); 682 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 683 aredp = (uchar_t *)&ip6_ared_template; 684 } else { 685 addr = (caddr_t)&ipif->ipif_lcl_addr; 686 aredp = (uchar_t *)&ip_ared_template; 687 } 688 689 return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); 690 } 691 692 mblk_t * 693 ill_ared_alloc(ill_t *ill, ipaddr_t addr) 694 { 695 return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 696 (char *)&addr)); 697 } 698 699 mblk_t * 700 ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) 701 { 702 mblk_t *mp = ill_arp_alloc(ill, template, 0); 703 arie_t *arie; 704 705 if (mp != NULL) { 706 arie = (arie_t *)mp->b_rptr; 707 (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); 708 } 709 return (mp); 710 } 711 712 /* 713 * Completely vaporize a lower level tap and all associated interfaces. 714 * ill_delete is called only out of ip_close when the device control 715 * stream is being closed. 716 */ 717 void 718 ill_delete(ill_t *ill) 719 { 720 ipif_t *ipif; 721 ill_t *prev_ill; 722 ip_stack_t *ipst = ill->ill_ipst; 723 724 /* 725 * ill_delete may be forcibly entering the ipsq. The previous 726 * ioctl may not have completed and may need to be aborted. 727 * ipsq_flush takes care of it. If we don't need to enter the 728 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 729 * ill_delete_tail is sufficient. 730 */ 731 ipsq_flush(ill); 732 733 /* 734 * Nuke all interfaces. ipif_free will take down the interface, 735 * remove it from the list, and free the data structure. 736 * Walk down the ipif list and remove the logical interfaces 737 * first before removing the main ipif. We can't unplumb 738 * zeroth interface first in the case of IPv6 as reset_conn_ill 739 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking 740 * POINTOPOINT. 741 * 742 * If ill_ipif was not properly initialized (i.e low on memory), 743 * then no interfaces to clean up. In this case just clean up the 744 * ill. 745 */ 746 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 747 ipif_free(ipif); 748 749 /* 750 * Used only by ill_arp_on and ill_arp_off, which are writers. 751 * So nobody can be using this mp now. Free the mp allocated for 752 * honoring ILLF_NOARP 753 */ 754 freemsg(ill->ill_arp_on_mp); 755 ill->ill_arp_on_mp = NULL; 756 757 /* Clean up msgs on pending upcalls for mrouted */ 758 reset_mrt_ill(ill); 759 760 /* 761 * ipif_free -> reset_conn_ipif will remove all multicast 762 * references for IPv4. For IPv6, we need to do it here as 763 * it points only at ills. 764 */ 765 reset_conn_ill(ill); 766 767 /* 768 * Remove multicast references added as a result of calls to 769 * ip_join_allmulti(). 770 */ 771 ip_purge_allmulti(ill); 772 773 /* 774 * If the ill being deleted is under IPMP, boot it out of the illgrp. 
775 */ 776 if (IS_UNDER_IPMP(ill)) 777 ipmp_ill_leave_illgrp(ill); 778 779 /* 780 * ill_down will arrange to blow off any IRE's dependent on this 781 * ILL, and shut down fragmentation reassembly. 782 */ 783 ill_down(ill); 784 785 /* Let SCTP know, so that it can remove this from its list. */ 786 sctp_update_ill(ill, SCTP_ILL_REMOVE); 787 788 /* 789 * If an address on this ILL is being used as a source address then 790 * clear out the pointers in other ILLs that point to this ILL. 791 */ 792 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 793 if (ill->ill_usesrc_grp_next != NULL) { 794 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ 795 ill_disband_usesrc_group(ill); 796 } else { /* consumer of the usesrc ILL */ 797 prev_ill = ill_prev_usesrc(ill); 798 prev_ill->ill_usesrc_grp_next = 799 ill->ill_usesrc_grp_next; 800 } 801 } 802 rw_exit(&ipst->ips_ill_g_usesrc_lock); 803 } 804 805 static void 806 ipif_non_duplicate(ipif_t *ipif) 807 { 808 ill_t *ill = ipif->ipif_ill; 809 mutex_enter(&ill->ill_lock); 810 if (ipif->ipif_flags & IPIF_DUPLICATE) { 811 ipif->ipif_flags &= ~IPIF_DUPLICATE; 812 ASSERT(ill->ill_ipif_dup_count > 0); 813 ill->ill_ipif_dup_count--; 814 } 815 mutex_exit(&ill->ill_lock); 816 } 817 818 /* 819 * ill_delete_tail is called from ip_modclose after all references 820 * to the closing ill are gone. The wait is done in ip_modclose 821 */ 822 void 823 ill_delete_tail(ill_t *ill) 824 { 825 mblk_t **mpp; 826 ipif_t *ipif; 827 ip_stack_t *ipst = ill->ill_ipst; 828 829 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 830 ipif_non_duplicate(ipif); 831 ipif_down_tail(ipif); 832 } 833 834 ASSERT(ill->ill_ipif_dup_count == 0 && 835 ill->ill_arp_down_mp == NULL && 836 ill->ill_arp_del_mapping_mp == NULL); 837 838 /* 839 * If polling capability is enabled (which signifies direct 840 * upcall into IP and driver has ill saved as a handle), 841 * we need to make sure that unbind has completed before we 842 * let the ill disappear and driver no longer has any reference 843 * to this ill. 844 */ 845 mutex_enter(&ill->ill_lock); 846 while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) 847 cv_wait(&ill->ill_cv, &ill->ill_lock); 848 mutex_exit(&ill->ill_lock); 849 ASSERT(!(ill->ill_capabilities & 850 (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); 851 852 if (ill->ill_net_type != IRE_LOOPBACK) 853 qprocsoff(ill->ill_rq); 854 855 /* 856 * We do an ipsq_flush once again now. New messages could have 857 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls 858 * could also have landed up if an ioctl thread had looked up 859 * the ill before we set the ILL_CONDEMNED flag, but not yet 860 * enqueued the ioctl when we did the ipsq_flush last time. 861 */ 862 ipsq_flush(ill); 863 864 /* 865 * Free capabilities. 
866 */ 867 if (ill->ill_ipsec_capab_ah != NULL) { 868 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); 869 ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); 870 ill->ill_ipsec_capab_ah = NULL; 871 } 872 873 if (ill->ill_ipsec_capab_esp != NULL) { 874 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); 875 ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); 876 ill->ill_ipsec_capab_esp = NULL; 877 } 878 879 if (ill->ill_mdt_capab != NULL) { 880 kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); 881 ill->ill_mdt_capab = NULL; 882 } 883 884 if (ill->ill_hcksum_capab != NULL) { 885 kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 886 ill->ill_hcksum_capab = NULL; 887 } 888 889 if (ill->ill_zerocopy_capab != NULL) { 890 kmem_free(ill->ill_zerocopy_capab, 891 sizeof (ill_zerocopy_capab_t)); 892 ill->ill_zerocopy_capab = NULL; 893 } 894 895 if (ill->ill_lso_capab != NULL) { 896 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 897 ill->ill_lso_capab = NULL; 898 } 899 900 if (ill->ill_dld_capab != NULL) { 901 kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); 902 ill->ill_dld_capab = NULL; 903 } 904 905 while (ill->ill_ipif != NULL) 906 ipif_free_tail(ill->ill_ipif); 907 908 /* 909 * We have removed all references to ilm from conn and the ones joined 910 * within the kernel. 911 * 912 * We don't walk conns, mrts and ires because 913 * 914 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts. 915 * 2) ill_down ->ill_downi walks all the ires and cleans up 916 * ill references. 917 */ 918 ASSERT(ilm_walk_ill(ill) == 0); 919 920 /* 921 * If this ill is an IPMP meta-interface, blow away the illgrp. This 922 * is safe to do because the illgrp has already been unlinked from the 923 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. 924 */ 925 if (IS_IPMP(ill)) { 926 ipmp_illgrp_destroy(ill->ill_grp); 927 ill->ill_grp = NULL; 928 } 929 930 /* 931 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free 932 * could free the phyint. No more reference to the phyint after this 933 * point. 934 */ 935 (void) ill_glist_delete(ill); 936 937 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 938 if (ill->ill_ndd_name != NULL) 939 nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name); 940 rw_exit(&ipst->ips_ip_g_nd_lock); 941 942 if (ill->ill_frag_ptr != NULL) { 943 uint_t count; 944 945 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 946 mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 947 } 948 mi_free(ill->ill_frag_ptr); 949 ill->ill_frag_ptr = NULL; 950 ill->ill_frag_hash_tbl = NULL; 951 } 952 953 freemsg(ill->ill_nd_lla_mp); 954 /* Free all retained control messages. */ 955 mpp = &ill->ill_first_mp_to_free; 956 do { 957 while (mpp[0]) { 958 mblk_t *mp; 959 mblk_t *mp1; 960 961 mp = mpp[0]; 962 mpp[0] = mp->b_next; 963 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 964 mp1->b_next = NULL; 965 mp1->b_prev = NULL; 966 } 967 freemsg(mp); 968 } 969 } while (mpp++ != &ill->ill_last_mp_to_free); 970 971 ill_free_mib(ill); 972 973 #ifdef DEBUG 974 ill_trace_cleanup(ill); 975 #endif 976 977 /* Drop refcnt here */ 978 netstack_rele(ill->ill_ipst->ips_netstack); 979 ill->ill_ipst = NULL; 980 } 981 982 static void 983 ill_free_mib(ill_t *ill) 984 { 985 ip_stack_t *ipst = ill->ill_ipst; 986 987 /* 988 * MIB statistics must not be lost, so when an interface 989 * goes away the counter values will be added to the global 990 * MIBs. 
991 */ 992 if (ill->ill_ip_mib != NULL) { 993 if (ill->ill_isv6) { 994 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, 995 ill->ill_ip_mib); 996 } else { 997 ip_mib2_add_ip_stats(&ipst->ips_ip_mib, 998 ill->ill_ip_mib); 999 } 1000 1001 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 1002 ill->ill_ip_mib = NULL; 1003 } 1004 if (ill->ill_icmp6_mib != NULL) { 1005 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, 1006 ill->ill_icmp6_mib); 1007 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 1008 ill->ill_icmp6_mib = NULL; 1009 } 1010 } 1011 1012 /* 1013 * Concatenate together a physical address and a sap. 1014 * 1015 * Sap_lengths are interpreted as follows: 1016 * sap_length == 0 ==> no sap 1017 * sap_length > 0 ==> sap is at the head of the dlpi address 1018 * sap_length < 0 ==> sap is at the tail of the dlpi address 1019 */ 1020 static void 1021 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 1022 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 1023 { 1024 uint16_t sap_addr = (uint16_t)sap_src; 1025 1026 if (sap_length == 0) { 1027 if (phys_src == NULL) 1028 bzero(dst, phys_length); 1029 else 1030 bcopy(phys_src, dst, phys_length); 1031 } else if (sap_length < 0) { 1032 if (phys_src == NULL) 1033 bzero(dst, phys_length); 1034 else 1035 bcopy(phys_src, dst, phys_length); 1036 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 1037 } else { 1038 bcopy(&sap_addr, dst, sizeof (sap_addr)); 1039 if (phys_src == NULL) 1040 bzero((char *)dst + sap_length, phys_length); 1041 else 1042 bcopy(phys_src, (char *)dst + sap_length, phys_length); 1043 } 1044 } 1045 1046 /* 1047 * Generate a dl_unitdata_req mblk for the device and address given. 1048 * addr_length is the length of the physical portion of the address. 1049 * If addr is NULL include an all zero address of the specified length. 1050 * TRUE? In any case, addr_length is taken to be the entire length of the 1051 * dlpi address, including the absolute value of sap_length. 1052 */ 1053 mblk_t * 1054 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 1055 t_scalar_t sap_length) 1056 { 1057 dl_unitdata_req_t *dlur; 1058 mblk_t *mp; 1059 t_scalar_t abs_sap_length; /* absolute value */ 1060 1061 abs_sap_length = ABS(sap_length); 1062 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 1063 DL_UNITDATA_REQ); 1064 if (mp == NULL) 1065 return (NULL); 1066 dlur = (dl_unitdata_req_t *)mp->b_rptr; 1067 /* HACK: accomodate incompatible DLPI drivers */ 1068 if (addr_length == 8) 1069 addr_length = 6; 1070 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 1071 dlur->dl_dest_addr_offset = sizeof (*dlur); 1072 dlur->dl_priority.dl_min = 0; 1073 dlur->dl_priority.dl_max = 0; 1074 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 1075 (uchar_t *)&dlur[1]); 1076 return (mp); 1077 } 1078 1079 /* 1080 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp. Return 1081 * an error if we already have 1 or more ioctls in progress. This is only 1082 * needed for SIOCG*ARP. 1083 */ 1084 boolean_t 1085 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) 1086 { 1087 ASSERT(MUTEX_HELD(&ill->ill_lock)); 1088 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1089 /* We should only see M_IOCDATA arp ioctls here. */ 1090 ASSERT(add_mp->b_datap->db_type == M_IOCDATA); 1091 1092 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1093 /* 1094 * Return error if the conn has started closing. 

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all-zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp. Return
 * an error if we already have 1 or more ioctls in progress. This is only
 * needed for SIOCG*ARP.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	/* We should only see M_IOCDATA arp ioctls here. */
	ASSERT(add_mp->b_datap->db_type == M_IOCDATA);

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/*
	 * Return error if the conn has started closing. The conn
	 * could have finished cleaning up the pending mp list; if so
	 * we should not add another mp to the list, negating the cleanup.
	 */
	if (connp->conn_state_flags & CONN_CLOSING)
		return (B_FALSE);
	/*
	 * Add the pending mp to the head of the list, chained by b_next.
	 * Note down the conn on which the ioctl request came, in b_prev.
	 * This will be used to later get the conn, when we get a response
	 * on the ill queue, from some other module (typically arp).
	 */
	add_mp->b_next = (void *)ill->ill_pending_mp;
	add_mp->b_queue = CONNP_TO_WQ(connp);
	ill->ill_pending_mp = add_mp;
	if (connp != NULL)
		connp->conn_oper_pending_ill = ill;
	return (B_TRUE);
}

/*
 * Retrieve the ill_pending_mp and return it. We have to walk the list
 * of mblks starting at ill_pending_mp, and match based on the ioc_id.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
	mblk_t	*prev = NULL;
	mblk_t	*curr = NULL;
	uint_t	id;
	conn_t	*connp;

	/*
	 * When the conn closes, conn_ioctl_cleanup needs to clean
	 * up the pending mp, but it does not know the ioc_id and
	 * passes in a zero for it.
	 */
	mutex_enter(&ill->ill_lock);
	if (ioc_id != 0)
		*connpp = NULL;

	/* Search the list for the appropriate ioctl based on ioc_id */
	for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
	    prev = curr, curr = curr->b_next) {
		id = ((struct iocblk *)curr->b_rptr)->ioc_id;
		connp = Q_TO_CONN(curr->b_queue);
		/* Match based on the ioc_id or based on the conn */
		if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
			break;
	}

	if (curr != NULL) {
		/* Unlink the mblk from the pending mp list */
		if (prev != NULL) {
			prev->b_next = curr->b_next;
		} else {
			ASSERT(ill->ill_pending_mp == curr);
			ill->ill_pending_mp = curr->b_next;
		}

		/*
		 * conn refcnt must have been bumped up at the start of
		 * the ioctl. So we can safely access the conn.
		 */
		ASSERT(CONN_Q(curr->b_queue));
		*connpp = Q_TO_CONN(curr->b_queue);
		curr->b_next = NULL;
		curr->b_queue = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (curr);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list; if so
		 * we should not add another mp to the list, negating the
		 * cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
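
/*
 * Typical usage, modeled on ill_down_start() later in this file: the
 * caller, running as writer, tests for quiescence and parks the ioctl
 * mp in one atomic step under ill_lock, e.g.
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (!ill_is_quiescent(ill)) {
 *		(cannot fail since the `conn_t *' argument is NULL)
 *		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif,
 *		    ill->ill_rq, mp, ILL_DOWN);
 *		mutex_exit(&ill->ill_lock);
 *		return (B_FALSE);
 *	}
 *	mutex_exit(&ill->ill_lock);
 */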
1193 */ 1194 ASSERT(ipx->ipx_current_ipif != NULL); 1195 1196 /* 1197 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the 1198 * driver. 1199 */ 1200 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) || 1201 (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) || 1202 (DB_TYPE(add_mp) == M_PCPROTO)); 1203 1204 if (connp != NULL) { 1205 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1206 /* 1207 * Return error if the conn has started closing. The conn 1208 * could have finished cleaning up the pending mp list, 1209 * If so we should not add another mp to the list negating 1210 * the cleanup. 1211 */ 1212 if (connp->conn_state_flags & CONN_CLOSING) 1213 return (B_FALSE); 1214 } 1215 mutex_enter(&ipx->ipx_lock); 1216 ipx->ipx_pending_ipif = ipif; 1217 /* 1218 * Note down the queue in b_queue. This will be returned by 1219 * ipsq_pending_mp_get. Caller will then use these values to restart 1220 * the processing 1221 */ 1222 add_mp->b_next = NULL; 1223 add_mp->b_queue = q; 1224 ipx->ipx_pending_mp = add_mp; 1225 ipx->ipx_waitfor = waitfor; 1226 mutex_exit(&ipx->ipx_lock); 1227 1228 if (connp != NULL) 1229 connp->conn_oper_pending_ill = ipif->ipif_ill; 1230 1231 return (B_TRUE); 1232 } 1233 1234 /* 1235 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp 1236 * queued in the list. 1237 */ 1238 mblk_t * 1239 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 1240 { 1241 mblk_t *curr = NULL; 1242 ipxop_t *ipx = ipsq->ipsq_xop; 1243 1244 *connpp = NULL; 1245 mutex_enter(&ipx->ipx_lock); 1246 if (ipx->ipx_pending_mp == NULL) { 1247 mutex_exit(&ipx->ipx_lock); 1248 return (NULL); 1249 } 1250 1251 /* There can be only 1 such excl message */ 1252 curr = ipx->ipx_pending_mp; 1253 ASSERT(curr->b_next == NULL); 1254 ipx->ipx_pending_ipif = NULL; 1255 ipx->ipx_pending_mp = NULL; 1256 ipx->ipx_waitfor = 0; 1257 mutex_exit(&ipx->ipx_lock); 1258 1259 if (CONN_Q(curr->b_queue)) { 1260 /* 1261 * This mp did a refhold on the conn, at the start of the ioctl. 1262 * So we can safely return a pointer to the conn to the caller. 1263 */ 1264 *connpp = Q_TO_CONN(curr->b_queue); 1265 } else { 1266 *connpp = NULL; 1267 } 1268 curr->b_next = NULL; 1269 curr->b_prev = NULL; 1270 return (curr); 1271 } 1272 1273 /* 1274 * Cleanup the ioctl mp queued in ipx_pending_mp 1275 * - Called in the ill_delete path 1276 * - Called in the M_ERROR or M_HANGUP path on the ill. 1277 * - Called in the conn close path. 1278 */ 1279 boolean_t 1280 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 1281 { 1282 mblk_t *mp; 1283 ipxop_t *ipx; 1284 queue_t *q; 1285 ipif_t *ipif; 1286 1287 ASSERT(IAM_WRITER_ILL(ill)); 1288 ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 1289 1290 /* 1291 * If connp is null, unconditionally clean up the ipx_pending_mp. 1292 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 1293 * even if it is meant for another ill, since we have to enqueue 1294 * a new mp now in ipx_pending_mp to complete the ipif_down. 1295 * If connp is non-null we are called from the conn close path. 
1296 */ 1297 mutex_enter(&ipx->ipx_lock); 1298 mp = ipx->ipx_pending_mp; 1299 if (mp == NULL || (connp != NULL && 1300 mp->b_queue != CONNP_TO_WQ(connp))) { 1301 mutex_exit(&ipx->ipx_lock); 1302 return (B_FALSE); 1303 } 1304 /* Now remove from the ipx_pending_mp */ 1305 ipx->ipx_pending_mp = NULL; 1306 q = mp->b_queue; 1307 mp->b_next = NULL; 1308 mp->b_prev = NULL; 1309 mp->b_queue = NULL; 1310 1311 ipif = ipx->ipx_pending_ipif; 1312 ipx->ipx_pending_ipif = NULL; 1313 ipx->ipx_waitfor = 0; 1314 ipx->ipx_current_ipif = NULL; 1315 ipx->ipx_current_ioctl = 0; 1316 ipx->ipx_current_done = B_TRUE; 1317 mutex_exit(&ipx->ipx_lock); 1318 1319 if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 1320 if (connp == NULL) { 1321 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 1322 } else { 1323 ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); 1324 mutex_enter(&ipif->ipif_ill->ill_lock); 1325 ipif->ipif_state_flags &= ~IPIF_CHANGING; 1326 mutex_exit(&ipif->ipif_ill->ill_lock); 1327 } 1328 } else { 1329 /* 1330 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 1331 * be just inet_freemsg. we have to restart it 1332 * otherwise the thread will be stuck. 1333 */ 1334 inet_freemsg(mp); 1335 } 1336 return (B_TRUE); 1337 } 1338 1339 /* 1340 * The ill is closing. Cleanup all the pending mps. Called exclusively 1341 * towards the end of ill_delete. The refcount has gone to 0. So nobody 1342 * knows this ill, and hence nobody can add an mp to this list 1343 */ 1344 static void 1345 ill_pending_mp_cleanup(ill_t *ill) 1346 { 1347 mblk_t *mp; 1348 queue_t *q; 1349 1350 ASSERT(IAM_WRITER_ILL(ill)); 1351 1352 mutex_enter(&ill->ill_lock); 1353 /* 1354 * Every mp on the pending mp list originating from an ioctl 1355 * added 1 to the conn refcnt, at the start of the ioctl. 1356 * So bump it down now. See comments in ip_wput_nondata() 1357 */ 1358 while (ill->ill_pending_mp != NULL) { 1359 mp = ill->ill_pending_mp; 1360 ill->ill_pending_mp = mp->b_next; 1361 mutex_exit(&ill->ill_lock); 1362 1363 q = mp->b_queue; 1364 ASSERT(CONN_Q(q)); 1365 mp->b_next = NULL; 1366 mp->b_prev = NULL; 1367 mp->b_queue = NULL; 1368 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 1369 mutex_enter(&ill->ill_lock); 1370 } 1371 ill->ill_pending_ipif = NULL; 1372 1373 mutex_exit(&ill->ill_lock); 1374 } 1375 1376 /* 1377 * Called in the conn close path and ill delete path 1378 */ 1379 static void 1380 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 1381 { 1382 ipsq_t *ipsq; 1383 mblk_t *prev; 1384 mblk_t *curr; 1385 mblk_t *next; 1386 queue_t *q; 1387 mblk_t *tmp_list = NULL; 1388 1389 ASSERT(IAM_WRITER_ILL(ill)); 1390 if (connp != NULL) 1391 q = CONNP_TO_WQ(connp); 1392 else 1393 q = ill->ill_wq; 1394 1395 ipsq = ill->ill_phyint->phyint_ipsq; 1396 /* 1397 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 1398 * In the case of ioctl from a conn, there can be only 1 mp 1399 * queued on the ipsq. If an ill is being unplumbed, only messages 1400 * related to this ill are flushed, like M_ERROR or M_HANGUP message. 1401 * ioctls meant for this ill form conn's are not flushed. They will 1402 * be processed during ipsq_exit and will not find the ill and will 1403 * return error. 
1404 */ 1405 mutex_enter(&ipsq->ipsq_lock); 1406 for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; 1407 curr = next) { 1408 next = curr->b_next; 1409 if (curr->b_queue == q || curr->b_queue == RD(q)) { 1410 /* Unlink the mblk from the pending mp list */ 1411 if (prev != NULL) { 1412 prev->b_next = curr->b_next; 1413 } else { 1414 ASSERT(ipsq->ipsq_xopq_mphead == curr); 1415 ipsq->ipsq_xopq_mphead = curr->b_next; 1416 } 1417 if (ipsq->ipsq_xopq_mptail == curr) 1418 ipsq->ipsq_xopq_mptail = prev; 1419 /* 1420 * Create a temporary list and release the ipsq lock 1421 * New elements are added to the head of the tmp_list 1422 */ 1423 curr->b_next = tmp_list; 1424 tmp_list = curr; 1425 } else { 1426 prev = curr; 1427 } 1428 } 1429 mutex_exit(&ipsq->ipsq_lock); 1430 1431 while (tmp_list != NULL) { 1432 curr = tmp_list; 1433 tmp_list = curr->b_next; 1434 curr->b_next = NULL; 1435 curr->b_prev = NULL; 1436 curr->b_queue = NULL; 1437 if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { 1438 ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? 1439 CONN_CLOSE : NO_COPYOUT, NULL); 1440 } else { 1441 /* 1442 * IP-MT XXX In the case of TLI/XTI bind / optmgmt 1443 * this can't be just inet_freemsg. we have to 1444 * restart it otherwise the thread will be stuck. 1445 */ 1446 inet_freemsg(curr); 1447 } 1448 } 1449 } 1450 1451 /* 1452 * This conn has started closing. Cleanup any pending ioctl from this conn. 1453 * STREAMS ensures that there can be at most 1 ioctl pending on a stream. 1454 */ 1455 void 1456 conn_ioctl_cleanup(conn_t *connp) 1457 { 1458 mblk_t *curr; 1459 ipsq_t *ipsq; 1460 ill_t *ill; 1461 boolean_t refheld; 1462 1463 /* 1464 * Is any exclusive ioctl pending ? If so clean it up. If the 1465 * ioctl has not yet started, the mp is pending in the list headed by 1466 * ipsq_xopq_head. If the ioctl has started the mp could be present in 1467 * ipx_pending_mp. If the ioctl timed out in the streamhead but 1468 * is currently executing now the mp is not queued anywhere but 1469 * conn_oper_pending_ill is null. The conn close will wait 1470 * till the conn_ref drops to zero. 1471 */ 1472 mutex_enter(&connp->conn_lock); 1473 ill = connp->conn_oper_pending_ill; 1474 if (ill == NULL) { 1475 mutex_exit(&connp->conn_lock); 1476 return; 1477 } 1478 1479 curr = ill_pending_mp_get(ill, &connp, 0); 1480 if (curr != NULL) { 1481 mutex_exit(&connp->conn_lock); 1482 CONN_DEC_REF(connp); 1483 inet_freemsg(curr); 1484 return; 1485 } 1486 /* 1487 * We may not be able to refhold the ill if the ill/ipif 1488 * is changing. But we need to make sure that the ill will 1489 * not vanish. So we just bump up the ill_waiter count. 1490 */ 1491 refheld = ill_waiter_inc(ill); 1492 mutex_exit(&connp->conn_lock); 1493 if (refheld) { 1494 if (ipsq_enter(ill, B_TRUE, NEW_OP)) { 1495 ill_waiter_dcr(ill); 1496 /* 1497 * Check whether this ioctl has started and is 1498 * pending. If it is not found there then check 1499 * whether this ioctl has not even started and is in 1500 * the ipsq_xopq list. 1501 */ 1502 if (!ipsq_pending_mp_cleanup(ill, connp)) 1503 ipsq_xopq_mp_cleanup(ill, connp); 1504 ipsq = ill->ill_phyint->phyint_ipsq; 1505 ipsq_exit(ipsq); 1506 return; 1507 } 1508 } 1509 1510 /* 1511 * The ill is also closing and we could not bump up the 1512 * ill_waiter_count or we could not enter the ipsq. 

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;
	ire_t	*ire;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_multicast_ill == ill) {
		/* Revert to late binding */
		connp->conn_multicast_ill = NULL;
	}
	if (connp->conn_incoming_ill == ill)
		connp->conn_incoming_ill = NULL;
	if (connp->conn_outgoing_ill == ill)
		connp->conn_outgoing_ill = NULL;
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
	}
	if (connp->conn_ire_cache != NULL) {
		ire = connp->conn_ire_cache;
		/*
		 * Source address selection makes it possible for IRE_CACHE
		 * entries to be created with ire_stq coming from interface X
		 * and ipif coming from interface Y. Thus whenever interface
		 * X goes down, remove all references to it by checking both
		 * on ire_ipif and ire_stq.
		 */
		if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
		    (ire->ire_type == IRE_CACHE &&
		    ire->ire_stq == ill->ill_wq)) {
			connp->conn_ire_cache = NULL;
			mutex_exit(&connp->conn_lock);
			ire_refrele_notr(ire);
			return;
		}
	}
	mutex_exit(&connp->conn_lock);
}

static void
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		ipif_down_tail(ipif);
	}
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	(void) ipsq_pending_mp_cleanup(ill, NULL);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	/* Blow off any IREs dependent on this ILL. */
	ire_walk(ill_downi, ill, ipst);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'. (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	/*
	 * Source address selection makes it possible for IRE_CACHE
	 * entries to be created with ire_stq coming from interface X
	 * and ipif coming from interface Y. Thus whenever interface
	 * X goes down, remove all references to it by checking both
	 * on ire_ipif and ire_stq.
	 */
	if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
	    (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
		ire_delete(ire);
	}
}

/*
 * Remove ire/nce from the fastpath list.
 */
void
ill_fastpath_nack(ill_t *ill)
{
	nce_fastpath_list_dispatch(ill, NULL, NULL);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL) {
		/*
		 * Update all IRE's or NCE's that are waiting for
		 * fastpath update.
		 */
		nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
		mp1 = mp->b_cont;
		freeb(mp);
		mp = mp1;
	} else {
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	}

	freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	putnext(ill->ill_wq, mp);
	return (0);
}
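
/*
 * For reference, the probe built by ill_fastpath_probe() looks like:
 *
 *	M_IOCTL (struct iocblk, ioc_cmd == DL_IOC_HDR_INFO)
 *	  b_cont -> copy of dlur_mp (the DL_UNITDATA_REQ template)
 *
 * A fastpath-capable driver acks it with the template echoed back and,
 * in a further b_cont, the prebuilt link-layer header; ill_fastpath_ack()
 * above consumes that reply, handing the data to ndp_fastpath_update()
 * before freeing it.
 */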
1766 * We will do a fresh allocation when we get the response to our probe 1767 */ 1768 if (ill->ill_capab_reset_mp != NULL) { 1769 freemsg(ill->ill_capab_reset_mp); 1770 ill->ill_capab_reset_mp = NULL; 1771 } 1772 1773 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1774 1775 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); 1776 if (mp == NULL) 1777 return; 1778 1779 ill_capability_send(ill, mp); 1780 ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; 1781 } 1782 1783 void 1784 ill_capability_reset(ill_t *ill, boolean_t reneg) 1785 { 1786 ASSERT(IAM_WRITER_ILL(ill)); 1787 1788 if (ill->ill_dlpi_capab_state != IDCS_OK) 1789 return; 1790 1791 ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; 1792 1793 ill_capability_send(ill, ill->ill_capab_reset_mp); 1794 ill->ill_capab_reset_mp = NULL; 1795 /* 1796 * We turn off all capabilities except those pertaining to 1797 * direct function call capabilities viz. ILL_CAPAB_DLD* 1798 * which will be turned off by the corresponding reset functions. 1799 */ 1800 ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | 1801 ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); 1802 } 1803 1804 static void 1805 ill_capability_reset_alloc(ill_t *ill) 1806 { 1807 mblk_t *mp; 1808 size_t size = 0; 1809 int err; 1810 dl_capability_req_t *capb; 1811 1812 ASSERT(IAM_WRITER_ILL(ill)); 1813 ASSERT(ill->ill_capab_reset_mp == NULL); 1814 1815 if (ILL_MDT_CAPABLE(ill)) 1816 size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 1817 1818 if (ILL_HCKSUM_CAPABLE(ill)) { 1819 size += sizeof (dl_capability_sub_t) + 1820 sizeof (dl_capab_hcksum_t); 1821 } 1822 1823 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { 1824 size += sizeof (dl_capability_sub_t) + 1825 sizeof (dl_capab_zerocopy_t); 1826 } 1827 1828 if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { 1829 size += sizeof (dl_capability_sub_t); 1830 size += ill_capability_ipsec_reset_size(ill, NULL, NULL, 1831 NULL, NULL); 1832 } 1833 1834 if (ill->ill_capabilities & ILL_CAPAB_DLD) { 1835 size += sizeof (dl_capability_sub_t) + 1836 sizeof (dl_capab_dld_t); 1837 } 1838 1839 mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, 1840 STR_NOSIG, &err); 1841 1842 mp->b_datap->db_type = M_PROTO; 1843 bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); 1844 1845 capb = (dl_capability_req_t *)mp->b_rptr; 1846 capb->dl_primitive = DL_CAPABILITY_REQ; 1847 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1848 capb->dl_sub_length = size; 1849 1850 mp->b_wptr += sizeof (dl_capability_req_t); 1851 1852 /* 1853 * Each handler fills in the corresponding dl_capability_sub_t 1854 * inside the mblk, 1855 */ 1856 ill_capability_mdt_reset_fill(ill, mp); 1857 ill_capability_hcksum_reset_fill(ill, mp); 1858 ill_capability_zerocopy_reset_fill(ill, mp); 1859 ill_capability_ipsec_reset_fill(ill, mp); 1860 ill_capability_dld_reset_fill(ill, mp); 1861 1862 ill->ill_capab_reset_mp = mp; 1863 } 1864 1865 static void 1866 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1867 { 1868 dl_capab_id_t *id_ic; 1869 uint_t sub_dl_cap = outers->dl_cap; 1870 dl_capability_sub_t *inners; 1871 uint8_t *capend; 1872 1873 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1874 1875 /* 1876 * Note: range checks here are not absolutely sufficient to 1877 * make us robust against malformed messages sent by drivers; 1878 * this is in keeping with the rest of IP's dlpi handling. 
1879 * (Remember, it's coming from something else in the kernel 1880 * address space) 1881 */ 1882 1883 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1884 if (capend > mp->b_wptr) { 1885 cmn_err(CE_WARN, "ill_capability_id_ack: " 1886 "malformed sub-capability too long for mblk"); 1887 return; 1888 } 1889 1890 id_ic = (dl_capab_id_t *)(outers + 1); 1891 1892 if (outers->dl_length < sizeof (*id_ic) || 1893 (inners = &id_ic->id_subcap, 1894 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1895 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1896 "encapsulated capab type %d too long for mblk", 1897 inners->dl_cap); 1898 return; 1899 } 1900 1901 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1902 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1903 "isn't as expected; pass-thru module(s) detected, " 1904 "discarding capability\n", inners->dl_cap)); 1905 return; 1906 } 1907 1908 /* Process the encapsulated sub-capability */ 1909 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1910 } 1911 1912 /* 1913 * Process Multidata Transmit capability negotiation ack received from a 1914 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1915 * DL_CAPABILITY_ACK message. 1916 */ 1917 static void 1918 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1919 { 1920 mblk_t *nmp = NULL; 1921 dl_capability_req_t *oc; 1922 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1923 ill_mdt_capab_t **ill_mdt_capab; 1924 uint_t sub_dl_cap = isub->dl_cap; 1925 uint8_t *capend; 1926 1927 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1928 1929 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1930 1931 /* 1932 * Note: range checks here are not absolutely sufficient to 1933 * make us robust against malformed messages sent by drivers; 1934 * this is in keeping with the rest of IP's dlpi handling. 
1935 * (Remember, it's coming from something else in the kernel 1936 * address space) 1937 */ 1938 1939 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1940 if (capend > mp->b_wptr) { 1941 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1942 "malformed sub-capability too long for mblk"); 1943 return; 1944 } 1945 1946 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1947 1948 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1949 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1950 "unsupported MDT sub-capability (version %d, expected %d)", 1951 mdt_ic->mdt_version, MDT_VERSION_2); 1952 return; 1953 } 1954 1955 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 1956 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 1957 "capability isn't as expected; pass-thru module(s) " 1958 "detected, discarding capability\n")); 1959 return; 1960 } 1961 1962 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 1963 1964 if (*ill_mdt_capab == NULL) { 1965 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 1966 KM_NOSLEEP); 1967 if (*ill_mdt_capab == NULL) { 1968 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1969 "could not enable MDT version %d " 1970 "for %s (ENOMEM)\n", MDT_VERSION_2, 1971 ill->ill_name); 1972 return; 1973 } 1974 } 1975 1976 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 1977 "MDT version %d (%d bytes leading, %d bytes trailing " 1978 "header spaces, %d max pld bufs, %d span limit)\n", 1979 ill->ill_name, MDT_VERSION_2, 1980 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 1981 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 1982 1983 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 1984 (*ill_mdt_capab)->ill_mdt_on = 1; 1985 /* 1986 * Round the following values up to the nearest 32-bit boundary; ULP 1987 * may further adjust them to accommodate additional 1988 * protocol headers. We pass these values to ULP during 1989 * bind time.
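 * For example (illustrative numbers only): a driver that advertises a 14-byte leading header space would have mdt_hdr_head rounded up to roundup(14, 4) == 16 below.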
1990 */ 1991 (*ill_mdt_capab)->ill_mdt_hdr_head = 1992 roundup(mdt_ic->mdt_hdr_head, 4); 1993 (*ill_mdt_capab)->ill_mdt_hdr_tail = 1994 roundup(mdt_ic->mdt_hdr_tail, 4); 1995 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 1996 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 1997 1998 ill->ill_capabilities |= ILL_CAPAB_MDT; 1999 } else { 2000 uint_t size; 2001 uchar_t *rptr; 2002 2003 size = sizeof (dl_capability_req_t) + 2004 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2005 2006 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2007 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2008 "could not enable MDT for %s (ENOMEM)\n", 2009 ill->ill_name); 2010 return; 2011 } 2012 2013 rptr = nmp->b_rptr; 2014 /* initialize dl_capability_req_t */ 2015 oc = (dl_capability_req_t *)nmp->b_rptr; 2016 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2017 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2018 sizeof (dl_capab_mdt_t); 2019 nmp->b_rptr += sizeof (dl_capability_req_t); 2020 2021 /* initialize dl_capability_sub_t */ 2022 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2023 nmp->b_rptr += sizeof (*isub); 2024 2025 /* initialize dl_capab_mdt_t */ 2026 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2027 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2028 2029 nmp->b_rptr = rptr; 2030 2031 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2032 "to enable MDT version %d\n", ill->ill_name, 2033 MDT_VERSION_2)); 2034 2035 /* set ENABLE flag */ 2036 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2037 2038 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2039 ill_capability_send(ill, nmp); 2040 } 2041 } 2042 2043 static void 2044 ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) 2045 { 2046 dl_capab_mdt_t *mdt_subcap; 2047 dl_capability_sub_t *dl_subcap; 2048 2049 if (!ILL_MDT_CAPABLE(ill)) 2050 return; 2051 2052 ASSERT(ill->ill_mdt_capab != NULL); 2053 2054 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2055 dl_subcap->dl_cap = DL_CAPAB_MDT; 2056 dl_subcap->dl_length = sizeof (*mdt_subcap); 2057 2058 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2059 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2060 mdt_subcap->mdt_flags = 0; 2061 mdt_subcap->mdt_hdr_head = 0; 2062 mdt_subcap->mdt_hdr_tail = 0; 2063 2064 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2065 } 2066 2067 static void 2068 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) 2069 { 2070 dl_capability_sub_t *dl_subcap; 2071 2072 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2073 return; 2074 2075 /* 2076 * The dl_capab_dld_t that follows the dl_capability_sub_t is not 2077 * initialized below since it is not used by DLD. 2078 */ 2079 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2080 dl_subcap->dl_cap = DL_CAPAB_DLD; 2081 dl_subcap->dl_length = sizeof (dl_capab_dld_t); 2082 2083 mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); 2084 } 2085 2086 /* 2087 * Allocate an IPsec capability request which will be filled by our 2088 * caller to turn on support for one or more algorithms. 2089 */ 2090 /* ARGSUSED */ 2091 static mblk_t * 2092 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2093 { 2094 mblk_t *nmp; 2095 dl_capability_req_t *ocap; 2096 dl_capab_ipsec_t *ocip; 2097 dl_capab_ipsec_t *icip; 2098 uint8_t *ptr; 2099 icip = (dl_capab_ipsec_t *)(isub + 1); 2100 2101 /* 2102 * Allocate new mblk which will contain a new capability 2103 * request to enable the capabilities. 
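 * The request built below is laid out as a dl_capability_req_t, a copy of the dl_capability_sub_t taken from the ack, and the dl_capab_ipsec_t header copied from the ack; b_wptr is left at cip_data[0] so that the caller can append one dl_capab_ipsec_alg_t per algorithm to enable.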
2104 */ 2105 2106 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2107 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2108 if (nmp == NULL) 2109 return (NULL); 2110 2111 ptr = nmp->b_rptr; 2112 2113 /* initialize dl_capability_req_t */ 2114 ocap = (dl_capability_req_t *)ptr; 2115 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2116 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2117 ptr += sizeof (dl_capability_req_t); 2118 2119 /* initialize dl_capability_sub_t */ 2120 bcopy(isub, ptr, sizeof (*isub)); 2121 ptr += sizeof (*isub); 2122 2123 /* initialize dl_capab_ipsec_t */ 2124 ocip = (dl_capab_ipsec_t *)ptr; 2125 bcopy(icip, ocip, sizeof (*icip)); 2126 2127 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2128 return (nmp); 2129 } 2130 2131 /* 2132 * Process an IPsec capability negotiation ack received from a DLS Provider. 2133 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2134 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2135 */ 2136 static void 2137 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2138 { 2139 dl_capab_ipsec_t *icip; 2140 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2141 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ 2142 uint_t cipher, nciphers; 2143 mblk_t *nmp; 2144 uint_t alg_len; 2145 boolean_t need_sadb_dump; 2146 uint_t sub_dl_cap = isub->dl_cap; 2147 ill_ipsec_capab_t **ill_capab; 2148 uint64_t ill_capab_flag; 2149 uint8_t *capend, *ciphend; 2150 boolean_t sadb_resync; 2151 2152 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2153 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2154 2155 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2156 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2157 ill_capab_flag = ILL_CAPAB_AH; 2158 } else { 2159 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2160 ill_capab_flag = ILL_CAPAB_ESP; 2161 } 2162 2163 /* 2164 * If the ill capability structure exists, then this incoming 2165 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2166 * If this is so, then we'd need to resynchronize the SADB 2167 * after re-enabling the offloaded ciphers. 2168 */ 2169 sadb_resync = (*ill_capab != NULL); 2170 2171 /* 2172 * Note: range checks here are not absolutely sufficient to 2173 * make us robust against malformed messages sent by drivers; 2174 * this is in keeping with the rest of IP's dlpi handling. 2175 * (Remember, it's coming from something else in the kernel 2176 * address space) 2177 */ 2178 2179 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2180 if (capend > mp->b_wptr) { 2181 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2182 "malformed sub-capability too long for mblk"); 2183 return; 2184 } 2185 2186 /* 2187 * There are two types of acks we process here: 2188 * 1. acks in reply to a (first form) generic capability req 2189 * (no ENABLE flag set) 2190 * 2. acks in reply to an ENABLE capability req.
2191 * (ENABLE flag set) 2192 * 2193 * We process the subcapability passed as argument as follows: 2194 * 1 do initializations 2195 * 1.1 initialize nmp = NULL 2196 * 1.2 set need_sadb_dump to B_FALSE 2197 * 2 for each cipher in subcapability: 2198 * 2.1 if ENABLE flag is set: 2199 * 2.1.1 update per-ill ipsec capabilities info 2200 * 2.1.2 set need_sadb_dump to B_TRUE 2201 * 2.2 if ENABLE flag is not set: 2202 * 2.2.1 if nmp is NULL: 2203 * 2.2.1.1 allocate and initialize nmp 2204 * 2.2.1.2 init current pos in nmp 2205 * 2.2.2 copy current cipher to current pos in nmp 2206 * 2.2.3 set ENABLE flag in nmp 2207 * 2.2.4 update current pos 2208 * 3 if nmp is not equal to NULL, send enable request 2209 * 3.1 send capability request 2210 * 4 if need_sadb_dump is B_TRUE 2211 * 4.1 enable promiscuous on/off notifications 2212 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2213 * AH or ESP SA's to interface. 2214 */ 2215 2216 nmp = NULL; 2217 oalg = NULL; 2218 need_sadb_dump = B_FALSE; 2219 icip = (dl_capab_ipsec_t *)(isub + 1); 2220 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2221 2222 nciphers = icip->cip_nciphers; 2223 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2224 2225 if (ciphend > capend) { 2226 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2227 "too many ciphers for sub-capability len"); 2228 return; 2229 } 2230 2231 for (cipher = 0; cipher < nciphers; cipher++) { 2232 alg_len = sizeof (dl_capab_ipsec_alg_t); 2233 2234 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2235 /* 2236 * TBD: when we provide a way to disable capabilities 2237 * from above, need to manage the request-pending state 2238 * and fail if we were not expecting this ACK. 2239 */ 2240 IPSECHW_DEBUG(IPSECHW_CAPAB, 2241 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2242 2243 /* 2244 * Update IPsec capabilities for this ill 2245 */ 2246 2247 if (*ill_capab == NULL) { 2248 IPSECHW_DEBUG(IPSECHW_CAPAB, 2249 ("ill_capability_ipsec_ack: " 2250 "allocating ipsec_capab for ill\n")); 2251 *ill_capab = ill_ipsec_capab_alloc(); 2252 2253 if (*ill_capab == NULL) { 2254 cmn_err(CE_WARN, 2255 "ill_capability_ipsec_ack: " 2256 "could not enable IPsec Hardware " 2257 "acceleration for %s (ENOMEM)\n", 2258 ill->ill_name); 2259 return; 2260 } 2261 } 2262 2263 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2264 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2265 2266 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2267 cmn_err(CE_WARN, 2268 "ill_capability_ipsec_ack: " 2269 "malformed IPsec algorithm id %d", 2270 ialg->alg_prim); 2271 continue; 2272 } 2273 2274 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2275 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2276 ialg->alg_prim); 2277 } else { 2278 ipsec_capab_algparm_t *alp; 2279 2280 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2281 ialg->alg_prim); 2282 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2283 ialg->alg_prim)) { 2284 cmn_err(CE_WARN, 2285 "ill_capability_ipsec_ack: " 2286 "no space for IPsec alg id %d", 2287 ialg->alg_prim); 2288 continue; 2289 } 2290 alp = &((*ill_capab)->encr_algparm[ 2291 ialg->alg_prim]); 2292 alp->minkeylen = ialg->alg_minbits; 2293 alp->maxkeylen = ialg->alg_maxbits; 2294 } 2295 ill->ill_capabilities |= ill_capab_flag; 2296 /* 2297 * indicate that a capability was enabled, which 2298 * will be used below to kick off a SADB dump 2299 * to the ill. 
2300 */ 2301 need_sadb_dump = B_TRUE; 2302 } else { 2303 IPSECHW_DEBUG(IPSECHW_CAPAB, 2304 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2305 ialg->alg_prim)); 2306 2307 if (nmp == NULL) { 2308 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2309 if (nmp == NULL) { 2310 /* 2311 * Sending the PROMISC_ON/OFF 2312 * notification request failed. 2313 * We cannot enable the algorithms 2314 * since the Provider will not 2315 * notify IP of promiscuous mode 2316 * changes, which could lead 2317 * to leakage of packets. 2318 */ 2319 cmn_err(CE_WARN, 2320 "ill_capability_ipsec_ack: " 2321 "could not enable IPsec Hardware " 2322 "acceleration for %s (ENOMEM)\n", 2323 ill->ill_name); 2324 return; 2325 } 2326 /* ptr to current output alg specifier */ 2327 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2328 } 2329 2330 /* 2331 * Copy current alg specifier, set ENABLE 2332 * flag, and advance to next output alg. 2333 * For now we enable all IPsec capabilities. 2334 */ 2335 ASSERT(oalg != NULL); 2336 bcopy(ialg, oalg, alg_len); 2337 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2338 nmp->b_wptr += alg_len; 2339 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2340 } 2341 2342 /* move to next input algorithm specifier */ 2343 ialg = (dl_capab_ipsec_alg_t *) 2344 ((char *)ialg + alg_len); 2345 } 2346 2347 if (nmp != NULL) 2348 /* 2349 * nmp points to a DL_CAPABILITY_REQ message to enable 2350 * IPsec hardware acceleration. 2351 */ 2352 ill_capability_send(ill, nmp); 2353 2354 if (need_sadb_dump) 2355 /* 2356 * An acknowledgement corresponding to a request to 2357 * enable acceleration was received; notify SADB. 2358 */ 2359 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2360 } 2361 2362 /* 2363 * Given an mblk with enough space in it, create sub-capability entries for 2364 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2365 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2366 * in preparation for the reset DL_CAPABILITY_REQ message.
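 * The layout written here mirrors what ill_capability_ipsec_ack() parses: a dl_capability_sub_t, a dl_capab_ipsec_t header, then one dl_capab_ipsec_alg_t entry per previously-enabled cipher.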
2367 */ 2368 static void 2369 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2370 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2371 { 2372 dl_capab_ipsec_t *oipsec; 2373 dl_capab_ipsec_alg_t *oalg; 2374 dl_capability_sub_t *dl_subcap; 2375 int i, k; 2376 2377 ASSERT(nciphers > 0); 2378 ASSERT(ill_cap != NULL); 2379 ASSERT(mp != NULL); 2380 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2381 2382 /* dl_capability_sub_t for "stype" */ 2383 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2384 dl_subcap->dl_cap = stype; 2385 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2386 mp->b_wptr += sizeof (dl_capability_sub_t); 2387 2388 /* dl_capab_ipsec_t for "stype" */ 2389 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2390 oipsec->cip_version = 1; 2391 oipsec->cip_nciphers = nciphers; 2392 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2393 2394 /* create entries for "stype" AUTH ciphers */ 2395 for (i = 0; i < ill_cap->algs_size; i++) { 2396 for (k = 0; k < BITSPERBYTE; k++) { 2397 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2398 continue; 2399 2400 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2401 bzero((void *)oalg, sizeof (*oalg)); 2402 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2403 oalg->alg_prim = k + (BITSPERBYTE * i); 2404 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2405 } 2406 } 2407 /* create entries for "stype" ENCR ciphers */ 2408 for (i = 0; i < ill_cap->algs_size; i++) { 2409 for (k = 0; k < BITSPERBYTE; k++) { 2410 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2411 continue; 2412 2413 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2414 bzero((void *)oalg, sizeof (*oalg)); 2415 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2416 oalg->alg_prim = k + (BITSPERBYTE * i); 2417 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2418 } 2419 } 2420 } 2421 2422 /* 2423 * Macro to count number of 1s in a byte (8-bit word). The total count is 2424 * accumulated into the passed-in argument (sum). We could use SPARCv9's 2425 * POPC instruction, but our macro is more flexible for an arbitrary length 2426 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2427 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2428 * stays that way, we can reduce the number of iterations required. 
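 * As a worked example, COUNT_1S(0xb6, sum) adds 5 to sum: 0xb6 is 10110110; the first step leaves per-pair counts 01 10 01 01 (0x65), the second leaves per-nibble counts 0011 0010 (0x32), and the final step adds 3 + 2.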
2429 */ 2430 #define COUNT_1S(val, sum) { \ 2431 uint8_t x = val & 0xff; \ 2432 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2433 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2434 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2435 } 2436 2437 /* ARGSUSED */ 2438 static int 2439 ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, 2440 int *esp_cntp, int *esp_lenp) 2441 { 2442 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2443 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2444 uint64_t ill_capabilities = ill->ill_capabilities; 2445 int ah_cnt = 0, esp_cnt = 0; 2446 int ah_len = 0, esp_len = 0; 2447 int i, size = 0; 2448 2449 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2450 return (0); 2451 2452 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2453 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2454 2455 /* Find out the number of ciphers for AH */ 2456 if (cap_ah != NULL) { 2457 for (i = 0; i < cap_ah->algs_size; i++) { 2458 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2459 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2460 } 2461 if (ah_cnt > 0) { 2462 size += sizeof (dl_capability_sub_t) + 2463 sizeof (dl_capab_ipsec_t); 2464 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2465 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2466 size += ah_len; 2467 } 2468 } 2469 2470 /* Find out the number of ciphers for ESP */ 2471 if (cap_esp != NULL) { 2472 for (i = 0; i < cap_esp->algs_size; i++) { 2473 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2474 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2475 } 2476 if (esp_cnt > 0) { 2477 size += sizeof (dl_capability_sub_t) + 2478 sizeof (dl_capab_ipsec_t); 2479 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2480 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2481 size += esp_len; 2482 } 2483 } 2484 2485 if (ah_cntp != NULL) 2486 *ah_cntp = ah_cnt; 2487 if (ah_lenp != NULL) 2488 *ah_lenp = ah_len; 2489 if (esp_cntp != NULL) 2490 *esp_cntp = esp_cnt; 2491 if (esp_lenp != NULL) 2492 *esp_lenp = esp_len; 2493 2494 return (size); 2495 } 2496 2497 /* ARGSUSED */ 2498 static void 2499 ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) 2500 { 2501 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2502 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2503 int ah_cnt = 0, esp_cnt = 0; 2504 int ah_len = 0, esp_len = 0; 2505 int size; 2506 2507 size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, 2508 &esp_cnt, &esp_len); 2509 if (size == 0) 2510 return; 2511 2512 /* 2513 * Clear the capability flags for IPsec HA but retain the ill 2514 * capability structures since it's possible that another thread 2515 * is still referring to them. The structures only get deallocated 2516 * when we destroy the ill. 2517 * 2518 * Various places check the flags to see if the ill is capable of 2519 * hardware acceleration, and by clearing them we ensure that new 2520 * outbound IPsec packets are sent down encrypted. 2521 */ 2522 2523 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2524 if (ah_cnt > 0) { 2525 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2526 cap_ah, mp); 2527 } 2528 2529 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2530 if (esp_cnt > 0) { 2531 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2532 cap_esp, mp); 2533 } 2534 2535 /* 2536 * At this point we've composed a bunch of sub-capabilities to be 2537 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2538 * by the caller. 
Upon receiving this reset message, the driver 2539 * must stop inbound decryption (by destroying all inbound SAs) 2540 * and let the corresponding packets come in encrypted. 2541 */ 2542 } 2543 2544 static void 2545 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2546 boolean_t encapsulated) 2547 { 2548 boolean_t legacy = B_FALSE; 2549 2550 /* 2551 * Note that only the following two sub-capabilities may be 2552 * considered as "legacy", since their original definitions 2553 * do not incorporate the dl_mid_t module ID token, and hence 2554 * may require the use of the wrapper sub-capability. 2555 */ 2556 switch (subp->dl_cap) { 2557 case DL_CAPAB_IPSEC_AH: 2558 case DL_CAPAB_IPSEC_ESP: 2559 legacy = B_TRUE; 2560 break; 2561 } 2562 2563 /* 2564 * For legacy sub-capabilities which don't incorporate a queue_t 2565 * pointer in their structures, discard them if we detect that 2566 * there are intermediate modules in between IP and the driver. 2567 */ 2568 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2569 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2570 "%d discarded; %d module(s) present below IP\n", 2571 subp->dl_cap, ill->ill_lmod_cnt)); 2572 return; 2573 } 2574 2575 switch (subp->dl_cap) { 2576 case DL_CAPAB_IPSEC_AH: 2577 case DL_CAPAB_IPSEC_ESP: 2578 ill_capability_ipsec_ack(ill, mp, subp); 2579 break; 2580 case DL_CAPAB_MDT: 2581 ill_capability_mdt_ack(ill, mp, subp); 2582 break; 2583 case DL_CAPAB_HCKSUM: 2584 ill_capability_hcksum_ack(ill, mp, subp); 2585 break; 2586 case DL_CAPAB_ZEROCOPY: 2587 ill_capability_zerocopy_ack(ill, mp, subp); 2588 break; 2589 case DL_CAPAB_DLD: 2590 ill_capability_dld_ack(ill, mp, subp); 2591 break; 2592 default: 2593 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2594 subp->dl_cap)); 2595 } 2596 } 2597 2598 /* 2599 * Process a hardware checksum offload capability negotiation ack received 2600 * from a DLS Provider. isub must point to the sub-capability (DL_CAPAB_HCKSUM) 2601 * of a DL_CAPABILITY_ACK message. 2602 */ 2603 static void 2604 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2605 { 2606 dl_capability_req_t *ocap; 2607 dl_capab_hcksum_t *ihck, *ohck; 2608 ill_hcksum_capab_t **ill_hcksum; 2609 mblk_t *nmp = NULL; 2610 uint_t sub_dl_cap = isub->dl_cap; 2611 uint8_t *capend; 2612 2613 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 2614 2615 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 2616 2617 /* 2618 * Note: range checks here are not absolutely sufficient to 2619 * make us robust against malformed messages sent by drivers; 2620 * this is in keeping with the rest of IP's dlpi handling. 2621 * (Remember, it's coming from something else in the kernel 2622 * address space) 2623 */ 2624 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2625 if (capend > mp->b_wptr) { 2626 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2627 "malformed sub-capability too long for mblk"); 2628 return; 2629 } 2630 2631 /* 2632 * There are two types of acks we process here: 2633 * 1. acks in reply to a (first form) generic capability req 2634 * (no ENABLE flag set) 2635 * 2. acks in reply to an ENABLE capability req.
2636 * (ENABLE flag set) 2637 */ 2638 ihck = (dl_capab_hcksum_t *)(isub + 1); 2639 2640 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 2641 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 2642 "unsupported hardware checksum " 2643 "sub-capability (version %d, expected %d)", 2644 ihck->hcksum_version, HCKSUM_VERSION_1); 2645 return; 2646 } 2647 2648 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 2649 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 2650 "checksum capability isn't as expected; pass-thru " 2651 "module(s) detected, discarding capability\n")); 2652 return; 2653 } 2654 2655 #define CURR_HCKSUM_CAPAB \ 2656 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 2657 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 2658 2659 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 2660 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 2661 /* do ENABLE processing */ 2662 if (*ill_hcksum == NULL) { 2663 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 2664 KM_NOSLEEP); 2665 2666 if (*ill_hcksum == NULL) { 2667 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2668 "could not enable hcksum version %d " 2669 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 2670 ill->ill_name); 2671 return; 2672 } 2673 } 2674 2675 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 2676 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 2677 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 2678 ip1dbg(("ill_capability_hcksum_ack: interface %s " 2679 "has enabled hardware checksumming\n ", 2680 ill->ill_name)); 2681 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 2682 /* 2683 * Enabling hardware checksum offload 2684 * Currently IP supports {TCP,UDP}/IPv4 2685 * partial and full cksum offload and 2686 * IPv4 header checksum offload. 2687 * Allocate new mblk which will 2688 * contain a new capability request 2689 * to enable hardware checksum offload. 2690 */ 2691 uint_t size; 2692 uchar_t *rptr; 2693 2694 size = sizeof (dl_capability_req_t) + 2695 sizeof (dl_capability_sub_t) + isub->dl_length; 2696 2697 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2698 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2699 "could not enable hardware cksum for %s (ENOMEM)\n", 2700 ill->ill_name); 2701 return; 2702 } 2703 2704 rptr = nmp->b_rptr; 2705 /* initialize dl_capability_req_t */ 2706 ocap = (dl_capability_req_t *)nmp->b_rptr; 2707 ocap->dl_sub_offset = 2708 sizeof (dl_capability_req_t); 2709 ocap->dl_sub_length = 2710 sizeof (dl_capability_sub_t) + 2711 isub->dl_length; 2712 nmp->b_rptr += sizeof (dl_capability_req_t); 2713 2714 /* initialize dl_capability_sub_t */ 2715 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2716 nmp->b_rptr += sizeof (*isub); 2717 2718 /* initialize dl_capab_hcksum_t */ 2719 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 2720 bcopy(ihck, ohck, sizeof (*ihck)); 2721 2722 nmp->b_rptr = rptr; 2723 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2724 2725 /* Set ENABLE flag */ 2726 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 2727 ohck->hcksum_txflags |= HCKSUM_ENABLE; 2728 2729 /* 2730 * nmp points to a DL_CAPABILITY_REQ message to enable 2731 * hardware checksum acceleration. 
2732 */ 2733 ill_capability_send(ill, nmp); 2734 } else { 2735 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 2736 "advertised %x hardware checksum capability flags\n", 2737 ill->ill_name, ihck->hcksum_txflags)); 2738 } 2739 } 2740 2741 static void 2742 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) 2743 { 2744 dl_capab_hcksum_t *hck_subcap; 2745 dl_capability_sub_t *dl_subcap; 2746 2747 if (!ILL_HCKSUM_CAPABLE(ill)) 2748 return; 2749 2750 ASSERT(ill->ill_hcksum_capab != NULL); 2751 2752 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2753 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 2754 dl_subcap->dl_length = sizeof (*hck_subcap); 2755 2756 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 2757 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 2758 hck_subcap->hcksum_txflags = 0; 2759 2760 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); 2761 } 2762 2763 static void 2764 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2765 { 2766 mblk_t *nmp = NULL; 2767 dl_capability_req_t *oc; 2768 dl_capab_zerocopy_t *zc_ic, *zc_oc; 2769 ill_zerocopy_capab_t **ill_zerocopy_capab; 2770 uint_t sub_dl_cap = isub->dl_cap; 2771 uint8_t *capend; 2772 2773 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 2774 2775 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 2776 2777 /* 2778 * Note: range checks here are not absolutely sufficient to 2779 * make us robust against malformed messages sent by drivers; 2780 * this is in keeping with the rest of IP's dlpi handling. 2781 * (Remember, it's coming from something else in the kernel 2782 * address space) 2783 */ 2784 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2785 if (capend > mp->b_wptr) { 2786 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2787 "malformed sub-capability too long for mblk"); 2788 return; 2789 } 2790 2791 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 2792 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 2793 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 2794 "unsupported ZEROCOPY sub-capability (version %d, " 2795 "expected %d)", zc_ic->zerocopy_version, 2796 ZEROCOPY_VERSION_1); 2797 return; 2798 } 2799 2800 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 2801 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 2802 "capability isn't as expected; pass-thru module(s) " 2803 "detected, discarding capability\n")); 2804 return; 2805 } 2806 2807 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 2808 if (*ill_zerocopy_capab == NULL) { 2809 *ill_zerocopy_capab = 2810 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 2811 KM_NOSLEEP); 2812 2813 if (*ill_zerocopy_capab == NULL) { 2814 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2815 "could not enable Zero-copy version %d " 2816 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 2817 ill->ill_name); 2818 return; 2819 } 2820 } 2821 2822 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 2823 "supports Zero-copy version %d\n", ill->ill_name, 2824 ZEROCOPY_VERSION_1)); 2825 2826 (*ill_zerocopy_capab)->ill_zerocopy_version = 2827 zc_ic->zerocopy_version; 2828 (*ill_zerocopy_capab)->ill_zerocopy_flags = 2829 zc_ic->zerocopy_flags; 2830 2831 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 2832 } else { 2833 uint_t size; 2834 uchar_t *rptr; 2835 2836 size = sizeof (dl_capability_req_t) + 2837 sizeof (dl_capability_sub_t) + 2838 sizeof (dl_capab_zerocopy_t); 2839 2840 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2841 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2842 "could 
not enable zerocopy for %s (ENOMEM)\n", 2843 ill->ill_name); 2844 return; 2845 } 2846 2847 rptr = nmp->b_rptr; 2848 /* initialize dl_capability_req_t */ 2849 oc = (dl_capability_req_t *)rptr; 2850 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2851 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2852 sizeof (dl_capab_zerocopy_t); 2853 rptr += sizeof (dl_capability_req_t); 2854 2855 /* initialize dl_capability_sub_t */ 2856 bcopy(isub, rptr, sizeof (*isub)); 2857 rptr += sizeof (*isub); 2858 2859 /* initialize dl_capab_zerocopy_t */ 2860 zc_oc = (dl_capab_zerocopy_t *)rptr; 2861 *zc_oc = *zc_ic; 2862 2863 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 2864 "to enable zero-copy version %d\n", ill->ill_name, 2865 ZEROCOPY_VERSION_1)); 2866 2867 /* set VMSAFE_MEM flag */ 2868 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 2869 2870 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 2871 ill_capability_send(ill, nmp); 2872 } 2873 } 2874 2875 static void 2876 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 2877 { 2878 dl_capab_zerocopy_t *zerocopy_subcap; 2879 dl_capability_sub_t *dl_subcap; 2880 2881 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 2882 return; 2883 2884 ASSERT(ill->ill_zerocopy_capab != NULL); 2885 2886 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2887 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 2888 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 2889 2890 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 2891 zerocopy_subcap->zerocopy_version = 2892 ill->ill_zerocopy_capab->ill_zerocopy_version; 2893 zerocopy_subcap->zerocopy_flags = 0; 2894 2895 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 2896 } 2897 2898 /* 2899 * DLD capability 2900 * Refer to dld.h for more information regarding the purpose and usage 2901 * of this capability. 2902 */ 2903 static void 2904 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2905 { 2906 dl_capab_dld_t *dld_ic, dld; 2907 uint_t sub_dl_cap = isub->dl_cap; 2908 uint8_t *capend; 2909 ill_dld_capab_t *idc; 2910 2911 ASSERT(IAM_WRITER_ILL(ill)); 2912 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 2913 2914 /* 2915 * Note: range checks here are not absolutely sufficient to 2916 * make us robust against malformed messages sent by drivers; 2917 * this is in keeping with the rest of IP's dlpi handling. 2918 * (Remember, it's coming from something else in the kernel 2919 * address space) 2920 */ 2921 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2922 if (capend > mp->b_wptr) { 2923 cmn_err(CE_WARN, "ill_capability_dld_ack: " 2924 "malformed sub-capability too long for mblk"); 2925 return; 2926 } 2927 dld_ic = (dl_capab_dld_t *)(isub + 1); 2928 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 2929 cmn_err(CE_CONT, "ill_capability_dld_ack: " 2930 "unsupported DLD sub-capability (version %d, " 2931 "expected %d)", dld_ic->dld_version, 2932 DLD_CURRENT_VERSION); 2933 return; 2934 } 2935 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 2936 ip1dbg(("ill_capability_dld_ack: mid token for dld " 2937 "capability isn't as expected; pass-thru module(s) " 2938 "detected, discarding capability\n")); 2939 return; 2940 } 2941 2942 /* 2943 * Copy locally to ensure alignment. 
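 * (The dl_capab_dld_t sits at an arbitrary offset inside the mblk and may be misaligned for direct dereference; the aligned stack copy is safe to use.)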
2944 */ 2945 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 2946 2947 if ((idc = ill->ill_dld_capab) == NULL) { 2948 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 2949 if (idc == NULL) { 2950 cmn_err(CE_WARN, "ill_capability_dld_ack: " 2951 "could not enable DLD version %d " 2952 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 2953 ill->ill_name); 2954 return; 2955 } 2956 ill->ill_dld_capab = idc; 2957 } 2958 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 2959 idc->idc_capab_dh = (void *)dld.dld_capab_handle; 2960 ip1dbg(("ill_capability_dld_ack: interface %s " 2961 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 2962 2963 ill_capability_dld_enable(ill); 2964 } 2965 2966 /* 2967 * Typically capability negotiation between IP and the driver happens via 2968 * DLPI message exchange. However GLD also offers a direct function call 2969 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities. 2970 * But arbitrary function calls into IP or GLD are not permitted, since both 2971 * of them are protected by their own perimeter mechanism. The perimeter can 2972 * be viewed as a coarse lock or serialization mechanism. The hierarchy of 2973 * these perimeters is IP -> MAC. Thus, for example, to enable squeue 2974 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 2975 * to enter the mac perimeter and then do the direct function calls into 2976 * GLD to enable squeue polling. The ring related callbacks from the mac into 2977 * the stack to add, bind, quiesce, restart or cleanup a ring are all 2978 * protected by the mac perimeter. 2979 */ 2980 static void 2981 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 2982 { 2983 ill_dld_capab_t *idc = ill->ill_dld_capab; 2984 int err; 2985 2986 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 2987 DLD_ENABLE); 2988 ASSERT(err == 0); 2989 } 2990 2991 static void 2992 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 2993 { 2994 ill_dld_capab_t *idc = ill->ill_dld_capab; 2995 int err; 2996 2997 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 2998 DLD_DISABLE); 2999 ASSERT(err == 0); 3000 } 3001 3002 boolean_t 3003 ill_mac_perim_held(ill_t *ill) 3004 { 3005 ill_dld_capab_t *idc = ill->ill_dld_capab; 3006 3007 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 3008 DLD_QUERY)); 3009 } 3010 3011 static void 3012 ill_capability_direct_enable(ill_t *ill) 3013 { 3014 ill_dld_capab_t *idc = ill->ill_dld_capab; 3015 ill_dld_direct_t *idd = &idc->idc_direct; 3016 dld_capab_direct_t direct; 3017 int rc; 3018 3019 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3020 3021 bzero(&direct, sizeof (direct)); 3022 direct.di_rx_cf = (uintptr_t)ip_input; 3023 direct.di_rx_ch = ill; 3024 3025 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 3026 DLD_ENABLE); 3027 if (rc == 0) { 3028 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 3029 idd->idd_tx_dh = direct.di_tx_dh; 3030 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 3031 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 3032 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 3033 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 3034 ASSERT(idd->idd_tx_cb_df != NULL); 3035 ASSERT(idd->idd_tx_fctl_df != NULL); 3036 ASSERT(idd->idd_tx_df != NULL); 3037 /* 3038 * One-time registration of the flow enable callback function 3039 */ 3040 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 3041 ill_flow_enable, ill); 3042 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 3043 
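/* Transmit may now bypass STREAMS and call idd_tx_df() directly. */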
DTRACE_PROBE1(direct_on, (ill_t *), ill); 3044 } else { 3045 cmn_err(CE_WARN, "warning: could not enable DIRECT " 3046 "capability, rc = %d\n", rc); 3047 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 3048 } 3049 } 3050 3051 static void 3052 ill_capability_poll_enable(ill_t *ill) 3053 { 3054 ill_dld_capab_t *idc = ill->ill_dld_capab; 3055 dld_capab_poll_t poll; 3056 int rc; 3057 3058 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3059 3060 bzero(&poll, sizeof (poll)); 3061 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 3062 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 3063 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 3064 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 3065 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 3066 poll.poll_ring_ch = ill; 3067 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 3068 DLD_ENABLE); 3069 if (rc == 0) { 3070 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 3071 DTRACE_PROBE1(poll_on, (ill_t *), ill); 3072 } else { 3073 ip1dbg(("warning: could not enable POLL " 3074 "capability, rc = %d\n", rc)); 3075 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 3076 } 3077 } 3078 3079 /* 3080 * Enable the LSO capability. 3081 */ 3082 static void 3083 ill_capability_lso_enable(ill_t *ill) 3084 { 3085 ill_dld_capab_t *idc = ill->ill_dld_capab; 3086 dld_capab_lso_t lso; 3087 int rc; 3088 3089 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3090 3091 if (ill->ill_lso_capab == NULL) { 3092 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 3093 KM_NOSLEEP); 3094 if (ill->ill_lso_capab == NULL) { 3095 cmn_err(CE_WARN, "ill_capability_lso_enable: " 3096 "could not enable LSO for %s (ENOMEM)\n", 3097 ill->ill_name); 3098 return; 3099 } 3100 } 3101 3102 bzero(&lso, sizeof (lso)); 3103 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 3104 DLD_ENABLE)) == 0) { 3105 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 3106 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 3107 ill->ill_capabilities |= ILL_CAPAB_DLD_LSO; 3108 ip1dbg(("ill_capability_lso_enable: interface %s " 3109 "has enabled LSO\n ", ill->ill_name)); 3110 } else { 3111 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 3112 ill->ill_lso_capab = NULL; 3113 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 3114 } 3115 } 3116 3117 static void 3118 ill_capability_dld_enable(ill_t *ill) 3119 { 3120 mac_perim_handle_t mph; 3121 3122 ASSERT(IAM_WRITER_ILL(ill)); 3123 3124 if (ill->ill_isv6) 3125 return; 3126 3127 ill_mac_perim_enter(ill, &mph); 3128 if (!ill->ill_isv6) { 3129 ill_capability_direct_enable(ill); 3130 ill_capability_poll_enable(ill); 3131 ill_capability_lso_enable(ill); 3132 } 3133 ill->ill_capabilities |= ILL_CAPAB_DLD; 3134 ill_mac_perim_exit(ill, mph); 3135 } 3136 3137 static void 3138 ill_capability_dld_disable(ill_t *ill) 3139 { 3140 ill_dld_capab_t *idc; 3141 ill_dld_direct_t *idd; 3142 mac_perim_handle_t mph; 3143 3144 ASSERT(IAM_WRITER_ILL(ill)); 3145 3146 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 3147 return; 3148 3149 ill_mac_perim_enter(ill, &mph); 3150 3151 idc = ill->ill_dld_capab; 3152 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 3153 /* 3154 * For performance we avoid locks in the transmit data path 3155 * and don't maintain a count of the number of threads using 3156 * direct calls. Thus some threads could be using direct 3157 * transmit calls to GLD, even after the capability mechanism 3158 * turns it off. 
This is still safe since the handles used in 3159 * the direct calls continue to be valid until the unplumb is 3160 * completed. Remove the callback that was added (1-time) at 3161 * capab enable time. 3162 */ 3163 mutex_enter(&ill->ill_lock); 3164 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 3165 mutex_exit(&ill->ill_lock); 3166 if (ill->ill_flownotify_mh != NULL) { 3167 idd = &idc->idc_direct; 3168 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 3169 ill->ill_flownotify_mh); 3170 ill->ill_flownotify_mh = NULL; 3171 } 3172 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 3173 NULL, DLD_DISABLE); 3174 } 3175 3176 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 3177 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 3178 ip_squeue_clean_all(ill); 3179 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 3180 NULL, DLD_DISABLE); 3181 } 3182 3183 if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) { 3184 ASSERT(ill->ill_lso_capab != NULL); 3185 /* 3186 * Clear the capability flag for LSO but retain the 3187 * ill_lso_capab structure since it's possible that another 3188 * thread is still referring to it. The structure only gets 3189 * deallocated when we destroy the ill. 3190 */ 3191 3192 ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO; 3193 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 3194 NULL, DLD_DISABLE); 3195 } 3196 3197 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 3198 ill_mac_perim_exit(ill, mph); 3199 } 3200 3201 /* 3202 * Capability Negotiation protocol 3203 * 3204 * We don't wait for DLPI capability operations to finish during interface 3205 * bringup or teardown. Doing so would introduce more asynchrony and the 3206 * interface up/down operations will need multiple return and restarts. 3207 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as 3208 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 3209 * exclusive operation won't start until the DLPI operations of the previous 3210 * exclusive operation complete. 3211 * 3212 * The capability state machine is shown below. 3213 * 3214 * state next state event, action 3215 * 3216 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 3217 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 3218 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 3219 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 3220 * IDCS_OK IDCS_RESET_SENT ill_capability_reset 3221 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 3222 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 3223 * ill_capability_probe. 3224 */ 3225 3226 /* 3227 * Dedicated thread started from ip_stack_init that handles capability 3228 * disable. This thread ensures the taskq dispatch does not fail by waiting 3229 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 3230 * that direct calls to DLD are done in a cv_waitable context. 
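 * ill_capability_ack() is the producer side: when its TQ_NOSLEEP dispatch fails, it queues the mblk on ips_capab_taskq_head/tail and signals ips_capab_taskq_cv so that this thread can re-dispatch it with TQ_SLEEP.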
3231 */ 3232 void 3233 ill_taskq_dispatch(ip_stack_t *ipst) 3234 { 3235 callb_cpr_t cprinfo; 3236 char name[64]; 3237 mblk_t *mp; 3238 3239 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 3240 ipst->ips_netstack->netstack_stackid); 3241 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 3242 name); 3243 mutex_enter(&ipst->ips_capab_taskq_lock); 3244 3245 for (;;) { 3246 mp = ipst->ips_capab_taskq_head; 3247 while (mp != NULL) { 3248 ipst->ips_capab_taskq_head = mp->b_next; 3249 if (ipst->ips_capab_taskq_head == NULL) 3250 ipst->ips_capab_taskq_tail = NULL; 3251 mutex_exit(&ipst->ips_capab_taskq_lock); 3252 mp->b_next = NULL; 3253 3254 VERIFY(taskq_dispatch(system_taskq, 3255 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 3256 mutex_enter(&ipst->ips_capab_taskq_lock); 3257 mp = ipst->ips_capab_taskq_head; 3258 } 3259 3260 if (ipst->ips_capab_taskq_quit) 3261 break; 3262 CALLB_CPR_SAFE_BEGIN(&cprinfo); 3263 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 3264 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 3265 } 3266 VERIFY(ipst->ips_capab_taskq_head == NULL); 3267 VERIFY(ipst->ips_capab_taskq_tail == NULL); 3268 CALLB_CPR_EXIT(&cprinfo); 3269 thread_exit(); 3270 } 3271 3272 /* 3273 * Consume a new-style hardware capabilities negotiation ack. 3274 * Called via taskq on receipt of DL_CAPABILITY_ACK. 3275 */ 3276 static void 3277 ill_capability_ack_thr(void *arg) 3278 { 3279 mblk_t *mp = arg; 3280 dl_capability_ack_t *capp; 3281 dl_capability_sub_t *subp, *endp; 3282 ill_t *ill; 3283 boolean_t reneg; 3284 3285 ill = (ill_t *)mp->b_prev; 3286 mp->b_prev = NULL; 3287 3288 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 3289 3290 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 3291 ill->ill_dlpi_capab_state == IDCS_RENEG) { 3292 /* 3293 * We have received the ack for our DL_CAPAB reset request. 3294 * There isn't anything in the message that needs processing. 3295 * All message based capabilities have been disabled; now 3296 * do the function call based capability disable. 3297 */ 3298 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 3299 ill_capability_dld_disable(ill); 3300 ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 3301 if (reneg) 3302 ill_capability_probe(ill); 3303 goto done; 3304 } 3305 3306 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 3307 ill->ill_dlpi_capab_state = IDCS_OK; 3308 3309 capp = (dl_capability_ack_t *)mp->b_rptr; 3310 3311 if (capp->dl_sub_length == 0) { 3312 /* no new-style capabilities */ 3313 goto done; 3314 } 3315 3316 /* make sure the driver supplied a correct dl_sub_length */ 3317 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3318 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3319 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3320 goto done; 3321 } 3322 3323 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3324 /* 3325 * There are sub-capabilities. Process the ones we know about. 3326 * Loop until we don't have room for another sub-cap header.
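 * endp marks the last offset at which a complete dl_capability_sub_t header still fits; SC() advances past each header plus its dl_length of payload.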
3327 */ 3328 for (subp = SC(capp, capp->dl_sub_offset), 3329 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3330 subp <= endp; 3331 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3332 3333 switch (subp->dl_cap) { 3334 case DL_CAPAB_ID_WRAPPER: 3335 ill_capability_id_ack(ill, mp, subp); 3336 break; 3337 default: 3338 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3339 break; 3340 } 3341 } 3342 #undef SC 3343 done: 3344 inet_freemsg(mp); 3345 ill_capability_done(ill); 3346 ipsq_exit(ill->ill_phyint->phyint_ipsq); 3347 } 3348 3349 /* 3350 * This needs to be started in a taskq thread to provide a cv_waitable 3351 * context. 3352 */ 3353 void 3354 ill_capability_ack(ill_t *ill, mblk_t *mp) 3355 { 3356 ip_stack_t *ipst = ill->ill_ipst; 3357 3358 mp->b_prev = (mblk_t *)ill; 3359 ASSERT(mp->b_next == NULL); 3360 3361 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 3362 TQ_NOSLEEP) != 0) 3363 return; 3364 3365 /* 3366 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread 3367 * which will do the dispatch using TQ_SLEEP to guarantee success. 3368 */ 3369 mutex_enter(&ipst->ips_capab_taskq_lock); 3370 if (ipst->ips_capab_taskq_head == NULL) { 3371 ASSERT(ipst->ips_capab_taskq_tail == NULL); 3372 ipst->ips_capab_taskq_head = mp; 3373 } else { 3374 ipst->ips_capab_taskq_tail->b_next = mp; 3375 } 3376 ipst->ips_capab_taskq_tail = mp; 3377 3378 cv_signal(&ipst->ips_capab_taskq_cv); 3379 mutex_exit(&ipst->ips_capab_taskq_lock); 3380 } 3381 3382 /* 3383 * This routine is called to scan the fragmentation reassembly table for 3384 * the specified ILL for any packets that are starting to smell. 3385 * dead_interval is the maximum time in seconds that will be tolerated. It 3386 * will either be the value specified in ip_g_frag_timeout, or zero if the 3387 * ILL is shutting down and it is time to blow everything off. 3388 * 3389 * It returns the number of seconds (as a time_t) that the next frag timer 3390 * should be scheduled for, 0 meaning that the timer doesn't need to be 3391 * re-started. Note that the method of calculating next_timeout isn't 3392 * entirely accurate since time will flow between the time we grab 3393 * current_time and the time we schedule the next timeout. This isn't a 3394 * big problem since this is the timer for sending ICMP reassembly time 3395 * exceeded messages, and it doesn't have to be exactly accurate. 3396 * 3397 * This function is 3398 * sometimes called as writer, although this is not required. 3399 */ 3400 time_t 3401 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3402 { 3403 ipfb_t *ipfb; 3404 ipfb_t *endp; 3405 ipf_t *ipf; 3406 ipf_t *ipfnext; 3407 mblk_t *mp; 3408 time_t current_time = gethrestime_sec(); 3409 time_t next_timeout = 0; 3410 uint32_t hdr_length; 3411 mblk_t *send_icmp_head; 3412 mblk_t *send_icmp_head_v6; 3413 zoneid_t zoneid; 3414 ip_stack_t *ipst = ill->ill_ipst; 3415 3416 ipfb = ill->ill_frag_hash_tbl; 3417 if (ipfb == NULL) 3418 return (0); 3419 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3420 /* Walk the frag hash table. */ 3421 for (; ipfb < endp; ipfb++) { 3422 send_icmp_head = NULL; 3423 send_icmp_head_v6 = NULL; 3424 mutex_enter(&ipfb->ipfb_lock); 3425 while ((ipf = ipfb->ipfb_ipf) != NULL) { 3426 time_t frag_time = current_time - ipf->ipf_timestamp; 3427 time_t frag_timeout; 3428 3429 if (frag_time < dead_interval) { 3430 /* 3431 * There are some outstanding fragments 3432 * that will time out later.
Make note of 3433 * the time so that we can reschedule the 3434 * next timeout appropriately. 3435 */ 3436 frag_timeout = dead_interval - frag_time; 3437 if (next_timeout == 0 || 3438 frag_timeout < next_timeout) { 3439 next_timeout = frag_timeout; 3440 } 3441 break; 3442 } 3443 /* Time's up. Get it out of here. */ 3444 hdr_length = ipf->ipf_nf_hdr_len; 3445 ipfnext = ipf->ipf_hash_next; 3446 if (ipfnext) 3447 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3448 *ipf->ipf_ptphn = ipfnext; 3449 mp = ipf->ipf_mp->b_cont; 3450 for (; mp; mp = mp->b_cont) { 3451 /* Extra points for neatness. */ 3452 IP_REASS_SET_START(mp, 0); 3453 IP_REASS_SET_END(mp, 0); 3454 } 3455 mp = ipf->ipf_mp->b_cont; 3456 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 3457 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3458 ipfb->ipfb_count -= ipf->ipf_count; 3459 ASSERT(ipfb->ipfb_frag_pkts > 0); 3460 ipfb->ipfb_frag_pkts--; 3461 /* 3462 * We do not send any icmp message from here because 3463 * we currently are holding the ipfb_lock for this 3464 * hash chain. If we try and send any icmp messages 3465 * from here we may end up via a put back into ip 3466 * trying to get the same lock, causing a recursive 3467 * mutex panic. Instead we build a list and send all 3468 * the icmp messages after we have dropped the lock. 3469 */ 3470 if (ill->ill_isv6) { 3471 if (hdr_length != 0) { 3472 mp->b_next = send_icmp_head_v6; 3473 send_icmp_head_v6 = mp; 3474 } else { 3475 freemsg(mp); 3476 } 3477 } else { 3478 if (hdr_length != 0) { 3479 mp->b_next = send_icmp_head; 3480 send_icmp_head = mp; 3481 } else { 3482 freemsg(mp); 3483 } 3484 } 3485 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3486 freeb(ipf->ipf_mp); 3487 } 3488 mutex_exit(&ipfb->ipfb_lock); 3489 /* 3490 * Now need to send any icmp messages that we delayed from 3491 * above. 3492 */ 3493 while (send_icmp_head_v6 != NULL) { 3494 ip6_t *ip6h; 3495 3496 mp = send_icmp_head_v6; 3497 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3498 mp->b_next = NULL; 3499 if (mp->b_datap->db_type == M_CTL) 3500 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3501 else 3502 ip6h = (ip6_t *)mp->b_rptr; 3503 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3504 ill, ipst); 3505 if (zoneid == ALL_ZONES) { 3506 freemsg(mp); 3507 } else { 3508 icmp_time_exceeded_v6(ill->ill_wq, mp, 3509 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3510 B_FALSE, zoneid, ipst); 3511 } 3512 } 3513 while (send_icmp_head != NULL) { 3514 ipaddr_t dst; 3515 3516 mp = send_icmp_head; 3517 send_icmp_head = send_icmp_head->b_next; 3518 mp->b_next = NULL; 3519 3520 if (mp->b_datap->db_type == M_CTL) 3521 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3522 else 3523 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3524 3525 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 3526 if (zoneid == ALL_ZONES) { 3527 freemsg(mp); 3528 } else { 3529 icmp_time_exceeded(ill->ill_wq, mp, 3530 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, 3531 ipst); 3532 } 3533 } 3534 } 3535 /* 3536 * A non-dying ILL will use the return value to decide whether to 3537 * restart the frag timer, and for how long. 3538 */ 3539 return (next_timeout); 3540 } 3541 3542 /* 3543 * This routine is called when the approximate count of mblk memory used 3544 * for the specified ILL has exceeded max_count. 
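 * Pruning is two-pronged: when calls arrive within ip_min_frag_prune_time of each other, an escalating number of the oldest packets is freed from every hash bucket; then, while ill_frag_count still exceeds max_count, the single oldest packet across all buckets is evicted.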
3545 */ 3546 void 3547 ill_frag_prune(ill_t *ill, uint_t max_count) 3548 { 3549 ipfb_t *ipfb; 3550 ipf_t *ipf; 3551 size_t count; 3552 3553 /* 3554 * If we are here again within ip_min_frag_prune_time msecs, increment 3555 * ill_frag_free_num_pkts so that more of the oldest packets are freed 3556 * from each bucket below; otherwise reset it. 3557 */ 3558 mutex_enter(&ill->ill_lock); 3559 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3560 (ip_min_frag_prune_time != 0 ? 3561 ip_min_frag_prune_time : msec_per_tick)) { 3562 3563 ill->ill_frag_free_num_pkts++; 3564 3565 } else { 3566 ill->ill_frag_free_num_pkts = 0; 3567 } 3568 ill->ill_last_frag_clean_time = lbolt; 3569 mutex_exit(&ill->ill_lock); 3570 3571 /* 3572 * Free the ill_frag_free_num_pkts oldest packets from each bucket. 3573 */ 3574 if (ill->ill_frag_free_num_pkts != 0) { 3575 int ix; 3576 3577 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3578 ipfb = &ill->ill_frag_hash_tbl[ix]; 3579 mutex_enter(&ipfb->ipfb_lock); 3580 if (ipfb->ipfb_ipf != NULL) { 3581 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3582 ill->ill_frag_free_num_pkts); 3583 } 3584 mutex_exit(&ipfb->ipfb_lock); 3585 } 3586 } 3587 /* 3588 * While the reassembly list for this ILL is too big, prune a fragment 3589 * queue by age, oldest first. 3590 */ 3591 while (ill->ill_frag_count > max_count) { 3592 int ix; 3593 ipfb_t *oipfb = NULL; 3594 uint_t oldest = UINT_MAX; 3595 3596 count = 0; 3597 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3598 ipfb = &ill->ill_frag_hash_tbl[ix]; 3599 mutex_enter(&ipfb->ipfb_lock); 3600 ipf = ipfb->ipfb_ipf; 3601 if (ipf != NULL && ipf->ipf_gen < oldest) { 3602 oldest = ipf->ipf_gen; 3603 oipfb = ipfb; 3604 } 3605 count += ipfb->ipfb_count; 3606 mutex_exit(&ipfb->ipfb_lock); 3607 } 3608 if (oipfb == NULL) 3609 break; 3610 3611 if (count <= max_count) 3612 return; /* Somebody beat us to it, nothing to do */ 3613 mutex_enter(&oipfb->ipfb_lock); 3614 ipf = oipfb->ipfb_ipf; 3615 if (ipf != NULL) { 3616 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3617 } 3618 mutex_exit(&oipfb->ipfb_lock); 3619 } 3620 } 3621 3622 /* 3623 * Free 'free_cnt' fragmented packets starting at ipf. 3624 */ 3625 void 3626 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3627 { 3628 size_t count; 3629 mblk_t *mp; 3630 mblk_t *tmp; 3631 ipf_t **ipfp = ipf->ipf_ptphn; 3632 3633 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3634 ASSERT(ipfp != NULL); 3635 ASSERT(ipf != NULL); 3636 3637 while (ipf != NULL && free_cnt-- > 0) { 3638 count = ipf->ipf_count; 3639 mp = ipf->ipf_mp; 3640 ipf = ipf->ipf_hash_next; 3641 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3642 IP_REASS_SET_START(tmp, 0); 3643 IP_REASS_SET_END(tmp, 0); 3644 } 3645 atomic_add_32(&ill->ill_frag_count, -count); 3646 ASSERT(ipfb->ipfb_count >= count); 3647 ipfb->ipfb_count -= count; 3648 ASSERT(ipfb->ipfb_frag_pkts > 0); 3649 ipfb->ipfb_frag_pkts--; 3650 freemsg(mp); 3651 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3652 } 3653 3654 if (ipf) 3655 ipf->ipf_ptphn = ipfp; 3656 ipfp[0] = ipf; 3657 } 3658 3659 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3660 "obsolete and may be removed in a future release of Solaris. Use " \ 3661 "ifconfig(1M) to manipulate the forwarding status of an interface." 3662 3663 /* 3664 * For obsolete per-interface forwarding configuration; 3665 * called in response to ND_GET.
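 *
 * For illustration only (the interface name "hme0" here is just an
 * example), the obsolete ndd access looked roughly like:
 *
 *	ndd /dev/ip hme0:ip_forwarding		(get)
 *	ndd -set /dev/ip hme0:ip_forwarding 1	(set)
 *
 * whereas the preferred replacement is the ifconfig(1M) router flag,
 * e.g. "ifconfig hme0 router".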
3666 */ 3667 /* ARGSUSED */ 3668 static int 3669 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3670 { 3671 ill_t *ill = (ill_t *)cp; 3672 3673 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3674 3675 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3676 return (0); 3677 } 3678 3679 /* 3680 * For obsolete per-interface forwarding configuration; 3681 * called in response to ND_SET. 3682 */ 3683 /* ARGSUSED */ 3684 static int 3685 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3686 cred_t *ioc_cr) 3687 { 3688 long value; 3689 int retval; 3690 ip_stack_t *ipst = CONNQ_TO_IPST(q); 3691 3692 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3693 3694 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3695 value < 0 || value > 1) { 3696 return (EINVAL); 3697 } 3698 3699 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3700 retval = ill_forward_set((ill_t *)cp, (value != 0)); 3701 rw_exit(&ipst->ips_ill_g_lock); 3702 return (retval); 3703 } 3704 3705 /* 3706 * Helper function for ill_forward_set(). 3707 */ 3708 static void 3709 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 3710 { 3711 ip_stack_t *ipst = ill->ill_ipst; 3712 3713 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3714 3715 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3716 (enable ? "Enabling" : "Disabling"), 3717 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3718 mutex_enter(&ill->ill_lock); 3719 if (enable) 3720 ill->ill_flags |= ILLF_ROUTER; 3721 else 3722 ill->ill_flags &= ~ILLF_ROUTER; 3723 mutex_exit(&ill->ill_lock); 3724 if (ill->ill_isv6) 3725 ill_set_nce_router_flags(ill, enable); 3726 /* Notify routing socket listeners of this change. */ 3727 if (ill->ill_ipif != NULL) 3728 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 3729 } 3730 3731 /* 3732 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 3733 * socket messages for each interface whose flags we change. 3734 */ 3735 int 3736 ill_forward_set(ill_t *ill, boolean_t enable) 3737 { 3738 ipmp_illgrp_t *illg; 3739 ip_stack_t *ipst = ill->ill_ipst; 3740 3741 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3742 3743 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3744 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 3745 return (0); 3746 3747 if (IS_LOOPBACK(ill)) 3748 return (EINVAL); 3749 3750 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 3751 /* 3752 * Update all of the interfaces in the group. 3753 */ 3754 illg = ill->ill_grp; 3755 ill = list_head(&illg->ig_if); 3756 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 3757 ill_forward_set_on_ill(ill, enable); 3758 3759 /* 3760 * Update the IPMP meta-interface. 3761 */ 3762 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 3763 return (0); 3764 } 3765 3766 ill_forward_set_on_ill(ill, enable); 3767 return (0); 3768 } 3769 3770 /* 3771 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3772 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3773 * set or clear. 3774 */ 3775 static void 3776 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3777 { 3778 ipif_t *ipif; 3779 nce_t *nce; 3780 3781 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3782 /* 3783 * NOTE: we match across the illgrp because nce's for 3784 * addresses on IPMP interfaces have an nce_ill that points to 3785 * the bound underlying ill. 
3786 */ 3787 nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr, 3788 B_FALSE); 3789 if (nce != NULL) { 3790 mutex_enter(&nce->nce_lock); 3791 if (enable) 3792 nce->nce_flags |= NCE_F_ISROUTER; 3793 else 3794 nce->nce_flags &= ~NCE_F_ISROUTER; 3795 mutex_exit(&nce->nce_lock); 3796 NCE_REFRELE(nce); 3797 } 3798 } 3799 } 3800 3801 /* 3802 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3803 * for this ill. Make sure the v6/v4 question has been answered about this 3804 * ill. The creation of this ndd variable is only for backwards compatibility. 3805 * The preferred way to control per-interface IP forwarding is through the 3806 * ILLF_ROUTER interface flag. 3807 */ 3808 static int 3809 ill_set_ndd_name(ill_t *ill) 3810 { 3811 char *suffix; 3812 ip_stack_t *ipst = ill->ill_ipst; 3813 3814 ASSERT(IAM_WRITER_ILL(ill)); 3815 3816 if (ill->ill_isv6) 3817 suffix = ipv6_forward_suffix; 3818 else 3819 suffix = ipv4_forward_suffix; 3820 3821 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 3822 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 3823 /* 3824 * Copies over the '\0'. 3825 * Note that strlen(suffix) is always bounded. 3826 */ 3827 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 3828 strlen(suffix) + 1); 3829 3830 /* 3831 * Use of the nd table requires holding the reader lock. 3832 * Modifying the nd table thru nd_load/nd_unload requires 3833 * the writer lock. 3834 */ 3835 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 3836 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 3837 nd_ill_forward_set, (caddr_t)ill)) { 3838 /* 3839 * If the nd_load failed, it only meant that it could not 3840 * allocate a new bunch of room for further NDD expansion. 3841 * Because of that, the ill_ndd_name will be set to NULL, and 3842 * this interface is at the mercy of the global ip_forwarding 3843 * variable. 3844 */ 3845 rw_exit(&ipst->ips_ip_g_nd_lock); 3846 ill->ill_ndd_name = NULL; 3847 return (ENOMEM); 3848 } 3849 rw_exit(&ipst->ips_ip_g_nd_lock); 3850 return (0); 3851 } 3852 3853 /* 3854 * Initializes the context structure and returns the first ill in the list. 3855 * Currently start_list and end_list can have the values: 3856 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 3857 * IP_V4_G_HEAD Traverse IPV4 list only. 3858 * IP_V6_G_HEAD Traverse IPV6 list only. 3859 */ 3860 3861 /* 3862 * We don't check for CONDEMNED ills here. Caller must do that if 3863 * necessary under the ill lock. 3864 */ 3865 ill_t * 3866 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 3867 ip_stack_t *ipst) 3868 { 3869 ill_if_t *ifp; 3870 ill_t *ill; 3871 avl_tree_t *avl_tree; 3872 3873 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3874 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3875 3876 /* 3877 * setup the lists to search 3878 */ 3879 if (end_list != MAX_G_HEADS) { 3880 ctx->ctx_current_list = start_list; 3881 ctx->ctx_last_list = end_list; 3882 } else { 3883 ctx->ctx_last_list = MAX_G_HEADS - 1; 3884 ctx->ctx_current_list = 0; 3885 } 3886 3887 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3888 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 3889 if (ifp != (ill_if_t *) 3890 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 3891 avl_tree = &ifp->illif_avl_by_ppa; 3892 ill = avl_first(avl_tree); 3893 /* 3894 * ill is guaranteed to be non-NULL, or ifp would not 3895 * have existed.
3896 */ 3897 ASSERT(ill != NULL); 3898 return (ill); 3899 } 3900 ctx->ctx_current_list++; 3901 } 3902 3903 return (NULL); 3904 } 3905 3906 /* 3907 * Returns the next ill in the list. ill_first() must have been called 3908 * before calling ill_next() or bad things will happen. 3909 */ 3910 3911 /* 3912 * We don't check for CONDEMNED ills here. Caller must do that if 3913 * necessary under the ill lock. 3914 */ 3915 ill_t * 3916 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 3917 { 3918 ill_if_t *ifp; 3919 ill_t *ill; 3920 ip_stack_t *ipst = lastill->ill_ipst; 3921 3922 ASSERT(lastill->ill_ifptr != (ill_if_t *) 3923 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 3924 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 3925 AVL_AFTER)) != NULL) { 3926 return (ill); 3927 } 3928 3929 /* go to the next ill_if_t in the list. */ 3930 ifp = lastill->ill_ifptr->illif_next; 3931 3932 /* make sure not at end of circular list */ 3933 while (ifp == 3934 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 3935 if (++ctx->ctx_current_list > ctx->ctx_last_list) 3936 return (NULL); 3937 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 3938 } 3939 3940 return (avl_first(&ifp->illif_avl_by_ppa)); 3941 } 3942 3943 /* 3944 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 3945 * The final number (PPA) must not have any leading zeros. Upon success, a 3946 * pointer to the start of the PPA is returned; otherwise NULL is returned. 3947 */ 3948 static char * 3949 ill_get_ppa_ptr(char *name) 3950 { 3951 int namelen = strlen(name); 3952 int end_ndx = namelen - 1; 3953 int ppa_ndx, i; 3954 3955 /* 3956 * Check that the first character is [a-zA-Z], and that the last 3957 * character is [0-9]. 3958 */ 3959 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 3960 return (NULL); 3961 3962 /* 3963 * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 3964 */ 3965 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 3966 if (!isdigit(name[ppa_ndx - 1])) 3967 break; 3968 3969 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 3970 return (NULL); 3971 3972 /* 3973 * Check that the intermediate characters are [a-zA-Z0-9._] 3974 */ 3975 for (i = 1; i < ppa_ndx; i++) { 3976 if (!isalpha(name[i]) && !isdigit(name[i]) && 3977 name[i] != '.' && name[i] != '_') { 3978 return (NULL); 3979 } 3980 } 3981 3982 return (name + ppa_ndx); 3983 } 3984 3985 /* 3986 * Use the avl tree to locate the ill. 3987 */ 3988 static ill_t * 3989 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 3990 ipsq_func_t func, int *error, ip_stack_t *ipst) 3991 { 3992 char *ppa_ptr = NULL; 3993 int len; 3994 uint_t ppa; 3995 ill_t *ill = NULL; 3996 ill_if_t *ifp; 3997 int list; 3998 ipsq_t *ipsq; 3999 4000 if (error != NULL) 4001 *error = 0; 4002 4003 /* 4004 * get ppa ptr 4005 */ 4006 if (isv6) 4007 list = IP_V6_G_HEAD; 4008 else 4009 list = IP_V4_G_HEAD; 4010 4011 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4012 if (error != NULL) 4013 *error = ENXIO; 4014 return (NULL); 4015 } 4016 4017 len = ppa_ptr - name + 1; 4018 4019 ppa = stoi(&ppa_ptr); 4020 4021 ifp = IP_VX_ILL_G_LIST(list, ipst); 4022 4023 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4024 /* 4025 * The match is done on len - 1 because the interface-name part 4026 * of `name' is not null-terminated: it contains the ppa in 4027 * addition to the interface name.
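 *
 * For example, for a lookup of the (hypothetical) name "bge2":
 * ppa_ptr points at the '2', so len is 4 and ppa is 2; the
 * ill_if_t for "bge" has illif_name_len == 4 (including the
 * terminating NUL), and the bcmp() compares the three bytes "bge".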
4028 */ 4029 if ((ifp->illif_name_len == len) && 4030 bcmp(ifp->illif_name, name, len - 1) == 0) { 4031 break; 4032 } else { 4033 ifp = ifp->illif_next; 4034 } 4035 } 4036 4037 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4038 /* 4039 * The interface type does not even exist. 4040 */ 4041 if (error != NULL) 4042 *error = ENXIO; 4043 return (NULL); 4044 } 4045 4046 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4047 if (ill != NULL) { 4048 /* 4049 * The block comment at the start of ipif_down 4050 * explains the use of the macros used below 4051 */ 4052 GRAB_CONN_LOCK(q); 4053 mutex_enter(&ill->ill_lock); 4054 if (ILL_CAN_LOOKUP(ill)) { 4055 ill_refhold_locked(ill); 4056 mutex_exit(&ill->ill_lock); 4057 RELEASE_CONN_LOCK(q); 4058 return (ill); 4059 } else if (ILL_CAN_WAIT(ill, q)) { 4060 ipsq = ill->ill_phyint->phyint_ipsq; 4061 mutex_enter(&ipsq->ipsq_lock); 4062 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 4063 mutex_exit(&ill->ill_lock); 4064 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4065 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 4066 mutex_exit(&ipsq->ipsq_lock); 4067 RELEASE_CONN_LOCK(q); 4068 if (error != NULL) 4069 *error = EINPROGRESS; 4070 return (NULL); 4071 } 4072 mutex_exit(&ill->ill_lock); 4073 RELEASE_CONN_LOCK(q); 4074 } 4075 if (error != NULL) 4076 *error = ENXIO; 4077 return (NULL); 4078 } 4079 4080 /* 4081 * Comparison function for use with avl. 4082 */ 4083 static int 4084 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4085 { 4086 uint_t ppa; 4087 uint_t ill_ppa; 4088 4089 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4090 4091 ppa = *((uint_t *)ppa_ptr); 4092 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4093 /* 4094 * We want the ill with the lowest ppa to be on the 4095 * top. 4096 */ 4097 if (ill_ppa < ppa) 4098 return (1); 4099 if (ill_ppa > ppa) 4100 return (-1); 4101 return (0); 4102 } 4103 4104 /* 4105 * Remove an interface type from the global list. 4106 */ 4107 static void 4108 ill_delete_interface_type(ill_if_t *interface) 4109 { 4110 ASSERT(interface != NULL); 4111 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4112 4113 avl_destroy(&interface->illif_avl_by_ppa); 4114 if (interface->illif_ppa_arena != NULL) 4115 vmem_destroy(interface->illif_ppa_arena); 4116 4117 remque(interface); 4118 4119 mi_free(interface); 4120 } 4121 4122 /* 4123 * Remove an ill from the global list. 4124 */ 4125 static void 4126 ill_glist_delete(ill_t *ill) 4127 { 4128 ip_stack_t *ipst; 4129 phyint_t *phyi; 4130 4131 if (ill == NULL) 4132 return; 4133 ipst = ill->ill_ipst; 4134 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4135 4136 /* 4137 * If the ill was never inserted into the AVL tree 4138 * we skip the if branch. 4139 */ 4140 if (ill->ill_ifptr != NULL) { 4141 /* 4142 * Remove from the AVL tree and free the ppa number. 4143 */ 4144 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4145 4146 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4147 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4148 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4149 } 4150 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4151 ill_delete_interface_type(ill->ill_ifptr); 4152 } 4153 4154 /* 4155 * Indicate ill is no longer in the list. 4156 */ 4157 ill->ill_ifptr = NULL; 4158 ill->ill_name_length = 0; 4159 ill->ill_name[0] = '\0'; 4160 ill->ill_ppa = UINT_MAX; 4161 } 4162 4163 /* Generate one last event for this ill.
*/ 4164 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, 4165 ill->ill_name_length); 4166 4167 ASSERT(ill->ill_phyint != NULL); 4168 phyi = ill->ill_phyint; 4169 ill->ill_phyint = NULL; 4170 4171 /* 4172 * ill_init always allocates a phyint to store the copy 4173 * of flags relevant to the phyint. At that point in time, we could 4174 * not assign the name and hence phyint_illv4/v6 could not be 4175 * initialized. Later in ipif_set_values, we assign the name to 4176 * the ill, at which point in time we assign phyint_illv4/v6. 4177 * Thus we don't rely on phyint_illv6 to be initialized always. 4178 */ 4179 if (ill->ill_flags & ILLF_IPV6) 4180 phyi->phyint_illv6 = NULL; 4181 else 4182 phyi->phyint_illv4 = NULL; 4183 4184 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { 4185 rw_exit(&ipst->ips_ill_g_lock); 4186 return; 4187 } 4188 4189 /* 4190 * There are no ills left on this phyint; pull it out of the phyint 4191 * avl trees, and free it. 4192 */ 4193 if (phyi->phyint_ifindex > 0) { 4194 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4195 phyi); 4196 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4197 phyi); 4198 } 4199 rw_exit(&ipst->ips_ill_g_lock); 4200 4201 phyint_free(phyi); 4202 } 4203 4204 /* 4205 * Allocate a ppa. If the number of plumbed interfaces of this type is 4206 * less than ill_no_arena, do a linear search to find an unused ppa. 4207 * When the number goes beyond ill_no_arena, switch to using an arena. 4208 * Note: a ppa value of zero cannot be allocated from the vmem arena, as 4209 * it is the return value for an error condition; so ppa + 1 is allocated 4210 * and the result is decremented by one. 4211 */ 4212 static int 4213 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4214 { 4215 ill_t *tmp_ill; 4216 uint_t start, end; 4217 int ppa; 4218 4219 if (ifp->illif_ppa_arena == NULL && 4220 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4221 /* 4222 * Create an arena. 4223 */ 4224 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4225 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4226 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4227 /* allocate what has already been assigned */ 4228 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4229 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4230 tmp_ill, AVL_AFTER)) { 4231 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4232 1, /* size */ 4233 1, /* align/quantum */ 4234 0, /* phase */ 4235 0, /* nocross */ 4236 /* minaddr */ 4237 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 4238 /* maxaddr */ 4239 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 4240 VM_NOSLEEP|VM_FIRSTFIT); 4241 if (ppa == 0) { 4242 ip1dbg(("ill_alloc_ppa: ppa allocation" 4243 " failed while switching")); 4244 vmem_destroy(ifp->illif_ppa_arena); 4245 ifp->illif_ppa_arena = NULL; 4246 break; 4247 } 4248 } 4249 } 4250 4251 if (ifp->illif_ppa_arena != NULL) { 4252 if (ill->ill_ppa == UINT_MAX) { 4253 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4254 1, VM_NOSLEEP|VM_FIRSTFIT); 4255 if (ppa == 0) 4256 return (EAGAIN); 4257 ill->ill_ppa = --ppa; 4258 } else { 4259 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4260 1, /* size */ 4261 1, /* align/quantum */ 4262 0, /* phase */ 4263 0, /* nocross */ 4264 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4265 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4266 VM_NOSLEEP|VM_FIRSTFIT); 4267 /* 4268 * Most likely the allocation failed because 4269 * the requested ppa was in use.
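 *
 * (The off-by-one is because vmem returns 0 for failure: to
 * reserve, say, ppa 5 we vmem_xalloc() the single address 6
 * from the range [6, 7), so a return of 0 unambiguously means
 * the ppa is already taken.)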
4270 */ 4271 if (ppa == 0) 4272 return (EEXIST); 4273 } 4274 return (0); 4275 } 4276 4277 /* 4278 * No arena is in use and not enough (>ill_no_arena) interfaces have 4279 * been plumbed to create one. Do a linear search to get an unused ppa. 4280 */ 4281 if (ill->ill_ppa == UINT_MAX) { 4282 end = UINT_MAX - 1; 4283 start = 0; 4284 } else { 4285 end = start = ill->ill_ppa; 4286 } 4287 4288 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4289 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4290 if (start++ >= end) { 4291 if (ill->ill_ppa == UINT_MAX) 4292 return (EAGAIN); 4293 else 4294 return (EEXIST); 4295 } 4296 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4297 } 4298 ill->ill_ppa = start; 4299 return (0); 4300 } 4301 4302 /* 4303 * Insert the ill into the list of configured ills. Once this function 4304 * completes, the ill is globally visible and is available through lookups. 4305 * More precisely this happens after the caller drops the ill_g_lock. 4306 */ 4307 static int 4308 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4309 { 4310 ill_if_t *ill_interface; 4311 avl_index_t where = 0; 4312 int error; 4313 int name_length; 4314 int index; 4315 boolean_t check_length = B_FALSE; 4316 ip_stack_t *ipst = ill->ill_ipst; 4317 4318 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 4319 4320 name_length = mi_strlen(name) + 1; 4321 4322 if (isv6) 4323 index = IP_V6_G_HEAD; 4324 else 4325 index = IP_V4_G_HEAD; 4326 4327 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 4328 /* 4329 * Search for the interface type based on name 4330 */ 4331 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4332 if ((ill_interface->illif_name_len == name_length) && 4333 (strcmp(ill_interface->illif_name, name) == 0) { 4334 break; 4335 } 4336 ill_interface = ill_interface->illif_next; 4337 } 4338 4339 /* 4340 * Interface type not found, create one. 4341 */ 4342 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4343 ill_g_head_t ghead; 4344 4345 /* 4346 * Allocate the ill_if_t structure 4347 */ 4348 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4349 if (ill_interface == NULL) { 4350 return (ENOMEM); 4351 } 4352 4353 (void) strcpy(ill_interface->illif_name, name); 4354 ill_interface->illif_name_len = name_length; 4355 4356 avl_create(&ill_interface->illif_avl_by_ppa, 4357 ill_compare_ppa, sizeof (ill_t), 4358 offsetof(struct ill_s, ill_avl_byppa)); 4359 4360 /* 4361 * Link the structure in at the back to maintain the order 4362 * of configuration for ifconfig output. 4363 */ 4364 ghead = ipst->ips_ill_g_heads[index]; 4365 insque(ill_interface, ghead.ill_g_list_tail); 4366 } 4367 4368 if (ill->ill_ppa == UINT_MAX) 4369 check_length = B_TRUE; 4370 4371 error = ill_alloc_ppa(ill_interface, ill); 4372 if (error != 0) { 4373 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4374 ill_delete_interface_type(ill->ill_ifptr); 4375 return (error); 4376 } 4377 4378 /* 4379 * When the ppa is chosen by the system, check that there is 4380 * enough space to insert the ppa. If a specific ppa was passed in, 4381 * this check is not required, as the interface name passed in will 4382 * have the right ppa in it. 4383 */ 4384 if (check_length) { 4385 /* 4386 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4387 */ 4388 char buf[sizeof (uint_t) * 3]; 4389 4390 /* 4391 * Convert the ppa to a string to calculate the amount of space 4392 * required for it in the name. 4393 */ 4394 numtos(ill->ill_ppa, buf); 4395 4396 /* Do we have enough space to insert the ppa?
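 * For example, a (hypothetical) driver name "foo" with a system-chosen
 * ppa of 17 must fit as "foo17" plus the terminating NUL within
 * LIFNAMSIZ bytes.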
*/ 4397 4398 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4399 /* Free ppa and interface type struct */ 4400 if (ill_interface->illif_ppa_arena != NULL) { 4401 vmem_free(ill_interface->illif_ppa_arena, 4402 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4403 } 4404 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4405 ill_delete_interface_type(ill->ill_ifptr); 4406 4407 return (EINVAL); 4408 } 4409 } 4410 4411 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4412 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4413 4414 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4415 &where); 4416 ill->ill_ifptr = ill_interface; 4417 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4418 4419 ill_phyint_reinit(ill); 4420 return (0); 4421 } 4422 4423 /* Initialize the per phyint ipsq used for serialization */ 4424 static boolean_t 4425 ipsq_init(ill_t *ill, boolean_t enter) 4426 { 4427 ipsq_t *ipsq; 4428 ipxop_t *ipx; 4429 4430 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 4431 return (B_FALSE); 4432 4433 ill->ill_phyint->phyint_ipsq = ipsq; 4434 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 4435 ipx->ipx_ipsq = ipsq; 4436 ipsq->ipsq_next = ipsq; 4437 ipsq->ipsq_phyint = ill->ill_phyint; 4438 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4439 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 4440 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 4441 if (enter) { 4442 ipx->ipx_writer = curthread; 4443 ipx->ipx_forced = B_FALSE; 4444 ipx->ipx_reentry_cnt = 1; 4445 #ifdef DEBUG 4446 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 4447 #endif 4448 } 4449 return (B_TRUE); 4450 } 4451 4452 /* 4453 * ill_init is called by ip_open when a device control stream is opened. 4454 * It does a few initializations, and shoots a DL_INFO_REQ message down 4455 * to the driver. The response is later picked up in ip_rput_dlpi and 4456 * used to set up default mechanisms for talking to the driver. (Always 4457 * called as writer.) 4458 * 4459 * If this function returns error, ip_open will call ip_close which in 4460 * turn will call ill_delete to clean up any memory allocated here that 4461 * is not yet freed. 4462 */ 4463 int 4464 ill_init(queue_t *q, ill_t *ill) 4465 { 4466 int count; 4467 dl_info_req_t *dlir; 4468 mblk_t *info_mp; 4469 uchar_t *frag_ptr; 4470 4471 /* 4472 * The ill is initialized to zero by mi_alloc*(). In addition 4473 * some fields already contain valid values, initialized in 4474 * ip_open(), before we reach here. 4475 */ 4476 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4477 4478 ill->ill_rq = q; 4479 ill->ill_wq = WR(q); 4480 4481 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4482 BPRI_HI); 4483 if (info_mp == NULL) 4484 return (ENOMEM); 4485 4486 /* 4487 * Allocate sufficient space to contain our fragment hash table and 4488 * the device name. 
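 * The single allocation is laid out roughly as follows (a sketch);
 * ill_name points just past the hash table, and ill_set_ndd_name()
 * later places the ndd forwarding-variable name right after it:
 *
 *	[ ILL_FRAG_HASH_TBL_SIZE bytes | ill_name ... | ndd name ... ]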
4489 */ 4490 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4491 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4492 if (frag_ptr == NULL) { 4493 freemsg(info_mp); 4494 return (ENOMEM); 4495 } 4496 ill->ill_frag_ptr = frag_ptr; 4497 ill->ill_frag_free_num_pkts = 0; 4498 ill->ill_last_frag_clean_time = 0; 4499 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4500 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4501 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4502 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4503 NULL, MUTEX_DEFAULT, NULL); 4504 } 4505 4506 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4507 if (ill->ill_phyint == NULL) { 4508 freemsg(info_mp); 4509 mi_free(frag_ptr); 4510 return (ENOMEM); 4511 } 4512 4513 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4514 /* 4515 * For now pretend this is a v4 ill. We need to set phyint_ill* 4516 * at this point because of the following reason. If we can't 4517 * enter the ipsq at some point and cv_wait, the writer that 4518 * wakes us up tries to locate us using the list of all phyints 4519 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4520 * If we don't set it now, we risk a missed wakeup. 4521 */ 4522 ill->ill_phyint->phyint_illv4 = ill; 4523 ill->ill_ppa = UINT_MAX; 4524 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4525 4526 if (!ipsq_init(ill, B_TRUE)) { 4527 freemsg(info_mp); 4528 mi_free(frag_ptr); 4529 mi_free(ill->ill_phyint); 4530 return (ENOMEM); 4531 } 4532 4533 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4534 4535 /* Frag queue limit stuff */ 4536 ill->ill_frag_count = 0; 4537 ill->ill_ipf_gen = 0; 4538 4539 ill->ill_global_timer = INFINITY; 4540 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4541 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4542 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4543 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4544 4545 /* 4546 * Initialize IPv6 configuration variables. The IP module is always 4547 * opened as an IPv4 module. Instead tracking down the cases where 4548 * it switches to do ipv6, we'll just initialize the IPv6 configuration 4549 * here for convenience, this has no effect until the ill is set to do 4550 * IPv6. 4551 */ 4552 ill->ill_reachable_time = ND_REACHABLE_TIME; 4553 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4554 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4555 ill->ill_max_buf = ND_MAX_Q; 4556 ill->ill_refcnt = 0; 4557 4558 /* Send down the Info Request to the driver. */ 4559 info_mp->b_datap->db_type = M_PCPROTO; 4560 dlir = (dl_info_req_t *)info_mp->b_rptr; 4561 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4562 dlir->dl_primitive = DL_INFO_REQ; 4563 4564 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4565 4566 qprocson(q); 4567 ill_dlpi_send(ill, info_mp); 4568 4569 return (0); 4570 } 4571 4572 /* 4573 * ill_dls_info 4574 * creates datalink socket info from the device. 
4575 */ 4576 int 4577 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4578 { 4579 size_t len; 4580 ill_t *ill = ipif->ipif_ill; 4581 4582 sdl->sdl_family = AF_LINK; 4583 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4584 sdl->sdl_type = ill->ill_type; 4585 ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4586 len = strlen(sdl->sdl_data); 4587 ASSERT(len < 256); 4588 sdl->sdl_nlen = (uchar_t)len; 4589 sdl->sdl_alen = ill->ill_phys_addr_length; 4590 sdl->sdl_slen = 0; 4591 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 4592 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 4593 4594 return (sizeof (struct sockaddr_dl)); 4595 } 4596 4597 /* 4598 * ill_xarp_info 4599 * creates xarp info from the device. 4600 */ 4601 static int 4602 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4603 { 4604 sdl->sdl_family = AF_LINK; 4605 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4606 sdl->sdl_type = ill->ill_type; 4607 ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4608 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4609 sdl->sdl_alen = ill->ill_phys_addr_length; 4610 sdl->sdl_slen = 0; 4611 return (sdl->sdl_nlen); 4612 } 4613 4614 static int 4615 loopback_kstat_update(kstat_t *ksp, int rw) 4616 { 4617 kstat_named_t *kn; 4618 netstackid_t stackid; 4619 netstack_t *ns; 4620 ip_stack_t *ipst; 4621 4622 if (ksp == NULL || ksp->ks_data == NULL) 4623 return (EIO); 4624 4625 if (rw == KSTAT_WRITE) 4626 return (EACCES); 4627 4628 kn = KSTAT_NAMED_PTR(ksp); 4629 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 4630 4631 ns = netstack_find_by_stackid(stackid); 4632 if (ns == NULL) 4633 return (-1); 4634 4635 ipst = ns->netstack_ip; 4636 if (ipst == NULL) { 4637 netstack_rele(ns); 4638 return (-1); 4639 } 4640 kn[0].value.ui32 = ipst->ips_loopback_packets; 4641 kn[1].value.ui32 = ipst->ips_loopback_packets; 4642 netstack_rele(ns); 4643 return (0); 4644 } 4645 4646 /* 4647 * Has ifindex been plumbed already? 4648 */ 4649 boolean_t 4650 phyint_exists(uint_t index, ip_stack_t *ipst) 4651 { 4652 ASSERT(index != 0); 4653 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4654 4655 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4656 &index, NULL) != NULL); 4657 } 4658 4659 /* Pick a unique ifindex */ 4660 boolean_t 4661 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 4662 { 4663 uint_t starting_index; 4664 4665 if (!ipst->ips_ill_index_wrap) { 4666 *indexp = ipst->ips_ill_index++; 4667 if (ipst->ips_ill_index == 0) { 4668 /* Reached the uint_t limit Next time wrap */ 4669 ipst->ips_ill_index_wrap = B_TRUE; 4670 } 4671 return (B_TRUE); 4672 } 4673 4674 /* 4675 * Start reusing unused indexes. Note that we hold the ill_g_lock 4676 * at this point and don't want to call any function that attempts 4677 * to get the lock again. 4678 */ 4679 starting_index = ipst->ips_ill_index++; 4680 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 4681 if (ipst->ips_ill_index != 0 && 4682 !phyint_exists(ipst->ips_ill_index, ipst)) { 4683 /* found unused index - use it */ 4684 *indexp = ipst->ips_ill_index; 4685 return (B_TRUE); 4686 } 4687 } 4688 4689 /* 4690 * all interface indicies are inuse. 4691 */ 4692 return (B_FALSE); 4693 } 4694 4695 /* 4696 * Assign a unique interface index for the phyint. 
4697 */ 4698 static boolean_t 4699 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 4700 { 4701 ASSERT(phyi->phyint_ifindex == 0); 4702 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 4703 } 4704 4705 /* 4706 * Initialize the flags on `phyi' as per the provided mactype. 4707 */ 4708 static void 4709 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 4710 { 4711 uint64_t flags = 0; 4712 4713 /* 4714 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 4715 * we always presume the underlying hardware is working and set 4716 * PHYI_RUNNING (if it's not, the driver will subsequently send a 4717 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 4718 * there are no active interfaces in the group so we set PHYI_FAILED. 4719 */ 4720 if (mactype == SUNW_DL_IPMP) 4721 flags |= PHYI_FAILED; 4722 else 4723 flags |= PHYI_RUNNING; 4724 4725 switch (mactype) { 4726 case SUNW_DL_VNI: 4727 flags |= PHYI_VIRTUAL; 4728 break; 4729 case SUNW_DL_IPMP: 4730 flags |= PHYI_IPMP; 4731 break; 4732 case DL_LOOP: 4733 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 4734 break; 4735 } 4736 4737 mutex_enter(&phyi->phyint_lock); 4738 phyi->phyint_flags |= flags; 4739 mutex_exit(&phyi->phyint_lock); 4740 } 4741 4742 /* 4743 * Return a pointer to the ill which matches the supplied name. Note that 4744 * the ill name length includes the null termination character. (May be 4745 * called as writer.) 4746 * If do_alloc and the interface is "lo0" it will be automatically created. 4747 * Cannot bump up reference on condemned ills. So dup detect can't be done 4748 * using this func. 4749 */ 4750 ill_t * 4751 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4752 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, 4753 ip_stack_t *ipst) 4754 { 4755 ill_t *ill; 4756 ipif_t *ipif; 4757 ipsq_t *ipsq; 4758 kstat_named_t *kn; 4759 boolean_t isloopback; 4760 in6_addr_t ov6addr; 4761 4762 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4763 4764 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4765 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4766 rw_exit(&ipst->ips_ill_g_lock); 4767 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4768 return (ill); 4769 4770 /* 4771 * Couldn't find it. Does this happen to be a lookup for the 4772 * loopback device and are we allowed to allocate it? 4773 */ 4774 if (!isloopback || !do_alloc) 4775 return (NULL); 4776 4777 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4778 4779 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4780 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4781 rw_exit(&ipst->ips_ill_g_lock); 4782 return (ill); 4783 } 4784 4785 /* Create the loopback device on demand */ 4786 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4787 sizeof (ipif_loopback_name), BPRI_MED)); 4788 if (ill == NULL) 4789 goto done; 4790 4791 *ill = ill_null; 4792 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4793 ill->ill_ipst = ipst; 4794 netstack_hold(ipst->ips_netstack); 4795 /* 4796 * For exclusive stacks we set the zoneid to zero 4797 * to make IP operate as if in the global zone. 
4798 */ 4799 ill->ill_zoneid = GLOBAL_ZONEID; 4800 4801 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4802 if (ill->ill_phyint == NULL) 4803 goto done; 4804 4805 if (isv6) 4806 ill->ill_phyint->phyint_illv6 = ill; 4807 else 4808 ill->ill_phyint->phyint_illv4 = ill; 4809 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4810 phyint_flags_init(ill->ill_phyint, DL_LOOP); 4811 4812 ill->ill_max_frag = IP_LOOPBACK_MTU; 4813 /* Add room for tcp+ip headers */ 4814 if (isv6) { 4815 ill->ill_isv6 = B_TRUE; 4816 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4817 } else { 4818 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4819 } 4820 if (!ill_allocate_mibs(ill)) 4821 goto done; 4822 ill->ill_max_mtu = ill->ill_max_frag; 4823 /* 4824 * ipif_loopback_name can't be pointed at directly because its used 4825 * by both the ipv4 and ipv6 interfaces. When the ill is removed 4826 * from the glist, ill_glist_delete() sets the first character of 4827 * ill_name to '\0'. 4828 */ 4829 ill->ill_name = (char *)ill + sizeof (*ill); 4830 (void) strcpy(ill->ill_name, ipif_loopback_name); 4831 ill->ill_name_length = sizeof (ipif_loopback_name); 4832 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ 4833 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4834 4835 ill->ill_global_timer = INFINITY; 4836 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4837 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4838 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4839 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4840 4841 /* No resolver here. */ 4842 ill->ill_net_type = IRE_LOOPBACK; 4843 4844 /* Initialize the ipsq */ 4845 if (!ipsq_init(ill, B_FALSE)) 4846 goto done; 4847 4848 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE); 4849 if (ipif == NULL) 4850 goto done; 4851 4852 ill->ill_flags = ILLF_MULTICAST; 4853 4854 ov6addr = ipif->ipif_v6lcl_addr; 4855 /* Set up default loopback address and mask. */ 4856 if (!isv6) { 4857 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4858 4859 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4860 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4861 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4862 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4863 ipif->ipif_v6subnet); 4864 ill->ill_flags |= ILLF_IPV4; 4865 } else { 4866 ipif->ipif_v6lcl_addr = ipv6_loopback; 4867 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4868 ipif->ipif_v6net_mask = ipv6_all_ones; 4869 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4870 ipif->ipif_v6subnet); 4871 ill->ill_flags |= ILLF_IPV6; 4872 } 4873 4874 /* 4875 * Chain us in at the end of the ill list. hold the ill 4876 * before we make it globally visible. 1 for the lookup. 4877 */ 4878 ill->ill_refcnt = 0; 4879 ill_refhold(ill); 4880 4881 ill->ill_frag_count = 0; 4882 ill->ill_frag_free_num_pkts = 0; 4883 ill->ill_last_frag_clean_time = 0; 4884 4885 ipsq = ill->ill_phyint->phyint_ipsq; 4886 4887 if (ill_glist_insert(ill, "lo", isv6) != 0) 4888 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4889 4890 /* Let SCTP know so that it can add this to its list */ 4891 sctp_update_ill(ill, SCTP_ILL_INSERT); 4892 4893 /* 4894 * We have already assigned ipif_v6lcl_addr above, but we need to 4895 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 4896 * requires to be after ill_glist_insert() since we need the 4897 * ill_index set. Pass on ipv6_loopback as the old address. 
4898 */ 4899 sctp_update_ipif_addr(ipif, ov6addr); 4900 4901 /* 4902 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 4903 * If so, free our original one. 4904 */ 4905 if (ipsq != ill->ill_phyint->phyint_ipsq) 4906 ipsq_delete(ipsq); 4907 4908 if (ipst->ips_loopback_ksp == NULL) { 4909 /* Export loopback interface statistics */ 4910 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 4911 ipif_loopback_name, "net", 4912 KSTAT_TYPE_NAMED, 2, 0, 4913 ipst->ips_netstack->netstack_stackid); 4914 if (ipst->ips_loopback_ksp != NULL) { 4915 ipst->ips_loopback_ksp->ks_update = 4916 loopback_kstat_update; 4917 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 4918 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4919 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4920 ipst->ips_loopback_ksp->ks_private = 4921 (void *)(uintptr_t)ipst->ips_netstack-> 4922 netstack_stackid; 4923 kstat_install(ipst->ips_loopback_ksp); 4924 } 4925 } 4926 4927 if (error != NULL) 4928 *error = 0; 4929 *did_alloc = B_TRUE; 4930 rw_exit(&ipst->ips_ill_g_lock); 4931 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 4932 NE_PLUMB, ill->ill_name, ill->ill_name_length); 4933 return (ill); 4934 done: 4935 if (ill != NULL) { 4936 if (ill->ill_phyint != NULL) { 4937 ipsq = ill->ill_phyint->phyint_ipsq; 4938 if (ipsq != NULL) { 4939 ipsq->ipsq_phyint = NULL; 4940 ipsq_delete(ipsq); 4941 } 4942 mi_free(ill->ill_phyint); 4943 } 4944 ill_free_mib(ill); 4945 if (ill->ill_ipst != NULL) 4946 netstack_rele(ill->ill_ipst->ips_netstack); 4947 mi_free(ill); 4948 } 4949 rw_exit(&ipst->ips_ill_g_lock); 4950 if (error != NULL) 4951 *error = ENOMEM; 4952 return (NULL); 4953 } 4954 4955 /* 4956 * For IPP calls - use the ip_stack_t for global stack. 4957 */ 4958 ill_t * 4959 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, 4960 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 4961 { 4962 ip_stack_t *ipst; 4963 ill_t *ill; 4964 4965 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 4966 if (ipst == NULL) { 4967 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 4968 return (NULL); 4969 } 4970 4971 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 4972 netstack_rele(ipst->ips_netstack); 4973 return (ill); 4974 } 4975 4976 /* 4977 * Return a pointer to the ill which matches the index and IP version type. 4978 */ 4979 ill_t * 4980 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 4981 ipsq_func_t func, int *err, ip_stack_t *ipst) 4982 { 4983 ill_t *ill; 4984 ipsq_t *ipsq; 4985 phyint_t *phyi; 4986 4987 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 4988 (q != NULL && mp != NULL && func != NULL && err != NULL)); 4989 4990 if (err != NULL) 4991 *err = 0; 4992 4993 /* 4994 * Indexes are stored in the phyint - a common structure 4995 * to both IPv4 and IPv6. 4996 */ 4997 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4998 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4999 (void *) &index, NULL); 5000 if (phyi != NULL) { 5001 ill = isv6 ? 
phyi->phyint_illv6 : phyi->phyint_illv4; 5002 if (ill != NULL) { 5003 /* 5004 * The block comment at the start of ipif_down 5005 * explains the use of the macros used below 5006 */ 5007 GRAB_CONN_LOCK(q); 5008 mutex_enter(&ill->ill_lock); 5009 if (ILL_CAN_LOOKUP(ill)) { 5010 ill_refhold_locked(ill); 5011 mutex_exit(&ill->ill_lock); 5012 RELEASE_CONN_LOCK(q); 5013 rw_exit(&ipst->ips_ill_g_lock); 5014 return (ill); 5015 } else if (ILL_CAN_WAIT(ill, q)) { 5016 ipsq = ill->ill_phyint->phyint_ipsq; 5017 mutex_enter(&ipsq->ipsq_lock); 5018 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5019 rw_exit(&ipst->ips_ill_g_lock); 5020 mutex_exit(&ill->ill_lock); 5021 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5022 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5023 mutex_exit(&ipsq->ipsq_lock); 5024 RELEASE_CONN_LOCK(q); 5025 if (err != NULL) 5026 *err = EINPROGRESS; 5027 return (NULL); 5028 } 5029 RELEASE_CONN_LOCK(q); 5030 mutex_exit(&ill->ill_lock); 5031 } 5032 } 5033 rw_exit(&ipst->ips_ill_g_lock); 5034 if (err != NULL) 5035 *err = ENXIO; 5036 return (NULL); 5037 } 5038 5039 /* 5040 * Return the ifindex next in sequence after the passed in ifindex. 5041 * If there is no next ifindex for the given protocol, return 0. 5042 */ 5043 uint_t 5044 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 5045 { 5046 phyint_t *phyi; 5047 phyint_t *phyi_initial; 5048 uint_t ifindex; 5049 5050 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5051 5052 if (index == 0) { 5053 phyi = avl_first( 5054 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 5055 } else { 5056 phyi = phyi_initial = avl_find( 5057 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5058 (void *) &index, NULL); 5059 } 5060 5061 for (; phyi != NULL; 5062 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5063 phyi, AVL_AFTER)) { 5064 /* 5065 * If we're not returning the first interface in the tree 5066 * and we still haven't moved past the phyint_t that 5067 * corresponds to index, avl_walk needs to be called again 5068 */ 5069 if (!((index != 0) && (phyi == phyi_initial))) { 5070 if (isv6) { 5071 if ((phyi->phyint_illv6) && 5072 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5073 (phyi->phyint_illv6->ill_isv6 == 1)) 5074 break; 5075 } else { 5076 if ((phyi->phyint_illv4) && 5077 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5078 (phyi->phyint_illv4->ill_isv6 == 0)) 5079 break; 5080 } 5081 } 5082 } 5083 5084 rw_exit(&ipst->ips_ill_g_lock); 5085 5086 if (phyi != NULL) 5087 ifindex = phyi->phyint_ifindex; 5088 else 5089 ifindex = 0; 5090 5091 return (ifindex); 5092 } 5093 5094 /* 5095 * Return the ifindex for the named interface. 5096 * If there is no such interface, return 0. 5097 */ 5098 uint_t 5099 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 5100 { 5101 phyint_t *phyi; 5102 avl_index_t where = 0; 5103 uint_t ifindex; 5104 5105 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5106 5107 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 5108 name, &where)) == NULL) { 5109 rw_exit(&ipst->ips_ill_g_lock); 5110 return (0); 5111 } 5112 5113 ifindex = phyi->phyint_ifindex; 5114 5115 rw_exit(&ipst->ips_ill_g_lock); 5116 5117 return (ifindex); 5118 } 5119 5120 /* 5121 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5122 * that gives a running thread a reference to the ill. This reference must be 5123 * released by the thread when it is done accessing the ill and related 5124 * objects. ill_refcnt cannot be used to account for static references 5125 * such as other structures pointing to an ill.
Callers must generally 5126 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5127 * or be sure that the ill is not being deleted or changing state before 5128 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5129 * ill won't change any of its critical state such as address, netmask etc. 5130 */ 5131 void 5132 ill_refhold(ill_t *ill) 5133 { 5134 mutex_enter(&ill->ill_lock); 5135 ill->ill_refcnt++; 5136 ILL_TRACE_REF(ill); 5137 mutex_exit(&ill->ill_lock); 5138 } 5139 5140 void 5141 ill_refhold_locked(ill_t *ill) 5142 { 5143 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5144 ill->ill_refcnt++; 5145 ILL_TRACE_REF(ill); 5146 } 5147 5148 int 5149 ill_check_and_refhold(ill_t *ill) 5150 { 5151 mutex_enter(&ill->ill_lock); 5152 if (ILL_CAN_LOOKUP(ill)) { 5153 ill_refhold_locked(ill); 5154 mutex_exit(&ill->ill_lock); 5155 return (0); 5156 } 5157 mutex_exit(&ill->ill_lock); 5158 return (ILL_LOOKUP_FAILED); 5159 } 5160 5161 /* 5162 * Must not be called while holding any locks. Otherwise if this is 5163 * the last reference to be released, there is a chance of recursive mutex 5164 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5165 * to restart an ioctl. 5166 */ 5167 void 5168 ill_refrele(ill_t *ill) 5169 { 5170 mutex_enter(&ill->ill_lock); 5171 ASSERT(ill->ill_refcnt != 0); 5172 ill->ill_refcnt--; 5173 ILL_UNTRACE_REF(ill); 5174 if (ill->ill_refcnt != 0) { 5175 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5176 mutex_exit(&ill->ill_lock); 5177 return; 5178 } 5179 5180 /* Drops the ill_lock */ 5181 ipif_ill_refrele_tail(ill); 5182 } 5183 5184 /* 5185 * Obtain a weak reference count on the ill. This reference ensures the 5186 * ill won't be freed, but the ill may change any of its critical state 5187 * such as netmask, address etc. Returns B_FALSE if the ill has started 5188 * closing. 5189 */ 5190 boolean_t 5191 ill_waiter_inc(ill_t *ill) 5192 { 5193 mutex_enter(&ill->ill_lock); 5194 if (ill->ill_state_flags & ILL_CONDEMNED) { 5195 mutex_exit(&ill->ill_lock); 5196 return (B_FALSE); 5197 } 5198 ill->ill_waiters++; 5199 mutex_exit(&ill->ill_lock); 5200 return (B_TRUE); 5201 } 5202 5203 void 5204 ill_waiter_dcr(ill_t *ill) 5205 { 5206 mutex_enter(&ill->ill_lock); 5207 ill->ill_waiters--; 5208 if (ill->ill_waiters == 0) 5209 cv_broadcast(&ill->ill_cv); 5210 mutex_exit(&ill->ill_lock); 5211 } 5212 5213 /* 5214 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5215 * driver. We construct best-guess defaults for lower level information that 5216 * we need. If an interface is brought up without injection of any overriding 5217 * information from outside, we have to be ready to go with these defaults. 5218 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5219 * we primarily want the dl_provider_style. 5220 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5221 * at which point we assume the other part of the information is valid. 5222 */ 5223 void 5224 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5225 { 5226 uchar_t *brdcst_addr; 5227 uint_t brdcst_addr_length, phys_addr_length; 5228 t_scalar_t sap_length; 5229 dl_info_ack_t *dlia; 5230 ip_m_t *ipm; 5231 dl_qos_cl_sel1_t *sel1; 5232 int min_mtu; 5233 5234 ASSERT(IAM_WRITER_ILL(ill)); 5235 5236 /* 5237 * Till the ill is fully up ILL_CHANGING will be set and 5238 * the ill is not globally visible. So no need for a lock.
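 *
 * A rough sketch of how we get here (see the function comment above):
 *
 *	ip_open() ............. ill_init() sends the first DL_INFO_REQ
 *	1st DL_INFO_ACK ....... this function; dl_provider_style is the
 *				main thing consumed, first ipif allocated
 *	SIOCSLIFNAME .......... ipif_set_values() does DL_ATTACH/DL_BIND
 *				and sends another DL_INFO_REQ
 *	2nd DL_INFO_ACK ....... this function again; SAP, MTU and the
 *				address lengths are now assumed valid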
5239 */ 5240 dlia = (dl_info_ack_t *)mp->b_rptr; 5241 ill->ill_mactype = dlia->dl_mac_type; 5242 5243 ipm = ip_m_lookup(dlia->dl_mac_type); 5244 if (ipm == NULL) { 5245 ipm = ip_m_lookup(DL_OTHER); 5246 ASSERT(ipm != NULL); 5247 } 5248 ill->ill_media = ipm; 5249 5250 /* 5251 * When the new DLPI stuff is ready we'll pull lengths 5252 * from dlia. 5253 */ 5254 if (dlia->dl_version == DL_VERSION_2) { 5255 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5256 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5257 brdcst_addr_length); 5258 if (brdcst_addr == NULL) { 5259 brdcst_addr_length = 0; 5260 } 5261 sap_length = dlia->dl_sap_length; 5262 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5263 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5264 brdcst_addr_length, sap_length, phys_addr_length)); 5265 } else { 5266 brdcst_addr_length = 6; 5267 brdcst_addr = ip_six_byte_all_ones; 5268 sap_length = -2; 5269 phys_addr_length = brdcst_addr_length; 5270 } 5271 5272 ill->ill_bcast_addr_length = brdcst_addr_length; 5273 ill->ill_phys_addr_length = phys_addr_length; 5274 ill->ill_sap_length = sap_length; 5275 5276 /* 5277 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 5278 * but we must ensure a minimum IP MTU is used since other bits of 5279 * IP will fly apart otherwise. 5280 */ 5281 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 5282 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 5283 ill->ill_max_mtu = ill->ill_max_frag; 5284 5285 ill->ill_type = ipm->ip_m_type; 5286 5287 if (!ill->ill_dlpi_style_set) { 5288 if (dlia->dl_provider_style == DL_STYLE2) 5289 ill->ill_needs_attach = 1; 5290 5291 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 5292 5293 /* 5294 * Allocate the first ipif on this ill. We don't delay it 5295 * further as ioctl handling assumes at least one ipif exists. 5296 * 5297 * At this point we don't know whether the ill is v4 or v6. 5298 * We will know this when the SIOCSLIFNAME happens and 5299 * the correct value for ill_isv6 will be assigned in 5300 * ipif_set_values(). We need to hold the ill lock and 5301 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5302 * the wakeup. 5303 */ 5304 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5305 dlia->dl_provider_style != DL_STYLE2, B_TRUE); 5306 mutex_enter(&ill->ill_lock); 5307 ASSERT(ill->ill_dlpi_style_set == 0); 5308 ill->ill_dlpi_style_set = 1; 5309 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5310 cv_broadcast(&ill->ill_cv); 5311 mutex_exit(&ill->ill_lock); 5312 freemsg(mp); 5313 return; 5314 } 5315 ASSERT(ill->ill_ipif != NULL); 5316 /* 5317 * We know whether it is IPv4 or IPv6 now, as this is the 5318 * second DL_INFO_ACK we are receiving in response to the 5319 * DL_INFO_REQ sent in ipif_set_values. 5320 */ 5321 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 5322 /* 5323 * Set ipif_mtu which is used to set the IRE's 5324 * ire_max_frag value. The driver could have sent 5325 * a different mtu from what it sent last time. No 5326 * need to call ipif_mtu_change because IREs have 5327 * not yet been created. 5328 */ 5329 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5330 /* 5331 * Clear all the flags that were set based on ill_bcast_addr_length 5332 * and ill_phys_addr_length (in ipif_set_values) as these could have 5333 * changed now and we need to re-evaluate.
5334 */ 5335 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5336 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5337 5338 /* 5339 * Free ill_resolver_mp and ill_bcast_mp as things could have 5340 * changed now. 5341 * 5342 * NOTE: The IPMP meta-interface is special-cased because it starts 5343 * with no underlying interfaces (and thus an unknown broadcast 5344 * address length), but we enforce that an interface is broadcast- 5345 * capable as part of allowing it to join a group. 5346 */ 5347 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 5348 if (ill->ill_resolver_mp != NULL) 5349 freemsg(ill->ill_resolver_mp); 5350 if (ill->ill_bcast_mp != NULL) 5351 freemsg(ill->ill_bcast_mp); 5352 if (ill->ill_flags & ILLF_XRESOLV) 5353 ill->ill_net_type = IRE_IF_RESOLVER; 5354 else 5355 ill->ill_net_type = IRE_IF_NORESOLVER; 5356 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5357 ill->ill_phys_addr_length, 5358 ill->ill_sap, 5359 ill->ill_sap_length); 5360 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5361 5362 if (ill->ill_isv6) 5363 /* 5364 * Note: xresolv interfaces will eventually need NOARP 5365 * set here as well, but that will require those 5366 * external resolvers to have some knowledge of 5367 * that flag and act appropriately. Not to be changed 5368 * at present. 5369 */ 5370 ill->ill_flags |= ILLF_NONUD; 5371 else 5372 ill->ill_flags |= ILLF_NOARP; 5373 5374 if (ill->ill_mactype == SUNW_DL_VNI) { 5375 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5376 } else if (ill->ill_phys_addr_length == 0 || 5377 ill->ill_mactype == DL_IPV4 || 5378 ill->ill_mactype == DL_IPV6) { 5379 /* 5380 * The underlying link is point-to-point, so mark the 5381 * interface as such. We can do IP multicast over 5382 * such a link since it transmits all network-layer 5383 * packets to the remote side the same way. 5384 */ 5385 ill->ill_flags |= ILLF_MULTICAST; 5386 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5387 } 5388 } else { 5389 ill->ill_net_type = IRE_IF_RESOLVER; 5390 if (ill->ill_bcast_mp != NULL) 5391 freemsg(ill->ill_bcast_mp); 5392 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5393 ill->ill_bcast_addr_length, ill->ill_sap, 5394 ill->ill_sap_length); 5395 /* 5396 * Later detect lack of DLPI driver multicast 5397 * capability by catching DL_ENABMULTI errors in 5398 * ip_rput_dlpi. 5399 */ 5400 ill->ill_flags |= ILLF_MULTICAST; 5401 if (!ill->ill_isv6) 5402 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5403 } 5404 5405 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 5406 if (ill->ill_mactype == SUNW_DL_IPMP) 5407 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 5408 5409 /* By default an interface does not support any CoS marking */ 5410 ill->ill_flags &= ~ILLF_COS_ENABLED; 5411 5412 /* 5413 * If we get QoS information in the DL_INFO_ACK, the device supports 5414 * some form of CoS marking; set ILLF_COS_ENABLED. 5415 */ 5416 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5417 dlia->dl_qos_length); 5418 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5419 ill->ill_flags |= ILLF_COS_ENABLED; 5420 } 5421 5422 /* Clear any previous error indication. */ 5423 ill->ill_error = 0; 5424 freemsg(mp); 5425 } 5426 5427 /* 5428 * Perform various checks to verify that an address would make sense as a 5429 * local, remote, or subnet interface address.
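 *
 * For example, with no explicit netmask (so the classful guess is used),
 * 10.0.0.0 and 10.255.255.255 are rejected as the all-zeroes and all-ones
 * host addresses of 10/8; 255.255.255.255 and any class D (multicast)
 * address are always rejected.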
5430 */ 5431 static boolean_t 5432 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5433 { 5434 ipaddr_t net_mask; 5435 5436 /* 5437 * Don't allow all zeroes, or all ones, but allow 5438 * all ones netmask. 5439 */ 5440 if ((net_mask = ip_net_mask(addr)) == 0) 5441 return (B_FALSE); 5442 /* A given netmask overrides the "guess" netmask */ 5443 if (subnet_mask != 0) 5444 net_mask = subnet_mask; 5445 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5446 (addr == (addr | ~net_mask)))) { 5447 return (B_FALSE); 5448 } 5449 5450 /* 5451 * Even if the netmask is all ones, we do not allow address to be 5452 * 255.255.255.255 5453 */ 5454 if (addr == INADDR_BROADCAST) 5455 return (B_FALSE); 5456 5457 if (CLASSD(addr)) 5458 return (B_FALSE); 5459 5460 return (B_TRUE); 5461 } 5462 5463 #define V6_IPIF_LINKLOCAL(p) \ 5464 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 5465 5466 /* 5467 * Compare two given ipifs and check if the second one is better than 5468 * the first one using the order of preference (not taking deprecated 5469 * into account) specified in ipif_lookup_multicast(). 5470 */ 5471 static boolean_t 5472 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 5473 { 5474 /* Check the least preferred first. */ 5475 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 5476 /* If both ipifs are loopback, use the first one. */ 5477 if (IS_LOOPBACK(new_ipif->ipif_ill)) 5478 return (B_FALSE); 5479 else 5480 return (B_TRUE); 5481 } 5482 5483 /* For IPv6, check for a link-local address. */ 5484 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 5485 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5486 V6_IPIF_LINKLOCAL(new_ipif)) { 5487 /* The second one is equal or less preferred. */ 5488 return (B_FALSE); 5489 } else { 5490 return (B_TRUE); 5491 } 5492 } 5493 5494 /* Then check for a point-to-point interface. */ 5495 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 5496 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5497 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 5498 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 5499 return (B_FALSE); 5500 } else { 5501 return (B_TRUE); 5502 } 5503 } 5504 5505 /* old_ipif is a normal interface, so no need to use the new one. */ 5506 return (B_FALSE); 5507 } 5508 5509 /* 5510 * Find a multicast-capable ipif given an IP instance and zoneid. 5511 * The ipif must be up, and its ill must be multicast-capable, not 5512 * condemned, not an underlying interface in an IPMP group, and 5513 * not a VNI interface. Order of preference: 5514 * 5515 * 1a. normal 5516 * 1b. normal, but deprecated 5517 * 2a. point to point 5518 * 2b. point to point, but deprecated 5519 * 3a. link local 5520 * 3b. link local, but deprecated 5521 * 4. loopback.
5509 /* 5510 * Find a multicast-capable ipif given an IP instance and zoneid. 5511 * The ipif must be up, and its ill must be multicast-capable, not 5512 * condemned, not an underlying interface in an IPMP group, and 5513 * not a VNI interface. Order of preference: 5514 * 5515 * 1a. normal 5516 * 1b. normal, but deprecated 5517 * 2a. point to point 5518 * 2b. point to point, but deprecated 5519 * 3a. link local 5520 * 3b. link local, but deprecated 5521 * 4. loopback. 5522 */ 5523 ipif_t * 5524 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 5525 { 5526 ill_t *ill; 5527 ill_walk_context_t ctx; 5528 ipif_t *ipif; 5529 ipif_t *saved_ipif = NULL; 5530 ipif_t *dep_ipif = NULL; 5531 5532 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5533 if (isv6) 5534 ill = ILL_START_WALK_V6(&ctx, ipst); 5535 else 5536 ill = ILL_START_WALK_V4(&ctx, ipst); 5537 5538 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5539 mutex_enter(&ill->ill_lock); 5540 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) || 5541 !(ill->ill_flags & ILLF_MULTICAST)) { 5542 mutex_exit(&ill->ill_lock); 5543 continue; 5544 } 5545 for (ipif = ill->ill_ipif; ipif != NULL; 5546 ipif = ipif->ipif_next) { 5547 if (zoneid != ipif->ipif_zoneid && 5548 zoneid != ALL_ZONES && 5549 ipif->ipif_zoneid != ALL_ZONES) { 5550 continue; 5551 } 5552 if (!(ipif->ipif_flags & IPIF_UP) || 5553 !IPIF_CAN_LOOKUP(ipif)) { 5554 continue; 5555 } 5556 5557 /* 5558 * Found one candidate. If it is deprecated, 5559 * remember it in dep_ipif. If it is not deprecated, 5560 * remember it in saved_ipif. 5561 */ 5562 if (ipif->ipif_flags & IPIF_DEPRECATED) { 5563 if (dep_ipif == NULL) { 5564 dep_ipif = ipif; 5565 } else if (ipif_comp_multi(dep_ipif, ipif, 5566 isv6)) { 5567 /* 5568 * If the previous dep_ipif does not 5569 * belong to the same ill, we've done 5570 * an ipif_refhold() on it. So we need 5571 * to release it. 5572 */ 5573 if (dep_ipif->ipif_ill != ill) 5574 ipif_refrele(dep_ipif); 5575 dep_ipif = ipif; 5576 } 5577 continue; 5578 } 5579 if (saved_ipif == NULL) { 5580 saved_ipif = ipif; 5581 } else { 5582 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 5583 if (saved_ipif->ipif_ill != ill) 5584 ipif_refrele(saved_ipif); 5585 saved_ipif = ipif; 5586 } 5587 } 5588 } 5589 /* 5590 * Before going to the next ill, do an ipif_refhold() on the 5591 * saved ones. 5592 */ 5593 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 5594 ipif_refhold_locked(saved_ipif); 5595 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 5596 ipif_refhold_locked(dep_ipif); 5597 mutex_exit(&ill->ill_lock); 5598 } 5599 rw_exit(&ipst->ips_ill_g_lock); 5600 5601 /* 5602 * If we have only the saved_ipif, return it. But if we have both 5603 * saved_ipif and dep_ipif, check to see which one is better. 5604 */ 5605 if (saved_ipif != NULL) { 5606 if (dep_ipif != NULL) { 5607 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 5608 ipif_refrele(saved_ipif); 5609 return (dep_ipif); 5610 } else { 5611 ipif_refrele(dep_ipif); 5612 return (saved_ipif); 5613 } 5614 } 5615 return (saved_ipif); 5616 } else { 5617 return (dep_ipif); 5618 } 5619 } 5620
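Stripped of the locking and refhold bookkeeping, ipif_lookup_multicast() keeps two running candidates, the best non-deprecated and the best deprecated, and at the end returns the deprecated one only if it strictly outranks the other. A self-contained sketch of that selection (struct cand and pick_best() are invented; lower rank means more preferred, as in the list above):

#include <stdbool.h>
#include <stddef.h>

struct cand {
	int rank;		/* preference rank; lower is better */
	bool deprecated;
};

/* Model of the two-bucket selection; not the kernel interface. */
static const struct cand *
pick_best(const struct cand *c, size_t n)
{
	const struct cand *saved = NULL, *dep = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		const struct cand **slot = c[i].deprecated ? &dep : &saved;

		if (*slot == NULL || c[i].rank < (*slot)->rank)
			*slot = &c[i];
	}
	/* Deprecated wins only if it strictly outranks the other. */
	if (saved != NULL && dep != NULL)
		return (dep->rank < saved->rank ? dep : saved);
	return (saved != NULL ? saved : dep);
}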
5621 /* 5622 * This function is called when an application does not specify an interface 5623 * to be used for multicast traffic (joining a group/sending data). It 5624 * calls ire_lookup_multi() to look for an interface route for the 5625 * specified multicast group. Doing this allows the administrator to add 5626 * prefix routes for multicast to indicate which interface should be used for 5627 * multicast traffic in the above scenario. The route could be for all 5628 * multicast (224.0/4), for a single multicast group (a /32 route) or 5629 * anything in between. If there is no such multicast route, we just find 5630 * any multicast-capable interface and return it. The returned ipif 5631 * is refhold'ed. 5632 */ 5633 ipif_t * 5634 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) 5635 { 5636 ire_t *ire; 5637 ipif_t *ipif; 5638 5639 ire = ire_lookup_multi(group, zoneid, ipst); 5640 if (ire != NULL) { 5641 ipif = ire->ire_ipif; 5642 ipif_refhold(ipif); 5643 ire_refrele(ire); 5644 return (ipif); 5645 } 5646 5647 return (ipif_lookup_multicast(ipst, zoneid, B_FALSE)); 5648 } 5649 5650 /* 5651 * Look for an ipif with the specified interface address and destination. 5652 * The destination address is used only for matching point-to-point interfaces. 5653 */ 5654 ipif_t * 5655 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5656 ipsq_func_t func, int *error, ip_stack_t *ipst) 5657 { 5658 ipif_t *ipif; 5659 ill_t *ill; 5660 ill_walk_context_t ctx; 5661 ipsq_t *ipsq; 5662 5663 if (error != NULL) 5664 *error = 0; 5665 5666 /* 5667 * First match all the point-to-point interfaces 5668 * before looking at non-point-to-point interfaces. 5669 * This is done to avoid returning a non-point-to-point 5670 * ipif instead of an unnumbered point-to-point ipif. 5671 */ 5672 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5673 ill = ILL_START_WALK_V4(&ctx, ipst); 5674 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5675 GRAB_CONN_LOCK(q); 5676 mutex_enter(&ill->ill_lock); 5677 for (ipif = ill->ill_ipif; ipif != NULL; 5678 ipif = ipif->ipif_next) { 5679 /* Allow the ipif to be down */ 5680 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5681 (ipif->ipif_lcl_addr == if_addr) && 5682 (ipif->ipif_pp_dst_addr == dst)) { 5683 /* 5684 * The block comment at the start of ipif_down 5685 * explains the use of the macros used below 5686 */ 5687 if (IPIF_CAN_LOOKUP(ipif)) { 5688 ipif_refhold_locked(ipif); 5689 mutex_exit(&ill->ill_lock); 5690 RELEASE_CONN_LOCK(q); 5691 rw_exit(&ipst->ips_ill_g_lock); 5692 return (ipif); 5693 } else if (IPIF_CAN_WAIT(ipif, q)) { 5694 ipsq = ill->ill_phyint->phyint_ipsq; 5695 mutex_enter(&ipsq->ipsq_lock); 5696 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5697 mutex_exit(&ill->ill_lock); 5698 rw_exit(&ipst->ips_ill_g_lock); 5699 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5700 ill); 5701 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5702 mutex_exit(&ipsq->ipsq_lock); 5703 RELEASE_CONN_LOCK(q); 5704 if (error != NULL) 5705 *error = EINPROGRESS; 5706 return (NULL); 5707 } 5708 } 5709 } 5710 mutex_exit(&ill->ill_lock); 5711 RELEASE_CONN_LOCK(q); 5712 } 5713 rw_exit(&ipst->ips_ill_g_lock); 5714 5715 /* lookup the ipif based on interface address */ 5716 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error, 5717 ipst); 5718 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5719 return (ipif); 5720 } 5721
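ipif_lookup_interface() shows the restart pattern used by most of these lookups: if the ipif is usable it is returned held; if an exclusive operation is in flight but the caller may wait, the request is parked on the ipsq with NEW_OP and EINPROGRESS is returned so that 'func' can rerun the ioctl later. A loose user-space model of that choice follows; all types and the pending list are invented, and the real queueing is ipsq_enq() under the ipsq locks.

#include <errno.h>
#include <stdbool.h>

struct req;
typedef void (*restart_fn)(struct req *);

struct obj {
	bool busy;			/* exclusive operation in flight */
};

struct req {
	struct obj	*target;
	restart_fn	func;		/* restart entry point ('func') */
	struct req	*next;
};

static struct req *pending;		/* drained when the op completes */

/*
 * Model of the IPIF_CAN_LOOKUP/IPIF_CAN_WAIT split: use the object
 * now, or park the request and report EINPROGRESS to the caller.
 */
static int
lookup_or_wait(struct obj *o, struct req *r)
{
	if (!o->busy)
		return (0);		/* caller proceeds immediately */
	r->target = o;
	r->next = pending;
	pending = r;
	return (EINPROGRESS);		/* caller bails; func runs later */
}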
5722 /* 5723 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 5724 */ 5725 static ipif_t * 5726 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, 5727 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, 5728 ip_stack_t *ipst) 5729 { 5730 ipif_t *ipif; 5731 ill_t *ill; 5732 boolean_t ptp = B_FALSE; 5733 ipsq_t *ipsq; 5734 ill_walk_context_t ctx; 5735 5736 if (error != NULL) 5737 *error = 0; 5738 5739 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5740 /* 5741 * Repeat twice, first based on local addresses and 5742 * next time for pointopoint. 5743 */ 5744 repeat: 5745 ill = ILL_START_WALK_V4(&ctx, ipst); 5746 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5747 if (match_ill != NULL && ill != match_ill && 5748 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 5749 continue; 5750 } 5751 GRAB_CONN_LOCK(q); 5752 mutex_enter(&ill->ill_lock); 5753 for (ipif = ill->ill_ipif; ipif != NULL; 5754 ipif = ipif->ipif_next) { 5755 if (zoneid != ALL_ZONES && 5756 zoneid != ipif->ipif_zoneid && 5757 ipif->ipif_zoneid != ALL_ZONES) 5758 continue; 5759 /* Allow the ipif to be down */ 5760 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5761 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5762 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5763 (ipif->ipif_pp_dst_addr == addr))) { 5764 /* 5765 * The block comment at the start of ipif_down 5766 * explains the use of the macros used below 5767 */ 5768 if (IPIF_CAN_LOOKUP(ipif)) { 5769 ipif_refhold_locked(ipif); 5770 mutex_exit(&ill->ill_lock); 5771 RELEASE_CONN_LOCK(q); 5772 rw_exit(&ipst->ips_ill_g_lock); 5773 return (ipif); 5774 } else if (IPIF_CAN_WAIT(ipif, q)) { 5775 ipsq = ill->ill_phyint->phyint_ipsq; 5776 mutex_enter(&ipsq->ipsq_lock); 5777 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5778 mutex_exit(&ill->ill_lock); 5779 rw_exit(&ipst->ips_ill_g_lock); 5780 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5781 ill); 5782 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5783 mutex_exit(&ipsq->ipsq_lock); 5784 RELEASE_CONN_LOCK(q); 5785 if (error != NULL) 5786 *error = EINPROGRESS; 5787 return (NULL); 5788 } 5789 } 5790 } 5791 mutex_exit(&ill->ill_lock); 5792 RELEASE_CONN_LOCK(q); 5793 } 5794 5795 /* If we already did the ptp case, then we are done */ 5796 if (ptp) { 5797 rw_exit(&ipst->ips_ill_g_lock); 5798 if (error != NULL) 5799 *error = ENXIO; 5800 return (NULL); 5801 } 5802 ptp = B_TRUE; 5803 goto repeat; 5804 } 5805 5806 /* 5807 * Check if the address exists in the system. 5808 * We don't hold the conn_lock as we will not perform any deferred 5809 * ipsq operation. 5810 */ 5811 boolean_t 5812 ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 5813 { 5814 ipif_t *ipif; 5815 ill_t *ill; 5816 ill_walk_context_t ctx; 5817 5818 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5819 5820 ill = ILL_START_WALK_V4(&ctx, ipst); 5821 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5822 mutex_enter(&ill->ill_lock); 5823 for (ipif = ill->ill_ipif; ipif != NULL; 5824 ipif = ipif->ipif_next) { 5825 if (zoneid != ALL_ZONES && 5826 zoneid != ipif->ipif_zoneid && 5827 ipif->ipif_zoneid != ALL_ZONES) 5828 continue; 5829 /* Allow the ipif to be down */ 5830 /* 5831 * XXX Unlike ipif_lookup_addr(), we don't do 5832 * the lookup twice. From bind()'s point of view, 5833 * we may return as soon as we find a match. 5834 */ 5835 if (((ipif->ipif_lcl_addr == addr) && 5836 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5837 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5838 (ipif->ipif_pp_dst_addr == addr))) { 5839 /* 5840 * Allow bind() to be successful even if the 5841 * ipif has the IPIF_CHANGING bit set. 5842 */ 5843 mutex_exit(&ill->ill_lock); 5844 rw_exit(&ipst->ips_ill_g_lock); 5845 return (B_TRUE); 5846 } 5847 } 5848 mutex_exit(&ill->ill_lock); 5849 } 5850 5851 rw_exit(&ipst->ips_ill_g_lock); 5852 return (B_FALSE); 5853 } 5854
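Both ipif_lookup_addr_common() and ip_addr_exists() above match on either a numbered local address or, for point-to-point links, the peer destination; the former walks twice so local-address matches always win, while the latter collapses the passes because bind() only needs to know whether any match exists. The two-pass shape, with the goto replaced by a counted loop (struct ifent and the IFF_* flags are invented for illustration):

#include <stdint.h>
#include <stddef.h>

#define IFF_PTP		0x1
#define IFF_UNNUMBERED	0x2

struct ifent {
	uint32_t	lcl;	/* local address */
	uint32_t	dst;	/* point-to-point peer address */
	int		flags;
};

/*
 * Two passes: local addresses first, ptp destinations second, so a
 * numbered local match is always preferred over a peer-address match.
 */
static struct ifent *
lookup_addr(struct ifent *tbl, size_t n, uint32_t addr)
{
	int ptp;
	size_t i;

	for (ptp = 0; ptp <= 1; ptp++) {
		for (i = 0; i < n; i++) {
			struct ifent *e = &tbl[i];

			if (!ptp && e->lcl == addr &&
			    !(e->flags & IFF_UNNUMBERED))
				return (e);
			if (ptp && (e->flags & IFF_PTP) && e->dst == addr)
				return (e);
		}
	}
	return (NULL);
}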
5855 /* 5856 * Lookup an ipif with the specified address. For point-to-point links we 5857 * look for matches on either the destination address or the local address, 5858 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 5859 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 5860 * (or illgrp if `match_ill' is in an IPMP group). 5861 */ 5862 ipif_t * 5863 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 5864 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 5865 { 5866 return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, 5867 func, error, ipst)); 5868 } 5869 5870 /* 5871 * Special abbreviated version of ipif_lookup_addr() that doesn't match 5872 * `match_ill' across the IPMP group. This function is only needed in some 5873 * corner-cases; almost everything should use ipif_lookup_addr(). 5874 */ 5875 static ipif_t * 5876 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 5877 { 5878 ASSERT(match_ill != NULL); 5879 return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, 5880 NULL, NULL, NULL, NULL, ipst)); 5881 } 5882 5883 /* 5884 * Look for an ipif with the specified address. For point-to-point links 5885 * we look for matches on either the destination address or the local 5886 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5887 * is set. 5888 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 5889 * ill (or illgrp if `match_ill' is in an IPMP group). 5890 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 5891 */ 5892 zoneid_t 5893 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 5894 { 5895 zoneid_t zoneid; 5896 ipif_t *ipif; 5897 ill_t *ill; 5898 boolean_t ptp = B_FALSE; 5899 ill_walk_context_t ctx; 5900 5901 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5902 /* 5903 * Repeat twice, first based on local addresses and 5904 * next time for pointopoint. 5905 */ 5906 repeat: 5907 ill = ILL_START_WALK_V4(&ctx, ipst); 5908 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5909 if (match_ill != NULL && ill != match_ill && 5910 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 5911 continue; 5912 } 5913 mutex_enter(&ill->ill_lock); 5914 for (ipif = ill->ill_ipif; ipif != NULL; 5915 ipif = ipif->ipif_next) { 5916 /* Allow the ipif to be down */ 5917 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5918 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5919 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5920 (ipif->ipif_pp_dst_addr == addr)) && 5921 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 5922 zoneid = ipif->ipif_zoneid; 5923 mutex_exit(&ill->ill_lock); 5924 rw_exit(&ipst->ips_ill_g_lock); 5925 /* 5926 * If ipif_zoneid was ALL_ZONES then we have 5927 * a trusted extensions shared IP address. 5928 * In that case GLOBAL_ZONEID works to send. 5929 */ 5930 if (zoneid == ALL_ZONES) 5931 zoneid = GLOBAL_ZONEID; 5932 return (zoneid); 5933 } 5934 } 5935 mutex_exit(&ill->ill_lock); 5936 } 5937 5938 /* If we already did the ptp case, then we are done */ 5939 if (ptp) { 5940 rw_exit(&ipst->ips_ill_g_lock); 5941 return (ALL_ZONES); 5942 } 5943 ptp = B_TRUE; 5944 goto repeat; 5945 } 5946
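Two rules recur throughout these walks: an ipif is visible to a lookup when the zoneids match or either side is ALL_ZONES (a shared address), and, as at the end of ipif_lookup_addr_zoneid(), a shared match is mapped to GLOBAL_ZONEID for transmission. Restated in isolation, with the typedef and constant values assumed here rather than taken from the headers:

#include <stdbool.h>

typedef int zoneid_t;		/* stand-in for the real type */
#define ALL_ZONES	(-1)	/* assumed sentinel value */
#define GLOBAL_ZONEID	0	/* assumed value of the global zone */

/* The shared-zone visibility rule used by the walks above. */
static bool
zone_matches(zoneid_t want, zoneid_t have)
{
	return (want == ALL_ZONES || have == ALL_ZONES || want == have);
}

/* A shared (ALL_ZONES) match is sent from the global zone. */
static zoneid_t
zone_for_send(zoneid_t matched)
{
	return (matched == ALL_ZONES ? GLOBAL_ZONEID : matched);
}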
5947 /* 5948 * Look for an ipif that matches the specified remote address, i.e., the 5949 * ipif that would receive the specified packet. 5950 * First look for directly connected interfaces and then do a recursive 5951 * IRE lookup and pick the first ipif corresponding to the source address in 5952 * the ire. 5953 * Returns: held ipif 5954 */ 5955 ipif_t * 5956 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 5957 { 5958 ipif_t *ipif; 5959 ire_t *ire; 5960 ip_stack_t *ipst = ill->ill_ipst; 5961 5962 ASSERT(!ill->ill_isv6); 5963 5964 /* 5965 * Someone could be changing this ipif currently or change it 5966 * after we return this. Thus a few packets could use the old 5967 * values. However, structure updates/creates (ire, ilg, ilm, etc.) 5968 * will atomically be updated or cleaned up with the new value. 5969 * Thus we don't need a lock to check the flags or other attrs below. 5970 */ 5971 mutex_enter(&ill->ill_lock); 5972 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5973 if (!IPIF_CAN_LOOKUP(ipif)) 5974 continue; 5975 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 5976 ipif->ipif_zoneid != ALL_ZONES) 5977 continue; 5978 /* Allow the ipif to be down */ 5979 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 5980 if ((ipif->ipif_pp_dst_addr == addr) || 5981 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 5982 ipif->ipif_lcl_addr == addr)) { 5983 ipif_refhold_locked(ipif); 5984 mutex_exit(&ill->ill_lock); 5985 return (ipif); 5986 } 5987 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 5988 ipif_refhold_locked(ipif); 5989 mutex_exit(&ill->ill_lock); 5990 return (ipif); 5991 } 5992 } 5993 mutex_exit(&ill->ill_lock); 5994 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 5995 NULL, MATCH_IRE_RECURSIVE, ipst); 5996 if (ire != NULL) { 5997 /* 5998 * The callers of this function want to know the 5999 * interface on which they have to send the replies 6000 * back. For IREs that have ire_stq and ire_ipif 6001 * derived from different ills, we really don't care 6002 * what we return here. 6003 */ 6004 ipif = ire->ire_ipif; 6005 if (ipif != NULL) { 6006 ipif_refhold(ipif); 6007 ire_refrele(ire); 6008 return (ipif); 6009 } 6010 ire_refrele(ire); 6011 } 6012 /* Pick the first interface */ 6013 ipif = ipif_get_next_ipif(NULL, ill); 6014 return (ipif); 6015 } 6016 6017 /* 6018 * This func does not prevent refcnt from increasing. But if 6019 * the caller has taken steps to that effect, then this func 6020 * can be used to determine whether the ill has become quiescent. 6021 */ 6022 static boolean_t 6023 ill_is_quiescent(ill_t *ill) 6024 { 6025 ipif_t *ipif; 6026 6027 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6028 6029 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6030 if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { 6031 return (B_FALSE); 6032 } 6033 } 6034 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 6035 return (B_FALSE); 6036 } 6037 return (B_TRUE); 6038 } 6039 6040 boolean_t 6041 ill_is_freeable(ill_t *ill) 6042 { 6043 ipif_t *ipif; 6044 6045 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6046 6047 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6048 if (ipif->ipif_refcnt != 0 || !IPIF_FREE_OK(ipif)) { 6049 return (B_FALSE); 6050 } 6051 } 6052 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 6053 return (B_FALSE); 6054 } 6055 return (B_TRUE); 6056 } 6057
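ill_is_quiescent() and ill_is_freeable() differ only in the predicate applied: every ipif on the ill must have a zero refcnt and pass a per-ipif check, and the ill itself must then pass too. The generic shape in user space (the parent/child types are invented; as the comments above warn, the caller must separately prevent new references from arriving):

#include <stdbool.h>
#include <stddef.h>

struct child {
	int		refcnt;
	struct child	*next;
};

struct parent {
	int		refcnt;
	struct child	*children;
};

/*
 * Quiescent means no outstanding references anywhere in the tree;
 * this check is only meaningful once new holds have been blocked.
 */
static bool
is_quiescent(const struct parent *p)
{
	const struct child *c;

	for (c = p->children; c != NULL; c = c->next) {
		if (c->refcnt != 0)
			return (false);
	}
	return (p->refcnt == 0);
}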
6058 /* 6059 * This func does not prevent refcnt from increasing. But if 6060 * the caller has taken steps to that effect, then this func 6061 * can be used to determine whether the ipif has become quiescent. 6062 */ 6063 static boolean_t 6064 ipif_is_quiescent(ipif_t *ipif) 6065 { 6066 ill_t *ill; 6067 6068 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6069 6070 if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { 6071 return (B_FALSE); 6072 } 6073 6074 ill = ipif->ipif_ill; 6075 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 6076 ill->ill_logical_down) { 6077 return (B_TRUE); 6078 } 6079 6080 /* This is the last ipif going down or being deleted on this ill */ 6081 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 6082 return (B_FALSE); 6083 } 6084 6085 return (B_TRUE); 6086 } 6087 6088 /* 6089 * return true if the ipif can be destroyed: the ipif has to be quiescent 6090 * with zero references from ire/nce/ilm to it. 6091 */ 6092 static boolean_t 6093 ipif_is_freeable(ipif_t *ipif) 6094 { 6095 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6096 ASSERT(ipif->ipif_id != 0); 6097 return (ipif->ipif_refcnt == 0 && IPIF_FREE_OK(ipif)); 6098 } 6099 6100 /* 6101 * The ipif/ill/ire has been refreled. Do the tail processing. 6102 * Determine if the ipif or ill in question has become quiescent and if so, 6103 * wake up the waiting close and/or restart any queued ioctl that is waiting 6104 * for the ipif_down (or ill_down) 6105 */ 6106 void 6107 ipif_ill_refrele_tail(ill_t *ill) 6108 { 6109 mblk_t *mp; 6110 conn_t *connp; 6111 ipsq_t *ipsq; 6112 ipxop_t *ipx; 6113 ipif_t *ipif; 6114 dl_notify_ind_t *dlindp; 6115 6116 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6117 6118 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { 6119 /* ip_modclose() may be waiting */ 6120 cv_broadcast(&ill->ill_cv); 6121 } 6122 6123 ipsq = ill->ill_phyint->phyint_ipsq; 6124 mutex_enter(&ipsq->ipsq_lock); 6125 ipx = ipsq->ipsq_xop; 6126 mutex_enter(&ipx->ipx_lock); 6127 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ 6128 goto unlock; 6129 6130 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); 6131 6132 ipif = ipx->ipx_pending_ipif; 6133 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ 6134 goto unlock; 6135 6136 switch (ipx->ipx_waitfor) { 6137 case IPIF_DOWN: 6138 if (!ipif_is_quiescent(ipif)) 6139 goto unlock; 6140 break; 6141 case IPIF_FREE: 6142 if (!ipif_is_freeable(ipif)) 6143 goto unlock; 6144 break; 6145 case ILL_DOWN: 6146 if (!ill_is_quiescent(ill)) 6147 goto unlock; 6148 break; 6149 case ILL_FREE: 6150 /* 6151 * ILL_FREE is only for loopback; normal ill teardown waits 6152 * synchronously in ip_modclose() without using ipx_waitfor, 6153 * handled by the cv_broadcast() at the top of this function. 6154 */ 6155 if (!ill_is_freeable(ill)) 6156 goto unlock; 6157 break; 6158 default: 6159 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", 6160 (void *)ipsq, ipx->ipx_waitfor); 6161 } 6162 6163 ill_refhold_locked(ill); /* for qwriter_ip() call below */ 6164 mutex_exit(&ipx->ipx_lock); 6165 mp = ipsq_pending_mp_get(ipsq, &connp); 6166 mutex_exit(&ipsq->ipsq_lock); 6167 mutex_exit(&ill->ill_lock); 6168 6169 ASSERT(mp != NULL); 6170 /* 6171 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 6172 * we can only get here when the current operation decides 6173 * it needs to quiesce via ipsq_pending_mp_add(). 6174 */ 6175 switch (mp->b_datap->db_type) { 6176 case M_PCPROTO: 6177 case M_PROTO: 6178 /* 6179 * For now, only DL_NOTIFY_IND messages can use this facility.
6180 */ 6181 dlindp = (dl_notify_ind_t *)mp->b_rptr; 6182 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 6183 6184 switch (dlindp->dl_notification) { 6185 case DL_NOTE_PHYS_ADDR: 6186 qwriter_ip(ill, ill->ill_rq, mp, 6187 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 6188 return; 6189 case DL_NOTE_REPLUMB: 6190 qwriter_ip(ill, ill->ill_rq, mp, 6191 ill_replumb_tail, CUR_OP, B_TRUE); 6192 return; 6193 default: 6194 ASSERT(0); 6195 ill_refrele(ill); 6196 } 6197 break; 6198 6199 case M_ERROR: 6200 case M_HANGUP: 6201 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 6202 B_TRUE); 6203 return; 6204 6205 case M_IOCTL: 6206 case M_IOCDATA: 6207 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) : 6208 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 6209 return; 6210 6211 default: 6212 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 6213 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 6214 } 6215 return; 6216 unlock: 6217 mutex_exit(&ipsq->ipsq_lock); 6218 mutex_exit(&ipx->ipx_lock); 6219 mutex_exit(&ill->ill_lock); 6220 } 6221 6222 #ifdef DEBUG 6223 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 6224 static void 6225 th_trace_rrecord(th_trace_t *th_trace) 6226 { 6227 tr_buf_t *tr_buf; 6228 uint_t lastref; 6229 6230 lastref = th_trace->th_trace_lastref; 6231 lastref++; 6232 if (lastref == TR_BUF_MAX) 6233 lastref = 0; 6234 th_trace->th_trace_lastref = lastref; 6235 tr_buf = &th_trace->th_trbuf[lastref]; 6236 tr_buf->tr_time = lbolt; 6237 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 6238 } 6239 6240 static void 6241 th_trace_free(void *value) 6242 { 6243 th_trace_t *th_trace = value; 6244 6245 ASSERT(th_trace->th_refcnt == 0); 6246 kmem_free(th_trace, sizeof (*th_trace)); 6247 } 6248 6249 /* 6250 * Find or create the per-thread hash table used to track object references. 6251 * The ipst argument is NULL if we shouldn't allocate. 6252 * 6253 * Accesses per-thread data, so there's no need to lock here. 6254 */ 6255 static mod_hash_t * 6256 th_trace_gethash(ip_stack_t *ipst) 6257 { 6258 th_hash_t *thh; 6259 6260 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 6261 mod_hash_t *mh; 6262 char name[256]; 6263 size_t objsize, rshift; 6264 int retv; 6265 6266 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 6267 return (NULL); 6268 (void) snprintf(name, sizeof (name), "th_trace_%p", 6269 (void *)curthread); 6270 6271 /* 6272 * We use mod_hash_create_extended here rather than the more 6273 * obvious mod_hash_create_ptrhash because the latter has a 6274 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 6275 * block. 6276 */ 6277 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 6278 MAX(sizeof (ire_t), sizeof (nce_t))); 6279 rshift = highbit(objsize); 6280 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 6281 th_trace_free, mod_hash_byptr, (void *)rshift, 6282 mod_hash_ptrkey_cmp, KM_NOSLEEP); 6283 if (mh == NULL) { 6284 kmem_free(thh, sizeof (*thh)); 6285 return (NULL); 6286 } 6287 thh->thh_hash = mh; 6288 thh->thh_ipst = ipst; 6289 /* 6290 * We trace ills, ipifs, ires, and nces. All of these are 6291 * per-IP-stack, so the lock on the thread list is as well. 6292 */ 6293 rw_enter(&ip_thread_rwlock, RW_WRITER); 6294 list_insert_tail(&ip_thread_list, thh); 6295 rw_exit(&ip_thread_rwlock); 6296 retv = tsd_set(ip_thread_data, thh); 6297 ASSERT(retv == 0); 6298 } 6299 return (thh != NULL ? 
thh->thh_hash : NULL); 6300 } 6301 6302 boolean_t 6303 th_trace_ref(const void *obj, ip_stack_t *ipst) 6304 { 6305 th_trace_t *th_trace; 6306 mod_hash_t *mh; 6307 mod_hash_val_t val; 6308 6309 if ((mh = th_trace_gethash(ipst)) == NULL) 6310 return (B_FALSE); 6311 6312 /* 6313 * Attempt to locate the trace buffer for this obj and thread. 6314 * If it does not exist, then allocate a new trace buffer and 6315 * insert into the hash. 6316 */ 6317 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 6318 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 6319 if (th_trace == NULL) 6320 return (B_FALSE); 6321 6322 th_trace->th_id = curthread; 6323 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 6324 (mod_hash_val_t)th_trace) != 0) { 6325 kmem_free(th_trace, sizeof (th_trace_t)); 6326 return (B_FALSE); 6327 } 6328 } else { 6329 th_trace = (th_trace_t *)val; 6330 } 6331 6332 ASSERT(th_trace->th_refcnt >= 0 && 6333 th_trace->th_refcnt < TR_BUF_MAX - 1); 6334 6335 th_trace->th_refcnt++; 6336 th_trace_rrecord(th_trace); 6337 return (B_TRUE); 6338 } 6339 6340 /* 6341 * For the purpose of tracing a reference release, we assume that global 6342 * tracing is always on and that the same thread that initiated the reference 6343 * hold is the one releasing it. 6344 */ 6345 void 6346 th_trace_unref(const void *obj) 6347 { 6348 int retv; 6349 mod_hash_t *mh; 6350 th_trace_t *th_trace; 6351 mod_hash_val_t val; 6352 6353 mh = th_trace_gethash(NULL); 6354 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 6355 ASSERT(retv == 0); 6356 th_trace = (th_trace_t *)val; 6357 6358 ASSERT(th_trace->th_refcnt > 0); 6359 th_trace->th_refcnt--; 6360 th_trace_rrecord(th_trace); 6361 } 6362 6363 /* 6364 * If tracing has been disabled, then we assume that the reference counts are 6365 * now useless, and we clear them out before destroying the entries. 6366 */ 6367 void 6368 th_trace_cleanup(const void *obj, boolean_t trace_disable) 6369 { 6370 th_hash_t *thh; 6371 mod_hash_t *mh; 6372 mod_hash_val_t val; 6373 th_trace_t *th_trace; 6374 int retv; 6375 6376 rw_enter(&ip_thread_rwlock, RW_READER); 6377 for (thh = list_head(&ip_thread_list); thh != NULL; 6378 thh = list_next(&ip_thread_list, thh)) { 6379 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 6380 &val) == 0) { 6381 th_trace = (th_trace_t *)val; 6382 if (trace_disable) 6383 th_trace->th_refcnt = 0; 6384 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 6385 ASSERT(retv == 0); 6386 } 6387 } 6388 rw_exit(&ip_thread_rwlock); 6389 } 6390 6391 void 6392 ipif_trace_ref(ipif_t *ipif) 6393 { 6394 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6395 6396 if (ipif->ipif_trace_disable) 6397 return; 6398 6399 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 6400 ipif->ipif_trace_disable = B_TRUE; 6401 ipif_trace_cleanup(ipif); 6402 } 6403 } 6404 6405 void 6406 ipif_untrace_ref(ipif_t *ipif) 6407 { 6408 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6409 6410 if (!ipif->ipif_trace_disable) 6411 th_trace_unref(ipif); 6412 } 6413 6414 void 6415 ill_trace_ref(ill_t *ill) 6416 { 6417 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6418 6419 if (ill->ill_trace_disable) 6420 return; 6421 6422 if (!th_trace_ref(ill, ill->ill_ipst)) { 6423 ill->ill_trace_disable = B_TRUE; 6424 ill_trace_cleanup(ill); 6425 } 6426 } 6427 6428 void 6429 ill_untrace_ref(ill_t *ill) 6430 { 6431 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6432 6433 if (!ill->ill_trace_disable) 6434 th_trace_unref(ill); 6435 } 6436
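th_trace_rrecord(), earlier in this DEBUG block, logs each hold and release into a small per-object ring buffer, advancing a cursor and wrapping back to slot zero when it reaches TR_BUF_MAX. The wrap logic in isolation, with a simplified record type (struct tr_ring and TR_MAX are invented stand-ins):

#include <time.h>

#define TR_MAX	10

struct tr_rec {
	time_t	when;
};

struct tr_ring {
	int		lastref;	/* index of most recent record */
	struct tr_rec	buf[TR_MAX];
};

/* Record into the next slot, reusing the buffer from the start on wrap. */
static void
ring_record(struct tr_ring *r)
{
	if (++r->lastref == TR_MAX)
		r->lastref = 0;
	r->buf[r->lastref].when = time(NULL);
}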
Note that on 6439 * failure, ipif_trace_disable is set. 6440 */ 6441 static void 6442 ipif_trace_cleanup(const ipif_t *ipif) 6443 { 6444 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 6445 } 6446 6447 /* 6448 * Called when ill is unplumbed or when memory alloc fails. Note that on 6449 * failure, ill_trace_disable is set. 6450 */ 6451 static void 6452 ill_trace_cleanup(const ill_t *ill) 6453 { 6454 th_trace_cleanup(ill, ill->ill_trace_disable); 6455 } 6456 #endif /* DEBUG */ 6457 6458 void 6459 ipif_refhold_locked(ipif_t *ipif) 6460 { 6461 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6462 ipif->ipif_refcnt++; 6463 IPIF_TRACE_REF(ipif); 6464 } 6465 6466 void 6467 ipif_refhold(ipif_t *ipif) 6468 { 6469 ill_t *ill; 6470 6471 ill = ipif->ipif_ill; 6472 mutex_enter(&ill->ill_lock); 6473 ipif->ipif_refcnt++; 6474 IPIF_TRACE_REF(ipif); 6475 mutex_exit(&ill->ill_lock); 6476 } 6477 6478 /* 6479 * Must not be called while holding any locks. Otherwise if this is 6480 * the last reference to be released there is a chance of recursive mutex 6481 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6482 * to restart an ioctl. 6483 */ 6484 void 6485 ipif_refrele(ipif_t *ipif) 6486 { 6487 ill_t *ill; 6488 6489 ill = ipif->ipif_ill; 6490 6491 mutex_enter(&ill->ill_lock); 6492 ASSERT(ipif->ipif_refcnt != 0); 6493 ipif->ipif_refcnt--; 6494 IPIF_UNTRACE_REF(ipif); 6495 if (ipif->ipif_refcnt != 0) { 6496 mutex_exit(&ill->ill_lock); 6497 return; 6498 } 6499 6500 /* Drops the ill_lock */ 6501 ipif_ill_refrele_tail(ill); 6502 } 6503 6504 ipif_t * 6505 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6506 { 6507 ipif_t *ipif; 6508 6509 mutex_enter(&ill->ill_lock); 6510 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 6511 ipif != NULL; ipif = ipif->ipif_next) { 6512 if (!IPIF_CAN_LOOKUP(ipif)) 6513 continue; 6514 ipif_refhold_locked(ipif); 6515 mutex_exit(&ill->ill_lock); 6516 return (ipif); 6517 } 6518 mutex_exit(&ill->ill_lock); 6519 return (NULL); 6520 } 6521 6522 /* 6523 * TODO: make this table extendible at run time 6524 * Return a pointer to the mac type info for 'mac_type' 6525 */ 6526 static ip_m_t * 6527 ip_m_lookup(t_uscalar_t mac_type) 6528 { 6529 ip_m_t *ipm; 6530 6531 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6532 if (ipm->ip_m_mac_type == mac_type) 6533 return (ipm); 6534 return (NULL); 6535 } 6536 6537 /* 6538 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6539 * ipif_arg is passed in to associate it with the correct interface. 6540 * We may need to restart this operation if the ipif cannot be looked up 6541 * due to an exclusive operation that is currently in progress. The restart 6542 * entry point is specified by 'func' 6543 */ 6544 int 6545 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6546 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg, 6547 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func, 6548 struct rtsa_s *sp, ip_stack_t *ipst) 6549 { 6550 ire_t *ire; 6551 ire_t *gw_ire = NULL; 6552 ipif_t *ipif = NULL; 6553 boolean_t ipif_refheld = B_FALSE; 6554 uint_t type; 6555 int match_flags = MATCH_IRE_TYPE; 6556 int error; 6557 tsol_gc_t *gc = NULL; 6558 tsol_gcgrp_t *gcgrp = NULL; 6559 boolean_t gcgrp_xtraref = B_FALSE; 6560 6561 ip1dbg(("ip_rt_add:")); 6562 6563 if (ire_arg != NULL) 6564 *ire_arg = NULL; 6565 6566 /* 6567 * If this is the case of RTF_HOST being set, then we set the netmask 6568 * to all ones (regardless if one was supplied). 
6569 */ 6570 if (flags & RTF_HOST) 6571 mask = IP_HOST_MASK; 6572 6573 /* 6574 * Prevent routes with a zero gateway from being created (since 6575 * interfaces can currently be plumbed and brought up with no 6576 * assigned address). 6577 */ 6578 if (gw_addr == 0) 6579 return (ENETUNREACH); 6580 /* 6581 * Get the ipif, if any, corresponding to the gw_addr 6582 */ 6583 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error, 6584 ipst); 6585 if (ipif != NULL) { 6586 if (IS_VNI(ipif->ipif_ill)) { 6587 ipif_refrele(ipif); 6588 return (EINVAL); 6589 } 6590 ipif_refheld = B_TRUE; 6591 } else if (error == EINPROGRESS) { 6592 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6593 return (EINPROGRESS); 6594 } else { 6595 error = 0; 6596 } 6597 6598 if (ipif != NULL) { 6599 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6600 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6601 } else { 6602 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6603 } 6604 6605 /* 6606 * GateD will attempt to create routes with a loopback interface 6607 * address as the gateway and with RTF_GATEWAY set. We allow 6608 * these routes to be added, but create them as interface routes 6609 * since the gateway is an interface address. 6610 */ 6611 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6612 flags &= ~RTF_GATEWAY; 6613 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6614 mask == IP_HOST_MASK) { 6615 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6616 ALL_ZONES, NULL, match_flags, ipst); 6617 if (ire != NULL) { 6618 ire_refrele(ire); 6619 if (ipif_refheld) 6620 ipif_refrele(ipif); 6621 return (EEXIST); 6622 } 6623 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x " 6624 "for 0x%x\n", (void *)ipif, 6625 ipif->ipif_ire_type, 6626 ntohl(ipif->ipif_lcl_addr))); 6627 ire = ire_create( 6628 (uchar_t *)&dst_addr, /* dest address */ 6629 (uchar_t *)&mask, /* mask */ 6630 (uchar_t *)&ipif->ipif_src_addr, 6631 NULL, /* no gateway */ 6632 &ipif->ipif_mtu, 6633 NULL, 6634 ipif->ipif_rq, /* recv-from queue */ 6635 NULL, /* no send-to queue */ 6636 ipif->ipif_ire_type, /* LOOPBACK */ 6637 ipif, 6638 0, 6639 0, 6640 0, 6641 (ipif->ipif_flags & IPIF_PRIVATE) ? 6642 RTF_PRIVATE : 0, 6643 &ire_uinfo_null, 6644 NULL, 6645 NULL, 6646 ipst); 6647 6648 if (ire == NULL) { 6649 if (ipif_refheld) 6650 ipif_refrele(ipif); 6651 return (ENOMEM); 6652 } 6653 error = ire_add(&ire, q, mp, func, B_FALSE); 6654 if (error == 0) 6655 goto save_ire; 6656 if (ipif_refheld) 6657 ipif_refrele(ipif); 6658 return (error); 6659 6660 } 6661 } 6662 6663 /* 6664 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6665 * and the gateway address provided is one of the system's interface 6666 * addresses. By using the routing socket interface and supplying an 6667 * RTA_IFP sockaddr with an interface index, an alternate method of 6668 * specifying an interface route to be created is available which uses 6669 * the interface index that specifies the outgoing interface rather than 6670 * the address of an outgoing interface (which may not be able to 6671 * uniquely identify an interface). When coupled with the RTF_GATEWAY 6672 * flag, routes can be specified which not only specify the next-hop to 6673 * be used when routing to a certain prefix, but also which outgoing 6674 * interface should be used.
6675 * 6676 * Previously, interfaces would have unique addresses assigned to them 6677 * and so the address assigned to a particular interface could be used 6678 * to identify a particular interface. One exception to this was the 6679 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6680 * 6681 * With the advent of IPv6 and its link-local addresses, this 6682 * restriction was relaxed and interfaces could share addresses between 6683 * themselves. In fact, typically all of the link-local interfaces on 6684 * an IPv6 node or router will have the same link-local address. In 6685 * order to differentiate between these interfaces, the use of an 6686 * interface index is necessary and this index can be carried inside a 6687 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6688 * of using the interface index, however, is that all of the ipif's that 6689 * are part of an ill have the same index and so the RTA_IFP sockaddr 6690 * cannot be used to differentiate between ipif's (or logical 6691 * interfaces) that belong to the same ill (physical interface). 6692 * 6693 * For example, in the following case involving IPv4 interfaces and 6694 * logical interfaces 6695 * 6696 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6697 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6698 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6699 * 6700 * the ipif's corresponding to each of these interface routes can be 6701 * uniquely identified by the "gateway" (actually interface address). 6702 * 6703 * In this case involving multiple IPv6 default routes to a particular 6704 * link-local gateway, the use of RTA_IFP is necessary to specify which 6705 * default route is of interest: 6706 * 6707 * default fe80::123:4567:89ab:cdef U if0 6708 * default fe80::123:4567:89ab:cdef U if1 6709 */ 6710 6711 /* RTF_GATEWAY not set */ 6712 if (!(flags & RTF_GATEWAY)) { 6713 queue_t *stq; 6714 6715 if (sp != NULL) { 6716 ip2dbg(("ip_rt_add: gateway security attributes " 6717 "cannot be set with interface route\n")); 6718 if (ipif_refheld) 6719 ipif_refrele(ipif); 6720 return (EINVAL); 6721 } 6722 6723 /* 6724 * As the interface index specified with the RTA_IFP sockaddr is 6725 * the same for all ipif's off of an ill, the matching logic 6726 * below uses MATCH_IRE_ILL if such an index was specified. 6727 * This means that routes sharing the same prefix when added 6728 * using a RTA_IFP sockaddr must have distinct interface 6729 * indices (namely, they must be on distinct ill's). 6730 * 6731 * On the other hand, since the gateway address will usually be 6732 * different for each ipif on the system, the matching logic 6733 * uses MATCH_IRE_IPIF in the case of a traditional interface 6734 * route. This means that interface routes for the same prefix 6735 * can be created if they belong to distinct ipif's and if a 6736 * RTA_IFP sockaddr is not present. 6737 */ 6738 if (ipif_arg != NULL) { 6739 if (ipif_refheld) { 6740 ipif_refrele(ipif); 6741 ipif_refheld = B_FALSE; 6742 } 6743 ipif = ipif_arg; 6744 match_flags |= MATCH_IRE_ILL; 6745 } else { 6746 /* 6747 * Check the ipif corresponding to the gw_addr 6748 */ 6749 if (ipif == NULL) 6750 return (ENETUNREACH); 6751 match_flags |= MATCH_IRE_IPIF; 6752 } 6753 ASSERT(ipif != NULL); 6754 6755 /* 6756 * We check for an existing entry at this point. 6757 * 6758 * Since a netmask isn't passed in via the ioctl interface 6759 * (SIOCADDRT), we don't check for a matching netmask in that 6760 * case. 
6761 */ 6762 if (!ioctl_msg) 6763 match_flags |= MATCH_IRE_MASK; 6764 ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif, 6765 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 6766 if (ire != NULL) { 6767 ire_refrele(ire); 6768 if (ipif_refheld) 6769 ipif_refrele(ipif); 6770 return (EEXIST); 6771 } 6772 6773 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6774 ? ipif->ipif_rq : ipif->ipif_wq; 6775 6776 /* 6777 * Create a copy of the IRE_LOOPBACK, 6778 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6779 * the modified address and netmask. 6780 */ 6781 ire = ire_create( 6782 (uchar_t *)&dst_addr, 6783 (uint8_t *)&mask, 6784 (uint8_t *)&ipif->ipif_src_addr, 6785 NULL, 6786 &ipif->ipif_mtu, 6787 NULL, 6788 NULL, 6789 stq, 6790 ipif->ipif_net_type, 6791 ipif, 6792 0, 6793 0, 6794 0, 6795 flags, 6796 &ire_uinfo_null, 6797 NULL, 6798 NULL, 6799 ipst); 6800 if (ire == NULL) { 6801 if (ipif_refheld) 6802 ipif_refrele(ipif); 6803 return (ENOMEM); 6804 } 6805 6806 /* 6807 * Some software (for example, GateD and Sun Cluster) attempts 6808 * to create (what amount to) IRE_PREFIX routes with the 6809 * loopback address as the gateway. This is primarily done to 6810 * set up prefixes with the RTF_REJECT flag set (for example, 6811 * when generating aggregate routes). 6812 * 6813 * If the IRE type (as defined by ipif->ipif_net_type) is 6814 * IRE_LOOPBACK, then we map the request into a 6815 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 6816 * these interface routes, by definition, can only be that. 6817 * 6818 * Needless to say, the real IRE_LOOPBACK is NOT created by this 6819 * routine, but rather using ire_create() directly. 6820 * 6821 */ 6822 if (ipif->ipif_net_type == IRE_LOOPBACK) { 6823 ire->ire_type = IRE_IF_NORESOLVER; 6824 ire->ire_flags |= RTF_BLACKHOLE; 6825 } 6826 6827 error = ire_add(&ire, q, mp, func, B_FALSE); 6828 if (error == 0) 6829 goto save_ire; 6830 6831 /* 6832 * In the event of failure, ire_add() will have already 6833 * deleted the ire in question, so there is no need to 6834 * do that here. 6835 */ 6836 if (ipif_refheld) 6837 ipif_refrele(ipif); 6838 return (error); 6839 } 6840 if (ipif_refheld) { 6841 ipif_refrele(ipif); 6842 ipif_refheld = B_FALSE; 6843 } 6844 6845 /* 6846 * Get an interface IRE for the specified gateway. 6847 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 6848 * gateway, it is currently unreachable and we fail the request 6849 * accordingly. 6850 */ 6851 ipif = ipif_arg; 6852 if (ipif_arg != NULL) 6853 match_flags |= MATCH_IRE_ILL; 6854 again: 6855 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 6856 ALL_ZONES, 0, NULL, match_flags, ipst); 6857 if (gw_ire == NULL) { 6858 /* 6859 * With IPMP, we allow host routes to influence in.mpathd's 6860 * target selection. However, if the test addresses are on 6861 * their own network, the above lookup will fail since the 6862 * underlying IRE_INTERFACEs are marked hidden. So allow 6863 * hidden test IREs to be found and try again. 6864 */ 6865 if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { 6866 match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 6867 goto again; 6868 } 6869 return (ENETUNREACH); 6870 } 6871 6872 /* 6873 * We create one of three types of IREs as a result of this request 6874 * based on the netmask. A netmask of all ones (which is automatically 6875 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 6876 * An all zeroes netmask implies a default route, so an IRE_DEFAULT is 6877 * created.
Otherwise, an IRE_PREFIX route is created for the 6878 * destination prefix. 6879 */ 6880 if (mask == IP_HOST_MASK) 6881 type = IRE_HOST; 6882 else if (mask == 0) 6883 type = IRE_DEFAULT; 6884 else 6885 type = IRE_PREFIX; 6886 6887 /* check for a duplicate entry */ 6888 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 6889 NULL, ALL_ZONES, 0, NULL, 6890 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); 6891 if (ire != NULL) { 6892 ire_refrele(gw_ire); 6893 ire_refrele(ire); 6894 return (EEXIST); 6895 } 6896 6897 /* Security attribute exists */ 6898 if (sp != NULL) { 6899 tsol_gcgrp_addr_t ga; 6900 6901 /* find or create the gateway credentials group */ 6902 ga.ga_af = AF_INET; 6903 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 6904 6905 /* we hold reference to it upon success */ 6906 gcgrp = gcgrp_lookup(&ga, B_TRUE); 6907 if (gcgrp == NULL) { 6908 ire_refrele(gw_ire); 6909 return (ENOMEM); 6910 } 6911 6912 /* 6913 * Create and add the security attribute to the group; a 6914 * reference to the group is made upon allocating a new 6915 * entry successfully. If it finds an already-existing 6916 * entry for the security attribute in the group, it simply 6917 * returns it and no new reference is made to the group. 6918 */ 6919 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 6920 if (gc == NULL) { 6921 /* release reference held by gcgrp_lookup */ 6922 GCGRP_REFRELE(gcgrp); 6923 ire_refrele(gw_ire); 6924 return (ENOMEM); 6925 } 6926 } 6927 6928 /* Create the IRE. */ 6929 ire = ire_create( 6930 (uchar_t *)&dst_addr, /* dest address */ 6931 (uchar_t *)&mask, /* mask */ 6932 /* src address assigned by the caller? */ 6933 (uchar_t *)(((src_addr != INADDR_ANY) && 6934 (flags & RTF_SETSRC)) ? &src_addr : NULL), 6935 (uchar_t *)&gw_addr, /* gateway address */ 6936 &gw_ire->ire_max_frag, 6937 NULL, /* no src nce */ 6938 NULL, /* no recv-from queue */ 6939 NULL, /* no send-to queue */ 6940 (ushort_t)type, /* IRE type */ 6941 ipif_arg, 6942 0, 6943 0, 6944 0, 6945 flags, 6946 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 6947 gc, /* security attribute */ 6948 NULL, 6949 ipst); 6950 6951 /* 6952 * The ire holds a reference to the 'gc' and the 'gc' holds a 6953 * reference to the 'gcgrp'. We can now release the extra reference 6954 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 6955 */ 6956 if (gcgrp_xtraref) 6957 GCGRP_REFRELE(gcgrp); 6958 if (ire == NULL) { 6959 if (gc != NULL) 6960 GC_REFRELE(gc); 6961 ire_refrele(gw_ire); 6962 return (ENOMEM); 6963 } 6964 6965 /* 6966 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 6967 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 6968 */ 6969 6970 /* Add the new IRE. */ 6971 error = ire_add(&ire, q, mp, func, B_FALSE); 6972 if (error != 0) { 6973 /* 6974 * In the event of failure, ire_add() will have already 6975 * deleted the ire in question, so there is no need to 6976 * do that here. 6977 */ 6978 ire_refrele(gw_ire); 6979 return (error); 6980 } 6981 6982 if (flags & RTF_MULTIRT) { 6983 /* 6984 * Invoke the CGTP (multirouting) filtering module 6985 * to add the dst address to the filtering database. 6986 * Replicated inbound packets coming from that address 6987 * will be filtered to discard the duplicates. 6988 * It is not necessary to call the CGTP filter hook 6989 * when the dst address is a broadcast or multicast, 6990 * because an IP source address cannot be a broadcast 6991 * or a multicast.
6992 */ 6993 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 6994 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 6995 if (ire_dst != NULL) { 6996 ip_cgtp_bcast_add(ire, ire_dst, ipst); 6997 ire_refrele(ire_dst); 6998 goto save_ire; 6999 } 7000 if (ipst->ips_ip_cgtp_filter_ops != NULL && 7001 !CLASSD(ire->ire_addr)) { 7002 int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4( 7003 ipst->ips_netstack->netstack_stackid, 7004 ire->ire_addr, 7005 ire->ire_gateway_addr, 7006 ire->ire_src_addr, 7007 gw_ire->ire_src_addr); 7008 if (res != 0) { 7009 ire_refrele(gw_ire); 7010 ire_delete(ire); 7011 return (res); 7012 } 7013 } 7014 } 7015 7016 /* 7017 * Now that the prefix IRE entry has been created, delete any 7018 * existing gateway IRE cache entries as well as any IRE caches 7019 * using the gateway, and force them to be created through 7020 * ip_newroute. 7021 */ 7022 if (gc != NULL) { 7023 ASSERT(gcgrp != NULL); 7024 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst); 7025 } 7026 7027 save_ire: 7028 if (gw_ire != NULL) { 7029 ire_refrele(gw_ire); 7030 } 7031 if (ipif != NULL) { 7032 /* 7033 * Save enough information so that we can recreate the IRE if 7034 * the interface goes down and then up. The metrics associated 7035 * with the route will be saved as well when rts_setmetrics() is 7036 * called after the IRE has been created. In the case where 7037 * memory cannot be allocated, none of this information will be 7038 * saved. 7039 */ 7040 ipif_save_ire(ipif, ire); 7041 } 7042 if (ioctl_msg) 7043 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 7044 if (ire_arg != NULL) { 7045 /* 7046 * Store the ire that was successfully added into where ire_arg 7047 * points to so that callers don't have to look it up 7048 * themselves (but they are responsible for ire_refrele()ing 7049 * the ire when they are finished with it). 7050 */ 7051 *ire_arg = ire; 7052 } else { 7053 ire_refrele(ire); /* Held in ire_add */ 7054 } 7055 if (ipif_refheld) 7056 ipif_refrele(ipif); 7057 return (0); 7058 } 7059 7060 /* 7061 * ip_rt_delete is called to delete an IPv4 route. 7062 * ipif_arg is passed in to associate it with the correct interface. 7063 * We may need to restart this operation if the ipif cannot be looked up 7064 * due to an exclusive operation that is currently in progress. The restart 7065 * entry point is specified by 'func'. 7066 */ 7067 /* ARGSUSED4 */ 7068 int 7069 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 7070 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg, 7071 queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) 7072 { 7073 ire_t *ire = NULL; 7074 ipif_t *ipif; 7075 boolean_t ipif_refheld = B_FALSE; 7076 uint_t type; 7077 uint_t match_flags = MATCH_IRE_TYPE; 7078 int err = 0; 7079 7080 ip1dbg(("ip_rt_delete:")); 7081 /* 7082 * If this is the case of RTF_HOST being set, then we set the netmask 7083 * to all ones. Otherwise, we use the netmask if one was supplied. 7084 */ 7085 if (flags & RTF_HOST) { 7086 mask = IP_HOST_MASK; 7087 match_flags |= MATCH_IRE_MASK; 7088 } else if (rtm_addrs & RTA_NETMASK) { 7089 match_flags |= MATCH_IRE_MASK; 7090 } 7091 7092 /* 7093 * Note that RTF_GATEWAY is never set on a delete; therefore, 7094 * we check if the gateway address is one of our interfaces first, 7095 * and fall back on RTF_GATEWAY routes. 7096 * 7097 * This makes it possible to delete an original 7098 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
7099 * 7100 * As the interface index specified with the RTA_IFP sockaddr is the 7101 * same for all ipif's off of an ill, the matching logic below uses 7102 * MATCH_IRE_ILL if such an index was specified. This means a route 7103 * sharing the same prefix and interface index as the route 7104 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 7105 * is specified in the request. 7106 * 7107 * On the other hand, since the gateway address will usually be 7108 * different for each ipif on the system, the matching logic 7109 * uses MATCH_IRE_IPIF in the case of a traditional interface 7110 * route. This means that interface routes for the same prefix can be 7111 * uniquely identified if they belong to distinct ipif's and if a 7112 * RTA_IFP sockaddr is not present. 7113 * 7114 * For more detail on specifying routes by gateway address and by 7115 * interface index, see the comments in ip_rt_add(). 7116 */ 7117 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err, 7118 ipst); 7119 if (ipif != NULL) 7120 ipif_refheld = B_TRUE; 7121 else if (err == EINPROGRESS) 7122 return (err); 7123 else 7124 err = 0; 7125 if (ipif != NULL) { 7126 if (ipif_arg != NULL) { 7127 if (ipif_refheld) { 7128 ipif_refrele(ipif); 7129 ipif_refheld = B_FALSE; 7130 } 7131 ipif = ipif_arg; 7132 match_flags |= MATCH_IRE_ILL; 7133 } else { 7134 match_flags |= MATCH_IRE_IPIF; 7135 } 7136 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 7137 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 7138 ALL_ZONES, NULL, match_flags, ipst); 7139 } 7140 if (ire == NULL) { 7141 ire = ire_ftable_lookup(dst_addr, mask, 0, 7142 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, 7143 match_flags, ipst); 7144 } 7145 } 7146 7147 if (ire == NULL) { 7148 /* 7149 * At this point, the gateway address is not one of our own 7150 * addresses or a matching interface route was not found. We 7151 * set the IRE type to lookup based on whether 7152 * this is a host route, a default route or just a prefix. 7153 * 7154 * If an ipif_arg was passed in, then the lookup is based on an 7155 * interface index so MATCH_IRE_ILL is added to match_flags. 7156 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 7157 * set as the route being looked up is not a traditional 7158 * interface route. 7159 */ 7160 match_flags &= ~MATCH_IRE_IPIF; 7161 match_flags |= MATCH_IRE_GW; 7162 if (ipif_arg != NULL) 7163 match_flags |= MATCH_IRE_ILL; 7164 if (mask == IP_HOST_MASK) 7165 type = IRE_HOST; 7166 else if (mask == 0) 7167 type = IRE_DEFAULT; 7168 else 7169 type = IRE_PREFIX; 7170 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7171 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 7172 } 7173 7174 if (ipif_refheld) 7175 ipif_refrele(ipif); 7176 7177 /* ipif is not refheld anymore */ 7178 if (ire == NULL) 7179 return (ESRCH); 7180 7181 if (ire->ire_flags & RTF_MULTIRT) { 7182 /* 7183 * Invoke the CGTP (multirouting) filtering module 7184 * to remove the dst address from the filtering database. 7185 * Packets coming from that address will no longer be 7186 * filtered to remove duplicates.
7187 */ 7188 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 7189 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 7190 ipst->ips_netstack->netstack_stackid, 7191 ire->ire_addr, ire->ire_gateway_addr); 7192 } 7193 ip_cgtp_bcast_delete(ire, ipst); 7194 } 7195 7196 ipif = ire->ire_ipif; 7197 if (ipif != NULL) 7198 ipif_remove_ire(ipif, ire); 7199 if (ioctl_msg) 7200 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 7201 ire_delete(ire); 7202 ire_refrele(ire); 7203 return (err); 7204 } 7205 7206 /* 7207 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 7208 */ 7209 /* ARGSUSED */ 7210 int 7211 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7212 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7213 { 7214 ipaddr_t dst_addr; 7215 ipaddr_t gw_addr; 7216 ipaddr_t mask; 7217 int error = 0; 7218 mblk_t *mp1; 7219 struct rtentry *rt; 7220 ipif_t *ipif = NULL; 7221 ip_stack_t *ipst; 7222 7223 ASSERT(q->q_next == NULL); 7224 ipst = CONNQ_TO_IPST(q); 7225 7226 ip1dbg(("ip_siocaddrt:")); 7227 /* Existence of mp1 verified in ip_wput_nondata */ 7228 mp1 = mp->b_cont->b_cont; 7229 rt = (struct rtentry *)mp1->b_rptr; 7230 7231 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7232 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7233 7234 /* 7235 * If the RTF_HOST flag is on, this is a request to assign a gateway 7236 * to a particular host address. In this case, we set the netmask to 7237 * all ones for the particular destination address. Otherwise, 7238 * determine the netmask to be used based on dst_addr and the interfaces 7239 * in use. 7240 */ 7241 if (rt->rt_flags & RTF_HOST) { 7242 mask = IP_HOST_MASK; 7243 } else { 7244 /* 7245 * Note that ip_subnet_mask returns a zero mask in the case of 7246 * default (an all-zeroes address). 7247 */ 7248 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7249 } 7250 7251 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7252 B_TRUE, q, mp, ip_process_ioctl, NULL, ipst); 7253 if (ipif != NULL) 7254 ipif_refrele(ipif); 7255 return (error); 7256 } 7257 7258 /* 7259 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 7260 */ 7261 /* ARGSUSED */ 7262 int 7263 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7264 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7265 { 7266 ipaddr_t dst_addr; 7267 ipaddr_t gw_addr; 7268 ipaddr_t mask; 7269 int error; 7270 mblk_t *mp1; 7271 struct rtentry *rt; 7272 ipif_t *ipif = NULL; 7273 ip_stack_t *ipst; 7274 7275 ASSERT(q->q_next == NULL); 7276 ipst = CONNQ_TO_IPST(q); 7277 7278 ip1dbg(("ip_siocdelrt:")); 7279 /* Existence of mp1 verified in ip_wput_nondata */ 7280 mp1 = mp->b_cont->b_cont; 7281 rt = (struct rtentry *)mp1->b_rptr; 7282 7283 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7284 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7285 7286 /* 7287 * If the RTF_HOST flag is on, this is a request to delete a gateway 7288 * to a particular host address. In this case, we set the netmask to 7289 * all ones for the particular destination address. Otherwise, 7290 * determine the netmask to be used based on dst_addr and the interfaces 7291 * in use. 7292 */ 7293 if (rt->rt_flags & RTF_HOST) { 7294 mask = IP_HOST_MASK; 7295 } else { 7296 /* 7297 * Note that ip_subnet_mask returns a zero mask in the case of 7298 * default (an all-zeroes address). 
7299 */ 7300 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7301 } 7302 7303 error = ip_rt_delete(dst_addr, mask, gw_addr, 7304 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q, 7305 mp, ip_process_ioctl, ipst); 7306 if (ipif != NULL) 7307 ipif_refrele(ipif); 7308 return (error); 7309 } 7310 7311 /* 7312 * Enqueue the mp onto the ipsq, chained by b_next. 7313 * b_prev stores the function to be executed later, and b_queue the queue 7314 * where this mp originated. 7315 */ 7316 void 7317 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7318 ill_t *pending_ill) 7319 { 7320 conn_t *connp; 7321 ipxop_t *ipx = ipsq->ipsq_xop; 7322 7323 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7324 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 7325 ASSERT(func != NULL); 7326 7327 mp->b_queue = q; 7328 mp->b_prev = (void *)func; 7329 mp->b_next = NULL; 7330 7331 switch (type) { 7332 case CUR_OP: 7333 if (ipx->ipx_mptail != NULL) { 7334 ASSERT(ipx->ipx_mphead != NULL); 7335 ipx->ipx_mptail->b_next = mp; 7336 } else { 7337 ASSERT(ipx->ipx_mphead == NULL); 7338 ipx->ipx_mphead = mp; 7339 } 7340 ipx->ipx_mptail = mp; 7341 break; 7342 7343 case NEW_OP: 7344 if (ipsq->ipsq_xopq_mptail != NULL) { 7345 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7346 ipsq->ipsq_xopq_mptail->b_next = mp; 7347 } else { 7348 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7349 ipsq->ipsq_xopq_mphead = mp; 7350 } 7351 ipsq->ipsq_xopq_mptail = mp; 7352 ipx->ipx_ipsq_queued = B_TRUE; 7353 break; 7354 7355 case SWITCH_OP: 7356 ASSERT(ipsq->ipsq_swxop != NULL); 7357 /* only one switch operation is currently allowed */ 7358 ASSERT(ipsq->ipsq_switch_mp == NULL); 7359 ipsq->ipsq_switch_mp = mp; 7360 ipx->ipx_ipsq_queued = B_TRUE; 7361 break; 7362 default: 7363 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7364 } 7365 7366 if (CONN_Q(q) && pending_ill != NULL) { 7367 connp = Q_TO_CONN(q); 7368 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7369 connp->conn_oper_pending_ill = pending_ill; 7370 } 7371 } 7372 7373 /* 7374 * Dequeue the next message that requested exclusive access to this IPSQ's 7375 * xop. Specifically: 7376 * 7377 * 1. If we're still processing the current operation on `ipsq', then 7378 * dequeue the next message for the operation (from ipx_mphead), or 7379 * return NULL if there are no queued messages for the operation. 7380 * These messages are queued via CUR_OP to qwriter_ip() and friends. 7381 * 7382 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 7383 * not set) see if the ipsq has requested an xop switch. If so, switch 7384 * `ipsq' to a different xop. Xop switches only happen when joining or 7385 * leaving IPMP groups and require a careful dance -- see the comments 7386 * in-line below for details. If we're leaving a group xop or if we're 7387 * joining a group xop and become writer on it, then we proceed to (3). 7388 * Otherwise, we return NULL and exit the xop. 7389 * 7390 * 3. For each IPSQ in the xop, return any switch operation stored on 7391 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 7392 * any other messages queued on the IPSQ. Otherwise, dequeue the next 7393 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 7394 * Note that if the phyint tied to `ipsq' is not using IPMP there will 7395 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 7396 * each phyint in the group, including the IPMP meta-interface phyint. 
7397 */ 7398 static mblk_t * 7399 ipsq_dq(ipsq_t *ipsq) 7400 { 7401 ill_t *illv4, *illv6; 7402 mblk_t *mp; 7403 ipsq_t *xopipsq; 7404 ipsq_t *leftipsq = NULL; 7405 ipxop_t *ipx; 7406 phyint_t *phyi = ipsq->ipsq_phyint; 7407 ip_stack_t *ipst = ipsq->ipsq_ipst; 7408 boolean_t emptied = B_FALSE; 7409 7410 /* 7411 * Grab all the locks we need in the defined order (ill_g_lock -> 7412 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 7413 */ 7414 rw_enter(&ipst->ips_ill_g_lock, 7415 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER); 7416 mutex_enter(&ipsq->ipsq_lock); 7417 ipx = ipsq->ipsq_xop; 7418 mutex_enter(&ipx->ipx_lock); 7419 7420 /* 7421 * Dequeue the next message associated with the current exclusive 7422 * operation, if any. 7423 */ 7424 if ((mp = ipx->ipx_mphead) != NULL) { 7425 ipx->ipx_mphead = mp->b_next; 7426 if (ipx->ipx_mphead == NULL) 7427 ipx->ipx_mptail = NULL; 7428 mp->b_next = (void *)ipsq; 7429 goto out; 7430 } 7431 7432 if (ipx->ipx_current_ipif != NULL) 7433 goto empty; 7434 7435 if (ipsq->ipsq_swxop != NULL) { 7436 /* 7437 * The exclusive operation that is now being completed has 7438 * requested a switch to a different xop. This happens 7439 * when an interface joins or leaves an IPMP group. Joins 7440 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 7441 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 7442 * (phyint_free()), or interface plumb for an ill type 7443 * not in the IPMP group (ip_rput_dlpi_writer()). 7444 * 7445 * Xop switches are not allowed on the IPMP meta-interface. 7446 */ 7447 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 7448 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 7449 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 7450 7451 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 7452 /* 7453 * We're switching back to our own xop, so we have two 7454 * xop's to drain/exit: our own, and the group xop 7455 * that we are leaving. 7456 * 7457 * First, pull ourselves out of the group ipsq list. 7458 * This is safe since we're writer on ill_g_lock. 7459 */ 7460 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 7461 7462 xopipsq = ipx->ipx_ipsq; 7463 while (xopipsq->ipsq_next != ipsq) 7464 xopipsq = xopipsq->ipsq_next; 7465 7466 xopipsq->ipsq_next = ipsq->ipsq_next; 7467 ipsq->ipsq_next = ipsq; 7468 ipsq->ipsq_xop = ipsq->ipsq_swxop; 7469 ipsq->ipsq_swxop = NULL; 7470 7471 /* 7472 * Second, prepare to exit the group xop. The actual 7473 * ipsq_exit() is done at the end of this function 7474 * since we cannot hold any locks across ipsq_exit(). 7475 * Note that although we drop the group's ipx_lock, no 7476 * threads can proceed since we're still ipx_writer. 7477 */ 7478 leftipsq = xopipsq; 7479 mutex_exit(&ipx->ipx_lock); 7480 7481 /* 7482 * Third, set ipx to point to our own xop (which was 7483 * inactive and therefore can be entered). 7484 */ 7485 ipx = ipsq->ipsq_xop; 7486 mutex_enter(&ipx->ipx_lock); 7487 ASSERT(ipx->ipx_writer == NULL); 7488 ASSERT(ipx->ipx_current_ipif == NULL); 7489 } else { 7490 /* 7491 * We're switching from our own xop to a group xop. 7492 * The requestor of the switch must ensure that the 7493 * group xop cannot go away (e.g. by ensuring the 7494 * phyint associated with the xop cannot go away). 7495 * 7496 * If we can become writer on our new xop, then we'll 7497 * do the drain. Otherwise, the current writer of our 7498 * new xop will do the drain when it exits. 7499 * 7500 * First, splice ourselves into the group IPSQ list. 7501 * This is safe since we're writer on ill_g_lock. 
7502 */ 7503 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 7504 7505 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 7506 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 7507 xopipsq = xopipsq->ipsq_next; 7508 7509 xopipsq->ipsq_next = ipsq; 7510 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 7511 ipsq->ipsq_xop = ipsq->ipsq_swxop; 7512 ipsq->ipsq_swxop = NULL; 7513 7514 /* 7515 * Second, exit our own xop, since it's now unused. 7516 * This is safe since we've got the only reference. 7517 */ 7518 ASSERT(ipx->ipx_writer == curthread); 7519 ipx->ipx_writer = NULL; 7520 VERIFY(--ipx->ipx_reentry_cnt == 0); 7521 ipx->ipx_ipsq_queued = B_FALSE; 7522 mutex_exit(&ipx->ipx_lock); 7523 7524 /* 7525 * Third, set ipx to point to our new xop, and check 7526 * if we can become writer on it. If we cannot, then 7527 * the current writer will drain the IPSQ group when 7528 * it exits. Our ipsq_xop is guaranteed to be stable 7529 * because we're still holding ipsq_lock. 7530 */ 7531 ipx = ipsq->ipsq_xop; 7532 mutex_enter(&ipx->ipx_lock); 7533 if (ipx->ipx_writer != NULL || 7534 ipx->ipx_current_ipif != NULL) { 7535 goto out; 7536 } 7537 } 7538 7539 /* 7540 * Fourth, become writer on our new ipx before we continue 7541 * with the drain. Note that we never dropped ipsq_lock 7542 * above, so no other thread could've raced with us to 7543 * become writer first. Also, we're holding ipx_lock, so 7544 * no other thread can examine the ipx right now. 7545 */ 7546 ASSERT(ipx->ipx_current_ipif == NULL); 7547 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 7548 VERIFY(ipx->ipx_reentry_cnt++ == 0); 7549 ipx->ipx_writer = curthread; 7550 ipx->ipx_forced = B_FALSE; 7551 #ifdef DEBUG 7552 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7553 #endif 7554 } 7555 7556 xopipsq = ipsq; 7557 do { 7558 /* 7559 * So that other operations operate on a consistent and 7560 * complete phyint, a switch message on an IPSQ must be 7561 * handled prior to any other operations on that IPSQ. 7562 */ 7563 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 7564 xopipsq->ipsq_switch_mp = NULL; 7565 ASSERT(mp->b_next == NULL); 7566 mp->b_next = (void *)xopipsq; 7567 goto out; 7568 } 7569 7570 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 7571 xopipsq->ipsq_xopq_mphead = mp->b_next; 7572 if (xopipsq->ipsq_xopq_mphead == NULL) 7573 xopipsq->ipsq_xopq_mptail = NULL; 7574 mp->b_next = (void *)xopipsq; 7575 goto out; 7576 } 7577 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 7578 empty: 7579 /* 7580 * There are no messages. Further, we are holding ipx_lock, hence no 7581 * new messages can end up on any IPSQ in the xop. 7582 */ 7583 ipx->ipx_writer = NULL; 7584 ipx->ipx_forced = B_FALSE; 7585 VERIFY(--ipx->ipx_reentry_cnt == 0); 7586 ipx->ipx_ipsq_queued = B_FALSE; 7587 emptied = B_TRUE; 7588 #ifdef DEBUG 7589 ipx->ipx_depth = 0; 7590 #endif 7591 out: 7592 mutex_exit(&ipx->ipx_lock); 7593 mutex_exit(&ipsq->ipsq_lock); 7594 7595 /* 7596 * If we completely emptied the xop, then wake up any threads waiting 7597 * to enter any of the IPSQ's associated with it. 
7598 */ 7599 if (emptied) { 7600 xopipsq = ipsq; 7601 do { 7602 if ((phyi = xopipsq->ipsq_phyint) == NULL) 7603 continue; 7604 7605 illv4 = phyi->phyint_illv4; 7606 illv6 = phyi->phyint_illv6; 7607 7608 GRAB_ILL_LOCKS(illv4, illv6); 7609 if (illv4 != NULL) 7610 cv_broadcast(&illv4->ill_cv); 7611 if (illv6 != NULL) 7612 cv_broadcast(&illv6->ill_cv); 7613 RELEASE_ILL_LOCKS(illv4, illv6); 7614 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 7615 } 7616 rw_exit(&ipst->ips_ill_g_lock); 7617 7618 /* 7619 * Now that all locks are dropped, exit the IPSQ we left. 7620 */ 7621 if (leftipsq != NULL) 7622 ipsq_exit(leftipsq); 7623 7624 return (mp); 7625 }
7626 7627 /* 7628 * Return completion status of previously initiated DLPI operations on 7629 * ills in the purview of an ipsq. 7630 */ 7631 static boolean_t 7632 ipsq_dlpi_done(ipsq_t *ipsq) 7633 { 7634 ipsq_t *ipsq_start; 7635 phyint_t *phyi; 7636 ill_t *ill; 7637 7638 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock)); 7639 ipsq_start = ipsq; 7640 7641 do { 7642 /* 7643 * The only current users of this function are ipsq_try_enter 7644 * and ipsq_enter, which have made sure that ipsq_writer is 7645 * NULL before we reach here. ill_dlpi_pending is modified 7646 * only by an ipsq writer. 7647 */ 7648 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL); 7649 phyi = ipsq->ipsq_phyint; 7650 /* 7651 * phyi could be NULL if a phyint that is part of an 7652 * IPMP group is being unplumbed. A more detailed 7653 * comment is in ipmp_grp_update_kstats(). 7654 */ 7655 if (phyi != NULL) { 7656 ill = phyi->phyint_illv4; 7657 if (ill != NULL && 7658 ill->ill_dlpi_pending != DL_PRIM_INVAL) 7659 return (B_FALSE); 7660 7661 ill = phyi->phyint_illv6; 7662 if (ill != NULL && 7663 ill->ill_dlpi_pending != DL_PRIM_INVAL) 7664 return (B_FALSE); 7665 } 7666 7667 } while ((ipsq = ipsq->ipsq_next) != ipsq_start); 7668 7669 return (B_TRUE); 7670 }
7671 7672 /* 7673 * Enter the ipsq corresponding to ill by waiting synchronously until 7674 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7675 * will have to drain completely before ipsq_enter returns success. 7676 * ipx_current_ipif will be set if some exclusive op is in progress, 7677 * and the ipsq_exit logic will start the next enqueued op after 7678 * completion of the current op. If 'force' is used, we don't wait 7679 * for the enqueued ops. This is needed when a conn_close wants to 7680 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7681 * of an ill can also use this option. But we don't use it currently. 7682 */ 7683 #define ENTER_SQ_WAIT_TICKS 100 7684 boolean_t 7685 ipsq_enter(ill_t *ill, boolean_t force, int type) 7686 { 7687 ipsq_t *ipsq; 7688 ipxop_t *ipx; 7689 boolean_t waited_enough = B_FALSE; 7690 ip_stack_t *ipst = ill->ill_ipst; 7691 7692 /* 7693 * Note that the relationship between ill and ipsq is fixed as long as 7694 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 7695 * relationship between the IPSQ and xop cannot change. However, 7696 * since we cannot hold ipsq_lock across the cv_wait(), it may change 7697 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 7698 * waking up all ills in the xop when it becomes available.
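 *
 * Concretely, in the loop below: each iteration re-fetches
 * ill->ill_phyint->phyint_ipsq and its xop after waking up, so if an
 * xop switch moved this ill into (or out of) a group while we slept,
 * the entry condition is re-evaluated against the new xop rather than
 * a stale one.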
7699 */ 7700 for (;;) { 7701 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7702 mutex_enter(&ill->ill_lock); 7703 if (ill->ill_state_flags & ILL_CONDEMNED) { 7704 mutex_exit(&ill->ill_lock); 7705 rw_exit(&ipst->ips_ill_g_lock); 7706 return (B_FALSE); 7707 } 7708 7709 ipsq = ill->ill_phyint->phyint_ipsq; 7710 mutex_enter(&ipsq->ipsq_lock); 7711 ipx = ipsq->ipsq_xop; 7712 mutex_enter(&ipx->ipx_lock); 7713 7714 if (ipx->ipx_writer == NULL && (type == CUR_OP || 7715 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 7716 waited_enough)) 7717 break; 7718 7719 rw_exit(&ipst->ips_ill_g_lock); 7720 7721 if (!force || ipx->ipx_writer != NULL) { 7722 mutex_exit(&ipx->ipx_lock); 7723 mutex_exit(&ipsq->ipsq_lock); 7724 cv_wait(&ill->ill_cv, &ill->ill_lock); 7725 } else { 7726 mutex_exit(&ipx->ipx_lock); 7727 mutex_exit(&ipsq->ipsq_lock); 7728 (void) cv_timedwait(&ill->ill_cv, 7729 &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS); 7730 waited_enough = B_TRUE; 7731 } 7732 mutex_exit(&ill->ill_lock); 7733 } 7734 7735 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 7736 ASSERT(ipx->ipx_reentry_cnt == 0); 7737 ipx->ipx_writer = curthread; 7738 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 7739 ipx->ipx_reentry_cnt++; 7740 #ifdef DEBUG 7741 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7742 #endif 7743 mutex_exit(&ipx->ipx_lock); 7744 mutex_exit(&ipsq->ipsq_lock); 7745 mutex_exit(&ill->ill_lock); 7746 rw_exit(&ipst->ips_ill_g_lock); 7747 7748 return (B_TRUE); 7749 }
7750 7751 /* 7752 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 7753 * across the call to the core interface ipsq_try_enter() and hence calls this 7754 * function directly. This is explained more fully in ipif_set_values(). 7755 * In order to support the above constraint, ipsq_try_enter is implemented as 7756 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently. 7757 */ 7758 static ipsq_t * 7759 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 7760 int type, boolean_t reentry_ok) 7761 { 7762 ipsq_t *ipsq; 7763 ipxop_t *ipx; 7764 ip_stack_t *ipst = ill->ill_ipst; 7765 7766 /* 7767 * lock ordering: 7768 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 7769 * 7770 * ipx of an ipsq can't change when ipsq_lock is held. 7771 */ 7772 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 7773 GRAB_CONN_LOCK(q); 7774 mutex_enter(&ill->ill_lock); 7775 ipsq = ill->ill_phyint->phyint_ipsq; 7776 mutex_enter(&ipsq->ipsq_lock); 7777 ipx = ipsq->ipsq_xop; 7778 mutex_enter(&ipx->ipx_lock); 7779 7780 /* 7781 * 1. Enter the ipsq if we are already writer and reentry is ok. 7782 * (Note: If the caller does not specify reentry_ok then neither 7783 * 'func' nor any of its callees must ever attempt to enter the ipsq 7784 * again. Otherwise it can lead to an infinite loop.) 7785 * 2. Enter the ipsq if there is no current writer and this attempted 7786 * entry is part of the current operation. 7787 * 3. Enter the ipsq if there is no current writer and this is a new 7788 * operation and the operation queue is empty and there is no 7789 * operation currently in progress and if all previously initiated 7790 * DLPI operations have completed. 7791 */ 7792 if ((ipx->ipx_writer == curthread && reentry_ok) || 7793 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 7794 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 7795 ipsq_dlpi_done(ipsq))))) { 7796 /* Success.
*/ 7797 ipx->ipx_reentry_cnt++; 7798 ipx->ipx_writer = curthread; 7799 ipx->ipx_forced = B_FALSE; 7800 mutex_exit(&ipx->ipx_lock); 7801 mutex_exit(&ipsq->ipsq_lock); 7802 mutex_exit(&ill->ill_lock); 7803 RELEASE_CONN_LOCK(q); 7804 #ifdef DEBUG 7805 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7806 #endif 7807 return (ipsq); 7808 } 7809 7810 if (func != NULL) 7811 ipsq_enq(ipsq, q, mp, func, type, ill); 7812 7813 mutex_exit(&ipx->ipx_lock); 7814 mutex_exit(&ipsq->ipsq_lock); 7815 mutex_exit(&ill->ill_lock); 7816 RELEASE_CONN_LOCK(q); 7817 return (NULL); 7818 }
7819 7820 /* 7821 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7822 * certain critical operations like plumbing (i.e. most set ioctls), multicast 7823 * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq 7824 * serializes exclusive ioctls issued by applications on a per ipsq basis in 7825 * ipsq_xopq_mphead. It also protects against multiple threads executing in 7826 * the ipsq. Responses from the driver pertain to the current ioctl (say a 7827 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing 7828 * up the interface) and are enqueued in ipx_mphead. 7829 * 7830 * If a thread does not want to reenter the ipsq when it is already writer, 7831 * it must ensure that neither the specified reentry point, which will be 7832 * called later when the ipsq is empty, nor any code path starting from that 7833 * reentry point, ever tries to enter the ipsq again. Otherwise it can lead 7834 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 7835 * When the thread that is currently exclusive finishes, it (ipsq_exit) 7836 * dequeues the requests waiting to become exclusive in ipx_mphead and calls 7837 * the reentry point. When the list at ipx_mphead becomes empty, ipsq_exit 7838 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 7839 * ioctl if the current ioctl has completed. If the current ioctl is still 7840 * in progress it simply returns. The current ioctl could be waiting for 7841 * a response from another module (arp or the driver) or could be waiting for 7842 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp 7843 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the 7844 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 7845 * ipx_current_ipif is NULL, which happens only once the ioctl is complete and 7846 * all associated DLPI operations have completed. 7847 */
7848 7849 /* 7850 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' 7851 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ 7852 * on success, or NULL on failure. The caller ensures ipif/ill is valid by 7853 * refholding it as necessary. If the IPSQ cannot be entered and `func' is 7854 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ 7855 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
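 *
 * A typical caller pattern (a sketch only; error handling and refholds
 * are elided, and the exclusive work is caller-specific):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return (EINPROGRESS);	(mp is redone via ip_process_ioctl
 *					 once the IPSQ can be entered)
 *	...perform the exclusive operation...
 *	ipsq_exit(ipsq);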
*/ 7857 ipsq_t * 7858 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7859 ipsq_func_t func, int type, boolean_t reentry_ok) 7860 { 7861 ip_stack_t *ipst; 7862 ipsq_t *ipsq; 7863 7864 /* Only 1 of ipif or ill can be specified */ 7865 ASSERT((ipif != NULL) ^ (ill != NULL)); 7866 7867 if (ipif != NULL) 7868 ill = ipif->ipif_ill; 7869 ipst = ill->ill_ipst; 7870 7871 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7872 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 7873 rw_exit(&ipst->ips_ill_g_lock); 7874 7875 return (ipsq); 7876 }
7877 7878 /* 7879 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 7880 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 7881 * cannot be entered, the mp is queued for completion. 7882 */ 7883 void 7884 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7885 boolean_t reentry_ok) 7886 { 7887 ipsq_t *ipsq; 7888 7889 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 7890 7891 /* 7892 * Drop the caller's refhold on the ill. This is safe since we either 7893 * entered the IPSQ (and thus are exclusive), or failed to enter the 7894 * IPSQ, in which case we return without accessing ill anymore. This 7895 * is needed because func needs to see the correct refcount; 7896 * e.g., removeif can work only then. 7897 */ 7898 ill_refrele(ill); 7899 if (ipsq != NULL) { 7900 (*func)(ipsq, q, mp, NULL); 7901 ipsq_exit(ipsq); 7902 } 7903 }
7904 7905 /* 7906 * Exit the specified IPSQ. If this is the final exit on it then drain it 7907 * prior to exiting. Caller must be writer on the specified IPSQ. 7908 */ 7909 void 7910 ipsq_exit(ipsq_t *ipsq) 7911 { 7912 mblk_t *mp; 7913 ipsq_t *mp_ipsq; 7914 queue_t *q; 7915 phyint_t *phyi; 7916 ipsq_func_t func; 7917 7918 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7919 7920 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 7921 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 7922 ipsq->ipsq_xop->ipx_reentry_cnt--; 7923 return; 7924 } 7925 7926 for (;;) { 7927 phyi = ipsq->ipsq_phyint; 7928 mp = ipsq_dq(ipsq); 7929 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 7930 7931 /* 7932 * If we've changed to a new IPSQ, and the phyint associated 7933 * with the old one has gone away, free the old IPSQ. Note 7934 * that this cannot happen while the IPSQ is in a group. 7935 */ 7936 if (mp_ipsq != ipsq && phyi == NULL) { 7937 ASSERT(ipsq->ipsq_next == ipsq); 7938 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 7939 ipsq_delete(ipsq); 7940 } 7941 7942 if (mp == NULL) 7943 break; 7944 7945 q = mp->b_queue; 7946 func = (ipsq_func_t)mp->b_prev; 7947 ipsq = mp_ipsq; 7948 mp->b_next = mp->b_prev = NULL; 7949 mp->b_queue = NULL; 7950 7951 /* 7952 * If 'q' is a conn queue, it is valid, since we did a 7953 * refhold on the conn at the start of the ioctl. 7954 * If 'q' is an ill queue, it is valid, since close of an 7955 * ill will clean up its IPSQ. 7956 */ 7957 (*func)(ipsq, q, mp, NULL); 7958 } 7959 }
7960 7961 /* 7962 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 7963 * and `ioccmd'.
*/ 7965 void 7966 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 7967 { 7968 ill_t *ill = ipif->ipif_ill; 7969 ipxop_t *ipx = ipsq->ipsq_xop; 7970 7971 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7972 ASSERT(ipx->ipx_current_ipif == NULL); 7973 ASSERT(ipx->ipx_current_ioctl == 0); 7974 7975 ipx->ipx_current_done = B_FALSE; 7976 ipx->ipx_current_ioctl = ioccmd; 7977 mutex_enter(&ipx->ipx_lock); 7978 ipx->ipx_current_ipif = ipif; 7979 mutex_exit(&ipx->ipx_lock); 7980 7981 /* 7982 * Set IPIF_CHANGING on one or more ipifs associated with the 7983 * current exclusive operation. IPIF_CHANGING prevents any new 7984 * references to the ipif (so that the references will eventually 7985 * drop to zero) and also prevents any "get" operations (e.g., 7986 * SIOCGLIFFLAGS) from being able to access the ipif until the 7987 * operation has completed and the ipif is again in a stable state. 7988 * 7989 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 7990 * ioctl. For internal operations (where ioccmd is zero), all ipifs 7991 * on the ill are marked with IPIF_CHANGING since it's unclear which 7992 * ipifs will be affected. 7993 * 7994 * Note that SIOCLIFREMOVEIF is a special case as it sets 7995 * IPIF_CONDEMNED internally after identifying the right ipif to 7996 * operate on. 7997 */ 7998 switch (ioccmd) { 7999 case SIOCLIFREMOVEIF: 8000 break; 8001 case 0: 8002 mutex_enter(&ill->ill_lock); 8003 ipif = ipif->ipif_ill->ill_ipif; 8004 for (; ipif != NULL; ipif = ipif->ipif_next) 8005 ipif->ipif_state_flags |= IPIF_CHANGING; 8006 mutex_exit(&ill->ill_lock); 8007 break; 8008 default: 8009 mutex_enter(&ill->ill_lock); 8010 ipif->ipif_state_flags |= IPIF_CHANGING; 8011 mutex_exit(&ill->ill_lock); 8012 } 8013 }
8014 8015 /* 8016 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 8017 * the next exclusive operation to begin once we ipsq_exit(). However, if 8018 * pending DLPI operations remain, then we will wait for the queue to drain 8019 * before allowing the next exclusive operation to begin. This ensures that 8020 * DLPI operations from one exclusive operation are never improperly processed 8021 * as part of a subsequent exclusive operation. 8022 */ 8023 void 8024 ipsq_current_finish(ipsq_t *ipsq) 8025 { 8026 ipxop_t *ipx = ipsq->ipsq_xop; 8027 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 8028 ipif_t *ipif = ipx->ipx_current_ipif; 8029 8030 ASSERT(IAM_WRITER_IPSQ(ipsq)); 8031 8032 /* 8033 * For SIOCLIFREMOVEIF, the ipif has already been blown away 8034 * (but in that case, IPIF_CHANGING will already be clear and no 8035 * pending DLPI messages can remain). 8036 */ 8037 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 8038 ill_t *ill = ipif->ipif_ill; 8039 8040 mutex_enter(&ill->ill_lock); 8041 dlpi_pending = ill->ill_dlpi_pending; 8042 if (ipx->ipx_current_ioctl == 0) { 8043 ipif = ill->ill_ipif; 8044 for (; ipif != NULL; ipif = ipif->ipif_next) 8045 ipif->ipif_state_flags &= ~IPIF_CHANGING; 8046 } else { 8047 ipif->ipif_state_flags &= ~IPIF_CHANGING; 8048 } 8049 mutex_exit(&ill->ill_lock); 8050 } 8051 8052 ASSERT(!ipx->ipx_current_done); 8053 ipx->ipx_current_done = B_TRUE; 8054 ipx->ipx_current_ioctl = 0; 8055 if (dlpi_pending == DL_PRIM_INVAL) { 8056 mutex_enter(&ipx->ipx_lock); 8057 ipx->ipx_current_ipif = NULL; 8058 mutex_exit(&ipx->ipx_lock); 8059 } 8060 }
8061 8062 /* 8063 * The ill is closing. Flush all messages on the ipsq that originated 8064 * from this ill.
Usually there won't be any messages on the ipsq_xopq_mphead 8065 * for this ill since ipsq_enter could not have entered until then. 8066 * New messages can't be queued since the CONDEMNED flag is set. 8067 */ 8068 static void 8069 ipsq_flush(ill_t *ill) 8070 { 8071 queue_t *q; 8072 mblk_t *prev; 8073 mblk_t *mp; 8074 mblk_t *mp_next; 8075 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 8076 8077 ASSERT(IAM_WRITER_ILL(ill)); 8078 8079 /* 8080 * Flush any messages sent up by the driver. 8081 */ 8082 mutex_enter(&ipx->ipx_lock); 8083 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 8084 mp_next = mp->b_next; 8085 q = mp->b_queue; 8086 if (q == ill->ill_rq || q == ill->ill_wq) { 8087 /* dequeue mp */ 8088 if (prev == NULL) 8089 ipx->ipx_mphead = mp->b_next; 8090 else 8091 prev->b_next = mp->b_next; 8092 if (ipx->ipx_mptail == mp) { 8093 ASSERT(mp_next == NULL); 8094 ipx->ipx_mptail = prev; 8095 } 8096 inet_freemsg(mp); 8097 } else { 8098 prev = mp; 8099 } 8100 } 8101 mutex_exit(&ipx->ipx_lock); 8102 (void) ipsq_pending_mp_cleanup(ill, NULL); 8103 ipsq_xopq_mp_cleanup(ill, NULL); 8104 ill_pending_mp_cleanup(ill); 8105 }
8106 8107 /* 8108 * Parse an ifreq or lifreq struct coming down in an ioctl, and refhold 8109 * and return the associated ipif. 8110 * Return value: 8111 * Non-zero: An error has occurred; ci may not be filled out. 8112 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and 8113 * a held ipif in ci.ci_ipif. 8114 */ 8115 int 8116 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8117 cmd_info_t *ci, ipsq_func_t func) 8118 { 8119 char *name; 8120 struct ifreq *ifr; 8121 struct lifreq *lifr; 8122 ipif_t *ipif = NULL; 8123 ill_t *ill; 8124 conn_t *connp; 8125 boolean_t isv6; 8126 boolean_t exists; 8127 int err; 8128 mblk_t *mp1; 8129 zoneid_t zoneid; 8130 ip_stack_t *ipst; 8131 8132 if (q->q_next != NULL) { 8133 ill = (ill_t *)q->q_ptr; 8134 isv6 = ill->ill_isv6; 8135 connp = NULL; 8136 zoneid = ALL_ZONES; 8137 ipst = ill->ill_ipst; 8138 } else { 8139 ill = NULL; 8140 connp = Q_TO_CONN(q); 8141 isv6 = connp->conn_af_isv6; 8142 zoneid = connp->conn_zoneid; 8143 if (zoneid == GLOBAL_ZONEID) { 8144 /* global zone can access ipifs in all zones */ 8145 zoneid = ALL_ZONES; 8146 } 8147 ipst = connp->conn_netstack->netstack_ip; 8148 } 8149 8150 /* Has been checked in ip_wput_nondata */ 8151 mp1 = mp->b_cont->b_cont; 8152 8153 if (ipip->ipi_cmd_type == IF_CMD) { 8154 /* This is an old-style SIOC[GS]IF* command */ 8155 ifr = (struct ifreq *)mp1->b_rptr; 8156 /* 8157 * Null-terminate the string to protect against buffer 8158 * overrun. String was generated by user code and may not 8159 * be trusted. 8160 */ 8161 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 8162 name = ifr->ifr_name; 8163 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 8164 ci->ci_sin6 = NULL; 8165 ci->ci_lifr = (struct lifreq *)ifr; 8166 } else { 8167 /* This is a new-style SIOC[GS]LIF* command */ 8168 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 8169 lifr = (struct lifreq *)mp1->b_rptr; 8170 /* 8171 * Null-terminate the string to protect against buffer 8172 * overrun. String was generated by user code and may not 8173 * be trusted.
8174 */ 8175 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 8176 name = lifr->lifr_name; 8177 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 8178 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 8179 ci->ci_lifr = lifr; 8180 } 8181 8182 if (ipip->ipi_cmd == SIOCSLIFNAME) { 8183 /* 8184 * The ioctl will fail if it comes down 8185 * a conn stream. 8186 */ 8187 if (ill == NULL) { 8188 /* 8189 * Not an ill queue, so fail the ioctl 8190 * with ENXIO. 8191 */ 8192 return (ENXIO); 8193 } 8194 ipif = ill->ill_ipif; 8195 ipif_refhold(ipif); 8196 } else { 8197 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 8198 &exists, isv6, zoneid, 8199 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err, 8200 ipst); 8201 if (ipif == NULL) { 8202 if (err == EINPROGRESS) 8203 return (err); 8204 err = 0; /* Ensure we don't use it below */ 8205 } 8206 } 8207 8208 /* 8209 * Old-style [GS]IF* commands do not admit an IPv6 ipif 8210 */ 8211 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 8212 ipif_refrele(ipif); 8213 return (ENXIO); 8214 } 8215 8216 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 8217 name[0] == '\0') { 8218 /* 8219 * Handle a SIOC?IF* ioctl with a null name 8220 * during plumb (on the ill queue before the I_PLINK). 8221 */ 8222 ipif = ill->ill_ipif; 8223 ipif_refhold(ipif); 8224 } 8225 8226 if (ipif == NULL) 8227 return (ENXIO); 8228 8229 ci->ci_ipif = ipif; 8230 return (0); 8231 }
8232 8233 /* 8234 * Return the number of IPv4 ipifs visible to the given zone. 8235 */ 8236 static uint_t 8237 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 8238 { 8239 uint_t numifs = 0; 8240 ill_t *ill; 8241 ill_walk_context_t ctx; 8242 ipif_t *ipif; 8243 8244 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8245 ill = ILL_START_WALK_V4(&ctx, ipst); 8246 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8247 if (IS_UNDER_IPMP(ill)) 8248 continue; 8249 for (ipif = ill->ill_ipif; ipif != NULL; 8250 ipif = ipif->ipif_next) { 8251 if (ipif->ipif_zoneid == zoneid || 8252 ipif->ipif_zoneid == ALL_ZONES) 8253 numifs++; 8254 } 8255 } 8256 rw_exit(&ipst->ips_ill_g_lock); 8257 return (numifs); 8258 }
8259 8260 /* 8261 * Return the number of ipifs that match the given family, flags, and zone.
8262 */ 8263 static uint_t 8264 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 8265 { 8266 uint_t numifs = 0; 8267 ill_t *ill; 8268 ipif_t *ipif; 8269 ill_walk_context_t ctx; 8270 8271 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 8272 8273 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8274 if (family == AF_INET) 8275 ill = ILL_START_WALK_V4(&ctx, ipst); 8276 else if (family == AF_INET6) 8277 ill = ILL_START_WALK_V6(&ctx, ipst); 8278 else 8279 ill = ILL_START_WALK_ALL(&ctx, ipst); 8280 8281 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8282 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 8283 continue; 8284 8285 for (ipif = ill->ill_ipif; ipif != NULL; 8286 ipif = ipif->ipif_next) { 8287 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8288 !(lifn_flags & LIFC_NOXMIT)) 8289 continue; 8290 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8291 !(lifn_flags & LIFC_TEMPORARY)) 8292 continue; 8293 if (((ipif->ipif_flags & 8294 (IPIF_NOXMIT|IPIF_NOLOCAL| 8295 IPIF_DEPRECATED)) || 8296 IS_LOOPBACK(ill) || 8297 !(ipif->ipif_flags & IPIF_UP)) && 8298 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8299 continue; 8300 8301 if (zoneid != ipif->ipif_zoneid && 8302 ipif->ipif_zoneid != ALL_ZONES && 8303 (zoneid != GLOBAL_ZONEID || 8304 !(lifn_flags & LIFC_ALLZONES))) 8305 continue; 8306 8307 numifs++; 8308 } 8309 } 8310 rw_exit(&ipst->ips_ill_g_lock); 8311 return (numifs); 8312 }
8313 8314 uint_t 8315 ip_get_lifsrcofnum(ill_t *ill) 8316 { 8317 uint_t numifs = 0; 8318 ill_t *ill_head = ill; 8319 ip_stack_t *ipst = ill->ill_ipst; 8320 8321 /* 8322 * ill_g_usesrc_lock protects ill_usesrc_grp_next; for example, some 8323 * other thread may be trying to relink the ILLs in this usesrc group, 8324 * adjusting the ill_usesrc_grp_next pointers. 8325 */ 8326 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8327 if ((ill->ill_usesrc_ifindex == 0) && 8328 (ill->ill_usesrc_grp_next != NULL)) { 8329 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8330 ill = ill->ill_usesrc_grp_next) 8331 numifs++; 8332 } 8333 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8334 8335 return (numifs); 8336 }
8337 8338 /* Null values are passed in for ipif, sin, and ifreq */ 8339 /* ARGSUSED */ 8340 int 8341 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8342 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8343 { 8344 int *nump; 8345 conn_t *connp = Q_TO_CONN(q); 8346 8347 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8348 8349 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8350 nump = (int *)mp->b_cont->b_cont->b_rptr; 8351 8352 *nump = ip_get_numifs(connp->conn_zoneid, 8353 connp->conn_netstack->netstack_ip); 8354 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8355 return (0); 8356 }
8357 8358 /* Null values are passed in for ipif, sin, and ifreq */ 8359 /* ARGSUSED */ 8360 int 8361 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8362 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8363 { 8364 struct lifnum *lifn; 8365 mblk_t *mp1; 8366 conn_t *connp = Q_TO_CONN(q); 8367 8368 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8369 8370 /* Existence checked in ip_wput_nondata */ 8371 mp1 = mp->b_cont->b_cont; 8372 8373 lifn = (struct lifnum *)mp1->b_rptr; 8374 switch (lifn->lifn_family) { 8375 case AF_UNSPEC: 8376 case AF_INET: 8377 case AF_INET6: 8378 break; 8379 default: 8380 return (EAFNOSUPPORT); 8381 } 8382 8383 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family,
lifn->lifn_flags, 8384 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 8385 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8386 return (0); 8387 }
8388 8389 /* ARGSUSED */ 8390 int 8391 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8392 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8393 { 8394 STRUCT_HANDLE(ifconf, ifc); 8395 mblk_t *mp1; 8396 struct iocblk *iocp; 8397 struct ifreq *ifr; 8398 ill_walk_context_t ctx; 8399 ill_t *ill; 8400 ipif_t *ipif; 8401 struct sockaddr_in *sin; 8402 int32_t ifclen; 8403 zoneid_t zoneid; 8404 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8405 8406 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8407 8408 ip1dbg(("ip_sioctl_get_ifconf")); 8409 /* Existence verified in ip_wput_nondata */ 8410 mp1 = mp->b_cont->b_cont; 8411 iocp = (struct iocblk *)mp->b_rptr; 8412 zoneid = Q_TO_CONN(q)->conn_zoneid; 8413 8414 /* 8415 * The original SIOCGIFCONF passed in a struct ifconf which specified 8416 * the user buffer address and length into which the list of struct 8417 * ifreqs was to be copied. Since AT&T Streams does not seem to 8418 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8419 * the SIOCGIFCONF operation was redefined to simply provide 8420 * a large output buffer into which we are supposed to jam the ifreq 8421 * array. The same ioctl command code was used, despite the fact that 8422 * both the applications and the kernel code had to change, thus making 8423 * it impossible to support both interfaces. 8424 * 8425 * For reasons not good enough to try to explain, the following 8426 * algorithm is used for deciding what to do with one of these: 8427 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8428 * form with the output buffer coming down as the continuation message. 8429 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8430 * and we have to copy in the ifconf structure to find out how big the 8431 * output buffer is and where to copy out to. Sure no problem... 8432 * 8433 */ 8434 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8435 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8436 int numifs = 0; 8437 size_t ifc_bufsize; 8438 8439 /* 8440 * Must be (better be!) continuation of a TRANSPARENT 8441 * IOCTL. We just copied in the ifconf structure. 8442 */ 8443 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8444 (struct ifconf *)mp1->b_rptr); 8445 8446 /* 8447 * Allocate a buffer to hold requested information. 8448 * 8449 * If ifc_len is larger than what is needed, we only 8450 * allocate what we will use. 8451 * 8452 * If ifc_len is smaller than what is needed, return 8453 * EINVAL. 8454 * 8455 * XXX: the ill_t structure could have 2 counters, for 8456 * v4 and v6 (not just ill_ipif_up_count) to store the 8457 * number of interfaces for a device, so we don't need 8458 * to count them here...
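 *
 * To illustrate the sizing rule with assumed numbers: with 3 matching
 * ipifs, ifc_bufsize below is 3 * sizeof (struct ifreq); if the
 * caller's ifc_len is smaller than that, O_SIOCGIFCONF fails with
 * EINVAL (the old behaviour) while SIOCGIFCONF silently truncates
 * ifc_bufsize to ifc_len.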
8459 */ 8460 numifs = ip_get_numifs(zoneid, ipst); 8461 8462 ifclen = STRUCT_FGET(ifc, ifc_len); 8463 ifc_bufsize = numifs * sizeof (struct ifreq); 8464 if (ifc_bufsize > ifclen) { 8465 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8466 /* old behaviour */ 8467 return (EINVAL); 8468 } else { 8469 ifc_bufsize = ifclen; 8470 } 8471 } 8472 8473 mp1 = mi_copyout_alloc(q, mp, 8474 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8475 if (mp1 == NULL) 8476 return (ENOMEM); 8477 8478 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8479 } 8480 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8481 /* 8482 * the SIOCGIFCONF ioctl only knows about 8483 * IPv4 addresses, so don't try to tell 8484 * it about interfaces with IPv6-only 8485 * addresses. (Last parm 'isv6' is B_FALSE) 8486 */ 8487 8488 ifr = (struct ifreq *)mp1->b_rptr; 8489 8490 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8491 ill = ILL_START_WALK_V4(&ctx, ipst); 8492 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8493 if (IS_UNDER_IPMP(ill)) 8494 continue; 8495 for (ipif = ill->ill_ipif; ipif != NULL; 8496 ipif = ipif->ipif_next) { 8497 if (zoneid != ipif->ipif_zoneid && 8498 ipif->ipif_zoneid != ALL_ZONES) 8499 continue; 8500 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8501 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8502 /* old behaviour */ 8503 rw_exit(&ipst->ips_ill_g_lock); 8504 return (EINVAL); 8505 } else { 8506 goto if_copydone; 8507 } 8508 } 8509 ipif_get_name(ipif, ifr->ifr_name, 8510 sizeof (ifr->ifr_name)); 8511 sin = (sin_t *)&ifr->ifr_addr; 8512 *sin = sin_null; 8513 sin->sin_family = AF_INET; 8514 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8515 ifr++; 8516 } 8517 } 8518 if_copydone: 8519 rw_exit(&ipst->ips_ill_g_lock); 8520 mp1->b_wptr = (uchar_t *)ifr; 8521 8522 if (STRUCT_BUF(ifc) != NULL) { 8523 STRUCT_FSET(ifc, ifc_len, 8524 (int)((uchar_t *)ifr - mp1->b_rptr)); 8525 } 8526 return (0); 8527 }
8528 8529 /* 8530 * Get the interfaces using the address hosted on the interface passed in, 8531 * as a source address. 8532 */ 8533 /* ARGSUSED */ 8534 int 8535 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8536 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8537 { 8538 mblk_t *mp1; 8539 ill_t *ill, *ill_head; 8540 ipif_t *ipif, *orig_ipif; 8541 int numlifs = 0; 8542 size_t lifs_bufsize, lifsmaxlen; 8543 struct lifreq *lifr; 8544 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8545 uint_t ifindex; 8546 zoneid_t zoneid; 8547 int err = 0; 8548 boolean_t isv6 = B_FALSE; 8549 struct sockaddr_in *sin; 8550 struct sockaddr_in6 *sin6; 8551 STRUCT_HANDLE(lifsrcof, lifs); 8552 ip_stack_t *ipst; 8553 8554 ipst = CONNQ_TO_IPST(q); 8555 8556 ASSERT(q->q_next == NULL); 8557 8558 zoneid = Q_TO_CONN(q)->conn_zoneid; 8559 8560 /* Existence verified in ip_wput_nondata */ 8561 mp1 = mp->b_cont->b_cont; 8562 8563 /* 8564 * Must be (better be!) continuation of a TRANSPARENT 8565 * IOCTL. We just copied in the lifsrcof structure.
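 *
 * Layout of the request (summarizing the code below): lifs_ifindex
 * names the interface whose address is being used as a source;
 * lifs_maxlen is the size of the caller-supplied buffer; lifs_len is
 * always set to the size actually needed, so a caller whose buffer
 * was too small can retry with a larger one.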
8566 */ 8567 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8568 (struct lifsrcof *)mp1->b_rptr); 8569 8570 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8571 return (EINVAL); 8572 8573 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8574 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8575 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8576 ip_process_ioctl, &err, ipst); 8577 if (ipif == NULL) { 8578 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8579 ifindex)); 8580 return (err); 8581 } 8582 8583 /* Allocate a buffer to hold requested information */ 8584 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8585 lifs_bufsize = numlifs * sizeof (struct lifreq); 8586 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8587 /* The actual size needed is always returned in lifs_len */ 8588 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8589 8590 /* If the amount we need is more than what is passed in, abort */ 8591 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8592 ipif_refrele(ipif); 8593 return (0); 8594 } 8595 8596 mp1 = mi_copyout_alloc(q, mp, 8597 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8598 if (mp1 == NULL) { 8599 ipif_refrele(ipif); 8600 return (ENOMEM); 8601 } 8602 8603 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8604 bzero(mp1->b_rptr, lifs_bufsize); 8605 8606 lifr = (struct lifreq *)mp1->b_rptr; 8607 8608 ill = ill_head = ipif->ipif_ill; 8609 orig_ipif = ipif; 8610 8611 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8612 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8613 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8614 8615 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8616 for (; (ill != NULL) && (ill != ill_head); 8617 ill = ill->ill_usesrc_grp_next) { 8618 8619 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8620 break; 8621 8622 ipif = ill->ill_ipif; 8623 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 8624 if (ipif->ipif_isv6) { 8625 sin6 = (sin6_t *)&lifr->lifr_addr; 8626 *sin6 = sin6_null; 8627 sin6->sin6_family = AF_INET6; 8628 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8629 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8630 &ipif->ipif_v6net_mask); 8631 } else { 8632 sin = (sin_t *)&lifr->lifr_addr; 8633 *sin = sin_null; 8634 sin->sin_family = AF_INET; 8635 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8636 lifr->lifr_addrlen = ip_mask_to_plen( 8637 ipif->ipif_net_mask); 8638 } 8639 lifr++; 8640 } 8641 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8642 rw_exit(&ipst->ips_ill_g_lock); 8643 ipif_refrele(orig_ipif); 8644 mp1->b_wptr = (uchar_t *)lifr; 8645 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8646 8647 return (0); 8648 } 8649 8650 /* ARGSUSED */ 8651 int 8652 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8653 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8654 { 8655 mblk_t *mp1; 8656 int list; 8657 ill_t *ill; 8658 ipif_t *ipif; 8659 int flags; 8660 int numlifs = 0; 8661 size_t lifc_bufsize; 8662 struct lifreq *lifr; 8663 sa_family_t family; 8664 struct sockaddr_in *sin; 8665 struct sockaddr_in6 *sin6; 8666 ill_walk_context_t ctx; 8667 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8668 int32_t lifclen; 8669 zoneid_t zoneid; 8670 STRUCT_HANDLE(lifconf, lifc); 8671 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8672 8673 ip1dbg(("ip_sioctl_get_lifconf")); 8674 8675 ASSERT(q->q_next == NULL); 8676 8677 zoneid = Q_TO_CONN(q)->conn_zoneid; 8678 8679 /* Existence verified in ip_wput_nondata */ 8680 mp1 = mp->b_cont->b_cont; 8681 8682 /* 8683 * An extended version of SIOCGIFCONF that takes an 8684 * additional address 
family and flags field. 8685 * AF_UNSPEC retrieves both IPv4 and IPv6. 8686 * Unless LIFC_NOXMIT is specified, the IPIF_NOXMIT 8687 * interfaces are omitted. 8688 * Similarly, IPIF_TEMPORARY interfaces are omitted 8689 * unless LIFC_TEMPORARY is specified. 8690 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8691 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED, and 8692 * non-IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8693 * has priority over LIFC_NOXMIT. 8694 */ 8695 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8696 8697 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8698 return (EINVAL); 8699 8700 /* 8701 * Must be (better be!) continuation of a TRANSPARENT 8702 * IOCTL. We just copied in the lifconf structure. 8703 */ 8704 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8705 8706 family = STRUCT_FGET(lifc, lifc_family); 8707 flags = STRUCT_FGET(lifc, lifc_flags); 8708 8709 switch (family) { 8710 case AF_UNSPEC: 8711 /* 8712 * walk all ILL's. 8713 */ 8714 list = MAX_G_HEADS; 8715 break; 8716 case AF_INET: 8717 /* 8718 * walk only IPV4 ILL's. 8719 */ 8720 list = IP_V4_G_HEAD; 8721 break; 8722 case AF_INET6: 8723 /* 8724 * walk only IPV6 ILL's. 8725 */ 8726 list = IP_V6_G_HEAD; 8727 break; 8728 default: 8729 return (EAFNOSUPPORT); 8730 } 8731 8732 /* 8733 * Allocate a buffer to hold requested information. 8734 * 8735 * If lifc_len is larger than what is needed, we only 8736 * allocate what we will use. 8737 * 8738 * If lifc_len is smaller than what is needed, return 8739 * EINVAL. 8740 */ 8741 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 8742 lifc_bufsize = numlifs * sizeof (struct lifreq); 8743 lifclen = STRUCT_FGET(lifc, lifc_len); 8744 if (lifc_bufsize > lifclen) { 8745 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8746 return (EINVAL); 8747 else 8748 lifc_bufsize = lifclen; 8749 } 8750 8751 mp1 = mi_copyout_alloc(q, mp, 8752 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8753 if (mp1 == NULL) 8754 return (ENOMEM); 8755 8756 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8757 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8758 8759 lifr = (struct lifreq *)mp1->b_rptr; 8760 8761 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8762 ill = ill_first(list, list, &ctx, ipst); 8763 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8764 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 8765 continue; 8766 8767 for (ipif = ill->ill_ipif; ipif != NULL; 8768 ipif = ipif->ipif_next) { 8769 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8770 !(flags & LIFC_NOXMIT)) 8771 continue; 8772 8773 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8774 !(flags & LIFC_TEMPORARY)) 8775 continue; 8776 8777 if (((ipif->ipif_flags & 8778 (IPIF_NOXMIT|IPIF_NOLOCAL| 8779 IPIF_DEPRECATED)) || 8780 IS_LOOPBACK(ill) || 8781 !(ipif->ipif_flags & IPIF_UP)) && 8782 (flags & LIFC_EXTERNAL_SOURCE)) 8783 continue; 8784 8785 if (zoneid != ipif->ipif_zoneid && 8786 ipif->ipif_zoneid != ALL_ZONES && 8787 (zoneid != GLOBAL_ZONEID || 8788 !(flags & LIFC_ALLZONES))) 8789 continue; 8790 8791 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8792 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8793 rw_exit(&ipst->ips_ill_g_lock); 8794 return (EINVAL); 8795 } else { 8796 goto lif_copydone; 8797 } 8798 } 8799 8800 ipif_get_name(ipif, lifr->lifr_name, 8801 sizeof (lifr->lifr_name)); 8802 lifr->lifr_type = ill->ill_type; 8803 if (ipif->ipif_isv6) { 8804 sin6 = (sin6_t *)&lifr->lifr_addr; 8805 *sin6 = sin6_null; 8806 sin6->sin6_family = AF_INET6; 8807 sin6->sin6_addr = 8808 ipif->ipif_v6lcl_addr; 8809 lifr->lifr_addrlen =
8810 ip_mask_to_plen_v6( 8811 &ipif->ipif_v6net_mask); 8812 } else { 8813 sin = (sin_t *)&lifr->lifr_addr; 8814 *sin = sin_null; 8815 sin->sin_family = AF_INET; 8816 sin->sin_addr.s_addr = 8817 ipif->ipif_lcl_addr; 8818 lifr->lifr_addrlen = 8819 ip_mask_to_plen( 8820 ipif->ipif_net_mask); 8821 } 8822 lifr++; 8823 } 8824 } 8825 lif_copydone: 8826 rw_exit(&ipst->ips_ill_g_lock); 8827 8828 mp1->b_wptr = (uchar_t *)lifr; 8829 if (STRUCT_BUF(lifc) != NULL) { 8830 STRUCT_FSET(lifc, lifc_len, 8831 (int)((uchar_t *)lifr - mp1->b_rptr)); 8832 } 8833 return (0); 8834 }
8835 8836 static void 8837 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8838 { 8839 ip6_asp_t *table; 8840 size_t table_size; 8841 mblk_t *data_mp; 8842 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8843 ip_stack_t *ipst; 8844 8845 if (q->q_next == NULL) 8846 ipst = CONNQ_TO_IPST(q); 8847 else 8848 ipst = ILLQ_TO_IPST(q); 8849 8850 /* These two ioctls are I_STR only */ 8851 if (iocp->ioc_count == TRANSPARENT) { 8852 miocnak(q, mp, 0, EINVAL); 8853 return; 8854 } 8855 8856 data_mp = mp->b_cont; 8857 if (data_mp == NULL) { 8858 /* The user passed us a NULL argument */ 8859 table = NULL; 8860 table_size = iocp->ioc_count; 8861 } else { 8862 /* 8863 * The user provided a table. The stream head 8864 * may have copied in the user data in chunks, 8865 * so make sure everything is pulled up 8866 * properly. 8867 */ 8868 if (MBLKL(data_mp) < iocp->ioc_count) { 8869 mblk_t *new_data_mp; 8870 if ((new_data_mp = msgpullup(data_mp, -1)) == 8871 NULL) { 8872 miocnak(q, mp, 0, ENOMEM); 8873 return; 8874 } 8875 freemsg(data_mp); 8876 data_mp = new_data_mp; 8877 mp->b_cont = data_mp; 8878 } 8879 table = (ip6_asp_t *)data_mp->b_rptr; 8880 table_size = iocp->ioc_count; 8881 } 8882 8883 switch (iocp->ioc_cmd) { 8884 case SIOCGIP6ADDRPOLICY: 8885 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 8886 if (iocp->ioc_rval == -1) 8887 iocp->ioc_error = EINVAL; 8888 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8889 else if (table != NULL && 8890 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8891 ip6_asp_t *src = table; 8892 ip6_asp32_t *dst = (void *)table; 8893 int count = table_size / sizeof (ip6_asp_t); 8894 int i; 8895 8896 /* 8897 * We need to do an in-place shrink of the array 8898 * to match the alignment attributes of the 8899 * 32-bit ABI that is looking at it. 8900 */ 8901 /* LINTED: logical expression always true: op "||" */ 8902 ASSERT(sizeof (*src) > sizeof (*dst)); 8903 for (i = 1; i < count; i++) 8904 bcopy(src + i, dst + i, sizeof (*dst)); 8905 } 8906 #endif 8907 break; 8908 8909 case SIOCSIP6ADDRPOLICY: 8910 ASSERT(mp->b_prev == NULL); 8911 mp->b_prev = (void *)q; 8912 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8913 /* 8914 * We pass in the datamodel here so that the ip6_asp_replace() 8915 * routine can handle converting from 32-bit to native formats 8916 * where necessary. 8917 * 8918 * A better way to handle this might be to convert the inbound 8919 * data structure here, and hang it off a new 'mp'; thus the 8920 * ip6_asp_replace() logic would always be dealing with native 8921 * format data structures. 8922 * 8923 * (An even simpler way to handle these ioctls is to just 8924 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8925 * and recompile everything that depends on it.) 8926 */ 8927 #endif 8928 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 8929 iocp->ioc_flag & IOC_MODELS); 8930 return; 8931 } 8932 8933 DB_TYPE(mp) = (iocp->ioc_error == 0) ?
M_IOCACK : M_IOCNAK; 8934 qreply(q, mp); 8935 } 8936 8937 static void 8938 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8939 { 8940 mblk_t *data_mp; 8941 struct dstinforeq *dir; 8942 uint8_t *end, *cur; 8943 in6_addr_t *daddr, *saddr; 8944 ipaddr_t v4daddr; 8945 ire_t *ire; 8946 char *slabel, *dlabel; 8947 boolean_t isipv4; 8948 int match_ire; 8949 ill_t *dst_ill; 8950 ipif_t *src_ipif, *ire_ipif; 8951 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8952 zoneid_t zoneid; 8953 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8954 8955 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8956 zoneid = Q_TO_CONN(q)->conn_zoneid; 8957 8958 /* 8959 * This ioctl is I_STR only, and must have a 8960 * data mblk following the M_IOCTL mblk. 8961 */ 8962 data_mp = mp->b_cont; 8963 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8964 miocnak(q, mp, 0, EINVAL); 8965 return; 8966 } 8967 8968 if (MBLKL(data_mp) < iocp->ioc_count) { 8969 mblk_t *new_data_mp; 8970 8971 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8972 miocnak(q, mp, 0, ENOMEM); 8973 return; 8974 } 8975 freemsg(data_mp); 8976 data_mp = new_data_mp; 8977 mp->b_cont = data_mp; 8978 } 8979 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8980 8981 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8982 end - cur >= sizeof (struct dstinforeq); 8983 cur += sizeof (struct dstinforeq)) { 8984 dir = (struct dstinforeq *)cur; 8985 daddr = &dir->dir_daddr; 8986 saddr = &dir->dir_saddr; 8987 8988 /* 8989 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8990 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8991 * and ipif_select_source[_v6]() do not. 8992 */ 8993 dir->dir_dscope = ip_addr_scope_v6(daddr); 8994 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 8995 8996 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 8997 if (isipv4) { 8998 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 8999 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 9000 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9001 } else { 9002 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 9003 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9004 } 9005 if (ire == NULL) { 9006 dir->dir_dreachable = 0; 9007 9008 /* move on to next dst addr */ 9009 continue; 9010 } 9011 dir->dir_dreachable = 1; 9012 9013 ire_ipif = ire->ire_ipif; 9014 if (ire_ipif == NULL) 9015 goto next_dst; 9016 9017 /* 9018 * We expect to get back an interface ire or a 9019 * gateway ire cache entry. For both types, the 9020 * output interface is ire_ipif->ipif_ill. 9021 */ 9022 dst_ill = ire_ipif->ipif_ill; 9023 dir->dir_dmactype = dst_ill->ill_mactype; 9024 9025 if (isipv4) { 9026 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 9027 } else { 9028 src_ipif = ipif_select_source_v6(dst_ill, 9029 daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); 9030 } 9031 if (src_ipif == NULL) 9032 goto next_dst; 9033 9034 *saddr = src_ipif->ipif_v6lcl_addr; 9035 dir->dir_sscope = ip_addr_scope_v6(saddr); 9036 slabel = ip6_asp_lookup(saddr, NULL, ipst); 9037 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 9038 dir->dir_sdeprecated = 9039 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 9040 ipif_refrele(src_ipif); 9041 next_dst: 9042 ire_refrele(ire); 9043 } 9044 miocack(q, mp, iocp->ioc_count, 0); 9045 } 9046 9047 /* 9048 * Check if this is an address assigned to this machine. 9049 * Skips interfaces that are down by using ire checks. 
9050 * Translates mapped addresses to v4 addresses and then 9051 * treats them as such, returning true if the v4 address 9052 * associated with this mapped address is configured. 9053 * Note: Applications will have to be careful what they do 9054 * with the response; use of mapped addresses limits 9055 * what can be done with the socket, especially with 9056 * respect to socket options and ioctls - neither IPv4 9057 * options nor IPv6 sticky options/ancillary data options 9058 * may be used. 9059 */ 9060 /* ARGSUSED */ 9061 int 9062 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9063 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9064 { 9065 struct sioc_addrreq *sia; 9066 sin_t *sin; 9067 ire_t *ire; 9068 mblk_t *mp1; 9069 zoneid_t zoneid; 9070 ip_stack_t *ipst; 9071 9072 ip1dbg(("ip_sioctl_tmyaddr")); 9073 9074 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9075 zoneid = Q_TO_CONN(q)->conn_zoneid; 9076 ipst = CONNQ_TO_IPST(q); 9077 9078 /* Existence verified in ip_wput_nondata */ 9079 mp1 = mp->b_cont->b_cont; 9080 sia = (struct sioc_addrreq *)mp1->b_rptr; 9081 sin = (sin_t *)&sia->sa_addr; 9082 switch (sin->sin_family) { 9083 case AF_INET6: { 9084 sin6_t *sin6 = (sin6_t *)sin; 9085 9086 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9087 ipaddr_t v4_addr; 9088 9089 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9090 v4_addr); 9091 ire = ire_ctable_lookup(v4_addr, 0, 9092 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9093 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9094 } else { 9095 in6_addr_t v6addr; 9096 9097 v6addr = sin6->sin6_addr; 9098 ire = ire_ctable_lookup_v6(&v6addr, 0, 9099 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9100 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9101 } 9102 break; 9103 } 9104 case AF_INET: { 9105 ipaddr_t v4addr; 9106 9107 v4addr = sin->sin_addr.s_addr; 9108 ire = ire_ctable_lookup(v4addr, 0, 9109 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9110 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9111 break; 9112 } 9113 default: 9114 return (EAFNOSUPPORT); 9115 } 9116 if (ire != NULL) { 9117 sia->sa_res = 1; 9118 ire_refrele(ire); 9119 } else { 9120 sia->sa_res = 0; 9121 } 9122 return (0); 9123 }
9124 9125 /* 9126 * Check if this is an address assigned on-link, i.e., a neighbor, 9127 * and make sure it's reachable from the current zone. 9128 * Returns true for my addresses as well. 9129 * Translates mapped addresses to v4 addresses and then 9130 * treats them as such, returning true if the v4 address 9131 * associated with this mapped address is configured. 9132 * Note: Applications will have to be careful what they do 9133 * with the response; use of mapped addresses limits 9134 * what can be done with the socket, especially with 9135 * respect to socket options and ioctls - neither IPv4 9136 * options nor IPv6 sticky options/ancillary data options 9137 * may be used.
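 *
 * For example (illustrative addresses): with 192.0.2.0/24 configured
 * on an interface, a query for 192.0.2.7 matches the interface route
 * with a zero gateway and sa_res is set to 1; a destination reachable
 * only through a router fails the MATCH_IRE_GW lookup below and
 * sa_res is set to 0.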
9138 */ 9139 /* ARGSUSED */ 9140 int 9141 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9142 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9143 { 9144 struct sioc_addrreq *sia; 9145 sin_t *sin; 9146 mblk_t *mp1; 9147 ire_t *ire = NULL; 9148 zoneid_t zoneid; 9149 ip_stack_t *ipst; 9150 9151 ip1dbg(("ip_sioctl_tonlink")); 9152 9153 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9154 zoneid = Q_TO_CONN(q)->conn_zoneid; 9155 ipst = CONNQ_TO_IPST(q); 9156 9157 /* Existence verified in ip_wput_nondata */ 9158 mp1 = mp->b_cont->b_cont; 9159 sia = (struct sioc_addrreq *)mp1->b_rptr; 9160 sin = (sin_t *)&sia->sa_addr; 9161 9162 /* 9163 * Match addresses with a zero gateway field to avoid 9164 * routes going through a router. 9165 * Exclude broadcast and multicast addresses. 9166 */ 9167 switch (sin->sin_family) { 9168 case AF_INET6: { 9169 sin6_t *sin6 = (sin6_t *)sin; 9170 9171 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9172 ipaddr_t v4_addr; 9173 9174 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9175 v4_addr); 9176 if (!CLASSD(v4_addr)) { 9177 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9178 NULL, NULL, zoneid, NULL, 9179 MATCH_IRE_GW, ipst); 9180 } 9181 } else { 9182 in6_addr_t v6addr; 9183 in6_addr_t v6gw; 9184 9185 v6addr = sin6->sin6_addr; 9186 v6gw = ipv6_all_zeros; 9187 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9188 ire = ire_route_lookup_v6(&v6addr, 0, 9189 &v6gw, 0, NULL, NULL, zoneid, 9190 NULL, MATCH_IRE_GW, ipst); 9191 } 9192 } 9193 break; 9194 } 9195 case AF_INET: { 9196 ipaddr_t v4addr; 9197 9198 v4addr = sin->sin_addr.s_addr; 9199 if (!CLASSD(v4addr)) { 9200 ire = ire_route_lookup(v4addr, 0, 0, 0, 9201 NULL, NULL, zoneid, NULL, 9202 MATCH_IRE_GW, ipst); 9203 } 9204 break; 9205 } 9206 default: 9207 return (EAFNOSUPPORT); 9208 } 9209 sia->sa_res = 0; 9210 if (ire != NULL) { 9211 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9212 IRE_LOCAL|IRE_LOOPBACK)) { 9213 sia->sa_res = 1; 9214 } 9215 ire_refrele(ire); 9216 } 9217 return (0); 9218 }
9219 9220 /* 9221 * TBD: implement when kernel maintains a list of site prefixes. 9222 */ 9223 /* ARGSUSED */ 9224 int 9225 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9226 ip_ioctl_cmd_t *ipip, void *ifreq) 9227 { 9228 return (ENXIO); 9229 }
9230 9231 /* 9232 * ARP IOCTLs. 9233 * How does IP get in the business of fronting ARP configuration/queries? 9234 * Well, it's like this: the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9235 * are by tradition passed in through a datagram socket. That lands in IP. 9236 * As it happens, this is just as well since the interface is quite crude in 9237 * that it passes in no information about protocol or hardware types, or 9238 * interface association. After making the protocol assumption, IP is in 9239 * the position to look up the name of the ILL, which ARP will need, and 9240 * format a request that can be handled by ARP. The request is passed up 9241 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9242 * back a response. ARP supports its own set of more general IOCTLs, in 9243 * case anyone is interested.
9244 */ 9245 /* ARGSUSED */ 9246 int 9247 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9248 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9249 { 9250 mblk_t *mp1; 9251 mblk_t *mp2; 9252 mblk_t *pending_mp; 9253 ipaddr_t ipaddr; 9254 area_t *area; 9255 struct iocblk *iocp; 9256 conn_t *connp; 9257 struct arpreq *ar; 9258 struct xarpreq *xar; 9259 int flags, alength; 9260 uchar_t *lladdr; 9261 ire_t *ire; 9262 ip_stack_t *ipst; 9263 ill_t *ill = ipif->ipif_ill; 9264 ill_t *proxy_ill = NULL; 9265 ipmp_arpent_t *entp = NULL; 9266 boolean_t if_arp_ioctl = B_FALSE; 9267 boolean_t proxyarp = B_FALSE; 9268 9269 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9270 connp = Q_TO_CONN(q); 9271 ipst = connp->conn_netstack->netstack_ip; 9272 9273 if (ipip->ipi_cmd_type == XARP_CMD) { 9274 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9275 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9276 ar = NULL; 9277 9278 flags = xar->xarp_flags; 9279 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); 9280 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); 9281 /* 9282 * Validate the user's link-layer address length input 9283 * against the name and addr length limits. 9284 */ 9285 alength = ill->ill_phys_addr_length; 9286 if (ipip->ipi_cmd == SIOCSXARP) { 9287 if (alength != xar->xarp_ha.sdl_alen || 9288 (alength + xar->xarp_ha.sdl_nlen > 9289 sizeof (xar->xarp_ha.sdl_data))) 9290 return (EINVAL); 9291 } 9292 } else { 9293 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9294 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9295 xar = NULL; 9296 9297 flags = ar->arp_flags; 9298 lladdr = (uchar_t *)ar->arp_ha.sa_data; 9299 /* 9300 * Theoretically, the sa_family could tell us what link 9301 * layer type this operation is trying to deal with. By 9302 * common usage AF_UNSPEC means ethernet. We'll assume 9303 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9304 * for now. Our new SIOC*XARP ioctls can be used more 9305 * generally. 9306 * 9307 * If the underlying media happens to have a non-6-byte 9308 * address, the arp module will fail the set/get, but the 9309 * del operation will succeed. 9310 */ 9311 alength = 6; 9312 if ((ipip->ipi_cmd != SIOCDARP) && 9313 (alength != ill->ill_phys_addr_length)) { 9314 return (EINVAL); 9315 } 9316 } 9317 9318 ipaddr = sin->sin_addr.s_addr; 9319 9320 /* 9321 * IPMP ARP special handling: 9322 * 9323 * 1. Since ARP mappings must appear consistent across the group, 9324 * prohibit changing ARP mappings on the underlying interfaces. 9325 * 9326 * 2. Since ARP mappings for IPMP data addresses are maintained by 9327 * IP itself, prohibit changing them. 9328 * 9329 * 3. For proxy ARP, use a functioning hardware address in the group, 9330 * provided one exists. If one doesn't, just add the entry as-is; 9331 * ipmp_illgrp_refresh_arpent() will refresh it if things change.
9332 */ 9333 if (IS_UNDER_IPMP(ill)) { 9334 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) 9335 return (EPERM); 9336 } 9337 if (IS_IPMP(ill)) { 9338 ipmp_illgrp_t *illg = ill->ill_grp; 9339 9340 switch (ipip->ipi_cmd) { 9341 case SIOCSARP: 9342 case SIOCSXARP: 9343 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); 9344 if (proxy_ill != NULL) { 9345 proxyarp = B_TRUE; 9346 if (!ipmp_ill_is_active(proxy_ill)) 9347 proxy_ill = ipmp_illgrp_next_ill(illg); 9348 if (proxy_ill != NULL) 9349 lladdr = proxy_ill->ill_phys_addr; 9350 } 9351 /* FALLTHRU */ 9352 case SIOCDARP: 9353 case SIOCDXARP: 9354 ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, 9355 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 9356 if (ire != NULL) { 9357 ire_refrele(ire); 9358 return (EPERM); 9359 } 9360 } 9361 } 9362 9363 /* 9364 * We are going to pass up to ARP a packet chain that looks 9365 * like: 9366 * 9367 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9368 * 9369 * Get a copy of the original IOCTL mblk to head the chain, 9370 * to be sent up (in mp1). Also get another copy to store 9371 * in the ill_pending_mp list, for matching the response 9372 * when it comes back from ARP. 9373 */ 9374 mp1 = copyb(mp); 9375 pending_mp = copymsg(mp); 9376 if (mp1 == NULL || pending_mp == NULL) { 9377 if (mp1 != NULL) 9378 freeb(mp1); 9379 if (pending_mp != NULL) 9380 inet_freemsg(pending_mp); 9381 return (ENOMEM); 9382 } 9383 9384 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9385 (caddr_t)&ipaddr); 9386 if (mp2 == NULL) { 9387 freeb(mp1); 9388 inet_freemsg(pending_mp); 9389 return (ENOMEM); 9390 } 9391 /* Put together the chain. */ 9392 mp1->b_cont = mp2; 9393 mp1->b_datap->db_type = M_IOCTL; 9394 mp2->b_cont = mp; 9395 mp2->b_datap->db_type = M_DATA; 9396 9397 iocp = (struct iocblk *)mp1->b_rptr; 9398 9399 /* 9400 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9401 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9402 * cp_private field (or cp_rval on 32-bit systems) in place of the 9403 * ioc_count field; set ioc_count to be correct. 9404 */ 9405 iocp->ioc_count = MBLKL(mp1->b_cont); 9406 9407 /* 9408 * Set the proper command in the ARP message. 9409 * Convert the SIOC{G|S|D}ARP calls into our 9410 * AR_ENTRY_xxx calls. 9411 */ 9412 area = (area_t *)mp2->b_rptr; 9413 switch (iocp->ioc_cmd) { 9414 case SIOCDARP: 9415 case SIOCDXARP: 9416 /* 9417 * We defer deleting the corresponding IRE until 9418 * we return from arp. 9419 */ 9420 area->area_cmd = AR_ENTRY_DELETE; 9421 area->area_proto_mask_offset = 0; 9422 break; 9423 case SIOCGARP: 9424 case SIOCGXARP: 9425 area->area_cmd = AR_ENTRY_SQUERY; 9426 area->area_proto_mask_offset = 0; 9427 break; 9428 case SIOCSARP: 9429 case SIOCSXARP: 9430 /* 9431 * Delete the corresponding ire to make sure IP will 9432 * pick up any change from arp. 9433 */ 9434 if (!if_arp_ioctl) { 9435 (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst); 9436 } else { 9437 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9438 if (ipif != NULL) { 9439 (void) ip_ire_clookup_and_delete(ipaddr, ipif, 9440 ipst); 9441 ipif_refrele(ipif); 9442 } 9443 } 9444 break; 9445 } 9446 iocp->ioc_cmd = area->area_cmd; 9447 9448 /* 9449 * Fill in the rest of the ARP operation fields. 9450 */ 9451 area->area_hw_addr_length = alength; 9452 bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength); 9453 9454 /* Translate the flags. 
*/ 9455 if (flags & ATF_PERM) 9456 area->area_flags |= ACE_F_PERMANENT; 9457 if (flags & ATF_PUBL) 9458 area->area_flags |= ACE_F_PUBLISH; 9459 if (flags & ATF_AUTHORITY) 9460 area->area_flags |= ACE_F_AUTHORITY; 9461 9462 /* 9463 * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it 9464 * so that IP can update ARP as the active ills in the group change. 9465 */ 9466 if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && 9467 (area->area_flags & ACE_F_PERMANENT)) { 9468 entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); 9469 9470 /* 9471 * The second part of the conditional below handles a corner 9472 * case: if this is proxy ARP and the IPMP group has no active 9473 * interfaces, we can't send the request to ARP now since it 9474 * won't be able to build an ACE. So we return success and 9475 * notify ARP about the proxy ARP entry once an interface 9476 * becomes active. 9477 */ 9478 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 9479 mp2->b_cont = NULL; 9480 inet_freemsg(mp1); 9481 inet_freemsg(pending_mp); 9482 return (entp == NULL ? ENOMEM : 0); 9483 } 9484 } 9485 9486 /* 9487 * Before sending 'mp' to ARP, we have to clear the b_next 9488 * and b_prev. Otherwise if STREAMS encounters such a message 9489 * in freemsg() (because ARP can close any time) it can cause 9490 * a panic. But mi code needs the b_next and b_prev values of 9491 * mp->b_cont to complete the ioctl. So we store them here 9492 * in pending_mp->b_cont, and restore them in ip_sioctl_iocack() 9493 * when the response comes down from ARP. 9494 */ 9495 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9496 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9497 mp->b_cont->b_next = NULL; 9498 mp->b_cont->b_prev = NULL; 9499 9500 mutex_enter(&connp->conn_lock); 9501 mutex_enter(&ill->ill_lock); 9502 /* conn has not yet started closing, hence this can't fail */ 9503 if (ipip->ipi_flags & IPI_WR) { 9504 VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9505 pending_mp, 0) != 0); 9506 } else { 9507 VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); 9508 } 9509 mutex_exit(&ill->ill_lock); 9510 mutex_exit(&connp->conn_lock); 9511 9512 /* 9513 * Up to ARP it goes. The response will come back in ip_wput() as an 9514 * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. 9515 */ 9516 putnext(ill->ill_rq, mp1); 9517 9518 /* 9519 * If we created an IPMP ARP entry, mark that we've notified ARP. 9520 */ 9521 if (entp != NULL) 9522 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 9523 9524 return (EINPROGRESS); 9525 } 9526 9527 /* 9528 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 9529 * the associated sin, and refhold and return the associated ipif via `ci'.
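 *
 * For illustration, a hedged userland sketch of the XARP form this code
 * parses (it assumes an AF_INET datagram socket `s', and the interface
 * name is an assumption): a non-zero sdl_nlen selects lookup by interface
 * name rather than by IP address.
 *
 *	struct xarpreq xar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&xar.xarp_pa;
 *
 *	(void) memset(&xar, 0, sizeof (xar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.168.1.1");
 *	xar.xarp_ha.sdl_family = AF_LINK;
 *	(void) strlcpy(xar.xarp_ha.sdl_data, "hme0",
 *	    sizeof (xar.xarp_ha.sdl_data));
 *	xar.xarp_ha.sdl_nlen = strlen("hme0");
 *	(void) ioctl(s, SIOCGXARP, &xar);
 *
 * On success, SIOCGXARP places the hardware address in sdl_data just past
 * the interface name (see the ill_xarp_info() usage elsewhere in this
 * file).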
9530 */ 9531 int 9532 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 9533 cmd_info_t *ci, ipsq_func_t func) 9534 { 9535 mblk_t *mp1; 9536 int err; 9537 sin_t *sin; 9538 conn_t *connp; 9539 ipif_t *ipif; 9540 ire_t *ire = NULL; 9541 ill_t *ill = NULL; 9542 boolean_t exists; 9543 ip_stack_t *ipst; 9544 struct arpreq *ar; 9545 struct xarpreq *xar; 9546 struct sockaddr_dl *sdl; 9547 9548 /* ioctl comes down on a conn */ 9549 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9550 connp = Q_TO_CONN(q); 9551 if (connp->conn_af_isv6) 9552 return (ENXIO); 9553 9554 ipst = connp->conn_netstack->netstack_ip; 9555 9556 /* Verified in ip_wput_nondata */ 9557 mp1 = mp->b_cont->b_cont; 9558 9559 if (ipip->ipi_cmd_type == XARP_CMD) { 9560 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 9561 xar = (struct xarpreq *)mp1->b_rptr; 9562 sin = (sin_t *)&xar->xarp_pa; 9563 sdl = &xar->xarp_ha; 9564 9565 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 9566 return (ENXIO); 9567 if (sdl->sdl_nlen >= LIFNAMSIZ) 9568 return (EINVAL); 9569 } else { 9570 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 9571 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 9572 ar = (struct arpreq *)mp1->b_rptr; 9573 sin = (sin_t *)&ar->arp_pa; 9574 } 9575 9576 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 9577 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 9578 B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp), 9579 mp, func, &err, ipst); 9580 if (ipif == NULL) 9581 return (err); 9582 if (ipif->ipif_id != 0) { 9583 ipif_refrele(ipif); 9584 return (ENXIO); 9585 } 9586 } else { 9587 /* 9588 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 9589 * of 0: use the IP address to find the ipif. If the IP 9590 * address is an IPMP test address, ire_ftable_lookup() will 9591 * find the wrong ill, so we first do an ipif_lookup_addr(). 9592 */ 9593 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 9594 CONNP_TO_WQ(connp), mp, func, &err, ipst); 9595 if (ipif == NULL) { 9596 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, 9597 IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, 9598 MATCH_IRE_TYPE, ipst); 9599 if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { 9600 if (ire != NULL) 9601 ire_refrele(ire); 9602 return (ENXIO); 9603 } 9604 ipif = ill->ill_ipif; 9605 ipif_refhold(ipif); 9606 ire_refrele(ire); 9607 } 9608 } 9609 9610 if (ipif->ipif_net_type != IRE_IF_RESOLVER) { 9611 ipif_refrele(ipif); 9612 return (ENXIO); 9613 } 9614 9615 ci->ci_sin = sin; 9616 ci->ci_ipif = ipif; 9617 return (0); 9618 } 9619 9620 /* 9621 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 9622 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 9623 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 9624 * up and thus an ill can join that illgrp. 9625 * 9626 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 9627 * open()/close() primarily because close() is not allowed to fail or block 9628 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 9629 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 9630 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the 9631 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 9632 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 9633 * state if I_UNLINK didn't occur.
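 *
 * For reference, the persistent link itself is an ordinary STREAMS
 * operation; a hedged userland sketch (device paths are illustrative and
 * error handling is omitted):
 *
 *	#include <stropts.h>
 *	#include <fcntl.h>
 *
 *	int mux = open("/dev/udp", O_RDWR);
 *	int lower = open("/dev/ip", O_RDWR);
 *	int muxid = ioctl(mux, I_PLINK, lower);
 *
 * The lower stream then persists after both descriptors are closed, until
 * an ioctl(mux, I_PUNLINK, muxid) dismantles it.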
9634 * 9635 * Note that for each plumb/unplumb operation, we may end up here more than 9636 * once because of the way ifconfig works. However, it's OK to link the same 9637 * illgrp more than once, or unlink an illgrp that's already unlinked. 9638 */ 9639 static int 9640 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 9641 { 9642 int err; 9643 ip_stack_t *ipst = ill->ill_ipst; 9644 9645 ASSERT(IS_IPMP(ill)); 9646 ASSERT(IAM_WRITER_ILL(ill)); 9647 9648 switch (ioccmd) { 9649 case I_LINK: 9650 return (ENOTSUP); 9651 9652 case I_PLINK: 9653 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 9654 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 9655 rw_exit(&ipst->ips_ipmp_lock); 9656 break; 9657 9658 case I_PUNLINK: 9659 /* 9660 * Require all UP ipifs to be brought down prior to unlinking 9661 * the illgrp so any associated IREs (and other state) are 9662 * torched. 9663 */ 9663 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 9664 return (EBUSY); 9665 9666 /* 9667 * NOTE: We hold ipmp_lock across the unlink to prevent a race 9668 * with an SIOCSLIFGROUPNAME request from an ill trying to 9669 * join this group. Specifically: ills trying to join grab 9670 * ipmp_lock and bump a "pending join" counter checked by 9671 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 9672 * joins can occur (since we have ipmp_lock). Once we drop 9673 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 9674 * find the illgrp (since we unlinked it) and will return 9675 * EAFNOSUPPORT. This will then take them back through the 9676 * IPMP meta-interface plumbing logic in ifconfig, and thus 9677 * back through I_PLINK above. 9678 */ 9679 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 9680 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 9681 rw_exit(&ipst->ips_ipmp_lock); 9682 return (err); 9683 default: 9684 break; 9685 } 9686 return (0); 9687 } 9688 9689 /* 9690 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9691 * atomically set/clear the muxids. Also complete the ioctl by acking or 9692 * naking it. Note that the code is structured such that the link type, 9693 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9694 * its clones use the persistent link, while pppd(1M) and perhaps many 9695 * other daemons may use a non-persistent link. When combined with some 9696 * ill_t states, linking and unlinking lower streams may be used as 9697 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9698 */ 9699 /* ARGSUSED */ 9700 void 9701 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9702 { 9703 mblk_t *mp1, *mp2; 9704 struct linkblk *li; 9705 struct ipmx_s *ipmxp; 9706 ill_t *ill; 9707 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 9708 int err = 0; 9709 boolean_t entered_ipsq = B_FALSE; 9710 boolean_t islink; 9711 ip_stack_t *ipst; 9712 9713 if (CONN_Q(q)) 9714 ipst = CONNQ_TO_IPST(q); 9715 else 9716 ipst = ILLQ_TO_IPST(q); 9717 9718 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 9719 ioccmd == I_LINK || ioccmd == I_UNLINK); 9720 9721 islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9722 9723 mp1 = mp->b_cont; /* This is the linkblk info */ 9724 li = (struct linkblk *)mp1->b_rptr; 9725 9726 /* 9727 * ARP has added this special mblk, and the utility is asking us 9728 * to perform consistency checks, and also atomically set the 9729 * muxid. Ifconfig is an example.
It achieves this by using 9730 * /dev/arp as the mux to plink the arp stream, and by pushing arp 9731 * onto the /dev/udp[6] stream for use as the mux when plinking the IP 9732 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9733 * and other comments in this routine for more details. 9734 */ 9735 mp2 = mp1->b_cont; /* This is added by ARP */ 9736 9737 /* 9738 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9739 * ifconfig which didn't push ARP on top of the dummy mux, we won't 9740 * get the special mblk above. For backward compatibility, we 9741 * request ip_sioctl_plink_ipmod() to skip the consistency checks. 9742 * The utility will use SIOCSLIFMUXID to store the muxids. This is 9743 * not atomic, and can leave the streams unplumbable if the utility 9744 * is interrupted before it does the SIOCSLIFMUXID. 9745 */ 9746 if (mp2 == NULL) { 9747 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE); 9748 if (err == EINPROGRESS) 9749 return; 9750 goto done; 9751 } 9752 9753 /* 9754 * This is an I_{P}LINK sent down by ifconfig through the ARP module; 9755 * ARP has appended this last mblk to tell us whether the lower stream 9756 * is an arp-dev stream or an IP module stream. 9757 */ 9758 ipmxp = (struct ipmx_s *)mp2->b_rptr; 9759 if (ipmxp->ipmx_arpdev_stream) { 9760 /* 9761 * The lower stream is the arp-dev stream. 9762 */ 9763 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 9764 q, mp, ip_sioctl_plink, &err, NULL, ipst); 9765 if (ill == NULL) { 9766 if (err == EINPROGRESS) 9767 return; 9768 err = EINVAL; 9769 goto done; 9770 } 9771 9772 if (ipsq == NULL) { 9773 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9774 NEW_OP, B_FALSE); 9775 if (ipsq == NULL) { 9776 ill_refrele(ill); 9777 return; 9778 } 9779 entered_ipsq = B_TRUE; 9780 } 9781 ASSERT(IAM_WRITER_ILL(ill)); 9782 ill_refrele(ill); 9783 9784 /* 9785 * To ensure consistency between IP and ARP, the following 9786 * LIFO scheme is used in plink/punlink. (IP first, ARP last). 9787 * This is because the muxids are stored in the IP stream on 9788 * the ill. 9789 * 9790 * I_{P}LINK: ifconfig plinks the IP stream before plinking 9791 * the ARP stream. On an arp-dev stream, IP checks that it is 9792 * not yet plinked, and it also checks that the corresponding 9793 * IP stream is already plinked. 9794 * 9795 * I_{P}UNLINK: ifconfig punlinks the ARP stream before 9796 * punlinking the IP stream. IP does not allow punlink of the 9797 * IP stream unless the arp stream has been punlinked. 9798 */ 9799 if ((islink && 9800 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 9801 (!islink && ill->ill_arp_muxid != li->l_index)) { 9802 err = EINVAL; 9803 goto done; 9804 } 9805 9806 if (IS_IPMP(ill) && 9807 (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 9808 goto done; 9809 9810 ill->ill_arp_muxid = islink ? li->l_index : 0; 9811 } else { 9812 /* 9813 * The lower stream is probably an IP module stream. Do 9814 * consistency checking.
9815 */ 9816 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE); 9817 if (err == EINPROGRESS) 9818 return; 9819 } 9820 done: 9821 if (err == 0) 9822 miocack(q, mp, 0, 0); 9823 else 9824 miocnak(q, mp, 0, err); 9825 9826 /* Conn was refheld in ip_sioctl_copyin_setup */ 9827 if (CONN_Q(q)) 9828 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 9829 if (entered_ipsq) 9830 ipsq_exit(ipsq); 9831 } 9832 9833 /* 9834 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 9835 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP 9836 * module stream). If `doconsist' is set, then do the extended consistency 9837 * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here. 9838 * Returns zero on success, EINPROGRESS if the operation is still pending, or 9839 * an error code on failure. 9840 */ 9841 static int 9842 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 9843 struct linkblk *li, boolean_t doconsist) 9844 { 9845 int err = 0; 9846 ill_t *ill; 9847 queue_t *ipwq, *dwq; 9848 const char *name; 9849 struct qinit *qinfo; 9850 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9851 boolean_t entered_ipsq = B_FALSE; 9852 9853 /* 9854 * Walk the lower stream to verify it's the IP module stream. 9855 * The IP module is identified by its name, wput function, 9856 * and non-NULL q_next. STREAMS ensures that the lower stream 9857 * (li->l_qbot) will not vanish until this ioctl completes. 9858 */ 9859 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 9860 qinfo = ipwq->q_qinfo; 9861 name = qinfo->qi_minfo->mi_idname; 9862 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 9863 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 9864 break; 9865 } 9866 } 9867 9868 /* 9869 * If this isn't an IP module stream, bail. 9870 */ 9871 if (ipwq == NULL) 9872 return (0); 9873 9874 ill = ipwq->q_ptr; 9875 ASSERT(ill != NULL); 9876 9877 if (ipsq == NULL) { 9878 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9879 NEW_OP, B_FALSE); 9880 if (ipsq == NULL) 9881 return (EINPROGRESS); 9882 entered_ipsq = B_TRUE; 9883 } 9884 ASSERT(IAM_WRITER_ILL(ill)); 9885 9886 if (doconsist) { 9887 /* 9888 * Consistency checking requires that I_{P}LINK occurs 9889 * prior to setting ill_ip_muxid, and that I_{P}UNLINK 9890 * occurs prior to clearing ill_arp_muxid. 9891 */ 9892 if ((islink && ill->ill_ip_muxid != 0) || 9893 (!islink && ill->ill_arp_muxid != 0)) { 9894 err = EINVAL; 9895 goto done; 9896 } 9897 } 9898 9899 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 9900 goto done; 9901 9902 /* 9903 * As part of I_{P}LINKing, stash the number of downstream modules and 9904 * the read queue of the module immediately below IP in the ill. 9905 * These are used during the capability negotiation below. 9906 */ 9907 ill->ill_lmod_rq = NULL; 9908 ill->ill_lmod_cnt = 0; 9909 if (islink && ((dwq = ipwq->q_next) != NULL)) { 9910 ill->ill_lmod_rq = RD(dwq); 9911 for (; dwq != NULL; dwq = dwq->q_next) 9912 ill->ill_lmod_cnt++; 9913 } 9914 9915 if (doconsist) 9916 ill->ill_ip_muxid = islink ? li->l_index : 0; 9917 9918 /* 9919 * Mark the ipsq busy until the capability operations initiated below 9920 * complete. The PLINK/UNLINK ioctl itself completes when our caller 9921 * returns, but the capability operation may complete asynchronously 9922 * much later. 
9923 */ 9924 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 9925 /* 9926 * If there's at least one up ipif on this ill, then we're bound to 9927 * the underlying driver via DLPI. In that case, renegotiate 9928 * capabilities to account for any possible change in modules 9929 * interposed between IP and the driver. 9930 */ 9931 if (ill->ill_ipif_up_count > 0) { 9932 if (islink) 9933 ill_capability_probe(ill); 9934 else 9935 ill_capability_reset(ill, B_FALSE); 9936 } 9937 ipsq_current_finish(ipsq); 9938 done: 9939 if (entered_ipsq) 9940 ipsq_exit(ipsq); 9941 9942 return (err); 9943 } 9944 9945 /* 9946 * Search the ioctl command in the ioctl tables and return a pointer 9947 * to the ioctl command information. The ioctl command tables are 9948 * static and fully populated at compile time. 9949 */ 9950 ip_ioctl_cmd_t * 9951 ip_sioctl_lookup(int ioc_cmd) 9952 { 9953 int index; 9954 ip_ioctl_cmd_t *ipip; 9955 ip_ioctl_cmd_t *ipip_end; 9956 9957 if (ioc_cmd == IPI_DONTCARE) 9958 return (NULL); 9959 9960 /* 9961 * Do a 2-step search. First search the indexed table 9962 * based on the least significant byte of the ioctl cmd. 9963 * If we don't find a match, then search the misc table 9964 * serially. 9965 */ 9966 index = ioc_cmd & 0xFF; 9967 if (index < ip_ndx_ioctl_count) { 9968 ipip = &ip_ndx_ioctl_table[index]; 9969 if (ipip->ipi_cmd == ioc_cmd) { 9970 /* Found a match in the ndx table */ 9971 return (ipip); 9972 } 9973 } 9974 9975 /* Search the misc table */ 9976 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 9977 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 9978 if (ipip->ipi_cmd == ioc_cmd) 9979 /* Found a match in the misc table */ 9980 return (ipip); 9981 } 9982 9983 return (NULL); 9984 } 9985 9986 /* 9987 * Wrapper function for resuming deferred ioctl processing. 9988 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 9989 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 9990 */ 9991 /* ARGSUSED */ 9992 void 9993 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 9994 void *dummy_arg) 9995 { 9996 ip_sioctl_copyin_setup(q, mp); 9997 } 9998 9999 /* 10000 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 10001 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 10002 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 10003 * We establish here the size of the block to be copied in. mi_copyin 10004 * arranges for this to happen, and processing continues in ip_wput with 10005 * an M_IOCDATA message. 10006 */ 10007 void 10008 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 10009 { 10010 int copyin_size; 10011 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10012 ip_ioctl_cmd_t *ipip; 10013 cred_t *cr; 10014 ip_stack_t *ipst; 10015 10016 if (CONN_Q(q)) 10017 ipst = CONNQ_TO_IPST(q); 10018 else 10019 ipst = ILLQ_TO_IPST(q); 10020 10021 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 10022 if (ipip == NULL) { 10023 /* 10024 * The ioctl is not one we understand or own. 10025 * Pass it along to be processed down stream, 10026 * if this is a module instance of IP, else nak 10027 * the ioctl. 10028 */ 10029 if (q->q_next == NULL) { 10030 goto nak; 10031 } else { 10032 putnext(q, mp); 10033 return; 10034 } 10035 } 10036 10037 /* 10038 * If this is deferred, then we will do all the checks when we 10039 * come back.
10040 */ 10041 if ((iocp->ioc_cmd == SIOCGDSTINFO || 10042 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 10043 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 10044 return; 10045 } 10046 10047 /* 10048 * Only allow a very small subset of IP ioctls on this stream if 10049 * IP is a module and not a driver. Allowing ioctls to be processed 10050 * in this case may cause assert failures or data corruption. 10051 * Typically G[L]IFFLAGS and SLIFNAME/IF_UNITSEL are among the few 10052 * ioctls allowed on an IP module stream, after which this stream 10053 * normally becomes a multiplexor (at which time the stream head 10054 * will fail all ioctls). 10055 */ 10056 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 10057 if (ipip->ipi_flags & IPI_PASS_DOWN) { 10058 /* 10059 * Pass common Streams ioctls which the IP 10060 * module does not own or consume along to 10061 * be processed down stream. 10062 */ 10063 putnext(q, mp); 10064 return; 10065 } else { 10066 goto nak; 10067 } 10068 } 10069 10070 /* Make sure we have ioctl data to process. */ 10071 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 10072 goto nak; 10073 10074 /* 10075 * Prefer dblk credential over ioctl credential; some synthesized 10076 * ioctls have kcred set because there's no way to crhold() 10077 * a credential in some contexts. (ioc_cr is not crfree()d by 10078 * the framework; the caller of ioctl needs to hold the reference 10079 * for the duration of the call). 10080 */ 10081 cr = msg_getcred(mp, NULL); 10082 if (cr == NULL) 10083 cr = iocp->ioc_cr; 10084 10085 /* Make sure normal users don't send down privileged ioctls */ 10086 if ((ipip->ipi_flags & IPI_PRIV) && 10087 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 10088 /* We checked the privilege earlier but log it here */ 10089 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 10090 return; 10091 } 10092 10093 /* 10094 * The ioctl command tables can only encode fixed length 10095 * ioctl data. If the length is variable, the table will 10096 * encode the length as zero. Such special cases are handled 10097 * below in the switch. 10098 */ 10099 if (ipip->ipi_copyin_size != 0) { 10100 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 10101 return; 10102 } 10103 10104 switch (iocp->ioc_cmd) { 10105 case O_SIOCGIFCONF: 10106 case SIOCGIFCONF: 10107 /* 10108 * This IOCTL is hilarious. See comments in 10109 * ip_sioctl_get_ifconf for the story.
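 *
 * For context, the classic userland pattern whose copyin sizing quirks
 * are being accommodated looks roughly like this (a sketch only, not
 * authoritative; `s' is an AF_INET datagram socket, and the stdio and
 * socket headers are assumed):
 *
 *	struct ifconf ifc;
 *	struct ifreq *ifr;
 *	char buf[8192];
 *	int n;
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, (caddr_t)&ifc) == 0) {
 *		n = ifc.ifc_len / sizeof (struct ifreq);
 *		for (ifr = ifc.ifc_req; n > 0; n--, ifr++)
 *			(void) puts(ifr->ifr_name);
 *	}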
10110 */ 10111 if (iocp->ioc_count == TRANSPARENT) 10112 copyin_size = SIZEOF_STRUCT(ifconf, 10113 iocp->ioc_flag); 10114 else 10115 copyin_size = iocp->ioc_count; 10116 mi_copyin(q, mp, NULL, copyin_size); 10117 return; 10118 10119 case O_SIOCGLIFCONF: 10120 case SIOCGLIFCONF: 10121 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 10122 mi_copyin(q, mp, NULL, copyin_size); 10123 return; 10124 10125 case SIOCGLIFSRCOF: 10126 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 10127 mi_copyin(q, mp, NULL, copyin_size); 10128 return; 10129 case SIOCGIP6ADDRPOLICY: 10130 ip_sioctl_ip6addrpolicy(q, mp); 10131 ip6_asp_table_refrele(ipst); 10132 return; 10133 10134 case SIOCSIP6ADDRPOLICY: 10135 ip_sioctl_ip6addrpolicy(q, mp); 10136 return; 10137 10138 case SIOCGDSTINFO: 10139 ip_sioctl_dstinfo(q, mp); 10140 ip6_asp_table_refrele(ipst); 10141 return; 10142 10143 case I_PLINK: 10144 case I_PUNLINK: 10145 case I_LINK: 10146 case I_UNLINK: 10147 /* 10148 * We treat the non-persistent link similarly to the persistent 10149 * link case, in terms of plumbing/unplumbing, as well as the 10150 * dynamic re-plumbing events indicator. See comments 10151 * in ip_sioctl_plink() for more. 10152 * 10153 * Request can be enqueued in the 'ipsq' while waiting 10154 * to become exclusive. So bump up the conn ref. 10155 */ 10156 if (CONN_Q(q)) 10157 CONN_INC_REF(Q_TO_CONN(q)); 10158 ip_sioctl_plink(NULL, q, mp, NULL); 10159 return; 10160 10161 case ND_GET: 10162 case ND_SET: 10163 /* 10164 * Use of the nd table requires holding the reader lock. 10165 * Modifying the nd table thru nd_load/nd_unload requires 10166 * the writer lock. 10167 */ 10168 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 10169 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 10170 rw_exit(&ipst->ips_ip_g_nd_lock); 10171 10172 if (iocp->ioc_error) 10173 iocp->ioc_count = 0; 10174 mp->b_datap->db_type = M_IOCACK; 10175 qreply(q, mp); 10176 return; 10177 } 10178 rw_exit(&ipst->ips_ip_g_nd_lock); 10179 /* 10180 * We don't understand this subioctl of ND_GET / ND_SET. 10181 * Maybe it is intended for some driver / module below us. 10182 */ 10183 if (q->q_next) { 10184 putnext(q, mp); 10185 } else { 10186 iocp->ioc_error = ENOENT; 10187 mp->b_datap->db_type = M_IOCNAK; 10188 iocp->ioc_count = 0; 10189 qreply(q, mp); 10190 } 10191 return; 10192 10193 case IP_IOCTL: 10194 ip_wput_ioctl(q, mp); 10195 return; 10196 10197 case SIOCILB: 10198 /* The ioctl length varies depending on the ILB command.
*/ 10199 copyin_size = iocp->ioc_count; 10200 if (copyin_size < sizeof (ilb_cmd_t)) 10201 goto nak; 10202 mi_copyin(q, mp, NULL, copyin_size); 10203 return; 10204 10205 default: 10206 cmn_err(CE_PANIC, "should not happen"); 10207 } 10208 nak: 10209 if (mp->b_cont != NULL) { 10210 freemsg(mp->b_cont); 10211 mp->b_cont = NULL; 10212 } 10213 iocp->ioc_error = EINVAL; 10214 mp->b_datap->db_type = M_IOCNAK; 10215 iocp->ioc_count = 0; 10216 qreply(q, mp); 10217 } 10218 10219 /* ip_wput hands off ARP IOCTL responses to us */ 10220 /* ARGSUSED3 */ 10221 void 10222 ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 10223 { 10224 struct arpreq *ar; 10225 struct xarpreq *xar; 10226 area_t *area; 10227 mblk_t *area_mp; 10228 struct iocblk *iocp; 10229 mblk_t *orig_ioc_mp, *tmp; 10230 struct iocblk *orig_iocp; 10231 ill_t *ill; 10232 conn_t *connp = NULL; 10233 mblk_t *pending_mp; 10234 boolean_t x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 10235 int *flagsp; 10236 char *storage = NULL; 10237 sin_t *sin; 10238 ipaddr_t addr; 10239 int err; 10240 ip_stack_t *ipst; 10241 10242 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); 10243 ill = q->q_ptr; 10244 ASSERT(ill != NULL); 10245 ipst = ill->ill_ipst; 10246 10247 /* 10248 * We should get back from ARP a packet chain that looks like: 10249 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 10250 */ 10251 if (!(area_mp = mp->b_cont) || 10252 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 10253 !(orig_ioc_mp = area_mp->b_cont) || 10254 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 10255 freemsg(mp); 10256 return; 10257 } 10258 10259 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 10260 10261 tmp = (orig_ioc_mp->b_cont)->b_cont; 10262 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 10263 (orig_iocp->ioc_cmd == SIOCSXARP) || 10264 (orig_iocp->ioc_cmd == SIOCDXARP)) { 10265 x_arp_ioctl = B_TRUE; 10266 xar = (struct xarpreq *)tmp->b_rptr; 10267 sin = (sin_t *)&xar->xarp_pa; 10268 flagsp = &xar->xarp_flags; 10269 storage = xar->xarp_ha.sdl_data; 10270 if (xar->xarp_ha.sdl_nlen != 0) 10271 ifx_arp_ioctl = B_TRUE; 10272 } else { 10273 ar = (struct arpreq *)tmp->b_rptr; 10274 sin = (sin_t *)&ar->arp_pa; 10275 flagsp = &ar->arp_flags; 10276 storage = ar->arp_ha.sa_data; 10277 } 10278 10279 iocp = (struct iocblk *)mp->b_rptr; 10280 10281 /* 10282 * Find the pending message; if we're exclusive, it'll be on our IPSQ. 10283 * Otherwise, we can find it from our ioc_id. 10284 */ 10285 if (ipsq != NULL) 10286 pending_mp = ipsq_pending_mp_get(ipsq, &connp); 10287 else 10288 pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 10289 10290 if (pending_mp == NULL) { 10291 ASSERT(connp == NULL); 10292 inet_freemsg(mp); 10293 return; 10294 } 10295 ASSERT(connp != NULL); 10296 q = CONNP_TO_WQ(connp); 10297 10298 /* Uncouple the internally generated IOCTL from the original one */ 10299 area = (area_t *)area_mp->b_rptr; 10300 area_mp->b_cont = NULL; 10301 10302 /* 10303 * Restore the b_next and b_prev used by mi code. This is needed 10304 * to complete the ioctl using mi* functions. We stored them in 10305 * the pending mp prior to sending the request to ARP. 10306 */ 10307 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 10308 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 10309 inet_freemsg(pending_mp); 10310 10311 /* 10312 * We're done if there was an error or if this is not an SIOCG{X}ARP. 10313 * Catch the case where there is an IRE_CACHE but no entry in the 10314 * arp table.
10315 */ 10316 addr = sin->sin_addr.s_addr; 10317 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 10318 ire_t *ire; 10319 dl_unitdata_req_t *dlup; 10320 mblk_t *llmp; 10321 int addr_len; 10322 ill_t *ipsqill = NULL; 10323 10324 if (ifx_arp_ioctl) { 10325 /* 10326 * There's no need to lookup the ill, since 10327 * we've already done that when we started 10328 * processing the ioctl and sent the message 10329 * to ARP on that ill. So use the ill that 10330 * is stored in q->q_ptr. 10331 */ 10332 ipsqill = ill; 10333 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10334 ipsqill->ill_ipif, ALL_ZONES, 10335 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 10336 } else { 10337 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10338 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 10339 if (ire != NULL) 10340 ipsqill = ire_to_ill(ire); 10341 } 10342 10343 if ((x_arp_ioctl) && (ipsqill != NULL)) 10344 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10345 10346 if (ire != NULL) { 10347 /* 10348 * Since the ire obtained from cachetable is used for 10349 * mac addr copying below, treat an incomplete ire as 10350 * if we never found it. 10351 */ 10352 if (ire->ire_nce != NULL && 10353 ire->ire_nce->nce_state != ND_REACHABLE) { 10354 ire_refrele(ire); 10355 ire = NULL; 10356 ipsqill = NULL; 10357 goto errack; 10358 } 10359 *flagsp = ATF_INUSE; 10360 llmp = (ire->ire_nce != NULL ? 10361 ire->ire_nce->nce_res_mp : NULL); 10362 if (llmp != NULL && ipsqill != NULL) { 10363 uchar_t *macaddr; 10364 10365 addr_len = ipsqill->ill_phys_addr_length; 10366 if (x_arp_ioctl && ((addr_len + 10367 ipsqill->ill_name_length) > 10368 sizeof (xar->xarp_ha.sdl_data))) { 10369 ire_refrele(ire); 10370 freemsg(mp); 10371 ip_ioctl_finish(q, orig_ioc_mp, 10372 EINVAL, NO_COPYOUT, ipsq); 10373 return; 10374 } 10375 *flagsp |= ATF_COM; 10376 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10377 if (ipsqill->ill_sap_length < 0) 10378 macaddr = llmp->b_rptr + 10379 dlup->dl_dest_addr_offset; 10380 else 10381 macaddr = llmp->b_rptr + 10382 dlup->dl_dest_addr_offset + 10383 ipsqill->ill_sap_length; 10384 /* 10385 * For SIOCGARP, MAC address length 10386 * validation has already been done 10387 * before the ioctl was issued to ARP to 10388 * allow it to progress only on 6 byte 10389 * addressable (ethernet like) media. Thus 10390 * the mac address copying can not overwrite 10391 * the sa_data area below. 10392 */ 10393 bcopy(macaddr, storage, addr_len); 10394 } 10395 /* Ditch the internal IOCTL. */ 10396 freemsg(mp); 10397 ire_refrele(ire); 10398 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); 10399 return; 10400 } 10401 } 10402 10403 /* 10404 * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE 10405 * on the IPMP meta-interface, ensure any ARP entries added in 10406 * ip_sioctl_arp() are deleted. 10407 */ 10408 if (IS_IPMP(ill) && 10409 ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || 10410 ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { 10411 ipmp_illgrp_t *illg = ill->ill_grp; 10412 ipmp_arpent_t *entp; 10413 10414 if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) 10415 ipmp_illgrp_destroy_arpent(illg, entp); 10416 } 10417 10418 /* 10419 * Delete the corresponding IRE_CACHE if any. 10420 * Reset the error if there was one (in case there was no entry 10421 * in arp).
10422 */ 10423 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10424 ipif_t *ipintf = NULL; 10425 10426 if (ifx_arp_ioctl) { 10427 /* 10428 * There's no need to lookup the ill, since 10429 * we've already done that when we started 10430 * processing the ioctl and sent the message 10431 * to ARP on that ill. So use the ill that 10432 * is stored in q->q_ptr. 10433 */ 10434 ipintf = ill->ill_ipif; 10435 } 10436 if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) { 10437 /* 10438 * The address in "addr" may be an entry for a 10439 * router. If that's true, then any off-net 10440 * IRE_CACHE entries that go through the router 10441 * with address "addr" must be clobbered. Use 10442 * ire_walk to achieve this goal. 10443 */ 10444 if (ifx_arp_ioctl) 10445 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10446 ire_delete_cache_gw, (char *)&addr, ill); 10447 else 10448 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10449 ALL_ZONES, ipst); 10450 iocp->ioc_error = 0; 10451 } 10452 } 10453 errack: 10454 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10455 err = iocp->ioc_error; 10456 freemsg(mp); 10457 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); 10458 return; 10459 } 10460 10461 /* 10462 * Completion of an SIOCG{X}ARP. Translate the information from 10463 * the area_t into the struct {x}arpreq. 10464 */ 10465 if (x_arp_ioctl) { 10466 storage += ill_xarp_info(&xar->xarp_ha, ill); 10467 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10468 sizeof (xar->xarp_ha.sdl_data)) { 10469 freemsg(mp); 10470 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, 10471 ipsq); 10472 return; 10473 } 10474 } 10475 *flagsp = ATF_INUSE; 10476 if (area->area_flags & ACE_F_PERMANENT) 10477 *flagsp |= ATF_PERM; 10478 if (area->area_flags & ACE_F_PUBLISH) 10479 *flagsp |= ATF_PUBL; 10480 if (area->area_flags & ACE_F_AUTHORITY) 10481 *flagsp |= ATF_AUTHORITY; 10482 if (area->area_hw_addr_length != 0) { 10483 *flagsp |= ATF_COM; 10484 /* 10485 * For SIOCGARP, MAC address length validation has 10486 * already been done before the ioctl was issued to ARP 10487 * to allow it to progress only on 6 byte addressable 10488 * (ethernet like) media. Thus the mac address copying 10489 * can not overwrite the sa_data area below. 10490 */ 10491 bcopy((char *)area + area->area_hw_addr_offset, 10492 storage, area->area_hw_addr_length); 10493 } 10494 10495 /* Ditch the internal IOCTL. */ 10496 freemsg(mp); 10497 /* Complete the original. */ 10498 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); 10499 } 10500 10501 /* 10502 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 10503 * interface) create the next available logical interface for this 10504 * physical interface. 10505 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10506 * ipif with the specified name. 10507 * 10508 * If the address family is not AF_UNSPEC then set the address as well. 10509 * 10510 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10511 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer. 10512 * 10513 * Executed as a writer on the ill. 10514 * So no lock is needed to traverse the ipif chain, or examine the 10515 * phyint flags.
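 *
 * A minimal userland sketch of driving this path (hedged; `s' is an
 * AF_INET datagram socket, the interface name is an assumption, and
 * error handling is omitted):
 *
 *	struct lifreq lifr;
 *
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;
 *	if (ioctl(s, SIOCLIFADDIF, (caddr_t)&lifr) == 0)
 *		(void) puts(lifr.lifr_name);
 *
 * With AF_UNSPEC no address is set, and on success the created name
 * (e.g. "hme0:1") is returned in lifr_name, matching the sprintf()
 * below.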
10516 */ 10517 /* ARGSUSED */ 10518 int 10519 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10520 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10521 { 10522 mblk_t *mp1; 10523 struct lifreq *lifr; 10524 boolean_t isv6; 10525 boolean_t exists; 10526 char *name; 10527 char *endp; 10528 char *cp; 10529 int namelen; 10530 ipif_t *ipif; 10531 long id; 10532 ipsq_t *ipsq; 10533 ill_t *ill; 10534 sin_t *sin; 10535 int err = 0; 10536 boolean_t found_sep = B_FALSE; 10537 conn_t *connp; 10538 zoneid_t zoneid; 10539 ip_stack_t *ipst = CONNQ_TO_IPST(q); 10540 10541 ASSERT(q->q_next == NULL); 10542 ip1dbg(("ip_sioctl_addif\n")); 10543 /* Existence of mp1 has been checked in ip_wput_nondata */ 10544 mp1 = mp->b_cont->b_cont; 10545 /* 10546 * Null terminate the string to protect against buffer 10547 * overrun. String was generated by user code and may not 10548 * be trusted. 10549 */ 10550 lifr = (struct lifreq *)mp1->b_rptr; 10551 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10552 name = lifr->lifr_name; 10553 ASSERT(CONN_Q(q)); 10554 connp = Q_TO_CONN(q); 10555 isv6 = connp->conn_af_isv6; 10556 zoneid = connp->conn_zoneid; 10557 namelen = mi_strlen(name); 10558 if (namelen == 0) 10559 return (EINVAL); 10560 10561 exists = B_FALSE; 10562 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10563 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10564 /* 10565 * Allow creating lo0 using SIOCLIFADDIF. 10566 * There can't be any other writer thread, so we can pass NULL 10567 * below for the q, mp, func and error args to 10568 * ipif_lookup_on_name(). 10569 */ 10569 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 10570 &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); 10571 /* Prevent any further action */ 10572 if (ipif == NULL) { 10573 return (ENOBUFS); 10574 } else if (!exists) { 10575 /* We created the ipif now and as writer */ 10576 ipif_refrele(ipif); 10577 return (0); 10578 } else { 10579 ill = ipif->ipif_ill; 10580 ill_refhold(ill); 10581 ipif_refrele(ipif); 10582 } 10583 } else { 10584 /* Look for a colon in the name. */ 10585 endp = &name[namelen]; 10586 for (cp = endp; --cp > name; ) { 10587 if (*cp == IPIF_SEPARATOR_CHAR) { 10588 found_sep = B_TRUE; 10589 /* 10590 * Reject any non-decimal aliases for plumbing 10591 * of logical interfaces. Aliases with leading 10592 * zeroes are also rejected as they introduce 10593 * ambiguity in the naming of the interfaces. 10594 * Comparing with "0" takes care of all such 10595 * cases. 10596 */ 10597 if ((strncmp("0", cp+1, 1)) == 0) 10598 return (EINVAL); 10599 10600 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10601 id <= 0 || *endp != '\0') { 10602 return (EINVAL); 10603 } 10604 *cp = '\0'; 10605 break; 10606 } 10607 } 10608 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10609 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); 10610 if (found_sep) 10611 *cp = IPIF_SEPARATOR_CHAR; 10612 if (ill == NULL) 10613 return (err); 10614 } 10615 10616 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10617 B_TRUE); 10618 10619 /* 10620 * Release the refhold due to the lookup, now that we are excl 10621 * or we are just returning. 10622 */ 10623 ill_refrele(ill); 10624 10625 if (ipsq == NULL) 10626 return (EINPROGRESS); 10627 10628 /* We are now exclusive on the IPSQ */ 10629 ASSERT(IAM_WRITER_ILL(ill)); 10630 10631 if (found_sep) { 10632 /* Now see if there is an IPIF with this unit number.
*/ 10633 for (ipif = ill->ill_ipif; ipif != NULL; 10634 ipif = ipif->ipif_next) { 10635 if (ipif->ipif_id == id) { 10636 err = EEXIST; 10637 goto done; 10638 } 10639 } 10640 } 10641 10642 /* 10643 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10644 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 10645 * instead. 10646 */ 10647 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 10648 B_TRUE, B_TRUE)) == NULL) { 10649 err = ENOBUFS; 10650 goto done; 10651 } 10652 10653 /* Return created name with ioctl */ 10654 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10655 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10656 ip1dbg(("created %s\n", lifr->lifr_name)); 10657 10658 /* Set address */ 10659 sin = (sin_t *)&lifr->lifr_addr; 10660 if (sin->sin_family != AF_UNSPEC) { 10661 err = ip_sioctl_addr(ipif, sin, q, mp, 10662 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10663 } 10664 10665 done: 10666 ipsq_exit(ipsq); 10667 return (err); 10668 } 10669 10670 /* 10671 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10672 * interface) delete it based on the IP address (on this physical interface). 10673 * Otherwise delete it based on the ipif_id. 10674 * Also, special handling to allow a removeif of lo0. 10675 */ 10676 /* ARGSUSED */ 10677 int 10678 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10679 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10680 { 10681 conn_t *connp; 10682 ill_t *ill = ipif->ipif_ill; 10683 boolean_t success; 10684 ip_stack_t *ipst; 10685 10686 ipst = CONNQ_TO_IPST(q); 10687 10688 ASSERT(q->q_next == NULL); 10689 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10690 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10691 ASSERT(IAM_WRITER_IPIF(ipif)); 10692 10693 connp = Q_TO_CONN(q); 10694 /* 10695 * Special case for unplumbing lo0 (the loopback physical interface). 10696 * If unplumbing lo0, the incoming address structure has been 10697 * initialized to all zeros. When unplumbing lo0, all its logical 10698 * interfaces must be removed too. 10699 * 10700 * Note that this interface may be called to remove a specific 10701 * loopback logical interface (eg, lo0:1). But in that case 10702 * ipif->ipif_id != 0 so that the code path for that case is the 10703 * same as any other interface (meaning it skips the code directly 10704 * below). 10705 */ 10706 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10707 if (sin->sin_family == AF_UNSPEC && 10708 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10709 /* 10710 * Mark it condemned. No new ref. will be made to ill. 
10711 */ 10712 mutex_enter(&ill->ill_lock); 10713 ill->ill_state_flags |= ILL_CONDEMNED; 10714 for (ipif = ill->ill_ipif; ipif != NULL; 10715 ipif = ipif->ipif_next) { 10716 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10717 } 10718 mutex_exit(&ill->ill_lock); 10719 10720 ipif = ill->ill_ipif; 10721 /* unplumb the loopback interface */ 10722 ill_delete(ill); 10723 mutex_enter(&connp->conn_lock); 10724 mutex_enter(&ill->ill_lock); 10725 10726 /* Are any references to this ill active? */ 10727 if (ill_is_freeable(ill)) { 10728 mutex_exit(&ill->ill_lock); 10729 mutex_exit(&connp->conn_lock); 10730 ill_delete_tail(ill); 10731 mi_free(ill); 10732 return (0); 10733 } 10734 success = ipsq_pending_mp_add(connp, ipif, 10735 CONNP_TO_WQ(connp), mp, ILL_FREE); 10736 mutex_exit(&connp->conn_lock); 10737 mutex_exit(&ill->ill_lock); 10738 if (success) 10739 return (EINPROGRESS); 10740 else 10741 return (EINTR); 10742 } 10743 } 10744 10745 if (ipif->ipif_id == 0) { 10746 ipsq_t *ipsq; 10747 10748 /* Find based on address */ 10749 if (ipif->ipif_isv6) { 10750 sin6_t *sin6; 10751 10752 if (sin->sin_family != AF_INET6) 10753 return (EAFNOSUPPORT); 10754 10755 sin6 = (sin6_t *)sin; 10756 /* We are a writer, so we should be able to lookup */ 10757 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 10758 ipst); 10759 } else { 10760 if (sin->sin_family != AF_INET) 10761 return (EAFNOSUPPORT); 10762 10763 /* We are a writer, so we should be able to lookup */ 10764 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 10765 ipst); 10766 } 10767 if (ipif == NULL) { 10768 return (EADDRNOTAVAIL); 10769 } 10770 10771 /* 10772 * It is possible for a user to send an SIOCLIFREMOVEIF with 10773 * lifr_name of the physical interface but with an ip address 10774 * lifr_addr of a logical interface plumbed over it. 10775 * So update ipx_current_ipif now that ipif points to the 10776 * correct one. 10777 */ 10778 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 10779 ipsq->ipsq_xop->ipx_current_ipif = ipif; 10780 10781 /* This is a writer */ 10782 ipif_refrele(ipif); 10783 } 10784 10785 /* 10786 * Cannot delete instance zero since it is tied to the ill. 10787 */ 10788 if (ipif->ipif_id == 0) 10789 return (EBUSY); 10790 10791 mutex_enter(&ill->ill_lock); 10792 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10793 mutex_exit(&ill->ill_lock); 10794 10795 ipif_free(ipif); 10796 10797 mutex_enter(&connp->conn_lock); 10798 mutex_enter(&ill->ill_lock); 10799 10800 /* Are any references to this ipif active? */ 10801 if (ipif_is_freeable(ipif)) { 10802 mutex_exit(&ill->ill_lock); 10803 mutex_exit(&connp->conn_lock); 10804 ipif_non_duplicate(ipif); 10805 ipif_down_tail(ipif); 10806 ipif_free_tail(ipif); /* frees ipif */ 10807 return (0); 10808 } 10809 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10810 IPIF_FREE); 10811 mutex_exit(&ill->ill_lock); 10812 mutex_exit(&connp->conn_lock); 10813 if (success) 10814 return (EINPROGRESS); 10815 else 10816 return (EINTR); 10817 } 10818 10819 /* 10820 * Restart the removeif ioctl. The refcnt has gone down to 0. 10821 * The ipif is already condemned, so it can't be found thru lookups.
10822 */ 10823 /* ARGSUSED */ 10824 int 10825 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10826 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10827 { 10828 ill_t *ill = ipif->ipif_ill; 10829 10830 ASSERT(IAM_WRITER_IPIF(ipif)); 10831 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10832 10833 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10834 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10835 10836 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10837 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 10838 ill_delete_tail(ill); 10839 mi_free(ill); 10840 return (0); 10841 } 10842 10843 ipif_non_duplicate(ipif); 10844 ipif_down_tail(ipif); 10845 ipif_free_tail(ipif); 10846 10847 ILL_UNMARK_CHANGING(ill); 10848 return (0); 10849 } 10850 10851 /* 10852 * Set the local interface address. 10853 * Allow an address of all zeros when the interface is down. 10854 */ 10855 /* ARGSUSED */ 10856 int 10857 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10858 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10859 { 10860 int err = 0; 10861 in6_addr_t v6addr; 10862 boolean_t need_up = B_FALSE; 10863 10864 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 10865 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10866 10867 ASSERT(IAM_WRITER_IPIF(ipif)); 10868 10869 if (ipif->ipif_isv6) { 10870 sin6_t *sin6; 10871 ill_t *ill; 10872 phyint_t *phyi; 10873 10874 if (sin->sin_family != AF_INET6) 10875 return (EAFNOSUPPORT); 10876 10877 sin6 = (sin6_t *)sin; 10878 v6addr = sin6->sin6_addr; 10879 ill = ipif->ipif_ill; 10880 phyi = ill->ill_phyint; 10881 10882 /* 10883 * Enforce that true multicast interfaces have a link-local 10884 * address for logical unit 0. 10885 */ 10886 if (ipif->ipif_id == 0 && 10887 (ill->ill_flags & ILLF_MULTICAST) && 10888 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 10889 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 10890 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 10891 return (EADDRNOTAVAIL); 10892 } 10893 10894 /* 10895 * Up interfaces shouldn't have the unspecified address 10896 * unless they also have the IPIF_NOLOCAL flag set and 10897 * have a subnet assigned. 10898 */ 10899 if ((ipif->ipif_flags & IPIF_UP) && 10900 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 10901 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 10902 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 10903 return (EADDRNOTAVAIL); 10904 } 10905 10906 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10907 return (EADDRNOTAVAIL); 10908 } else { 10909 ipaddr_t addr; 10910 10911 if (sin->sin_family != AF_INET) 10912 return (EAFNOSUPPORT); 10913 10914 addr = sin->sin_addr.s_addr; 10915 10916 /* Allow 0 as the local address. */ 10917 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10918 return (EADDRNOTAVAIL); 10919 10920 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10921 } 10922 10923 /* 10924 * Even if there is no change we redo things just to rerun 10925 * ipif_set_default. 10926 */ 10927 if (ipif->ipif_flags & IPIF_UP) { 10928 /* 10929 * Setting a new local address, make sure 10930 * we have net and subnet bcast ire's for 10931 * the old address if we need them. 10932 */ 10933 if (!ipif->ipif_isv6) 10934 ipif_check_bcast_ires(ipif); 10935 /* 10936 * If the interface is already marked up, 10937 * we call ipif_down which will take care 10938 * of ditching any IREs that have been set 10939 * up based on the old interface address.
10940 */ 10941 err = ipif_logical_down(ipif, q, mp); 10942 if (err == EINPROGRESS) 10943 return (err); 10944 ipif_down_tail(ipif); 10945 need_up = B_TRUE; 10946 } 10947 10948 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 10949 return (err); 10950 } 10951 10952 int 10953 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10954 boolean_t need_up) 10955 { 10956 in6_addr_t v6addr; 10957 in6_addr_t ov6addr; 10958 ipaddr_t addr; 10959 sin6_t *sin6; 10960 int sinlen; 10961 int err = 0; 10962 ill_t *ill = ipif->ipif_ill; 10963 boolean_t need_dl_down; 10964 boolean_t need_arp_down; 10965 struct iocblk *iocp; 10966 10967 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 10968 10969 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 10970 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10971 ASSERT(IAM_WRITER_IPIF(ipif)); 10972 10973 /* Must cancel any pending timer before taking the ill_lock */ 10974 if (ipif->ipif_recovery_id != 0) 10975 (void) untimeout(ipif->ipif_recovery_id); 10976 ipif->ipif_recovery_id = 0; 10977 10978 if (ipif->ipif_isv6) { 10979 sin6 = (sin6_t *)sin; 10980 v6addr = sin6->sin6_addr; 10981 sinlen = sizeof (struct sockaddr_in6); 10982 } else { 10983 addr = sin->sin_addr.s_addr; 10984 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10985 sinlen = sizeof (struct sockaddr_in); 10986 } 10987 mutex_enter(&ill->ill_lock); 10988 ov6addr = ipif->ipif_v6lcl_addr; 10989 ipif->ipif_v6lcl_addr = v6addr; 10990 sctp_update_ipif_addr(ipif, ov6addr); 10991 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 10992 ipif->ipif_v6src_addr = ipv6_all_zeros; 10993 } else { 10994 ipif->ipif_v6src_addr = v6addr; 10995 } 10996 ipif->ipif_addr_ready = 0; 10997 10998 /* 10999 * If the interface was previously marked as a duplicate, then since 11000 * we've now got a "new" address, it should no longer be considered a 11001 * duplicate -- even if the "new" address is the same as the old one. 11002 * Note that if all ipifs are down, we may have a pending ARP down 11003 * event to handle. This is because we want to recover from duplicates 11004 * and thus delay tearing down ARP until the duplicates have been 11005 * removed or disabled. 11006 */ 11007 need_dl_down = need_arp_down = B_FALSE; 11008 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11009 need_arp_down = !need_up; 11010 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11011 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11012 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11013 need_dl_down = B_TRUE; 11014 } 11015 } 11016 11017 ipif_set_default(ipif); 11018 11019 /* 11020 * If we've just manually set the IPv6 link-local address (0th ipif), 11021 * tag the ill so that future updates to the interface ID don't result 11022 * in this address getting automatically reconfigured from under the 11023 * administrator. 11024 */ 11025 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 11026 ill->ill_manual_linklocal = 1; 11027 11028 /* 11029 * When publishing an interface address change event, we only notify 11030 * the event listeners of the new address. It is assumed that if they 11031 * actively care about the addresses assigned, they will have 11032 * already discovered the previous address assigned (if there was one). 11033 * 11034 * Don't attach nic event message for SIOCLIFADDIF ioctl.
11035 */ 11036 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 11037 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 11038 NE_ADDRESS_CHANGE, sin, sinlen); 11039 } 11040 11041 mutex_exit(&ill->ill_lock); 11042 11043 if (need_up) { 11044 /* 11045 * Now bring the interface back up. If this 11046 * is the only IPIF for the ILL, ipif_up 11047 * will have to re-bind to the device, so 11048 * we may get back EINPROGRESS, in which 11049 * case, this IOCTL will get completed in 11050 * ip_rput_dlpi when we see the DL_BIND_ACK. 11051 */ 11052 err = ipif_up(ipif, q, mp); 11053 } 11054 11055 if (need_dl_down) 11056 ill_dl_down(ill); 11057 if (need_arp_down) 11058 ipif_resolver_down(ipif); 11059 11060 return (err); 11061 } 11062 11063 /* 11064 * Restart entry point to restart the address set operation after the 11065 * refcounts have dropped to zero. 11066 */ 11067 /* ARGSUSED */ 11068 int 11069 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11070 ip_ioctl_cmd_t *ipip, void *ifreq) 11071 { 11072 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11073 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11074 ASSERT(IAM_WRITER_IPIF(ipif)); 11075 ipif_down_tail(ipif); 11076 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11077 } 11078 11079 /* ARGSUSED */ 11080 int 11081 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11082 ip_ioctl_cmd_t *ipip, void *if_req) 11083 { 11084 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11085 struct lifreq *lifr = (struct lifreq *)if_req; 11086 11087 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11088 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11089 /* 11090 * The net mask and address can't change since we have a 11091 * reference to the ipif. So no lock is necessary. 11092 */ 11093 if (ipif->ipif_isv6) { 11094 *sin6 = sin6_null; 11095 sin6->sin6_family = AF_INET6; 11096 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11097 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11098 lifr->lifr_addrlen = 11099 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11100 } else { 11101 *sin = sin_null; 11102 sin->sin_family = AF_INET; 11103 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11104 if (ipip->ipi_cmd_type == LIF_CMD) { 11105 lifr->lifr_addrlen = 11106 ip_mask_to_plen(ipif->ipif_net_mask); 11107 } 11108 } 11109 return (0); 11110 } 11111 11112 /* 11113 * Set the destination address for a pt-pt interface. 

/*
 * Set the destination address for a pt-pt interface.
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	err = 0;
	in6_addr_t v6addr;
	boolean_t need_up = B_FALSE;

	ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;

		if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask))
			return (EADDRNOTAVAIL);

		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old pp dst address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		ipif_down_tail(ipif);
		need_up = B_TRUE;
	}
	/*
	 * This could return EINPROGRESS.  If so, the ioctl will complete
	 * in ip_rput_dlpi_writer.
	 */
	err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
	return (err);
}

static int
ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;
	boolean_t need_dl_down;
	boolean_t need_arp_down;

	ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
	    ipif->ipif_id, (void *)ipif));

	/* Must cancel any pending timer before taking the ill_lock */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
	} else {
		ipaddr_t addr;

		addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}
	mutex_enter(&ill->ill_lock);
	/* Set point to point destination address. */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/*
		 * Allow this as a means of creating logical
		 * pt-pt interfaces on top of e.g. an Ethernet.
		 * XXX Undocumented HACK for testing.
		 * pt-pt interfaces are created with NUD disabled.
		 */
		ipif->ipif_flags |= IPIF_POINTOPOINT;
		ipif->ipif_flags &= ~IPIF_BROADCAST;
		if (ipif->ipif_isv6)
			ill->ill_flags |= ILLF_NONUD;
	}

	/*
	 * If the interface was previously marked as a duplicate, then since
	 * we've now got a "new" address, it should no longer be considered a
	 * duplicate -- even if the "new" address is the same as the old one.
	 * Note that if all ipifs are down, we may have a pending ARP down
	 * event to handle.
	 */
	need_dl_down = need_arp_down = B_FALSE;
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		need_arp_down = !need_up;
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		if (--ill->ill_ipif_dup_count == 0 && !need_up &&
		    ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
			need_dl_down = B_TRUE;
		}
	}

	/* Set the new address. */
	ipif->ipif_v6pp_dst_addr = v6addr;
	/* Make sure subnet tracks pp_dst */
	ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}

	if (need_dl_down)
		ill_dl_down(ill);
	if (need_arp_down)
		ipif_resolver_down(ipif);

	return (err);
}

/*
 * Restart entry point to restart the destination address set operation
 * after the refcounts have dropped to zero.
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ipif_down_tail(ipif);
	return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
}

/* ARGSUSED */
int
ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	sin6_t *sin6 = (struct sockaddr_in6 *)sin;

	ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Get point to point destination address.  The addresses can't
	 * change since we hold a reference to the ipif.
	 */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
		return (EADDRNOTAVAIL);

	if (ipif->ipif_isv6) {
		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
		*sin6 = sin6_null;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
	} else {
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
	}
	return (0);
}

/*
 * Set interface flags.  Many flags require special handling (e.g.,
 * bringing the interface down); see below for details.
 *
 * NOTE : We really don't enforce that ipif_id zero should be used
 *	  for setting any flags other than IFF_LOGINT_FLAGS.  This
 *	  is because applications generally do a SIOCGLIFFLAGS, OR
 *	  in the new flags (that affect the logical interface), and
 *	  then do a SIOCSLIFFLAGS.  Thus, "flags" below could contain
 *	  bits other than IFF_LOGINT_FLAGS.  One could check whether
 *	  "turn_on" - the flags that will be turned on - is correct
 *	  with respect to ipif_id 0.  For backward compatibility
 *	  reasons, it is not done.
 */
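/*
 * Illustrative userland sketch (not part of this file) of the
 * read-modify-write sequence described in the NOTE above; "hme0" and
 * IFF_DEPRECATED are arbitrary example values:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFFLAGS, (caddr_t)&lifr) < 0)
 *		perror("SIOCGLIFFLAGS");
 *	lifr.lifr_flags |= IFF_DEPRECATED;
 *	if (ioctl(s, SIOCSLIFFLAGS, (caddr_t)&lifr) < 0)
 *		perror("SIOCSLIFFLAGS");
 *
 * Since lifr_flags round-trips every flag on the interface, not just the
 * logical-interface ones, the code below cannot assume "flags" contains
 * only IFF_LOGINT_FLAGS bits.
 */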
/* ARGSUSED */
int
ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	uint64_t turn_on;
	uint64_t turn_off;
	int	err = 0;
	phyint_t *phyi;
	ill_t *ill;
	uint64_t intf_flags, cantchange_flags;
	boolean_t phyint_flags_modified = B_FALSE;
	uint64_t flags;
	struct ifreq *ifr;
	struct lifreq *lifr;
	boolean_t set_linklocal = B_FALSE;
	boolean_t zero_source = B_FALSE;

	ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;

	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
	} else {
		lifr = (struct lifreq *)if_req;
		flags = lifr->lifr_flags;
	}

	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;

	/*
	 * Have the flags been set correctly until now?
	 */
	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
	/*
	 * Compare the new flags to the old, and partition
	 * into those coming on and those going off.
	 * For the 16 bit command keep the bits above bit 16 unchanged.
	 */
	if (ipip->ipi_cmd == SIOCSIFFLAGS)
		flags |= intf_flags & ~0xFFFF;

	/*
	 * Explicitly fail attempts to change flags that are always invalid
	 * on an IPMP meta-interface.
	 */
	if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
		return (EINVAL);

	/*
	 * Check which flags will change; silently ignore flags which userland
	 * is not allowed to control.  (Because these flags may change between
	 * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's
	 * control, we need to silently ignore them rather than fail.)
	 */
	cantchange_flags = IFF_CANTCHANGE;
	if (IS_IPMP(ill))
		cantchange_flags |= IFF_IPMP_CANTCHANGE;

	turn_on = (flags ^ intf_flags) & ~cantchange_flags;
	if (turn_on == 0)
		return (0);	/* No change */

	turn_off = intf_flags & turn_on;
	turn_on ^= turn_off;

	/*
	 * All test addresses must be IFF_DEPRECATED (to ensure source address
	 * selection avoids them) -- so force IFF_DEPRECATED on, and do not
	 * allow it to be turned off.
	 */
	if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
	    (turn_on|intf_flags) & IFF_NOFAILOVER)
		return (EINVAL);

	if (turn_on & IFF_NOFAILOVER) {
		turn_on |= IFF_DEPRECATED;
		flags |= IFF_DEPRECATED;
	}

	/*
	 * On underlying interfaces, only allow applications to manage test
	 * addresses -- otherwise, they may get confused when the address
	 * moves as part of being brought up.  Likewise, prevent an
	 * application-managed test address from being converted to a data
	 * address.  To prevent migration of administratively up addresses in
	 * the kernel, we don't allow them to be converted either.
	 */
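	/*
	 * Worked example (illustrative values) of the turn_on/turn_off
	 * partitioning computed earlier in this function: suppose
	 * intf_flags has IFF_UP|IFF_DEPRECATED set and the caller passes
	 * flags with IFF_UP clear and IFF_PRIVATE set (IFF_DEPRECATED
	 * still set).  Then:
	 *
	 *	turn_on  = flags ^ intf_flags	= IFF_UP|IFF_PRIVATE
	 *	turn_off = intf_flags & turn_on	= IFF_UP
	 *	turn_on ^= turn_off		= IFF_PRIVATE
	 *
	 * i.e. IFF_PRIVATE is coming on, IFF_UP is going off, and the
	 * unchanged IFF_DEPRECATED appears in neither set.
	 */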
	if (IS_UNDER_IPMP(ill)) {
		const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;

		if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
			return (EINVAL);

		if ((turn_off & IFF_NOFAILOVER) &&
		    (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
			return (EINVAL);
	}

	/*
	 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on
	 * IPv6 interfaces.
	 */
	if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6))
		return (EINVAL);

	/*
	 * cannot turn off IFF_NOXMIT on VNI interfaces.
	 */
	if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
		return (EINVAL);

	/*
	 * Don't allow the IFF_ROUTER flag to be turned on on loopback
	 * interfaces.  It makes no sense in that context.
	 */
	if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
		return (EINVAL);

	if (flags & (IFF_NOLOCAL|IFF_ANYCAST))
		zero_source = B_TRUE;

	/*
	 * For IPv6 ipif_id 0, don't allow the interface to be up without
	 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
	 * If the link local address isn't set, and can be set, it will get
	 * set later on in this function.
	 */
	if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
	    (flags & IFF_UP) && !zero_source &&
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
		if (ipif_cant_setlinklocal(ipif))
			return (EINVAL);
		set_linklocal = B_TRUE;
	}

	/*
	 * If we modify physical interface flags, we'll potentially need to
	 * send up two routing socket messages for the changes (one for the
	 * IPv4 ill, and another for the IPv6 ill).  Note that here.
	 */
	if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
		phyint_flags_modified = B_TRUE;

	/*
	 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
	 * (otherwise, we'd immediately use them, defeating standby).  Also,
	 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
	 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
	 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared.  We
	 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
	 * will not be honored.
	 */
	if (turn_on & PHYI_STANDBY) {
		/*
		 * No need to grab ill_g_usesrc_lock here; see the
		 * synchronization notes in ip.c.
		 */
		if (ill->ill_usesrc_grp_next != NULL ||
		    intf_flags & PHYI_INACTIVE)
			return (EINVAL);
		if (!(flags & PHYI_FAILED)) {
			flags |= PHYI_INACTIVE;
			turn_on |= PHYI_INACTIVE;
		}
	}

	if (turn_off & PHYI_STANDBY) {
		flags &= ~PHYI_INACTIVE;
		turn_off |= PHYI_INACTIVE;
	}

	/*
	 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
	 * would end up on.
	 */
	if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
	    (PHYI_FAILED | PHYI_INACTIVE))
		return (EINVAL);

	/*
	 * If ILLF_ROUTER changes, we need to change the ip forwarding
	 * status of the interface.
	 */
	if ((turn_on | turn_off) & ILLF_ROUTER)
		(void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));

	/*
	 * If the interface is not UP and we are not going to
	 * bring it UP, record the flags and return.  When the
	 * interface comes UP later, the right actions will be
	 * taken.
	 */
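	/*
	 * Aside (illustrative): "their respective places" below refers to
	 * the three flag namespaces that make up intf_flags.  For example,
	 * IPIF_UP and IPIF_DEPRECATED live in ipif_flags (the
	 * IFF_LOGINT_FLAGS class), ILLF_ROUTER and ILLF_NOARP in ill_flags
	 * (IFF_PHYINTINST_FLAGS), and PHYI_FAILED and PHYI_STANDBY in
	 * phyint_flags (IFF_PHYINT_FLAGS).
	 */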
	if (!(ipif->ipif_flags & IPIF_UP) &&
	    !(turn_on & IPIF_UP)) {
		/* Record new flags in their respective places. */
		mutex_enter(&ill->ill_lock);
		mutex_enter(&ill->ill_phyint->phyint_lock);
		ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
		ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
		ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
		ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
		phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
		phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&ill->ill_phyint->phyint_lock);

		/*
		 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
		 * same to the kernel: if any of them has been set by
		 * userland, the interface cannot be used for data traffic.
		 */
		if ((turn_on|turn_off) &
		    (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
			ASSERT(!IS_IPMP(ill));
			/*
			 * It's possible the ill is part of an "anonymous"
			 * IPMP group rather than a real group.  In that case,
			 * there are no other interfaces in the group and thus
			 * no need to call ipmp_phyint_refresh_active().
			 */
			if (IS_UNDER_IPMP(ill))
				ipmp_phyint_refresh_active(phyi);
		}

		if (phyint_flags_modified) {
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->
				    ill_ipif, RTSQ_DEFAULT);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->
				    ill_ipif, RTSQ_DEFAULT);
			}
		}
		return (0);
	} else if (set_linklocal || zero_source) {
		mutex_enter(&ill->ill_lock);
		if (set_linklocal)
			ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
		if (zero_source)
			ipif->ipif_state_flags |= IPIF_ZERO_SOURCE;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * Disallow IPv6 interfaces coming up that have the unspecified
	 * address, or point-to-point interfaces with an unspecified
	 * destination.  We do allow the address to be unspecified for
	 * IPIF_NOLOCAL interfaces that have a subnet assigned, which is how
	 * in.ndpd currently manages its onlink prefix list when no addresses
	 * are configured with those prefixes.
	 */
	if (ipif->ipif_isv6 &&
	    ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
		return (EINVAL);
	}

	/*
	 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
	 * from being brought up.
	 */
	if (!ipif->ipif_isv6 &&
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    ipif->ipif_pp_dst_addr == INADDR_ANY)) {
		return (EINVAL);
	}

	/*
	 * The only flag changes that we currently take specific action on are
	 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
	 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
	 * IPIF_NOFAILOVER.  This is done by bringing the ipif down, changing
	 * the flags and bringing it back up again.  For IPIF_NOFAILOVER, the
	 * act of bringing it back up will trigger the address to be moved.
	 */
	if ((turn_on|turn_off) &
	    (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
	    ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
	    IPIF_NOFAILOVER)) {
		/*
		 * Taking this ipif down, make sure we have
		 * valid net and subnet bcast ire's for other
		 * logical interfaces, if we need them.
		 */
		if (!ipif->ipif_isv6)
			ipif_check_bcast_ires(ipif);

		if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
		    !(turn_off & IPIF_UP)) {
			if (ipif->ipif_flags & IPIF_UP)
				ill->ill_logical_down = 1;
			turn_on &= ~IPIF_UP;
		}
		err = ipif_down(ipif, q, mp);
		ip1dbg(("ipif_down returns %d err ", err));
		if (err == EINPROGRESS)
			return (err);
		ipif_down_tail(ipif);
	}
	return (ip_sioctl_flags_tail(ipif, flags, q, mp));
}

static int
ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
{
	ill_t	*ill;
	phyint_t *phyi;
	uint64_t turn_on, turn_off;
	uint64_t intf_flags, cantchange_flags;
	boolean_t phyint_flags_modified = B_FALSE;
	int	err = 0;
	boolean_t set_linklocal = B_FALSE;
	boolean_t zero_source = B_FALSE;

	ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;

	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
	cantchange_flags = IFF_CANTCHANGE | IFF_UP;
	if (IS_IPMP(ill))
		cantchange_flags |= IFF_IPMP_CANTCHANGE;

	turn_on = (flags ^ intf_flags) & ~cantchange_flags;
	turn_off = intf_flags & turn_on;
	turn_on ^= turn_off;

	if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
		phyint_flags_modified = B_TRUE;

	/*
	 * Now we change the flags.  Track current value of
	 * other flags in their respective places.
	 */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&phyi->phyint_lock);
	ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
	ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
	ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
	ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
	phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
	phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
	if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
		set_linklocal = B_TRUE;
		ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
	}
	if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) {
		zero_source = B_TRUE;
		ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE;
	}
	mutex_exit(&ill->ill_lock);
	mutex_exit(&phyi->phyint_lock);

	if (set_linklocal)
		(void) ipif_setlinklocal(ipif);

	if (zero_source)
		ipif->ipif_v6src_addr = ipv6_all_zeros;
	else
		ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;

	/*
	 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
	 * the kernel: if any of them has been set by userland, the interface
	 * cannot be used for data traffic.
	 */
	if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
		ASSERT(!IS_IPMP(ill));
		/*
		 * It's possible the ill is part of an "anonymous" IPMP group
		 * rather than a real group.  In that case, there are no other
		 * interfaces in the group and thus no need for us to call
		 * ipmp_phyint_refresh_active().
		 */
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_refresh_active(phyi);
	}

	if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
		/*
		 * XXX ipif_up really does not know whether any phyint flags
		 * were modified or not, so it sends up information in only
		 * one routing socket message.  As we don't bring up the
		 * interface and also set PHYI_ flags simultaneously, it
		 * should be okay.
		 */
		err = ipif_up(ipif, q, mp);
	} else {
		/*
		 * Make sure routing socket sees all changes to the flags.
		 * ipif_up_done* handles this when we use ipif_up.
		 */
		if (phyint_flags_modified) {
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->
				    ill_ipif, RTSQ_DEFAULT);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->
				    ill_ipif, RTSQ_DEFAULT);
			}
		} else {
			ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
		}
		/*
		 * Update the flags in SCTP's IPIF list, ipif_up() will do
		 * this in the need_up case.
		 */
		sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
	}
	return (err);
}

/*
 * Restart the flags operation now that the refcounts have dropped to zero.
 */
/* ARGSUSED */
int
ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	uint64_t flags;
	struct ifreq *ifr = if_req;
	struct lifreq *lifr = if_req;

	ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ipif_down_tail(ipif);
	if (ipip->ipi_cmd_type == IF_CMD) {
		/* cast to uint16_t prevents unwanted sign extension */
		flags = (uint16_t)ifr->ifr_flags;
	} else {
		flags = lifr->lifr_flags;
	}
	return (ip_sioctl_flags_tail(ipif, flags, q, mp));
}

/*
 * Can operate on either a module or a driver queue.
 */
/* ARGSUSED */
int
ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/*
	 * Have the flags been set correctly till now?
	 */
	ill_t *ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;

	ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);

	/*
	 * Need a lock since some flags can be set even when there are
	 * references to the ipif.
	 */
	mutex_enter(&ill->ill_lock);
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		/* Get interface flags (low 16 only). */
		ifr->ifr_flags = ((ipif->ipif_flags |
		    ill->ill_flags | phyi->phyint_flags) & 0xffff);
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		/* Get interface flags. */
		lifr->lifr_flags = ipif->ipif_flags |
		    ill->ill_flags | phyi->phyint_flags;
	}
	mutex_exit(&ill->ill_lock);
	return (0);
}

/* ARGSUSED */
int
ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	mtu;
	int	ip_min_mtu;
	struct ifreq	*ifr;
	struct lifreq *lifr;
	ire_t	*ire;
	ip_stack_t *ipst;

	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id, (void *)ipif));
	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		mtu = ifr->ifr_metric;
	} else {
		lifr = (struct lifreq *)if_req;
		mtu = lifr->lifr_mtu;
	}

	if (ipif->ipif_isv6)
		ip_min_mtu = IPV6_MIN_MTU;
	else
		ip_min_mtu = IP_MIN_MTU;

	if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu)
		return (EINVAL);

	/*
	 * Change the MTU size in all relevant ire's.
	 * MTU change vs. new ire creation - the protocol is as follows:
	 * First change ipif_mtu and the ire_max_frag of the
	 * interface ire.  Then do an ire walk and change the
	 * ire_max_frag of all affected ires.  During ire_add
	 * under the bucket lock, set the ire_max_frag of the
	 * new ire being created from the ipif/ire from which
	 * it is being derived.  If an mtu change happens after
	 * the ire is added, the new ire will be cleaned up.
	 * Conversely, if the mtu change happens before the ire
	 * is added, ire_add will see the new value of the mtu.
	 */
	ipif->ipif_mtu = mtu;
	ipif->ipif_flags |= IPIF_FIXEDMTU;

	if (ipif->ipif_isv6)
		ire = ipif_to_ire_v6(ipif);
	else
		ire = ipif_to_ire(ipif);
	if (ire != NULL) {
		ire->ire_max_frag = ipif->ipif_mtu;
		ire_refrele(ire);
	}
	ipst = ipif->ipif_ill->ill_ipst;
	if (ipif->ipif_flags & IPIF_UP) {
		if (ipif->ipif_isv6)
			ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES,
			    ipst);
		else
			ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES,
			    ipst);
	}
	/* Update the MTU in SCTP's list */
	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
	return (0);
}

/* Get interface MTU. */
/* ARGSUSED */
int
ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct ifreq	*ifr;
	struct lifreq	*lifr;

	ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		ifr->ifr_metric = ipif->ipif_mtu;
	} else {
		lifr = (struct lifreq *)if_req;
		lifr->lifr_mtu = ipif->ipif_mtu;
	}
	return (0);
}
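
/*
 * Illustrative userland sketch (not part of this file) of driving
 * ip_sioctl_mtu() and ip_sioctl_get_mtu() above; "hme0" and 1400 are
 * arbitrary example values:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_mtu = 1400;
 *	if (ioctl(s, SIOCSLIFMTU, (caddr_t)&lifr) < 0)
 *		perror("SIOCSLIFMTU");
 *
 * The set fails with EINVAL if the value is below the per-family minimum
 * or above ill_max_frag, and a successful set latches IPIF_FIXEDMTU so a
 * later SIOCSLIFLNKINFO mtu walk will skip this ipif.
 */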

/* Set interface broadcast address. */
/* ARGSUSED2 */
int
ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ipaddr_t addr;
	ire_t	*ire;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;

	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));
	if (!(ipif->ipif_flags & IPIF_BROADCAST))
		return (EADDRNOTAVAIL);

	ASSERT(!(ipif->ipif_isv6));	/* No IPv6 broadcast */

	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);

	addr = sin->sin_addr.s_addr;
	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If we are already up, make sure the new
		 * broadcast address makes sense.  If it does,
		 * there should be an IRE for it already.
		 * Don't match on ipif, only on the ill
		 * since we are sharing these now.
		 */
		ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST,
		    ipif, ALL_ZONES, NULL,
		    (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst);
		if (ire == NULL) {
			return (EINVAL);
		} else {
			ire_refrele(ire);
		}
	}
	/*
	 * Changing the broadcast addr for this ipif.
	 * Make sure we have valid net and subnet bcast
	 * ire's for other logical interfaces, if needed.
	 */
	if (addr != ipif->ipif_brd_addr)
		ipif_check_bcast_ires(ipif);
	IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
	return (0);
}

/* Get interface broadcast address. */
/* ARGSUSED */
int
ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	if (!(ipif->ipif_flags & IPIF_BROADCAST))
		return (EADDRNOTAVAIL);

	/* IPIF_BROADCAST not possible with IPv6 */
	ASSERT(!ipif->ipif_isv6);
	*sin = sin_null;
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = ipif->ipif_brd_addr;
	return (0);
}

/*
 * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
 */
/* ARGSUSED */
int
ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	err = 0;
	in6_addr_t v6mask;

	ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));

	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6mask = sin6->sin6_addr;
	} else {
		ipaddr_t mask;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		mask = sin->sin_addr.s_addr;
		V4MASK_TO_V6(mask, v6mask);
	}

	/*
	 * No big deal if the interface isn't already up, or the mask
	 * isn't really changing, or this is pt-pt.
	 */
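	/*
	 * Worked example (illustrative addresses) of the fast path below:
	 * with a local address of 192.168.5.7 and a new mask of
	 * 255.255.255.0, V6_MASK_COPY() recomputes ipif_v6subnet as the
	 * V4-mapped form of 192.168.5.7 & 255.255.255.0 == 192.168.5.0,
	 * with no logical down/up of the interface.
	 */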
	if (!(ipif->ipif_flags & IPIF_UP) ||
	    IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
	    (ipif->ipif_flags & IPIF_POINTOPOINT)) {
		ipif->ipif_v6net_mask = v6mask;
		if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask,
			    ipif->ipif_v6subnet);
		}
		return (0);
	}
	/*
	 * Make sure we have valid net and subnet broadcast ire's
	 * for the old netmask, if needed by other logical interfaces.
	 */
	if (!ipif->ipif_isv6)
		ipif_check_bcast_ires(ipif);

	err = ipif_logical_down(ipif, q, mp);
	if (err == EINPROGRESS)
		return (err);
	ipif_down_tail(ipif);
	err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
	return (err);
}

static int
ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
{
	in6_addr_t v6mask;
	int err = 0;

	ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6mask = sin6->sin6_addr;
	} else {
		ipaddr_t mask;

		mask = sin->sin_addr.s_addr;
		V4MASK_TO_V6(mask, v6mask);
	}

	ipif->ipif_v6net_mask = v6mask;
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}
	err = ipif_up(ipif, q, mp);

	if (err == 0 || err == EINPROGRESS) {
		/*
		 * The interface must be DL_BOUND if this packet has to
		 * go out on the wire.  Since we only go through a logical
		 * down and are bound with the driver during an internal
		 * down/up that is satisfied.
		 */
		if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
			/* Potentially broadcast an address mask reply. */
			ipif_mask_reply(ipif);
		}
	}
	return (err);
}

/* ARGSUSED */
int
ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ipif_down_tail(ipif);
	return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
}

/* Get interface net mask. */
/* ARGSUSED */
int
ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct lifreq *lifr = (struct lifreq *)if_req;
	struct sockaddr_in6 *sin6 = (sin6_t *)sin;

	ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/*
	 * The net mask can't change since we have a reference to the ipif.
	 */
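	/*
	 * Worked example (illustrative values): an IPv4 mask of
	 * 255.255.255.0 is reported below as lifr_addrlen 24 by
	 * ip_mask_to_plen(); an IPv6 mask of ffff:ffff:ffff:ffff:: is
	 * reported as 64 by ip_mask_to_plen_v6().
	 */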
	if (ipif->ipif_isv6) {
		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
		*sin6 = sin6_null;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = ipif->ipif_v6net_mask;
		lifr->lifr_addrlen =
		    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
	} else {
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipif->ipif_net_mask;
		if (ipip->ipi_cmd_type == LIF_CMD) {
			lifr->lifr_addrlen =
			    ip_mask_to_plen(ipif->ipif_net_mask);
		}
	}
	return (0);
}

/* ARGSUSED */
int
ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/*
	 * Since no applications should ever be setting metrics on underlying
	 * interfaces, we explicitly fail to smoke 'em out.
	 */
	if (IS_UNDER_IPMP(ipif->ipif_ill))
		return (EINVAL);

	/*
	 * Set interface metric.  We don't use this for
	 * anything but we keep track of it in case it is
	 * important to routing applications or such.
	 */
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq	*ifr;

		ifr = (struct ifreq *)if_req;
		ipif->ipif_metric = ifr->ifr_metric;
	} else {
		struct lifreq	*lifr;

		lifr = (struct lifreq *)if_req;
		ipif->ipif_metric = lifr->lifr_metric;
	}
	return (0);
}

/* ARGSUSED */
int
ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/* Get interface metric. */
	ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq	*ifr;

		ifr = (struct ifreq *)if_req;
		ifr->ifr_metric = ipif->ipif_metric;
	} else {
		struct lifreq	*lifr;

		lifr = (struct lifreq *)if_req;
		lifr->lifr_metric = ipif->ipif_metric;
	}

	return (0);
}

/* ARGSUSED */
int
ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Set the muxid returned from I_PLINK.
	 */
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid;
		ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid;
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid;
		ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid;
	}
	return (0);
}
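
/*
 * Illustrative sketch (not part of this file) of how the muxids kept by
 * ip_sioctl_muxid()/ip_sioctl_get_muxid() are used during STREAMS
 * plumbing; the descriptor names are arbitrary.  A plumbing utility
 * I_PLINKs the device stream under IP and records the returned muxid so
 * the link can be torn down later:
 *
 *	struct lifreq lifr;
 *	int muxid = ioctl(ip_fd, I_PLINK, dev_fd);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	lifr.lifr_ip_muxid = muxid;
 *	(void) ioctl(ip_fd, SIOCSLIFMUXID, (caddr_t)&lifr);
 *
 * Unplumbing retrieves the muxid with SIOCGLIFMUXID and hands it to
 * I_PUNLINK -- exactly the pairing these two functions support.
 */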

/* ARGSUSED */
int
ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Get the muxid saved in ill for I_PUNLINK.
	 */
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
		ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
		lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
	}
	return (0);
}

/*
 * Set the subnet prefix.  Does not modify the broadcast address.
 */
/* ARGSUSED */
int
ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	err = 0;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;
	int addrlen;

	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));
	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;

	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
			return (EADDRNOTAVAIL);
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		/* Add 96 bits */
		addrlen += IPV6_ABITS - IP_ABITS;
	}

	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
		return (EINVAL);

	/* Check if bits in the address are set past the mask */
	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
		return (EINVAL);

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
	return (err);
}

static int
ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
    queue_t *q, mblk_t *mp, boolean_t need_up)
{
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;

	ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* Set the new address. */
	mutex_enter(&ill->ill_lock);
	ipif->ipif_v6net_mask = v6mask;
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}
	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
	}
	return (err);
}

/* ARGSUSED */
int
ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	addrlen;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	struct lifreq *lifr = (struct lifreq *)if_req;

	ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ipif_down_tail(ipif);

	addrlen = lifr->lifr_addrlen;
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
	} else {
		ipaddr_t addr;

		addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		addrlen += IPV6_ABITS - IP_ABITS;
	}
	(void) ip_plen_to_mask_v6(addrlen, &v6mask);

	return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
}

/* ARGSUSED */
int
ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct lifreq *lifr = (struct lifreq *)if_req;
	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;

	ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(ipip->ipi_cmd_type == LIF_CMD);

	if (ipif->ipif_isv6) {
		*sin6 = sin6_null;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = ipif->ipif_v6subnet;
		lifr->lifr_addrlen =
		    ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
	} else {
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipif->ipif_subnet;
		lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
	}
	return (0);
}

/*
 * Set the IPv6 address token.
 */
/* ARGSUSED */
int
ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
	ill_t *ill = ipif->ipif_ill;
	int err;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;
	int	i;
	sin6_t	*sin6 = (sin6_t *)sin;
	struct lifreq *lifr = (struct lifreq *)if_req;
	int	addrlen;

	ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	addrlen = lifr->lifr_addrlen;
	/* Only allow for logical unit zero i.e. not on "le0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	if (!ipif->ipif_isv6)
		return (EINVAL);

	if (addrlen > IPV6_ABITS)
		return (EINVAL);

	v6addr = sin6->sin6_addr;

	/*
	 * The length of the token is the length from the end.  To get
	 * the proper mask for this, compute the mask of the bits not
	 * in the token; i.e. the prefix, and then xor to get the mask.
	 */
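	/*
	 * Worked example (illustrative values): for a 64-bit token,
	 * ip_plen_to_mask_v6() below is asked for a 128 - 64 = 64 bit
	 * prefix mask, ffff:ffff:ffff:ffff::; XORing each 32-bit word with
	 * 0xffffffff inverts it into the token mask ::ffff:ffff:ffff:ffff,
	 * so the V6_MASK_EQ() check compares only the low 64 bits.
	 */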
	if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
		return (EINVAL);
	for (i = 0; i < 4; i++) {
		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
	}

	if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
	    ill->ill_token_length == addrlen)
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		ipif_down_tail(ipif);
		need_up = B_TRUE;
	}
	err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
	return (err);
}

static int
ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
    mblk_t *mp, boolean_t need_up)
{
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	ill_t	*ill = ipif->ipif_ill;
	int	i;
	int	err = 0;

	ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	v6addr = sin6->sin6_addr;
	/*
	 * The length of the token is the length from the end.  To get
	 * the proper mask for this, compute the mask of the bits not
	 * in the token; i.e. the prefix, and then xor to get the mask.
	 */
	(void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
	for (i = 0; i < 4; i++)
		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;

	mutex_enter(&ill->ill_lock);
	V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
	ill->ill_token_length = addrlen;
	ill->ill_manual_token = 1;

	/* Reconfigure the link-local address based on this new token */
	ipif_setlinklocal(ill->ill_ipif);

	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
	}
	return (err);
}

/* ARGSUSED */
int
ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
	ill_t *ill;
	sin6_t	*sin6 = (sin6_t *)sin;
	struct lifreq *lifr = (struct lifreq *)if_req;

	ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	if (ipif->ipif_id != 0)
		return (EINVAL);

	ill = ipif->ipif_ill;
	if (!ill->ill_isv6)
		return (ENXIO);

	*sin6 = sin6_null;
	sin6->sin6_family = AF_INET6;
	ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
	sin6->sin6_addr = ill->ill_token;
	lifr->lifr_addrlen = ill->ill_token_length;
	return (0);
}

/*
 * Set (hardware) link specific information that might override
 * what was acquired through the DL_INFO_ACK.
 * The logic is as follows.
 *
 * become exclusive
 * set CHANGING flag
 * change mtu on affected IREs
 * clear CHANGING flag
 *
 * An ire add that occurs before the CHANGING flag is set will have its mtu
 * changed by the ip_sioctl_lnkinfo.
 *
 * During the time the CHANGING flag is set, no new ires will be added to the
 * bucket, and ire add will fail (due to the CHANGING flag).
 *
 * An ire add that occurs after the CHANGING flag is set will have the right
 * mtu before it is added to the bucket.
 *
 * Obviously only 1 thread can set the CHANGING flag and we need to become
 * exclusive to set the flag.
 */
/* ARGSUSED */
int
ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
	ill_t		*ill = ipif->ipif_ill;
	ipif_t		*nipif;
	int		ip_min_mtu;
	boolean_t	mtu_walk = B_FALSE;
	struct lifreq	*lifr = (struct lifreq *)if_req;
	lif_ifinfo_req_t *lir;
	ire_t		*ire;

	ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	lir = &lifr->lifr_ifinfo;
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Only allow for logical unit zero i.e. not on "le0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	/* Set interface MTU. */
	if (ipif->ipif_isv6)
		ip_min_mtu = IPV6_MIN_MTU;
	else
		ip_min_mtu = IP_MIN_MTU;

	/*
	 * Verify values before we set anything.  Allow zero to
	 * mean unspecified.
	 */
	if (lir->lir_maxmtu != 0 &&
	    (lir->lir_maxmtu > ill->ill_max_frag ||
	    lir->lir_maxmtu < ip_min_mtu))
		return (EINVAL);
	if (lir->lir_reachtime != 0 &&
	    lir->lir_reachtime > ND_MAX_REACHTIME)
		return (EINVAL);
	if (lir->lir_reachretrans != 0 &&
	    lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_CHANGING;
	for (nipif = ill->ill_ipif; nipif != NULL;
	    nipif = nipif->ipif_next) {
		nipif->ipif_state_flags |= IPIF_CHANGING;
	}

	if (lir->lir_maxmtu != 0) {
		ill->ill_max_mtu = lir->lir_maxmtu;
		ill->ill_user_mtu = lir->lir_maxmtu;
		mtu_walk = B_TRUE;
	}
	mutex_exit(&ill->ill_lock);

	if (lir->lir_reachtime != 0)
		ill->ill_reachable_time = lir->lir_reachtime;

	if (lir->lir_reachretrans != 0)
		ill->ill_reachable_retrans_time = lir->lir_reachretrans;

	ill->ill_max_hops = lir->lir_maxhops;

	ill->ill_max_buf = ND_MAX_Q;

	if (mtu_walk) {
		/*
		 * Set the MTU on all ipifs associated with this ill except
		 * for those whose MTU was fixed via SIOCSLIFMTU.
		 */
		for (nipif = ill->ill_ipif; nipif != NULL;
		    nipif = nipif->ipif_next) {
			if (nipif->ipif_flags & IPIF_FIXEDMTU)
				continue;

			nipif->ipif_mtu = ill->ill_max_mtu;

			if (!(nipif->ipif_flags & IPIF_UP))
				continue;

			if (nipif->ipif_isv6)
				ire = ipif_to_ire_v6(nipif);
			else
				ire = ipif_to_ire(nipif);
			if (ire != NULL) {
				ire->ire_max_frag = nipif->ipif_mtu;
				ire_refrele(ire);
			}

			ire_walk_ill(MATCH_IRE_ILL, 0, ipif_mtu_change,
			    nipif, ill);
		}
	}

	mutex_enter(&ill->ill_lock);
	for (nipif = ill->ill_ipif; nipif != NULL;
	    nipif = nipif->ipif_next) {
		nipif->ipif_state_flags &= ~IPIF_CHANGING;
	}
	ILL_UNMARK_CHANGING(ill);
	mutex_exit(&ill->ill_lock);

	/*
	 * Refresh IPMP meta-interface MTU if necessary.
	 */
12666 */ 12667 if (IS_UNDER_IPMP(ill)) 12668 ipmp_illgrp_refresh_mtu(ill->ill_grp); 12669 12670 return (0); 12671 } 12672 12673 /* ARGSUSED */ 12674 int 12675 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12676 ip_ioctl_cmd_t *ipi, void *if_req) 12677 { 12678 struct lif_ifinfo_req *lir; 12679 ill_t *ill = ipif->ipif_ill; 12680 12681 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 12682 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12683 if (ipif->ipif_id != 0) 12684 return (EINVAL); 12685 12686 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 12687 lir->lir_maxhops = ill->ill_max_hops; 12688 lir->lir_reachtime = ill->ill_reachable_time; 12689 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 12690 lir->lir_maxmtu = ill->ill_max_mtu; 12691 12692 return (0); 12693 } 12694 12695 /* 12696 * Return best guess as to the subnet mask for the specified address. 12697 * Based on the subnet masks for all the configured interfaces. 12698 * 12699 * We end up returning a zero mask in the case of default, multicast or 12700 * experimental. 12701 */ 12702 static ipaddr_t 12703 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 12704 { 12705 ipaddr_t net_mask; 12706 ill_t *ill; 12707 ipif_t *ipif; 12708 ill_walk_context_t ctx; 12709 ipif_t *fallback_ipif = NULL; 12710 12711 net_mask = ip_net_mask(addr); 12712 if (net_mask == 0) { 12713 *ipifp = NULL; 12714 return (0); 12715 } 12716 12717 /* Let's check to see if this is maybe a local subnet route. */ 12718 /* this function only applies to IPv4 interfaces */ 12719 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 12720 ill = ILL_START_WALK_V4(&ctx, ipst); 12721 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 12722 mutex_enter(&ill->ill_lock); 12723 for (ipif = ill->ill_ipif; ipif != NULL; 12724 ipif = ipif->ipif_next) { 12725 if (!IPIF_CAN_LOOKUP(ipif)) 12726 continue; 12727 if (!(ipif->ipif_flags & IPIF_UP)) 12728 continue; 12729 if ((ipif->ipif_subnet & net_mask) == 12730 (addr & net_mask)) { 12731 /* 12732 * Don't trust pt-pt interfaces if there are 12733 * other interfaces. 12734 */ 12735 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 12736 if (fallback_ipif == NULL) { 12737 ipif_refhold_locked(ipif); 12738 fallback_ipif = ipif; 12739 } 12740 continue; 12741 } 12742 12743 /* 12744 * Fine. Just assume the same net mask as the 12745 * directly attached subnet interface is using. 12746 */ 12747 ipif_refhold_locked(ipif); 12748 mutex_exit(&ill->ill_lock); 12749 rw_exit(&ipst->ips_ill_g_lock); 12750 if (fallback_ipif != NULL) 12751 ipif_refrele(fallback_ipif); 12752 *ipifp = ipif; 12753 return (ipif->ipif_net_mask); 12754 } 12755 } 12756 mutex_exit(&ill->ill_lock); 12757 } 12758 rw_exit(&ipst->ips_ill_g_lock); 12759 12760 *ipifp = fallback_ipif; 12761 return ((fallback_ipif != NULL) ? 12762 fallback_ipif->ipif_net_mask : net_mask); 12763 } 12764 12765 /* 12766 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 
12767 */ 12768 static void 12769 ip_wput_ioctl(queue_t *q, mblk_t *mp) 12770 { 12771 IOCP iocp; 12772 ipft_t *ipft; 12773 ipllc_t *ipllc; 12774 mblk_t *mp1; 12775 cred_t *cr; 12776 int error = 0; 12777 conn_t *connp; 12778 12779 ip1dbg(("ip_wput_ioctl")); 12780 iocp = (IOCP)mp->b_rptr; 12781 mp1 = mp->b_cont; 12782 if (mp1 == NULL) { 12783 iocp->ioc_error = EINVAL; 12784 mp->b_datap->db_type = M_IOCNAK; 12785 iocp->ioc_count = 0; 12786 qreply(q, mp); 12787 return; 12788 } 12789 12790 /* 12791 * These IOCTLs provide various control capabilities to 12792 * upstream agents such as ULPs and processes. There 12793 * are currently two such IOCTLs implemented. They 12794 * are used by TCP to provide update information for 12795 * existing IREs and to forcibly delete an IRE for a 12796 * host that is not responding, thereby forcing an 12797 * attempt at a new route. 12798 */ 12799 iocp->ioc_error = EINVAL; 12800 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 12801 goto done; 12802 12803 ipllc = (ipllc_t *)mp1->b_rptr; 12804 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 12805 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 12806 break; 12807 } 12808 /* 12809 * prefer credential from mblk over ioctl; 12810 * see ip_sioctl_copyin_setup 12811 */ 12812 cr = msg_getcred(mp, NULL); 12813 if (cr == NULL) 12814 cr = iocp->ioc_cr; 12815 12816 /* 12817 * Refhold the conn in case the request gets queued up in some lookup 12818 */ 12819 ASSERT(CONN_Q(q)); 12820 connp = Q_TO_CONN(q); 12821 CONN_INC_REF(connp); 12822 if (ipft->ipft_pfi && 12823 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 12824 pullupmsg(mp1, ipft->ipft_min_size))) { 12825 error = (*ipft->ipft_pfi)(q, 12826 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 12827 } 12828 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 12829 /* 12830 * CONN_OPER_PENDING_DONE happens in the function called 12831 * through ipft_pfi above. 12832 */ 12833 return; 12834 } 12835 12836 CONN_OPER_PENDING_DONE(connp); 12837 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 12838 freemsg(mp); 12839 return; 12840 } 12841 iocp->ioc_error = error; 12842 12843 done: 12844 mp->b_datap->db_type = M_IOCACK; 12845 if (iocp->ioc_error) 12846 iocp->ioc_count = 0; 12847 qreply(q, mp); 12848 } 12849 12850 /* 12851 * Lookup an ipif using the sequence id (ipif_seqid) 12852 */ 12853 ipif_t * 12854 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 12855 { 12856 ipif_t *ipif; 12857 12858 ASSERT(MUTEX_HELD(&ill->ill_lock)); 12859 12860 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12861 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 12862 return (ipif); 12863 } 12864 return (NULL); 12865 } 12866 12867 /* 12868 * Assign a unique id for the ipif. This is used later when we send 12869 * IRES to ARP for resolution where we initialize ire_ipif_seqid 12870 * to the value pointed by ire_ipif->ipif_seqid. Later when the 12871 * IRE is added, we verify that ipif has not disappeared. 12872 */ 12873 12874 static void 12875 ipif_assign_seqid(ipif_t *ipif) 12876 { 12877 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12878 12879 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 12880 } 12881 12882 /* 12883 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are 12884 * administratively down (i.e., no DAD), of the same type, and locked. Note 12885 * that the clone is complete -- including the seqid -- and the expectation is 12886 * that the caller will either free or overwrite `sipif' before it's unlocked. 
12887 */ 12888 static void 12889 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 12890 { 12891 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 12892 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 12893 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 12894 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 12895 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 12896 ASSERT(sipif->ipif_arp_del_mp == NULL); 12897 ASSERT(dipif->ipif_arp_del_mp == NULL); 12898 ASSERT(sipif->ipif_igmp_rpt == NULL); 12899 ASSERT(dipif->ipif_igmp_rpt == NULL); 12900 ASSERT(sipif->ipif_multicast_up == 0); 12901 ASSERT(dipif->ipif_multicast_up == 0); 12902 ASSERT(sipif->ipif_joined_allhosts == 0); 12903 ASSERT(dipif->ipif_joined_allhosts == 0); 12904 12905 dipif->ipif_mtu = sipif->ipif_mtu; 12906 dipif->ipif_flags = sipif->ipif_flags; 12907 dipif->ipif_metric = sipif->ipif_metric; 12908 dipif->ipif_zoneid = sipif->ipif_zoneid; 12909 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 12910 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 12911 dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; 12912 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 12913 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 12914 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 12915 12916 /* 12917 * While dipif is down right now, it might've been up before. Since 12918 * it's changing identity, its packet counters need to be reset. 12919 */ 12920 dipif->ipif_ib_pkt_count = 0; 12921 dipif->ipif_ob_pkt_count = 0; 12922 dipif->ipif_fo_pkt_count = 0; 12923 12924 /* 12925 * As per the comment atop the function, we assume that these sipif 12926 * fields will be changed before sipif is unlocked. 12927 */ 12928 dipif->ipif_seqid = sipif->ipif_seqid; 12929 dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; 12930 dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; 12931 dipif->ipif_state_flags = sipif->ipif_state_flags; 12932 } 12933 12934 /* 12935 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 12936 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 12937 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 12938 * transfer the xop to `dipif'. Requires that all ipifs are administratively 12939 * down (i.e., no DAD), of the same type, and unlocked. 12940 */ 12941 static void 12942 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 12943 { 12944 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 12945 ipxop_t *ipx = ipsq->ipsq_xop; 12946 12947 ASSERT(sipif != dipif); 12948 ASSERT(sipif != virgipif); 12949 12950 /* 12951 * Grab all of the locks that protect the ipif in a defined order. 12952 */ 12953 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 12954 if (sipif > dipif) { 12955 mutex_enter(&sipif->ipif_saved_ire_lock); 12956 mutex_enter(&dipif->ipif_saved_ire_lock); 12957 } else { 12958 mutex_enter(&dipif->ipif_saved_ire_lock); 12959 mutex_enter(&sipif->ipif_saved_ire_lock); 12960 } 12961 12962 ipif_clone(sipif, dipif); 12963 if (virgipif != NULL) { 12964 ipif_clone(virgipif, sipif); 12965 mi_free(virgipif); 12966 } 12967 12968 mutex_exit(&sipif->ipif_saved_ire_lock); 12969 mutex_exit(&dipif->ipif_saved_ire_lock); 12970 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 12971 12972 /* 12973 * Transfer ownership of the current xop, if necessary. 
12974 */ 12975 if (ipx->ipx_current_ipif == sipif) { 12976 ASSERT(ipx->ipx_pending_ipif == NULL); 12977 mutex_enter(&ipx->ipx_lock); 12978 ipx->ipx_current_ipif = dipif; 12979 mutex_exit(&ipx->ipx_lock); 12980 } 12981 12982 if (virgipif == NULL) 12983 mi_free(sipif); 12984 } 12985 12986 /* 12987 * Insert the ipif, so that the list of ipifs on the ill will be sorted 12988 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 12989 * be inserted into the first space available in the list. The value of 12990 * ipif_id will then be set to the appropriate value for its position. 12991 */ 12992 static int 12993 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) 12994 { 12995 ill_t *ill; 12996 ipif_t *tipif; 12997 ipif_t **tipifp; 12998 int id; 12999 ip_stack_t *ipst; 13000 13001 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13002 IAM_WRITER_IPIF(ipif)); 13003 13004 ill = ipif->ipif_ill; 13005 ASSERT(ill != NULL); 13006 ipst = ill->ill_ipst; 13007 13008 /* 13009 * In the case of lo0:0 we already hold the ill_g_lock. 13010 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13011 * ipif_insert. 13012 */ 13013 if (acquire_g_lock) 13014 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13015 mutex_enter(&ill->ill_lock); 13016 id = ipif->ipif_id; 13017 tipifp = &(ill->ill_ipif); 13018 if (id == -1) { /* need to find a real id */ 13019 id = 0; 13020 while ((tipif = *tipifp) != NULL) { 13021 ASSERT(tipif->ipif_id >= id); 13022 if (tipif->ipif_id != id) 13023 break; /* non-consecutive id */ 13024 id++; 13025 tipifp = &(tipif->ipif_next); 13026 } 13027 /* limit number of logical interfaces */ 13028 if (id >= ipst->ips_ip_addrs_per_if) { 13029 mutex_exit(&ill->ill_lock); 13030 if (acquire_g_lock) 13031 rw_exit(&ipst->ips_ill_g_lock); 13032 return (-1); 13033 } 13034 ipif->ipif_id = id; /* assign new id */ 13035 } else if (id < ipst->ips_ip_addrs_per_if) { 13036 /* we have a real id; insert ipif in the right place */ 13037 while ((tipif = *tipifp) != NULL) { 13038 ASSERT(tipif->ipif_id != id); 13039 if (tipif->ipif_id > id) 13040 break; /* found correct location */ 13041 tipifp = &(tipif->ipif_next); 13042 } 13043 } else { 13044 mutex_exit(&ill->ill_lock); 13045 if (acquire_g_lock) 13046 rw_exit(&ipst->ips_ill_g_lock); 13047 return (-1); 13048 } 13049 13050 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 13051 13052 ipif->ipif_next = tipif; 13053 *tipifp = ipif; 13054 mutex_exit(&ill->ill_lock); 13055 if (acquire_g_lock) 13056 rw_exit(&ipst->ips_ill_g_lock); 13057 13058 return (0); 13059 } 13060 13061 static void 13062 ipif_remove(ipif_t *ipif) 13063 { 13064 ipif_t **ipifp; 13065 ill_t *ill = ipif->ipif_ill; 13066 13067 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 13068 13069 mutex_enter(&ill->ill_lock); 13070 ipifp = &ill->ill_ipif; 13071 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 13072 if (*ipifp == ipif) { 13073 *ipifp = ipif->ipif_next; 13074 break; 13075 } 13076 } 13077 mutex_exit(&ill->ill_lock); 13078 } 13079 13080 /* 13081 * Allocate and initialize a new interface control structure. (Always 13082 * called as writer.) 13083 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 13084 * is not part of the global linked list of ills. ipif_seqid is unique 13085 * in the system and to preserve the uniqueness, it is assigned only 13086 * when ill becomes part of the global list. At that point ill will 13087 * have a name. 
If it doesn't get assigned here, it will get assigned
13088 * in ipif_set_values() as part of SIOCSLIFNAME processing.
13089 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
13090 * the interface flags or any other information from the DL_INFO_ACK for
13091 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
13092 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
13093 * second DL_INFO_ACK comes in from the driver.
13094 */
13095 static ipif_t *
13096 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
13097 boolean_t insert)
13098 {
13099 ipif_t *ipif;
13100 ip_stack_t *ipst = ill->ill_ipst;
13101
13102 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
13103 ill->ill_name, id, (void *)ill));
13104 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
13105
13106 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL)
13107 return (NULL);
13108 *ipif = ipif_zero; /* start clean */
13109
13110 ipif->ipif_ill = ill;
13111 ipif->ipif_id = id; /* could be -1 */
13112 /*
13113 * Inherit the zoneid from the ill; for the shared stack instance
13114 * this is always the global zone
13115 */
13116 ipif->ipif_zoneid = ill->ill_zoneid;
13117
13118 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
13119
13120 ipif->ipif_refcnt = 0;
13121 ipif->ipif_saved_ire_cnt = 0;
13122
13123 if (insert) {
13124 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) {
13125 mi_free(ipif);
13126 return (NULL);
13127 }
13128 /* -1 id should have been replaced by real id */
13129 id = ipif->ipif_id;
13130 ASSERT(id >= 0);
13131 }
13132
13133 if (ill->ill_name[0] != '\0')
13134 ipif_assign_seqid(ipif);
13135
13136 /*
13137 * If this is the zeroth ipif on the IPMP ill, create the illgrp
13138 * (which must not exist yet because the zeroth ipif is created once
13139 * per ill). However, do not link it to the ipmp_grp_t until
13140 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
13141 */
13142 if (id == 0 && IS_IPMP(ill)) {
13143 if (ipmp_illgrp_create(ill) == NULL) {
13144 if (insert) {
13145 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13146 ipif_remove(ipif);
13147 rw_exit(&ipst->ips_ill_g_lock);
13148 }
13149 mi_free(ipif);
13150 return (NULL);
13151 }
13152 }
13153
13154 /*
13155 * We grab ill_lock to protect the flag changes. The ipif is still
13156 * not up and can't be looked up until the ioctl completes and the
13157 * IPIF_CHANGING flag is cleared.
13158 */
13159 mutex_enter(&ill->ill_lock);
13160
13161 ipif->ipif_ire_type = ire_type;
13162
13163 if (ipif->ipif_isv6) {
13164 ill->ill_flags |= ILLF_IPV6;
13165 } else {
13166 ipaddr_t inaddr_any = INADDR_ANY;
13167
13168 ill->ill_flags |= ILLF_IPV4;
13169
13170 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
13171 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13172 &ipif->ipif_v6lcl_addr);
13173 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13174 &ipif->ipif_v6src_addr);
13175 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13176 &ipif->ipif_v6subnet);
13177 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13178 &ipif->ipif_v6net_mask);
13179 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13180 &ipif->ipif_v6brd_addr);
13181 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13182 &ipif->ipif_v6pp_dst_addr);
13183 }
13184
13185 /*
13186 * Don't set the interface flags etc. now; we will do it in
13187 * ip_ll_subnet_defaults.
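 * (A hedged sketch of the DL_STYLE2 flow described in the function
 * header: the first DL_INFO_ACK arrives before we know the interface
 * particulars, so this function is called with initialize == B_FALSE
 * and bails out below; once the second DL_INFO_ACK arrives,
 * ip_ll_subnet_defaults() fills in ipif_mtu, ILLF_MULTICAST,
 * IPIF_BROADCAST, etc., mirroring the initialize == B_TRUE code that
 * follows.)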
13188 */
13189 if (!initialize)
13190 goto out;
13191
13192 ipif->ipif_mtu = ill->ill_max_mtu;
13193
13194 /*
13195 * NOTE: The IPMP meta-interface is special-cased because it starts
13196 * with no underlying interfaces (and thus an unknown broadcast
13197 * address length), but all interfaces that can be placed into an IPMP
13198 * group are required to be broadcast-capable.
13199 */
13200 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
13201 /*
13202 * Later detect lack of DLPI driver multicast capability by
13203 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
13204 */
13205 ill->ill_flags |= ILLF_MULTICAST;
13206 if (!ipif->ipif_isv6)
13207 ipif->ipif_flags |= IPIF_BROADCAST;
13208 } else {
13209 if (ill->ill_net_type != IRE_LOOPBACK) {
13210 if (ipif->ipif_isv6)
13211 /*
13212 * Note: xresolv interfaces will eventually need
13213 * NOARP set here as well, but that will require
13214 * those external resolvers to have some
13215 * knowledge of that flag and act appropriately.
13216 * Not to be changed at present.
13217 */
13218 ill->ill_flags |= ILLF_NONUD;
13219 else
13220 ill->ill_flags |= ILLF_NOARP;
13221 }
13222 if (ill->ill_phys_addr_length == 0) {
13223 if (IS_VNI(ill)) {
13224 ipif->ipif_flags |= IPIF_NOXMIT;
13225 } else {
13226 /* pt-pt supports multicast. */
13227 ill->ill_flags |= ILLF_MULTICAST;
13228 if (ill->ill_net_type != IRE_LOOPBACK)
13229 ipif->ipif_flags |= IPIF_POINTOPOINT;
13230 }
13231 }
13232 }
13233 out:
13234 mutex_exit(&ill->ill_lock);
13235 return (ipif);
13236 }
13237
13238 /*
13239 * If appropriate, send a message up to the resolver to delete the entry
13240 * for the address of this interface which is going out of business.
13241 * (Always called as writer).
13242 *
13243 * NOTE: We need to check for NULL mps as some of the fields are
13244 * initialized only for some interface types. See ipif_resolver_up()
13245 * for details.
13246 */
13247 void
13248 ipif_resolver_down(ipif_t *ipif)
13249 {
13250 mblk_t *mp;
13251 ill_t *ill = ipif->ipif_ill;
13252
13253 ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13254 ASSERT(IAM_WRITER_IPIF(ipif));
13255
13256 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
13257 return;
13258
13259 /* Delete the mapping for the local address */
13260 mp = ipif->ipif_arp_del_mp;
13261 if (mp != NULL) {
13262 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
13263 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13264 putnext(ill->ill_rq, mp);
13265 ipif->ipif_arp_del_mp = NULL;
13266 }
13267
13268 /*
13269 * Make IPMP aware of the deleted data address.
13270 */
13271 if (IS_IPMP(ill))
13272 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
13273
13274 /*
13275 * If this is the last ipif that is going down and there are no
13276 * duplicate addresses we may yet attempt to re-probe, then we need to
13277 * clean up ARP completely.
13278 */
13279 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
13280 /*
13281 * If this was the last ipif on an IPMP interface, purge any
13282 * IPMP ARP entries associated with it.
13283 */
13284 if (IS_IPMP(ill))
13285 ipmp_illgrp_refresh_arpent(ill->ill_grp);
13286
13287 /* Send up AR_INTERFACE_DOWN message */
13288 mp = ill->ill_arp_down_mp;
13289 if (mp != NULL) {
13290 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
13291 *(unsigned *)mp->b_rptr, ill->ill_name,
13292 ipif->ipif_id));
13293 putnext(ill->ill_rq, mp);
13294 ill->ill_arp_down_mp = NULL;
13295 }
13296
13297 /* Tell ARP to delete the multicast mappings */
13298 mp = ill->ill_arp_del_mapping_mp;
13299 if (mp != NULL) {
13300 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
13301 *(unsigned *)mp->b_rptr, ill->ill_name,
13302 ipif->ipif_id));
13303 putnext(ill->ill_rq, mp);
13304 ill->ill_arp_del_mapping_mp = NULL;
13305 }
13306 }
13307 }
13308
13309 /*
13310 * Set up the multicast mappings for `ipif' in ARP. If `arp_add_mapping_mp'
13311 * is non-NULL, then upon success it will contain an mblk that can be passed
13312 * to ARP to create the mapping. Otherwise, if it's NULL, upon success ARP
13313 * will have already been notified to create the mapping. Returns zero on
13314 * success, -1 upon failure.
13315 */
13316 int
13317 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
13318 {
13319 mblk_t *del_mp = NULL;
13320 mblk_t *add_mp = NULL;
13321 mblk_t *mp;
13322 ill_t *ill = ipif->ipif_ill;
13323 phyint_t *phyi = ill->ill_phyint;
13324 ipaddr_t addr, mask, extract_mask = 0;
13325 arma_t *arma;
13326 uint8_t *maddr, *bphys_addr;
13327 uint32_t hw_start;
13328 dl_unitdata_req_t *dlur;
13329
13330 ASSERT(IAM_WRITER_IPIF(ipif));
13331 if (ipif->ipif_flags & IPIF_POINTOPOINT)
13332 return (0);
13333
13334 /*
13335 * IPMP meta-interfaces don't have any inherent multicast mappings,
13336 * and instead use the ones on the underlying interfaces.
13337 */
13338 if (IS_IPMP(ill))
13339 return (0);
13340
13341 /*
13342 * Delete the existing mapping from ARP. Normally, ipif_down() ->
13343 * ipif_resolver_down() will send this up to ARP, but it may be that
13344 * we are enabling PHYI_MULTI_BCAST via ip_rput_dlpi_writer().
13345 */
13346 mp = ill->ill_arp_del_mapping_mp;
13347 if (mp != NULL) {
13348 ip1dbg(("ipif_arp_setup_multicast: arp cmd %x for %s:%u\n",
13349 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13350 putnext(ill->ill_rq, mp);
13351 ill->ill_arp_del_mapping_mp = NULL;
13352 }
13353
13354 if (arp_add_mapping_mp != NULL)
13355 *arp_add_mapping_mp = NULL;
13356
13357 /*
13358 * Check that the address is not too long for the constant
13359 * length reserved in the template arma_t.
13360 */
13361 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
13362 return (-1);
13363
13364 /* Add mapping mblk */
13365 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
13366 mask = (ipaddr_t)htonl(IN_CLASSD_NET);
13367 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
13368 (caddr_t)&addr);
13369 if (add_mp == NULL)
13370 return (-1);
13371 arma = (arma_t *)add_mp->b_rptr;
13372 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
13373 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
13374 arma->arma_hw_addr_length = ill->ill_phys_addr_length;
13375
13376 /*
13377 * Determine the broadcast address.
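 * (Illustrative note on the DLPI address layout assumed here: a
 * negative ill_sap_length means the SAP is stored after the physical
 * address, so the address begins right at dl_dest_addr_offset; a
 * non-negative ill_sap_length means the SAP precedes the address,
 * hence the extra ill_sap_length skip below. E.g. for a 6-byte
 * ethernet address with a 2-byte SAP:
 *
 *	ill_sap_length >= 0:	[sap][aa:bb:cc:dd:ee:ff]
 *	ill_sap_length < 0:	[aa:bb:cc:dd:ee:ff][sap]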
13378 */ 13379 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 13380 if (ill->ill_sap_length < 0) 13381 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 13382 else 13383 bphys_addr = (uchar_t *)dlur + 13384 dlur->dl_dest_addr_offset + ill->ill_sap_length; 13385 /* 13386 * Check PHYI_MULTI_BCAST and length of physical 13387 * address to determine if we use the mapping or the 13388 * broadcast address. 13389 */ 13390 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 13391 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 13392 bphys_addr, maddr, &hw_start, &extract_mask)) 13393 phyi->phyint_flags |= PHYI_MULTI_BCAST; 13394 13395 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 13396 (ill->ill_flags & ILLF_MULTICAST)) { 13397 /* Make sure this will not match the "exact" entry. */ 13398 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13399 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13400 (caddr_t)&addr); 13401 if (del_mp == NULL) { 13402 freemsg(add_mp); 13403 return (-1); 13404 } 13405 bcopy(&extract_mask, (char *)arma + 13406 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13407 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13408 /* Use link-layer broadcast address for MULTI_BCAST */ 13409 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13410 ip2dbg(("ipif_arp_setup_multicast: adding" 13411 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13412 } else { 13413 arma->arma_hw_mapping_start = hw_start; 13414 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13415 " ARP setup for %s\n", ill->ill_name)); 13416 } 13417 } else { 13418 freemsg(add_mp); 13419 ASSERT(del_mp == NULL); 13420 /* It is neither MULTICAST nor MULTI_BCAST */ 13421 return (0); 13422 } 13423 ASSERT(add_mp != NULL && del_mp != NULL); 13424 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13425 ill->ill_arp_del_mapping_mp = del_mp; 13426 if (arp_add_mapping_mp != NULL) { 13427 /* The caller just wants the mblks allocated */ 13428 *arp_add_mapping_mp = add_mp; 13429 } else { 13430 /* The caller wants us to send it to arp */ 13431 putnext(ill->ill_rq, add_mp); 13432 } 13433 return (0); 13434 } 13435 13436 /* 13437 * Get the resolver set up for a new IP address. (Always called as writer.) 13438 * Called both for IPv4 and IPv6 interfaces, though it only sets up the 13439 * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. 13440 * 13441 * The enumerated value res_act tunes the behavior: 13442 * * Res_act_initial: set up all the resolver structures for a new 13443 * IP address. 13444 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 13445 * ARP message in defense of the address. 13446 * * Res_act_rebind: tell ARP to change the hardware address for an IP 13447 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 13448 * 13449 * Returns zero on success, or an errno upon failure. 
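 *
 * A hedged caller sketch (see the callers, e.g. ipif_up(), for real
 * usage):
 *
 *	err = ipif_resolver_up(ipif, Res_act_initial);
 *	if (err == EINPROGRESS)
 *		... ARP bringup for the first ipif is pending; the
 *		... operation completes asynchronously
 *	else if (err != 0)
 *		... fail the bringup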
13450 */ 13451 int 13452 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 13453 { 13454 mblk_t *arp_up_mp = NULL; 13455 mblk_t *arp_down_mp = NULL; 13456 mblk_t *arp_add_mp = NULL; 13457 mblk_t *arp_del_mp = NULL; 13458 mblk_t *arp_add_mapping_mp = NULL; 13459 mblk_t *arp_del_mapping_mp = NULL; 13460 ill_t *ill = ipif->ipif_ill; 13461 int err = ENOMEM; 13462 boolean_t added_ipif = B_FALSE; 13463 boolean_t publish; 13464 boolean_t was_dup; 13465 13466 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 13467 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 13468 ASSERT(IAM_WRITER_IPIF(ipif)); 13469 13470 was_dup = B_FALSE; 13471 if (res_act == Res_act_initial) { 13472 ipif->ipif_addr_ready = 0; 13473 /* 13474 * We're bringing an interface up here. There's no way that we 13475 * should need to shut down ARP now. 13476 */ 13477 mutex_enter(&ill->ill_lock); 13478 if (ipif->ipif_flags & IPIF_DUPLICATE) { 13479 ipif->ipif_flags &= ~IPIF_DUPLICATE; 13480 ill->ill_ipif_dup_count--; 13481 was_dup = B_TRUE; 13482 } 13483 mutex_exit(&ill->ill_lock); 13484 } 13485 if (ipif->ipif_recovery_id != 0) 13486 (void) untimeout(ipif->ipif_recovery_id); 13487 ipif->ipif_recovery_id = 0; 13488 if (ill->ill_net_type != IRE_IF_RESOLVER) { 13489 ipif->ipif_addr_ready = 1; 13490 return (0); 13491 } 13492 /* NDP will set the ipif_addr_ready flag when it's ready */ 13493 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 13494 return (0); 13495 13496 if (ill->ill_isv6) { 13497 /* 13498 * External resolver for IPv6 13499 */ 13500 ASSERT(res_act == Res_act_initial); 13501 publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); 13502 } else { 13503 /* 13504 * IPv4 arp case. If the ARP stream has already started 13505 * closing, fail this request for ARP bringup. Else 13506 * record the fact that an ARP bringup is pending. 13507 */ 13508 mutex_enter(&ill->ill_lock); 13509 if (ill->ill_arp_closing) { 13510 mutex_exit(&ill->ill_lock); 13511 err = EINVAL; 13512 goto failed; 13513 } else { 13514 if (ill->ill_ipif_up_count == 0 && 13515 ill->ill_ipif_dup_count == 0 && !was_dup) 13516 ill->ill_arp_bringup_pending = 1; 13517 mutex_exit(&ill->ill_lock); 13518 } 13519 publish = (ipif->ipif_lcl_addr != INADDR_ANY); 13520 } 13521 13522 if (IS_IPMP(ill) && publish) { 13523 /* 13524 * If we're here via ipif_up(), then the ipif won't be bound 13525 * yet -- add it to the group, which will bind it if possible. 13526 * (We would add it in ipif_up(), but deleting on failure 13527 * there is gruesome.) If we're here via ipmp_ill_bind_ipif(), 13528 * then the ipif has already been added to the group and we 13529 * just need to use the binding. 13530 */ 13531 if (ipmp_ipif_bound_ill(ipif) == NULL) { 13532 if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { 13533 /* 13534 * We couldn't bind the ipif to an ill yet, 13535 * so we have nothing to publish. 13536 */ 13537 publish = B_FALSE; 13538 } 13539 added_ipif = B_TRUE; 13540 } 13541 } 13542 13543 /* 13544 * Add an entry for the local address in ARP only if it 13545 * is not UNNUMBERED and it is suitable for publishing. 13546 */ 13547 if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { 13548 if (res_act == Res_act_defend) { 13549 arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); 13550 if (arp_add_mp == NULL) 13551 goto failed; 13552 /* 13553 * If we're just defending our address now, then 13554 * there's no need to set up ARP multicast mappings. 13555 * The publish command is enough. 
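 * (For context: ACE_F_DEFEND asks ARP to send the single gratuitous
 * ARP described for Res_act_defend in the function header above.)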
13556 */ 13557 goto done; 13558 } 13559 13560 /* 13561 * Allocate an ARP add message and an ARP delete message (the 13562 * latter is saved for use when the address goes down). 13563 */ 13564 if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) 13565 goto failed; 13566 13567 if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) 13568 goto failed; 13569 13570 if (res_act != Res_act_initial) 13571 goto arp_setup_multicast; 13572 } else { 13573 if (res_act != Res_act_initial) 13574 goto done; 13575 } 13576 /* 13577 * Need to bring up ARP or setup multicast mapping only 13578 * when the first interface is coming UP. 13579 */ 13580 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) 13581 goto done; 13582 13583 /* 13584 * Allocate an ARP down message (to be saved) and an ARP up message. 13585 */ 13586 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 13587 if (arp_down_mp == NULL) 13588 goto failed; 13589 13590 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 13591 if (arp_up_mp == NULL) 13592 goto failed; 13593 13594 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13595 goto done; 13596 13597 arp_setup_multicast: 13598 /* 13599 * Setup the multicast mappings. This function initializes 13600 * ill_arp_del_mapping_mp also. This does not need to be done for 13601 * IPv6, or for the IPMP interface (since it has no link-layer). 13602 */ 13603 if (!ill->ill_isv6 && !IS_IPMP(ill)) { 13604 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 13605 if (err != 0) 13606 goto failed; 13607 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 13608 ASSERT(arp_add_mapping_mp != NULL); 13609 } 13610 done: 13611 if (arp_up_mp != NULL) { 13612 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 13613 ill->ill_name, ipif->ipif_id)); 13614 putnext(ill->ill_rq, arp_up_mp); 13615 arp_up_mp = NULL; 13616 } 13617 if (arp_add_mp != NULL) { 13618 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 13619 ill->ill_name, ipif->ipif_id)); 13620 /* 13621 * If it's an extended ARP implementation, then we'll wait to 13622 * hear that DAD has finished before using the interface. 13623 */ 13624 if (!ill->ill_arp_extend) 13625 ipif->ipif_addr_ready = 1; 13626 putnext(ill->ill_rq, arp_add_mp); 13627 arp_add_mp = NULL; 13628 } else { 13629 ipif->ipif_addr_ready = 1; 13630 } 13631 if (arp_add_mapping_mp != NULL) { 13632 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 13633 ill->ill_name, ipif->ipif_id)); 13634 putnext(ill->ill_rq, arp_add_mapping_mp); 13635 arp_add_mapping_mp = NULL; 13636 } 13637 13638 if (res_act == Res_act_initial) { 13639 if (ill->ill_flags & ILLF_NOARP) 13640 err = ill_arp_off(ill); 13641 else 13642 err = ill_arp_on(ill); 13643 if (err != 0) { 13644 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", 13645 err)); 13646 goto failed; 13647 } 13648 } 13649 13650 if (arp_del_mp != NULL) { 13651 ASSERT(ipif->ipif_arp_del_mp == NULL); 13652 ipif->ipif_arp_del_mp = arp_del_mp; 13653 } 13654 if (arp_down_mp != NULL) { 13655 ASSERT(ill->ill_arp_down_mp == NULL); 13656 ill->ill_arp_down_mp = arp_down_mp; 13657 } 13658 if (arp_del_mapping_mp != NULL) { 13659 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13660 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 13661 } 13662 13663 return ((ill->ill_ipif_up_count != 0 || was_dup || 13664 ill->ill_ipif_dup_count != 0) ? 
0 : EINPROGRESS); 13665 failed: 13666 ip1dbg(("ipif_resolver_up: FAILED\n")); 13667 if (added_ipif) 13668 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 13669 freemsg(arp_add_mp); 13670 freemsg(arp_del_mp); 13671 freemsg(arp_add_mapping_mp); 13672 freemsg(arp_up_mp); 13673 freemsg(arp_down_mp); 13674 ill->ill_arp_bringup_pending = 0; 13675 return (err); 13676 } 13677 13678 /* 13679 * This routine restarts IPv4 duplicate address detection (DAD) when a link has 13680 * just gone back up. 13681 */ 13682 static void 13683 ipif_arp_start_dad(ipif_t *ipif) 13684 { 13685 ill_t *ill = ipif->ipif_ill; 13686 mblk_t *arp_add_mp; 13687 13688 /* ACE_F_UNVERIFIED restarts DAD */ 13689 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 13690 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13691 ipif->ipif_lcl_addr == INADDR_ANY || 13692 (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { 13693 /* 13694 * If we can't contact ARP for some reason, that's not really a 13695 * problem. Just send out the routing socket notification that 13696 * DAD completion would have done, and continue. 13697 */ 13698 ipif_mask_reply(ipif); 13699 ipif_up_notify(ipif); 13700 ipif->ipif_addr_ready = 1; 13701 return; 13702 } 13703 13704 putnext(ill->ill_rq, arp_add_mp); 13705 } 13706 13707 static void 13708 ipif_ndp_start_dad(ipif_t *ipif) 13709 { 13710 nce_t *nce; 13711 13712 nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, 13713 B_FALSE); 13714 if (nce == NULL) 13715 return; 13716 13717 if (!ndp_restart_dad(nce)) { 13718 /* 13719 * If we can't restart DAD for some reason, that's not really a 13720 * problem. Just send out the routing socket notification that 13721 * DAD completion would have done, and continue. 13722 */ 13723 ipif_up_notify(ipif); 13724 ipif->ipif_addr_ready = 1; 13725 } 13726 NCE_REFRELE(nce); 13727 } 13728 13729 /* 13730 * Restart duplicate address detection on all interfaces on the given ill. 13731 * 13732 * This is called when an interface transitions from down to up 13733 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 13734 * 13735 * Note that since the underlying physical link has transitioned, we must cause 13736 * at least one routing socket message to be sent here, either via DAD 13737 * completion or just by default on the first ipif. (If we don't do this, then 13738 * in.mpathd will see long delays when doing link-based failure recovery.) 13739 */ 13740 void 13741 ill_restart_dad(ill_t *ill, boolean_t went_up) 13742 { 13743 ipif_t *ipif; 13744 13745 if (ill == NULL) 13746 return; 13747 13748 /* 13749 * If layer two doesn't support duplicate address detection, then just 13750 * send the routing socket message now and be done with it. 13751 */ 13752 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 13753 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 13754 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 13755 return; 13756 } 13757 13758 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13759 if (went_up) { 13760 if (ipif->ipif_flags & IPIF_UP) { 13761 if (ill->ill_isv6) 13762 ipif_ndp_start_dad(ipif); 13763 else 13764 ipif_arp_start_dad(ipif); 13765 } else if (ill->ill_isv6 && 13766 (ipif->ipif_flags & IPIF_DUPLICATE)) { 13767 /* 13768 * For IPv4, the ARP module itself will 13769 * automatically start the DAD process when it 13770 * sees DL_NOTE_LINK_UP. We respond to the 13771 * AR_CN_READY at the completion of that task. 13772 * For IPv6, we must kick off the bring-up 13773 * process now. 
13774 */
13775 ndp_do_recovery(ipif);
13776 } else {
13777 /*
13778 * Unfortunately, the first ipif is "special"
13779 * and represents the underlying ill in the
13780 * routing socket messages. Thus, when this
13781 * one ipif is down, we must still notify so
13782 * that the user knows the IFF_RUNNING status
13783 * change. (If the first ipif is up, then
13784 * we'll handle eventual routing socket
13785 * notification via DAD completion.)
13786 */
13787 if (ipif == ill->ill_ipif) {
13788 ip_rts_ifmsg(ill->ill_ipif,
13789 RTSQ_DEFAULT);
13790 }
13791 }
13792 } else {
13793 /*
13794 * After link down, we'll need to send a new routing
13795 * message when the link comes back, so clear
13796 * ipif_addr_ready.
13797 */
13798 ipif->ipif_addr_ready = 0;
13799 }
13800 }
13801
13802 /*
13803 * If we've torn down links, then notify the user right away.
13804 */
13805 if (!went_up)
13806 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
13807 }
13808
13809 static void
13810 ipsq_delete(ipsq_t *ipsq)
13811 {
13812 ipxop_t *ipx = ipsq->ipsq_xop;
13813
13814 ipsq->ipsq_ipst = NULL;
13815 ASSERT(ipsq->ipsq_phyint == NULL);
13816 ASSERT(ipsq->ipsq_xop != NULL);
13817 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
13818 ASSERT(ipx->ipx_pending_mp == NULL);
13819 kmem_free(ipsq, sizeof (ipsq_t));
13820 }
13821
13822 static int
13823 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
13824 {
13825 int err = 0; /* initialized: ipif_up() may not be called below */
13826 ipif_t *ipif;
13827
13828 if (ill == NULL)
13829 return (0);
13830
13831 ASSERT(IAM_WRITER_ILL(ill));
13832 ill->ill_up_ipifs = B_TRUE;
13833 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13834 if (ipif->ipif_was_up) {
13835 if (!(ipif->ipif_flags & IPIF_UP))
13836 err = ipif_up(ipif, q, mp);
13837 ipif->ipif_was_up = B_FALSE;
13838 if (err != 0) {
13839 ASSERT(err == EINPROGRESS);
13840 return (err);
13841 }
13842 }
13843 }
13844 mutex_enter(&ill->ill_lock);
13845 ill->ill_state_flags &= ~ILL_CHANGING;
13846 mutex_exit(&ill->ill_lock);
13847 ill->ill_up_ipifs = B_FALSE;
13848 return (0);
13849 }
13850
13851 /*
13852 * This function is called to bring up all the ipifs that were up before
13853 * bringing the ill down via ill_down_ipifs().
13854 */
13855 int
13856 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
13857 {
13858 int err;
13859
13860 ASSERT(IAM_WRITER_ILL(ill));
13861
13862 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
13863 if (err != 0)
13864 return (err);
13865
13866 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
13867 }
13868
13869 /*
13870 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
13871 * down the ipifs without sending DL_UNBIND_REQ to the driver.
13872 */
13873 static void
13874 ill_down_ipifs(ill_t *ill, boolean_t logical)
13875 {
13876 ipif_t *ipif;
13877
13878 ASSERT(IAM_WRITER_ILL(ill));
13879
13880 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13881 /*
13882 * We go through the ipif_down logic even if the ipif
13883 * is already down, since routes can be added based
13884 * on down ipifs. Going through ipif_down once again
13885 * will delete any IREs created based on these routes.
13886 */
13887 if (ipif->ipif_flags & IPIF_UP)
13888 ipif->ipif_was_up = B_TRUE;
13889
13890 /*
13891 * Need to re-create net/subnet bcast ires if
13892 * they are dependent on ipif.
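 * (As the name suggests, ipif_check_bcast_ires() recreates any
 * net/subnet broadcast IREs that this ipif was providing on another
 * suitable ipif; IPv6 has no broadcast, hence the !ipif_isv6 guard
 * below.)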
13893 */ 13894 if (!ipif->ipif_isv6) 13895 ipif_check_bcast_ires(ipif); 13896 if (logical) { 13897 (void) ipif_logical_down(ipif, NULL, NULL); 13898 ipif_non_duplicate(ipif); 13899 ipif_down_tail(ipif); 13900 } else { 13901 (void) ipif_down(ipif, NULL, NULL); 13902 } 13903 } 13904 } 13905 13906 /* 13907 * Redo source address selection. This is called when a 13908 * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. 13909 */ 13910 void 13911 ill_update_source_selection(ill_t *ill) 13912 { 13913 ipif_t *ipif; 13914 13915 ASSERT(IAM_WRITER_ILL(ill)); 13916 13917 /* 13918 * Underlying interfaces are only used for test traffic and thus 13919 * should always send with their (deprecated) source addresses. 13920 */ 13921 if (IS_UNDER_IPMP(ill)) 13922 return; 13923 13924 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13925 if (ill->ill_isv6) 13926 ipif_recreate_interface_routes_v6(NULL, ipif); 13927 else 13928 ipif_recreate_interface_routes(NULL, ipif); 13929 } 13930 } 13931 13932 /* 13933 * Finish the group join started in ip_sioctl_groupname(). 13934 */ 13935 /* ARGSUSED */ 13936 static void 13937 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 13938 { 13939 ill_t *ill = q->q_ptr; 13940 phyint_t *phyi = ill->ill_phyint; 13941 ipmp_grp_t *grp = phyi->phyint_grp; 13942 ip_stack_t *ipst = ill->ill_ipst; 13943 13944 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ 13945 ASSERT(!IS_IPMP(ill) && grp != NULL); 13946 ASSERT(IAM_WRITER_IPSQ(ipsq)); 13947 13948 if (phyi->phyint_illv4 != NULL) { 13949 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 13950 VERIFY(grp->gr_pendv4-- > 0); 13951 rw_exit(&ipst->ips_ipmp_lock); 13952 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); 13953 } 13954 if (phyi->phyint_illv6 != NULL) { 13955 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 13956 VERIFY(grp->gr_pendv6-- > 0); 13957 rw_exit(&ipst->ips_ipmp_lock); 13958 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); 13959 } 13960 freemsg(mp); 13961 } 13962 13963 /* 13964 * Process an SIOCSLIFGROUPNAME request. 13965 */ 13966 /* ARGSUSED */ 13967 int 13968 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 13969 ip_ioctl_cmd_t *ipip, void *ifreq) 13970 { 13971 struct lifreq *lifr = ifreq; 13972 ill_t *ill = ipif->ipif_ill; 13973 ip_stack_t *ipst = ill->ill_ipst; 13974 phyint_t *phyi = ill->ill_phyint; 13975 ipmp_grp_t *grp = phyi->phyint_grp; 13976 mblk_t *ipsq_mp; 13977 int err = 0; 13978 13979 /* 13980 * Note that phyint_grp can only change here, where we're exclusive. 13981 */ 13982 ASSERT(IAM_WRITER_ILL(ill)); 13983 13984 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || 13985 (phyi->phyint_flags & PHYI_VIRTUAL)) 13986 return (EINVAL); 13987 13988 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; 13989 13990 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 13991 13992 /* 13993 * If the name hasn't changed, there's nothing to do. 13994 */ 13995 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) 13996 goto unlock; 13997 13998 /* 13999 * Handle requests to rename an IPMP meta-interface. 14000 * 14001 * Note that creation of the IPMP meta-interface is handled in 14002 * userland through the standard plumbing sequence. As part of the 14003 * plumbing the IPMP meta-interface, its initial groupname is set to 14004 * the name of the interface (see ipif_set_values_tail()). 
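 *
 * For reference, a minimal userland sketch of driving this ioctl;
 * the interface and group names are examples and error handling is
 * elided:
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *	(void) memset(&lifr, 0, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "e1000g0", LIFNAMSIZ);
 *	(void) strlcpy(lifr.lifr_groupname, "ipmp0", LIFGRNAMSIZ);
 *	(void) ioctl(s, SIOCSLIFGROUPNAME, (caddr_t)&lifr);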
14005 */ 14006 if (IS_IPMP(ill)) { 14007 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 14008 goto unlock; 14009 } 14010 14011 /* 14012 * Handle requests to add or remove an IP interface from a group. 14013 */ 14014 if (lifr->lifr_groupname[0] != '\0') { /* add */ 14015 /* 14016 * Moves are handled by first removing the interface from 14017 * its existing group, and then adding it to another group. 14018 * So, fail if it's already in a group. 14019 */ 14020 if (IS_UNDER_IPMP(ill)) { 14021 err = EALREADY; 14022 goto unlock; 14023 } 14024 14025 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 14026 if (grp == NULL) { 14027 err = ENOENT; 14028 goto unlock; 14029 } 14030 14031 /* 14032 * Check if the phyint and its ills are suitable for 14033 * inclusion into the group. 14034 */ 14035 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 14036 goto unlock; 14037 14038 /* 14039 * Checks pass; join the group, and enqueue the remaining 14040 * illgrp joins for when we've become part of the group xop 14041 * and are exclusive across its IPSQs. Since qwriter_ip() 14042 * requires an mblk_t to scribble on, and since `mp' will be 14043 * freed as part of completing the ioctl, allocate another. 14044 */ 14045 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 14046 err = ENOMEM; 14047 goto unlock; 14048 } 14049 14050 /* 14051 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 14052 * IPMP meta-interface ills needed by `phyi' cannot go away 14053 * before ip_join_illgrps() is called back. See the comments 14054 * in ip_sioctl_plink_ipmp() for more. 14055 */ 14056 if (phyi->phyint_illv4 != NULL) 14057 grp->gr_pendv4++; 14058 if (phyi->phyint_illv6 != NULL) 14059 grp->gr_pendv6++; 14060 14061 rw_exit(&ipst->ips_ipmp_lock); 14062 14063 ipmp_phyint_join_grp(phyi, grp); 14064 ill_refhold(ill); 14065 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 14066 SWITCH_OP, B_FALSE); 14067 return (0); 14068 } else { 14069 /* 14070 * Request to remove the interface from a group. If the 14071 * interface is not in a group, this trivially succeeds. 14072 */ 14073 rw_exit(&ipst->ips_ipmp_lock); 14074 if (IS_UNDER_IPMP(ill)) 14075 ipmp_phyint_leave_grp(phyi); 14076 return (0); 14077 } 14078 unlock: 14079 rw_exit(&ipst->ips_ipmp_lock); 14080 return (err); 14081 } 14082 14083 /* 14084 * Process an SIOCGLIFBINDING request. 14085 */ 14086 /* ARGSUSED */ 14087 int 14088 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 14089 ip_ioctl_cmd_t *ipip, void *ifreq) 14090 { 14091 ill_t *ill; 14092 struct lifreq *lifr = ifreq; 14093 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 14094 14095 if (!IS_IPMP(ipif->ipif_ill)) 14096 return (EINVAL); 14097 14098 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 14099 if ((ill = ipif->ipif_bound_ill) == NULL) 14100 lifr->lifr_binding[0] = '\0'; 14101 else 14102 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 14103 rw_exit(&ipst->ips_ipmp_lock); 14104 return (0); 14105 } 14106 14107 /* 14108 * Process an SIOCGLIFGROUPNAME request. 
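 *
 * (A hedged userland sketch: after
 *	(void) strlcpy(lifr.lifr_name, "e1000g0", LIFNAMSIZ);
 *	(void) ioctl(s, SIOCGLIFGROUPNAME, (caddr_t)&lifr);
 * lifr.lifr_groupname holds the group name, or "" if the interface is
 * not in a group, matching the handler below.)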
14109 */
14110 /* ARGSUSED */
14111 int
14112 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
14113 ip_ioctl_cmd_t *ipip, void *ifreq)
14114 {
14115 ipmp_grp_t *grp;
14116 struct lifreq *lifr = ifreq;
14117 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
14118
14119 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
14120 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
14121 lifr->lifr_groupname[0] = '\0';
14122 else
14123 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
14124 rw_exit(&ipst->ips_ipmp_lock);
14125 return (0);
14126 }
14127
14128 /*
14129 * Process an SIOCGLIFGROUPINFO request.
14130 */
14131 /* ARGSUSED */
14132 int
14133 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
14134 ip_ioctl_cmd_t *ipip, void *dummy)
14135 {
14136 ipmp_grp_t *grp;
14137 lifgroupinfo_t *lifgr;
14138 ip_stack_t *ipst = CONNQ_TO_IPST(q);
14139
14140 /* ip_wput_nondata() verified mp->b_cont->b_cont */
14141 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
14142 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
14143
14144 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
14145 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
14146 rw_exit(&ipst->ips_ipmp_lock);
14147 return (ENOENT);
14148 }
14149 ipmp_grp_info(grp, lifgr);
14150 rw_exit(&ipst->ips_ipmp_lock);
14151 return (0);
14152 }
14153
14154 static void
14155 ill_dl_down(ill_t *ill)
14156 {
14157 /*
14158 * The ill is down; unbind but stay attached since we're still
14159 * associated with a PPA. If we have negotiated DLPI capabilities
14160 * with the data link service provider (IDS_OK) then reset them.
14161 * The interval between unbinding and rebinding is potentially
14162 * unbounded hence we cannot assume things will be the same.
14163 * The DLPI capabilities will be probed again when the data link
14164 * is brought up.
14165 */
14166 mblk_t *mp = ill->ill_unbind_mp;
14167
14168 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
14169
14170 ill->ill_unbind_mp = NULL;
14171 if (mp != NULL) {
14172 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
14173 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
14174 ill->ill_name));
14175 mutex_enter(&ill->ill_lock);
14176 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
14177 mutex_exit(&ill->ill_lock);
14178 /*
14179 * ip_rput does not pass up normal (M_PROTO) DLPI messages
14180 * after ILL_CONDEMNED is set. So in the unplumb case, we call
14181 * ill_capability_dld_disable right away. If this is not
14182 * an unplumb operation then the disable happens on receipt of
14183 * the capab ack via ip_rput_dlpi_writer ->
14184 * ill_capability_ack_thr. In both cases the order of
14185 * the operations seen by DLD is capability disable followed
14186 * by DL_UNBIND. Also the DLD capability disable needs a
14187 * cv_wait'able context.
14188 */
14189 if (ill->ill_state_flags & ILL_CONDEMNED)
14190 ill_capability_dld_disable(ill);
14191 ill_capability_reset(ill, B_FALSE);
14192 ill_dlpi_send(ill, mp);
14193 }
14194
14195 /*
14196 * Toss all of our multicast memberships. We could keep them, but
14197 * then we'd have to do bookkeeping of any joins and leaves performed
14198 * by the application while the interface is down (we can't just
14199 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
14200 * on a downed interface).
14201 */ 14202 ill_leave_multicast(ill); 14203 14204 mutex_enter(&ill->ill_lock); 14205 ill->ill_dl_up = 0; 14206 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); 14207 mutex_exit(&ill->ill_lock); 14208 } 14209 14210 static void 14211 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 14212 { 14213 union DL_primitives *dlp; 14214 t_uscalar_t prim; 14215 boolean_t waitack = B_FALSE; 14216 14217 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 14218 14219 dlp = (union DL_primitives *)mp->b_rptr; 14220 prim = dlp->dl_primitive; 14221 14222 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 14223 dl_primstr(prim), prim, ill->ill_name)); 14224 14225 switch (prim) { 14226 case DL_PHYS_ADDR_REQ: 14227 { 14228 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 14229 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 14230 break; 14231 } 14232 case DL_BIND_REQ: 14233 mutex_enter(&ill->ill_lock); 14234 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 14235 mutex_exit(&ill->ill_lock); 14236 break; 14237 } 14238 14239 /* 14240 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 14241 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 14242 * we only wait for the ACK of the DL_UNBIND_REQ. 14243 */ 14244 mutex_enter(&ill->ill_lock); 14245 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 14246 (prim == DL_UNBIND_REQ)) { 14247 ill->ill_dlpi_pending = prim; 14248 waitack = B_TRUE; 14249 } 14250 14251 mutex_exit(&ill->ill_lock); 14252 putnext(ill->ill_wq, mp); 14253 14254 /* 14255 * There is no ack for DL_NOTIFY_CONF messages 14256 */ 14257 if (waitack && prim == DL_NOTIFY_CONF) 14258 ill_dlpi_done(ill, prim); 14259 } 14260 14261 /* 14262 * Helper function for ill_dlpi_send(). 14263 */ 14264 /* ARGSUSED */ 14265 static void 14266 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 14267 { 14268 ill_dlpi_send(q->q_ptr, mp); 14269 } 14270 14271 /* 14272 * Send a DLPI control message to the driver but make sure there 14273 * is only one outstanding message. Uses ill_dlpi_pending to tell 14274 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 14275 * when an ACK or a NAK is received to process the next queued message. 14276 */ 14277 void 14278 ill_dlpi_send(ill_t *ill, mblk_t *mp) 14279 { 14280 mblk_t **mpp; 14281 14282 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 14283 14284 /* 14285 * To ensure that any DLPI requests for current exclusive operation 14286 * are always completely sent before any DLPI messages for other 14287 * operations, require writer access before enqueuing. 14288 */ 14289 if (!IAM_WRITER_ILL(ill)) { 14290 ill_refhold(ill); 14291 /* qwriter_ip() does the ill_refrele() */ 14292 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 14293 NEW_OP, B_TRUE); 14294 return; 14295 } 14296 14297 mutex_enter(&ill->ill_lock); 14298 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 14299 /* Must queue message. 
Tail insertion */ 14300 mpp = &ill->ill_dlpi_deferred; 14301 while (*mpp != NULL) 14302 mpp = &((*mpp)->b_next); 14303 14304 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 14305 ill->ill_name)); 14306 14307 *mpp = mp; 14308 mutex_exit(&ill->ill_lock); 14309 return; 14310 } 14311 mutex_exit(&ill->ill_lock); 14312 ill_dlpi_dispatch(ill, mp); 14313 } 14314 14315 static void 14316 ill_capability_send(ill_t *ill, mblk_t *mp) 14317 { 14318 ill->ill_capab_pending_cnt++; 14319 ill_dlpi_send(ill, mp); 14320 } 14321 14322 void 14323 ill_capability_done(ill_t *ill) 14324 { 14325 ASSERT(ill->ill_capab_pending_cnt != 0); 14326 14327 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 14328 14329 ill->ill_capab_pending_cnt--; 14330 if (ill->ill_capab_pending_cnt == 0 && 14331 ill->ill_dlpi_capab_state == IDCS_OK) 14332 ill_capability_reset_alloc(ill); 14333 } 14334 14335 /* 14336 * Send all deferred DLPI messages without waiting for their ACKs. 14337 */ 14338 void 14339 ill_dlpi_send_deferred(ill_t *ill) 14340 { 14341 mblk_t *mp, *nextmp; 14342 14343 /* 14344 * Clear ill_dlpi_pending so that the message is not queued in 14345 * ill_dlpi_send(). 14346 */ 14347 mutex_enter(&ill->ill_lock); 14348 ill->ill_dlpi_pending = DL_PRIM_INVAL; 14349 mp = ill->ill_dlpi_deferred; 14350 ill->ill_dlpi_deferred = NULL; 14351 mutex_exit(&ill->ill_lock); 14352 14353 for (; mp != NULL; mp = nextmp) { 14354 nextmp = mp->b_next; 14355 mp->b_next = NULL; 14356 ill_dlpi_send(ill, mp); 14357 } 14358 } 14359 14360 /* 14361 * Check if the DLPI primitive `prim' is pending; print a warning if not. 14362 */ 14363 boolean_t 14364 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 14365 { 14366 t_uscalar_t pending; 14367 14368 mutex_enter(&ill->ill_lock); 14369 if (ill->ill_dlpi_pending == prim) { 14370 mutex_exit(&ill->ill_lock); 14371 return (B_TRUE); 14372 } 14373 14374 /* 14375 * During teardown, ill_dlpi_dispatch() will send DLPI requests 14376 * without waiting, so don't print any warnings in that case. 14377 */ 14378 if (ill->ill_state_flags & ILL_CONDEMNED) { 14379 mutex_exit(&ill->ill_lock); 14380 return (B_FALSE); 14381 } 14382 pending = ill->ill_dlpi_pending; 14383 mutex_exit(&ill->ill_lock); 14384 14385 if (pending == DL_PRIM_INVAL) { 14386 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14387 "received unsolicited ack for %s on %s\n", 14388 dl_primstr(prim), ill->ill_name); 14389 } else { 14390 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14391 "received unexpected ack for %s on %s (expecting %s)\n", 14392 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 14393 } 14394 return (B_FALSE); 14395 } 14396 14397 /* 14398 * Complete the current DLPI operation associated with `prim' on `ill' and 14399 * start the next queued DLPI operation (if any). If there are no queued DLPI 14400 * operations and the ill's current exclusive IPSQ operation has finished 14401 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to 14402 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See 14403 * the comments above ipsq_current_finish() for details. 
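 *
 * An illustrative send/done pairing (hedged; see ill_dlpi_send() and
 * ip_rput_dlpi_writer() for the real flow):
 *
 *	ill_dlpi_send(ill, mp);		DL_BIND_REQ goes down, or is
 *					deferred if another op is pending
 *	... driver replies with DL_BIND_ACK ...
 *	ill_dlpi_done(ill, DL_BIND_REQ);  completes the op and dispatches
 *					the next deferred DLPI message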
14404 */
14405 void
14406 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
14407 {
14408 mblk_t *mp;
14409 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14410 ipxop_t *ipx = ipsq->ipsq_xop;
14411
14412 ASSERT(IAM_WRITER_IPSQ(ipsq));
14413 mutex_enter(&ill->ill_lock);
14414
14415 ASSERT(prim != DL_PRIM_INVAL);
14416 ASSERT(ill->ill_dlpi_pending == prim);
14417
14418 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
14419 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
14420
14421 if ((mp = ill->ill_dlpi_deferred) == NULL) {
14422 ill->ill_dlpi_pending = DL_PRIM_INVAL;
14423 if (ipx->ipx_current_done) {
14424 mutex_enter(&ipx->ipx_lock);
14425 ipx->ipx_current_ipif = NULL;
14426 mutex_exit(&ipx->ipx_lock);
14427 }
14428 cv_signal(&ill->ill_cv);
14429 mutex_exit(&ill->ill_lock);
14430 return;
14431 }
14432
14433 ill->ill_dlpi_deferred = mp->b_next;
14434 mp->b_next = NULL;
14435 mutex_exit(&ill->ill_lock);
14436
14437 ill_dlpi_dispatch(ill, mp);
14438 }
14439
14440 void
14441 conn_delete_ire(conn_t *connp, caddr_t arg)
14442 {
14443 ipif_t *ipif = (ipif_t *)arg;
14444 ire_t *ire;
14445
14446 /*
14447 * Look at the cached ires on conns which have pointers to ipifs.
14448 * We just call ire_refrele which releases the reference
14449 * to the ire. Called when a conn closes. Also called from ipif_free
14450 * to cleanup indirect references to the stale ipif via the cached ire.
14451 */
14452 mutex_enter(&connp->conn_lock);
14453 ire = connp->conn_ire_cache;
14454 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
14455 connp->conn_ire_cache = NULL;
14456 mutex_exit(&connp->conn_lock);
14457 IRE_REFRELE_NOTR(ire);
14458 return;
14459 }
14460 mutex_exit(&connp->conn_lock);
14461
14462 }
14463
14464 /*
14465 * Some operations (e.g., ipif_down()) conditionally delete a number
14466 * of IREs. Those IREs may have been previously cached in the conn structure.
14467 * This ipcl_walk() walker function releases all references to such IREs based
14468 * on the condemned flag.
14469 */
14470 /* ARGSUSED */
14471 void
14472 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
14473 {
14474 ire_t *ire;
14475
14476 mutex_enter(&connp->conn_lock);
14477 ire = connp->conn_ire_cache;
14478 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
14479 connp->conn_ire_cache = NULL;
14480 mutex_exit(&connp->conn_lock);
14481 IRE_REFRELE_NOTR(ire);
14482 return;
14483 }
14484 mutex_exit(&connp->conn_lock);
14485 }
14486
14487 /*
14488 * Take down a specific interface, but don't lose any information about it.
14489 * (Always called as writer.)
14490 * This function goes through the down sequence even if the interface is
14491 * already down. There are 2 reasons.
14492 * a. Currently we permit interface routes that depend on down interfaces
14493 * to be added. This behaviour itself is questionable. However it appears
14494 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
14495 * time. We go thru the cleanup in order to remove these routes.
14496 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
14497 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
14498 * down, but we need to clean up i.e. do ill_dl_down and
14499 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
14500 *
14501 * IP-MT notes:
14502 *
14503 * Model of reference to interfaces.
14504 *
14505 * The following members in ipif_t track references to the ipif.
14506 * int ipif_refcnt; Active reference count
14507 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif
14508 * uint_t ipif_ilm_cnt; Number of ilms referencing this ipif.
14509 *
14510 * The following members in ill_t track references to the ill.
14511 * int ill_refcnt; active refcnt
14512 * uint_t ill_ire_cnt; Number of ires referencing ill
14513 * uint_t ill_nce_cnt; Number of nces referencing ill
14514 * uint_t ill_ilm_cnt; Number of ilms referencing ill
14515 *
14516 * Reference to an ipif or ill can be obtained in any of the following ways.
14517 *
14518 * Through the lookup functions ipif_lookup_* / ill_lookup_*
14519 * Pointers to ipif / ill from other data structures viz ire and conn.
14520 * Implicit reference to the ipif / ill by holding a reference to the ire.
14521 *
14522 * The ipif/ill lookup functions return a reference-held ipif / ill.
14523 * ipif_refcnt and ill_refcnt track the reference counts respectively.
14524 * This is a purely dynamic reference count associated with threads holding
14525 * references to the ipif / ill. Pointers from other structures do not
14526 * count towards this reference count.
14527 *
14528 * ipif_ire_cnt/ill_ire_cnt is the number of ire's
14529 * associated with the ipif/ill. This is incremented whenever a new
14530 * ire is created referencing the ipif/ill. This is done atomically inside
14531 * ire_add_v[46] where the ire is actually added to the ire hash table.
14532 * The count is decremented in ire_inactive where the ire is destroyed.
14533 *
14534 * nce's reference ill's thru nce_ill and the count of nce's associated with
14535 * an ill is recorded in ill_nce_cnt. This is incremented atomically in
14536 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
14537 * table. Similarly it is decremented in ndp_inactive() where the nce
14538 * is destroyed.
14539 *
14540 * ilm's reference to the ipif (for IPv4 ilm's) or the ill (for IPv6 ilm's)
14541 * is incremented in ilm_add_v6() and decremented before the ilm is freed
14542 * in ilm_walker_cleanup() or ilm_delete().
14543 *
14544 * Flow of ioctls involving interface down/up
14545 *
14546 * The following is the sequence of an attempt to set some critical flags on an
14547 * up interface.
14548 * ip_sioctl_flags
14549 * ipif_down
14550 * wait for ipif to be quiescent
14551 * ipif_down_tail
14552 * ip_sioctl_flags_tail
14553 *
14554 * All set ioctls that involve down/up sequence would have a skeleton similar
14555 * to the above. All the *tail functions are called after the refcounts have
14556 * dropped to the appropriate values.
14557 *
14558 * The mechanism to quiesce an ipif is as follows.
14559 *
14560 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
14561 * on the ipif. Callers either pass a flag requesting wait or the lookup
14562 * functions will return NULL.
14563 *
14564 * Delete all ires referencing this ipif
14565 *
14566 * Any thread attempting to do an ipif_refhold on an ipif that has been
14567 * obtained thru a cached pointer will first make sure that
14568 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
14569 * increment the refcount.
14570 *
14571 * The above guarantees that the ipif refcount will eventually come down to
14572 * zero and the ipif will quiesce, once all threads that currently hold a
14573 * reference to the ipif refrelease the ipif. The ipif is quiescent after the
14574 * ipif_refcount has dropped to zero and all ire's associated with this ipif
14575 * have also been ire_inactive'd.
i.e. when ipif_{ire, ilm}_cnt and
14576 * ipif_refcnt both drop to zero. See also: comments above IPIF_DOWN_OK()
14577 * in ip.h
14578 *
14579 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
14580 *
14581 * Threads trying to lookup an ipif or ill can pass a flag requesting
14582 * wait and restart if the ipif / ill cannot be looked up currently.
14583 * For example, bind and route operations (e.g. route add / delete) cannot return
14584 * failure if the ipif is currently undergoing an exclusive operation, and
14585 * hence pass the flag. The mblk is then enqueued in the ipsq and the operation
14586 * is restarted by ipsq_exit() when the current exclusive operation completes.
14587 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
14588 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
14589 * change while the ill_lock is held. Before dropping the ill_lock we acquire
14590 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
14591 * until we release the ipsq_lock, even though the ill/ipif state flags
14592 * can change after we drop the ill_lock.
14593 *
14594 * An attempt to send out a packet using an ipif that is currently
14595 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
14596 * operation and restart it later when the exclusive condition on the ipif ends.
14597 * This is an example of not passing the wait flag to the lookup functions. For
14598 * example an attempt to refhold and use conn->conn_multicast_ipif and send
14599 * out a multicast packet on that ipif will fail while the ipif is
14600 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
14601 * currently IPIF_CHANGING will also fail.
14602 */
14603 int
14604 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
14605 {
14606 ill_t *ill = ipif->ipif_ill;
14607 conn_t *connp;
14608 boolean_t success;
14609 boolean_t ipif_was_up = B_FALSE;
14610 ip_stack_t *ipst = ill->ill_ipst;
14611
14612 ASSERT(IAM_WRITER_IPIF(ipif));
14613
14614 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
14615
14616 if (ipif->ipif_flags & IPIF_UP) {
14617 mutex_enter(&ill->ill_lock);
14618 ipif->ipif_flags &= ~IPIF_UP;
14619 ASSERT(ill->ill_ipif_up_count > 0);
14620 --ill->ill_ipif_up_count;
14621 mutex_exit(&ill->ill_lock);
14622 ipif_was_up = B_TRUE;
14623 /* Update status in SCTP's list */
14624 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
14625 ill_nic_event_dispatch(ipif->ipif_ill,
14626 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
14627 }
14628
14629 /*
14630 * Blow away memberships we established in ipif_multicast_up().
14631 */
14632 ipif_multicast_down(ipif);
14633
14634 /*
14635 * Remove from the mapping for __sin6_src_id. We insert only
14636 * when the address is not INADDR_ANY. As IPv4 addresses are
14637 * stored as mapped addresses, we need to check for mapped
14638 * INADDR_ANY also.
14639 */
14640 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
14641 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
14642 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14643 int err;
14644
14645 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
14646 ipif->ipif_zoneid, ipst);
14647 if (err != 0) {
14648 ip0dbg(("ipif_down: srcid_remove %d\n", err));
14649 }
14650 }
14651
14652 /*
14653 * Delete all IRE's pointing at this ipif or its source address.
14654 */
14655 if (ipif->ipif_isv6) {
14656 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
14657 ipst);
14658 } else {
14659 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
14660 ipst);
14661 }
14662
14663 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
14664 /*
14665 * Since the interface is now down, it may have just become
14666 * inactive. Note that this needs to be done even for an
14667 * ipif_logical_down(), or ARP entries will not get correctly
14668 * restored when the interface comes back up.
14669 */
14670 if (IS_UNDER_IPMP(ill))
14671 ipmp_ill_refresh_active(ill);
14672 }
14673
14674 /*
14675 * Cleaning up the conn_ire_cache of conns must be done only after the
14676 * ires have been deleted above. Otherwise a thread could end up
14677 * caching an ire in a conn after we have finished the cleanup of the
14678 * conn. The caching is done after making sure that the ire is not yet
14679 * condemned. This is also documented in the block comment above ip_output.
14680 */
14681 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
14682 /* Also, delete the ires cached in SCTP */
14683 sctp_ire_cache_flush(ipif);
14684
14685 /*
14686 * Update any other ipifs which have used "our" local address as
14687 * a source address. This entails removing and recreating IRE_INTERFACE
14688 * entries for such ipifs.
14689 */
14690 if (ipif->ipif_isv6)
14691 ipif_update_other_ipifs_v6(ipif);
14692 else
14693 ipif_update_other_ipifs(ipif);
14694
14695 /*
14696 * Take down the neighbor-discovery or ARP entries for this interface.
14697 */
14698 ipif_ndp_down(ipif);
14699
14700 /*
14701 * If mp is NULL the caller will wait for the appropriate refcnt.
14702 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
14703 * and ill_delete -> ipif_free -> ipif_down
14704 */
14705 if (mp == NULL) {
14706 ASSERT(q == NULL);
14707 return (0);
14708 }
14709
14710 if (CONN_Q(q)) {
14711 connp = Q_TO_CONN(q);
14712 mutex_enter(&connp->conn_lock);
14713 } else {
14714 connp = NULL;
14715 }
14716 mutex_enter(&ill->ill_lock);
14717 /*
14718 * Are there any ire's pointing to this ipif that are still active?
14719 * If this is the last ipif going down, are there any ire's pointing
14720 * to this ill that are still active?
14721 */
14722 if (ipif_is_quiescent(ipif)) {
14723 mutex_exit(&ill->ill_lock);
14724 if (connp != NULL)
14725 mutex_exit(&connp->conn_lock);
14726 return (0);
14727 }
14728
14729 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
14730 ill->ill_name, (void *)ill));
14731 /*
14732 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
14733 * drops down, the operation will be restarted by ipif_ill_refrele_tail
14734 * which in turn is called by the last refrele on the ipif/ill/ire.
14735 */
14736 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
14737 if (!success) {
14738 /* The conn is closing. So just return */
14739 ASSERT(connp != NULL);
14740 mutex_exit(&ill->ill_lock);
14741 mutex_exit(&connp->conn_lock);
14742 return (EINTR);
14743 }
14744
14745 mutex_exit(&ill->ill_lock);
14746 if (connp != NULL)
14747 mutex_exit(&connp->conn_lock);
14748 return (EINPROGRESS);
14749 }
14750
14751 void
14752 ipif_down_tail(ipif_t *ipif)
14753 {
14754 ill_t *ill = ipif->ipif_ill;
14755
14756 /*
14757 * Skip any loopback interface (null wq).
14758 * If this is the last logical interface on the ill,
14759 * have ill_dl_down tell the driver we are gone (unbind).
14760 * Note that lun 0 can ipif_down even though
14761 * there are other logical units that are up.
14762 * This occurs e.g.
when we change a "significant" IFF_ flag. 14763 */ 14764 if (ill->ill_wq != NULL && !ill->ill_logical_down && 14765 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 14766 ill->ill_dl_up) { 14767 ill_dl_down(ill); 14768 } 14769 ill->ill_logical_down = 0; 14770 14771 /* 14772 * Has to be after removing the routes in ipif_down_delete_ire. 14773 */ 14774 ipif_resolver_down(ipif); 14775 14776 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 14777 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); 14778 } 14779 14780 /* 14781 * Bring interface logically down without bringing the physical interface 14782 * down e.g. when the netmask is changed. This avoids long-lasting link 14783 * negotiations between an Ethernet interface and certain switches. 14784 */ 14785 static int 14786 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 14787 { 14788 /* 14789 * The ill_logical_down flag is a transient flag. It is set here 14790 * and is cleared once the down has completed in ipif_down_tail. 14791 * This flag does not indicate whether the ill stream is in the 14792 * DL_BOUND state with the driver. Instead this flag is used by 14793 * ipif_down_tail to determine whether to DL_UNBIND the stream with 14794 * the driver. The state of the ill stream, i.e. whether it is 14795 * DL_BOUND with the driver or not, is indicated by the ill_dl_up flag. 14796 */ 14797 ipif->ipif_ill->ill_logical_down = 1; 14798 return (ipif_down(ipif, q, mp)); 14799 } 14800 14801 /* 14802 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 14803 * Whether or not the usesrc client ILL is already part of a usesrc group, 14804 * an ire_stq with the matching usesrc client ILL will 14805 * locate the IRE's that need to be deleted. We want IREs to be created 14806 * with the new source address. 14807 */ 14808 static void 14809 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 14810 { 14811 ill_t *ucill = (ill_t *)ill_arg; 14812 14813 ASSERT(IAM_WRITER_ILL(ucill)); 14814 14815 if (ire->ire_stq == NULL) 14816 return; 14817 14818 if ((ire->ire_type == IRE_CACHE) && 14819 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 14820 ire_delete(ire); 14821 } 14822 14823 /* 14824 * ire_walk routine to delete every IRE dependent on the interface 14825 * address that is going down. (Always called as writer.) 14826 * Works for both v4 and v6. 14827 * In addition to checking for ire_ipif matches, it also checks for 14828 * IRE_CACHE entries which have the same source address as the 14829 * disappearing ipif, since ipif_select_source might have picked 14830 * that source. Note that ipif_down/ipif_update_other_ipifs takes 14831 * care of any IRE_INTERFACE with the disappearing source address. 14832 */ 14833 static void 14834 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 14835 { 14836 ipif_t *ipif = (ipif_t *)ipif_arg; 14837 14838 ASSERT(IAM_WRITER_IPIF(ipif)); 14839 if (ire->ire_ipif == NULL) 14840 return; 14841 14842 if (ire->ire_ipif != ipif) { 14843 /* 14844 * Look for a matching source address.
14845 */ 14846 if (ire->ire_type != IRE_CACHE) 14847 return; 14848 if (ipif->ipif_flags & IPIF_NOLOCAL) 14849 return; 14850 14851 if (ire->ire_ipversion == IPV4_VERSION) { 14852 if (ire->ire_src_addr != ipif->ipif_src_addr) 14853 return; 14854 } else { 14855 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 14856 &ipif->ipif_v6lcl_addr)) 14857 return; 14858 } 14859 ire_delete(ire); 14860 return; 14861 } 14862 /* 14863 * ire_delete() will do an ire_flush_cache which will delete 14864 * all ire_ipif matches 14865 */ 14866 ire_delete(ire); 14867 } 14868 14869 /* 14870 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 14871 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 14872 * 2) when an interface is brought up or down (on that ill). 14873 * This ensures that the IRE_CACHE entries don't retain stale source 14874 * address selection results. 14875 */ 14876 void 14877 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 14878 { 14879 ill_t *ill = (ill_t *)ill_arg; 14880 14881 ASSERT(IAM_WRITER_ILL(ill)); 14882 ASSERT(ire->ire_type == IRE_CACHE); 14883 14884 /* 14885 * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches 14886 * ill, but we only want to delete the IRE if ire_ipif matches. 14887 */ 14888 ASSERT(ire->ire_ipif != NULL); 14889 if (ill == ire->ire_ipif->ipif_ill) 14890 ire_delete(ire); 14891 } 14892 14893 /* 14894 * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this 14895 * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references 14896 * the IPMP ill. 14897 */ 14898 void 14899 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 14900 { 14901 ill_t *ill = (ill_t *)ill_arg; 14902 14903 ASSERT(IAM_WRITER_ILL(ill)); 14904 ASSERT(ire->ire_type == IRE_CACHE); 14905 14906 /* 14907 * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches 14908 * ill, but we only want to delete the IRE if ire_stq matches. 14909 */ 14910 if (ire->ire_stq->q_ptr == ill_arg) 14911 ire_delete(ire); 14912 } 14913 14914 /* 14915 * Delete all the IREs whose ire_stq's reference any ill in the same IPMP 14916 * group as `ill_arg'. Used by ipmp_ill_deactivate() to flush all IRE_CACHE 14917 * entries for the illgrp. 14918 */ 14919 void 14920 ill_grp_cache_delete(ire_t *ire, char *ill_arg) 14921 { 14922 ill_t *ill = (ill_t *)ill_arg; 14923 14924 ASSERT(IAM_WRITER_ILL(ill)); 14925 14926 if (ire->ire_type == IRE_CACHE && 14927 IS_IN_SAME_ILLGRP((ill_t *)ire->ire_stq->q_ptr, ill)) { 14928 ire_delete(ire); 14929 } 14930 } 14931 14932 /* 14933 * Delete all broadcast IREs with a source address on `ill_arg'. 14934 */ 14935 static void 14936 ill_broadcast_delete(ire_t *ire, char *ill_arg) 14937 { 14938 ill_t *ill = (ill_t *)ill_arg; 14939 14940 ASSERT(IAM_WRITER_ILL(ill)); 14941 ASSERT(ire->ire_type == IRE_BROADCAST); 14942 14943 if (ire->ire_ipif->ipif_ill == ill) 14944 ire_delete(ire); 14945 } 14946 14947 /* 14948 * Initiate deallocate of an IPIF. Always called as writer. Called by 14949 * ill_delete or ip_sioctl_removeif. 14950 */ 14951 static void 14952 ipif_free(ipif_t *ipif) 14953 { 14954 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 14955 14956 ASSERT(IAM_WRITER_IPIF(ipif)); 14957 14958 if (ipif->ipif_recovery_id != 0) 14959 (void) untimeout(ipif->ipif_recovery_id); 14960 ipif->ipif_recovery_id = 0; 14961 14962 /* Remove conn references */ 14963 reset_conn_ipif(ipif); 14964 14965 /* 14966 * Make sure we have valid net and subnet broadcast ire's for the 14967 * other ipif's which share them with this ipif. 
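 * For example, two ipifs on the same /24 subnet share the IRE_BROADCAST entries for x.y.z.0 and x.y.z.255; ipif_check_bcast_ires() ensures such shared broadcast ire's survive on one of the remaining ipifs rather than vanishing with the ipif being freed. (Illustrative note; see ipif_check_bcast_ires() for the authoritative logic.)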
14968 */ 14969 if (!ipif->ipif_isv6) 14970 ipif_check_bcast_ires(ipif); 14971 14972 /* 14973 * Take down the interface. We can be called either from ill_delete 14974 * or from ip_sioctl_removeif. 14975 */ 14976 (void) ipif_down(ipif, NULL, NULL); 14977 14978 /* 14979 * Now that the interface is down, there's no chance it can still 14980 * become a duplicate. Cancel any timer that may have been set while 14981 * tearing down. 14982 */ 14983 if (ipif->ipif_recovery_id != 0) 14984 (void) untimeout(ipif->ipif_recovery_id); 14985 ipif->ipif_recovery_id = 0; 14986 14987 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14988 /* Remove pointers to this ill in the multicast routing tables */ 14989 reset_mrt_vif_ipif(ipif); 14990 /* If necessary, clear the cached source ipif rotor. */ 14991 if (ipif->ipif_ill->ill_src_ipif == ipif) 14992 ipif->ipif_ill->ill_src_ipif = NULL; 14993 rw_exit(&ipst->ips_ill_g_lock); 14994 } 14995 14996 static void 14997 ipif_free_tail(ipif_t *ipif) 14998 { 14999 mblk_t *mp; 15000 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15001 15002 /* 15003 * Free state for additional IRE_IF_[NO]RESOLVER ire's. 15004 */ 15005 mutex_enter(&ipif->ipif_saved_ire_lock); 15006 mp = ipif->ipif_saved_ire_mp; 15007 ipif->ipif_saved_ire_mp = NULL; 15008 mutex_exit(&ipif->ipif_saved_ire_lock); 15009 freemsg(mp); 15010 15011 /* 15012 * Need to hold both ill_g_lock and ill_lock while 15013 * inserting or removing an ipif from the linked list 15014 * of ipifs hanging off the ill. 15015 */ 15016 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15017 15018 ASSERT(ilm_walk_ipif(ipif) == 0); 15019 15020 #ifdef DEBUG 15021 ipif_trace_cleanup(ipif); 15022 #endif 15023 15024 /* Ask SCTP to take it out of its list */ 15025 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 15026 15027 /* Get it out of the ILL interface list. */ 15028 ipif_remove(ipif); 15029 rw_exit(&ipst->ips_ill_g_lock); 15030 15031 mutex_destroy(&ipif->ipif_saved_ire_lock); 15032 15033 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 15034 ASSERT(ipif->ipif_recovery_id == 0); 15035 15036 /* Free the memory. */ 15037 mi_free(ipif); 15038 } 15039 15040 /* 15041 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 15042 * is zero. 15043 */ 15044 void 15045 ipif_get_name(const ipif_t *ipif, char *buf, int len) 15046 { 15047 char lbuf[LIFNAMSIZ]; 15048 char *name; 15049 size_t name_len; 15050 15051 buf[0] = '\0'; 15052 name = ipif->ipif_ill->ill_name; 15053 name_len = ipif->ipif_ill->ill_name_length; 15054 if (ipif->ipif_id != 0) { 15055 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 15056 ipif->ipif_id); 15057 name = lbuf; 15058 name_len = mi_strlen(name) + 1; 15059 } 15060 len -= 1; 15061 buf[len] = '\0'; 15062 len = MIN(len, name_len); 15063 bcopy(name, buf, len); 15064 } 15065 15066 /* 15067 * Find an IPIF based on the name passed in. Names can be of the form <phys> 15068 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the 15069 * implied unit id is zero. <phys> must correspond to the name of an ILL. 15070 * (May be called as writer.)
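 * For illustration (hypothetical names): "le0" maps to ill "le0" with implied unit id 0; "le0:3" maps to ill "le0" with unit id 3; "le0:03" is rejected because of the leading zero (see the alias checks below).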
15071 */ 15072 static ipif_t * 15073 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 15074 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 15075 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 15076 { 15077 char *cp; 15078 char *endp; 15079 long id; 15080 ill_t *ill; 15081 ipif_t *ipif; 15082 uint_t ire_type; 15083 boolean_t did_alloc = B_FALSE; 15084 ipsq_t *ipsq; 15085 15086 if (error != NULL) 15087 *error = 0; 15088 15089 /* 15090 * If the caller wants us to create the ipif, make sure we have a 15091 * valid zoneid. 15092 */ 15093 ASSERT(!do_alloc || zoneid != ALL_ZONES); 15094 15095 if (namelen == 0) { 15096 if (error != NULL) 15097 *error = ENXIO; 15098 return (NULL); 15099 } 15100 15101 *exists = B_FALSE; 15102 /* Look for a colon in the name. */ 15103 endp = &name[namelen]; 15104 for (cp = endp; --cp > name; ) { 15105 if (*cp == IPIF_SEPARATOR_CHAR) 15106 break; 15107 } 15108 15109 if (*cp == IPIF_SEPARATOR_CHAR) { 15110 /* 15111 * Reject any non-decimal aliases for logical 15112 * interfaces. Aliases with leading zeroes 15113 * are also rejected as they introduce ambiguity 15114 * in the naming of the interfaces. 15115 * In order to conform to existing semantics, 15116 * and to not break any programs/scripts relying 15117 * on that behaviour, if<0>:0 is considered to be 15118 * a valid interface. 15119 * 15120 * If alias has two or more digits and the first 15121 * is zero, fail. 15122 */ 15123 if (&cp[2] < endp && cp[1] == '0') { 15124 if (error != NULL) 15125 *error = EINVAL; 15126 return (NULL); 15127 } 15128 } 15129 15130 if (cp <= name) { 15131 cp = endp; 15132 } else { 15133 *cp = '\0'; 15134 } 15135 15136 /* 15137 * Look up the ILL, based on the portion of the name 15138 * before the colon. ill_lookup_on_name returns a held ill. 15139 * did_alloc is temporary, used to check whether the ill 15140 * exists already; if so, ill_lookup_on_name will clear it. 15141 */ 15142 ill = ill_lookup_on_name(name, do_alloc, isv6, 15143 q, mp, func, error, &did_alloc, ipst); 15144 if (cp != endp) 15145 *cp = IPIF_SEPARATOR_CHAR; 15146 if (ill == NULL) 15147 return (NULL); 15148 15149 /* Establish the unit number in the name. */ 15150 id = 0; 15151 if (cp < endp && *endp == '\0') { 15152 /* If there was a colon, the unit number follows. */ 15153 cp++; 15154 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 15155 ill_refrele(ill); 15156 if (error != NULL) 15157 *error = ENXIO; 15158 return (NULL); 15159 } 15160 } 15161 15162 GRAB_CONN_LOCK(q); 15163 mutex_enter(&ill->ill_lock); 15164 /* Now see if there is an IPIF with this unit number. */ 15165 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15166 if (ipif->ipif_id == id) { 15167 if (zoneid != ALL_ZONES && 15168 zoneid != ipif->ipif_zoneid && 15169 ipif->ipif_zoneid != ALL_ZONES) { 15170 mutex_exit(&ill->ill_lock); 15171 RELEASE_CONN_LOCK(q); 15172 ill_refrele(ill); 15173 if (error != NULL) 15174 *error = ENXIO; 15175 return (NULL); 15176 } 15177 /* 15178 * The block comment at the start of ipif_down 15179 * explains the use of the macros used below. 15180 */ 15181 if (IPIF_CAN_LOOKUP(ipif)) { 15182 ipif_refhold_locked(ipif); 15183 mutex_exit(&ill->ill_lock); 15184 if (!did_alloc) 15185 *exists = B_TRUE; 15186 /* 15187 * Drop locks before calling ill_refrele 15188 * since it can potentially call into 15189 * ipif_ill_refrele_tail which can end up 15190 * trying to acquire any lock.
15191 */ 15192 RELEASE_CONN_LOCK(q); 15193 ill_refrele(ill); 15194 return (ipif); 15195 } else if (IPIF_CAN_WAIT(ipif, q)) { 15196 ipsq = ill->ill_phyint->phyint_ipsq; 15197 mutex_enter(&ipsq->ipsq_lock); 15198 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 15199 mutex_exit(&ill->ill_lock); 15200 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 15201 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 15202 mutex_exit(&ipsq->ipsq_lock); 15203 RELEASE_CONN_LOCK(q); 15204 ill_refrele(ill); 15205 if (error != NULL) 15206 *error = EINPROGRESS; 15207 return (NULL); 15208 } 15209 } 15210 } 15211 RELEASE_CONN_LOCK(q); 15212 15213 if (!do_alloc) { 15214 mutex_exit(&ill->ill_lock); 15215 ill_refrele(ill); 15216 if (error != NULL) 15217 *error = ENXIO; 15218 return (NULL); 15219 } 15220 15221 /* 15222 * If none found, atomically allocate and return a new one. 15223 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 15224 * to support "receive only" use of lo0:1 etc., as is still done 15225 * below as an initial guess. 15226 * However, this is now likely to be overridden later in ipif_up_done() 15227 * when we know for sure what address has been configured on the 15228 * interface, since we might have more than one loopback interface 15229 * with a loopback address, e.g. in the case of zones, and all the 15230 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 15231 */ 15232 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 15233 ire_type = IRE_LOOPBACK; 15234 else 15235 ire_type = IRE_LOCAL; 15236 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); 15237 if (ipif != NULL) 15238 ipif_refhold_locked(ipif); 15239 else if (error != NULL) 15240 *error = ENOMEM; 15241 mutex_exit(&ill->ill_lock); 15242 ill_refrele(ill); 15243 return (ipif); 15244 } 15245 15246 /* 15247 * This routine is called whenever a new address comes up on an ipif. If 15248 * we are configured to respond to address mask requests, then we are supposed 15249 * to broadcast an address mask reply at this time. This routine is also 15250 * called if we are already up, but a netmask change is made. This is legal 15251 * but might not make the system manager very popular. (May be called 15252 * as writer.)
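 * The reply is a single mblk of REPLY_LEN bytes laid out as an IPv4 header, an 8-byte ICMP header of type ICMP_ADDRESS_MASK_REPLY, and the 4-byte netmask: | ipha_t | icmph_t | netmask (IP_ADDR_LEN) |.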
15253 */ 15254 void 15255 ipif_mask_reply(ipif_t *ipif) 15256 { 15257 icmph_t *icmph; 15258 ipha_t *ipha; 15259 mblk_t *mp; 15260 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15261 15262 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 15263 15264 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 15265 return; 15266 15267 /* ICMP mask reply is IPv4 only */ 15268 ASSERT(!ipif->ipif_isv6); 15269 /* ICMP mask reply is not for a loopback interface */ 15270 ASSERT(ipif->ipif_ill->ill_wq != NULL); 15271 15272 mp = allocb(REPLY_LEN, BPRI_HI); 15273 if (mp == NULL) 15274 return; 15275 mp->b_wptr = mp->b_rptr + REPLY_LEN; 15276 15277 ipha = (ipha_t *)mp->b_rptr; 15278 bzero(ipha, REPLY_LEN); 15279 *ipha = icmp_ipha; 15280 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 15281 ipha->ipha_src = ipif->ipif_src_addr; 15282 ipha->ipha_dst = ipif->ipif_brd_addr; 15283 ipha->ipha_length = htons(REPLY_LEN); 15284 ipha->ipha_ident = 0; 15285 15286 icmph = (icmph_t *)&ipha[1]; 15287 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 15288 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 15289 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 15290 15291 put(ipif->ipif_wq, mp); 15292 15293 #undef REPLY_LEN 15294 } 15295 15296 /* 15297 * When the mtu in the ipif changes, we call this routine through ire_walk 15298 * to update all the relevant IREs. 15299 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 15300 */ 15301 static void 15302 ipif_mtu_change(ire_t *ire, char *ipif_arg) 15303 { 15304 ipif_t *ipif = (ipif_t *)ipif_arg; 15305 15306 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 15307 return; 15308 15309 mutex_enter(&ire->ire_lock); 15310 if (ire->ire_marks & IRE_MARK_PMTU) { 15311 /* Avoid increasing the PMTU */ 15312 ire->ire_max_frag = MIN(ipif->ipif_mtu, ire->ire_max_frag); 15313 if (ire->ire_max_frag == ipif->ipif_mtu) 15314 ire->ire_marks &= ~IRE_MARK_PMTU; 15315 } else { 15316 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 15317 } 15318 mutex_exit(&ire->ire_lock); 15319 } 15320 15321 /* 15322 * When the mtu in the ill changes, we call this routine through ire_walk 15323 * to update all the relevant IREs. 15324 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 15325 */ 15326 void 15327 ill_mtu_change(ire_t *ire, char *ill_arg) 15328 { 15329 ill_t *ill = (ill_t *)ill_arg; 15330 15331 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 15332 return; 15333 15334 mutex_enter(&ire->ire_lock); 15335 if (ire->ire_marks & IRE_MARK_PMTU) { 15336 /* Avoid increasing the PMTU */ 15337 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 15338 ire->ire_max_frag); 15339 if (ire->ire_max_frag == ire->ire_ipif->ipif_mtu) { 15340 ire->ire_marks &= ~IRE_MARK_PMTU; 15341 } 15342 } else { 15343 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, IP_MAXPACKET); 15344 } 15345 mutex_exit(&ire->ire_lock); 15346 } 15347 15348 /* 15349 * Join the ipif specific multicast groups. 15350 * Must be called after a mapping has been set up in the resolver. (Always 15351 * called as writer.) 
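 * For IPv4 this joins the all-hosts group 224.0.0.1; for IPv6 it joins ff02::1 plus the solicited-node group formed by folding the low 24 bits of the local address into ff02::1:ff00:0/104, e.g. (hypothetical address) fe80::a00:20ff:fe12:3456 joins ff02::1:ff12:3456.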
15352 */ 15353 void 15354 ipif_multicast_up(ipif_t *ipif) 15355 { 15356 int err; 15357 ill_t *ill; 15358 15359 ASSERT(IAM_WRITER_IPIF(ipif)); 15360 15361 ill = ipif->ipif_ill; 15362 15363 ip1dbg(("ipif_multicast_up\n")); 15364 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 15365 return; 15366 15367 if (ipif->ipif_isv6) { 15368 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 15369 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 15370 15371 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 15372 15373 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 15374 return; 15375 15376 ip1dbg(("ipif_multicast_up - addmulti\n")); 15377 15378 /* 15379 * Join the all hosts multicast address. We skip this for 15380 * underlying IPMP interfaces since they should be invisible. 15381 */ 15382 if (!IS_UNDER_IPMP(ill)) { 15383 err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, 15384 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15385 if (err != 0) { 15386 ip0dbg(("ipif_multicast_up: " 15387 "all_hosts_mcast failed %d\n", err)); 15388 return; 15389 } 15390 ipif->ipif_joined_allhosts = 1; 15391 } 15392 15393 /* 15394 * Enable multicast for the solicited node multicast address 15395 */ 15396 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 15397 err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, 15398 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15399 if (err != 0) { 15400 ip0dbg(("ipif_multicast_up: solicited MC" 15401 " failed %d\n", err)); 15402 if (ipif->ipif_joined_allhosts) { 15403 (void) ip_delmulti_v6(&v6allmc, ill, 15404 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15405 ipif->ipif_joined_allhosts = 0; 15406 } 15407 return; 15408 } 15409 } 15410 } else { 15411 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 15412 return; 15413 15414 /* Join the all hosts multicast address */ 15415 ip1dbg(("ipif_multicast_up - addmulti\n")); 15416 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 15417 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15418 if (err) { 15419 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 15420 return; 15421 } 15422 } 15423 ipif->ipif_multicast_up = 1; 15424 } 15425 15426 /* 15427 * Blow away any multicast groups that we joined in ipif_multicast_up(). 15428 * (Explicit memberships are blown away in ill_leave_multicast() when the 15429 * ill is brought down.) 15430 */ 15431 void 15432 ipif_multicast_down(ipif_t *ipif) 15433 { 15434 int err; 15435 15436 ASSERT(IAM_WRITER_IPIF(ipif)); 15437 15438 ip1dbg(("ipif_multicast_down\n")); 15439 if (!ipif->ipif_multicast_up) 15440 return; 15441 15442 ip1dbg(("ipif_multicast_down - delmulti\n")); 15443 15444 if (!ipif->ipif_isv6) { 15445 err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE, 15446 B_TRUE); 15447 if (err != 0) 15448 ip0dbg(("ipif_multicast_down: failed %d\n", err)); 15449 15450 ipif->ipif_multicast_up = 0; 15451 return; 15452 } 15453 15454 /* 15455 * Leave the all-hosts multicast address. 
15456 */ 15457 if (ipif->ipif_joined_allhosts) { 15458 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 15459 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15460 if (err != 0) { 15461 ip0dbg(("ipif_multicast_down: all_hosts_mcast " 15462 "failed %d\n", err)); 15463 } 15464 ipif->ipif_joined_allhosts = 0; 15465 } 15466 15467 /* 15468 * Disable multicast for the solicited node multicast address 15469 */ 15470 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 15471 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 15472 15473 ipv6_multi.s6_addr32[3] |= 15474 ipif->ipif_v6lcl_addr.s6_addr32[3]; 15475 15476 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 15477 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15478 if (err != 0) { 15479 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 15480 err)); 15481 } 15482 } 15483 15484 ipif->ipif_multicast_up = 0; 15485 } 15486 15487 /* 15488 * Used when an interface comes up to recreate any extra routes on this 15489 * interface. 15490 */ 15491 static ire_t ** 15492 ipif_recover_ire(ipif_t *ipif) 15493 { 15494 mblk_t *mp; 15495 ire_t **ipif_saved_irep; 15496 ire_t **irep; 15497 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15498 15499 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 15500 ipif->ipif_id)); 15501 15502 mutex_enter(&ipif->ipif_saved_ire_lock); 15503 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 15504 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 15505 if (ipif_saved_irep == NULL) { 15506 mutex_exit(&ipif->ipif_saved_ire_lock); 15507 return (NULL); 15508 } 15509 15510 irep = ipif_saved_irep; 15511 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 15512 ire_t *ire; 15513 queue_t *rfq; 15514 queue_t *stq; 15515 ifrt_t *ifrt; 15516 uchar_t *src_addr; 15517 uchar_t *gateway_addr; 15518 ushort_t type; 15519 15520 /* 15521 * When the ire was initially created and then added in 15522 * ip_rt_add(), it was created either using ipif->ipif_net_type 15523 * in the case of a traditional interface route, or as one of 15524 * the IRE_OFFSUBNET types (with the exception of 15525 * IRE_HOST type ires, which are created by icmp_redirect() and 15526 * which we don't need to save or recover). In the case where 15527 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 15528 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 15529 * to satisfy software like GateD and Sun Cluster which creates 15530 * routes using the loopback interface's address as a 15531 * gateway. 15532 * 15533 * As ifrt->ifrt_type reflects the already updated ire_type, 15534 * ire_create() will be called in the same way here as 15535 * in ip_rt_add(), namely using ipif->ipif_net_type when 15536 * the route looks like a traditional interface route (where 15537 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 15538 * the saved ifrt->ifrt_type. This means that in the case where 15539 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 15540 * ire_create() will be an IRE_LOOPBACK; it will then be turned 15541 * into an IRE_IF_NORESOLVER and then added by ire_add(). 15542 */ 15543 ifrt = (ifrt_t *)mp->b_rptr; 15544 ASSERT(ifrt->ifrt_type != IRE_CACHE); 15545 if (ifrt->ifrt_type & IRE_INTERFACE) { 15546 rfq = NULL; 15547 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 15548 ? ipif->ipif_rq : ipif->ipif_wq; 15549 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15550 ?
(uint8_t *)&ifrt->ifrt_src_addr 15551 : (uint8_t *)&ipif->ipif_src_addr; 15552 gateway_addr = NULL; 15553 type = ipif->ipif_net_type; 15554 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 15555 /* Recover multiroute broadcast IRE. */ 15556 rfq = ipif->ipif_rq; 15557 stq = ipif->ipif_wq; 15558 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15559 ? (uint8_t *)&ifrt->ifrt_src_addr 15560 : (uint8_t *)&ipif->ipif_src_addr; 15561 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 15562 type = ifrt->ifrt_type; 15563 } else { 15564 rfq = NULL; 15565 stq = NULL; 15566 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15567 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 15568 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 15569 type = ifrt->ifrt_type; 15570 } 15571 15572 /* 15573 * Create a copy of the IRE with the saved address and netmask. 15574 */ 15575 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 15576 "0x%x/0x%x\n", 15577 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 15578 ntohl(ifrt->ifrt_addr), 15579 ntohl(ifrt->ifrt_mask))); 15580 ire = ire_create( 15581 (uint8_t *)&ifrt->ifrt_addr, 15582 (uint8_t *)&ifrt->ifrt_mask, 15583 src_addr, 15584 gateway_addr, 15585 &ifrt->ifrt_max_frag, 15586 NULL, 15587 rfq, 15588 stq, 15589 type, 15590 ipif, 15591 0, 15592 0, 15593 0, 15594 ifrt->ifrt_flags, 15595 &ifrt->ifrt_iulp_info, 15596 NULL, 15597 NULL, 15598 ipst); 15599 15600 if (ire == NULL) { 15601 mutex_exit(&ipif->ipif_saved_ire_lock); 15602 kmem_free(ipif_saved_irep, 15603 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 15604 return (NULL); 15605 } 15606 15607 /* 15608 * Some software (for example, GateD and Sun Cluster) attempts 15609 * to create (what amount to) IRE_PREFIX routes with the 15610 * loopback address as the gateway. This is primarily done to 15611 * set up prefixes with the RTF_REJECT flag set (for example, 15612 * when generating aggregate routes.) 15613 * 15614 * If the IRE type (as defined by ipif->ipif_net_type) is 15615 * IRE_LOOPBACK, then we map the request into an 15616 * IRE_IF_NORESOLVER. 15617 */ 15618 if (ipif->ipif_net_type == IRE_LOOPBACK) 15619 ire->ire_type = IRE_IF_NORESOLVER; 15620 /* 15621 * ire held by ire_add; it will be refrele'd towards 15622 * the end of ipif_up_done 15623 */ 15624 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 15625 *irep = ire; 15626 irep++; 15627 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 15628 } 15629 mutex_exit(&ipif->ipif_saved_ire_lock); 15630 return (ipif_saved_irep); 15631 } 15632 15633 /* 15634 * Used to set the netmask and broadcast address to default values when the 15635 * interface is brought up. (Always called as writer.) 15636 */ 15637 static void 15638 ipif_set_default(ipif_t *ipif) 15639 { 15640 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 15641 15642 if (!ipif->ipif_isv6) { 15643 /* 15644 * Interface holds an IPv4 address. Default 15645 * mask is the natural netmask. 15646 */ 15647 if (!ipif->ipif_net_mask) { 15648 ipaddr_t v4mask; 15649 15650 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 15651 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 15652 } 15653 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 15654 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 15655 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 15656 } else { 15657 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 15658 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 15659 } 15660 /* 15661 * NOTE: SunOS 4.X does this even if the broadcast address 15662 * has already been set, thus we do the same here.
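 * For example, with subnet 192.168.1.0 and netmask 255.255.255.0 the default computed below is 192.168.1.0 | ~255.255.255.0, i.e. 192.168.1.255.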
15663 */ 15664 if (ipif->ipif_flags & IPIF_BROADCAST) { 15665 ipaddr_t v4addr; 15666 15667 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 15668 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 15669 } 15670 } else { 15671 /* 15672 * Interface holds an IPv6-only address. Default 15673 * mask is all-ones. 15674 */ 15675 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 15676 ipif->ipif_v6net_mask = ipv6_all_ones; 15677 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 15678 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 15679 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 15680 } else { 15681 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 15682 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 15683 } 15684 } 15685 } 15686 15687 /* 15688 * Return 0 if this address can be used as local address without causing 15689 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 15690 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 15691 * Note that the same IPv6 link-local address is allowed as long as the ills 15692 * are not on the same link. 15693 */ 15694 int 15695 ip_addr_availability_check(ipif_t *new_ipif) 15696 { 15697 in6_addr_t our_v6addr; 15698 ill_t *ill; 15699 ipif_t *ipif; 15700 ill_walk_context_t ctx; 15701 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 15702 15703 ASSERT(IAM_WRITER_IPIF(new_ipif)); 15704 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 15705 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 15706 15707 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 15708 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 15709 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 15710 return (0); 15711 15712 our_v6addr = new_ipif->ipif_v6lcl_addr; 15713 15714 if (new_ipif->ipif_isv6) 15715 ill = ILL_START_WALK_V6(&ctx, ipst); 15716 else 15717 ill = ILL_START_WALK_V4(&ctx, ipst); 15718 15719 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 15720 for (ipif = ill->ill_ipif; ipif != NULL; 15721 ipif = ipif->ipif_next) { 15722 if ((ipif == new_ipif) || 15723 !(ipif->ipif_flags & IPIF_UP) || 15724 (ipif->ipif_flags & IPIF_UNNUMBERED) || 15725 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 15726 &our_v6addr)) 15727 continue; 15728 15729 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 15730 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 15731 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 15732 ipif->ipif_flags |= IPIF_UNNUMBERED; 15733 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 15734 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 15735 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 15736 continue; 15737 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 15738 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 15739 continue; 15740 else if (new_ipif->ipif_ill == ill) 15741 return (EADDRINUSE); 15742 else 15743 return (EADDRNOTAVAIL); 15744 } 15745 } 15746 15747 return (0); 15748 } 15749 15750 /* 15751 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 15752 * IREs for the ipif. 15753 * When the routine returns EINPROGRESS then mp has been consumed and 15754 * the ioctl will be acked from ip_rput_dlpi. 15755 */ 15756 int 15757 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 15758 { 15759 ill_t *ill = ipif->ipif_ill; 15760 boolean_t isv6 = ipif->ipif_isv6; 15761 int err = 0; 15762 boolean_t success; 15763 uint_t ipif_orig_id; 15764 ip_stack_t *ipst = ill->ill_ipst; 15765 15766 ASSERT(IAM_WRITER_IPIF(ipif)); 15767 15768 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 15769 15770 /* Shouldn't get here if it is already up. 
*/ 15771 if (ipif->ipif_flags & IPIF_UP) 15772 return (EALREADY); 15773 15774 /* 15775 * If this is a request to bring up a data address on an interface 15776 * under IPMP, then move the address to its IPMP meta-interface and 15777 * try to bring it up. One complication is that the zeroth ipif for 15778 * an ill is special, in that every ill always has one, and that code 15779 * throughout IP dereferences ill->ill_ipif without holding any locks. 15780 */ 15781 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && 15782 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { 15783 ipif_t *stubipif = NULL, *moveipif = NULL; 15784 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 15785 15786 /* 15787 * The ipif being brought up should be quiesced. If it's not, 15788 * something has gone amiss and we need to bail out. (If it's 15789 * quiesced, we know it will remain so via IPIF_CHANGING.) 15790 */ 15791 mutex_enter(&ill->ill_lock); 15792 if (!ipif_is_quiescent(ipif)) { 15793 mutex_exit(&ill->ill_lock); 15794 return (EINVAL); 15795 } 15796 mutex_exit(&ill->ill_lock); 15797 15798 /* 15799 * If we're going to need to allocate ipifs, do it prior 15800 * to starting the move (and grabbing locks). 15801 */ 15802 if (ipif->ipif_id == 0) { 15803 moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 15804 B_FALSE); 15805 stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 15806 B_FALSE); 15807 if (moveipif == NULL || stubipif == NULL) { 15808 mi_free(moveipif); 15809 mi_free(stubipif); 15810 return (ENOMEM); 15811 } 15812 } 15813 15814 /* 15815 * Grab or transfer the ipif to move. During the move, keep 15816 * ill_g_lock held to prevent any ill walker threads from 15817 * seeing things in an inconsistent state. 15818 */ 15819 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15820 if (ipif->ipif_id != 0) { 15821 ipif_remove(ipif); 15822 } else { 15823 ipif_transfer(ipif, moveipif, stubipif); 15824 ipif = moveipif; 15825 } 15826 15827 /* 15828 * Place the ipif on the IPMP ill. If the zeroth ipif on 15829 * the IPMP ill is a stub (0.0.0.0 down address) then we 15830 * replace that one. Otherwise, pick the next available slot. 15831 */ 15832 ipif->ipif_ill = ipmp_ill; 15833 ipif_orig_id = ipif->ipif_id; 15834 15835 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { 15836 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); 15837 ipif = ipmp_ill->ill_ipif; 15838 } else { 15839 ipif->ipif_id = -1; 15840 if (ipif_insert(ipif, B_FALSE) != 0) { 15841 /* 15842 * No more available ipif_id's -- put it back 15843 * on the original ill and fail the operation. 15844 * Since we're writer on the ill, we can be 15845 * sure our old slot is still available. 15846 */ 15847 ipif->ipif_id = ipif_orig_id; 15848 ipif->ipif_ill = ill; 15849 if (ipif_orig_id == 0) { 15850 ipif_transfer(ipif, ill->ill_ipif, 15851 NULL); 15852 } else { 15853 VERIFY(ipif_insert(ipif, B_FALSE) == 0); 15854 } 15855 rw_exit(&ipst->ips_ill_g_lock); 15856 return (ENOMEM); 15857 } 15858 } 15859 rw_exit(&ipst->ips_ill_g_lock); 15860 15861 /* 15862 * Tell SCTP that the ipif has moved. Note that even if we 15863 * had to allocate a new ipif, the original sequence id was 15864 * preserved and therefore SCTP won't know. 15865 */ 15866 sctp_move_ipif(ipif, ill, ipmp_ill); 15867 15868 /* 15869 * If the ipif being brought up was on slot zero, then we 15870 * first need to bring up the placeholder we stuck there.
In 15871 * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call 15872 * to ipif_up() itself, if we successfully bring up the 15873 * placeholder, we'll check ill_move_ipif and bring it up too. 15874 */ 15875 if (ipif_orig_id == 0) { 15876 ASSERT(ill->ill_move_ipif == NULL); 15877 ill->ill_move_ipif = ipif; 15878 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) 15879 ASSERT(ill->ill_move_ipif == NULL); 15880 if (err != EINPROGRESS) 15881 ill->ill_move_ipif = NULL; 15882 return (err); 15883 } 15884 15885 /* 15886 * Bring it up on the IPMP ill. 15887 */ 15888 return (ipif_up(ipif, q, mp)); 15889 } 15890 15891 /* Skip arp/ndp for any loopback interface. */ 15892 if (ill->ill_wq != NULL) { 15893 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 15894 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 15895 15896 if (!ill->ill_dl_up) { 15897 /* 15898 * ill_dl_up is not yet set, i.e. we have yet to 15899 * DL_BIND with the driver and this is the first 15900 * logical interface on the ill to become "up". 15901 * Tell the driver to get going (via DL_BIND_REQ). 15902 * Note that changing "significant" IFF_ flags, 15903 * address/netmask etc. causes a down/up dance, but 15904 * does not cause an unbind (DL_UNBIND) with the driver. 15905 */ 15906 return (ill_dl_up(ill, ipif, mp, q)); 15907 } 15908 15909 /* 15910 * ipif_resolver_up may end up sending an 15911 * AR_INTERFACE_UP message to ARP, which would, in 15912 * turn, send a DLPI message to the driver. ioctls are 15913 * serialized and so we cannot send more than one 15914 * interface up message at a time. If ipif_resolver_up 15915 * does send an interface up message to ARP, we get 15916 * EINPROGRESS and we will complete in ip_arp_done. 15917 */ 15918 15919 ASSERT(connp != NULL || !CONN_Q(q)); 15920 if (connp != NULL) 15921 mutex_enter(&connp->conn_lock); 15922 mutex_enter(&ill->ill_lock); 15923 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 15924 mutex_exit(&ill->ill_lock); 15925 if (connp != NULL) 15926 mutex_exit(&connp->conn_lock); 15927 if (!success) 15928 return (EINTR); 15929 15930 /* 15931 * Crank up the resolver. For IPv6, this cranks up the 15932 * external resolver if one is configured, but even if an 15933 * external resolver isn't configured, it must be called to 15934 * reset DAD state. For IPv6, if an external resolver is not 15935 * being used, ipif_resolver_up() will never return 15936 * EINPROGRESS, so we can always call ipif_ndp_up() here. 15937 * Note that if an external resolver is being used, there's no 15938 * need to call ipif_ndp_up() since it will do nothing. 15939 */ 15940 err = ipif_resolver_up(ipif, Res_act_initial); 15941 if (err == EINPROGRESS) { 15942 /* We will complete it in ip_arp_done() */ 15943 return (err); 15944 } 15945 15946 if (isv6 && err == 0) 15947 err = ipif_ndp_up(ipif, B_TRUE); 15948 15949 ASSERT(err != EINPROGRESS); 15950 mp = ipsq_pending_mp_get(ipsq, &connp); 15951 ASSERT(mp != NULL); 15952 if (err != 0) 15953 return (err); 15954 } else { 15955 /* 15956 * Interfaces without underlying hardware don't do duplicate 15957 * address detection. 15958 */ 15959 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 15960 ipif->ipif_addr_ready = 1; 15961 } 15962 15963 err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif); 15964 if (err == 0 && ill->ill_move_ipif != NULL) { 15965 ipif = ill->ill_move_ipif; 15966 ill->ill_move_ipif = NULL; 15967 return (ipif_up(ipif, q, mp)); 15968 } 15969 return (err); 15970 } 15971 15972 /* 15973 * Perform a bind for the physical device.
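 * A sketch of the DLPI exchange this starts: IP sends DL_BIND_REQ(ill_sap, DL_CLDLS) down to the driver, which answers with DL_BIND_ACK on success or DL_ERROR_ACK on failure.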
15974 * When the routine returns EINPROGRESS then mp has been consumed and 15975 * the ioctl will be acked from ip_rput_dlpi. 15976 * Allocate an unbind message and save it until ipif_down. 15977 */ 15978 static int 15979 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 15980 { 15981 areq_t *areq; 15982 mblk_t *areq_mp = NULL; 15983 mblk_t *bind_mp = NULL; 15984 mblk_t *unbind_mp = NULL; 15985 conn_t *connp; 15986 boolean_t success; 15987 uint16_t sap_addr; 15988 15989 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 15990 ASSERT(IAM_WRITER_ILL(ill)); 15991 ASSERT(mp != NULL); 15992 15993 /* Create a resolver cookie for ARP */ 15994 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 15995 areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); 15996 if (areq_mp == NULL) 15997 return (ENOMEM); 15998 15999 freemsg(ill->ill_resolver_mp); 16000 ill->ill_resolver_mp = areq_mp; 16001 areq = (areq_t *)areq_mp->b_rptr; 16002 sap_addr = ill->ill_sap; 16003 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 16004 } 16005 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 16006 DL_BIND_REQ); 16007 if (bind_mp == NULL) 16008 goto bad; 16009 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 16010 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 16011 16012 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 16013 if (unbind_mp == NULL) 16014 goto bad; 16015 16016 /* 16017 * Record state needed to complete this operation when the 16018 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 16019 */ 16020 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 16021 ASSERT(connp != NULL || !CONN_Q(q)); 16022 GRAB_CONN_LOCK(q); 16023 mutex_enter(&ipif->ipif_ill->ill_lock); 16024 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 16025 mutex_exit(&ipif->ipif_ill->ill_lock); 16026 RELEASE_CONN_LOCK(q); 16027 if (!success) 16028 goto bad; 16029 16030 /* 16031 * Save the unbind message for ill_dl_down(); it will be consumed when 16032 * the interface goes down. 16033 */ 16034 ASSERT(ill->ill_unbind_mp == NULL); 16035 ill->ill_unbind_mp = unbind_mp; 16036 16037 ill_dlpi_send(ill, bind_mp); 16038 /* Send down link-layer capabilities probe if not already done. */ 16039 ill_capability_probe(ill); 16040 16041 /* 16042 * Sysid used to rely on the fact that netboots set domainname 16043 * and the like. Now that miniroot boots aren't strictly netboots 16044 * and miniroot network configuration is driven from userland 16045 * these things still need to be set. This situation can be detected 16046 * by comparing the interface being configured here to the one 16047 * dhcifname was set to reference by the boot loader. Once sysid is 16048 * converted to use dhcp_ipc_getinfo() this call can go away. 16049 */ 16050 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 16051 (strcmp(ill->ill_name, dhcifname) == 0) && 16052 (strlen(srpc_domain) == 0)) { 16053 if (dhcpinit() != 0) 16054 cmn_err(CE_WARN, "no cached dhcp response"); 16055 } 16056 16057 /* 16058 * This operation will complete in ip_rput_dlpi with either 16059 * a DL_BIND_ACK or DL_ERROR_ACK. 16060 */ 16061 return (EINPROGRESS); 16062 bad: 16063 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 16064 16065 freemsg(bind_mp); 16066 freemsg(unbind_mp); 16067 return (ENOMEM); 16068 } 16069 16070 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 16071 16072 /* 16073 * DLPI and ARP are up. 16074 * Create all the IREs associated with an interface and bring up multicast.
16075 * Set the interface flag and finish other initialization 16076 * that potentially had to be deferred until after DL_BIND_ACK. 16077 */ 16078 int 16079 ipif_up_done(ipif_t *ipif) 16080 { 16081 ire_t *ire_array[20]; 16082 ire_t **irep = ire_array; 16083 ire_t **irep1; 16084 ipaddr_t net_mask = 0; 16085 ipaddr_t subnet_mask, route_mask; 16086 ill_t *ill = ipif->ipif_ill; 16087 queue_t *stq; 16088 ipif_t *src_ipif; 16089 ipif_t *tmp_ipif; 16090 boolean_t flush_ire_cache = B_TRUE; 16091 int err = 0; 16092 ire_t **ipif_saved_irep = NULL; 16093 int ipif_saved_ire_cnt; 16094 int cnt; 16095 boolean_t src_ipif_held = B_FALSE; 16096 boolean_t loopback = B_FALSE; 16097 ip_stack_t *ipst = ill->ill_ipst; 16098 16099 ip1dbg(("ipif_up_done(%s:%u)\n", 16100 ipif->ipif_ill->ill_name, ipif->ipif_id)); 16101 /* Check if this is a loopback interface */ 16102 if (ipif->ipif_ill->ill_wq == NULL) 16103 loopback = B_TRUE; 16104 16105 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 16106 /* 16107 * If all other interfaces for this ill are down or DEPRECATED, 16108 * or otherwise unsuitable for source address selection, remove 16109 * any IRE_CACHE entries for this ill to make sure source 16110 * address selection gets to take this new ipif into account. 16111 * No need to hold ill_lock while traversing the ipif list since 16112 * we are the writer. 16113 */ 16114 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 16115 tmp_ipif = tmp_ipif->ipif_next) { 16116 if (((tmp_ipif->ipif_flags & 16117 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 16118 !(tmp_ipif->ipif_flags & IPIF_UP)) || 16119 (tmp_ipif == ipif)) 16120 continue; 16121 /* first usable pre-existing interface */ 16122 flush_ire_cache = B_FALSE; 16123 break; 16124 } 16125 if (flush_ire_cache) 16126 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 16127 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 16128 16129 /* 16130 * Figure out which way the send-to queue should go. Only 16131 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 16132 * should show up here. 16133 */ 16134 switch (ill->ill_net_type) { 16135 case IRE_IF_RESOLVER: 16136 stq = ill->ill_rq; 16137 break; 16138 case IRE_IF_NORESOLVER: 16139 case IRE_LOOPBACK: 16140 stq = ill->ill_wq; 16141 break; 16142 default: 16143 return (EINVAL); 16144 } 16145 16146 if (IS_LOOPBACK(ill)) { 16147 /* 16148 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 16149 * ipif_lookup_on_name(), but in the case of zones we can have 16150 * several loopback addresses on lo0. So all the interfaces with 16151 * loopback addresses need to be marked IRE_LOOPBACK. 16152 */ 16153 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 16154 htonl(INADDR_LOOPBACK)) 16155 ipif->ipif_ire_type = IRE_LOOPBACK; 16156 else 16157 ipif->ipif_ire_type = IRE_LOCAL; 16158 } 16159 16160 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || 16161 ((ipif->ipif_flags & IPIF_DEPRECATED) && 16162 !(ipif->ipif_flags & IPIF_NOFAILOVER))) { 16163 /* 16164 * Can't use our source address.
Select a different 16165 * source address for the IRE_INTERFACE and IRE_LOCAL 16166 */ 16167 src_ipif = ipif_select_source(ipif->ipif_ill, 16168 ipif->ipif_subnet, ipif->ipif_zoneid); 16169 if (src_ipif == NULL) 16170 src_ipif = ipif; /* Last resort */ 16171 else 16172 src_ipif_held = B_TRUE; 16173 } else { 16174 src_ipif = ipif; 16175 } 16176 16177 /* Create all the IREs associated with this interface */ 16178 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 16179 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 16180 16181 /* 16182 * If we're on a labeled system then make sure that zone- 16183 * private addresses have proper remote host database entries. 16184 */ 16185 if (is_system_labeled() && 16186 ipif->ipif_ire_type != IRE_LOOPBACK && 16187 !tsol_check_interface_address(ipif)) 16188 return (EINVAL); 16189 16190 /* Register the source address for __sin6_src_id */ 16191 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 16192 ipif->ipif_zoneid, ipst); 16193 if (err != 0) { 16194 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 16195 return (err); 16196 } 16197 16198 /* If the interface address is set, create the local IRE. */ 16199 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 16200 (void *)ipif, 16201 ipif->ipif_ire_type, 16202 ntohl(ipif->ipif_lcl_addr))); 16203 *irep++ = ire_create( 16204 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 16205 (uchar_t *)&ip_g_all_ones, /* mask */ 16206 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 16207 NULL, /* no gateway */ 16208 &ip_loopback_mtuplus, /* max frag size */ 16209 NULL, 16210 ipif->ipif_rq, /* recv-from queue */ 16211 NULL, /* no send-to queue */ 16212 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 16213 ipif, 16214 0, 16215 0, 16216 0, 16217 (ipif->ipif_flags & IPIF_PRIVATE) ? 16218 RTF_PRIVATE : 0, 16219 &ire_uinfo_null, 16220 NULL, 16221 NULL, 16222 ipst); 16223 } else { 16224 ip1dbg(( 16225 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 16226 ipif->ipif_ire_type, 16227 ntohl(ipif->ipif_lcl_addr), 16228 (uint_t)ipif->ipif_flags)); 16229 } 16230 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 16231 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 16232 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 16233 } else { 16234 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 16235 } 16236 16237 subnet_mask = ipif->ipif_net_mask; 16238 16239 /* 16240 * If mask was not specified, use natural netmask of 16241 * interface address. Also, store this mask back into the 16242 * ipif struct. 16243 */ 16244 if (subnet_mask == 0) { 16245 subnet_mask = net_mask; 16246 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 16247 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 16248 ipif->ipif_v6subnet); 16249 } 16250 16251 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
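 * For a point-to-point ipif the destination is a single peer, so the all-ones host mask (IP_HOST_MASK) is used below; otherwise the route covers the whole subnet and subnet_mask is used.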
*/ 16252 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 16253 ipif->ipif_subnet != INADDR_ANY) { 16254 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 16255 16256 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 16257 route_mask = IP_HOST_MASK; 16258 } else { 16259 route_mask = subnet_mask; 16260 } 16261 16262 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 16263 "creating if IRE ill_net_type 0x%x for 0x%x\n", 16264 (void *)ipif, (void *)ill, 16265 ill->ill_net_type, 16266 ntohl(ipif->ipif_subnet))); 16267 *irep++ = ire_create( 16268 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 16269 (uchar_t *)&route_mask, /* mask */ 16270 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 16271 NULL, /* no gateway */ 16272 &ipif->ipif_mtu, /* max frag */ 16273 NULL, 16274 NULL, /* no recv queue */ 16275 stq, /* send-to queue */ 16276 ill->ill_net_type, /* IF_[NO]RESOLVER */ 16277 ipif, 16278 0, 16279 0, 16280 0, 16281 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 16282 &ire_uinfo_null, 16283 NULL, 16284 NULL, 16285 ipst); 16286 } 16287 16288 /* 16289 * Create any necessary broadcast IREs. 16290 */ 16291 if (ipif->ipif_flags & IPIF_BROADCAST) 16292 irep = ipif_create_bcast_ires(ipif, irep); 16293 16294 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 16295 16296 /* If an earlier ire_create failed, get out now */ 16297 for (irep1 = irep; irep1 > ire_array; ) { 16298 irep1--; 16299 if (*irep1 == NULL) { 16300 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 16301 err = ENOMEM; 16302 goto bad; 16303 } 16304 } 16305 16306 /* 16307 * Need to atomically check for IP address availability under 16308 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 16309 * ills or new ipifs can be added while we are checking availability. 16310 */ 16311 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16312 mutex_enter(&ipst->ips_ip_addr_avail_lock); 16313 /* Mark it up, and increment counters. */ 16314 ipif->ipif_flags |= IPIF_UP; 16315 ill->ill_ipif_up_count++; 16316 err = ip_addr_availability_check(ipif); 16317 mutex_exit(&ipst->ips_ip_addr_avail_lock); 16318 rw_exit(&ipst->ips_ill_g_lock); 16319 16320 if (err != 0) { 16321 /* 16322 * Our address may already be up on the same ill. In this case, 16323 * the ARP entry for our ipif replaced the one for the other 16324 * ipif. So we don't want to delete it (otherwise the other ipif 16325 * would be unable to send packets). 16326 * ip_addr_availability_check() identifies this case for us and 16327 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 16328 * which is the expected error code. 16329 */ 16330 if (err == EADDRINUSE) { 16331 freemsg(ipif->ipif_arp_del_mp); 16332 ipif->ipif_arp_del_mp = NULL; 16333 err = EADDRNOTAVAIL; 16334 } 16335 ill->ill_ipif_up_count--; 16336 ipif->ipif_flags &= ~IPIF_UP; 16337 goto bad; 16338 } 16339 16340 /* 16341 * Add in all newly created IREs. ire_create_bcast() has 16342 * already checked for duplicates of the IRE_BROADCAST type. 16343 */ 16344 for (irep1 = irep; irep1 > ire_array; ) { 16345 irep1--; 16346 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 16347 /* 16348 * refheld by ire_add. 
refrele'd towards the end of the function 16349 */ 16350 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 16351 } 16352 16353 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 16354 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 16355 ipif_saved_irep = ipif_recover_ire(ipif); 16356 16357 if (!loopback) { 16358 /* 16359 * If the broadcast address has been set, make sure it makes 16360 * sense based on the interface address. 16361 * Only match on ill since we are sharing broadcast addresses. 16362 */ 16363 if ((ipif->ipif_brd_addr != INADDR_ANY) && 16364 (ipif->ipif_flags & IPIF_BROADCAST)) { 16365 ire_t *ire; 16366 16367 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 16368 IRE_BROADCAST, ipif, ALL_ZONES, 16369 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); 16370 16371 if (ire == NULL) { 16372 /* 16373 * If there isn't a matching broadcast IRE, 16374 * revert to the default for this netmask. 16375 */ 16376 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16377 mutex_enter(&ipif->ipif_ill->ill_lock); 16378 ipif_set_default(ipif); 16379 mutex_exit(&ipif->ipif_ill->ill_lock); 16380 } else { 16381 ire_refrele(ire); 16382 } 16383 } 16384 16385 } 16386 16387 if (ill->ill_need_recover_multicast) { 16388 /* 16389 * Need to recover all multicast memberships in the driver. 16390 * This had to be deferred until we had attached. The same 16391 * code exists in ipif_up_done_v6() to recover IPv6 16392 * memberships. 16393 * 16394 * Note that it would be preferable to unconditionally do the 16395 * ill_recover_multicast() in ill_dl_up(), but we cannot do 16396 * that since ill_join_allmulti() depends on ill_dl_up being 16397 * set, and it is not set until we receive a DL_BIND_ACK after 16398 * having called ill_dl_up(). 16399 */ 16400 ill_recover_multicast(ill); 16401 } 16402 16403 if (ill->ill_ipif_up_count == 1) { 16404 /* 16405 * Since the interface is now up, it may now be active. 16406 */ 16407 if (IS_UNDER_IPMP(ill)) 16408 ipmp_ill_refresh_active(ill); 16409 16410 /* 16411 * If this is an IPMP interface, we may now be able to 16412 * establish ARP entries. 16413 */ 16414 if (IS_IPMP(ill)) 16415 ipmp_illgrp_refresh_arpent(ill->ill_grp); 16416 } 16417 16418 /* Join the allhosts multicast address */ 16419 ipif_multicast_up(ipif); 16420 16421 /* 16422 * See if anybody else would benefit from our new ipif. 16423 */ 16424 if (!loopback && 16425 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 16426 ill_update_source_selection(ill); 16427 } 16428 16429 for (irep1 = irep; irep1 > ire_array; ) { 16430 irep1--; 16431 if (*irep1 != NULL) { 16432 /* was held in ire_add */ 16433 ire_refrele(*irep1); 16434 } 16435 } 16436 16437 cnt = ipif_saved_ire_cnt; 16438 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 16439 if (*irep1 != NULL) { 16440 /* was held in ire_add */ 16441 ire_refrele(*irep1); 16442 } 16443 } 16444 16445 if (!loopback && ipif->ipif_addr_ready) { 16446 /* Broadcast an address mask reply. */ 16447 ipif_mask_reply(ipif); 16448 } 16449 if (ipif_saved_irep != NULL) { 16450 kmem_free(ipif_saved_irep, 16451 ipif_saved_ire_cnt * sizeof (ire_t *)); 16452 } 16453 if (src_ipif_held) 16454 ipif_refrele(src_ipif); 16455 16456 /* 16457 * This had to be deferred until we had bound. Tell routing sockets and 16458 * others that this interface is up if it looks like the address has 16459 * been validated. Otherwise, if it isn't ready yet, wait for 16460 * duplicate address detection to do its thing.
16461 */ 16462 if (ipif->ipif_addr_ready) 16463 ipif_up_notify(ipif); 16464 return (0); 16465 16466 bad: 16467 ip1dbg(("ipif_up_done: FAILED \n")); 16468 16469 while (irep > ire_array) { 16470 irep--; 16471 if (*irep != NULL) 16472 ire_delete(*irep); 16473 } 16474 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 16475 16476 if (ipif_saved_irep != NULL) { 16477 kmem_free(ipif_saved_irep, 16478 ipif_saved_ire_cnt * sizeof (ire_t *)); 16479 } 16480 if (src_ipif_held) 16481 ipif_refrele(src_ipif); 16482 16483 ipif_resolver_down(ipif); 16484 return (err); 16485 } 16486 16487 /* 16488 * Turn off ARP when the ILLF_NOARP flag is set. 16489 */ 16490 static int 16491 ill_arp_off(ill_t *ill) 16492 { 16493 mblk_t *arp_off_mp = NULL; 16494 mblk_t *arp_on_mp = NULL; 16495 16496 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 16497 16498 ASSERT(IAM_WRITER_ILL(ill)); 16499 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 16500 16501 /* 16502 * If the on message is still around, we've already done 16503 * an arp_off without doing an arp_on, thus there is no 16504 * work needed. 16505 */ 16506 if (ill->ill_arp_on_mp != NULL) 16507 return (0); 16508 16509 /* 16510 * Allocate an ARP on message (to be saved) and an ARP off message 16511 */ 16512 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 16513 if (!arp_off_mp) 16514 return (ENOMEM); 16515 16516 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 16517 if (!arp_on_mp) 16518 goto failed; 16519 16520 ASSERT(ill->ill_arp_on_mp == NULL); 16521 ill->ill_arp_on_mp = arp_on_mp; 16522 16523 /* Send an AR_INTERFACE_OFF request */ 16524 putnext(ill->ill_rq, arp_off_mp); 16525 return (0); 16526 failed: 16527 16528 if (arp_off_mp) 16529 freemsg(arp_off_mp); 16530 return (ENOMEM); 16531 } 16532 16533 /* 16534 * Turn on ARP by turning off the ILLF_NOARP flag. 16535 */ 16536 static int 16537 ill_arp_on(ill_t *ill) 16538 { 16539 mblk_t *mp; 16540 16541 ip1dbg(("ill_arp_on(%s)\n", ill->ill_name)); 16542 16543 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 16544 16545 ASSERT(IAM_WRITER_ILL(ill)); 16546 /* 16547 * Send an AR_INTERFACE_ON request if we have already done 16548 * an arp_off (which allocated the message). 16549 */ 16550 if (ill->ill_arp_on_mp != NULL) { 16551 mp = ill->ill_arp_on_mp; 16552 ill->ill_arp_on_mp = NULL; 16553 putnext(ill->ill_rq, mp); 16554 } 16555 return (0); 16556 } 16557 16558 /* 16559 * Checks for availability of a usable source address (if there is one) when the 16560 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 16561 * this selection is done regardless of the destination.
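 * For example (assuming the usual ifconfig usesrc configuration, e.g. "ifconfig hme0 usesrc vni0"), hme0's ill_usesrc_ifindex refers to vni0 and usable source addresses are looked up among vni0's ipifs.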
16562 */ 16563 boolean_t 16564 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 16565 { 16566 uint_t ifindex; 16567 ipif_t *ipif = NULL; 16568 ill_t *uill; 16569 boolean_t isv6; 16570 ip_stack_t *ipst = ill->ill_ipst; 16571 16572 ASSERT(ill != NULL); 16573 16574 isv6 = ill->ill_isv6; 16575 ifindex = ill->ill_usesrc_ifindex; 16576 if (ifindex != 0) { 16577 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 16578 NULL, ipst); 16579 if (uill == NULL) 16580 return (B_FALSE); 16581 mutex_enter(&uill->ill_lock); 16582 for (ipif = uill->ill_ipif; ipif != NULL; 16583 ipif = ipif->ipif_next) { 16584 if (!IPIF_CAN_LOOKUP(ipif)) 16585 continue; 16586 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 16587 continue; 16588 if (!(ipif->ipif_flags & IPIF_UP)) 16589 continue; 16590 if (ipif->ipif_zoneid != zoneid) 16591 continue; 16592 if ((isv6 && 16593 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 16594 (ipif->ipif_lcl_addr == INADDR_ANY)) 16595 continue; 16596 mutex_exit(&uill->ill_lock); 16597 ill_refrele(uill); 16598 return (B_TRUE); 16599 } 16600 mutex_exit(&uill->ill_lock); 16601 ill_refrele(uill); 16602 } 16603 return (B_FALSE); 16604 } 16605 16606 /* 16607 * IP source address type, sorted from worst to best. For a given type, 16608 * always prefer IP addresses on the same subnet. All-zones addresses are 16609 * suboptimal because they pose problems with unlabeled destinations. 16610 */ 16611 typedef enum { 16612 IPIF_NONE, 16613 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 16614 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 16615 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 16616 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 16617 IPIF_DIFFNET, /* normal and different subnet */ 16618 IPIF_SAMENET /* normal and same subnet */ 16619 } ipif_type_t; 16620 16621 /* 16622 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 16623 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 16624 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 16625 * the first one, unless IPMP is used in which case we round-robin among them; 16626 * see below for more. 16627 * 16628 * Returns NULL if there is no suitable source address for the ill. 16629 * This only occurs when there is no valid source address for the ill. 16630 */ 16631 ipif_t * 16632 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 16633 { 16634 ill_t *usill = NULL; 16635 ill_t *ipmp_ill = NULL; 16636 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 16637 ipif_type_t type, best_type; 16638 tsol_tpc_t *src_rhtp, *dst_rhtp; 16639 ip_stack_t *ipst = ill->ill_ipst; 16640 boolean_t samenet; 16641 16642 if (ill->ill_usesrc_ifindex != 0) { 16643 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 16644 B_FALSE, NULL, NULL, NULL, NULL, ipst); 16645 if (usill != NULL) 16646 ill = usill; /* Select source from usesrc ILL */ 16647 else 16648 return (NULL); 16649 } 16650 16651 /* 16652 * Test addresses should never be used for source address selection, 16653 * so if we were passed one, switch to the IPMP meta-interface. 
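 * (An ill that is IS_UNDER_IPMP is expected to carry only IPMP test
 * addresses; the usable data addresses live on the IPMP meta-interface,
 * which is why we retarget the selection there.)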
16654 */ 16655 if (IS_UNDER_IPMP(ill)) { 16656 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 16657 ill = ipmp_ill; /* Select source from IPMP ill */ 16658 else 16659 return (NULL); 16660 } 16661 16662 /* 16663 * If we're dealing with an unlabeled destination on a labeled system, 16664 * make sure that we ignore source addresses that are incompatible with 16665 * the destination's default label. That destination's default label 16666 * must dominate the minimum label on the source address. 16667 */ 16668 dst_rhtp = NULL; 16669 if (is_system_labeled()) { 16670 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 16671 if (dst_rhtp == NULL) 16672 return (NULL); 16673 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 16674 TPC_RELE(dst_rhtp); 16675 dst_rhtp = NULL; 16676 } 16677 } 16678 16679 /* 16680 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 16681 * can be deleted. But an ipif/ill can get CONDEMNED any time. 16682 * After selecting the right ipif, under ill_lock make sure ipif is 16683 * not condemned, and increment refcnt. If ipif is CONDEMNED, 16684 * we retry. Inside the loop we still need to check for CONDEMNED, 16685 * but not under a lock. 16686 */ 16687 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16688 retry: 16689 /* 16690 * For source address selection, we treat the ipif list as circular 16691 * and continue until we get back to where we started. This allows 16692 * IPMP to vary source address selection (which improves inbound load 16693 * spreading) by caching its last ending point and starting from 16694 * there. NOTE: we don't have to worry about ill_src_ipif changing 16695 * ills since that can't happen on the IPMP ill. 16696 */ 16697 start_ipif = ill->ill_ipif; 16698 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 16699 start_ipif = ill->ill_src_ipif; 16700 16701 ipif = start_ipif; 16702 best_ipif = NULL; 16703 best_type = IPIF_NONE; 16704 do { 16705 if ((next_ipif = ipif->ipif_next) == NULL) 16706 next_ipif = ill->ill_ipif; 16707 16708 if (!IPIF_CAN_LOOKUP(ipif)) 16709 continue; 16710 /* Always skip NOLOCAL and ANYCAST interfaces */ 16711 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 16712 continue; 16713 if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) 16714 continue; 16715 if (ipif->ipif_zoneid != zoneid && 16716 ipif->ipif_zoneid != ALL_ZONES) 16717 continue; 16718 16719 /* 16720 * Interfaces with 0.0.0.0 address are allowed to be UP, but 16721 * are not valid as source addresses. 16722 */ 16723 if (ipif->ipif_lcl_addr == INADDR_ANY) 16724 continue; 16725 16726 /* 16727 * Check compatibility of local address for destination's 16728 * default label if we're on a labeled system. Incompatible 16729 * addresses can't be used at all. 16730 */ 16731 if (dst_rhtp != NULL) { 16732 boolean_t incompat; 16733 16734 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 16735 IPV4_VERSION, B_FALSE); 16736 if (src_rhtp == NULL) 16737 continue; 16738 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || 16739 src_rhtp->tpc_tp.tp_doi != 16740 dst_rhtp->tpc_tp.tp_doi || 16741 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 16742 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 16743 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 16744 src_rhtp->tpc_tp.tp_sl_set_cipso)); 16745 TPC_RELE(src_rhtp); 16746 if (incompat) 16747 continue; 16748 } 16749 16750 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); 16751 16752 if (ipif->ipif_flags & IPIF_DEPRECATED) { 16753 type = samenet ? 
IPIF_SAMENET_DEPRECATED : 16754 IPIF_DIFFNET_DEPRECATED; 16755 } else if (ipif->ipif_zoneid == ALL_ZONES) { 16756 type = samenet ? IPIF_SAMENET_ALLZONES : 16757 IPIF_DIFFNET_ALLZONES; 16758 } else { 16759 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; 16760 } 16761 16762 if (type > best_type) { 16763 best_type = type; 16764 best_ipif = ipif; 16765 if (best_type == IPIF_SAMENET) 16766 break; /* can't get better */ 16767 } 16768 } while ((ipif = next_ipif) != start_ipif); 16769 16770 if ((ipif = best_ipif) != NULL) { 16771 mutex_enter(&ipif->ipif_ill->ill_lock); 16772 if (!IPIF_CAN_LOOKUP(ipif)) { 16773 mutex_exit(&ipif->ipif_ill->ill_lock); 16774 goto retry; 16775 } 16776 ipif_refhold_locked(ipif); 16777 16778 /* 16779 * For IPMP, update the source ipif rotor to the next ipif, 16780 * provided we can look it up. (We must not use it if it's 16781 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after 16782 * ipif_free() checked ill_src_ipif.) 16783 */ 16784 if (IS_IPMP(ill) && ipif != NULL) { 16785 next_ipif = ipif->ipif_next; 16786 if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) 16787 ill->ill_src_ipif = next_ipif; 16788 else 16789 ill->ill_src_ipif = NULL; 16790 } 16791 mutex_exit(&ipif->ipif_ill->ill_lock); 16792 } 16793 16794 rw_exit(&ipst->ips_ill_g_lock); 16795 if (usill != NULL) 16796 ill_refrele(usill); 16797 if (ipmp_ill != NULL) 16798 ill_refrele(ipmp_ill); 16799 if (dst_rhtp != NULL) 16800 TPC_RELE(dst_rhtp); 16801 16802 #ifdef DEBUG 16803 if (ipif == NULL) { 16804 char buf1[INET6_ADDRSTRLEN]; 16805 16806 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 16807 ill->ill_name, 16808 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 16809 } else { 16810 char buf1[INET6_ADDRSTRLEN]; 16811 char buf2[INET6_ADDRSTRLEN]; 16812 16813 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 16814 ipif->ipif_ill->ill_name, 16815 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 16816 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 16817 buf2, sizeof (buf2)))); 16818 } 16819 #endif /* DEBUG */ 16820 return (ipif); 16821 } 16822 16823 /* 16824 * If old_ipif is not NULL, see if ipif was derived from old 16825 * ipif and if so, recreate the interface route by re-doing 16826 * source address selection. This happens when ipif_down -> 16827 * ipif_update_other_ipifs calls us. 16828 * 16829 * If old_ipif is NULL, just redo the source address selection 16830 * if needed. This happens when ipif_up_done calls us. 16831 */ 16832 static void 16833 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 16834 { 16835 ire_t *ire; 16836 ire_t *ipif_ire; 16837 queue_t *stq; 16838 ipif_t *nipif; 16839 ill_t *ill; 16840 boolean_t need_rele = B_FALSE; 16841 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16842 16843 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 16844 ASSERT(IAM_WRITER_IPIF(ipif)); 16845 16846 ill = ipif->ipif_ill; 16847 if (!(ipif->ipif_flags & 16848 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 16849 /* 16850 * Can't possibly have borrowed the source 16851 * from old_ipif. 16852 */ 16853 return; 16854 } 16855 16856 /* 16857 * Is there any work to be done? No work if the address 16858 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 16859 * ipif_select_source() does not borrow addresses from 16860 * NOLOCAL and ANYCAST interfaces). 
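 * (In the check below, a NULL ill_wq is what identifies the loopback
 * interface.)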
16861 */ 16862 if ((old_ipif != NULL) && 16863 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 16864 (old_ipif->ipif_ill->ill_wq == NULL) || 16865 (old_ipif->ipif_flags & 16866 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 16867 return; 16868 } 16869 16870 /* 16871 * Perform the same checks as when creating the 16872 * IRE_INTERFACE in ipif_up_done. 16873 */ 16874 if (!(ipif->ipif_flags & IPIF_UP)) 16875 return; 16876 16877 if ((ipif->ipif_flags & IPIF_NOXMIT) || 16878 (ipif->ipif_subnet == INADDR_ANY)) 16879 return; 16880 16881 ipif_ire = ipif_to_ire(ipif); 16882 if (ipif_ire == NULL) 16883 return; 16884 16885 /* 16886 * We know that ipif uses some other source for its 16887 * IRE_INTERFACE. Is it using the source of this 16888 * old_ipif? 16889 */ 16890 if (old_ipif != NULL && 16891 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 16892 ire_refrele(ipif_ire); 16893 return; 16894 } 16895 if (ip_debug > 2) { 16896 /* ip1dbg */ 16897 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 16898 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 16899 } 16900 16901 stq = ipif_ire->ire_stq; 16902 16903 /* 16904 * Can't use our source address. Select a different 16905 * source address for the IRE_INTERFACE. 16906 */ 16907 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 16908 if (nipif == NULL) { 16909 /* Last resort - all ipifs have IPIF_NOLOCAL */ 16910 nipif = ipif; 16911 } else { 16912 need_rele = B_TRUE; 16913 } 16914 16915 ire = ire_create( 16916 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 16917 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 16918 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 16919 NULL, /* no gateway */ 16920 &ipif->ipif_mtu, /* max frag */ 16921 NULL, /* no src nce */ 16922 NULL, /* no recv from queue */ 16923 stq, /* send-to queue */ 16924 ill->ill_net_type, /* IF_[NO]RESOLVER */ 16925 ipif, 16926 0, 16927 0, 16928 0, 16929 0, 16930 &ire_uinfo_null, 16931 NULL, 16932 NULL, 16933 ipst); 16934 16935 if (ire != NULL) { 16936 ire_t *ret_ire; 16937 int error; 16938 16939 /* 16940 * We don't need ipif_ire anymore. We need to delete 16941 * before we add so that ire_add does not detect 16942 * duplicates. 16943 */ 16944 ire_delete(ipif_ire); 16945 ret_ire = ire; 16946 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); 16947 ASSERT(error == 0); 16948 ASSERT(ire == ret_ire); 16949 /* Held in ire_add */ 16950 ire_refrele(ret_ire); 16951 } 16952 /* 16953 * Either we are falling through from above or could not 16954 * allocate a replacement. 16955 */ 16956 ire_refrele(ipif_ire); 16957 if (need_rele) 16958 ipif_refrele(nipif); 16959 } 16960 16961 /* 16962 * This old_ipif is going away. 16963 * 16964 * Determine if any other ipifs are using our address as 16965 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 16966 * IPIF_DEPRECATED). 16967 * Find the IRE_INTERFACE for such ipifs and recreate them 16968 * to use a different source address following the rules in 16969 * ipif_up_done.
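 * The per-ipif work is done by ipif_recreate_interface_routes(), which is
 * called below for every other ipif on the ill.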
16970 */ 16971 static void 16972 ipif_update_other_ipifs(ipif_t *old_ipif) 16973 { 16974 ipif_t *ipif; 16975 ill_t *ill; 16976 char buf[INET6_ADDRSTRLEN]; 16977 16978 ASSERT(IAM_WRITER_IPIF(old_ipif)); 16979 16980 ill = old_ipif->ipif_ill; 16981 16982 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, 16983 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); 16984 16985 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 16986 if (ipif == old_ipif) 16987 continue; 16988 ipif_recreate_interface_routes(old_ipif, ipif); 16989 } 16990 } 16991 16992 /* ARGSUSED */ 16993 int 16994 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 16995 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 16996 { 16997 /* 16998 * ill_phyint_reinit merged the v4 and v6 into a single 16999 * ipsq. We might not have been able to complete the 17000 * operation in ipif_set_values, if we could not become 17001 * exclusive. If so, restart it here. 17002 */ 17003 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 17004 } 17005 17006 /* 17007 * Can arrive on either a module or a driver queue, but is only 17008 * valid on a module queue; returns an error otherwise. 17009 */ 17010 /* ARGSUSED */ 17011 int 17012 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 17013 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 17014 { 17015 queue_t *q1 = q; 17016 char *cp; 17017 char interf_name[LIFNAMSIZ]; 17018 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 17019 17020 if (q->q_next == NULL) { 17021 ip1dbg(( 17022 "if_unitsel: IF_UNITSEL: no q_next\n")); 17023 return (EINVAL); 17024 } 17025 17026 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 17027 return (EALREADY); 17028 17029 do { 17030 q1 = q1->q_next; 17031 } while (q1->q_next); 17032 cp = q1->q_qinfo->qi_minfo->mi_idname; 17033 (void) sprintf(interf_name, "%s%d", cp, ppa); 17034 17035 /* 17036 * Here we are not going to delay the ioack until after 17037 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So there is no need to save the 17038 * original ioctl message before sending the requests. 17039 */ 17040 return (ipif_set_values(q, mp, interf_name, &ppa)); 17041 } 17042 17043 /* ARGSUSED */ 17044 int 17045 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 17046 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 17047 { 17048 return (ENXIO); 17049 } 17050 17051 /* 17052 * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the 17053 * minimum (but complete) set exists. This is necessary when adding or 17054 * removing an interface to/from an IPMP group, since interfaces in an 17055 * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever 17056 * its test address subnets overlap with IPMP data addresses). It's also 17057 * used to refresh the IRE_BROADCAST entries associated with the IPMP 17058 * interface when the nominated broadcast interface changes. 17059 */ 17060 void 17061 ill_refresh_bcast(ill_t *ill) 17062 { 17063 ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ 17064 ire_t **irep; 17065 ipif_t *ipif; 17066 17067 ASSERT(!ill->ill_isv6); 17068 ASSERT(IAM_WRITER_ILL(ill)); 17069 17070 /* 17071 * Remove any old broadcast IREs. 17072 */ 17073 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, 17074 ill_broadcast_delete, ill, ill); 17075 17076 /* 17077 * Create new ones for any ipifs that are up and broadcast-capable.
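 * ipif_create_bcast_ires() computes the complete set for each such ipif;
 * the 12-entry ire_array above is sized for the most it can produce.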
17078 */ 17079 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17080 if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != 17081 (IPIF_UP|IPIF_BROADCAST)) 17082 continue; 17083 17084 irep = ipif_create_bcast_ires(ipif, ire_array); 17085 while (irep-- > ire_array) { 17086 (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); 17087 if (*irep != NULL) 17088 ire_refrele(*irep); 17089 } 17090 } 17091 } 17092 17093 /* 17094 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 17095 * `irep'. Returns a pointer to the next free `irep' entry (just like 17096 * ire_check_and_create_bcast()). 17097 */ 17098 static ire_t ** 17099 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 17100 { 17101 ipaddr_t addr; 17102 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 17103 ipaddr_t subnetmask = ipif->ipif_net_mask; 17104 int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; 17105 17106 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 17107 17108 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 17109 17110 if (ipif->ipif_lcl_addr == INADDR_ANY || 17111 (ipif->ipif_flags & IPIF_NOLOCAL)) 17112 netmask = htonl(IN_CLASSA_NET); /* fallback */ 17113 17114 irep = ire_check_and_create_bcast(ipif, 0, irep, flags); 17115 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags); 17116 17117 /* 17118 * For backward compatibility, we create net broadcast IREs based on 17119 * the old "IP address class system", since some old machines only 17120 * respond to these class derived net broadcast. However, we must not 17121 * create these net broadcast IREs if the subnetmask is shorter than 17122 * the IP address class based derived netmask. Otherwise, we may 17123 * create a net broadcast address which is the same as an IP address 17124 * on the subnet -- and then TCP will refuse to talk to that address. 17125 */ 17126 if (netmask < subnetmask) { 17127 addr = netmask & ipif->ipif_subnet; 17128 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 17129 irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep, 17130 flags); 17131 } 17132 17133 /* 17134 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 17135 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 17136 * created. Creating these broadcast IREs will only create confusion 17137 * as `addr' will be the same as the IP address. 17138 */ 17139 if (subnetmask != 0xFFFFFFFF) { 17140 addr = ipif->ipif_subnet; 17141 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 17142 irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr, 17143 irep, flags); 17144 } 17145 17146 return (irep); 17147 } 17148 17149 /* 17150 * Broadcast IRE info structure used in the functions below. Since we 17151 * allocate BCAST_COUNT of them on the stack, keep the bit layout compact. 17152 */ 17153 typedef struct bcast_ireinfo { 17154 uchar_t bi_type; /* BCAST_* value from below */ 17155 uchar_t bi_willdie:1, /* will this IRE be going away? */ 17156 bi_needrep:1, /* do we need to replace it? */ 17157 bi_haverep:1, /* have we replaced it? */ 17158 bi_pad:5; 17159 ipaddr_t bi_addr; /* IRE address */ 17160 ipif_t *bi_backup; /* last-ditch ipif to replace it on */ 17161 } bcast_ireinfo_t; 17162 17163 enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT }; 17164 17165 /* 17166 * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and 17167 * return B_TRUE if it should immediately be used to recreate the IRE. 
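 * As a worked illustration (with made-up values): for an ipif with address
 * 10.1.1.3, subnet 10.1.1.0 and netmask 255.255.255.0, ip_net_mask() yields
 * the classful mask 255.0.0.0, so BCAST_NET names 10.0.0.0 (net-ones
 * 10.255.255.255) and BCAST_SUBNET names 10.1.1.0 (subnet-ones 10.1.1.255);
 * BCAST_ALLZEROES and BCAST_ALLONES are always 0.0.0.0 and 255.255.255.255.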
17168 */ 17169 static boolean_t 17170 ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop) 17171 { 17172 ipaddr_t addr; 17173 17174 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie); 17175 17176 switch (bireinfop->bi_type) { 17177 case BCAST_NET: 17178 addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet); 17179 if (addr != bireinfop->bi_addr) 17180 return (B_FALSE); 17181 break; 17182 case BCAST_SUBNET: 17183 if (ipif->ipif_subnet != bireinfop->bi_addr) 17184 return (B_FALSE); 17185 break; 17186 } 17187 17188 bireinfop->bi_needrep = 1; 17189 if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) { 17190 if (bireinfop->bi_backup == NULL) 17191 bireinfop->bi_backup = ipif; 17192 return (B_FALSE); 17193 } 17194 return (B_TRUE); 17195 } 17196 17197 /* 17198 * Create the broadcast IREs described by `bireinfop' on `ipif', and return 17199 * them a la ire_check_and_create_bcast(). 17200 */ 17201 static ire_t ** 17202 ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep) 17203 { 17204 ipaddr_t mask, addr; 17205 17206 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep); 17207 17208 addr = bireinfop->bi_addr; 17209 irep = ire_create_bcast(ipif, addr, irep); 17210 17211 switch (bireinfop->bi_type) { 17212 case BCAST_NET: 17213 mask = ip_net_mask(ipif->ipif_subnet); 17214 irep = ire_create_bcast(ipif, addr | ~mask, irep); 17215 break; 17216 case BCAST_SUBNET: 17217 mask = ipif->ipif_net_mask; 17218 irep = ire_create_bcast(ipif, addr | ~mask, irep); 17219 break; 17220 } 17221 17222 bireinfop->bi_haverep = 1; 17223 return (irep); 17224 } 17225 17226 /* 17227 * Walk through all of the ipifs on `ill' that will be affected by `test_ipif' 17228 * going away, and determine if any of the broadcast IREs (named by `bireinfop') 17229 * that are going away are still needed. If so, have ipif_create_bcast() 17230 * recreate them (except for the deprecated case, as explained below). 17231 */ 17232 static ire_t ** 17233 ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo, 17234 ire_t **irep) 17235 { 17236 int i; 17237 ipif_t *ipif; 17238 17239 ASSERT(!ill->ill_isv6); 17240 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17241 /* 17242 * Skip this ipif if it's (a) the one being taken down, (b) 17243 * not in the same zone, or (c) has no valid local address. 17244 */ 17245 if (ipif == test_ipif || 17246 ipif->ipif_zoneid != test_ipif->ipif_zoneid || 17247 ipif->ipif_subnet == 0 || 17248 (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) != 17249 (IPIF_UP|IPIF_BROADCAST)) 17250 continue; 17251 17252 /* 17253 * For each dying IRE that hasn't yet been replaced, see if 17254 * `ipif' needs it and whether the IRE should be recreated on 17255 * `ipif'. If `ipif' is deprecated, ipif_consider_bcast() 17256 * will return B_FALSE even if `ipif' needs the IRE in the 17257 * hope that we'll later find a needy non-deprecated ipif. 17258 * However, the ipif is recorded in bi_backup for possible 17259 * subsequent use by ipif_check_bcast_ires(). 17260 */ 17261 for (i = 0; i < BCAST_COUNT; i++) { 17262 if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep) 17263 continue; 17264 if (!ipif_consider_bcast(ipif, &bireinfo[i])) 17265 continue; 17266 irep = ipif_create_bcast(ipif, &bireinfo[i], irep); 17267 } 17268 17269 /* 17270 * If we've replaced all of the broadcast IREs that are going 17271 * to be taken down, we know we're done.
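 * (That is, every entry with bi_willdie set also has bi_haverep set.)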
17272 */ 17273 for (i = 0; i < BCAST_COUNT; i++) { 17274 if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep) 17275 break; 17276 } 17277 if (i == BCAST_COUNT) 17278 break; 17279 } 17280 return (irep); 17281 } 17282 17283 /* 17284 * Check if `test_ipif' (which is going away) is associated with any existing 17285 * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were 17286 * using those broadcast IREs. If so, recreate the broadcast IREs on one or 17287 * more of those other ipifs. (The old IREs will be deleted in ipif_down().) 17288 * 17289 * This is necessary because broadcast IREs are shared. In particular, a 17290 * given ill has one set of all-zeroes and all-ones broadcast IREs (for every 17291 * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones, 17292 * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP 17293 * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the 17294 * same zone, they will share the same set of broadcast IREs. 17295 * 17296 * Note: the upper bound of 12 IREs comes from the worst case of replacing all 17297 * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes, 17298 * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones). 17299 */ 17300 static void 17301 ipif_check_bcast_ires(ipif_t *test_ipif) 17302 { 17303 ill_t *ill = test_ipif->ipif_ill; 17304 ire_t *ire, *ire_array[12]; /* see note above */ 17305 ire_t **irep1, **irep = &ire_array[0]; 17306 uint_t i, willdie; 17307 ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet); 17308 bcast_ireinfo_t bireinfo[BCAST_COUNT]; 17309 17310 ASSERT(!test_ipif->ipif_isv6); 17311 ASSERT(IAM_WRITER_IPIF(test_ipif)); 17312 17313 /* 17314 * No broadcast IREs for the LOOPBACK interface 17315 * or others such as point to point and IPIF_NOXMIT. 17316 */ 17317 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || 17318 (test_ipif->ipif_flags & IPIF_NOXMIT)) 17319 return; 17320 17321 bzero(bireinfo, sizeof (bireinfo)); 17322 bireinfo[0].bi_type = BCAST_ALLZEROES; 17323 bireinfo[0].bi_addr = 0; 17324 17325 bireinfo[1].bi_type = BCAST_ALLONES; 17326 bireinfo[1].bi_addr = INADDR_BROADCAST; 17327 17328 bireinfo[2].bi_type = BCAST_NET; 17329 bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask; 17330 17331 if (test_ipif->ipif_net_mask != 0) 17332 mask = test_ipif->ipif_net_mask; 17333 bireinfo[3].bi_type = BCAST_SUBNET; 17334 bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask; 17335 17336 /* 17337 * Figure out what (if any) broadcast IREs will die as a result of 17338 * `test_ipif' going away. If none will die, we're done. 17339 */ 17340 for (i = 0, willdie = 0; i < BCAST_COUNT; i++) { 17341 ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST, 17342 test_ipif, ALL_ZONES, NULL, 17343 (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst); 17344 if (ire != NULL) { 17345 willdie++; 17346 bireinfo[i].bi_willdie = 1; 17347 ire_refrele(ire); 17348 } 17349 } 17350 17351 if (willdie == 0) 17352 return; 17353 17354 /* 17355 * Walk through all the ipifs that will be affected by the dying IREs, 17356 * and recreate the IREs as necessary. Note that all interfaces in an 17357 * IPMP illgrp share the same broadcast IREs, and thus the entire 17358 * illgrp must be walked, starting with the IPMP meta-interface (so 17359 * that broadcast IREs end up on it whenever possible). 
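 * Entries that still lack a replacement after the walk are recreated on
 * the bi_backup ipifs recorded by ipif_consider_bcast().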
17360 */ 17361 if (IS_UNDER_IPMP(ill)) 17362 ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 17363 17364 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 17365 17366 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 17367 ipmp_illgrp_t *illg = ill->ill_grp; 17368 17369 ill = list_head(&illg->ig_if); 17370 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { 17371 for (i = 0; i < BCAST_COUNT; i++) { 17372 if (bireinfo[i].bi_willdie && 17373 !bireinfo[i].bi_haverep) 17374 break; 17375 } 17376 if (i == BCAST_COUNT) 17377 break; 17378 17379 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 17380 } 17381 } 17382 17383 /* 17384 * Scan through the set of broadcast IREs and see if there are any 17385 * that we need to replace that have not yet been replaced. If so, 17386 * replace them using the appropriate backup ipif. 17387 */ 17388 for (i = 0; i < BCAST_COUNT; i++) { 17389 if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep) 17390 irep = ipif_create_bcast(bireinfo[i].bi_backup, 17391 &bireinfo[i], irep); 17392 } 17393 17394 /* 17395 * If we can't create all of them, don't add any of them. (Code in 17396 * ip_wput_ire() and ire_to_ill() assumes that we always have a 17397 * non-loopback copy and loopback copy for a given address.) 17398 */ 17399 for (irep1 = irep; irep1 > ire_array; ) { 17400 irep1--; 17401 if (*irep1 == NULL) { 17402 ip0dbg(("ipif_check_bcast_ires: can't create " 17403 "IRE_BROADCAST, memory allocation failure\n")); 17404 while (irep > ire_array) { 17405 irep--; 17406 if (*irep != NULL) 17407 ire_delete(*irep); 17408 } 17409 return; 17410 } 17411 } 17412 17413 for (irep1 = irep; irep1 > ire_array; ) { 17414 irep1--; 17415 if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0) 17416 ire_refrele(*irep1); /* Held in ire_add */ 17417 } 17418 } 17419 17420 /* 17421 * Extract both the flags (such as IFF_IPV*, including the IFF_CANTCHANGE ones) 17422 * from lifr_flags, and the name from lifr_name. 17423 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 17424 * since ipif_lookup_on_name uses the _isv6 flags when matching. 17425 * Returns EINPROGRESS when mp has been consumed by queueing it on 17426 * ill_pending_mp and the ioctl will complete in ip_rput. 17427 * 17428 * Can arrive on either a module or a driver queue, but is only 17429 * valid on a module queue; returns an error otherwise. 17430 */ 17431 /* ARGSUSED */ 17432 int 17433 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17434 ip_ioctl_cmd_t *ipip, void *if_req) 17435 { 17436 ill_t *ill = q->q_ptr; 17437 phyint_t *phyi; 17438 ip_stack_t *ipst; 17439 struct lifreq *lifr = if_req; 17440 uint64_t new_flags; 17441 17442 ASSERT(ipif != NULL); 17443 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 17444 17445 if (q->q_next == NULL) { 17446 ip1dbg(("ip_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 17447 return (EINVAL); 17448 } 17449 17450 /* 17451 * If we are not writer on 'q', then this interface exists already 17452 * and previous lookups (ip_extract_lifreq()) found this ipif -- 17453 * so return EALREADY. 17454 */ 17455 if (ill != ipif->ipif_ill) 17456 return (EALREADY); 17457 17458 if (ill->ill_name[0] != '\0') 17459 return (EALREADY); 17460 17461 /* 17462 * If there's another ill already with the requested name, ensure 17463 * that it's of the same type. Otherwise, ill_phyint_reinit() will 17464 * fuse together two unrelated ills, which will cause chaos.
17465 */ 17466 ipst = ill->ill_ipst; 17467 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 17468 lifr->lifr_name, NULL); 17469 if (phyi != NULL) { 17470 ill_t *ill_mate = phyi->phyint_illv4; 17471 17472 if (ill_mate == NULL) 17473 ill_mate = phyi->phyint_illv6; 17474 ASSERT(ill_mate != NULL); 17475 17476 if (ill_mate->ill_media->ip_m_mac_type != 17477 ill->ill_media->ip_m_mac_type) { 17478 ip1dbg(("ip_sioctl_slifname: SIOCSLIFNAME: attempt to " 17479 "use the same ill name on differing media\n")); 17480 return (EINVAL); 17481 } 17482 } 17483 17484 /* 17485 * We start off as IFF_IPV4 in ipif_allocate and become 17486 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value. 17487 * The only flags that we read from user space are IFF_IPV4, 17488 * IFF_IPV6, IFF_XRESOLV and IFF_BROADCAST. 17489 * 17490 * This ill has not been inserted into the global list. 17491 * So we are still single-threaded and don't need any lock. 17492 * 17493 * Sanity check the flags. 17494 */ 17495 17496 if ((lifr->lifr_flags & IFF_BROADCAST) && 17497 ((lifr->lifr_flags & IFF_IPV6) || 17498 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 17499 ip1dbg(("ip_sioctl_slifname: link not broadcast capable " 17500 "or IPv6 i.e., no broadcast \n")); 17501 return (EINVAL); 17502 } 17503 17504 new_flags = 17505 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_XRESOLV|IFF_BROADCAST); 17506 17507 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { 17508 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " 17509 "IFF_IPV4 or IFF_IPV6\n")); 17510 return (EINVAL); 17511 } 17512 /* 17513 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 17514 */ 17515 if ((new_flags & IFF_XRESOLV) && !(new_flags & IFF_IPV6) && 17516 !(ipif->ipif_isv6)) { 17517 ip1dbg(("ip_sioctl_slifname: XRESOLV only allowed on " 17518 "IPv6 interface\n")); 17519 return (EINVAL); 17520 } 17521 17522 /* 17523 * We always start off as IPv4, so only need to check for IPv6. 17524 */ 17525 if ((new_flags & IFF_IPV6) != 0) { 17526 ill->ill_flags |= ILLF_IPV6; 17527 ill->ill_flags &= ~ILLF_IPV4; 17528 } 17529 17530 if ((new_flags & IFF_BROADCAST) != 0) 17531 ipif->ipif_flags |= IPIF_BROADCAST; 17532 else 17533 ipif->ipif_flags &= ~IPIF_BROADCAST; 17534 17535 if ((new_flags & IFF_XRESOLV) != 0) 17536 ill->ill_flags |= ILLF_XRESOLV; 17537 else 17538 ill->ill_flags &= ~ILLF_XRESOLV; 17539 17540 /* We started off as V4. */ 17541 if (ill->ill_flags & ILLF_IPV6) { 17542 ill->ill_phyint->phyint_illv6 = ill; 17543 ill->ill_phyint->phyint_illv4 = NULL; 17544 } 17545 17546 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 17547 } 17548 17549 /* ARGSUSED */ 17550 int 17551 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17552 ip_ioctl_cmd_t *ipip, void *if_req) 17553 { 17554 /* 17555 * ill_phyint_reinit merged the v4 and v6 into a single 17556 * ipsq. We might not have been able to complete the 17557 * slifname in ipif_set_values, if we could not become 17558 * exclusive. If so, restart it here. 17559 */ 17560 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 17561 } 17562 17563 /* 17564 * Return a pointer to the ipif which matches the index, IP version type and 17565 * zoneid.
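 * The ipif is returned held (via ipif_refhold_locked()), so the caller
 * must ipif_refrele() it; a `zoneid' of ALL_ZONES matches an ipif in any
 * zone.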
17566 */ 17567 ipif_t * 17568 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 17569 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) 17570 { 17571 ill_t *ill; 17572 ipif_t *ipif = NULL; 17573 17574 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 17575 (q != NULL && mp != NULL && func != NULL && err != NULL)); 17576 17577 if (err != NULL) 17578 *err = 0; 17579 17580 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 17581 if (ill != NULL) { 17582 mutex_enter(&ill->ill_lock); 17583 for (ipif = ill->ill_ipif; ipif != NULL; 17584 ipif = ipif->ipif_next) { 17585 if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES || 17586 zoneid == ipif->ipif_zoneid || 17587 ipif->ipif_zoneid == ALL_ZONES)) { 17588 ipif_refhold_locked(ipif); 17589 break; 17590 } 17591 } 17592 mutex_exit(&ill->ill_lock); 17593 ill_refrele(ill); 17594 if (ipif == NULL && err != NULL) 17595 *err = ENXIO; 17596 } 17597 return (ipif); 17598 } 17599 17600 /* 17601 * Change an existing physical interface's index. If the new index 17602 * is acceptable we update the index and the phyint_list_avl_by_index tree. 17603 * Finally, we update other systems which may have a dependence on the 17604 * index value. 17605 */ 17606 /* ARGSUSED */ 17607 int 17608 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17609 ip_ioctl_cmd_t *ipip, void *ifreq) 17610 { 17611 ill_t *ill; 17612 phyint_t *phyi; 17613 struct ifreq *ifr = (struct ifreq *)ifreq; 17614 struct lifreq *lifr = (struct lifreq *)ifreq; 17615 uint_t old_index, index; 17616 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 17617 avl_index_t where; 17618 17619 if (ipip->ipi_cmd_type == IF_CMD) 17620 index = ifr->ifr_index; 17621 else 17622 index = lifr->lifr_index; 17623 17624 /* 17625 * Only allow on physical interface. Also, index zero is illegal. 17626 */ 17627 ill = ipif->ipif_ill; 17628 phyi = ill->ill_phyint; 17629 if (ipif->ipif_id != 0 || index == 0) { 17630 return (EINVAL); 17631 } 17632 17633 /* If the index is not changing, no work to do */ 17634 if (phyi->phyint_ifindex == index) 17635 return (0); 17636 17637 /* 17638 * Use phyint_exists() to determine if the new interface index 17639 * is already in use. If the index is unused then we need to 17640 * change the phyint's position in the phyint_list_avl_by_index 17641 * tree. If we do not do this, subsequent lookups (using the new 17642 * index value) will not find the phyint. 17643 */ 17644 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 17645 if (phyint_exists(index, ipst)) { 17646 rw_exit(&ipst->ips_ill_g_lock); 17647 return (EEXIST); 17648 } 17649 17650 /* 17651 * The new index is unused. Set it in the phyint. However we must not 17652 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex 17653 * changes. The event must be bound to old ifindex value. 
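 * (Hence the dispatch below happens while phyint_ifindex still holds the
 * old value, with the new index passed as the event payload.)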
17654 */ 17655 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 17656 &index, sizeof (index)); 17657 17658 old_index = phyi->phyint_ifindex; 17659 phyi->phyint_ifindex = index; 17660 17661 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 17662 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17663 &index, &where); 17664 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17665 phyi, where); 17666 rw_exit(&ipst->ips_ill_g_lock); 17667 17668 /* Update SCTP's ILL list */ 17669 sctp_ill_reindex(ill, old_index); 17670 17671 /* Send the routing sockets message */ 17672 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 17673 if (ILL_OTHER(ill)) 17674 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 17675 17676 return (0); 17677 } 17678 17679 /* ARGSUSED */ 17680 int 17681 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17682 ip_ioctl_cmd_t *ipip, void *ifreq) 17683 { 17684 struct ifreq *ifr = (struct ifreq *)ifreq; 17685 struct lifreq *lifr = (struct lifreq *)ifreq; 17686 17687 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 17688 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17689 /* Get the interface index */ 17690 if (ipip->ipi_cmd_type == IF_CMD) { 17691 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 17692 } else { 17693 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 17694 } 17695 return (0); 17696 } 17697 17698 /* ARGSUSED */ 17699 int 17700 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17701 ip_ioctl_cmd_t *ipip, void *ifreq) 17702 { 17703 struct lifreq *lifr = (struct lifreq *)ifreq; 17704 17705 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 17706 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17707 /* Get the interface zone */ 17708 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17709 lifr->lifr_zoneid = ipif->ipif_zoneid; 17710 return (0); 17711 } 17712 17713 /* 17714 * Set the zoneid of an interface. 17715 */ 17716 /* ARGSUSED */ 17717 int 17718 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17719 ip_ioctl_cmd_t *ipip, void *ifreq) 17720 { 17721 struct lifreq *lifr = (struct lifreq *)ifreq; 17722 int err = 0; 17723 boolean_t need_up = B_FALSE; 17724 zone_t *zptr; 17725 zone_status_t status; 17726 zoneid_t zoneid; 17727 17728 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17729 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 17730 if (!is_system_labeled()) 17731 return (ENOTSUP); 17732 zoneid = GLOBAL_ZONEID; 17733 } 17734 17735 /* cannot assign instance zero to a non-global zone */ 17736 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 17737 return (ENOTSUP); 17738 17739 /* 17740 * Cannot assign to a zone that doesn't exist or is shutting down. In 17741 * the event of a race with the zone shutdown processing, since IP 17742 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 17743 * interface will be cleaned up even if the zone is shut down 17744 * immediately after the status check. If the interface can't be brought 17745 * down right away, and the zone is shut down before the restart 17746 * function is called, we resolve the possible races by rechecking the 17747 * zone status in the restart function. 
17748 */ 17749 if ((zptr = zone_find_by_id(zoneid)) == NULL) 17750 return (EINVAL); 17751 status = zone_status_get(zptr); 17752 zone_rele(zptr); 17753 17754 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 17755 return (EINVAL); 17756 17757 if (ipif->ipif_flags & IPIF_UP) { 17758 /* 17759 * If the interface is already marked up, 17760 * we call ipif_down which will take care 17761 * of ditching any IREs that have been set 17762 * up based on the old interface address. 17763 */ 17764 err = ipif_logical_down(ipif, q, mp); 17765 if (err == EINPROGRESS) 17766 return (err); 17767 ipif_down_tail(ipif); 17768 need_up = B_TRUE; 17769 } 17770 17771 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 17772 return (err); 17773 } 17774 17775 static int 17776 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 17777 queue_t *q, mblk_t *mp, boolean_t need_up) 17778 { 17779 int err = 0; 17780 ip_stack_t *ipst; 17781 17782 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 17783 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17784 17785 if (CONN_Q(q)) 17786 ipst = CONNQ_TO_IPST(q); 17787 else 17788 ipst = ILLQ_TO_IPST(q); 17789 17790 /* 17791 * For exclusive stacks we don't allow a different zoneid than 17792 * global. 17793 */ 17794 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 17795 zoneid != GLOBAL_ZONEID) 17796 return (EINVAL); 17797 17798 /* Set the new zone id. */ 17799 ipif->ipif_zoneid = zoneid; 17800 17801 /* Update sctp list */ 17802 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 17803 17804 if (need_up) { 17805 /* 17806 * Now bring the interface back up. If this 17807 * is the only IPIF for the ILL, ipif_up 17808 * will have to re-bind to the device, so 17809 * we may get back EINPROGRESS, in which 17810 * case, this IOCTL will get completed in 17811 * ip_rput_dlpi when we see the DL_BIND_ACK. 17812 */ 17813 err = ipif_up(ipif, q, mp); 17814 } 17815 return (err); 17816 } 17817 17818 /* ARGSUSED */ 17819 int 17820 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17821 ip_ioctl_cmd_t *ipip, void *if_req) 17822 { 17823 struct lifreq *lifr = (struct lifreq *)if_req; 17824 zoneid_t zoneid; 17825 zone_t *zptr; 17826 zone_status_t status; 17827 17828 ASSERT(ipif->ipif_id != 0); 17829 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17830 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 17831 zoneid = GLOBAL_ZONEID; 17832 17833 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 17834 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17835 17836 /* 17837 * We recheck the zone status to resolve the following race condition: 17838 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 17839 * 2) hme0:1 is up and can't be brought down right away; 17840 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 17841 * 3) zone "myzone" is halted; the zone status switches to 17842 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 17843 * the interfaces to remove - hme0:1 is not returned because it's not 17844 * yet in "myzone", so it won't be removed; 17845 * 4) the restart function for SIOCSLIFZONE is called; without the 17846 * status check here, we would have hme0:1 in "myzone" after it's been 17847 * destroyed. 17848 * Note that if the status check fails, we need to bring the interface 17849 * back to its state prior to ip_sioctl_slifzone(), hence the call to 17850 * ipif_up_done[_v6](). 
17851 */ 17852 status = ZONE_IS_UNINITIALIZED; 17853 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 17854 status = zone_status_get(zptr); 17855 zone_rele(zptr); 17856 } 17857 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 17858 if (ipif->ipif_isv6) { 17859 (void) ipif_up_done_v6(ipif); 17860 } else { 17861 (void) ipif_up_done(ipif); 17862 } 17863 return (EINVAL); 17864 } 17865 17866 ipif_down_tail(ipif); 17867 17868 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 17869 B_TRUE)); 17870 } 17871 17872 /* 17873 * Return the number of addresses on `ill' with one or more of the values 17874 * in `set' set and all of the values in `clear' clear. 17875 */ 17876 static uint_t 17877 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) 17878 { 17879 ipif_t *ipif; 17880 uint_t cnt = 0; 17881 17882 ASSERT(IAM_WRITER_ILL(ill)); 17883 17884 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 17885 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) 17886 cnt++; 17887 17888 return (cnt); 17889 } 17890 17891 /* 17892 * Return the number of migratable addresses on `ill' that are under 17893 * application control. 17894 */ 17895 uint_t 17896 ill_appaddr_cnt(const ill_t *ill) 17897 { 17898 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, 17899 IPIF_NOFAILOVER)); 17900 } 17901 17902 /* 17903 * Return the number of point-to-point addresses on `ill'. 17904 */ 17905 uint_t 17906 ill_ptpaddr_cnt(const ill_t *ill) 17907 { 17908 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); 17909 } 17910 17911 /* ARGSUSED */ 17912 int 17913 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17914 ip_ioctl_cmd_t *ipip, void *ifreq) 17915 { 17916 struct lifreq *lifr = ifreq; 17917 17918 ASSERT(q->q_next == NULL); 17919 ASSERT(CONN_Q(q)); 17920 17921 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 17922 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17923 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 17924 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 17925 17926 return (0); 17927 } 17928 17929 /* Find the previous ILL in this usesrc group */ 17930 static ill_t * 17931 ill_prev_usesrc(ill_t *uill) 17932 { 17933 ill_t *ill; 17934 17935 for (ill = uill->ill_usesrc_grp_next; 17936 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 17937 ill = ill->ill_usesrc_grp_next) 17938 /* do nothing */; 17939 return (ill); 17940 } 17941 17942 /* 17943 * Release all members of the usesrc group. This routine is called 17944 * from ill_delete when the interface being unplumbed is the 17945 * group head. 
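 * The group is a circular list linked through ill_usesrc_grp_next: the
 * usesrc ILL itself has an ill_usesrc_ifindex of zero, and every other
 * member is a client with a nonzero ill_usesrc_ifindex, which is how the
 * loop below detects that it has wrapped back around to the head.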
17946 */ 17947 static void 17948 ill_disband_usesrc_group(ill_t *uill) 17949 { 17950 ill_t *next_ill, *tmp_ill; 17951 ip_stack_t *ipst = uill->ill_ipst; 17952 17953 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 17954 next_ill = uill->ill_usesrc_grp_next; 17955 17956 do { 17957 ASSERT(next_ill != NULL); 17958 tmp_ill = next_ill->ill_usesrc_grp_next; 17959 ASSERT(tmp_ill != NULL); 17960 next_ill->ill_usesrc_grp_next = NULL; 17961 next_ill->ill_usesrc_ifindex = 0; 17962 next_ill = tmp_ill; 17963 } while (next_ill->ill_usesrc_ifindex != 0); 17964 uill->ill_usesrc_grp_next = NULL; 17965 } 17966 17967 /* 17968 * Remove the client usesrc ILL from the list and relink it to a new list. 17969 */ 17970 int 17971 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 17972 { 17973 ill_t *ill, *tmp_ill; 17974 ip_stack_t *ipst = ucill->ill_ipst; 17975 17976 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 17977 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 17978 17979 /* 17980 * Fail if the usesrc client ILL passed in is not currently a usesrc 17981 * client (i.e., its ill_usesrc_ifindex is zero), or if the usesrc 17982 * ILL passed in is itself in use as a usesrc client (its 17983 * ill_usesrc_ifindex is nonzero). 17984 */ 17985 if ((ucill->ill_usesrc_ifindex == 0) || 17986 (uill->ill_usesrc_ifindex != 0)) { 17987 return (-1); 17988 } 17989 17990 ill = ill_prev_usesrc(ucill); 17991 ASSERT(ill->ill_usesrc_grp_next != NULL); 17992 17993 /* Remove from the current list */ 17994 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 17995 /* Only two elements in the list */ 17996 ASSERT(ill->ill_usesrc_ifindex == 0); 17997 ill->ill_usesrc_grp_next = NULL; 17998 } else { 17999 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 18000 } 18001 18002 if (ifindex == 0) { 18003 ucill->ill_usesrc_ifindex = 0; 18004 ucill->ill_usesrc_grp_next = NULL; 18005 return (0); 18006 } 18007 18008 ucill->ill_usesrc_ifindex = ifindex; 18009 tmp_ill = uill->ill_usesrc_grp_next; 18010 uill->ill_usesrc_grp_next = ucill; 18011 ucill->ill_usesrc_grp_next = 18012 (tmp_ill != NULL) ? tmp_ill : uill; 18013 return (0); 18014 } 18015 18016 /* 18017 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See synchronization notes in 18018 * ip.c for locking details.
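 * A lifr_index of zero undoes a previous SIOCSLIFUSESRC for the client
 * interface (see the reset_flg handling below).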
18019 */ 18020 /* ARGSUSED */ 18021 int 18022 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 18023 ip_ioctl_cmd_t *ipip, void *ifreq) 18024 { 18025 struct lifreq *lifr = (struct lifreq *)ifreq; 18026 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, 18027 ill_flag_changed = B_FALSE; 18028 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 18029 int err = 0, ret; 18030 uint_t ifindex; 18031 ipsq_t *ipsq = NULL; 18032 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 18033 18034 ASSERT(IAM_WRITER_IPIF(ipif)); 18035 ASSERT(q->q_next == NULL); 18036 ASSERT(CONN_Q(q)); 18037 18038 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 18039 18040 ifindex = lifr->lifr_index; 18041 if (ifindex == 0) { 18042 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 18043 /* non-usesrc group interface, nothing to reset */ 18044 return (0); 18045 } 18046 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 18047 /* valid reset request */ 18048 reset_flg = B_TRUE; 18049 } 18050 18051 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, 18052 ip_process_ioctl, &err, ipst); 18053 if (usesrc_ill == NULL) { 18054 return (err); 18055 } 18056 18057 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 18058 NEW_OP, B_TRUE); 18059 if (ipsq == NULL) { 18060 err = EINPROGRESS; 18061 /* Operation enqueued on the ipsq of the usesrc ILL */ 18062 goto done; 18063 } 18064 18065 /* USESRC isn't currently supported with IPMP */ 18066 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 18067 err = ENOTSUP; 18068 goto done; 18069 } 18070 18071 /* 18072 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 18073 * used by IPMP underlying interfaces, but someone might think it's 18074 * more general and try to use it independently with VNI.) 18075 */ 18076 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 18077 err = ENOTSUP; 18078 goto done; 18079 } 18080 18081 /* 18082 * If the client is already in use as a usesrc ILL, or the usesrc ILL 18083 * is already a client, then return EINVAL. 18084 */ 18085 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 18086 err = EINVAL; 18087 goto done; 18088 } 18089 18090 /* 18091 * If the ill_usesrc_ifindex field is already set to what it needs to 18092 * be then this is a duplicate operation. 18093 */ 18094 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 18095 err = 0; 18096 goto done; 18097 } 18098 18099 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 18100 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 18101 usesrc_ill->ill_isv6)); 18102 18103 /* 18104 * The next step ensures that no new ires will be created referencing 18105 * the client ill, until the ILL_CHANGING flag is cleared. Then 18106 * we go through an ire walk deleting all ire caches that reference 18107 * the client ill. New ires referencing the client ill that are added 18108 * to the ire table before the ILL_CHANGING flag is set will be 18109 * cleaned up by the ire walk below. Attempts to add new ires referencing 18110 * the client ill while the ILL_CHANGING flag is set will fail 18111 * during the ire_add in ire_atomic_start. ire_atomic_start atomically 18112 * checks (under the ill_g_usesrc_lock) that the ire being added 18113 * is not stale, i.e., the ire_stq and ire_ipif are consistent and 18114 * belong to the same usesrc group.
18115 */ 18116 mutex_enter(&usesrc_cli_ill->ill_lock); 18117 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; 18118 mutex_exit(&usesrc_cli_ill->ill_lock); 18119 ill_flag_changed = B_TRUE; 18120 18121 if (ipif->ipif_isv6) 18122 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 18123 ALL_ZONES, ipst); 18124 else 18125 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 18126 ALL_ZONES, ipst); 18127 18128 /* 18129 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 18130 * and the ill_usesrc_ifindex fields 18131 */ 18132 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 18133 18134 if (reset_flg) { 18135 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 18136 if (ret != 0) { 18137 err = EINVAL; 18138 } 18139 rw_exit(&ipst->ips_ill_g_usesrc_lock); 18140 goto done; 18141 } 18142 18143 /* 18144 * Four possibilities to consider: 18145 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 18146 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 18147 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 18148 * 4. Both are part of their respective usesrc groups 18149 */ 18150 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 18151 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 18152 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 18153 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 18154 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 18155 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 18156 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 18157 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 18158 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 18159 /* Insert at head of list */ 18160 usesrc_cli_ill->ill_usesrc_grp_next = 18161 usesrc_ill->ill_usesrc_grp_next; 18162 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 18163 } else { 18164 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 18165 ifindex); 18166 if (ret != 0) 18167 err = EINVAL; 18168 } 18169 rw_exit(&ipst->ips_ill_g_usesrc_lock); 18170 18171 done: 18172 if (ill_flag_changed) { 18173 mutex_enter(&usesrc_cli_ill->ill_lock); 18174 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; 18175 mutex_exit(&usesrc_cli_ill->ill_lock); 18176 } 18177 if (ipsq != NULL) 18178 ipsq_exit(ipsq); 18179 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 18180 ill_refrele(usesrc_ill); 18181 return (err); 18182 } 18183 18184 /* 18185 * comparison function used by avl. 18186 */ 18187 static int 18188 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 18189 { 18190 18191 uint_t index; 18192 18193 ASSERT(phyip != NULL && index_ptr != NULL); 18194 18195 index = *((uint_t *)index_ptr); 18196 /* 18197 * let the phyint with the lowest index be on top. 18198 */ 18199 if (((phyint_t *)phyip)->phyint_ifindex < index) 18200 return (1); 18201 if (((phyint_t *)phyip)->phyint_ifindex > index) 18202 return (-1); 18203 return (0); 18204 } 18205 18206 /* 18207 * comparison function used by avl. 
18208 */ 18209 static int 18210 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 18211 { 18212 ill_t *ill; 18213 int res = 0; 18214 18215 ASSERT(phyip != NULL && name_ptr != NULL); 18216 18217 if (((phyint_t *)phyip)->phyint_illv4) 18218 ill = ((phyint_t *)phyip)->phyint_illv4; 18219 else 18220 ill = ((phyint_t *)phyip)->phyint_illv6; 18221 ASSERT(ill != NULL); 18222 18223 res = strcmp(ill->ill_name, (char *)name_ptr); 18224 if (res > 0) 18225 return (1); 18226 else if (res < 0) 18227 return (-1); 18228 return (0); 18229 } 18230 18231 /* 18232 * This function is called on the unplumb path via ill_glist_delete() when 18233 * there are no ills left on the phyint and thus the phyint can be freed. 18234 */ 18235 static void 18236 phyint_free(phyint_t *phyi) 18237 { 18238 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 18239 18240 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 18241 18242 /* 18243 * If this phyint was an IPMP meta-interface, blow away the group. 18244 * This is safe to do because all of the illgrps have already been 18245 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 18246 * If we're cleaning up as a result of failed initialization, 18247 * phyint_grp may be NULL. 18248 */ 18249 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 18250 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 18251 ipmp_grp_destroy(phyi->phyint_grp); 18252 phyi->phyint_grp = NULL; 18253 rw_exit(&ipst->ips_ipmp_lock); 18254 } 18255 18256 /* 18257 * If this interface was under IPMP, take it out of the group. 18258 */ 18259 if (phyi->phyint_grp != NULL) 18260 ipmp_phyint_leave_grp(phyi); 18261 18262 /* 18263 * Delete the phyint and disassociate its ipsq. The ipsq itself 18264 * will be freed in ipsq_exit(). 18265 */ 18266 phyi->phyint_ipsq->ipsq_phyint = NULL; 18267 phyi->phyint_name[0] = '\0'; 18268 18269 mi_free(phyi); 18270 } 18271 18272 /* 18273 * Attach the ill to the phyint structure which can be shared by both 18274 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 18275 * function is called from ipif_set_values and ill_lookup_on_name (for 18276 * loopback) where we know the name of the ill. We lookup the ill and if 18277 * there is one present already with the name use that phyint. Otherwise 18278 * reuse the one allocated by ill_init. 18279 */ 18280 static void 18281 ill_phyint_reinit(ill_t *ill) 18282 { 18283 boolean_t isv6 = ill->ill_isv6; 18284 phyint_t *phyi_old; 18285 phyint_t *phyi; 18286 avl_index_t where = 0; 18287 ill_t *ill_other = NULL; 18288 ip_stack_t *ipst = ill->ill_ipst; 18289 18290 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 18291 18292 phyi_old = ill->ill_phyint; 18293 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 18294 phyi_old->phyint_illv6 == NULL)); 18295 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 18296 phyi_old->phyint_illv4 == NULL)); 18297 ASSERT(phyi_old->phyint_ifindex == 0); 18298 18299 /* 18300 * Now that our ill has a name, set it in the phyint. 18301 */ 18302 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 18303 18304 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18305 ill->ill_name, &where); 18306 18307 /* 18308 * 1. We grabbed the ill_g_lock before inserting this ill into 18309 * the global list of ills. So no other thread could have located 18310 * this ill and hence the ipsq of this ill is guaranteed to be empty. 18311 * 2. Now locate the other protocol instance of this ill. 18312 * 3. 
Now grab both ill locks in the right order, and the phyint lock of 18313 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 18314 * of neither ill can change. 18315 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 18316 * other ill. 18317 * 5. Release all locks. 18318 */ 18319 18320 /* 18321 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 18322 * we are initializing IPv4. 18323 */ 18324 if (phyi != NULL) { 18325 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 18326 ASSERT(ill_other->ill_phyint != NULL); 18327 ASSERT((isv6 && !ill_other->ill_isv6) || 18328 (!isv6 && ill_other->ill_isv6)); 18329 GRAB_ILL_LOCKS(ill, ill_other); 18330 /* 18331 * We are potentially throwing away phyint_flags which 18332 * could be different from the one that we obtain from 18333 * ill_other->ill_phyint. But it is okay as we are assuming 18334 * that the state maintained within IP is correct. 18335 */ 18336 mutex_enter(&phyi->phyint_lock); 18337 if (isv6) { 18338 ASSERT(phyi->phyint_illv6 == NULL); 18339 phyi->phyint_illv6 = ill; 18340 } else { 18341 ASSERT(phyi->phyint_illv4 == NULL); 18342 phyi->phyint_illv4 = ill; 18343 } 18344 18345 /* 18346 * Delete the old phyint and make its ipsq eligible 18347 * to be freed in ipsq_exit(). 18348 */ 18349 phyi_old->phyint_illv4 = NULL; 18350 phyi_old->phyint_illv6 = NULL; 18351 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 18352 phyi_old->phyint_name[0] = '\0'; 18353 mi_free(phyi_old); 18354 } else { 18355 mutex_enter(&ill->ill_lock); 18356 /* 18357 * We don't need to acquire any lock, since 18358 * the ill is not yet visible globally and we 18359 * have not yet released the ill_g_lock. 18360 */ 18361 phyi = phyi_old; 18362 mutex_enter(&phyi->phyint_lock); 18363 /* XXX We need a recovery strategy here. */ 18364 if (!phyint_assign_ifindex(phyi, ipst)) 18365 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 18366 18367 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18368 (void *)phyi, where); 18369 18370 (void) avl_find(&ipst->ips_phyint_g_list-> 18371 phyint_list_avl_by_index, 18372 &phyi->phyint_ifindex, &where); 18373 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 18374 (void *)phyi, where); 18375 } 18376 18377 /* 18378 * Reassigning ill_phyint automatically reassigns the ipsq also. 18379 * The pending mp is not affected because that is kept on a per-ill basis. 18380 */ 18381 ill->ill_phyint = phyi; 18382 18383 /* 18384 * Now that the phyint's ifindex has been assigned, complete the 18385 * remaining ifindex-dependent initialization. 18386 */ 18387 18388 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 18389 if (ill->ill_isv6) { 18390 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 18391 ill->ill_phyint->phyint_ifindex; 18392 ill->ill_mcast_type = ipst->ips_mld_max_version; 18393 } else { 18394 ill->ill_mcast_type = ipst->ips_igmp_max_version; 18395 } 18396 18397 /* 18398 * Generate an event within the hooks framework to indicate that 18399 * a new interface has just been added to IP. For this event to 18400 * be generated, the network interface must, at least, have an 18401 * ifindex assigned to it. (We don't generate the event for 18402 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 18403 * 18404 * This needs to be run inside the ill_g_lock perimeter to ensure 18405 * that the ordering of delivered events to listeners matches the 18406 * order in which they occurred in the kernel.
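 * (For instance, if a plumb is immediately followed by an unplumb,
 * listeners must observe the NE_PLUMB event first.)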
18407 */ 18408 if (!IS_LOOPBACK(ill)) { 18409 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 18410 ill->ill_name_length); 18411 } 18412 RELEASE_ILL_LOCKS(ill, ill_other); 18413 mutex_exit(&phyi->phyint_lock); 18414 } 18415 18416 /* 18417 * Notify any downstream modules of the name of this interface. 18418 * An M_IOCTL is used even though we don't expect a successful reply. 18419 * Any reply message from the driver (presumably an M_IOCNAK) will 18420 * eventually get discarded somewhere upstream. The message format is 18421 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 18422 * to IP. 18423 */ 18424 static void 18425 ip_ifname_notify(ill_t *ill, queue_t *q) 18426 { 18427 mblk_t *mp1, *mp2; 18428 struct iocblk *iocp; 18429 struct lifreq *lifr; 18430 18431 mp1 = mkiocb(SIOCSLIFNAME); 18432 if (mp1 == NULL) 18433 return; 18434 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 18435 if (mp2 == NULL) { 18436 freeb(mp1); 18437 return; 18438 } 18439 18440 mp1->b_cont = mp2; 18441 iocp = (struct iocblk *)mp1->b_rptr; 18442 iocp->ioc_count = sizeof (struct lifreq); 18443 18444 lifr = (struct lifreq *)mp2->b_rptr; 18445 mp2->b_wptr += sizeof (struct lifreq); 18446 bzero(lifr, sizeof (struct lifreq)); 18447 18448 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 18449 lifr->lifr_ppa = ill->ill_ppa; 18450 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 18451 18452 putnext(q, mp1); 18453 } 18454 18455 static int 18456 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 18457 { 18458 int err; 18459 ip_stack_t *ipst = ill->ill_ipst; 18460 phyint_t *phyi = ill->ill_phyint; 18461 18462 /* Set the obsolete NDD per-interface forwarding name. */ 18463 err = ill_set_ndd_name(ill); 18464 if (err != 0) { 18465 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 18466 err); 18467 } 18468 18469 /* 18470 * Now that ill_name is set, the configuration for the IPMP 18471 * meta-interface can be performed. 18472 */ 18473 if (IS_IPMP(ill)) { 18474 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 18475 /* 18476 * If phyi->phyint_grp is NULL, then this is the first IPMP 18477 * meta-interface and we need to create the IPMP group. 18478 */ 18479 if (phyi->phyint_grp == NULL) { 18480 /* 18481 * If someone has renamed another IPMP group to have 18482 * the same name as our interface, bail. 18483 */ 18484 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 18485 rw_exit(&ipst->ips_ipmp_lock); 18486 return (EEXIST); 18487 } 18488 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 18489 if (phyi->phyint_grp == NULL) { 18490 rw_exit(&ipst->ips_ipmp_lock); 18491 return (ENOMEM); 18492 } 18493 } 18494 rw_exit(&ipst->ips_ipmp_lock); 18495 } 18496 18497 /* Tell downstream modules where they are. */ 18498 ip_ifname_notify(ill, q); 18499 18500 /* 18501 * ill_dl_phys returns EINPROGRESS in the usual case. 18502 * Error cases are ENOMEM ... 18503 */ 18504 err = ill_dl_phys(ill, ipif, mp, q); 18505 18506 /* 18507 * If there is no IRE expiration timer running, get one started. 18508 * igmp and mld timers will be triggered by the first multicast 18509 */ 18510 if (ipst->ips_ip_ire_expire_id == 0) { 18511 /* 18512 * acquire the lock and check again. 
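 * This is the usual check-then-lock-then-recheck pattern: the unlocked
 * test keeps the common case cheap, and rechecking under
 * ips_ip_trash_timer_lock closes the race with another thread starting
 * the timer concurrently.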
18513 */ 18514 mutex_enter(&ipst->ips_ip_trash_timer_lock); 18515 if (ipst->ips_ip_ire_expire_id == 0) { 18516 ipst->ips_ip_ire_expire_id = timeout( 18517 ip_trash_timer_expire, ipst, 18518 MSEC_TO_TICK(ipst->ips_ip_timer_interval)); 18519 } 18520 mutex_exit(&ipst->ips_ip_trash_timer_lock); 18521 } 18522 18523 if (ill->ill_isv6) { 18524 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 18525 if (ipst->ips_mld_slowtimeout_id == 0) { 18526 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 18527 (void *)ipst, 18528 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 18529 } 18530 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 18531 } else { 18532 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 18533 if (ipst->ips_igmp_slowtimeout_id == 0) { 18534 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 18535 (void *)ipst, 18536 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 18537 } 18538 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 18539 } 18540 18541 return (err); 18542 } 18543 18544 /* 18545 * Common routine for ppa and ifname setting. Should be called exclusive. 18546 * 18547 * Returns EINPROGRESS when mp has been consumed by queueing it on 18548 * ill_pending_mp and the ioctl will complete in ip_rput. 18549 * 18550 * NOTE : If ppa is UINT_MAX, we assign the next valid ppa and return 18551 * the new name and new ppa in lifr_name and lifr_ppa respectively. 18552 * For SIOCSLIFNAME, we pass these values back to userland. 18553 */ 18554 static int 18555 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 18556 { 18557 ill_t *ill; 18558 ipif_t *ipif; 18559 ipsq_t *ipsq; 18560 char *ppa_ptr; 18561 char *old_ptr; 18562 char old_char; 18563 int error; 18564 ip_stack_t *ipst; 18565 18566 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 18567 ASSERT(q->q_next != NULL); 18568 ASSERT(interf_name != NULL); 18569 18570 ill = (ill_t *)q->q_ptr; 18571 ipst = ill->ill_ipst; 18572 18573 ASSERT(ill->ill_ipst != NULL); 18574 ASSERT(ill->ill_name[0] == '\0'); 18575 ASSERT(IAM_WRITER_ILL(ill)); 18576 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 18577 ASSERT(ill->ill_ppa == UINT_MAX); 18578 18579 /* The ppa is sent down by ifconfig or is chosen */ 18580 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 18581 return (EINVAL); 18582 } 18583 18584 /* 18585 * Make sure the ppa passed in is the same as the ppa in the name. 18586 * This check is not made when ppa == UINT_MAX; in that case the ppa 18587 * in the name could be anything. The system will choose a ppa and 18588 * update new_ppa_ptr and interf_name to contain the chosen ppa. 18589 */ 18590 if (*new_ppa_ptr != UINT_MAX) { 18591 /* stoi changes the pointer */ 18592 old_ptr = ppa_ptr; 18593 /* 18594 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 18595 * (they don't have an externally visible ppa). We assign one 18596 * here so that we can manage the interface. Note that in 18597 * the past this value was always 0 for DLPI 1 drivers. 18598 */ 18599 if (*new_ppa_ptr == 0) 18600 *new_ppa_ptr = stoi(&old_ptr); 18601 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 18602 return (EINVAL); 18603 } 18604 /* 18605 * Terminate the string before the ppa and 18606 * save the char at that location. 18607 */ 18608 old_char = ppa_ptr[0]; 18609 ppa_ptr[0] = '\0'; 18610 18611 ill->ill_ppa = *new_ppa_ptr; 18612 /* 18613 * Finish as much work now as possible before calling ill_glist_insert 18614 * which makes the ill globally visible and also merges it with the 18615 * other protocol instance of this phyint.
The remaining work is 18616 * done after entering the ipsq which may happen sometime later. 18617 * ill_set_ndd_name occurs after the ill has been made globally visible. 18618 */ 18619 ipif = ill->ill_ipif; 18620 18621 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 18622 ipif_assign_seqid(ipif); 18623 18624 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 18625 ill->ill_flags |= ILLF_IPV4; 18626 18627 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 18628 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 18629 18630 if (ill->ill_flags & ILLF_IPV6) { 18631 18632 ill->ill_isv6 = B_TRUE; 18633 if (ill->ill_rq != NULL) { 18634 ill->ill_rq->q_qinfo = &iprinitv6; 18635 ill->ill_wq->q_qinfo = &ipwinitv6; 18636 } 18637 18638 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 18639 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 18640 ipif->ipif_v6src_addr = ipv6_all_zeros; 18641 ipif->ipif_v6subnet = ipv6_all_zeros; 18642 ipif->ipif_v6net_mask = ipv6_all_zeros; 18643 ipif->ipif_v6brd_addr = ipv6_all_zeros; 18644 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 18645 /* 18646 * Point-to-point or non-multicast-capable 18647 * interfaces won't do NUD unless explicitly 18648 * configured to do so. 18649 */ 18650 if (ipif->ipif_flags & IPIF_POINTOPOINT || 18651 !(ill->ill_flags & ILLF_MULTICAST)) { 18652 ill->ill_flags |= ILLF_NONUD; 18653 } 18654 /* Make sure the IPv4-specific flag is not set on an IPv6 if */ 18655 if (ill->ill_flags & ILLF_NOARP) { 18656 /* 18657 * Note: xresolv interfaces will eventually need 18658 * NOARP set here as well, but that will require 18659 * those external resolvers to have some 18660 * knowledge of that flag and act appropriately. 18661 * Not to be changed at present. 18662 */ 18663 ill->ill_flags &= ~ILLF_NOARP; 18664 } 18665 /* 18666 * Set the ILLF_ROUTER flag according to the global 18667 * IPv6 forwarding policy. 18668 */ 18669 if (ipst->ips_ipv6_forward != 0) 18670 ill->ill_flags |= ILLF_ROUTER; 18671 } else if (ill->ill_flags & ILLF_IPV4) { 18672 ill->ill_isv6 = B_FALSE; 18673 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 18674 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 18675 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 18676 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 18677 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 18678 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 18679 /* 18680 * Set the ILLF_ROUTER flag according to the global 18681 * IPv4 forwarding policy. 18682 */ 18683 if (ipst->ips_ip_g_forward != 0) 18684 ill->ill_flags |= ILLF_ROUTER; 18685 } 18686 18687 ASSERT(ill->ill_phyint != NULL); 18688 18689 /* 18690 * The ipIfStatsIfIndex and ipv6IfIcmpIfIndex assignments will 18691 * be completed in ill_glist_insert -> ill_phyint_reinit 18692 */ 18693 if (!ill_allocate_mibs(ill)) 18694 return (ENOMEM); 18695 18696 /* 18697 * Pick a default sap until we get the DL_INFO_ACK back from 18698 * the driver. 18699 */ 18700 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap : 18701 ill->ill_media->ip_m_ipv4sap; 18702 18703 ill->ill_ifname_pending = 1; 18704 ill->ill_ifname_pending_err = 0; 18705 18706 /* 18707 * When the first ipif comes up in ipif_up_done(), multicast groups 18708 * that were joined while this ill was not bound to the DLPI link need 18709 * to be recovered by ill_recover_multicast().
18710 */ 18711 ill->ill_need_recover_multicast = 1; 18712 18713 ill_refhold(ill); 18714 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 18715 if ((error = ill_glist_insert(ill, interf_name, 18716 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 18717 ill->ill_ppa = UINT_MAX; 18718 ill->ill_name[0] = '\0'; 18719 /* 18720 * Undo the null termination done above. 18721 */ 18722 ppa_ptr[0] = old_char; 18723 rw_exit(&ipst->ips_ill_g_lock); 18724 ill_refrele(ill); 18725 return (error); 18726 } 18727 18728 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 18729 18730 /* 18731 * When we return, the buffer pointed to by interf_name should contain 18732 * the same name as in ill_name. 18733 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX), 18734 * the buffer pointed to by new_ppa_ptr would not contain the right ppa, 18735 * so copy the full name and update the ppa ptr. 18736 * When the ppa passed in != UINT_MAX, all values are correct; just undo 18737 * the null termination. This saves a bcopy. 18738 */ 18739 if (*new_ppa_ptr == UINT_MAX) { 18740 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 18741 *new_ppa_ptr = ill->ill_ppa; 18742 } else { 18743 /* 18744 * Undo the null termination done above. 18745 */ 18746 ppa_ptr[0] = old_char; 18747 } 18748 18749 /* Let SCTP know about this ILL */ 18750 sctp_update_ill(ill, SCTP_ILL_INSERT); 18751 18752 /* 18753 * ill_glist_insert has made the ill visible globally, and 18754 * ill_phyint_reinit could have changed the ipsq. At this point, 18755 * we need to hold the ips_ill_g_lock across the call to enter the 18756 * ipsq to enforce atomicity and prevent reordering. In the event 18757 * the ipsq has changed, and if the new ipsq is currently busy, 18758 * we need to make sure that this half-completed ioctl is ahead of 18759 * any subsequent ioctl. We achieve this by not dropping the 18760 * ips_ill_g_lock which prevents any ill lookup itself thereby 18761 * ensuring that new ioctls can't start. 18762 */ 18763 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 18764 B_TRUE); 18765 18766 rw_exit(&ipst->ips_ill_g_lock); 18767 ill_refrele(ill); 18768 if (ipsq == NULL) 18769 return (EINPROGRESS); 18770 18771 /* 18772 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 18773 */ 18774 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 18775 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 18776 else 18777 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 18778 18779 error = ipif_set_values_tail(ill, ipif, mp, q); 18780 ipsq_exit(ipsq); 18781 if (error != 0 && error != EINPROGRESS) { 18782 /* 18783 * restore previous values 18784 */ 18785 ill->ill_isv6 = B_FALSE; 18786 } 18787 return (error); 18788 } 18789 18790 void 18791 ipif_init(ip_stack_t *ipst) 18792 { 18793 int i; 18794 18795 for (i = 0; i < MAX_G_HEADS; i++) { 18796 ipst->ips_ill_g_heads[i].ill_g_list_head = 18797 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 18798 ipst->ips_ill_g_heads[i].ill_g_list_tail = 18799 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 18800 } 18801 18802 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 18803 ill_phyint_compare_index, 18804 sizeof (phyint_t), 18805 offsetof(struct phyint, phyint_avl_by_index)); 18806 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18807 ill_phyint_compare_name, 18808 sizeof (phyint_t), 18809 offsetof(struct phyint, phyint_avl_by_name)); 18810 } 18811 18812 /* 18813 * Look up the ipif corresponding to the onlink destination address.
For 18814 * point-to-point interfaces, it matches with remote endpoint destination 18815 * address. For point-to-multipoint interfaces it only tries to match the 18816 * destination with the interface's subnet address. The longest, most specific 18817 * match is found to take care of such rare network configurations like - 18818 * le0: 129.146.1.1/16 18819 * le1: 129.146.2.2/24 18820 * 18821 * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are 18822 * supported on underlying interfaces in an IPMP group, underlying interfaces 18823 * are ignored when looking up a match. (If we didn't ignore them, we'd 18824 * risk using a test address as a source for outgoing traffic.) 18825 */ 18826 ipif_t * 18827 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 18828 { 18829 ipif_t *ipif, *best_ipif; 18830 ill_t *ill; 18831 ill_walk_context_t ctx; 18832 18833 ASSERT(zoneid != ALL_ZONES); 18834 best_ipif = NULL; 18835 18836 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18837 ill = ILL_START_WALK_V4(&ctx, ipst); 18838 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18839 if (IS_UNDER_IPMP(ill)) 18840 continue; 18841 mutex_enter(&ill->ill_lock); 18842 for (ipif = ill->ill_ipif; ipif != NULL; 18843 ipif = ipif->ipif_next) { 18844 if (!IPIF_CAN_LOOKUP(ipif)) 18845 continue; 18846 if (ipif->ipif_zoneid != zoneid && 18847 ipif->ipif_zoneid != ALL_ZONES) 18848 continue; 18849 /* 18850 * Point-to-point case. Look for exact match with 18851 * destination address. 18852 */ 18853 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 18854 if (ipif->ipif_pp_dst_addr == addr) { 18855 ipif_refhold_locked(ipif); 18856 mutex_exit(&ill->ill_lock); 18857 rw_exit(&ipst->ips_ill_g_lock); 18858 if (best_ipif != NULL) 18859 ipif_refrele(best_ipif); 18860 return (ipif); 18861 } 18862 } else if (ipif->ipif_subnet == (addr & 18863 ipif->ipif_net_mask)) { 18864 /* 18865 * Point-to-multipoint case. Looping through to 18866 * find the most specific match. If there are 18867 * multiple best match ipif's then prefer ipif's 18868 * that are UP. If there is only one best match 18869 * ipif and it is DOWN we must still return it. 18870 */ 18871 if ((best_ipif == NULL) || 18872 (ipif->ipif_net_mask > 18873 best_ipif->ipif_net_mask) || 18874 ((ipif->ipif_net_mask == 18875 best_ipif->ipif_net_mask) && 18876 ((ipif->ipif_flags & IPIF_UP) && 18877 (!(best_ipif->ipif_flags & IPIF_UP))))) { 18878 ipif_refhold_locked(ipif); 18879 mutex_exit(&ill->ill_lock); 18880 rw_exit(&ipst->ips_ill_g_lock); 18881 if (best_ipif != NULL) 18882 ipif_refrele(best_ipif); 18883 best_ipif = ipif; 18884 rw_enter(&ipst->ips_ill_g_lock, 18885 RW_READER); 18886 mutex_enter(&ill->ill_lock); 18887 } 18888 } 18889 } 18890 mutex_exit(&ill->ill_lock); 18891 } 18892 rw_exit(&ipst->ips_ill_g_lock); 18893 return (best_ipif); 18894 } 18895 18896 /* 18897 * Save enough information so that we can recreate the IRE if 18898 * the interface goes down and then up. 
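 * The saved state is kept as a chain of ifrt_t-sized mblks on
 * ipif_saved_ire_mp (one per IRE), linked via b_cont and protected by
 * ipif_saved_ire_lock.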
18899 */ 18900 static void 18901 ipif_save_ire(ipif_t *ipif, ire_t *ire) 18902 { 18903 mblk_t *save_mp; 18904 18905 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 18906 if (save_mp != NULL) { 18907 ifrt_t *ifrt; 18908 18909 save_mp->b_wptr += sizeof (ifrt_t); 18910 ifrt = (ifrt_t *)save_mp->b_rptr; 18911 bzero(ifrt, sizeof (ifrt_t)); 18912 ifrt->ifrt_type = ire->ire_type; 18913 ifrt->ifrt_addr = ire->ire_addr; 18914 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 18915 ifrt->ifrt_src_addr = ire->ire_src_addr; 18916 ifrt->ifrt_mask = ire->ire_mask; 18917 ifrt->ifrt_flags = ire->ire_flags; 18918 ifrt->ifrt_max_frag = ire->ire_max_frag; 18919 mutex_enter(&ipif->ipif_saved_ire_lock); 18920 save_mp->b_cont = ipif->ipif_saved_ire_mp; 18921 ipif->ipif_saved_ire_mp = save_mp; 18922 ipif->ipif_saved_ire_cnt++; 18923 mutex_exit(&ipif->ipif_saved_ire_lock); 18924 } 18925 } 18926 18927 static void 18928 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 18929 { 18930 mblk_t **mpp; 18931 mblk_t *mp; 18932 ifrt_t *ifrt; 18933 18934 /* Remove from ipif_saved_ire_mp list if it is there */ 18935 mutex_enter(&ipif->ipif_saved_ire_lock); 18936 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 18937 mpp = &(*mpp)->b_cont) { 18938 /* 18939 * On a given ipif, the triple of address, gateway and 18940 * mask is unique for each saved IRE (in the case of 18941 * ordinary interface routes, the gateway address is 18942 * all-zeroes). 18943 */ 18944 mp = *mpp; 18945 ifrt = (ifrt_t *)mp->b_rptr; 18946 if (ifrt->ifrt_addr == ire->ire_addr && 18947 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 18948 ifrt->ifrt_mask == ire->ire_mask) { 18949 *mpp = mp->b_cont; 18950 ipif->ipif_saved_ire_cnt--; 18951 freeb(mp); 18952 break; 18953 } 18954 } 18955 mutex_exit(&ipif->ipif_saved_ire_lock); 18956 } 18957 18958 /* 18959 * IP multirouting broadcast routes handling 18960 * Append CGTP broadcast IREs to regular ones created 18961 * at ifconfig time. 18962 */ 18963 static void 18964 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) 18965 { 18966 ire_t *ire_prim; 18967 18968 ASSERT(ire != NULL); 18969 ASSERT(ire_dst != NULL); 18970 18971 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 18972 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 18973 if (ire_prim != NULL) { 18974 /* 18975 * We are in the special case of broadcasts for 18976 * CGTP. We add an IRE_BROADCAST that holds 18977 * the RTF_MULTIRT flag, the destination 18978 * address of ire_dst and the low level 18979 * info of ire_prim. In other words, CGTP 18980 * broadcast is added to the redundant ipif. 
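 * (ire_prim supplies the outgoing ipif and queues, ire_dst supplies
 * the source address, and RTF_MULTIRT is inherited through
 * ire->ire_flags.)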
18981 */ 18982 ipif_t *ipif_prim; 18983 ire_t *bcast_ire; 18984 18985 ipif_prim = ire_prim->ire_ipif; 18986 18987 ip2dbg(("ip_cgtp_filter_bcast_add: " 18988 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 18989 (void *)ire_dst, (void *)ire_prim, 18990 (void *)ipif_prim)); 18991 18992 bcast_ire = ire_create( 18993 (uchar_t *)&ire->ire_addr, 18994 (uchar_t *)&ip_g_all_ones, 18995 (uchar_t *)&ire_dst->ire_src_addr, 18996 (uchar_t *)&ire->ire_gateway_addr, 18997 &ipif_prim->ipif_mtu, 18998 NULL, 18999 ipif_prim->ipif_rq, 19000 ipif_prim->ipif_wq, 19001 IRE_BROADCAST, 19002 ipif_prim, 19003 0, 19004 0, 19005 0, 19006 ire->ire_flags, 19007 &ire_uinfo_null, 19008 NULL, 19009 NULL, 19010 ipst); 19011 19012 if (bcast_ire != NULL) { 19013 19014 if (ire_add(&bcast_ire, NULL, NULL, NULL, 19015 B_FALSE) == 0) { 19016 ip2dbg(("ip_cgtp_filter_bcast_add: " 19017 "added bcast_ire %p\n", 19018 (void *)bcast_ire)); 19019 19020 ipif_save_ire(bcast_ire->ire_ipif, 19021 bcast_ire); 19022 ire_refrele(bcast_ire); 19023 } 19024 } 19025 ire_refrele(ire_prim); 19026 } 19027 } 19028 19029 /* 19030 * IP multirouting broadcast routes handling 19031 * Remove the broadcast ire 19032 */ 19033 static void 19034 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 19035 { 19036 ire_t *ire_dst; 19037 19038 ASSERT(ire != NULL); 19039 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 19040 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19041 if (ire_dst != NULL) { 19042 ire_t *ire_prim; 19043 19044 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 19045 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19046 if (ire_prim != NULL) { 19047 ipif_t *ipif_prim; 19048 ire_t *bcast_ire; 19049 19050 ipif_prim = ire_prim->ire_ipif; 19051 19052 ip2dbg(("ip_cgtp_filter_bcast_delete: " 19053 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 19054 (void *)ire_dst, (void *)ire_prim, 19055 (void *)ipif_prim)); 19056 19057 bcast_ire = ire_ctable_lookup(ire->ire_addr, 19058 ire->ire_gateway_addr, 19059 IRE_BROADCAST, 19060 ipif_prim, ALL_ZONES, 19061 NULL, 19062 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 19063 MATCH_IRE_MASK, ipst); 19064 19065 if (bcast_ire != NULL) { 19066 ip2dbg(("ip_cgtp_filter_bcast_delete: " 19067 "looked up bcast_ire %p\n", 19068 (void *)bcast_ire)); 19069 ipif_remove_ire(bcast_ire->ire_ipif, 19070 bcast_ire); 19071 ire_delete(bcast_ire); 19072 ire_refrele(bcast_ire); 19073 } 19074 ire_refrele(ire_prim); 19075 } 19076 ire_refrele(ire_dst); 19077 } 19078 } 19079 19080 /* 19081 * IPsec hardware acceleration capabilities related functions. 19082 */ 19083 19084 /* 19085 * Free a per-ill IPsec capabilities structure. 19086 */ 19087 static void 19088 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 19089 { 19090 if (capab->auth_hw_algs != NULL) 19091 kmem_free(capab->auth_hw_algs, capab->algs_size); 19092 if (capab->encr_hw_algs != NULL) 19093 kmem_free(capab->encr_hw_algs, capab->algs_size); 19094 if (capab->encr_algparm != NULL) 19095 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 19096 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 19097 } 19098 19099 /* 19100 * Allocate a new per-ill IPsec capabilities structure. This structure 19101 * is specific to an IPsec protocol (AH or ESP). It is implemented as 19102 * an array which specifies, for each algorithm, whether this algorithm 19103 * is supported by the ill or not. 
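 * The array is effectively a bitmap: one bit per algorithm identifier,
 * packed into ipsec_capab_elem_t words and tested via
 * IPSEC_ALG_IS_ENABLED().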
19104 */ 19105 static ill_ipsec_capab_t * 19106 ill_ipsec_capab_alloc(void) 19107 { 19108 ill_ipsec_capab_t *capab; 19109 uint_t nelems; 19110 19111 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 19112 if (capab == NULL) 19113 return (NULL); 19114 19115 /* we need one bit per algorithm */ 19116 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 19117 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 19118 19119 /* allocate memory to store algorithm flags */ 19120 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 19121 if (capab->encr_hw_algs == NULL) 19122 goto nomem; 19123 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 19124 if (capab->auth_hw_algs == NULL) 19125 goto nomem; 19126 /* 19127 * Leave encr_algparm NULL for now since we won't need it half 19128 * the time 19129 */ 19130 return (capab); 19131 19132 nomem: 19133 ill_ipsec_capab_free(capab); 19134 return (NULL); 19135 } 19136 19137 /* 19138 * Resize capability array. Since we're exclusive, this is OK. 19139 */ 19140 static boolean_t 19141 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 19142 { 19143 ipsec_capab_algparm_t *nalp, *oalp; 19144 uint32_t olen, nlen; 19145 19146 oalp = capab->encr_algparm; 19147 olen = capab->encr_algparm_size; 19148 19149 if (oalp != NULL) { 19150 if (algid < capab->encr_algparm_end) 19151 return (B_TRUE); 19152 } 19153 19154 nlen = (algid + 1) * sizeof (*nalp); 19155 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 19156 if (nalp == NULL) 19157 return (B_FALSE); 19158 19159 if (oalp != NULL) { 19160 bcopy(oalp, nalp, olen); 19161 kmem_free(oalp, olen); 19162 } 19163 capab->encr_algparm = nalp; 19164 capab->encr_algparm_size = nlen; 19165 capab->encr_algparm_end = algid + 1; 19166 19167 return (B_TRUE); 19168 } 19169 19170 /* 19171 * Compare the capabilities of the specified ill with the protocol 19172 * and algorithms specified by the SA passed as argument. 19173 * Returns B_TRUE if they match, B_FALSE if they do not. 19174 * 19175 * The ill can be passed as a pointer to it, or by specifying its index 19176 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 19177 * 19178 * Called by ipsec_out_is_accelerated() to decide whether an outbound 19179 * packet is eligible for hardware acceleration, and by 19180 * ill_ipsec_capab_send_all() to decide whether an SA must be sent down 19181 * to a particular ill. 19182 */ 19183 boolean_t 19184 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 19185 ipsa_t *sa, netstack_t *ns) 19186 { 19187 boolean_t sa_isv6; 19188 uint_t algid; 19189 struct ill_ipsec_capab_s *cpp; 19190 boolean_t need_refrele = B_FALSE; 19191 ip_stack_t *ipst = ns->netstack_ip; 19192 19193 if (ill == NULL) { 19194 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 19195 NULL, NULL, NULL, ipst); 19196 if (ill == NULL) { 19197 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 19198 return (B_FALSE); 19199 } 19200 need_refrele = B_TRUE; 19201 } 19202 19203 /* 19204 * Use the address length specified by the SA to determine 19205 * if it corresponds to an IPv6 address, and fail the matching 19206 * if the isv6 flag passed as argument does not match. 19207 * Note: this check is used for SADB capability checking before 19208 * sending SA information to an ill.
19209 */ 19210 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 19211 if (sa_isv6 != ill_isv6) 19212 /* protocol mismatch */ 19213 goto done; 19214 19215 /* 19216 * Check if the ill supports the protocol, algorithm(s) and 19217 * key size(s) specified by the SA, and get the pointers to 19218 * the algorithms supported by the ill. 19219 */ 19220 switch (sa->ipsa_type) { 19221 19222 case SADB_SATYPE_ESP: 19223 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 19224 /* ill does not support ESP acceleration */ 19225 goto done; 19226 cpp = ill->ill_ipsec_capab_esp; 19227 algid = sa->ipsa_auth_alg; 19228 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 19229 goto done; 19230 algid = sa->ipsa_encr_alg; 19231 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 19232 goto done; 19233 if (algid < cpp->encr_algparm_end) { 19234 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 19235 if (sa->ipsa_encrkeybits < alp->minkeylen) 19236 goto done; 19237 if (sa->ipsa_encrkeybits > alp->maxkeylen) 19238 goto done; 19239 } 19240 break; 19241 19242 case SADB_SATYPE_AH: 19243 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 19244 /* ill does not support AH acceleration */ 19245 goto done; 19246 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 19247 ill->ill_ipsec_capab_ah->auth_hw_algs)) 19248 goto done; 19249 break; 19250 } 19251 19252 if (need_refrele) 19253 ill_refrele(ill); 19254 return (B_TRUE); 19255 done: 19256 if (need_refrele) 19257 ill_refrele(ill); 19258 return (B_FALSE); 19259 } 19260 19261 /* 19262 * Add a new ill to the list of IPsec capable ills. 19263 * Called from ill_capability_ipsec_ack() when an ACK was received 19264 * indicating that IPsec hardware processing was enabled for an ill. 19265 * 19266 * ill must point to the ill for which acceleration was enabled. 19267 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 19268 */ 19269 static void 19270 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 19271 { 19272 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 19273 uint_t sa_type; 19274 uint_t ipproto; 19275 ip_stack_t *ipst = ill->ill_ipst; 19276 19277 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 19278 (dl_cap == DL_CAPAB_IPSEC_ESP)); 19279 19280 switch (dl_cap) { 19281 case DL_CAPAB_IPSEC_AH: 19282 sa_type = SADB_SATYPE_AH; 19283 ills = &ipst->ips_ipsec_capab_ills_ah; 19284 ipproto = IPPROTO_AH; 19285 break; 19286 case DL_CAPAB_IPSEC_ESP: 19287 sa_type = SADB_SATYPE_ESP; 19288 ills = &ipst->ips_ipsec_capab_ills_esp; 19289 ipproto = IPPROTO_ESP; 19290 break; 19291 } 19292 19293 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 19294 19295 /* 19296 * Add ill index to list of hardware accelerators. If 19297 * already in list, do nothing. 
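 * Entries are keyed by (ifindex, isv6), so the IPv4 and IPv6 ills of
 * the same phyint are tracked separately.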
19298 */ 19299 for (cur_ill = *ills; cur_ill != NULL && 19300 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 19301 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 19302 ; 19303 19304 if (cur_ill == NULL) { 19305 /* if this is a new entry for this ill */ 19306 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 19307 if (new_ill == NULL) { 19308 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19309 return; 19310 } 19311 19312 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 19313 new_ill->ill_isv6 = ill->ill_isv6; 19314 new_ill->next = *ills; 19315 *ills = new_ill; 19316 } else if (!sadb_resync) { 19317 /* not resync'ing SADB and an entry exists for this ill */ 19318 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19319 return; 19320 } 19321 19322 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19323 19324 if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 19325 /* 19326 * IPsec module for protocol loaded, initiate dump 19327 * of the SADB to this ill. 19328 */ 19329 sadb_ill_download(ill, sa_type); 19330 } 19331 19332 /* 19333 * Remove an ill from the list of IPsec capable ills. 19334 */ 19335 static void 19336 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 19337 { 19338 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 19339 ip_stack_t *ipst = ill->ill_ipst; 19340 19341 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 19342 dl_cap == DL_CAPAB_IPSEC_ESP); 19343 19344 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah : 19345 &ipst->ips_ipsec_capab_ills_esp; 19346 19347 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 19348 19349 prev_ill = NULL; 19350 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 19351 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 19352 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 19353 ; 19354 if (cur_ill == NULL) { 19355 /* entry not found */ 19356 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19357 return; 19358 } 19359 if (prev_ill == NULL) { 19360 /* entry at front of list; unlink it, keeping any successors */ 19361 *ills = cur_ill->next; 19362 } else { 19363 prev_ill->next = cur_ill->next; 19364 } 19365 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 19366 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19367 } 19368 19369 /* 19370 * Called by SADB to send a DL_CONTROL_REQ message to every ill 19371 * supporting the specified IPsec protocol acceleration. 19372 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 19373 * We free the mblk and, if sa is non-null, release the held reference. 19374 */ 19375 void 19376 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, 19377 netstack_t *ns) 19378 { 19379 ipsec_capab_ill_t *ici, *cur_ici; 19380 ill_t *ill; 19381 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 19382 ip_stack_t *ipst = ns->netstack_ip; 19383 19384 ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah : 19385 ipst->ips_ipsec_capab_ills_esp; 19386 19387 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER); 19388 19389 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 19390 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 19391 cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst); 19392 19393 /* 19394 * Handle the case where the ill goes away while the SADB is 19395 * attempting to send messages. If it's going away, it's 19396 * nuking its shadow SADB, so we don't care. 19397 */ 19398 19399 if (ill == NULL) 19400 continue; 19401 19402 if (sa != NULL) { 19403 /* 19404 * Make sure capabilities match before 19405 * sending SA to ill.
19406 */ 19407 if (!ipsec_capab_match(ill, cur_ici->ill_index, 19408 cur_ici->ill_isv6, sa, ipst->ips_netstack)) { 19409 ill_refrele(ill); 19410 continue; 19411 } 19412 19413 mutex_enter(&sa->ipsa_lock); 19414 sa->ipsa_flags |= IPSA_F_HW; 19415 mutex_exit(&sa->ipsa_lock); 19416 } 19417 19418 /* 19419 * Copy template message, and add it to the front 19420 * of the mblk ship list. We want to avoid holding 19421 * the ipsec_capab_ills_lock while sending the 19422 * message to the ills. 19423 * 19424 * The b_next and b_prev are temporarily used 19425 * to build a list of mblks to be sent down, and to 19426 * save the ill to which they must be sent. 19427 */ 19428 nmp = copymsg(mp); 19429 if (nmp == NULL) { 19430 ill_refrele(ill); 19431 continue; 19432 } 19433 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 19434 nmp->b_next = mp_ship_list; 19435 mp_ship_list = nmp; 19436 nmp->b_prev = (mblk_t *)ill; 19437 } 19438 19439 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19440 19441 for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) { 19442 /* restore the mblk to a sane state */ 19443 next_mp = nmp->b_next; 19444 nmp->b_next = NULL; 19445 ill = (ill_t *)nmp->b_prev; 19446 nmp->b_prev = NULL; 19447 19448 ill_dlpi_send(ill, nmp); 19449 ill_refrele(ill); 19450 } 19451 19452 if (sa != NULL) 19453 IPSA_REFRELE(sa); 19454 freemsg(mp); 19455 } 19456 19457 /* 19458 * Derive an interface id from the link layer address. 19459 * Knows about IEEE 802 and IEEE EUI-64 mappings. 19460 */ 19461 static void 19462 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19463 { 19464 char *addr; 19465 19466 /* 19467 * Note that some IPv6 interfaces get plumbed over links that claim to 19468 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 19469 * PPP links). The ETHERADDRL check here ensures that we only set the 19470 * interface ID on IPv6 interfaces above links that actually have real 19471 * Ethernet addresses. 19472 */ 19473 if (ill->ill_phys_addr_length == ETHERADDRL) { 19474 /* Form EUI-64 like address */ 19475 addr = (char *)&v6addr->s6_addr32[2]; 19476 bcopy(ill->ill_phys_addr, addr, 3); 19477 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 19478 addr[3] = (char)0xff; 19479 addr[4] = (char)0xfe; 19480 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 19481 } 19482 } 19483 19484 /* ARGSUSED */ 19485 static void 19486 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19487 { 19488 } 19489 19490 typedef struct ipmp_ifcookie { 19491 uint32_t ic_hostid; 19492 char ic_ifname[LIFNAMSIZ]; 19493 char ic_zonename[ZONENAME_MAX]; 19494 } ipmp_ifcookie_t; 19495 19496 /* 19497 * Construct a pseudo-random interface ID for the IPMP interface that's both 19498 * predictable and (almost) guaranteed to be unique. 19499 */ 19500 static void 19501 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19502 { 19503 zone_t *zp; 19504 uint8_t *addr; 19505 uchar_t hash[16]; 19506 ulong_t hostid; 19507 MD5_CTX ctx; 19508 ipmp_ifcookie_t ic = { 0 }; 19509 19510 ASSERT(IS_IPMP(ill)); 19511 19512 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 19513 ic.ic_hostid = htonl((uint32_t)hostid); 19514 19515 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 19516 19517 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 19518 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 19519 zone_rele(zp); 19520 } 19521 19522 MD5Init(&ctx); 19523 MD5Update(&ctx, &ic, sizeof (ic)); 19524 MD5Final(hash, &ctx); 19525 19526 /* 19527 * Map the hash to an interface ID per the basic approach in RFC3041. 
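 * The low eight bytes of the MD5 digest become the interface ID; the
 * universal/local bit is then cleared to mark the ID as locally
 * administered.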
19528 */ 19529 addr = &v6addr->s6_addr8[8]; 19530 bcopy(hash + 8, addr, sizeof (uint64_t)); 19531 addr[0] &= ~0x2; /* set local bit */ 19532 } 19533 19534 /* ARGSUSED */ 19535 static boolean_t 19536 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19537 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19538 { 19539 /* 19540 * Multicast address mappings used over Ethernet/802.X. 19541 * This address is used as a base for mappings. 19542 */ 19543 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 19544 0x00, 0x00, 0x00}; 19545 19546 /* 19547 * Extract low order 32 bits from IPv6 multicast address. 19548 * Or that into the link layer address, starting from the 19549 * second byte. 19550 */ 19551 *hw_start = 2; 19552 v6_extract_mask->s6_addr32[0] = 0; 19553 v6_extract_mask->s6_addr32[1] = 0; 19554 v6_extract_mask->s6_addr32[2] = 0; 19555 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 19556 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 19557 return (B_TRUE); 19558 } 19559 19560 /* 19561 * Indicate by return value whether multicast is supported. If not, 19562 * this code should not touch/change any parameters. 19563 */ 19564 /* ARGSUSED */ 19565 static boolean_t 19566 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19567 uint32_t *hw_start, ipaddr_t *extract_mask) 19568 { 19569 /* 19570 * Multicast address mappings used over Ethernet/802.X. 19571 * This address is used as a base for mappings. 19572 */ 19573 static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, 19574 0x00, 0x00, 0x00 }; 19575 19576 if (phys_length != ETHERADDRL) 19577 return (B_FALSE); 19578 19579 *extract_mask = htonl(0x007fffff); 19580 *hw_start = 2; 19581 bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); 19582 return (B_TRUE); 19583 } 19584 19585 /* ARGSUSED */ 19586 static boolean_t 19587 ip_nodef_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19588 uint32_t *hw_start, ipaddr_t *extract_mask) 19589 { 19590 return (B_FALSE); 19591 } 19592 19593 /* ARGSUSED */ 19594 static boolean_t 19595 ip_nodef_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19596 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19597 { 19598 return (B_FALSE); 19599 } 19600 19601 /* 19602 * Derive IPoIB interface id from the link layer address. 19603 */ 19604 static void 19605 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19606 { 19607 char *addr; 19608 19609 ASSERT(ill->ill_phys_addr_length == 20); 19610 addr = (char *)&v6addr->s6_addr32[2]; 19611 bcopy(ill->ill_phys_addr + 12, addr, 8); 19612 /* 19613 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 19614 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 19615 * rules. In these cases, the IBA considers these GUIDs to be in 19616 * "Modified EUI-64" format, and thus toggling the u/l bit is not 19617 * required; vendors are required not to assign global EUI-64's 19618 * that differ only in u/l bit values, thus guaranteeing uniqueness 19619 * of the interface identifier. Whether the GUID is in modified 19620 * or proper EUI-64 format, the ipv6 identifier must have the u/l 19621 * bit set to 1. 19622 */ 19623 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 19624 } 19625 19626 /* 19627 * Note on mapping from multicast IP addresses to IPoIB multicast link 19628 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 19629 * The format of an IPoIB multicast address is: 19630 * 19631 * 4 byte QPN Scope Sign. 
Pkey 19632 * +--------------------------------------------+ 19633 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 19634 * +--------------------------------------------+ 19635 * 19636 * The Scope and Pkey components are properties of the IBA port and 19637 * network interface. They can be ascertained from the broadcast address. 19638 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 19639 */ 19640 19641 static boolean_t 19642 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19643 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19644 { 19645 /* 19646 * Base IPoIB IPv6 multicast address used for mappings. 19647 * Does not contain the IBA scope/Pkey values. 19648 */ 19649 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 19650 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 19651 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 19652 19653 /* 19654 * Extract low order 80 bits from IPv6 multicast address. 19655 * Or that into the link layer address, starting from the 19656 * sixth byte. 19657 */ 19658 *hw_start = 6; 19659 bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); 19660 19661 /* 19662 * Now fill in the IBA scope/Pkey values from the broadcast address. 19663 */ 19664 *(maddr + 5) = *(bphys_addr + 5); 19665 *(maddr + 8) = *(bphys_addr + 8); 19666 *(maddr + 9) = *(bphys_addr + 9); 19667 19668 v6_extract_mask->s6_addr32[0] = 0; 19669 v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); 19670 v6_extract_mask->s6_addr32[2] = 0xffffffffU; 19671 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 19672 return (B_TRUE); 19673 } 19674 19675 static boolean_t 19676 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19677 uint32_t *hw_start, ipaddr_t *extract_mask) 19678 { 19679 /* 19680 * Base IPoIB IPv4 multicast address used for mappings. 19681 * Does not contain the IBA scope/Pkey values. 19682 */ 19683 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 19684 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 19685 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 19686 19687 if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) 19688 return (B_FALSE); 19689 19690 /* 19691 * Extract low order 28 bits from IPv4 multicast address. 19692 * Or that into the link layer address, starting from the 19693 * sixteenth byte. 19694 */ 19695 *extract_mask = htonl(0x0fffffff); 19696 *hw_start = 16; 19697 bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); 19698 19699 /* 19700 * Now fill in the IBA scope/Pkey values from the broadcast address. 19701 */ 19702 *(maddr + 5) = *(bphys_addr + 5); 19703 *(maddr + 8) = *(bphys_addr + 8); 19704 *(maddr + 9) = *(bphys_addr + 9); 19705 return (B_TRUE); 19706 } 19707 19708 /* 19709 * Derive the IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4 19710 * tunnel). The IPv4 address simply gets placed in the lower 4 bytes of the 19711 * IPv6 interface id. This is a suggested mechanism described in section 3.7 19712 * of RFC4213. 19713 */ 19714 static void 19715 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 19716 { 19717 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t)); 19718 v6addr->s6_addr32[2] = 0; 19719 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t)); 19720 } 19721 19722 /* 19723 * Derive the IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6 19724 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface 19725 * id.
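 * (i.e. s6_addr32[2] and s6_addr32[3] of the link-layer address are
 * copied through unchanged.)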
19726 */ 19727 static void 19728 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 19729 { 19730 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr; 19731 19732 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t)); 19733 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8); 19734 } 19735 19736 static void 19737 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19738 { 19739 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr); 19740 } 19741 19742 static void 19743 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 19744 { 19745 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr); 19746 } 19747 19748 static void 19749 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19750 { 19751 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr); 19752 } 19753 19754 static void 19755 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 19756 { 19757 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr); 19758 } 19759 19760 /* 19761 * Returns B_TRUE if an ipif is present in the given zone, matching some flags 19762 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. 19763 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with 19764 * the link-local address is preferred. 19765 */ 19766 boolean_t 19767 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 19768 { 19769 ipif_t *ipif; 19770 ipif_t *maybe_ipif = NULL; 19771 19772 mutex_enter(&ill->ill_lock); 19773 if (ill->ill_state_flags & ILL_CONDEMNED) { 19774 mutex_exit(&ill->ill_lock); 19775 if (ipifp != NULL) 19776 *ipifp = NULL; 19777 return (B_FALSE); 19778 } 19779 19780 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19781 if (!IPIF_CAN_LOOKUP(ipif)) 19782 continue; 19783 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 19784 ipif->ipif_zoneid != ALL_ZONES) 19785 continue; 19786 if ((ipif->ipif_flags & flags) != flags) 19787 continue; 19788 19789 if (ipifp == NULL) { 19790 mutex_exit(&ill->ill_lock); 19791 ASSERT(maybe_ipif == NULL); 19792 return (B_TRUE); 19793 } 19794 if (!ill->ill_isv6 || 19795 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { 19796 ipif_refhold_locked(ipif); 19797 mutex_exit(&ill->ill_lock); 19798 *ipifp = ipif; 19799 return (B_TRUE); 19800 } 19801 if (maybe_ipif == NULL) 19802 maybe_ipif = ipif; 19803 } 19804 if (ipifp != NULL) { 19805 if (maybe_ipif != NULL) 19806 ipif_refhold_locked(maybe_ipif); 19807 *ipifp = maybe_ipif; 19808 } 19809 mutex_exit(&ill->ill_lock); 19810 return (maybe_ipif != NULL); 19811 } 19812 19813 /* 19814 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 19815 * If a pointer to an ipif_t is returned then the caller will need to do 19816 * an ill_refrele(). 
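 * (Strictly, the hold handed back is on the ipif: the ill looked up
 * internally is released before returning, so the caller drops the
 * reference with ipif_refrele().)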
19817 */ 19818 ipif_t * 19819 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, 19820 ip_stack_t *ipst) 19821 { 19822 ipif_t *ipif; 19823 ill_t *ill; 19824 19825 ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, 19826 ipst); 19827 if (ill == NULL) 19828 return (NULL); 19829 19830 mutex_enter(&ill->ill_lock); 19831 if (ill->ill_state_flags & ILL_CONDEMNED) { 19832 mutex_exit(&ill->ill_lock); 19833 ill_refrele(ill); 19834 return (NULL); 19835 } 19836 19837 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19838 if (!IPIF_CAN_LOOKUP(ipif)) 19839 continue; 19840 if (lifidx == ipif->ipif_id) { 19841 ipif_refhold_locked(ipif); 19842 break; 19843 } 19844 } 19845 19846 mutex_exit(&ill->ill_lock); 19847 ill_refrele(ill); 19848 return (ipif); 19849 } 19850 19851 /* 19852 * Flush the fastpath by deleting any nce's that are waiting for the fastpath. 19853 * There is one exception: IRE_BROADCAST entries are difficult to recreate, 19854 * so instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush() 19855 * for details. 19856 */ 19857 void 19858 ill_fastpath_flush(ill_t *ill) 19859 { 19860 ip_stack_t *ipst = ill->ill_ipst; 19861 19862 nce_fastpath_list_dispatch(ill, NULL, NULL); 19863 ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4), 19864 ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE); 19865 } 19866 19867 /* 19868 * Set the physical address information for `ill' to the contents of the 19869 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 19870 * asynchronous if `ill' cannot immediately be quiesced -- in which case 19871 * EINPROGRESS will be returned. 19872 */ 19873 int 19874 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 19875 { 19876 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19877 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 19878 19879 ASSERT(IAM_WRITER_IPSQ(ipsq)); 19880 19881 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 19882 dlindp->dl_data != DL_CURR_DEST_ADDR && 19883 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 19884 /* Changing DL_IPV6_TOKEN is not yet supported */ 19885 return (0); 19886 } 19887 19888 /* 19889 * We need to store up to two copies of `mp' in `ill'. Due to the 19890 * design of ipsq_pending_mp_add(), we can't pass them as separate 19891 * arguments to ill_set_phys_addr_tail(). Instead, chain them 19892 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 19893 */ 19894 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 19895 freemsg(mp); 19896 return (ENOMEM); 19897 } 19898 19899 ipsq_current_start(ipsq, ill->ill_ipif, 0); 19900 19901 /* 19902 * If we can quiesce the ill, then set the address. If not, then 19903 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 19904 */ 19905 ill_down_ipifs(ill, B_TRUE); 19906 mutex_enter(&ill->ill_lock); 19907 if (!ill_is_quiescent(ill)) { 19908 /* call cannot fail since `conn_t *' argument is NULL */ 19909 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 19910 mp, ILL_DOWN); 19911 mutex_exit(&ill->ill_lock); 19912 return (EINPROGRESS); 19913 } 19914 mutex_exit(&ill->ill_lock); 19915 19916 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 19917 return (0); 19918 } 19919 19920 /* 19921 * Once the ill associated with `q' has quiesced, set its physical address 19922 * information to the values in `addrmp'.
Note that two copies of `addrmp' 19923 * are passed (linked by b_cont), since we sometimes need to save two distinct 19924 * copies in the ill_t, and our context doesn't permit sleeping or allocation 19925 * failure (we'll free the other copy if it's not needed). Since the ill_t 19926 * is quiesced, we know any stale IREs with the old address information have 19927 * already been removed, so we don't need to call ill_fastpath_flush(). 19928 */ 19929 /* ARGSUSED */ 19930 static void 19931 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 19932 { 19933 ill_t *ill = q->q_ptr; 19934 mblk_t *addrmp2 = unlinkb(addrmp); 19935 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 19936 uint_t addrlen, addroff; 19937 19938 ASSERT(IAM_WRITER_IPSQ(ipsq)); 19939 19940 addroff = dlindp->dl_addr_offset; 19941 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 19942 19943 switch (dlindp->dl_data) { 19944 case DL_IPV6_LINK_LAYER_ADDR: 19945 ill_set_ndmp(ill, addrmp, addroff, addrlen); 19946 freemsg(addrmp2); 19947 break; 19948 19949 case DL_CURR_DEST_ADDR: 19950 freemsg(ill->ill_dest_addr_mp); 19951 ill->ill_dest_addr = addrmp->b_rptr + addroff; 19952 ill->ill_dest_addr_mp = addrmp; 19953 if (ill->ill_isv6) { 19954 ill_setdesttoken(ill); 19955 ipif_setdestlinklocal(ill->ill_ipif); 19956 } 19957 freemsg(addrmp2); 19958 break; 19959 19960 case DL_CURR_PHYS_ADDR: 19961 freemsg(ill->ill_phys_addr_mp); 19962 ill->ill_phys_addr = addrmp->b_rptr + addroff; 19963 ill->ill_phys_addr_mp = addrmp; 19964 ill->ill_phys_addr_length = addrlen; 19965 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 19966 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 19967 else 19968 freemsg(addrmp2); 19969 if (ill->ill_isv6) { 19970 ill_setdefaulttoken(ill); 19971 ipif_setlinklocal(ill->ill_ipif); 19972 } 19973 break; 19974 default: 19975 ASSERT(0); 19976 } 19977 19978 /* 19979 * If there are ipifs to bring up, ill_up_ipifs() will return 19980 * EINPROGRESS, and ipsq_current_finish() will be called by 19981 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is 19982 * brought up. 19983 */ 19984 if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS) 19985 ipsq_current_finish(ipsq); 19986 } 19987 19988 /* 19989 * Helper routine for setting the ill_nd_lla fields. 19990 */ 19991 void 19992 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 19993 { 19994 freemsg(ill->ill_nd_lla_mp); 19995 ill->ill_nd_lla = ndmp->b_rptr + addroff; 19996 ill->ill_nd_lla_mp = ndmp; 19997 ill->ill_nd_lla_len = addrlen; 19998 } 19999 20000 /* 20001 * Replumb the ill. 20002 */ 20003 int 20004 ill_replumb(ill_t *ill, mblk_t *mp) 20005 { 20006 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 20007 20008 ASSERT(IAM_WRITER_IPSQ(ipsq)); 20009 20010 ipsq_current_start(ipsq, ill->ill_ipif, 0); 20011 20012 /* 20013 * If we can quiesce the ill, then continue. If not, then 20014 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
20015 */ 20016 ill_down_ipifs(ill, B_FALSE); 20017 20018 mutex_enter(&ill->ill_lock); 20019 if (!ill_is_quiescent(ill)) { 20020 /* call cannot fail since `conn_t *' argument is NULL */ 20021 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 20022 mp, ILL_DOWN); 20023 mutex_exit(&ill->ill_lock); 20024 return (EINPROGRESS); 20025 } 20026 mutex_exit(&ill->ill_lock); 20027 20028 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 20029 return (0); 20030 } 20031 20032 /* ARGSUSED */ 20033 static void 20034 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 20035 { 20036 ill_t *ill = q->q_ptr; 20037 20038 ASSERT(IAM_WRITER_IPSQ(ipsq)); 20039 20040 ill_down_ipifs_tail(ill); 20041 20042 freemsg(ill->ill_replumb_mp); 20043 ill->ill_replumb_mp = copyb(mp); 20044 20045 /* 20046 * Successfully quiesced and brought down the interface, now we send 20047 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 20048 * DL_NOTE_REPLUMB message. 20049 */ 20050 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 20051 DL_NOTIFY_CONF); 20052 ASSERT(mp != NULL); 20053 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 20054 DL_NOTE_REPLUMB_DONE; 20055 ill_dlpi_send(ill, mp); 20056 20057 /* 20058 * If there are ipifs to bring up, ill_up_ipifs() will return 20059 * EINPROGRESS, and ipsq_current_finish() will be called by 20060 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is 20061 * brought up. 20062 */ 20063 if (ill->ill_replumb_mp == NULL || 20064 ill_up_ipifs(ill, q, ill->ill_replumb_mp) != EINPROGRESS) { 20065 ipsq_current_finish(ipsq); 20066 } 20067 } 20068 20069 /* 20070 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 20071 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 20072 * as per the ioctl. On failure, an errno is returned. 20073 */ 20074 static int 20075 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 20076 { 20077 int rval; 20078 struct strioctl iocb; 20079 20080 iocb.ic_cmd = cmd; 20081 iocb.ic_timout = 15; 20082 iocb.ic_len = bufsize; 20083 iocb.ic_dp = buf; 20084 20085 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 20086 } 20087 20088 /* 20089 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 20090 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 20091 */ 20092 static int 20093 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 20094 uint_t *bufsizep, cred_t *cr) 20095 { 20096 int err; 20097 struct lifnum lifn; 20098 20099 bzero(&lifn, sizeof (lifn)); 20100 lifn.lifn_family = af; 20101 lifn.lifn_flags = LIFC_UNDER_IPMP; 20102 20103 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 20104 return (err); 20105 20106 /* 20107 * Pad the interface count to account for additional interfaces that 20108 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 20109 */ 20110 lifn.lifn_count += 4; 20111 bzero(lifcp, sizeof (*lifcp)); 20112 lifcp->lifc_flags = LIFC_UNDER_IPMP; 20113 lifcp->lifc_family = af; 20114 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 20115 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 20116 20117 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 20118 if (err != 0) { 20119 kmem_free(lifcp->lifc_buf, *bufsizep); 20120 return (err); 20121 } 20122 20123 return (0); 20124 } 20125 20126 /* 20127 * Helper for ip_interface_cleanup() that removes the loopback interface. 
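 * Failure is logged but otherwise ignored, since the surrounding
 * cleanup must proceed regardless.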
 */
static void
ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
{
	int err;
	struct lifreq lifr;

	bzero(&lifr, sizeof (lifr));
	(void) strcpy(lifr.lifr_name, ipif_loopback_name);

	err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
	if (err != 0) {
		ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
		    "error %d\n", isv6 ? "v6" : "v4", err));
	}
}

/*
 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
 * groups and that IPMP data addresses are down.  These conditions must be met
 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmod().
 */
static void
ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
{
	int af = isv6 ? AF_INET6 : AF_INET;
	int i, nifs;
	int err;
	uint_t bufsize;
	uint_t lifrsize = sizeof (struct lifreq);
	struct lifconf lifc;
	struct lifreq *lifrp;

	if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
		cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
		    "(error %d); any IPMP interfaces cannot be shut down",
		    err);
		return;
	}

	nifs = lifc.lifc_len / lifrsize;
	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
			    "flags: error %d", lifrp->lifr_name, err);
			continue;
		}

		if (lifrp->lifr_flags & IFF_IPMP) {
			if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
				continue;

			lifrp->lifr_flags &= ~IFF_UP;
			err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
			if (err != 0) {
				cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
				    "bring down (error %d); IPMP interface "
				    "may not be shut down", lifrp->lifr_name,
				    err);
			}

			/*
			 * Check if IFF_DUPLICATE is still set -- and if so,
			 * reset the address to clear it.
			 */
			err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
			if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
				continue;

			err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
			if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
			    lifrp, lifrsize, cr)) != 0) {
				cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
				    "reset DAD (error %d); IPMP interface "
				    "may not be shut down", lifrp->lifr_name,
				    err);
			}
			continue;
		}

		lifrp->lifr_groupname[0] = '\0';
		err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, lifrsize, cr);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot leave "
			    "IPMP group (error %d); associated IPMP interface "
			    "may not be shut down", lifrp->lifr_name, err);
			continue;
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
}

#define	UDPDEV		"/devices/pseudo/udp@0:udp"
#define	UDP6DEV		"/devices/pseudo/udp6@0:udp6"

/*
 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
 * when the user-level processes in the zone are killed and the latter are
 * cleaned up by str_stack_shutdown().
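 * The loopback interfaces fall into neither category -- they have no lower
 * stream to unlink -- so they are removed explicitly via SIOCLIFREMOVEIF.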
 */
void
ip_interface_cleanup(ip_stack_t *ipst)
{
	ldi_handle_t	lh;
	ldi_ident_t	li;
	cred_t		*cr;
	int		err;
	int		i;
	char		*devs[] = { UDP6DEV, UDPDEV };
	netstackid_t	stackid = ipst->ips_netstack->netstack_stackid;

	if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
		cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
		    " error %d", err);
		return;
	}

	cr = zone_get_kcred(netstackid_to_zoneid(stackid));
	ASSERT(cr != NULL);

	/*
	 * NOTE: loop executes exactly twice and is hardcoded to know that the
	 * first iteration is IPv6.  (Unrolling yields repetitious code, hence
	 * the loop.)
	 */
	for (i = 0; i < 2; i++) {
		err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
			    " error %d", devs[i], err);
			continue;
		}

		ip_loopback_removeif(lh, i == 0, cr);
		ip_ipmp_cleanup(lh, i == 0, cr);

		(void) ldi_close(lh, FREAD|FWRITE, cr);
	}

	ldi_ident_release(li);
	crfree(cr);
}

/*
 * This needs to be kept in sync with the nic_event_t definition.
 */
static const char *
ill_hook_event2str(nic_event_t event)
{
	switch (event) {
	case NE_PLUMB:
		return ("PLUMB");
	case NE_UNPLUMB:
		return ("UNPLUMB");
	case NE_UP:
		return ("UP");
	case NE_DOWN:
		return ("DOWN");
	case NE_ADDRESS_CHANGE:
		return ("ADDRESS_CHANGE");
	case NE_LIF_UP:
		return ("LIF_UP");
	case NE_LIF_DOWN:
		return ("LIF_DOWN");
	case NE_IFINDEX_CHANGE:
		return ("IFINDEX_CHANGE");
	default:
		return ("UNKNOWN");
	}
}

void
ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
    nic_event_data_t data, size_t datalen)
{
	ip_stack_t		*ipst = ill->ill_ipst;
	hook_nic_event_int_t	*info;
	const char		*str = NULL;

	/* create a new nic event info */
	if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
		goto fail;

	info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
	info->hnei_event.hne_lif = lif;
	info->hnei_event.hne_event = event;
	info->hnei_event.hne_protocol = ill->ill_isv6 ?
	    ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
	info->hnei_event.hne_data = NULL;
	info->hnei_event.hne_datalen = 0;
	info->hnei_stackid = ipst->ips_netstack->netstack_stackid;

	if (data != NULL && datalen != 0) {
		info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
		if (info->hnei_event.hne_data == NULL)
			goto fail;
		bcopy(data, info->hnei_event.hne_data, datalen);
		info->hnei_event.hne_datalen = datalen;
	}

	if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
	    DDI_NOSLEEP) == DDI_SUCCESS)
		return;

fail:
	if (info != NULL) {
		if (info->hnei_event.hne_data != NULL) {
			kmem_free(info->hnei_event.hne_data,
			    info->hnei_event.hne_datalen);
		}
		kmem_free(info, sizeof (hook_nic_event_int_t));
	}
	str = ill_hook_event2str(event);
	ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
	    "information for %s (ENOMEM)\n", str, ill->ill_name));
}

void
ipif_up_notify(ipif_t *ipif)
{
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
	sctp_update_ipif(ipif, SCTP_IPIF_UP);
	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
	    NE_LIF_UP, NULL, 0);
}

/*
 * Some ILB ioctls cv_wait() (e.g. when deleting a rule or adding a server),
 * so this code assumes the calling context is cv_wait'able.  Hence it
 * shouldn't be used on TPI endpoints with STREAMS modules pushed above,
 * which is assured by not setting the IPI_MODOK flag for the ioctl.  IP
 * also ensures the ILB ioctl never ends up on an ipsq; otherwise we might
 * process the ioctl while unwinding from the ipsq, possibly on a thread
 * from the bottom.
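 *
 * Note that each command below first checks that the mblk payload is exactly
 * (or, for the variable-length commands, at least) the size implied by the
 * command structure before touching it.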
 */
/* ARGSUSED */
int
ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *arg)
{
	mblk_t *cmd_mp = mp->b_cont->b_cont;
	ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
	int ret = 0;
	int i;
	size_t size;
	ip_stack_t *ipst;
	zoneid_t zoneid;
	ilb_stack_t *ilbs;

	ipst = CONNQ_TO_IPST(q);
	ilbs = ipst->ips_netstack->netstack_ilb;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	switch (command) {
	case ILB_CREATE_RULE: {
		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
			ret = EINVAL;
			break;
		}

		ret = ilb_rule_add(ilbs, zoneid, cmd);
		break;
	}
	case ILB_DESTROY_RULE:
	case ILB_ENABLE_RULE:
	case ILB_DISABLE_RULE: {
		ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
			ret = EINVAL;
			break;
		}

		if (cmd->flags & ILB_RULE_ALLRULES) {
			if (command == ILB_DESTROY_RULE) {
				ilb_rule_del_all(ilbs, zoneid);
				break;
			} else if (command == ILB_ENABLE_RULE) {
				ilb_rule_enable_all(ilbs, zoneid);
				break;
			} else if (command == ILB_DISABLE_RULE) {
				ilb_rule_disable_all(ilbs, zoneid);
				break;
			}
		} else {
			if (command == ILB_DESTROY_RULE) {
				ret = ilb_rule_del(ilbs, zoneid, cmd->name);
			} else if (command == ILB_ENABLE_RULE) {
				ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
				    NULL);
			} else if (command == ILB_DISABLE_RULE) {
				ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
				    NULL);
			}
		}
		break;
	}
	case ILB_NUM_RULES: {
		ilb_num_rules_cmd_t *cmd;

		if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
			ret = EINVAL;
			break;
		}
		cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
		ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
		break;
	}
	case ILB_RULE_NAMES: {
		ilb_rule_names_cmd_t *cmd;

		cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
		    cmd->num_names == 0) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_names * ILB_RULE_NAMESZ;
		if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}
		ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
		break;
	}
	case ILB_NUM_SERVERS: {
		ilb_num_servers_cmd_t *cmd;

		if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
			ret = EINVAL;
			break;
		}
		cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
		ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
		    &(cmd->num));
		break;
	}
	case ILB_LIST_RULE: {
		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;

		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
			ret = EINVAL;
			break;
		}
		ret = ilb_rule_list(ilbs, zoneid, cmd);
		break;
	}
	case ILB_LIST_SERVERS: {
		ilb_servers_info_cmd_t *cmd;

		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
		    cmd->num_servers == 0) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_servers * sizeof (ilb_server_info_t);
		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t,
		    servers) + size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
		    &cmd->num_servers);
		break;
	}
	case ILB_ADD_SERVERS: {
		ilb_servers_info_cmd_t *cmd;
		ilb_rule_t *rule;

		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_servers * sizeof (ilb_server_info_t);
		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t,
		    servers) + size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}
		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
		if (rule == NULL) {
			ASSERT(ret != 0);
			break;
		}
		for (i = 0; i < cmd->num_servers; i++) {
			ilb_server_info_t *s;

			s = &cmd->servers[i];
			s->err = ilb_server_add(ilbs, rule, s);
		}
		ILB_RULE_REFRELE(rule);
		break;
	}
	case ILB_DEL_SERVERS:
	case ILB_ENABLE_SERVERS:
	case ILB_DISABLE_SERVERS: {
		ilb_servers_cmd_t *cmd;
		ilb_rule_t *rule;
		int (*f)();

		cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_servers * sizeof (ilb_server_arg_t);
		if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		if (command == ILB_DEL_SERVERS)
			f = ilb_server_del;
		else if (command == ILB_ENABLE_SERVERS)
			f = ilb_server_enable;
		else if (command == ILB_DISABLE_SERVERS)
			f = ilb_server_disable;

		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
		if (rule == NULL) {
			ASSERT(ret != 0);
			break;
		}

		for (i = 0; i < cmd->num_servers; i++) {
			ilb_server_arg_t *s;

			s = &cmd->servers[i];
			s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
		}
		ILB_RULE_REFRELE(rule);
		break;
	}
	case ILB_LIST_NAT_TABLE: {
		ilb_list_nat_cmd_t *cmd;

		cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_nat * sizeof (ilb_nat_entry_t);
		if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
		    &cmd->flags);
		break;
	}
	case ILB_LIST_STICKY_TABLE: {
		ilb_list_sticky_cmd_t *cmd;

		cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
		if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
			ret = EINVAL;
			break;
		}
		size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
		if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
		    size != cmd_mp->b_wptr) {
			ret = EINVAL;
			break;
		}

		ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
		    &cmd->num_sticky, &cmd->flags);
		break;
	}
	default:
		ret = EINVAL;
		break;
	}

	return (ret);
}
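/*
 * The variable-length ILB commands above share one validation pattern: the
 * fixed portion of the command structure plus `num' trailing elements must
 * exactly span the mblk.  A hypothetical helper (not in this file; the
 * checks are open-coded above) capturing that pattern would look like:
 *
 *	static boolean_t
 *	ilb_cmd_len_ok(mblk_t *mp, size_t fixed_off, uint_t num, size_t elsz)
 *	{
 *		return (mp->b_rptr + fixed_off + num * elsz == mp->b_wptr);
 *	}
 *
 * For example, the ILB_LIST_NAT_TABLE check is equivalent to
 *	ilb_cmd_len_ok(cmd_mp, offsetof(ilb_list_nat_cmd_t, entries),
 *	    cmd->num_nat, sizeof (ilb_nat_entry_t))
 */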