/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1990 Mentat Inc. */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>

#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <sys/iphada.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

/* The character which tells where the ill_name ends */
#define IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
    int	ipft_cmd;
    pfi_t	ipft_pfi;
    int	ipft_min_size;
    int	ipft_flags;
} ipft_t;
#define IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */
typedef struct ip_sock_ar_s {
    union {
        area_t	ip_sock_area;
        ared_t	ip_sock_ared;
        areq_t	ip_sock_areq;
    } ip_sock_ar_u;
    queue_t	*ip_sock_ar_q;
} ip_sock_ar_t;

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li, boolean_t doconsist);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert);
static void	ipif_check_bcast_ires(ipif_t *test_ipif);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static void	ipif_down_delete_ire(ire_t *ire, char *ipif);
static void	ipif_delete_cache_ire(ire_t *, char *);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_mtu_change(ire_t *ire, char *ipif_arg);
static void	ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
static void	ipif_update_other_ipifs(ipif_t *old_ipif);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static int	ill_arp_off(ill_t *ill);
static int	ill_arp_on(ill_t *ill);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_downi(ire_t *ire, char *ill_arg);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo;
static ip_v6mapinfo_func_t ip_nodef_v6mapinfo;
static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo;
static ip_v4mapinfo_func_t ip_nodef_v4mapinfo;
static void	ipif_save_ire(ipif_t *, ire_t *);
static void	ipif_remove_ire(ipif_t *, ire_t *);
static void	ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

/*
 * Per-ill IPsec capabilities management.
 */
static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
static void	ill_ipsec_capab_free(ill_ipsec_capab_t *);
static void	ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
static void	ill_ipsec_capab_delete(ill_t *, uint_t);
static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
static void	ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
    boolean_t);
static void	ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_mdt_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_ipsec_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void	ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static int	ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *,
    int *);
static void	ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void	ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void	ill_capability_dld_enable(ill_t *);
static void	ill_capability_ack_thr(void *);
static void	ill_capability_lso_enable(ill_t *);
static void	ill_capability_send(ill_t *, mblk_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	conn_cleanup_stale_ire(conn_t *, caddr_t);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

/*
 * Maximum number of IPsec algorithms supported.  Limited to 1 byte by
 * PF_KEY and the IPsec DOI.
 */
#define MAX_IPSEC_ALGS	256

#define BITSPERBYTE	8
#define BITS(type)	(BITSPERBYTE * (long)sizeof (type))

#define IPSEC_ALG_ENABLE(algs, algid) \
    ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
    (1 << ((algid) % BITS(ipsec_capab_elem_t))))

#define IPSEC_ALG_IS_ENABLED(algid, algs) \
    ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
    (1 << ((algid) % BITS(ipsec_capab_elem_t))))
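
/*
 * Illustrative note (not in the original): with ipsec_capab_elem_t
 * defined as uint8_t below, BITS(ipsec_capab_elem_t) is 8, so
 * IPSEC_ALG_ENABLE(algs, 10) sets bit (10 % 8) == 2 of algs[10 / 8],
 * i.e. algs[1] |= 0x04, and IPSEC_ALG_IS_ENABLED(10, algs) tests that
 * same bit.  An array of MAX_IPSEC_ALGS / 8 == 32 bytes therefore
 * covers every possible algorithm id.
 */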

typedef uint8_t ipsec_capab_elem_t;

/*
 * Per-algorithm parameters.  Note that at present, only encryption
 * algorithms have variable keysize (IKE does not provide a way to negotiate
 * auth algorithm keysize).
 *
 * All sizes here are in bits.
 */
typedef struct
{
    uint16_t	minkeylen;
    uint16_t	maxkeylen;
} ipsec_capab_algparm_t;

/*
 * Per-ill capabilities.
 */
struct ill_ipsec_capab_s {
    ipsec_capab_elem_t *encr_hw_algs;
    ipsec_capab_elem_t *auth_hw_algs;
    uint32_t algs_size;	/* size of _hw_algs in bytes */
    /* algorithm key lengths */
    ipsec_capab_algparm_t *encr_algparm;
    uint32_t encr_algparm_size;
    uint32_t encr_algparm_end;
};

/*
 * The field values are larger than strictly necessary for simple
 * AR_ENTRY_ADDs but the padding lets us accommodate the socket ioctls.
 */
static area_t	ip_area_template = {
    AR_ENTRY_ADD,			/* area_cmd */
    sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
					/* area_name_offset */
    /* area_name_length temporarily holds this structure length */
    sizeof (area_t),			/* area_name_length */
    IP_ARP_PROTO_TYPE,			/* area_proto */
    sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
    IP_ADDR_LEN,			/* area_proto_addr_length */
    sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
					/* area_proto_mask_offset */
    0,					/* area_flags */
    sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
					/* area_hw_addr_offset */
    /* Zero length hw_addr_length means 'use your idea of the address' */
    0					/* area_hw_addr_length */
};

/*
 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
 * support.
 */
static area_t	ip6_area_template = {
    AR_ENTRY_ADD,			/* area_cmd */
    sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
					/* area_name_offset */
    /* area_name_length temporarily holds this structure length */
    sizeof (area_t),			/* area_name_length */
    IP_ARP_PROTO_TYPE,			/* area_proto */
    sizeof (ip_sock_ar_t),		/* area_proto_addr_offset */
    IPV6_ADDR_LEN,			/* area_proto_addr_length */
    sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
					/* area_proto_mask_offset */
    0,					/* area_flags */
    sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
					/* area_hw_addr_offset */
    /* Zero length hw_addr_length means 'use your idea of the address' */
    0					/* area_hw_addr_length */
};

static ared_t	ip_ared_template = {
    AR_ENTRY_DELETE,
    sizeof (ared_t) + IP_ADDR_LEN,
    sizeof (ared_t),
    IP_ARP_PROTO_TYPE,
    sizeof (ared_t),
    IP_ADDR_LEN,
    0
};

static ared_t	ip6_ared_template = {
    AR_ENTRY_DELETE,
    sizeof (ared_t) + IPV6_ADDR_LEN,
    sizeof (ared_t),
    IP_ARP_PROTO_TYPE,
    sizeof (ared_t),
    IPV6_ADDR_LEN,
    0
};
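
/*
 * Illustrative note (not in the original): an AR_ENTRY_ADD message
 * built from ip_area_template is laid out as
 *
 *	[ip_sock_ar_t][proto addr][proto mask][hw addr area][ill name]
 *
 * with the offsets above pointing into that buffer.  ill_arp_alloc()
 * appends the ill name at area_name_offset and rewrites
 * area_name_length with the real name length.
 */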

/*
 * A template for an IPv6 AR_ENTRY_QUERY has not been created, as
 * the areq doesn't include an IP address in ill_dl_up() (the only place
 * an areq is used).
 */
static areq_t	ip_areq_template = {
    AR_ENTRY_QUERY,			/* cmd */
    sizeof (areq_t)+(2*IP_ADDR_LEN),	/* name offset */
    sizeof (areq_t),	/* name len (filled by ill_arp_alloc) */
    IP_ARP_PROTO_TYPE,			/* protocol, from arps perspective */
    sizeof (areq_t),			/* target addr offset */
    IP_ADDR_LEN,			/* target addr_length */
    0,					/* flags */
    sizeof (areq_t) + IP_ADDR_LEN,	/* sender addr offset */
    IP_ADDR_LEN,			/* sender addr length */
    AR_EQ_DEFAULT_XMIT_COUNT,		/* xmit_count */
    AR_EQ_DEFAULT_XMIT_INTERVAL,	/* (re)xmit_interval in milliseconds */
    AR_EQ_DEFAULT_MAX_BUFFERED		/* max # of requests to buffer */
    /* anything else filled in by the code */
};

static arc_t	ip_aru_template = {
    AR_INTERFACE_UP,
    sizeof (arc_t),		/* Name offset */
    sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_ard_template = {
    AR_INTERFACE_DOWN,
    sizeof (arc_t),		/* Name offset */
    sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aron_template = {
    AR_INTERFACE_ON,
    sizeof (arc_t),		/* Name offset */
    sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arc_t	ip_aroff_template = {
    AR_INTERFACE_OFF,
    sizeof (arc_t),		/* Name offset */
    sizeof (arc_t)		/* Name length (set by ill_arp_alloc) */
};

static arma_t	ip_arma_multi_template = {
    AR_MAPPING_ADD,
    sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
				/* Name offset */
    sizeof (arma_t),		/* Name length (set by ill_arp_alloc) */
    IP_ARP_PROTO_TYPE,
    sizeof (arma_t),			/* proto_addr_offset */
    IP_ADDR_LEN,			/* proto_addr_length */
    sizeof (arma_t) + IP_ADDR_LEN,	/* proto_mask_offset */
    sizeof (arma_t) + 2*IP_ADDR_LEN,	/* proto_extract_mask_offset */
    ACE_F_PERMANENT | ACE_F_MAPPING,	/* flags */
    sizeof (arma_t) + 3*IP_ADDR_LEN,	/* hw_addr_offset */
    IP_MAX_HW_LEN,			/* hw_addr_length */
    0,					/* hw_mapping_start */
};

static ipft_t	ip_ioctl_ftbl[] = {
    { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
    { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
        IPFT_F_NO_REPLY },
    { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
        IPFT_F_NO_REPLY },
    { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
    { 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
    IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
    { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid,
        ip_nodef_v6intfid },
    { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid,
        ip_nodef_v6intfid },
    { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid,
        ip_nodef_v6intfid },
    { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
        ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_ipv4_v6destintfid },
    { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
        ip_nodef_v6mapinfo, ip_ipv6_v6intfid, ip_ipv6_v6destintfid },
    { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
        ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_nodef_v6intfid },
    { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
    { SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
    { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
        ip_nodef_v6intfid }
};
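
/*
 * Illustrative note (not in the original): ip_m_lookup(), declared
 * above, presumably walks this table matching on the DLPI mac type
 * (DL_ETHER, DL_IB, ...); the trailing DL_OTHER entry supplies
 * ethernet-style defaults for media the table does not otherwise know
 * about.
 */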

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";
static char *ipv4_forward_suffix = ":ip_forwarding";
static char *ipv6_forward_suffix = ":ip6_forwarding";
static	sin6_t	sin6_null;	/* Zero address for quick clears */
static	sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * The ppa arena is created after this many interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok; B_FALSE otherwise.
 * The ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
    /* Already allocated? */
    if (ill->ill_ip_mib != NULL) {
        if (ill->ill_isv6)
            ASSERT(ill->ill_icmp6_mib != NULL);
        return (B_TRUE);
    }

    ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
        KM_NOSLEEP);
    if (ill->ill_ip_mib == NULL) {
        return (B_FALSE);
    }

    /* Setup static information */
    SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
        sizeof (mib2_ipIfStatsEntry_t));
    if (ill->ill_isv6) {
        ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
        SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
            sizeof (mib2_ipv6AddrEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
            sizeof (mib2_ipv6RouteEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
            sizeof (mib2_ipv6NetToMediaEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
            sizeof (ipv6_member_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
            sizeof (ipv6_grpsrc_t));
    } else {
        ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
        SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
            sizeof (mib2_ipAddrEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
            sizeof (mib2_ipRouteEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
            sizeof (mib2_ipNetToMediaEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
            sizeof (ip_member_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
            sizeof (ip_grpsrc_t));

        /*
         * For a v4 ill, we are done at this point, because per-ill
         * icmp mibs are only used for v6.
         */
        return (B_TRUE);
    }

    ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
        KM_NOSLEEP);
    if (ill->ill_icmp6_mib == NULL) {
        kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
        ill->ill_ip_mib = NULL;
        return (B_FALSE);
    }
    /* static icmp info */
    ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
        sizeof (mib2_ipv6IfIcmpEntry_t);
    /*
     * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
     * after the phyint merge occurs in ipif_set_values ->
     * ill_glist_insert -> ill_phyint_reinit.
     */
    return (B_TRUE);
}

/*
 * Common code for preparation of ARP commands.  Two points to remember:
 * 1) The ill_name is tacked on at the end of the allocated space so
 *    the template's name_offset field must contain the total space
 *    to allocate less the name length.
 *
 * 2) The template's name_length field should contain the *template*
 *    length.  We use it as a parameter to bcopy() and then write
 *    the real ill_name_length into the name_length field of the copy.
 * (Always called as writer.)
 */
mblk_t *
ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr)
{
    arc_t	*arc = (arc_t *)template;
    char	*cp;
    int	len;
    mblk_t	*mp;
    uint_t	name_length = ill->ill_name_length;
    uint_t	template_len = arc->arc_name_length;

    len = arc->arc_name_offset + name_length;
    mp = allocb(len, BPRI_HI);
    if (mp == NULL)
        return (NULL);
    cp = (char *)mp->b_rptr;
    mp->b_wptr = (uchar_t *)&cp[len];
    if (template_len)
        bcopy(template, cp, template_len);
    if (len > template_len)
        bzero(&cp[template_len], len - template_len);
    mp->b_datap->db_type = M_PROTO;

    arc = (arc_t *)cp;
    arc->arc_name_length = name_length;
    cp = (char *)arc + arc->arc_name_offset;
    bcopy(ill->ill_name, cp, name_length);

    if (addr) {
        area_t	*area = (area_t *)mp->b_rptr;

        cp = (char *)area + area->area_proto_addr_offset;
        bcopy(addr, cp, area->area_proto_addr_length);
        if (area->area_cmd == AR_ENTRY_ADD) {
            cp = (char *)area;
            len = area->area_proto_addr_length;
            if (area->area_proto_mask_offset)
                cp += area->area_proto_mask_offset;
            else
                cp += area->area_proto_addr_offset + len;
            while (len-- > 0)
                *cp++ = (char)~0;
        }
    }
    return (mp);
}
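
/*
 * Illustrative note (not in the original): for ip_area_template on an
 * ill named "hme0" (ill_name_length of 5, counting the terminating
 * NUL), the call
 *
 *	ill_arp_alloc(ill, (uchar_t *)&ip_area_template, addr);
 *
 * allocates area_name_offset + 5 bytes, copies the template, zeroes
 * the tail, appends "hme0\0" at area_name_offset, copies `addr' to
 * area_proto_addr_offset, and (since the command is AR_ENTRY_ADD)
 * fills the proto mask with all-ones.
 */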

mblk_t *
ipif_area_alloc(ipif_t *ipif, uint_t optflags)
{
    caddr_t	addr;
    mblk_t	*mp;
    area_t	*area;
    uchar_t	*areap;
    ill_t	*ill = ipif->ipif_ill;

    if (ill->ill_isv6) {
        ASSERT(ill->ill_flags & ILLF_XRESOLV);
        addr = (caddr_t)&ipif->ipif_v6lcl_addr;
        areap = (uchar_t *)&ip6_area_template;
    } else {
        addr = (caddr_t)&ipif->ipif_lcl_addr;
        areap = (uchar_t *)&ip_area_template;
    }

    if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL)
        return (NULL);

    /*
     * IPMP requires that the hardware address be included in all
     * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on.
     * If there are no active underlying ills in the group (and thus no
     * hardware address), DAD will be deferred until an underlying ill
     * becomes active.
     */
    if (IS_IPMP(ill)) {
        if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
            freemsg(mp);
            return (NULL);
        }
    } else {
        ill_refhold(ill);
    }

    area = (area_t *)mp->b_rptr;
    area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR;
    area->area_flags |= optflags;
    area->area_hw_addr_length = ill->ill_phys_addr_length;
    bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset,
        area->area_hw_addr_length);

    ill_refrele(ill);
    return (mp);
}

mblk_t *
ipif_ared_alloc(ipif_t *ipif)
{
    caddr_t	addr;
    uchar_t	*aredp;

    if (ipif->ipif_ill->ill_isv6) {
        ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV);
        addr = (caddr_t)&ipif->ipif_v6lcl_addr;
        aredp = (uchar_t *)&ip6_ared_template;
    } else {
        addr = (caddr_t)&ipif->ipif_lcl_addr;
        aredp = (uchar_t *)&ip_ared_template;
    }

    return (ill_arp_alloc(ipif->ipif_ill, aredp, addr));
}

mblk_t *
ill_ared_alloc(ill_t *ill, ipaddr_t addr)
{
    return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
        (char *)&addr));
}

mblk_t *
ill_arie_alloc(ill_t *ill, const char *grifname, const void *template)
{
    mblk_t	*mp = ill_arp_alloc(ill, template, 0);
    arie_t	*arie;

    if (mp != NULL) {
        arie = (arie_t *)mp->b_rptr;
        (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ);
    }
    return (mp);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
    ipif_t	*ipif;
    ill_t	*prev_ill;
    ip_stack_t	*ipst = ill->ill_ipst;

    /*
     * ill_delete may be forcibly entering the ipsq.  The previous
     * ioctl may not have completed and may need to be aborted.
     * ipsq_flush takes care of it.  If we don't need to enter the
     * ipsq forcibly, the 2nd invocation of ipsq_flush in
     * ill_delete_tail is sufficient.
     */
    ipsq_flush(ill);

    /*
     * Nuke all interfaces.  ipif_free will take down the interface,
     * remove it from the list, and free the data structure.
     * Walk down the ipif list and remove the logical interfaces
     * first before removing the main ipif.  We can't unplumb
     * zeroth interface first in the case of IPv6 as reset_conn_ill
     * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
     * POINTOPOINT.
     *
     * If ill_ipif was not properly initialized (i.e low on memory),
     * then there are no interfaces to clean up.  In this case just
     * clean up the ill.
     */
    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
        ipif_free(ipif);

    /*
     * Used only by ill_arp_on and ill_arp_off, which are writers.
     * So nobody can be using this mp now.  Free the mp allocated for
     * honoring ILLF_NOARP.
     */
    freemsg(ill->ill_arp_on_mp);
    ill->ill_arp_on_mp = NULL;

    /* Clean up msgs on pending upcalls for mrouted */
    reset_mrt_ill(ill);

    /*
     * ipif_free -> reset_conn_ipif will remove all multicast
     * references for IPv4.  For IPv6, we need to do it here as
     * it points only at ills.
     */
    reset_conn_ill(ill);

    /*
     * Remove multicast references added as a result of calls to
     * ip_join_allmulti().
     */
    ip_purge_allmulti(ill);

    /*
     * If the ill being deleted is under IPMP, boot it out of the illgrp.
     */
    if (IS_UNDER_IPMP(ill))
        ipmp_ill_leave_illgrp(ill);

    /*
     * ill_down will arrange to blow off any IRE's dependent on this
     * ILL, and shut down fragmentation reassembly.
     */
    ill_down(ill);

    /* Let SCTP know, so that it can remove this from its list. */
    sctp_update_ill(ill, SCTP_ILL_REMOVE);

    /*
     * If an address on this ILL is being used as a source address then
     * clear out the pointers in other ILLs that point to this ILL.
     */
    rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
    if (ill->ill_usesrc_grp_next != NULL) {
        if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
            ill_disband_usesrc_group(ill);
        } else {	/* consumer of the usesrc ILL */
            prev_ill = ill_prev_usesrc(ill);
            prev_ill->ill_usesrc_grp_next =
                ill->ill_usesrc_grp_next;
        }
    }
    rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
    ill_t	*ill = ipif->ipif_ill;

    mutex_enter(&ill->ill_lock);
    if (ipif->ipif_flags & IPIF_DUPLICATE) {
        ipif->ipif_flags &= ~IPIF_DUPLICATE;
        ASSERT(ill->ill_ipif_dup_count > 0);
        ill->ill_ipif_dup_count--;
    }
    mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone.  The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
    mblk_t	**mpp;
    ipif_t	*ipif;
    ip_stack_t	*ipst = ill->ill_ipst;

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        ipif_non_duplicate(ipif);
        ipif_down_tail(ipif);
    }

    ASSERT(ill->ill_ipif_dup_count == 0 &&
        ill->ill_arp_down_mp == NULL &&
        ill->ill_arp_del_mapping_mp == NULL);

    /*
     * If polling capability is enabled (which signifies direct
     * upcall into IP and driver has ill saved as a handle),
     * we need to make sure that unbind has completed before we
     * let the ill disappear and driver no longer has any reference
     * to this ill.
     */
    mutex_enter(&ill->ill_lock);
    while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
        cv_wait(&ill->ill_cv, &ill->ill_lock);
    mutex_exit(&ill->ill_lock);
    ASSERT(!(ill->ill_capabilities &
        (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

    if (ill->ill_net_type != IRE_LOOPBACK)
        qprocsoff(ill->ill_rq);

    /*
     * We do an ipsq_flush once again now.  New messages could have
     * landed up from below (M_ERROR or M_HANGUP).  Similarly ioctls
     * could also have landed up if an ioctl thread had looked up
     * the ill before we set the ILL_CONDEMNED flag, but not yet
     * enqueued the ioctl when we did the ipsq_flush last time.
     */
    ipsq_flush(ill);

    /*
     * Free capabilities.
     */
    if (ill->ill_ipsec_capab_ah != NULL) {
        ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
        ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
        ill->ill_ipsec_capab_ah = NULL;
    }

    if (ill->ill_ipsec_capab_esp != NULL) {
        ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
        ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
        ill->ill_ipsec_capab_esp = NULL;
    }

    if (ill->ill_mdt_capab != NULL) {
        kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
        ill->ill_mdt_capab = NULL;
    }

    if (ill->ill_hcksum_capab != NULL) {
        kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
        ill->ill_hcksum_capab = NULL;
    }

    if (ill->ill_zerocopy_capab != NULL) {
        kmem_free(ill->ill_zerocopy_capab,
            sizeof (ill_zerocopy_capab_t));
        ill->ill_zerocopy_capab = NULL;
    }

    if (ill->ill_lso_capab != NULL) {
        kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
        ill->ill_lso_capab = NULL;
    }

    if (ill->ill_dld_capab != NULL) {
        kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
        ill->ill_dld_capab = NULL;
    }

    while (ill->ill_ipif != NULL)
        ipif_free_tail(ill->ill_ipif);

    /*
     * We have removed all references to ilm from conn and the ones joined
     * within the kernel.
     *
     * We don't walk conns, mrts and ires because
     *
     * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
     * 2) ill_down -> ill_downi walks all the ires and cleans up
     *    ill references.
     */
    ASSERT(ilm_walk_ill(ill) == 0);

    /*
     * If this ill is an IPMP meta-interface, blow away the illgrp.  This
     * is safe to do because the illgrp has already been unlinked from the
     * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
     */
    if (IS_IPMP(ill)) {
        ipmp_illgrp_destroy(ill->ill_grp);
        ill->ill_grp = NULL;
    }

    /*
     * Take us out of the list of ILLs.  ill_glist_delete -> phyint_free
     * could free the phyint.  No more reference to the phyint after this
     * point.
     */
    (void) ill_glist_delete(ill);

    rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
    if (ill->ill_ndd_name != NULL)
        nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name);
    rw_exit(&ipst->ips_ip_g_nd_lock);

    if (ill->ill_frag_ptr != NULL) {
        uint_t count;

        for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
            mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
        }
        mi_free(ill->ill_frag_ptr);
        ill->ill_frag_ptr = NULL;
        ill->ill_frag_hash_tbl = NULL;
    }

    freemsg(ill->ill_nd_lla_mp);
    /* Free all retained control messages. */
    mpp = &ill->ill_first_mp_to_free;
    do {
        while (mpp[0]) {
            mblk_t  *mp;
            mblk_t  *mp1;

            mp = mpp[0];
            mpp[0] = mp->b_next;
            for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
                mp1->b_next = NULL;
                mp1->b_prev = NULL;
            }
            freemsg(mp);
        }
    } while (mpp++ != &ill->ill_last_mp_to_free);

    ill_free_mib(ill);

#ifdef DEBUG
    ill_trace_cleanup(ill);
#endif

    /* Drop refcnt here */
    netstack_rele(ill->ill_ipst->ips_netstack);
    ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * MIB statistics must not be lost, so when an interface
     * goes away the counter values will be added to the global
     * MIBs.
     */
    if (ill->ill_ip_mib != NULL) {
        if (ill->ill_isv6) {
            ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
                ill->ill_ip_mib);
        } else {
            ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
                ill->ill_ip_mib);
        }

        kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
        ill->ill_ip_mib = NULL;
    }
    if (ill->ill_icmp6_mib != NULL) {
        ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
            ill->ill_icmp6_mib);
        kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
        ill->ill_icmp6_mib = NULL;
    }
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
    uint16_t sap_addr = (uint16_t)sap_src;

    if (sap_length == 0) {
        if (phys_src == NULL)
            bzero(dst, phys_length);
        else
            bcopy(phys_src, dst, phys_length);
    } else if (sap_length < 0) {
        if (phys_src == NULL)
            bzero(dst, phys_length);
        else
            bcopy(phys_src, dst, phys_length);
        bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
    } else {
        bcopy(&sap_addr, dst, sizeof (sap_addr));
        if (phys_src == NULL)
            bzero((char *)dst + sap_length, phys_length);
        else
            bcopy(phys_src, (char *)dst + sap_length, phys_length);
    }
}
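
/*
 * Illustrative note (not in the original): for a 6-byte ethernet
 * address aa:bb:cc:dd:ee:ff and sap 0x0800, the resulting dlpi address
 * is laid out as
 *
 *	sap_length ==  2 (head): <sap bytes> aa bb cc dd ee ff
 *	sap_length == -2 (tail): aa bb cc dd ee ff <sap bytes>
 *
 * where the order of the two sap bytes is whatever the bcopy() of the
 * uint16_t sap_addr produces on the host (i.e. host byte order).
 */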

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * TRUE? In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
    dl_unitdata_req_t *dlur;
    mblk_t	*mp;
    t_scalar_t	abs_sap_length;		/* absolute value */

    abs_sap_length = ABS(sap_length);
    mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
        DL_UNITDATA_REQ);
    if (mp == NULL)
        return (NULL);
    dlur = (dl_unitdata_req_t *)mp->b_rptr;
    /* HACK: accommodate incompatible DLPI drivers */
    if (addr_length == 8)
        addr_length = 6;
    dlur->dl_dest_addr_length = addr_length + abs_sap_length;
    dlur->dl_dest_addr_offset = sizeof (*dlur);
    dlur->dl_priority.dl_min = 0;
    dlur->dl_priority.dl_max = 0;
    ill_dlur_copy_address(addr, addr_length, sap, sap_length,
        (uchar_t *)&dlur[1]);
    return (mp);
}

/*
 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp.  Return
 * an error if we already have 1 or more ioctls in progress.  This is only
 * needed for SIOCG*ARP.
 */
boolean_t
ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
{
    ASSERT(MUTEX_HELD(&ill->ill_lock));
    ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
    /* We should only see M_IOCDATA arp ioctls here. */
    ASSERT(add_mp->b_datap->db_type == M_IOCDATA);

    ASSERT(MUTEX_HELD(&connp->conn_lock));
    /*
     * Return error if the conn has started closing.  The conn
     * could have finished cleaning up the pending mp list; if so
     * we should not add another mp to the list, negating the cleanup.
     */
    if (connp->conn_state_flags & CONN_CLOSING)
        return (B_FALSE);
    /*
     * Add the pending mp to the head of the list, chained by b_next.
     * Note down the conn on which the ioctl request came, in b_prev.
     * This will be used to later get the conn, when we get a response
     * on the ill queue, from some other module (typically arp).
     */
    add_mp->b_next = (void *)ill->ill_pending_mp;
    add_mp->b_queue = CONNP_TO_WQ(connp);
    ill->ill_pending_mp = add_mp;
    if (connp != NULL)
        connp->conn_oper_pending_ill = ill;
    return (B_TRUE);
}

/*
 * Retrieve the ill_pending_mp and return it.  We have to walk the list
 * of mblks starting at ill_pending_mp, and match based on the ioc_id.
 */
mblk_t *
ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
{
    mblk_t	*prev = NULL;
    mblk_t	*curr = NULL;
    uint_t	id;
    conn_t	*connp;

    /*
     * When the conn closes, conn_ioctl_cleanup needs to clean
     * up the pending mp, but it does not know the ioc_id and
     * passes in a zero for it.
     */
    mutex_enter(&ill->ill_lock);
    if (ioc_id != 0)
        *connpp = NULL;

    /* Search the list for the appropriate ioctl based on ioc_id */
    for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
        prev = curr, curr = curr->b_next) {
        id = ((struct iocblk *)curr->b_rptr)->ioc_id;
        connp = Q_TO_CONN(curr->b_queue);
        /* Match based on the ioc_id or based on the conn */
        if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
            break;
    }

    if (curr != NULL) {
        /* Unlink the mblk from the pending mp list */
        if (prev != NULL) {
            prev->b_next = curr->b_next;
        } else {
            ASSERT(ill->ill_pending_mp == curr);
            ill->ill_pending_mp = curr->b_next;
        }

        /*
         * conn refcnt must have been bumped up at the start of
         * the ioctl.  So we can safely access the conn.
         */
        ASSERT(CONN_Q(curr->b_queue));
        *connpp = Q_TO_CONN(curr->b_queue);
        curr->b_next = NULL;
        curr->b_queue = NULL;
    }

    mutex_exit(&ill->ill_lock);

    return (curr);
}

/*
 * Add the pending mp to the list.  There can be only 1 pending mp
 * in the list.  Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver.  This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
    ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

    ASSERT(IAM_WRITER_IPIF(ipif));
    ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
    ASSERT(ipx->ipx_pending_mp == NULL);
    /*
     * The caller may be using a different ipif than the one passed into
     * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
     * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
     * that `ipx_current_ipif == ipif'.
     */
    ASSERT(ipx->ipx_current_ipif != NULL);

    /*
     * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
     * driver.
     */
    ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
        (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
        (DB_TYPE(add_mp) == M_PCPROTO));

    if (connp != NULL) {
        ASSERT(MUTEX_HELD(&connp->conn_lock));
        /*
         * Return error if the conn has started closing.  The conn
         * could have finished cleaning up the pending mp list; if so
         * we should not add another mp to the list, negating the
         * cleanup.
         */
        if (connp->conn_state_flags & CONN_CLOSING)
            return (B_FALSE);
    }
    mutex_enter(&ipx->ipx_lock);
    ipx->ipx_pending_ipif = ipif;
    /*
     * Note down the queue in b_queue.  This will be returned by
     * ipsq_pending_mp_get.  Caller will then use these values to restart
     * the processing.
     */
    add_mp->b_next = NULL;
    add_mp->b_queue = q;
    ipx->ipx_pending_mp = add_mp;
    ipx->ipx_waitfor = waitfor;
    mutex_exit(&ipx->ipx_lock);

    if (connp != NULL)
        connp->conn_oper_pending_ill = ipif->ipif_ill;

    return (B_TRUE);
}

/*
 * Retrieve the ipx_pending_mp and return it.  There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
    mblk_t	*curr = NULL;
    ipxop_t	*ipx = ipsq->ipsq_xop;

    *connpp = NULL;
    mutex_enter(&ipx->ipx_lock);
    if (ipx->ipx_pending_mp == NULL) {
        mutex_exit(&ipx->ipx_lock);
        return (NULL);
    }

    /* There can be only 1 such excl message */
    curr = ipx->ipx_pending_mp;
    ASSERT(curr->b_next == NULL);
    ipx->ipx_pending_ipif = NULL;
    ipx->ipx_pending_mp = NULL;
    ipx->ipx_waitfor = 0;
    mutex_exit(&ipx->ipx_lock);

    if (CONN_Q(curr->b_queue)) {
        /*
         * This mp did a refhold on the conn, at the start of the ioctl.
         * So we can safely return a pointer to the conn to the caller.
         */
        *connpp = Q_TO_CONN(curr->b_queue);
    } else {
        *connpp = NULL;
    }
    curr->b_next = NULL;
    curr->b_prev = NULL;
    return (curr);
}
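
/*
 * Illustrative note (not in the original): the usual handshake is that
 * an exclusive operation parks its mblk before waiting, as
 * ill_down_start() below does:
 *
 *	mutex_enter(&ill->ill_lock);
 *	(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
 *	    mp, ILL_DOWN);
 *	mutex_exit(&ill->ill_lock);
 *
 * and the thread that later sees the driver's response (or notices the
 * refcnts it was waiting on have dropped) calls ipsq_pending_mp_get()
 * to recover the mblk and queue and restart the suspended operation.
 */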

/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
    mblk_t	*mp;
    ipxop_t	*ipx;
    queue_t	*q;
    ipif_t	*ipif;

    ASSERT(IAM_WRITER_ILL(ill));
    ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

    /*
     * If connp is null, unconditionally clean up the ipx_pending_mp.
     * This happens in M_ERROR/M_HANGUP.  We need to abort the current
     * ioctl even if it is meant for another ill, since we have to enqueue
     * a new mp now in ipx_pending_mp to complete the ipif_down.
     * If connp is non-null we are called from the conn close path.
     */
    mutex_enter(&ipx->ipx_lock);
    mp = ipx->ipx_pending_mp;
    if (mp == NULL || (connp != NULL &&
        mp->b_queue != CONNP_TO_WQ(connp))) {
        mutex_exit(&ipx->ipx_lock);
        return (B_FALSE);
    }
    /* Now remove from the ipx_pending_mp */
    ipx->ipx_pending_mp = NULL;
    q = mp->b_queue;
    mp->b_next = NULL;
    mp->b_prev = NULL;
    mp->b_queue = NULL;

    ipif = ipx->ipx_pending_ipif;
    ipx->ipx_pending_ipif = NULL;
    ipx->ipx_waitfor = 0;
    ipx->ipx_current_ipif = NULL;
    ipx->ipx_current_ioctl = 0;
    ipx->ipx_current_done = B_TRUE;
    mutex_exit(&ipx->ipx_lock);

    if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
        if (connp == NULL) {
            ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
        } else {
            ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
            mutex_enter(&ipif->ipif_ill->ill_lock);
            ipif->ipif_state_flags &= ~IPIF_CHANGING;
            mutex_exit(&ipif->ipif_ill->ill_lock);
        }
    } else {
        /*
         * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
         * be just inet_freemsg.  We have to restart it, otherwise
         * the thread will be stuck.
         */
        inet_freemsg(mp);
    }
    return (B_TRUE);
}

/*
 * The ill is closing.  Cleanup all the pending mps.  Called exclusively
 * towards the end of ill_delete.  The refcount has gone to 0, so nobody
 * knows this ill, and hence nobody can add an mp to this list.
 */
static void
ill_pending_mp_cleanup(ill_t *ill)
{
    mblk_t	*mp;
    queue_t	*q;

    ASSERT(IAM_WRITER_ILL(ill));

    mutex_enter(&ill->ill_lock);
    /*
     * Every mp on the pending mp list originating from an ioctl
     * added 1 to the conn refcnt, at the start of the ioctl.
     * So bump it down now.  See comments in ip_wput_nondata().
     */
    while (ill->ill_pending_mp != NULL) {
        mp = ill->ill_pending_mp;
        ill->ill_pending_mp = mp->b_next;
        mutex_exit(&ill->ill_lock);

        q = mp->b_queue;
        ASSERT(CONN_Q(q));
        mp->b_next = NULL;
        mp->b_prev = NULL;
        mp->b_queue = NULL;
        ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
        mutex_enter(&ill->ill_lock);
    }
    ill->ill_pending_ipif = NULL;

    mutex_exit(&ill->ill_lock);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
    ipsq_t	*ipsq;
    mblk_t	*prev;
    mblk_t	*curr;
    mblk_t	*next;
    queue_t	*q;
    mblk_t	*tmp_list = NULL;

    ASSERT(IAM_WRITER_ILL(ill));
    if (connp != NULL)
        q = CONNP_TO_WQ(connp);
    else
        q = ill->ill_wq;

    ipsq = ill->ill_phyint->phyint_ipsq;
    /*
     * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
     * In the case of ioctl from a conn, there can be only 1 mp
     * queued on the ipsq.  If an ill is being unplumbed, only messages
     * related to this ill are flushed, like M_ERROR or M_HANGUP message.
     * ioctls meant for this ill from conns are not flushed.  They will
     * be processed during ipsq_exit and will not find the ill and will
     * return error.
     */
    mutex_enter(&ipsq->ipsq_lock);
    for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
        curr = next) {
        next = curr->b_next;
        if (curr->b_queue == q || curr->b_queue == RD(q)) {
            /* Unlink the mblk from the pending mp list */
            if (prev != NULL) {
                prev->b_next = curr->b_next;
            } else {
                ASSERT(ipsq->ipsq_xopq_mphead == curr);
                ipsq->ipsq_xopq_mphead = curr->b_next;
            }
            if (ipsq->ipsq_xopq_mptail == curr)
                ipsq->ipsq_xopq_mptail = prev;
            /*
             * Create a temporary list and release the ipsq lock.
             * New elements are added to the head of the tmp_list.
             */
            curr->b_next = tmp_list;
            tmp_list = curr;
        } else {
            prev = curr;
        }
    }
    mutex_exit(&ipsq->ipsq_lock);

    while (tmp_list != NULL) {
        curr = tmp_list;
        tmp_list = curr->b_next;
        curr->b_next = NULL;
        curr->b_prev = NULL;
        curr->b_queue = NULL;
        if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
            ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
                CONN_CLOSE : NO_COPYOUT, NULL);
        } else {
            /*
             * IP-MT XXX In the case of TLI/XTI bind / optmgmt
             * this can't be just inet_freemsg.  We have to
             * restart it, otherwise the thread will be stuck.
             */
            inet_freemsg(curr);
        }
    }
}

/*
 * This conn has started closing.  Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 ioctl pending on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
    mblk_t	*curr;
    ipsq_t	*ipsq;
    ill_t	*ill;
    boolean_t	refheld;

    /*
     * Is any exclusive ioctl pending?  If so clean it up.  If the
     * ioctl has not yet started, the mp is pending in the list headed by
     * ipsq_xopq_head.  If the ioctl has started, the mp could be present
     * in ipx_pending_mp.  If the ioctl timed out in the streamhead but
     * is currently executing now, the mp is not queued anywhere but
     * conn_oper_pending_ill is null.  The conn close will wait
     * till the conn_ref drops to zero.
     */
    mutex_enter(&connp->conn_lock);
    ill = connp->conn_oper_pending_ill;
    if (ill == NULL) {
        mutex_exit(&connp->conn_lock);
        return;
    }

    curr = ill_pending_mp_get(ill, &connp, 0);
    if (curr != NULL) {
        mutex_exit(&connp->conn_lock);
        CONN_DEC_REF(connp);
        inet_freemsg(curr);
        return;
    }
    /*
     * We may not be able to refhold the ill if the ill/ipif
     * is changing.  But we need to make sure that the ill will
     * not vanish.  So we just bump up the ill_waiter count.
     */
    refheld = ill_waiter_inc(ill);
    mutex_exit(&connp->conn_lock);
    if (refheld) {
        if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
            ill_waiter_dcr(ill);
            /*
             * Check whether this ioctl has started and is
             * pending.  If it is not found there then check
             * whether this ioctl has not even started and is in
             * the ipsq_xopq list.
             */
            if (!ipsq_pending_mp_cleanup(ill, connp))
                ipsq_xopq_mp_cleanup(ill, connp);
            ipsq = ill->ill_phyint->phyint_ipsq;
            ipsq_exit(ipsq);
            return;
        }
    }

    /*
     * The ill is also closing and we could not bump up the
     * ill_waiter_count or we could not enter the ipsq.  Leave
     * the cleanup to ill_delete.
     */
    mutex_enter(&connp->conn_lock);
    while (connp->conn_oper_pending_ill != NULL)
        cv_wait(&connp->conn_refcv, &connp->conn_lock);
    mutex_exit(&connp->conn_lock);
    if (refheld)
        ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
    ill_t	*ill = (ill_t *)arg;
    ire_t	*ire;

    mutex_enter(&connp->conn_lock);
    if (connp->conn_multicast_ill == ill) {
        /* Revert to late binding */
        connp->conn_multicast_ill = NULL;
    }
    if (connp->conn_incoming_ill == ill)
        connp->conn_incoming_ill = NULL;
    if (connp->conn_outgoing_ill == ill)
        connp->conn_outgoing_ill = NULL;
    if (connp->conn_dhcpinit_ill == ill) {
        connp->conn_dhcpinit_ill = NULL;
        ASSERT(ill->ill_dhcpinit != 0);
        atomic_dec_32(&ill->ill_dhcpinit);
    }
    if (connp->conn_ire_cache != NULL) {
        ire = connp->conn_ire_cache;
        /*
         * Source address selection makes it possible for IRE_CACHE
         * entries to be created with ire_stq coming from interface X
         * and ipif coming from interface Y.  Thus whenever interface
         * X goes down, remove all references to it by checking both
         * on ire_ipif and ire_stq.
         */
        if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
            (ire->ire_type == IRE_CACHE &&
            ire->ire_stq == ill->ill_wq)) {
            connp->conn_ire_cache = NULL;
            mutex_exit(&connp->conn_lock);
            ire_refrele_notr(ire);
            return;
        }
    }
    mutex_exit(&connp->conn_lock);
}

static void
ill_down_ipifs_tail(ill_t *ill)
{
    ipif_t	*ipif;

    ASSERT(IAM_WRITER_ILL(ill));
    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        ipif_non_duplicate(ipif);
        ipif_down_tail(ipif);
    }
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
    ASSERT(IAM_WRITER_IPSQ(ipsq));
    ill_down_ipifs_tail(q->q_ptr);
    freemsg(mp);
    ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again.  It is called when we receive an M_ERROR / M_HANGUP.  In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
    ill_t	*ill = q->q_ptr;
    ipif_t	*ipif;

    ASSERT(IAM_WRITER_ILL(ill));

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
        (void) ipif_down(ipif, NULL, NULL);

    ill_down(ill);

    (void) ipsq_pending_mp_cleanup(ill, NULL);

    ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

    /*
     * Atomically test and add the pending mp if references are active.
     */
    mutex_enter(&ill->ill_lock);
    if (!ill_is_quiescent(ill)) {
        /* call cannot fail since `conn_t *' argument is NULL */
        (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
            mp, ILL_DOWN);
        mutex_exit(&ill->ill_lock);
        return (B_FALSE);
    }
    mutex_exit(&ill->ill_lock);
    return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
    ip_stack_t	*ipst = ill->ill_ipst;

    /* Blow off any IREs dependent on this ILL. */
    ire_walk(ill_downi, ill, ipst);

    /* Remove any conn_*_ill depending on this ill */
    ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
}

/*
 * ire_walk routine used to delete every IRE that depends on queues
 * associated with 'ill'.  (Always called as writer.)
 */
static void
ill_downi(ire_t *ire, char *ill_arg)
{
    ill_t	*ill = (ill_t *)ill_arg;

    /*
     * Source address selection makes it possible for IRE_CACHE
     * entries to be created with ire_stq coming from interface X
     * and ipif coming from interface Y.  Thus whenever interface
     * X goes down, remove all references to it by checking both
     * on ire_ipif and ire_stq.
     */
    if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
        (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
        ire_delete(ire);
    }
}

/*
 * Remove ire/nce from the fastpath list.
 */
void
ill_fastpath_nack(ill_t *ill)
{
    nce_fastpath_list_dispatch(ill, NULL, NULL);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
    mblk_t	*mp1 = mp;

    /*
     * If this was the first attempt turn on the fastpath probing.
     */
    mutex_enter(&ill->ill_lock);
    if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
        ill->ill_dlpi_fastpath_state = IDS_OK;
    mutex_exit(&ill->ill_lock);

    /* Free the M_IOCACK mblk, hold on to the data */
    mp = mp->b_cont;
    freeb(mp1);
    if (mp == NULL)
        return;
    if (mp->b_cont != NULL) {
        /*
         * Update all IRE's or NCE's that are waiting for
         * fastpath update.
         */
        nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
        mp1 = mp->b_cont;
        freeb(mp);
        mp = mp1;
    } else {
        ip0dbg(("ill_fastpath_ack:  no b_cont\n"));
    }

    freeb(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
    struct iocblk	*ioc;
    mblk_t	*mp;

    if (dlur_mp == NULL)
        return (EINVAL);

    mutex_enter(&ill->ill_lock);
    switch (ill->ill_dlpi_fastpath_state) {
    case IDS_FAILED:
        /*
         * Driver NAKed the first fastpath ioctl - assume it doesn't
         * support it.
         */
        mutex_exit(&ill->ill_lock);
        return (ENOTSUP);
    case IDS_UNKNOWN:
        /* This is the first probe */
        ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
        break;
    default:
        break;
    }
    mutex_exit(&ill->ill_lock);

    if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
        return (EAGAIN);

    mp->b_cont = copyb(dlur_mp);
    if (mp->b_cont == NULL) {
        freeb(mp);
        return (EAGAIN);
    }

    ioc = (struct iocblk *)mp->b_rptr;
    ioc->ioc_count = msgdsize(mp->b_cont);

    putnext(ill->ill_wq, mp);
    return (0);
}
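
/*
 * Illustrative note (not in the original): the fastpath exchange is a
 * plain STREAMS ioctl.  ill_fastpath_probe() sends DL_IOC_HDR_INFO
 * with the dl_unitdata_req_t template as data; a driver that supports
 * fastpath replies with an M_IOCACK whose b_cont carries the template
 * followed by the prebuilt MAC header, which ill_fastpath_ack() above
 * hands to nce_fastpath_list_dispatch() for the waiting IREs/NCEs.  A
 * driver that doesn't support it NAKs the ioctl, which is recorded as
 * IDS_FAILED (the NAK handling itself is outside this excerpt).
 */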
1765 * We will do a fresh allocation when we get the response to our probe 1766 */ 1767 if (ill->ill_capab_reset_mp != NULL) { 1768 freemsg(ill->ill_capab_reset_mp); 1769 ill->ill_capab_reset_mp = NULL; 1770 } 1771 1772 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1773 1774 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); 1775 if (mp == NULL) 1776 return; 1777 1778 ill_capability_send(ill, mp); 1779 ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; 1780 } 1781 1782 void 1783 ill_capability_reset(ill_t *ill, boolean_t reneg) 1784 { 1785 ASSERT(IAM_WRITER_ILL(ill)); 1786 1787 if (ill->ill_dlpi_capab_state != IDCS_OK) 1788 return; 1789 1790 ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; 1791 1792 ill_capability_send(ill, ill->ill_capab_reset_mp); 1793 ill->ill_capab_reset_mp = NULL; 1794 /* 1795 * We turn off all capabilities except those pertaining to 1796 * direct function call capabilities viz. ILL_CAPAB_DLD* 1797 * which will be turned off by the corresponding reset functions. 1798 */ 1799 ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | 1800 ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); 1801 } 1802 1803 static void 1804 ill_capability_reset_alloc(ill_t *ill) 1805 { 1806 mblk_t *mp; 1807 size_t size = 0; 1808 int err; 1809 dl_capability_req_t *capb; 1810 1811 ASSERT(IAM_WRITER_ILL(ill)); 1812 ASSERT(ill->ill_capab_reset_mp == NULL); 1813 1814 if (ILL_MDT_CAPABLE(ill)) 1815 size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 1816 1817 if (ILL_HCKSUM_CAPABLE(ill)) { 1818 size += sizeof (dl_capability_sub_t) + 1819 sizeof (dl_capab_hcksum_t); 1820 } 1821 1822 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { 1823 size += sizeof (dl_capability_sub_t) + 1824 sizeof (dl_capab_zerocopy_t); 1825 } 1826 1827 if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { 1828 size += sizeof (dl_capability_sub_t); 1829 size += ill_capability_ipsec_reset_size(ill, NULL, NULL, 1830 NULL, NULL); 1831 } 1832 1833 if (ill->ill_capabilities & ILL_CAPAB_DLD) { 1834 size += sizeof (dl_capability_sub_t) + 1835 sizeof (dl_capab_dld_t); 1836 } 1837 1838 mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, 1839 STR_NOSIG, &err); 1840 1841 mp->b_datap->db_type = M_PROTO; 1842 bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); 1843 1844 capb = (dl_capability_req_t *)mp->b_rptr; 1845 capb->dl_primitive = DL_CAPABILITY_REQ; 1846 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1847 capb->dl_sub_length = size; 1848 1849 mp->b_wptr += sizeof (dl_capability_req_t); 1850 1851 /* 1852 * Each handler fills in the corresponding dl_capability_sub_t 1853 * inside the mblk, 1854 */ 1855 ill_capability_mdt_reset_fill(ill, mp); 1856 ill_capability_hcksum_reset_fill(ill, mp); 1857 ill_capability_zerocopy_reset_fill(ill, mp); 1858 ill_capability_ipsec_reset_fill(ill, mp); 1859 ill_capability_dld_reset_fill(ill, mp); 1860 1861 ill->ill_capab_reset_mp = mp; 1862 } 1863 1864 static void 1865 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1866 { 1867 dl_capab_id_t *id_ic; 1868 uint_t sub_dl_cap = outers->dl_cap; 1869 dl_capability_sub_t *inners; 1870 uint8_t *capend; 1871 1872 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1873 1874 /* 1875 * Note: range checks here are not absolutely sufficient to 1876 * make us robust against malformed messages sent by drivers; 1877 * this is in keeping with the rest of IP's dlpi handling. 
1878 * (Remember, it's coming from something else in the kernel 1879 * address space) 1880 */ 1881 1882 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1883 if (capend > mp->b_wptr) { 1884 cmn_err(CE_WARN, "ill_capability_id_ack: " 1885 "malformed sub-capability too long for mblk"); 1886 return; 1887 } 1888 1889 id_ic = (dl_capab_id_t *)(outers + 1); 1890 1891 if (outers->dl_length < sizeof (*id_ic) || 1892 (inners = &id_ic->id_subcap, 1893 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1894 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1895 "encapsulated capab type %d too long for mblk", 1896 inners->dl_cap); 1897 return; 1898 } 1899 1900 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1901 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1902 "isn't as expected; pass-thru module(s) detected, " 1903 "discarding capability\n", inners->dl_cap)); 1904 return; 1905 } 1906 1907 /* Process the encapsulated sub-capability */ 1908 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1909 } 1910 1911 /* 1912 * Process Multidata Transmit capability negotiation ack received from a 1913 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1914 * DL_CAPABILITY_ACK message. 1915 */ 1916 static void 1917 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1918 { 1919 mblk_t *nmp = NULL; 1920 dl_capability_req_t *oc; 1921 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1922 ill_mdt_capab_t **ill_mdt_capab; 1923 uint_t sub_dl_cap = isub->dl_cap; 1924 uint8_t *capend; 1925 1926 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1927 1928 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1929 1930 /* 1931 * Note: range checks here are not absolutely sufficient to 1932 * make us robust against malformed messages sent by drivers; 1933 * this is in keeping with the rest of IP's dlpi handling. 
* (Remember, it's coming from something else in the kernel 1934 1935 * address space) 1936 */ 1937 1938 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1939 if (capend > mp->b_wptr) { 1940 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1941 "malformed sub-capability too long for mblk"); 1942 return; 1943 } 1944 1945 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1946 1947 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1948 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1949 "unsupported MDT sub-capability (version %d, expected %d)", 1950 mdt_ic->mdt_version, MDT_VERSION_2); 1951 return; 1952 } 1953 1954 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 1955 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 1956 "capability isn't as expected; pass-thru module(s) " 1957 "detected, discarding capability\n")); 1958 return; 1959 } 1960 1961 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 1962 1963 if (*ill_mdt_capab == NULL) { 1964 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 1965 KM_NOSLEEP); 1966 if (*ill_mdt_capab == NULL) { 1967 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1968 "could not enable MDT version %d " 1969 "for %s (ENOMEM)\n", MDT_VERSION_2, 1970 ill->ill_name); 1971 return; 1972 } 1973 } 1974 1975 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 1976 "MDT version %d (%d bytes leading, %d bytes trailing " 1977 "header spaces, %d max pld bufs, %d span limit)\n", 1978 ill->ill_name, MDT_VERSION_2, 1979 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 1980 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 1981 1982 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 1983 (*ill_mdt_capab)->ill_mdt_on = 1; 1984 /* 1985 * Round the following values up to the nearest 32-bit boundary; ULP 1986 * may further adjust them to accommodate additional 1987 * protocol headers. We pass these values to ULP during 1988 * bind time.
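 * As an illustrative (hypothetical) example: a driver advertising an
 * mdt_hdr_head of 18 bytes, e.g. for a VLAN-tagged Ethernet header,
 * would be rounded up to 20 bytes here, since roundup(18, 4) == 20.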
1989 */ 1990 (*ill_mdt_capab)->ill_mdt_hdr_head = 1991 roundup(mdt_ic->mdt_hdr_head, 4); 1992 (*ill_mdt_capab)->ill_mdt_hdr_tail = 1993 roundup(mdt_ic->mdt_hdr_tail, 4); 1994 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 1995 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 1996 1997 ill->ill_capabilities |= ILL_CAPAB_MDT; 1998 } else { 1999 uint_t size; 2000 uchar_t *rptr; 2001 2002 size = sizeof (dl_capability_req_t) + 2003 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 2004 2005 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2006 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 2007 "could not enable MDT for %s (ENOMEM)\n", 2008 ill->ill_name); 2009 return; 2010 } 2011 2012 rptr = nmp->b_rptr; 2013 /* initialize dl_capability_req_t */ 2014 oc = (dl_capability_req_t *)nmp->b_rptr; 2015 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2016 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2017 sizeof (dl_capab_mdt_t); 2018 nmp->b_rptr += sizeof (dl_capability_req_t); 2019 2020 /* initialize dl_capability_sub_t */ 2021 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2022 nmp->b_rptr += sizeof (*isub); 2023 2024 /* initialize dl_capab_mdt_t */ 2025 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2026 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2027 2028 nmp->b_rptr = rptr; 2029 2030 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2031 "to enable MDT version %d\n", ill->ill_name, 2032 MDT_VERSION_2)); 2033 2034 /* set ENABLE flag */ 2035 mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; 2036 2037 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2038 ill_capability_send(ill, nmp); 2039 } 2040 } 2041 2042 static void 2043 ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) 2044 { 2045 dl_capab_mdt_t *mdt_subcap; 2046 dl_capability_sub_t *dl_subcap; 2047 2048 if (!ILL_MDT_CAPABLE(ill)) 2049 return; 2050 2051 ASSERT(ill->ill_mdt_capab != NULL); 2052 2053 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2054 dl_subcap->dl_cap = DL_CAPAB_MDT; 2055 dl_subcap->dl_length = sizeof (*mdt_subcap); 2056 2057 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2058 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2059 mdt_subcap->mdt_flags = 0; 2060 mdt_subcap->mdt_hdr_head = 0; 2061 mdt_subcap->mdt_hdr_tail = 0; 2062 2063 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2064 } 2065 2066 static void 2067 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) 2068 { 2069 dl_capability_sub_t *dl_subcap; 2070 2071 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2072 return; 2073 2074 /* 2075 * The dl_capab_dld_t that follows the dl_capability_sub_t is not 2076 * initialized below since it is not used by DLD. 2077 */ 2078 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2079 dl_subcap->dl_cap = DL_CAPAB_DLD; 2080 dl_subcap->dl_length = sizeof (dl_capab_dld_t); 2081 2082 mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); 2083 } 2084 2085 /* 2086 * Allocate an IPsec capability request which will be filled by our 2087 * caller to turn on support for one or more algorithms. 2088 */ 2089 /* ARGSUSED */ 2090 static mblk_t * 2091 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2092 { 2093 mblk_t *nmp; 2094 dl_capability_req_t *ocap; 2095 dl_capab_ipsec_t *ocip; 2096 dl_capab_ipsec_t *icip; 2097 uint8_t *ptr; 2098 icip = (dl_capab_ipsec_t *)(isub + 1); 2099 2100 /* 2101 * Allocate new mblk which will contain a new capability 2102 * request to enable the capabilities. 
2103 */ 2104 2105 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2106 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2107 if (nmp == NULL) 2108 return (NULL); 2109 2110 ptr = nmp->b_rptr; 2111 2112 /* initialize dl_capability_req_t */ 2113 ocap = (dl_capability_req_t *)ptr; 2114 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2115 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2116 ptr += sizeof (dl_capability_req_t); 2117 2118 /* initialize dl_capability_sub_t */ 2119 bcopy(isub, ptr, sizeof (*isub)); 2120 ptr += sizeof (*isub); 2121 2122 /* initialize dl_capab_ipsec_t */ 2123 ocip = (dl_capab_ipsec_t *)ptr; 2124 bcopy(icip, ocip, sizeof (*icip)); 2125 2126 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2127 return (nmp); 2128 } 2129 2130 /* 2131 * Process an IPsec capability negotiation ack received from a DLS Provider. 2132 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2133 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2134 */ 2135 static void 2136 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2137 { 2138 dl_capab_ipsec_t *icip; 2139 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2140 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */ 2141 uint_t cipher, nciphers; 2142 mblk_t *nmp; 2143 uint_t alg_len; 2144 boolean_t need_sadb_dump; 2145 uint_t sub_dl_cap = isub->dl_cap; 2146 ill_ipsec_capab_t **ill_capab; 2147 uint64_t ill_capab_flag; 2148 uint8_t *capend, *ciphend; 2149 boolean_t sadb_resync; 2150 2151 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2152 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2153 2154 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2155 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2156 ill_capab_flag = ILL_CAPAB_AH; 2157 } else { 2158 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2159 ill_capab_flag = ILL_CAPAB_ESP; 2160 } 2161 2162 /* 2163 * If the ill capability structure exists, then this incoming 2164 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2165 * If this is so, then we'd need to resynchronize the SADB 2166 * after re-enabling the offloaded ciphers. 2167 */ 2168 sadb_resync = (*ill_capab != NULL); 2169 2170 /* 2171 * Note: range checks here are not absolutely sufficient to 2172 * make us robust against malformed messages sent by drivers; 2173 * this is in keeping with the rest of IP's dlpi handling. 2174 * (Remember, it's coming from something else in the kernel 2175 * address space) 2176 */ 2177 2178 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2179 if (capend > mp->b_wptr) { 2180 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2181 "malformed sub-capability too long for mblk"); 2182 return; 2183 } 2184 2185 /* 2186 * There are two types of acks we process here: 2187 * 1. acks in reply to a (first form) generic capability req 2188 * (no ENABLE flag set) 2189 * 2. acks in reply to a ENABLE capability req. 
2190 * (ENABLE flag set) 2191 * 2192 * We process the subcapability passed as argument as follows: 2193 * 1 do initializations 2194 * 1.1 initialize nmp = NULL 2195 * 1.2 set need_sadb_dump to B_FALSE 2196 * 2 for each cipher in subcapability: 2197 * 2.1 if ENABLE flag is set: 2198 * 2.1.1 update per-ill ipsec capabilities info 2199 * 2.1.2 set need_sadb_dump to B_TRUE 2200 * 2.2 if ENABLE flag is not set: 2201 * 2.2.1 if nmp is NULL: 2202 * 2.2.1.1 allocate and initialize nmp 2203 * 2.2.1.2 init current pos in nmp 2204 * 2.2.2 copy current cipher to current pos in nmp 2205 * 2.2.3 set ENABLE flag in nmp 2206 * 2.2.4 update current pos 2207 * 3 if nmp is not equal to NULL, send enable request 2208 * 3.1 send capability request 2209 * 4 if need_sadb_dump is B_TRUE 2210 * 4.1 enable promiscuous on/off notifications 2211 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2212 * AH or ESP SA's to interface. 2213 */ 2214 2215 nmp = NULL; 2216 oalg = NULL; 2217 need_sadb_dump = B_FALSE; 2218 icip = (dl_capab_ipsec_t *)(isub + 1); 2219 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2220 2221 nciphers = icip->cip_nciphers; 2222 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2223 2224 if (ciphend > capend) { 2225 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2226 "too many ciphers for sub-capability len"); 2227 return; 2228 } 2229 2230 for (cipher = 0; cipher < nciphers; cipher++) { 2231 alg_len = sizeof (dl_capab_ipsec_alg_t); 2232 2233 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2234 /* 2235 * TBD: when we provide a way to disable capabilities 2236 * from above, need to manage the request-pending state 2237 * and fail if we were not expecting this ACK. 2238 */ 2239 IPSECHW_DEBUG(IPSECHW_CAPAB, 2240 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2241 2242 /* 2243 * Update IPsec capabilities for this ill 2244 */ 2245 2246 if (*ill_capab == NULL) { 2247 IPSECHW_DEBUG(IPSECHW_CAPAB, 2248 ("ill_capability_ipsec_ack: " 2249 "allocating ipsec_capab for ill\n")); 2250 *ill_capab = ill_ipsec_capab_alloc(); 2251 2252 if (*ill_capab == NULL) { 2253 cmn_err(CE_WARN, 2254 "ill_capability_ipsec_ack: " 2255 "could not enable IPsec Hardware " 2256 "acceleration for %s (ENOMEM)\n", 2257 ill->ill_name); 2258 return; 2259 } 2260 } 2261 2262 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2263 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2264 2265 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2266 cmn_err(CE_WARN, 2267 "ill_capability_ipsec_ack: " 2268 "malformed IPsec algorithm id %d", 2269 ialg->alg_prim); 2270 continue; 2271 } 2272 2273 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2274 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2275 ialg->alg_prim); 2276 } else { 2277 ipsec_capab_algparm_t *alp; 2278 2279 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2280 ialg->alg_prim); 2281 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2282 ialg->alg_prim)) { 2283 cmn_err(CE_WARN, 2284 "ill_capability_ipsec_ack: " 2285 "no space for IPsec alg id %d", 2286 ialg->alg_prim); 2287 continue; 2288 } 2289 alp = &((*ill_capab)->encr_algparm[ 2290 ialg->alg_prim]); 2291 alp->minkeylen = ialg->alg_minbits; 2292 alp->maxkeylen = ialg->alg_maxbits; 2293 } 2294 ill->ill_capabilities |= ill_capab_flag; 2295 /* 2296 * indicate that a capability was enabled, which 2297 * will be used below to kick off a SADB dump 2298 * to the ill. 
2299 */ 2300 need_sadb_dump = B_TRUE; 2301 } else { 2302 IPSECHW_DEBUG(IPSECHW_CAPAB, 2303 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2304 ialg->alg_prim)); 2305 2306 if (nmp == NULL) { 2307 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2308 if (nmp == NULL) { 2309 /* 2310 * Sending the PROMISC_ON/OFF 2311 * notification request failed. 2312 * We cannot enable the algorithms 2313 * since the Provider will not 2314 * notify IP of promiscous mode 2315 * changes, which could lead 2316 * to leakage of packets. 2317 */ 2318 cmn_err(CE_WARN, 2319 "ill_capability_ipsec_ack: " 2320 "could not enable IPsec Hardware " 2321 "acceleration for %s (ENOMEM)\n", 2322 ill->ill_name); 2323 return; 2324 } 2325 /* ptr to current output alg specifier */ 2326 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2327 } 2328 2329 /* 2330 * Copy current alg specifier, set ENABLE 2331 * flag, and advance to next output alg. 2332 * For now we enable all IPsec capabilities. 2333 */ 2334 ASSERT(oalg != NULL); 2335 bcopy(ialg, oalg, alg_len); 2336 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2337 nmp->b_wptr += alg_len; 2338 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2339 } 2340 2341 /* move to next input algorithm specifier */ 2342 ialg = (dl_capab_ipsec_alg_t *) 2343 ((char *)ialg + alg_len); 2344 } 2345 2346 if (nmp != NULL) 2347 /* 2348 * nmp points to a DL_CAPABILITY_REQ message to enable 2349 * IPsec hardware acceleration. 2350 */ 2351 ill_capability_send(ill, nmp); 2352 2353 if (need_sadb_dump) 2354 /* 2355 * An acknowledgement corresponding to a request to 2356 * enable acceleration was received, notify SADB. 2357 */ 2358 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2359 } 2360 2361 /* 2362 * Given an mblk with enough space in it, create sub-capability entries for 2363 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2364 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2365 * in preparation for the reset the DL_CAPABILITY_REQ message. 
2366 */ 2367 static void 2368 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2369 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2370 { 2371 dl_capab_ipsec_t *oipsec; 2372 dl_capab_ipsec_alg_t *oalg; 2373 dl_capability_sub_t *dl_subcap; 2374 int i, k; 2375 2376 ASSERT(nciphers > 0); 2377 ASSERT(ill_cap != NULL); 2378 ASSERT(mp != NULL); 2379 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2380 2381 /* dl_capability_sub_t for "stype" */ 2382 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2383 dl_subcap->dl_cap = stype; 2384 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2385 mp->b_wptr += sizeof (dl_capability_sub_t); 2386 2387 /* dl_capab_ipsec_t for "stype" */ 2388 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2389 oipsec->cip_version = 1; 2390 oipsec->cip_nciphers = nciphers; 2391 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2392 2393 /* create entries for "stype" AUTH ciphers */ 2394 for (i = 0; i < ill_cap->algs_size; i++) { 2395 for (k = 0; k < BITSPERBYTE; k++) { 2396 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2397 continue; 2398 2399 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2400 bzero((void *)oalg, sizeof (*oalg)); 2401 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2402 oalg->alg_prim = k + (BITSPERBYTE * i); 2403 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2404 } 2405 } 2406 /* create entries for "stype" ENCR ciphers */ 2407 for (i = 0; i < ill_cap->algs_size; i++) { 2408 for (k = 0; k < BITSPERBYTE; k++) { 2409 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2410 continue; 2411 2412 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2413 bzero((void *)oalg, sizeof (*oalg)); 2414 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2415 oalg->alg_prim = k + (BITSPERBYTE * i); 2416 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2417 } 2418 } 2419 } 2420 2421 /* 2422 * Macro to count number of 1s in a byte (8-bit word). The total count is 2423 * accumulated into the passed-in argument (sum). We could use SPARCv9's 2424 * POPC instruction, but our macro is more flexible for an arbitrary length 2425 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2426 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2427 * stays that way, we can reduce the number of iterations required. 
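 * As a worked example, COUNT_1S(0xb5, sum) adds 5 to sum: the pairwise
 * add turns 0xb5 (binary 10110101) into 0x65, the two-bit-field add
 * turns 0x65 into 0x32, and the final nibble add contributes
 * 0x3 + 0x2 = 5.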
2428 */ 2429 #define COUNT_1S(val, sum) { \ 2430 uint8_t x = val & 0xff; \ 2431 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2432 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2433 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2434 } 2435 2436 /* ARGSUSED */ 2437 static int 2438 ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, 2439 int *esp_cntp, int *esp_lenp) 2440 { 2441 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2442 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2443 uint64_t ill_capabilities = ill->ill_capabilities; 2444 int ah_cnt = 0, esp_cnt = 0; 2445 int ah_len = 0, esp_len = 0; 2446 int i, size = 0; 2447 2448 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2449 return (0); 2450 2451 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2452 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2453 2454 /* Find out the number of ciphers for AH */ 2455 if (cap_ah != NULL) { 2456 for (i = 0; i < cap_ah->algs_size; i++) { 2457 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2458 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2459 } 2460 if (ah_cnt > 0) { 2461 size += sizeof (dl_capability_sub_t) + 2462 sizeof (dl_capab_ipsec_t); 2463 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2464 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2465 size += ah_len; 2466 } 2467 } 2468 2469 /* Find out the number of ciphers for ESP */ 2470 if (cap_esp != NULL) { 2471 for (i = 0; i < cap_esp->algs_size; i++) { 2472 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2473 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2474 } 2475 if (esp_cnt > 0) { 2476 size += sizeof (dl_capability_sub_t) + 2477 sizeof (dl_capab_ipsec_t); 2478 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2479 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2480 size += esp_len; 2481 } 2482 } 2483 2484 if (ah_cntp != NULL) 2485 *ah_cntp = ah_cnt; 2486 if (ah_lenp != NULL) 2487 *ah_lenp = ah_len; 2488 if (esp_cntp != NULL) 2489 *esp_cntp = esp_cnt; 2490 if (esp_lenp != NULL) 2491 *esp_lenp = esp_len; 2492 2493 return (size); 2494 } 2495 2496 /* ARGSUSED */ 2497 static void 2498 ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) 2499 { 2500 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2501 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2502 int ah_cnt = 0, esp_cnt = 0; 2503 int ah_len = 0, esp_len = 0; 2504 int size; 2505 2506 size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, 2507 &esp_cnt, &esp_len); 2508 if (size == 0) 2509 return; 2510 2511 /* 2512 * Clear the capability flags for IPsec HA but retain the ill 2513 * capability structures since it's possible that another thread 2514 * is still referring to them. The structures only get deallocated 2515 * when we destroy the ill. 2516 * 2517 * Various places check the flags to see if the ill is capable of 2518 * hardware acceleration, and by clearing them we ensure that new 2519 * outbound IPsec packets are sent down encrypted. 2520 */ 2521 2522 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2523 if (ah_cnt > 0) { 2524 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2525 cap_ah, mp); 2526 } 2527 2528 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2529 if (esp_cnt > 0) { 2530 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2531 cap_esp, mp); 2532 } 2533 2534 /* 2535 * At this point we've composed a bunch of sub-capabilities to be 2536 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2537 * by the caller. 
Upon receiving this reset message, the driver 2538 * must stop inbound decryption (by destroying all inbound SAs) 2539 * and let the corresponding packets come in encrypted. 2540 */ 2541 } 2542 2543 static void 2544 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2545 boolean_t encapsulated) 2546 { 2547 boolean_t legacy = B_FALSE; 2548 2549 /* 2550 * Note that only the following two sub-capabilities may be 2551 * considered as "legacy", since their original definitions 2552 * do not incorporate the dl_mid_t module ID token, and hence 2553 * may require the use of the wrapper sub-capability. 2554 */ 2555 switch (subp->dl_cap) { 2556 case DL_CAPAB_IPSEC_AH: 2557 case DL_CAPAB_IPSEC_ESP: 2558 legacy = B_TRUE; 2559 break; 2560 } 2561 2562 /* 2563 * For legacy sub-capabilities which don't incorporate a queue_t 2564 * pointer in their structures, discard them if we detect that 2565 * there are intermediate modules in between IP and the driver. 2566 */ 2567 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2568 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2569 "%d discarded; %d module(s) present below IP\n", 2570 subp->dl_cap, ill->ill_lmod_cnt)); 2571 return; 2572 } 2573 2574 switch (subp->dl_cap) { 2575 case DL_CAPAB_IPSEC_AH: 2576 case DL_CAPAB_IPSEC_ESP: 2577 ill_capability_ipsec_ack(ill, mp, subp); 2578 break; 2579 case DL_CAPAB_MDT: 2580 ill_capability_mdt_ack(ill, mp, subp); 2581 break; 2582 case DL_CAPAB_HCKSUM: 2583 ill_capability_hcksum_ack(ill, mp, subp); 2584 break; 2585 case DL_CAPAB_ZEROCOPY: 2586 ill_capability_zerocopy_ack(ill, mp, subp); 2587 break; 2588 case DL_CAPAB_DLD: 2589 ill_capability_dld_ack(ill, mp, subp); 2590 break; 2591 default: 2592 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2593 subp->dl_cap)); 2594 } 2595 } 2596 2597 /* 2598 * Process a hardware checksum offload capability negotiation ack received 2599 * from a DLS Provider. isub must point to the sub-capability (DL_CAPAB_HCKSUM) 2600 * of a DL_CAPABILITY_ACK message. 2601 */ 2602 static void 2603 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2604 { 2605 dl_capability_req_t *ocap; 2606 dl_capab_hcksum_t *ihck, *ohck; 2607 ill_hcksum_capab_t **ill_hcksum; 2608 mblk_t *nmp = NULL; 2609 uint_t sub_dl_cap = isub->dl_cap; 2610 uint8_t *capend; 2611 2612 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 2613 2614 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 2615 2616 /* 2617 * Note: range checks here are not absolutely sufficient to 2618 * make us robust against malformed messages sent by drivers; 2619 * this is in keeping with the rest of IP's dlpi handling. 2620 * (Remember, it's coming from something else in the kernel 2621 * address space) 2622 */ 2623 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2624 if (capend > mp->b_wptr) { 2625 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2626 "malformed sub-capability too long for mblk"); 2627 return; 2628 } 2629 2630 /* 2631 * There are two types of acks we process here: 2632 * 1. acks in reply to a (first form) generic capability req 2633 * (no ENABLE flag set) 2634 * 2. acks in reply to an ENABLE capability req.
2635 * (ENABLE flag set) 2636 */ 2637 ihck = (dl_capab_hcksum_t *)(isub + 1); 2638 2639 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 2640 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 2641 "unsupported hardware checksum " 2642 "sub-capability (version %d, expected %d)", 2643 ihck->hcksum_version, HCKSUM_VERSION_1); 2644 return; 2645 } 2646 2647 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 2648 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 2649 "checksum capability isn't as expected; pass-thru " 2650 "module(s) detected, discarding capability\n")); 2651 return; 2652 } 2653 2654 #define CURR_HCKSUM_CAPAB \ 2655 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 2656 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 2657 2658 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 2659 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 2660 /* do ENABLE processing */ 2661 if (*ill_hcksum == NULL) { 2662 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 2663 KM_NOSLEEP); 2664 2665 if (*ill_hcksum == NULL) { 2666 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2667 "could not enable hcksum version %d " 2668 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 2669 ill->ill_name); 2670 return; 2671 } 2672 } 2673 2674 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 2675 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 2676 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 2677 ip1dbg(("ill_capability_hcksum_ack: interface %s " 2678 "has enabled hardware checksumming\n ", 2679 ill->ill_name)); 2680 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 2681 /* 2682 * Enabling hardware checksum offload 2683 * Currently IP supports {TCP,UDP}/IPv4 2684 * partial and full cksum offload and 2685 * IPv4 header checksum offload. 2686 * Allocate new mblk which will 2687 * contain a new capability request 2688 * to enable hardware checksum offload. 2689 */ 2690 uint_t size; 2691 uchar_t *rptr; 2692 2693 size = sizeof (dl_capability_req_t) + 2694 sizeof (dl_capability_sub_t) + isub->dl_length; 2695 2696 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2697 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2698 "could not enable hardware cksum for %s (ENOMEM)\n", 2699 ill->ill_name); 2700 return; 2701 } 2702 2703 rptr = nmp->b_rptr; 2704 /* initialize dl_capability_req_t */ 2705 ocap = (dl_capability_req_t *)nmp->b_rptr; 2706 ocap->dl_sub_offset = 2707 sizeof (dl_capability_req_t); 2708 ocap->dl_sub_length = 2709 sizeof (dl_capability_sub_t) + 2710 isub->dl_length; 2711 nmp->b_rptr += sizeof (dl_capability_req_t); 2712 2713 /* initialize dl_capability_sub_t */ 2714 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2715 nmp->b_rptr += sizeof (*isub); 2716 2717 /* initialize dl_capab_hcksum_t */ 2718 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 2719 bcopy(ihck, ohck, sizeof (*ihck)); 2720 2721 nmp->b_rptr = rptr; 2722 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2723 2724 /* Set ENABLE flag */ 2725 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 2726 ohck->hcksum_txflags |= HCKSUM_ENABLE; 2727 2728 /* 2729 * nmp points to a DL_CAPABILITY_REQ message to enable 2730 * hardware checksum acceleration. 
2731 */ 2732 ill_capability_send(ill, nmp); 2733 } else { 2734 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 2735 "advertised %x hardware checksum capability flags\n", 2736 ill->ill_name, ihck->hcksum_txflags)); 2737 } 2738 } 2739 2740 static void 2741 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) 2742 { 2743 dl_capab_hcksum_t *hck_subcap; 2744 dl_capability_sub_t *dl_subcap; 2745 2746 if (!ILL_HCKSUM_CAPABLE(ill)) 2747 return; 2748 2749 ASSERT(ill->ill_hcksum_capab != NULL); 2750 2751 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2752 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 2753 dl_subcap->dl_length = sizeof (*hck_subcap); 2754 2755 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 2756 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 2757 hck_subcap->hcksum_txflags = 0; 2758 2759 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); 2760 } 2761 2762 static void 2763 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2764 { 2765 mblk_t *nmp = NULL; 2766 dl_capability_req_t *oc; 2767 dl_capab_zerocopy_t *zc_ic, *zc_oc; 2768 ill_zerocopy_capab_t **ill_zerocopy_capab; 2769 uint_t sub_dl_cap = isub->dl_cap; 2770 uint8_t *capend; 2771 2772 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 2773 2774 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 2775 2776 /* 2777 * Note: range checks here are not absolutely sufficient to 2778 * make us robust against malformed messages sent by drivers; 2779 * this is in keeping with the rest of IP's dlpi handling. 2780 * (Remember, it's coming from something else in the kernel 2781 * address space) 2782 */ 2783 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2784 if (capend > mp->b_wptr) { 2785 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2786 "malformed sub-capability too long for mblk"); 2787 return; 2788 } 2789 2790 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 2791 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 2792 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 2793 "unsupported ZEROCOPY sub-capability (version %d, " 2794 "expected %d)", zc_ic->zerocopy_version, 2795 ZEROCOPY_VERSION_1); 2796 return; 2797 } 2798 2799 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 2800 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 2801 "capability isn't as expected; pass-thru module(s) " 2802 "detected, discarding capability\n")); 2803 return; 2804 } 2805 2806 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 2807 if (*ill_zerocopy_capab == NULL) { 2808 *ill_zerocopy_capab = 2809 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 2810 KM_NOSLEEP); 2811 2812 if (*ill_zerocopy_capab == NULL) { 2813 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2814 "could not enable Zero-copy version %d " 2815 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 2816 ill->ill_name); 2817 return; 2818 } 2819 } 2820 2821 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 2822 "supports Zero-copy version %d\n", ill->ill_name, 2823 ZEROCOPY_VERSION_1)); 2824 2825 (*ill_zerocopy_capab)->ill_zerocopy_version = 2826 zc_ic->zerocopy_version; 2827 (*ill_zerocopy_capab)->ill_zerocopy_flags = 2828 zc_ic->zerocopy_flags; 2829 2830 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 2831 } else { 2832 uint_t size; 2833 uchar_t *rptr; 2834 2835 size = sizeof (dl_capability_req_t) + 2836 sizeof (dl_capability_sub_t) + 2837 sizeof (dl_capab_zerocopy_t); 2838 2839 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2840 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2841 "could 
not enable zerocopy for %s (ENOMEM)\n", 2842 ill->ill_name); 2843 return; 2844 } 2845 2846 rptr = nmp->b_rptr; 2847 /* initialize dl_capability_req_t */ 2848 oc = (dl_capability_req_t *)rptr; 2849 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2850 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2851 sizeof (dl_capab_zerocopy_t); 2852 rptr += sizeof (dl_capability_req_t); 2853 2854 /* initialize dl_capability_sub_t */ 2855 bcopy(isub, rptr, sizeof (*isub)); 2856 rptr += sizeof (*isub); 2857 2858 /* initialize dl_capab_zerocopy_t */ 2859 zc_oc = (dl_capab_zerocopy_t *)rptr; 2860 *zc_oc = *zc_ic; 2861 2862 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 2863 "to enable zero-copy version %d\n", ill->ill_name, 2864 ZEROCOPY_VERSION_1)); 2865 2866 /* set VMSAFE_MEM flag */ 2867 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 2868 2869 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 2870 ill_capability_send(ill, nmp); 2871 } 2872 } 2873 2874 static void 2875 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 2876 { 2877 dl_capab_zerocopy_t *zerocopy_subcap; 2878 dl_capability_sub_t *dl_subcap; 2879 2880 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 2881 return; 2882 2883 ASSERT(ill->ill_zerocopy_capab != NULL); 2884 2885 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2886 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 2887 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 2888 2889 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 2890 zerocopy_subcap->zerocopy_version = 2891 ill->ill_zerocopy_capab->ill_zerocopy_version; 2892 zerocopy_subcap->zerocopy_flags = 0; 2893 2894 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 2895 } 2896 2897 /* 2898 * DLD capability 2899 * Refer to dld.h for more information regarding the purpose and usage 2900 * of this capability. 2901 */ 2902 static void 2903 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2904 { 2905 dl_capab_dld_t *dld_ic, dld; 2906 uint_t sub_dl_cap = isub->dl_cap; 2907 uint8_t *capend; 2908 ill_dld_capab_t *idc; 2909 2910 ASSERT(IAM_WRITER_ILL(ill)); 2911 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 2912 2913 /* 2914 * Note: range checks here are not absolutely sufficient to 2915 * make us robust against malformed messages sent by drivers; 2916 * this is in keeping with the rest of IP's dlpi handling. 2917 * (Remember, it's coming from something else in the kernel 2918 * address space) 2919 */ 2920 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2921 if (capend > mp->b_wptr) { 2922 cmn_err(CE_WARN, "ill_capability_dld_ack: " 2923 "malformed sub-capability too long for mblk"); 2924 return; 2925 } 2926 dld_ic = (dl_capab_dld_t *)(isub + 1); 2927 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 2928 cmn_err(CE_CONT, "ill_capability_dld_ack: " 2929 "unsupported DLD sub-capability (version %d, " 2930 "expected %d)", dld_ic->dld_version, 2931 DLD_CURRENT_VERSION); 2932 return; 2933 } 2934 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 2935 ip1dbg(("ill_capability_dld_ack: mid token for dld " 2936 "capability isn't as expected; pass-thru module(s) " 2937 "detected, discarding capability\n")); 2938 return; 2939 } 2940 2941 /* 2942 * Copy locally to ensure alignment. 
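 * (dld_ic points just past the dl_capability_sub_t inside the DLPI
 * message, where the dl_capab_dld_t payload is not guaranteed to be
 * suitably aligned for its members; copying it to the local ensures
 * the dereferences below are safe on strict-alignment platforms.)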
2943 */ 2944 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 2945 2946 if ((idc = ill->ill_dld_capab) == NULL) { 2947 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 2948 if (idc == NULL) { 2949 cmn_err(CE_WARN, "ill_capability_dld_ack: " 2950 "could not enable DLD version %d " 2951 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 2952 ill->ill_name); 2953 return; 2954 } 2955 ill->ill_dld_capab = idc; 2956 } 2957 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 2958 idc->idc_capab_dh = (void *)dld.dld_capab_handle; 2959 ip1dbg(("ill_capability_dld_ack: interface %s " 2960 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 2961 2962 ill_capability_dld_enable(ill); 2963 } 2964 2965 /* 2966 * Typically capability negotiation between IP and the driver happens via 2967 * DLPI message exchange. However GLD also offers a direct function call 2968 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, 2969 * But arbitrary function calls into IP or GLD are not permitted, since both 2970 * of them are protected by their own perimeter mechanism. The perimeter can 2971 * be viewed as a coarse lock or serialization mechanism. The hierarchy of 2972 * these perimeters is IP -> MAC. Thus for example to enable the squeue 2973 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 2974 * to enter the mac perimeter and then do the direct function calls into 2975 * GLD to enable squeue polling. The ring related callbacks from the mac into 2976 * the stack to add, bind, quiesce, restart or cleanup a ring are all 2977 * protected by the mac perimeter. 2978 */ 2979 static void 2980 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 2981 { 2982 ill_dld_capab_t *idc = ill->ill_dld_capab; 2983 int err; 2984 2985 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 2986 DLD_ENABLE); 2987 ASSERT(err == 0); 2988 } 2989 2990 static void 2991 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 2992 { 2993 ill_dld_capab_t *idc = ill->ill_dld_capab; 2994 int err; 2995 2996 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 2997 DLD_DISABLE); 2998 ASSERT(err == 0); 2999 } 3000 3001 boolean_t 3002 ill_mac_perim_held(ill_t *ill) 3003 { 3004 ill_dld_capab_t *idc = ill->ill_dld_capab; 3005 3006 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 3007 DLD_QUERY)); 3008 } 3009 3010 static void 3011 ill_capability_direct_enable(ill_t *ill) 3012 { 3013 ill_dld_capab_t *idc = ill->ill_dld_capab; 3014 ill_dld_direct_t *idd = &idc->idc_direct; 3015 dld_capab_direct_t direct; 3016 int rc; 3017 3018 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3019 3020 bzero(&direct, sizeof (direct)); 3021 direct.di_rx_cf = (uintptr_t)ip_input; 3022 direct.di_rx_ch = ill; 3023 3024 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 3025 DLD_ENABLE); 3026 if (rc == 0) { 3027 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 3028 idd->idd_tx_dh = direct.di_tx_dh; 3029 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 3030 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 3031 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 3032 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 3033 ASSERT(idd->idd_tx_cb_df != NULL); 3034 ASSERT(idd->idd_tx_fctl_df != NULL); 3035 ASSERT(idd->idd_tx_df != NULL); 3036 /* 3037 * One time registration of flow enable callback function 3038 */ 3039 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 3040 ill_flow_enable, ill); 3041 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 3042 
DTRACE_PROBE1(direct_on, (ill_t *), ill); 3043 } else { 3044 cmn_err(CE_WARN, "warning: could not enable DIRECT " 3045 "capability, rc = %d\n", rc); 3046 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 3047 } 3048 } 3049 3050 static void 3051 ill_capability_poll_enable(ill_t *ill) 3052 { 3053 ill_dld_capab_t *idc = ill->ill_dld_capab; 3054 dld_capab_poll_t poll; 3055 int rc; 3056 3057 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3058 3059 bzero(&poll, sizeof (poll)); 3060 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 3061 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 3062 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 3063 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 3064 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 3065 poll.poll_ring_ch = ill; 3066 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 3067 DLD_ENABLE); 3068 if (rc == 0) { 3069 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 3070 DTRACE_PROBE1(poll_on, (ill_t *), ill); 3071 } else { 3072 ip1dbg(("warning: could not enable POLL " 3073 "capability, rc = %d\n", rc)); 3074 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 3075 } 3076 } 3077 3078 /* 3079 * Enable the LSO capability. 3080 */ 3081 static void 3082 ill_capability_lso_enable(ill_t *ill) 3083 { 3084 ill_dld_capab_t *idc = ill->ill_dld_capab; 3085 dld_capab_lso_t lso; 3086 int rc; 3087 3088 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3089 3090 if (ill->ill_lso_capab == NULL) { 3091 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 3092 KM_NOSLEEP); 3093 if (ill->ill_lso_capab == NULL) { 3094 cmn_err(CE_WARN, "ill_capability_lso_enable: " 3095 "could not enable LSO for %s (ENOMEM)\n", 3096 ill->ill_name); 3097 return; 3098 } 3099 } 3100 3101 bzero(&lso, sizeof (lso)); 3102 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 3103 DLD_ENABLE)) == 0) { 3104 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 3105 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 3106 ill->ill_capabilities |= ILL_CAPAB_DLD_LSO; 3107 ip1dbg(("ill_capability_lso_enable: interface %s " 3108 "has enabled LSO\n ", ill->ill_name)); 3109 } else { 3110 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 3111 ill->ill_lso_capab = NULL; 3112 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 3113 } 3114 } 3115 3116 static void 3117 ill_capability_dld_enable(ill_t *ill) 3118 { 3119 mac_perim_handle_t mph; 3120 3121 ASSERT(IAM_WRITER_ILL(ill)); 3122 3123 if (ill->ill_isv6) 3124 return; 3125 3126 ill_mac_perim_enter(ill, &mph); 3127 if (!ill->ill_isv6) { 3128 ill_capability_direct_enable(ill); 3129 ill_capability_poll_enable(ill); 3130 ill_capability_lso_enable(ill); 3131 } 3132 ill->ill_capabilities |= ILL_CAPAB_DLD; 3133 ill_mac_perim_exit(ill, mph); 3134 } 3135 3136 static void 3137 ill_capability_dld_disable(ill_t *ill) 3138 { 3139 ill_dld_capab_t *idc; 3140 ill_dld_direct_t *idd; 3141 mac_perim_handle_t mph; 3142 3143 ASSERT(IAM_WRITER_ILL(ill)); 3144 3145 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 3146 return; 3147 3148 ill_mac_perim_enter(ill, &mph); 3149 3150 idc = ill->ill_dld_capab; 3151 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 3152 /* 3153 * For performance we avoid locks in the transmit data path 3154 * and don't maintain a count of the number of threads using 3155 * direct calls. Thus some threads could be using direct 3156 * transmit calls to GLD, even after the capability mechanism 3157 * turns it off. 
This is still safe since the handles used in 3158 * the direct calls continue to be valid until the unplumb is 3159 * completed. Remove the callback that was added (1-time) at 3160 * capab enable time. 3161 */ 3162 mutex_enter(&ill->ill_lock); 3163 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 3164 mutex_exit(&ill->ill_lock); 3165 if (ill->ill_flownotify_mh != NULL) { 3166 idd = &idc->idc_direct; 3167 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 3168 ill->ill_flownotify_mh); 3169 ill->ill_flownotify_mh = NULL; 3170 } 3171 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 3172 NULL, DLD_DISABLE); 3173 } 3174 3175 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 3176 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 3177 ip_squeue_clean_all(ill); 3178 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 3179 NULL, DLD_DISABLE); 3180 } 3181 3182 if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) { 3183 ASSERT(ill->ill_lso_capab != NULL); 3184 /* 3185 * Clear the capability flag for LSO but retain the 3186 * ill_lso_capab structure since it's possible that another 3187 * thread is still referring to it. The structure only gets 3188 * deallocated when we destroy the ill. 3189 */ 3190 3191 ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO; 3192 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 3193 NULL, DLD_DISABLE); 3194 } 3195 3196 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 3197 ill_mac_perim_exit(ill, mph); 3198 } 3199 3200 /* 3201 * Capability Negotiation protocol 3202 * 3203 * We don't wait for DLPI capability operations to finish during interface 3204 * bringup or teardown. Doing so would introduce more asynchrony and the 3205 * interface up/down operations would need multiple returns and restarts. 3206 * Instead, the 'ipsq_current_ipif' of the ipsq is not cleared as long as 3207 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next 3208 * exclusive operation won't start until the DLPI operations of the previous 3209 * exclusive operation complete. 3210 * 3211 * The capability state machine is shown below. 3212 * 3213 * state next state event, action 3214 * 3215 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe 3216 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack 3217 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) 3218 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG 3219 * IDCS_OK IDCS_RESET_SENT ill_capability_reset 3220 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr 3221 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> 3222 * ill_capability_probe. 3223 */ 3224 3225 /* 3226 * Dedicated thread started from ip_stack_init that handles capability 3227 * disable. This thread ensures the taskq dispatch does not fail by waiting 3228 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure 3229 * that direct calls to DLD are done in a cv_waitable context.
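 * (ill_capability_ack() first attempts taskq_dispatch() with
 * TQ_NOSLEEP itself; only if that fails does it queue the mblk on
 * ips_capab_taskq_head and signal this thread to re-dispatch it with
 * TQ_SLEEP.)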
3230 */ 3231 void 3232 ill_taskq_dispatch(ip_stack_t *ipst) 3233 { 3234 callb_cpr_t cprinfo; 3235 char name[64]; 3236 mblk_t *mp; 3237 3238 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 3239 ipst->ips_netstack->netstack_stackid); 3240 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 3241 name); 3242 mutex_enter(&ipst->ips_capab_taskq_lock); 3243 3244 for (;;) { 3245 mp = ipst->ips_capab_taskq_head; 3246 while (mp != NULL) { 3247 ipst->ips_capab_taskq_head = mp->b_next; 3248 if (ipst->ips_capab_taskq_head == NULL) 3249 ipst->ips_capab_taskq_tail = NULL; 3250 mutex_exit(&ipst->ips_capab_taskq_lock); 3251 mp->b_next = NULL; 3252 3253 VERIFY(taskq_dispatch(system_taskq, 3254 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 3255 mutex_enter(&ipst->ips_capab_taskq_lock); 3256 mp = ipst->ips_capab_taskq_head; 3257 } 3258 3259 if (ipst->ips_capab_taskq_quit) 3260 break; 3261 CALLB_CPR_SAFE_BEGIN(&cprinfo); 3262 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 3263 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 3264 } 3265 VERIFY(ipst->ips_capab_taskq_head == NULL); 3266 VERIFY(ipst->ips_capab_taskq_tail == NULL); 3267 CALLB_CPR_EXIT(&cprinfo); 3268 thread_exit(); 3269 } 3270 3271 /* 3272 * Consume a new-style hardware capabilities negotiation ack. 3273 * Called via taskq on receipt of DL_CAPABBILITY_ACK. 3274 */ 3275 static void 3276 ill_capability_ack_thr(void *arg) 3277 { 3278 mblk_t *mp = arg; 3279 dl_capability_ack_t *capp; 3280 dl_capability_sub_t *subp, *endp; 3281 ill_t *ill; 3282 boolean_t reneg; 3283 3284 ill = (ill_t *)mp->b_prev; 3285 mp->b_prev = NULL; 3286 3287 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 3288 3289 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 3290 ill->ill_dlpi_capab_state == IDCS_RENEG) { 3291 /* 3292 * We have received the ack for our DL_CAPAB reset request. 3293 * There isnt' anything in the message that needs processing. 3294 * All message based capabilities have been disabled, now 3295 * do the function call based capability disable. 3296 */ 3297 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 3298 ill_capability_dld_disable(ill); 3299 ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 3300 if (reneg) 3301 ill_capability_probe(ill); 3302 goto done; 3303 } 3304 3305 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 3306 ill->ill_dlpi_capab_state = IDCS_OK; 3307 3308 capp = (dl_capability_ack_t *)mp->b_rptr; 3309 3310 if (capp->dl_sub_length == 0) { 3311 /* no new-style capabilities */ 3312 goto done; 3313 } 3314 3315 /* make sure the driver supplied correct dl_sub_length */ 3316 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3317 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3318 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3319 goto done; 3320 } 3321 3322 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3323 /* 3324 * There are sub-capabilities. Process the ones we know about. 3325 * Loop until we don't have room for another sub-cap header.. 
*/ 3327 for (subp = SC(capp, capp->dl_sub_offset), 3328 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3329 subp <= endp; 3330 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3331 3332 switch (subp->dl_cap) { 3333 case DL_CAPAB_ID_WRAPPER: 3334 ill_capability_id_ack(ill, mp, subp); 3335 break; 3336 default: 3337 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3338 break; 3339 } 3340 } 3341 #undef SC 3342 done: 3343 inet_freemsg(mp); 3344 ill_capability_done(ill); 3345 ipsq_exit(ill->ill_phyint->phyint_ipsq); 3346 } 3347 3348 /* 3349 * This needs to be started in a taskq thread to provide a cv_waitable 3350 * context. 3351 */ 3352 void 3353 ill_capability_ack(ill_t *ill, mblk_t *mp) 3354 { 3355 ip_stack_t *ipst = ill->ill_ipst; 3356 3357 mp->b_prev = (mblk_t *)ill; 3358 ASSERT(mp->b_next == NULL); 3359 3360 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 3361 TQ_NOSLEEP) != 0) 3362 return; 3363 3364 /* 3365 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread 3366 * which will do the dispatch using TQ_SLEEP to guarantee success. 3367 */ 3368 mutex_enter(&ipst->ips_capab_taskq_lock); 3369 if (ipst->ips_capab_taskq_head == NULL) { 3370 ASSERT(ipst->ips_capab_taskq_tail == NULL); 3371 ipst->ips_capab_taskq_head = mp; 3372 } else { 3373 ipst->ips_capab_taskq_tail->b_next = mp; 3374 } 3375 ipst->ips_capab_taskq_tail = mp; 3376 3377 cv_signal(&ipst->ips_capab_taskq_cv); 3378 mutex_exit(&ipst->ips_capab_taskq_lock); 3379 } 3380 3381 /* 3382 * This routine is called to scan the fragmentation reassembly table for 3383 * the specified ILL for any packets that are starting to smell. 3384 * dead_interval is the maximum time in seconds that will be tolerated. It 3385 * will either be the value specified in ip_g_frag_timeout, or zero if the 3386 * ILL is shutting down and it is time to blow everything off. 3387 * 3388 * It returns the number of seconds (as a time_t) that the next frag timer 3389 * should be scheduled for, 0 meaning that the timer doesn't need to be 3390 * re-started. Note that the method of calculating next_timeout isn't 3391 * entirely accurate since time will flow between the time we grab 3392 * current_time and the time we schedule the next timeout. This isn't a 3393 * big problem since this is the timer for sending ICMP reassembly time 3394 * exceeded messages, and it doesn't have to be exactly accurate. 3395 * 3396 * This function is 3397 * sometimes called as writer, although this is not required. 3398 */ 3399 time_t 3400 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3401 { 3402 ipfb_t *ipfb; 3403 ipfb_t *endp; 3404 ipf_t *ipf; 3405 ipf_t *ipfnext; 3406 mblk_t *mp; 3407 time_t current_time = gethrestime_sec(); 3408 time_t next_timeout = 0; 3409 uint32_t hdr_length; 3410 mblk_t *send_icmp_head; 3411 mblk_t *send_icmp_head_v6; 3412 zoneid_t zoneid; 3413 ip_stack_t *ipst = ill->ill_ipst; 3414 3415 ipfb = ill->ill_frag_hash_tbl; 3416 if (ipfb == NULL) 3417 return (0); 3418 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3419 /* Walk the frag hash table. */ 3420 for (; ipfb < endp; ipfb++) { 3421 send_icmp_head = NULL; 3422 send_icmp_head_v6 = NULL; 3423 mutex_enter(&ipfb->ipfb_lock); 3424 while ((ipf = ipfb->ipfb_ipf) != 0) { 3425 time_t frag_time = current_time - ipf->ipf_timestamp; 3426 time_t frag_timeout; 3427 3428 if (frag_time < dead_interval) { 3429 /* 3430 * There are some outstanding fragments 3431 * that will time out later.
Make note of 3432 * the time so that we can reschedule the 3433 * next timeout appropriately. 3434 */ 3435 frag_timeout = dead_interval - frag_time; 3436 if (next_timeout == 0 || 3437 frag_timeout < next_timeout) { 3438 next_timeout = frag_timeout; 3439 } 3440 break; 3441 } 3442 /* Time's up. Get it out of here. */ 3443 hdr_length = ipf->ipf_nf_hdr_len; 3444 ipfnext = ipf->ipf_hash_next; 3445 if (ipfnext) 3446 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3447 *ipf->ipf_ptphn = ipfnext; 3448 mp = ipf->ipf_mp->b_cont; 3449 for (; mp; mp = mp->b_cont) { 3450 /* Extra points for neatness. */ 3451 IP_REASS_SET_START(mp, 0); 3452 IP_REASS_SET_END(mp, 0); 3453 } 3454 mp = ipf->ipf_mp->b_cont; 3455 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 3456 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3457 ipfb->ipfb_count -= ipf->ipf_count; 3458 ASSERT(ipfb->ipfb_frag_pkts > 0); 3459 ipfb->ipfb_frag_pkts--; 3460 /* 3461 * We do not send any icmp message from here because 3462 * we currently are holding the ipfb_lock for this 3463 * hash chain. If we try and send any icmp messages 3464 * from here we may end up via a put back into ip 3465 * trying to get the same lock, causing a recursive 3466 * mutex panic. Instead we build a list and send all 3467 * the icmp messages after we have dropped the lock. 3468 */ 3469 if (ill->ill_isv6) { 3470 if (hdr_length != 0) { 3471 mp->b_next = send_icmp_head_v6; 3472 send_icmp_head_v6 = mp; 3473 } else { 3474 freemsg(mp); 3475 } 3476 } else { 3477 if (hdr_length != 0) { 3478 mp->b_next = send_icmp_head; 3479 send_icmp_head = mp; 3480 } else { 3481 freemsg(mp); 3482 } 3483 } 3484 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3485 freeb(ipf->ipf_mp); 3486 } 3487 mutex_exit(&ipfb->ipfb_lock); 3488 /* 3489 * Now need to send any icmp messages that we delayed from 3490 * above. 3491 */ 3492 while (send_icmp_head_v6 != NULL) { 3493 ip6_t *ip6h; 3494 3495 mp = send_icmp_head_v6; 3496 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3497 mp->b_next = NULL; 3498 if (mp->b_datap->db_type == M_CTL) 3499 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3500 else 3501 ip6h = (ip6_t *)mp->b_rptr; 3502 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3503 ill, ipst); 3504 if (zoneid == ALL_ZONES) { 3505 freemsg(mp); 3506 } else { 3507 icmp_time_exceeded_v6(ill->ill_wq, mp, 3508 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3509 B_FALSE, zoneid, ipst); 3510 } 3511 } 3512 while (send_icmp_head != NULL) { 3513 ipaddr_t dst; 3514 3515 mp = send_icmp_head; 3516 send_icmp_head = send_icmp_head->b_next; 3517 mp->b_next = NULL; 3518 3519 if (mp->b_datap->db_type == M_CTL) 3520 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3521 else 3522 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3523 3524 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 3525 if (zoneid == ALL_ZONES) { 3526 freemsg(mp); 3527 } else { 3528 icmp_time_exceeded(ill->ill_wq, mp, 3529 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, 3530 ipst); 3531 } 3532 } 3533 } 3534 /* 3535 * A non-dying ILL will use the return value to decide whether to 3536 * restart the frag timer, and for how long. 3537 */ 3538 return (next_timeout); 3539 } 3540 3541 /* 3542 * This routine is called when the approximate count of mblk memory used 3543 * for the specified ILL has exceeded max_count. 
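 * Pruning proceeds in two phases: when called again within
 * ip_min_frag_prune_time, each hash bucket first frees a growing number
 * (ill_frag_free_num_pkts) of its oldest packets; then whole fragment
 * queues are freed, globally oldest (smallest ipf_gen) first, until
 * ill_frag_count drops back under max_count.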
3544 */ 3545 void 3546 ill_frag_prune(ill_t *ill, uint_t max_count) 3547 { 3548 ipfb_t *ipfb; 3549 ipf_t *ipf; 3550 size_t count; 3551 3552 /* 3553 * If we are here within ip_min_frag_prune_time msecs remove 3554 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3555 * ill_frag_free_num_pkts. 3556 */ 3557 mutex_enter(&ill->ill_lock); 3558 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3559 (ip_min_frag_prune_time != 0 ? 3560 ip_min_frag_prune_time : msec_per_tick)) { 3561 3562 ill->ill_frag_free_num_pkts++; 3563 3564 } else { 3565 ill->ill_frag_free_num_pkts = 0; 3566 } 3567 ill->ill_last_frag_clean_time = lbolt; 3568 mutex_exit(&ill->ill_lock); 3569 3570 /* 3571 * free ill_frag_free_num_pkts oldest packets from each bucket. 3572 */ 3573 if (ill->ill_frag_free_num_pkts != 0) { 3574 int ix; 3575 3576 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3577 ipfb = &ill->ill_frag_hash_tbl[ix]; 3578 mutex_enter(&ipfb->ipfb_lock); 3579 if (ipfb->ipfb_ipf != NULL) { 3580 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3581 ill->ill_frag_free_num_pkts); 3582 } 3583 mutex_exit(&ipfb->ipfb_lock); 3584 } 3585 } 3586 /* 3587 * While the reassembly list for this ILL is too big, prune a fragment 3588 * queue by age, oldest first. 3589 */ 3590 while (ill->ill_frag_count > max_count) { 3591 int ix; 3592 ipfb_t *oipfb = NULL; 3593 uint_t oldest = UINT_MAX; 3594 3595 count = 0; 3596 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3597 ipfb = &ill->ill_frag_hash_tbl[ix]; 3598 mutex_enter(&ipfb->ipfb_lock); 3599 ipf = ipfb->ipfb_ipf; 3600 if (ipf != NULL && ipf->ipf_gen < oldest) { 3601 oldest = ipf->ipf_gen; 3602 oipfb = ipfb; 3603 } 3604 count += ipfb->ipfb_count; 3605 mutex_exit(&ipfb->ipfb_lock); 3606 } 3607 if (oipfb == NULL) 3608 break; 3609 3610 if (count <= max_count) 3611 return; /* Somebody beat us to it, nothing to do */ 3612 mutex_enter(&oipfb->ipfb_lock); 3613 ipf = oipfb->ipfb_ipf; 3614 if (ipf != NULL) { 3615 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3616 } 3617 mutex_exit(&oipfb->ipfb_lock); 3618 } 3619 } 3620 3621 /* 3622 * free 'free_cnt' fragmented packets starting at ipf. 3623 */ 3624 void 3625 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3626 { 3627 size_t count; 3628 mblk_t *mp; 3629 mblk_t *tmp; 3630 ipf_t **ipfp = ipf->ipf_ptphn; 3631 3632 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3633 ASSERT(ipfp != NULL); 3634 ASSERT(ipf != NULL); 3635 3636 while (ipf != NULL && free_cnt-- > 0) { 3637 count = ipf->ipf_count; 3638 mp = ipf->ipf_mp; 3639 ipf = ipf->ipf_hash_next; 3640 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3641 IP_REASS_SET_START(tmp, 0); 3642 IP_REASS_SET_END(tmp, 0); 3643 } 3644 atomic_add_32(&ill->ill_frag_count, -count); 3645 ASSERT(ipfb->ipfb_count >= count); 3646 ipfb->ipfb_count -= count; 3647 ASSERT(ipfb->ipfb_frag_pkts > 0); 3648 ipfb->ipfb_frag_pkts--; 3649 freemsg(mp); 3650 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3651 } 3652 3653 if (ipf) 3654 ipf->ipf_ptphn = ipfp; 3655 ipfp[0] = ipf; 3656 } 3657 3658 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3659 "obsolete and may be removed in a future release of Solaris. Use " \ 3660 "ifconfig(1M) to manipulate the forwarding status of an interface." 3661 3662 /* 3663 * For obsolete per-interface forwarding configuration; 3664 * called in response to ND_GET. 
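 *
 * For example, a request along the lines of
 * "ndd -get /dev/ip <if>:ip_forwarding" (variable name illustrative)
 * ends up here; the supported equivalent is "ifconfig <if> router"
 * or "ifconfig <if> -router", which drives ill_forward_set() below.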
3665 */ 3666 /* ARGSUSED */ 3667 static int 3668 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3669 { 3670 ill_t *ill = (ill_t *)cp; 3671 3672 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3673 3674 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3675 return (0); 3676 } 3677 3678 /* 3679 * For obsolete per-interface forwarding configuration; 3680 * called in response to ND_SET. 3681 */ 3682 /* ARGSUSED */ 3683 static int 3684 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3685 cred_t *ioc_cr) 3686 { 3687 long value; 3688 int retval; 3689 ip_stack_t *ipst = CONNQ_TO_IPST(q); 3690 3691 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3692 3693 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3694 value < 0 || value > 1) { 3695 return (EINVAL); 3696 } 3697 3698 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3699 retval = ill_forward_set((ill_t *)cp, (value != 0)); 3700 rw_exit(&ipst->ips_ill_g_lock); 3701 return (retval); 3702 } 3703 3704 /* 3705 * Helper function for ill_forward_set(). 3706 */ 3707 static void 3708 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 3709 { 3710 ip_stack_t *ipst = ill->ill_ipst; 3711 3712 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3713 3714 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3715 (enable ? "Enabling" : "Disabling"), 3716 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3717 mutex_enter(&ill->ill_lock); 3718 if (enable) 3719 ill->ill_flags |= ILLF_ROUTER; 3720 else 3721 ill->ill_flags &= ~ILLF_ROUTER; 3722 mutex_exit(&ill->ill_lock); 3723 if (ill->ill_isv6) 3724 ill_set_nce_router_flags(ill, enable); 3725 /* Notify routing socket listeners of this change. */ 3726 if (ill->ill_ipif != NULL) 3727 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 3728 } 3729 3730 /* 3731 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 3732 * socket messages for each interface whose flags we change. 3733 */ 3734 int 3735 ill_forward_set(ill_t *ill, boolean_t enable) 3736 { 3737 ipmp_illgrp_t *illg; 3738 ip_stack_t *ipst = ill->ill_ipst; 3739 3740 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3741 3742 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3743 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 3744 return (0); 3745 3746 if (IS_LOOPBACK(ill)) 3747 return (EINVAL); 3748 3749 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 3750 /* 3751 * Update all of the interfaces in the group. 3752 */ 3753 illg = ill->ill_grp; 3754 ill = list_head(&illg->ig_if); 3755 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 3756 ill_forward_set_on_ill(ill, enable); 3757 3758 /* 3759 * Update the IPMP meta-interface. 3760 */ 3761 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 3762 return (0); 3763 } 3764 3765 ill_forward_set_on_ill(ill, enable); 3766 return (0); 3767 } 3768 3769 /* 3770 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3771 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3772 * set or clear. 3773 */ 3774 static void 3775 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3776 { 3777 ipif_t *ipif; 3778 nce_t *nce; 3779 3780 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3781 /* 3782 * NOTE: we match across the illgrp because nce's for 3783 * addresses on IPMP interfaces have an nce_ill that points to 3784 * the bound underlying ill. 
3785 */ 3786 nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr, 3787 B_FALSE); 3788 if (nce != NULL) { 3789 mutex_enter(&nce->nce_lock); 3790 if (enable) 3791 nce->nce_flags |= NCE_F_ISROUTER; 3792 else 3793 nce->nce_flags &= ~NCE_F_ISROUTER; 3794 mutex_exit(&nce->nce_lock); 3795 NCE_REFRELE(nce); 3796 } 3797 } 3798 } 3799 3800 /* 3801 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3802 * for this ill. Make sure the v6/v4 question has been answered about this 3803 * ill. The creation of this ndd variable is only for backwards compatibility. 3804 * The preferred way to control per-interface IP forwarding is through the 3805 * ILLF_ROUTER interface flag. 3806 */ 3807 static int 3808 ill_set_ndd_name(ill_t *ill) 3809 { 3810 char *suffix; 3811 ip_stack_t *ipst = ill->ill_ipst; 3812 3813 ASSERT(IAM_WRITER_ILL(ill)); 3814 3815 if (ill->ill_isv6) 3816 suffix = ipv6_forward_suffix; 3817 else 3818 suffix = ipv4_forward_suffix; 3819 3820 ill->ill_ndd_name = ill->ill_name + ill->ill_name_length; 3821 bcopy(ill->ill_name, ill->ill_ndd_name, ill->ill_name_length - 1); 3822 /* 3823 * Copies over the '\0'. 3824 * Note that strlen(suffix) is always bounded. 3825 */ 3826 bcopy(suffix, ill->ill_ndd_name + ill->ill_name_length - 1, 3827 strlen(suffix) + 1); 3828 3829 /* 3830 * Use of the nd table requires holding the reader lock. 3831 * Modifying the nd table thru nd_load/nd_unload requires 3832 * the writer lock. 3833 */ 3834 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 3835 if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get, 3836 nd_ill_forward_set, (caddr_t)ill)) { 3837 /* 3838 * If the nd_load failed, it only meant that it could not 3839 * allocate a new bunch of room for further NDD expansion. 3840 * Because of that, the ill_ndd_name will be set to 0, and 3841 * this interface is at the mercy of the global ip_forwarding 3842 * variable. 3843 */ 3844 rw_exit(&ipst->ips_ip_g_nd_lock); 3845 ill->ill_ndd_name = NULL; 3846 return (ENOMEM); 3847 } 3848 rw_exit(&ipst->ips_ip_g_nd_lock); 3849 return (0); 3850 } 3851 3852 /* 3853 * Initializes the context structure and returns the first ill in the list. 3854 * Currently start_list and end_list can have the following values: 3855 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists. 3856 * IP_V4_G_HEAD Traverse IPV4 list only. 3857 * IP_V6_G_HEAD Traverse IPV6 list only. 3858 */ 3859 3860 /* 3861 * We don't check for CONDEMNED ills here. Caller must do that if 3862 * necessary under the ill lock. 3863 */ 3864 ill_t * 3865 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 3866 ip_stack_t *ipst) 3867 { 3868 ill_if_t *ifp; 3869 ill_t *ill; 3870 avl_tree_t *avl_tree; 3871 3872 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3873 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3874 3875 /* 3876 * setup the lists to search 3877 */ 3878 if (end_list != MAX_G_HEADS) { 3879 ctx->ctx_current_list = start_list; 3880 ctx->ctx_last_list = end_list; 3881 } else { 3882 ctx->ctx_last_list = MAX_G_HEADS - 1; 3883 ctx->ctx_current_list = 0; 3884 } 3885 3886 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3887 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 3888 if (ifp != (ill_if_t *) 3889 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 3890 avl_tree = &ifp->illif_avl_by_ppa; 3891 ill = avl_first(avl_tree); 3892 /* 3893 * ill is guaranteed to be non-NULL, or ifp would 3894 * not have existed.
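 *
 * Typical walker usage, as seen in the lookup functions later in
 * this file:
 *
 *	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 *	for (ill = ILL_START_WALK_V4(&ctx, ipst); ill != NULL;
 *	    ill = ill_next(&ctx, ill)) {
 *		... examine each ill ...
 *	}
 *	rw_exit(&ipst->ips_ill_g_lock);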
3895 */ 3896 ASSERT(ill != NULL); 3897 return (ill); 3898 } 3899 ctx->ctx_current_list++; 3900 } 3901 3902 return (NULL); 3903 } 3904 3905 /* 3906 * Returns the next ill in the list. ill_first() must have been called 3907 * before calling ill_next() or bad things will happen. 3908 */ 3909 3910 /* 3911 * We don't check for CONDEMNED ills here. Caller must do that if 3912 * necessary under the ill lock. 3913 */ 3914 ill_t * 3915 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 3916 { 3917 ill_if_t *ifp; 3918 ill_t *ill; 3919 ip_stack_t *ipst = lastill->ill_ipst; 3920 3921 ASSERT(lastill->ill_ifptr != (ill_if_t *) 3922 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 3923 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 3924 AVL_AFTER)) != NULL) { 3925 return (ill); 3926 } 3927 3928 /* go to the next ill_if_t in the list. */ 3929 ifp = lastill->ill_ifptr->illif_next; 3930 3931 /* make sure not at end of circular list */ 3932 while (ifp == 3933 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 3934 if (++ctx->ctx_current_list > ctx->ctx_last_list) 3935 return (NULL); 3936 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 3937 } 3938 3939 return (avl_first(&ifp->illif_avl_by_ppa)); 3940 } 3941 3942 /* 3943 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 3944 * The final number (PPA) must not have any leading zeros. Upon success, a 3945 * pointer to the start of the PPA is returned; otherwise NULL is returned. 3946 */ 3947 static char * 3948 ill_get_ppa_ptr(char *name) 3949 { 3950 int namelen = strlen(name); 3951 int end_ndx = namelen - 1; 3952 int ppa_ndx, i; 3953 3954 /* 3955 * Check that the first character is [a-zA-Z], and that the last 3956 * character is [0-9]. 3957 */ 3958 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 3959 return (NULL); 3960 3961 /* 3962 * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 3963 */ 3964 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 3965 if (!isdigit(name[ppa_ndx - 1])) 3966 break; 3967 3968 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 3969 return (NULL); 3970 3971 /* 3972 * Check that the intermediate characters are [a-zA-Z0-9._] 3973 */ 3974 for (i = 1; i < ppa_ndx; i++) { 3975 if (!isalpha(name[i]) && !isdigit(name[i]) && 3976 name[i] != '.' && name[i] != '_') { 3977 return (NULL); 3978 } 3979 } 3980 3981 return (name + ppa_ndx); 3982 } 3983 3984 /* 3985 * Use the avl tree to locate the ill. 3986 */ 3987 static ill_t * 3988 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 3989 ipsq_func_t func, int *error, ip_stack_t *ipst) 3990 { 3991 char *ppa_ptr = NULL; 3992 int len; 3993 uint_t ppa; 3994 ill_t *ill = NULL; 3995 ill_if_t *ifp; 3996 int list; 3997 ipsq_t *ipsq; 3998 3999 if (error != NULL) 4000 *error = 0; 4001 4002 /* 4003 * get ppa ptr 4004 */ 4005 if (isv6) 4006 list = IP_V6_G_HEAD; 4007 else 4008 list = IP_V4_G_HEAD; 4009 4010 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4011 if (error != NULL) 4012 *error = ENXIO; 4013 return (NULL); 4014 } 4015 4016 len = ppa_ptr - name + 1; 4017 4018 ppa = stoi(&ppa_ptr); 4019 4020 ifp = IP_VX_ILL_G_LIST(list, ipst); 4021 4022 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4023 /* 4024 * The match is done on len - 1, as the name is not null- 4025 * terminated; it contains the ppa in addition to the 4026 * interface name.
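 *
 * For example, with name "hme0": ill_get_ppa_ptr() returns a pointer
 * to the '0', len is 4, and the bcmp() below compares three bytes
 * against an illif_name of "hme" (illif_name_len is 4, counting the
 * terminating NUL). Names such as "hme01" (leading zero in the ppa)
 * or "hme" (no trailing digits) are rejected earlier by
 * ill_get_ppa_ptr().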
4027 */ 4028 if ((ifp->illif_name_len == len) && 4029 bcmp(ifp->illif_name, name, len - 1) == 0) { 4030 break; 4031 } else { 4032 ifp = ifp->illif_next; 4033 } 4034 } 4035 4036 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4037 /* 4038 * Even the interface type does not exist. 4039 */ 4040 if (error != NULL) 4041 *error = ENXIO; 4042 return (NULL); 4043 } 4044 4045 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4046 if (ill != NULL) { 4047 /* 4048 * The block comment at the start of ipif_down 4049 * explains the use of the macros used below 4050 */ 4051 GRAB_CONN_LOCK(q); 4052 mutex_enter(&ill->ill_lock); 4053 if (ILL_CAN_LOOKUP(ill)) { 4054 ill_refhold_locked(ill); 4055 mutex_exit(&ill->ill_lock); 4056 RELEASE_CONN_LOCK(q); 4057 return (ill); 4058 } else if (ILL_CAN_WAIT(ill, q)) { 4059 ipsq = ill->ill_phyint->phyint_ipsq; 4060 mutex_enter(&ipsq->ipsq_lock); 4061 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 4062 mutex_exit(&ill->ill_lock); 4063 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4064 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 4065 mutex_exit(&ipsq->ipsq_lock); 4066 RELEASE_CONN_LOCK(q); 4067 if (error != NULL) 4068 *error = EINPROGRESS; 4069 return (NULL); 4070 } 4071 mutex_exit(&ill->ill_lock); 4072 RELEASE_CONN_LOCK(q); 4073 } 4074 if (error != NULL) 4075 *error = ENXIO; 4076 return (NULL); 4077 } 4078 4079 /* 4080 * comparison function for use with avl. 4081 */ 4082 static int 4083 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4084 { 4085 uint_t ppa; 4086 uint_t ill_ppa; 4087 4088 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4089 4090 ppa = *((uint_t *)ppa_ptr); 4091 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4092 /* 4093 * We want the ill with the lowest ppa to be on the 4094 * top. 4095 */ 4096 if (ill_ppa < ppa) 4097 return (1); 4098 if (ill_ppa > ppa) 4099 return (-1); 4100 return (0); 4101 } 4102 4103 /* 4104 * remove an interface type from the global list. 4105 */ 4106 static void 4107 ill_delete_interface_type(ill_if_t *interface) 4108 { 4109 ASSERT(interface != NULL); 4110 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4111 4112 avl_destroy(&interface->illif_avl_by_ppa); 4113 if (interface->illif_ppa_arena != NULL) 4114 vmem_destroy(interface->illif_ppa_arena); 4115 4116 remque(interface); 4117 4118 mi_free(interface); 4119 } 4120 4121 /* 4122 * remove ill from the global list. 4123 */ 4124 static void 4125 ill_glist_delete(ill_t *ill) 4126 { 4127 ip_stack_t *ipst; 4128 phyint_t *phyi; 4129 4130 if (ill == NULL) 4131 return; 4132 ipst = ill->ill_ipst; 4133 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4134 4135 /* 4136 * If the ill was never inserted into the AVL tree 4137 * we skip the if branch. 4138 */ 4139 if (ill->ill_ifptr != NULL) { 4140 /* 4141 * remove from AVL tree and free ppa number 4142 */ 4143 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4144 4145 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4146 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4147 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4148 } 4149 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4150 ill_delete_interface_type(ill->ill_ifptr); 4151 } 4152 4153 /* 4154 * Indicate ill is no longer in the list. 4155 */ 4156 ill->ill_ifptr = NULL; 4157 ill->ill_name_length = 0; 4158 ill->ill_name[0] = '\0'; 4159 ill->ill_ppa = UINT_MAX; 4160 } 4161 4162 /* Generate one last event for this ill. 
*/ 4163 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, 4164 ill->ill_name_length); 4165 4166 ASSERT(ill->ill_phyint != NULL); 4167 phyi = ill->ill_phyint; 4168 ill->ill_phyint = NULL; 4169 4170 /* 4171 * ill_init always allocates a phyint to store the copy 4172 * of flags relevant to the phyint. At that point in time, we could 4173 * not assign the name and hence phyint_illv4/v6 could not be 4174 * initialized. Later in ipif_set_values, we assign the name to 4175 * the ill, at which point in time we assign phyint_illv4/v6. 4176 * Thus we can't rely on phyint_illv4/v6 always being initialized. 4177 */ 4178 if (ill->ill_flags & ILLF_IPV6) 4179 phyi->phyint_illv6 = NULL; 4180 else 4181 phyi->phyint_illv4 = NULL; 4182 4183 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { 4184 rw_exit(&ipst->ips_ill_g_lock); 4185 return; 4186 } 4187 4188 /* 4189 * There are no ills left on this phyint; pull it out of the phyint 4190 * avl trees, and free it. 4191 */ 4192 if (phyi->phyint_ifindex > 0) { 4193 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4194 phyi); 4195 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4196 phyi); 4197 } 4198 rw_exit(&ipst->ips_ill_g_lock); 4199 4200 phyint_free(phyi); 4201 } 4202 4203 /* 4204 * Allocate a ppa. If the number of plumbed interfaces of this type is 4205 * less than ill_no_arena, do a linear search to find an unused ppa. 4206 * When the number goes beyond ill_no_arena, switch to using an arena. 4207 * Note: a ppa value of zero cannot be allocated from the vmem arena, as it 4208 * is the return value for an error condition, so allocation starts at one 4209 * and the result is decremented by one. 4210 */ 4211 static int 4212 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4213 { 4214 ill_t *tmp_ill; 4215 uint_t start, end; 4216 int ppa; 4217 4218 if (ifp->illif_ppa_arena == NULL && 4219 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4220 /* 4221 * Create an arena. 4222 */ 4223 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4224 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4225 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4226 /* allocate what has already been assigned */ 4227 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4228 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4229 tmp_ill, AVL_AFTER)) { 4230 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4231 1, /* size */ 4232 1, /* align/quantum */ 4233 0, /* phase */ 4234 0, /* nocross */ 4235 /* minaddr */ 4236 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 4237 /* maxaddr */ 4238 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 4239 VM_NOSLEEP|VM_FIRSTFIT); 4240 if (ppa == 0) { 4241 ip1dbg(("ill_alloc_ppa: ppa allocation" 4242 " failed while switching")); 4243 vmem_destroy(ifp->illif_ppa_arena); 4244 ifp->illif_ppa_arena = NULL; 4245 break; 4246 } 4247 } 4248 } 4249 4250 if (ifp->illif_ppa_arena != NULL) { 4251 if (ill->ill_ppa == UINT_MAX) { 4252 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4253 1, VM_NOSLEEP|VM_FIRSTFIT); 4254 if (ppa == 0) 4255 return (EAGAIN); 4256 ill->ill_ppa = --ppa; 4257 } else { 4258 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4259 1, /* size */ 4260 1, /* align/quantum */ 4261 0, /* phase */ 4262 0, /* nocross */ 4263 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4264 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4265 VM_NOSLEEP|VM_FIRSTFIT); 4266 /* 4267 * Most likely the allocation failed because 4268 * the requested ppa was in use.
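 *
 * (The [minaddr, maxaddr) window given to vmem_xalloc() above spans
 * exactly one unit, so the call can only ever return ill_ppa + 1 or
 * fail. For instance, to reserve ppa 2 the request is for one unit
 * in [3, 4); the +1 bias exists because 0 is the arena's error
 * return value.)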
*/ 4270 if (ppa == 0) 4271 return (EEXIST); 4272 } 4273 return (0); 4274 } 4275 4276 /* 4277 * No arena is in use and not enough (>ill_no_arena) interfaces have 4278 * been plumbed to create one. Do a linear search to get an unused ppa. 4279 */ 4280 if (ill->ill_ppa == UINT_MAX) { 4281 end = UINT_MAX - 1; 4282 start = 0; 4283 } else { 4284 end = start = ill->ill_ppa; 4285 } 4286 4287 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4288 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4289 if (start++ >= end) { 4290 if (ill->ill_ppa == UINT_MAX) 4291 return (EAGAIN); 4292 else 4293 return (EEXIST); 4294 } 4295 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4296 } 4297 ill->ill_ppa = start; 4298 return (0); 4299 } 4300 4301 /* 4302 * Insert ill into the list of configured ill's. Once this function completes, 4303 * the ill is globally visible and is available through lookups. More 4304 * precisely, this happens after the caller drops the ill_g_lock. 4305 */ 4306 static int 4307 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4308 { 4309 ill_if_t *ill_interface; 4310 avl_index_t where = 0; 4311 int error; 4312 int name_length; 4313 int index; 4314 boolean_t check_length = B_FALSE; 4315 ip_stack_t *ipst = ill->ill_ipst; 4316 4317 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 4318 4319 name_length = mi_strlen(name) + 1; 4320 4321 if (isv6) 4322 index = IP_V6_G_HEAD; 4323 else 4324 index = IP_V4_G_HEAD; 4325 4326 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 4327 /* 4328 * Search for interface type based on name 4329 */ 4330 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4331 if ((ill_interface->illif_name_len == name_length) && 4332 (strcmp(ill_interface->illif_name, name) == 0) ) { 4333 break; 4334 } 4335 ill_interface = ill_interface->illif_next; 4336 } 4337 4338 /* 4339 * Interface type not found, create one. 4340 */ 4341 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4342 ill_g_head_t ghead; 4343 4344 /* 4345 * allocate ill_if_t structure 4346 */ 4347 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4348 if (ill_interface == NULL) { 4349 return (ENOMEM); 4350 } 4351 4352 (void) strcpy(ill_interface->illif_name, name); 4353 ill_interface->illif_name_len = name_length; 4354 4355 avl_create(&ill_interface->illif_avl_by_ppa, 4356 ill_compare_ppa, sizeof (ill_t), 4357 offsetof(struct ill_s, ill_avl_byppa)); 4358 4359 /* 4360 * Link the structure in at the back to maintain the order 4361 * of configuration for ifconfig output. 4362 */ 4363 ghead = ipst->ips_ill_g_heads[index]; 4364 insque(ill_interface, ghead.ill_g_list_tail); 4365 } 4366 4367 if (ill->ill_ppa == UINT_MAX) 4368 check_length = B_TRUE; 4369 4370 error = ill_alloc_ppa(ill_interface, ill); 4371 if (error != 0) { 4372 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4373 ill_delete_interface_type(ill->ill_ifptr); 4374 return (error); 4375 } 4376 4377 /* 4378 * When the ppa is chosen by the system, check that there is 4379 * enough space to insert the ppa. If a specific ppa was passed in, 4380 * this check is not required as the interface name passed in will 4381 * have the right ppa in it. 4382 */ 4383 if (check_length) { 4384 /* 4385 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4386 */ 4387 char buf[sizeof (uint_t) * 3]; 4388 4389 /* 4390 * Convert the ppa to a string to calculate the amount of 4391 * space required for it in the name. 4392 */ 4393 numtos(ill->ill_ppa, buf); 4394 4395 /* Do we have enough space to insert ppa ?
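 *
 * (sizeof (uint_t) * 3 = 12 bytes is a conservative bound: the
 * largest uint_t, UINT_MAX = 4294967295, needs 10 digits plus the
 * terminating NUL.)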
*/ 4396 4397 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4398 /* Free ppa and interface type struct */ 4399 if (ill_interface->illif_ppa_arena != NULL) { 4400 vmem_free(ill_interface->illif_ppa_arena, 4401 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4402 } 4403 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4404 ill_delete_interface_type(ill->ill_ifptr); 4405 4406 return (EINVAL); 4407 } 4408 } 4409 4410 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4411 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4412 4413 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4414 &where); 4415 ill->ill_ifptr = ill_interface; 4416 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4417 4418 ill_phyint_reinit(ill); 4419 return (0); 4420 } 4421 4422 /* Initialize the per phyint ipsq used for serialization */ 4423 static boolean_t 4424 ipsq_init(ill_t *ill, boolean_t enter) 4425 { 4426 ipsq_t *ipsq; 4427 ipxop_t *ipx; 4428 4429 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 4430 return (B_FALSE); 4431 4432 ill->ill_phyint->phyint_ipsq = ipsq; 4433 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 4434 ipx->ipx_ipsq = ipsq; 4435 ipsq->ipsq_next = ipsq; 4436 ipsq->ipsq_phyint = ill->ill_phyint; 4437 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4438 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 4439 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 4440 if (enter) { 4441 ipx->ipx_writer = curthread; 4442 ipx->ipx_forced = B_FALSE; 4443 ipx->ipx_reentry_cnt = 1; 4444 #ifdef DEBUG 4445 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 4446 #endif 4447 } 4448 return (B_TRUE); 4449 } 4450 4451 /* 4452 * ill_init is called by ip_open when a device control stream is opened. 4453 * It does a few initializations, and shoots a DL_INFO_REQ message down 4454 * to the driver. The response is later picked up in ip_rput_dlpi and 4455 * used to set up default mechanisms for talking to the driver. (Always 4456 * called as writer.) 4457 * 4458 * If this function returns error, ip_open will call ip_close which in 4459 * turn will call ill_delete to clean up any memory allocated here that 4460 * is not yet freed. 4461 */ 4462 int 4463 ill_init(queue_t *q, ill_t *ill) 4464 { 4465 int count; 4466 dl_info_req_t *dlir; 4467 mblk_t *info_mp; 4468 uchar_t *frag_ptr; 4469 4470 /* 4471 * The ill is initialized to zero by mi_alloc*(). In addition 4472 * some fields already contain valid values, initialized in 4473 * ip_open(), before we reach here. 4474 */ 4475 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 4476 4477 ill->ill_rq = q; 4478 ill->ill_wq = WR(q); 4479 4480 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 4481 BPRI_HI); 4482 if (info_mp == NULL) 4483 return (ENOMEM); 4484 4485 /* 4486 * Allocate sufficient space to contain our fragment hash table and 4487 * the device name. 
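 *
 * The single mi_zalloc() below is carved up roughly as follows
 * (a sketch of the layout, not a structure definition):
 *
 *	frag_ptr
 *	|-- ILL_FRAG_HASH_TBL_SIZE bytes: ill_frag_hash_tbl
 *	`-- 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix) bytes:
 *	    ill_name, with room reserved for the ndd forwarding
 *	    variable name appended later by ill_set_ndd_name().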
4488 */ 4489 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 4490 2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix)); 4491 if (frag_ptr == NULL) { 4492 freemsg(info_mp); 4493 return (ENOMEM); 4494 } 4495 ill->ill_frag_ptr = frag_ptr; 4496 ill->ill_frag_free_num_pkts = 0; 4497 ill->ill_last_frag_clean_time = 0; 4498 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 4499 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 4500 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 4501 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 4502 NULL, MUTEX_DEFAULT, NULL); 4503 } 4504 4505 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4506 if (ill->ill_phyint == NULL) { 4507 freemsg(info_mp); 4508 mi_free(frag_ptr); 4509 return (ENOMEM); 4510 } 4511 4512 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4513 /* 4514 * For now pretend this is a v4 ill. We need to set phyint_ill* 4515 * at this point for the following reason. If we can't 4516 * enter the ipsq at some point and cv_wait, the writer that 4517 * wakes us up tries to locate us using the list of all phyints 4518 * in an ipsq and the ills from the phyint thru the phyint_ill*. 4519 * If we don't set it now, we risk a missed wakeup. 4520 */ 4521 ill->ill_phyint->phyint_illv4 = ill; 4522 ill->ill_ppa = UINT_MAX; 4523 ill->ill_fastpath_list = &ill->ill_fastpath_list; 4524 4525 if (!ipsq_init(ill, B_TRUE)) { 4526 freemsg(info_mp); 4527 mi_free(frag_ptr); 4528 mi_free(ill->ill_phyint); 4529 return (ENOMEM); 4530 } 4531 4532 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 4533 4534 /* Frag queue limit stuff */ 4535 ill->ill_frag_count = 0; 4536 ill->ill_ipf_gen = 0; 4537 4538 ill->ill_global_timer = INFINITY; 4539 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4540 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4541 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4542 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4543 4544 /* 4545 * Initialize IPv6 configuration variables. The IP module is always 4546 * opened as an IPv4 module. Instead of tracking down the cases where 4547 * it switches to do IPv6, we'll just initialize the IPv6 configuration 4548 * here for convenience; this has no effect until the ill is set to do 4549 * IPv6. 4550 */ 4551 ill->ill_reachable_time = ND_REACHABLE_TIME; 4552 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 4553 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; 4554 ill->ill_max_buf = ND_MAX_Q; 4555 ill->ill_refcnt = 0; 4556 4557 /* Send down the Info Request to the driver. */ 4558 info_mp->b_datap->db_type = M_PCPROTO; 4559 dlir = (dl_info_req_t *)info_mp->b_rptr; 4560 info_mp->b_wptr = (uchar_t *)&dlir[1]; 4561 dlir->dl_primitive = DL_INFO_REQ; 4562 4563 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4564 4565 qprocson(q); 4566 ill_dlpi_send(ill, info_mp); 4567 4568 return (0); 4569 } 4570 4571 /* 4572 * ill_dls_info 4573 * creates datalink socket info from the device.
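 *
 * The resulting sdl_data is the interface name followed immediately
 * by the link-layer address; e.g. for "hme0" with a 6-byte MAC
 * address (values illustrative):
 *
 *	sdl_nlen = 4	sdl_data[0..3] = "hme0"
 *	sdl_alen = 6	sdl_data[4..9] = the MAC address bytes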
*/ 4575 int 4576 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4577 { 4578 size_t len; 4579 ill_t *ill = ipif->ipif_ill; 4580 4581 sdl->sdl_family = AF_LINK; 4582 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4583 sdl->sdl_type = ill->ill_type; 4584 ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4585 len = strlen(sdl->sdl_data); 4586 ASSERT(len < 256); 4587 sdl->sdl_nlen = (uchar_t)len; 4588 sdl->sdl_alen = ill->ill_phys_addr_length; 4589 sdl->sdl_slen = 0; 4590 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 4591 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 4592 4593 return (sizeof (struct sockaddr_dl)); 4594 } 4595 4596 /* 4597 * ill_xarp_info 4598 * creates xarp info from the device. 4599 */ 4600 static int 4601 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4602 { 4603 sdl->sdl_family = AF_LINK; 4604 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4605 sdl->sdl_type = ill->ill_type; 4606 ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4607 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4608 sdl->sdl_alen = ill->ill_phys_addr_length; 4609 sdl->sdl_slen = 0; 4610 return (sdl->sdl_nlen); 4611 } 4612 4613 static int 4614 loopback_kstat_update(kstat_t *ksp, int rw) 4615 { 4616 kstat_named_t *kn; 4617 netstackid_t stackid; 4618 netstack_t *ns; 4619 ip_stack_t *ipst; 4620 4621 if (ksp == NULL || ksp->ks_data == NULL) 4622 return (EIO); 4623 4624 if (rw == KSTAT_WRITE) 4625 return (EACCES); 4626 4627 kn = KSTAT_NAMED_PTR(ksp); 4628 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 4629 4630 ns = netstack_find_by_stackid(stackid); 4631 if (ns == NULL) 4632 return (-1); 4633 4634 ipst = ns->netstack_ip; 4635 if (ipst == NULL) { 4636 netstack_rele(ns); 4637 return (-1); 4638 } 4639 kn[0].value.ui32 = ipst->ips_loopback_packets; 4640 kn[1].value.ui32 = ipst->ips_loopback_packets; 4641 netstack_rele(ns); 4642 return (0); 4643 } 4644 4645 /* 4646 * Has ifindex been plumbed already? 4647 */ 4648 boolean_t 4649 phyint_exists(uint_t index, ip_stack_t *ipst) 4650 { 4651 ASSERT(index != 0); 4652 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4653 4654 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4655 &index, NULL) != NULL); 4656 } 4657 4658 /* Pick a unique ifindex */ 4659 boolean_t 4660 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 4661 { 4662 uint_t starting_index; 4663 4664 if (!ipst->ips_ill_index_wrap) { 4665 *indexp = ipst->ips_ill_index++; 4666 if (ipst->ips_ill_index == 0) { 4667 /* Reached the uint_t limit; wrap next time */ 4668 ipst->ips_ill_index_wrap = B_TRUE; 4669 } 4670 return (B_TRUE); 4671 } 4672 4673 /* 4674 * Start reusing unused indexes. Note that we hold the ill_g_lock 4675 * at this point and don't want to call any function that attempts 4676 * to get the lock again. 4677 */ 4678 starting_index = ipst->ips_ill_index++; 4679 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 4680 if (ipst->ips_ill_index != 0 && 4681 !phyint_exists(ipst->ips_ill_index, ipst)) { 4682 /* found unused index - use it */ 4683 *indexp = ipst->ips_ill_index; 4684 return (B_TRUE); 4685 } 4686 } 4687 4688 /* 4689 * All interface indices are in use. 4690 */ 4691 return (B_FALSE); 4692 } 4693 4694 /* 4695 * Assign a unique interface index for the phyint.
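 *
 * A usage sketch for the allocator above (the error value shown is
 * illustrative, not the one used by actual callers):
 *
 *	uint_t index;
 *	if (!ip_assign_ifindex(&index, ipst))
 *		return (ENOSPC);	/* all indexes in use */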
4696 */ 4697 static boolean_t 4698 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 4699 { 4700 ASSERT(phyi->phyint_ifindex == 0); 4701 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 4702 } 4703 4704 /* 4705 * Initialize the flags on `phyi' as per the provided mactype. 4706 */ 4707 static void 4708 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 4709 { 4710 uint64_t flags = 0; 4711 4712 /* 4713 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces, 4714 * we always presume the underlying hardware is working and set 4715 * PHYI_RUNNING (if it's not, the driver will subsequently send a 4716 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 4717 * there are no active interfaces in the group so we set PHYI_FAILED. 4718 */ 4719 if (mactype == SUNW_DL_IPMP) 4720 flags |= PHYI_FAILED; 4721 else 4722 flags |= PHYI_RUNNING; 4723 4724 switch (mactype) { 4725 case SUNW_DL_VNI: 4726 flags |= PHYI_VIRTUAL; 4727 break; 4728 case SUNW_DL_IPMP: 4729 flags |= PHYI_IPMP; 4730 break; 4731 case DL_LOOP: 4732 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 4733 break; 4734 } 4735 4736 mutex_enter(&phyi->phyint_lock); 4737 phyi->phyint_flags |= flags; 4738 mutex_exit(&phyi->phyint_lock); 4739 } 4740 4741 /* 4742 * Return a pointer to the ill which matches the supplied name. Note that 4743 * the ill name length includes the null termination character. (May be 4744 * called as writer.) 4745 * If do_alloc and the interface is "lo0" it will be automatically created. 4746 * Cannot bump up reference on condemned ills. So dup detect can't be done 4747 * using this func. 4748 */ 4749 ill_t * 4750 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4751 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, 4752 ip_stack_t *ipst) 4753 { 4754 ill_t *ill; 4755 ipif_t *ipif; 4756 ipsq_t *ipsq; 4757 kstat_named_t *kn; 4758 boolean_t isloopback; 4759 in6_addr_t ov6addr; 4760 4761 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4762 4763 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4764 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4765 rw_exit(&ipst->ips_ill_g_lock); 4766 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4767 return (ill); 4768 4769 /* 4770 * Couldn't find it. Does this happen to be a lookup for the 4771 * loopback device and are we allowed to allocate it? 4772 */ 4773 if (!isloopback || !do_alloc) 4774 return (NULL); 4775 4776 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4777 4778 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4779 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4780 rw_exit(&ipst->ips_ill_g_lock); 4781 return (ill); 4782 } 4783 4784 /* Create the loopback device on demand */ 4785 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4786 sizeof (ipif_loopback_name), BPRI_MED)); 4787 if (ill == NULL) 4788 goto done; 4789 4790 *ill = ill_null; 4791 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4792 ill->ill_ipst = ipst; 4793 netstack_hold(ipst->ips_netstack); 4794 /* 4795 * For exclusive stacks we set the zoneid to zero 4796 * to make IP operate as if in the global zone. 
4797 */ 4798 ill->ill_zoneid = GLOBAL_ZONEID; 4799 4800 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4801 if (ill->ill_phyint == NULL) 4802 goto done; 4803 4804 if (isv6) 4805 ill->ill_phyint->phyint_illv6 = ill; 4806 else 4807 ill->ill_phyint->phyint_illv4 = ill; 4808 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4809 phyint_flags_init(ill->ill_phyint, DL_LOOP); 4810 4811 ill->ill_max_frag = IP_LOOPBACK_MTU; 4812 /* Add room for tcp+ip headers */ 4813 if (isv6) { 4814 ill->ill_isv6 = B_TRUE; 4815 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4816 } else { 4817 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4818 } 4819 if (!ill_allocate_mibs(ill)) 4820 goto done; 4821 ill->ill_max_mtu = ill->ill_max_frag; 4822 /* 4823 * ipif_loopback_name can't be pointed at directly because it's used 4824 * by both the IPv4 and IPv6 interfaces. When the ill is removed 4825 * from the glist, ill_glist_delete() sets the first character of 4826 * ill_name to '\0'. 4827 */ 4828 ill->ill_name = (char *)ill + sizeof (*ill); 4829 (void) strcpy(ill->ill_name, ipif_loopback_name); 4830 ill->ill_name_length = sizeof (ipif_loopback_name); 4831 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ 4832 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4833 4834 ill->ill_global_timer = INFINITY; 4835 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4836 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4837 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4838 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4839 4840 /* No resolver here. */ 4841 ill->ill_net_type = IRE_LOOPBACK; 4842 4843 /* Initialize the ipsq */ 4844 if (!ipsq_init(ill, B_FALSE)) 4845 goto done; 4846 4847 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE); 4848 if (ipif == NULL) 4849 goto done; 4850 4851 ill->ill_flags = ILLF_MULTICAST; 4852 4853 ov6addr = ipif->ipif_v6lcl_addr; 4854 /* Set up default loopback address and mask. */ 4855 if (!isv6) { 4856 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4857 4858 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4859 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4860 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4861 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4862 ipif->ipif_v6subnet); 4863 ill->ill_flags |= ILLF_IPV4; 4864 } else { 4865 ipif->ipif_v6lcl_addr = ipv6_loopback; 4866 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4867 ipif->ipif_v6net_mask = ipv6_all_ones; 4868 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4869 ipif->ipif_v6subnet); 4870 ill->ill_flags |= ILLF_IPV6; 4871 } 4872 4873 /* 4874 * Chain us in at the end of the ill list. Hold the ill 4875 * before we make it globally visible: 1 for the lookup. 4876 */ 4877 ill->ill_refcnt = 0; 4878 ill_refhold(ill); 4879 4880 ill->ill_frag_count = 0; 4881 ill->ill_frag_free_num_pkts = 0; 4882 ill->ill_last_frag_clean_time = 0; 4883 4884 ipsq = ill->ill_phyint->phyint_ipsq; 4885 4886 if (ill_glist_insert(ill, "lo", isv6) != 0) 4887 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4888 4889 /* Let SCTP know so that it can add this to its list */ 4890 sctp_update_ill(ill, SCTP_ILL_INSERT); 4891 4892 /* 4893 * We have already assigned ipif_v6lcl_addr above, but we need to 4894 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 4895 * must happen after ill_glist_insert() since we need the 4896 * ill_index set. Pass on ipv6_loopback as the old address.
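 *
 * (Note on the address setup above: the IPv4 case stores addresses
 * in v4-mapped form, so INADDR_LOOPBACK 127.0.0.1 is kept as
 * ::ffff:127.0.0.1 and the Class A mask 255.0.0.0 as its v4-mapped
 * equivalent, giving both address families one representation in
 * ipif_v6lcl_addr and ipif_v6net_mask.)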
4897 */ 4898 sctp_update_ipif_addr(ipif, ov6addr); 4899 4900 /* 4901 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 4902 * If so, free our original one. 4903 */ 4904 if (ipsq != ill->ill_phyint->phyint_ipsq) 4905 ipsq_delete(ipsq); 4906 4907 if (ipst->ips_loopback_ksp == NULL) { 4908 /* Export loopback interface statistics */ 4909 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 4910 ipif_loopback_name, "net", 4911 KSTAT_TYPE_NAMED, 2, 0, 4912 ipst->ips_netstack->netstack_stackid); 4913 if (ipst->ips_loopback_ksp != NULL) { 4914 ipst->ips_loopback_ksp->ks_update = 4915 loopback_kstat_update; 4916 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 4917 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4918 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4919 ipst->ips_loopback_ksp->ks_private = 4920 (void *)(uintptr_t)ipst->ips_netstack-> 4921 netstack_stackid; 4922 kstat_install(ipst->ips_loopback_ksp); 4923 } 4924 } 4925 4926 if (error != NULL) 4927 *error = 0; 4928 *did_alloc = B_TRUE; 4929 rw_exit(&ipst->ips_ill_g_lock); 4930 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 4931 NE_PLUMB, ill->ill_name, ill->ill_name_length); 4932 return (ill); 4933 done: 4934 if (ill != NULL) { 4935 if (ill->ill_phyint != NULL) { 4936 ipsq = ill->ill_phyint->phyint_ipsq; 4937 if (ipsq != NULL) { 4938 ipsq->ipsq_phyint = NULL; 4939 ipsq_delete(ipsq); 4940 } 4941 mi_free(ill->ill_phyint); 4942 } 4943 ill_free_mib(ill); 4944 if (ill->ill_ipst != NULL) 4945 netstack_rele(ill->ill_ipst->ips_netstack); 4946 mi_free(ill); 4947 } 4948 rw_exit(&ipst->ips_ill_g_lock); 4949 if (error != NULL) 4950 *error = ENOMEM; 4951 return (NULL); 4952 } 4953 4954 /* 4955 * For IPP calls - use the ip_stack_t for global stack. 4956 */ 4957 ill_t * 4958 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, 4959 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 4960 { 4961 ip_stack_t *ipst; 4962 ill_t *ill; 4963 4964 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 4965 if (ipst == NULL) { 4966 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 4967 return (NULL); 4968 } 4969 4970 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 4971 netstack_rele(ipst->ips_netstack); 4972 return (ill); 4973 } 4974 4975 /* 4976 * Return a pointer to the ill which matches the index and IP version type. 4977 */ 4978 ill_t * 4979 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 4980 ipsq_func_t func, int *err, ip_stack_t *ipst) 4981 { 4982 ill_t *ill; 4983 ipsq_t *ipsq; 4984 phyint_t *phyi; 4985 4986 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 4987 (q != NULL && mp != NULL && func != NULL && err != NULL)); 4988 4989 if (err != NULL) 4990 *err = 0; 4991 4992 /* 4993 * Indexes are stored in the phyint - a common structure 4994 * to both IPv4 and IPv6. 4995 */ 4996 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4997 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4998 (void *) &index, NULL); 4999 if (phyi != NULL) { 5000 ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; 5001 if (ill != NULL) { 5002 /* 5003 * The block comment at the start of ipif_down 5004 * explains the use of the macros used below 5005 */ 5006 GRAB_CONN_LOCK(q); 5007 mutex_enter(&ill->ill_lock); 5008 if (ILL_CAN_LOOKUP(ill)) { 5009 ill_refhold_locked(ill); 5010 mutex_exit(&ill->ill_lock); 5011 RELEASE_CONN_LOCK(q); 5012 rw_exit(&ipst->ips_ill_g_lock); 5013 return (ill); 5014 } else if (ILL_CAN_WAIT(ill, q)) { 5015 ipsq = ill->ill_phyint->phyint_ipsq; 5016 mutex_enter(&ipsq->ipsq_lock); 5017 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5018 rw_exit(&ipst->ips_ill_g_lock); 5019 mutex_exit(&ill->ill_lock); 5020 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5021 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5022 mutex_exit(&ipsq->ipsq_lock); 5023 RELEASE_CONN_LOCK(q); 5024 if (err != NULL) 5025 *err = EINPROGRESS; 5026 return (NULL); 5027 } 5028 RELEASE_CONN_LOCK(q); 5029 mutex_exit(&ill->ill_lock); 5030 } 5031 } 5032 rw_exit(&ipst->ips_ill_g_lock); 5033 if (err != NULL) 5034 *err = ENXIO; 5035 return (NULL); 5036 } 5037 5038 /* 5039 * Return the ifindex next in sequence after the passed in ifindex. 5040 * If there is no next ifindex for the given protocol, return 0. 5041 */ 5042 uint_t 5043 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 5044 { 5045 phyint_t *phyi; 5046 phyint_t *phyi_initial; 5047 uint_t ifindex; 5048 5049 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5050 5051 if (index == 0) { 5052 phyi = avl_first( 5053 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 5054 } else { 5055 phyi = phyi_initial = avl_find( 5056 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5057 (void *) &index, NULL); 5058 } 5059 5060 for (; phyi != NULL; 5061 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5062 phyi, AVL_AFTER)) { 5063 /* 5064 * If we're not returning the first interface in the tree 5065 * and we still haven't moved past the phyint_t that 5066 * corresponds to index, avl_walk needs to be called again. 5067 */ 5068 if (!((index != 0) && (phyi == phyi_initial))) { 5069 if (isv6) { 5070 if ((phyi->phyint_illv6) && 5071 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5072 (phyi->phyint_illv6->ill_isv6 == 1)) 5073 break; 5074 } else { 5075 if ((phyi->phyint_illv4) && 5076 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5077 (phyi->phyint_illv4->ill_isv6 == 0)) 5078 break; 5079 } 5080 } 5081 } 5082 5083 rw_exit(&ipst->ips_ill_g_lock); 5084 5085 if (phyi != NULL) 5086 ifindex = phyi->phyint_ifindex; 5087 else 5088 ifindex = 0; 5089 5090 return (ifindex); 5091 } 5092 5093 /* 5094 * Return the ifindex for the named interface. 5095 * If there is no such interface, return 0. 5096 */ 5097 uint_t 5098 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 5099 { 5100 phyint_t *phyi; 5101 avl_index_t where = 0; 5102 uint_t ifindex; 5103 5104 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5105 5106 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 5107 name, &where)) == NULL) { 5108 rw_exit(&ipst->ips_ill_g_lock); 5109 return (0); 5110 } 5111 5112 ifindex = phyi->phyint_ifindex; 5113 5114 rw_exit(&ipst->ips_ill_g_lock); 5115 5116 return (ifindex); 5117 } 5118 5119 /* 5120 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5121 * that gives a running thread a reference to the ill. This reference must be 5122 * released by the thread when it is done accessing the ill and related 5123 * objects. ill_refcnt cannot be used to account for static references 5124 * such as other structures pointing to an ill.
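 *
 * A typical dynamic-reference pattern using the helpers below:
 *
 *	if (ill_check_and_refhold(ill) == 0) {
 *		... use ill ...
 *		ill_refrele(ill);
 *	}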
Callers must generally 5125 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5126 * or be sure that the ill is not being deleted or changing state before 5127 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5128 * ill won't change any of its critical state such as address, netmask etc. 5129 */ 5130 void 5131 ill_refhold(ill_t *ill) 5132 { 5133 mutex_enter(&ill->ill_lock); 5134 ill->ill_refcnt++; 5135 ILL_TRACE_REF(ill); 5136 mutex_exit(&ill->ill_lock); 5137 } 5138 5139 void 5140 ill_refhold_locked(ill_t *ill) 5141 { 5142 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5143 ill->ill_refcnt++; 5144 ILL_TRACE_REF(ill); 5145 } 5146 5147 int 5148 ill_check_and_refhold(ill_t *ill) 5149 { 5150 mutex_enter(&ill->ill_lock); 5151 if (ILL_CAN_LOOKUP(ill)) { 5152 ill_refhold_locked(ill); 5153 mutex_exit(&ill->ill_lock); 5154 return (0); 5155 } 5156 mutex_exit(&ill->ill_lock); 5157 return (ILL_LOOKUP_FAILED); 5158 } 5159 5160 /* 5161 * Must not be called while holding any locks. Otherwise if this is 5162 * the last reference to be released, there is a chance of recursive mutex 5163 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5164 * to restart an ioctl. 5165 */ 5166 void 5167 ill_refrele(ill_t *ill) 5168 { 5169 mutex_enter(&ill->ill_lock); 5170 ASSERT(ill->ill_refcnt != 0); 5171 ill->ill_refcnt--; 5172 ILL_UNTRACE_REF(ill); 5173 if (ill->ill_refcnt != 0) { 5174 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5175 mutex_exit(&ill->ill_lock); 5176 return; 5177 } 5178 5179 /* Drops the ill_lock */ 5180 ipif_ill_refrele_tail(ill); 5181 } 5182 5183 /* 5184 * Obtain a weak reference count on the ill. This reference ensures the 5185 * ill won't be freed, but the ill may change any of its critical state 5186 * such as netmask, address etc. Returns B_FALSE if the ill has started 5187 * closing. 5188 */ 5189 boolean_t 5190 ill_waiter_inc(ill_t *ill) 5191 { 5192 mutex_enter(&ill->ill_lock); 5193 if (ill->ill_state_flags & ILL_CONDEMNED) { 5194 mutex_exit(&ill->ill_lock); 5195 return (B_FALSE); 5196 } 5197 ill->ill_waiters++; 5198 mutex_exit(&ill->ill_lock); 5199 return (B_TRUE); 5200 } 5201 5202 void 5203 ill_waiter_dcr(ill_t *ill) 5204 { 5205 mutex_enter(&ill->ill_lock); 5206 ill->ill_waiters--; 5207 if (ill->ill_waiters == 0) 5208 cv_broadcast(&ill->ill_cv); 5209 mutex_exit(&ill->ill_lock); 5210 } 5211 5212 /* 5213 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5214 * driver. We construct best guess defaults for lower level information that 5215 * we need. If an interface is brought up without injection of any overriding 5216 * information from outside, we have to be ready to go with these defaults. 5217 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5218 * we primarily want the dl_provider_style. 5219 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5220 * at which point we assume the other part of the information is valid. 5221 */ 5222 void 5223 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 5224 { 5225 uchar_t *brdcst_addr; 5226 uint_t brdcst_addr_length, phys_addr_length; 5227 t_scalar_t sap_length; 5228 dl_info_ack_t *dlia; 5229 ip_m_t *ipm; 5230 dl_qos_cl_sel1_t *sel1; 5231 int min_mtu; 5232 5233 ASSERT(IAM_WRITER_ILL(ill)); 5234 5235 /* 5236 * Until the ill is fully up, ILL_CHANGING will be set and 5237 * the ill is not globally visible. So no need for a lock.
5238 */ 5239 dlia = (dl_info_ack_t *)mp->b_rptr; 5240 ill->ill_mactype = dlia->dl_mac_type; 5241 5242 ipm = ip_m_lookup(dlia->dl_mac_type); 5243 if (ipm == NULL) { 5244 ipm = ip_m_lookup(DL_OTHER); 5245 ASSERT(ipm != NULL); 5246 } 5247 ill->ill_media = ipm; 5248 5249 /* 5250 * When the new DLPI stuff is ready we'll pull lengths 5251 * from dlia. 5252 */ 5253 if (dlia->dl_version == DL_VERSION_2) { 5254 brdcst_addr_length = dlia->dl_brdcst_addr_length; 5255 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 5256 brdcst_addr_length); 5257 if (brdcst_addr == NULL) { 5258 brdcst_addr_length = 0; 5259 } 5260 sap_length = dlia->dl_sap_length; 5261 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 5262 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 5263 brdcst_addr_length, sap_length, phys_addr_length)); 5264 } else { 5265 brdcst_addr_length = 6; 5266 brdcst_addr = ip_six_byte_all_ones; 5267 sap_length = -2; 5268 phys_addr_length = brdcst_addr_length; 5269 } 5270 5271 ill->ill_bcast_addr_length = brdcst_addr_length; 5272 ill->ill_phys_addr_length = phys_addr_length; 5273 ill->ill_sap_length = sap_length; 5274 5275 /* 5276 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 5277 * but we must ensure a minimum IP MTU is used since other bits of 5278 * IP will fly apart otherwise. 5279 */ 5280 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 5281 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 5282 ill->ill_max_mtu = ill->ill_max_frag; 5283 5284 ill->ill_type = ipm->ip_m_type; 5285 5286 if (!ill->ill_dlpi_style_set) { 5287 if (dlia->dl_provider_style == DL_STYLE2) 5288 ill->ill_needs_attach = 1; 5289 5290 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 5291 5292 /* 5293 * Allocate the first ipif on this ill. We don't delay it 5294 * further as ioctl handling assumes at least one ipif exists. 5295 * 5296 * At this point we don't know whether the ill is v4 or v6. 5297 * We will know this when the SIOCSLIFNAME happens and 5298 * the correct value for ill_isv6 will be assigned in 5299 * ipif_set_values(). We need to hold the ill lock and 5300 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 5301 * the wakeup. 5302 */ 5303 (void) ipif_allocate(ill, 0, IRE_LOCAL, 5304 dlia->dl_provider_style != DL_STYLE2, B_TRUE); 5305 mutex_enter(&ill->ill_lock); 5306 ASSERT(ill->ill_dlpi_style_set == 0); 5307 ill->ill_dlpi_style_set = 1; 5308 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 5309 cv_broadcast(&ill->ill_cv); 5310 mutex_exit(&ill->ill_lock); 5311 freemsg(mp); 5312 return; 5313 } 5314 ASSERT(ill->ill_ipif != NULL); 5315 /* 5316 * We know whether it is IPv4 or IPv6 now, as this is the 5317 * second DL_INFO_ACK we are receiving in response to the 5318 * DL_INFO_REQ sent in ipif_set_values. 5319 */ 5320 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 5321 /* 5322 * Set ipif_mtu which is used to set the IRE's 5323 * ire_max_frag value. The driver could have sent 5324 * a different mtu from what it sent last time. No 5325 * need to call ipif_mtu_change because IREs have 5326 * not yet been created. 5327 */ 5328 ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; 5329 /* 5330 * Clear all the flags that were set based on ill_bcast_addr_length 5331 * and ill_phys_addr_length (in ipif_set_values) as these could have 5332 * changed now and we need to re-evaluate.
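 *
 * (As an example of the DL_VERSION_2 arithmetic above: a typical
 * Ethernet driver reports dl_addr_length = 8 and dl_sap_length = -2,
 * so phys_addr_length = 8 - ABS(-2) = 6, the length of a MAC
 * address; the sign of dl_sap_length only encodes where the sap
 * sits relative to the physical address within the DLPI address.)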
5333 */ 5334 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 5335 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 5336 5337 /* 5338 * Free ill_resolver_mp and ill_bcast_mp as things could have 5339 * changed now. 5340 * 5341 * NOTE: The IPMP meta-interface is special-cased because it starts 5342 * with no underlying interfaces (and thus an unknown broadcast 5343 * address length), but we enforce that an interface is broadcast- 5344 * capable as part of allowing it to join a group. 5345 */ 5346 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 5347 if (ill->ill_resolver_mp != NULL) 5348 freemsg(ill->ill_resolver_mp); 5349 if (ill->ill_bcast_mp != NULL) 5350 freemsg(ill->ill_bcast_mp); 5351 if (ill->ill_flags & ILLF_XRESOLV) 5352 ill->ill_net_type = IRE_IF_RESOLVER; 5353 else 5354 ill->ill_net_type = IRE_IF_NORESOLVER; 5355 ill->ill_resolver_mp = ill_dlur_gen(NULL, 5356 ill->ill_phys_addr_length, 5357 ill->ill_sap, 5358 ill->ill_sap_length); 5359 ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); 5360 5361 if (ill->ill_isv6) 5362 /* 5363 * Note: xresolv interfaces will eventually need NOARP 5364 * set here as well, but that will require those 5365 * external resolvers to have some knowledge of 5366 * that flag and act appropriately. Not to be changed 5367 * at present. 5368 */ 5369 ill->ill_flags |= ILLF_NONUD; 5370 else 5371 ill->ill_flags |= ILLF_NOARP; 5372 5373 if (ill->ill_mactype == SUNW_DL_VNI) { 5374 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 5375 } else if (ill->ill_phys_addr_length == 0 || 5376 ill->ill_mactype == DL_IPV4 || 5377 ill->ill_mactype == DL_IPV6) { 5378 /* 5379 * The underlying link is point-to-point, so mark the 5380 * interface as such. We can do IP multicast over 5381 * such a link since it transmits all network-layer 5382 * packets to the remote side the same way. 5383 */ 5384 ill->ill_flags |= ILLF_MULTICAST; 5385 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 5386 } 5387 } else { 5388 ill->ill_net_type = IRE_IF_RESOLVER; 5389 if (ill->ill_bcast_mp != NULL) 5390 freemsg(ill->ill_bcast_mp); 5391 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 5392 ill->ill_bcast_addr_length, ill->ill_sap, 5393 ill->ill_sap_length); 5394 /* 5395 * Later detect lack of DLPI driver multicast 5396 * capability by catching DL_ENABMULTI errors in 5397 * ip_rput_dlpi. 5398 */ 5399 ill->ill_flags |= ILLF_MULTICAST; 5400 if (!ill->ill_isv6) 5401 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 5402 } 5403 5404 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 5405 if (ill->ill_mactype == SUNW_DL_IPMP) 5406 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 5407 5408 /* By default an interface does not support any CoS marking */ 5409 ill->ill_flags &= ~ILLF_COS_ENABLED; 5410 5411 /* 5412 * If we get QoS information in DL_INFO_ACK, the device supports 5413 * some form of CoS marking; set ILLF_COS_ENABLED. 5414 */ 5415 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 5416 dlia->dl_qos_length); 5417 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 5418 ill->ill_flags |= ILLF_COS_ENABLED; 5419 } 5420 5421 /* Clear any previous error indication. */ 5422 ill->ill_error = 0; 5423 freemsg(mp); 5424 } 5425 5426 /* 5427 * Perform various checks to verify that an address would make sense as a 5428 * local, remote, or subnet interface address.
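 *
 * For example, with the default Class A guess mask 255.0.0.0,
 * 10.0.0.0 (host part all zeroes) and 10.255.255.255 (host part all
 * ones) are both rejected, while 10.1.2.3 is accepted;
 * 255.255.255.255 and multicast (Class D) addresses are always
 * rejected.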
5429 */ 5430 static boolean_t 5431 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 5432 { 5433 ipaddr_t net_mask; 5434 5435 /* 5436 * Don't allow all zeroes, or all ones, but allow 5437 * all ones netmask. 5438 */ 5439 if ((net_mask = ip_net_mask(addr)) == 0) 5440 return (B_FALSE); 5441 /* A given netmask overrides the "guess" netmask */ 5442 if (subnet_mask != 0) 5443 net_mask = subnet_mask; 5444 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 5445 (addr == (addr | ~net_mask)))) { 5446 return (B_FALSE); 5447 } 5448 5449 /* 5450 * Even if the netmask is all ones, we do not allow the address to be 5451 * 255.255.255.255 5452 */ 5453 if (addr == INADDR_BROADCAST) 5454 return (B_FALSE); 5455 5456 if (CLASSD(addr)) 5457 return (B_FALSE); 5458 5459 return (B_TRUE); 5460 } 5461 5462 #define V6_IPIF_LINKLOCAL(p) \ 5463 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 5464 5465 /* 5466 * Compare two given ipifs and check if the second one is better than 5467 * the first one using the order of preference (not taking deprecated 5468 * into account) specified in ipif_lookup_multicast(). 5469 */ 5470 static boolean_t 5471 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 5472 { 5473 /* Check the least preferred first. */ 5474 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 5475 /* If both ipifs are the same, use the first one. */ 5476 if (IS_LOOPBACK(new_ipif->ipif_ill)) 5477 return (B_FALSE); 5478 else 5479 return (B_TRUE); 5480 } 5481 5482 /* For IPv6, check for link local address. */ 5483 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 5484 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5485 V6_IPIF_LINKLOCAL(new_ipif)) { 5486 /* The second one is equal or less preferred. */ 5487 return (B_FALSE); 5488 } else { 5489 return (B_TRUE); 5490 } 5491 } 5492 5493 /* Then check for point to point interface. */ 5494 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 5495 if (IS_LOOPBACK(new_ipif->ipif_ill) || 5496 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 5497 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 5498 return (B_FALSE); 5499 } else { 5500 return (B_TRUE); 5501 } 5502 } 5503 5504 /* old_ipif is a normal interface, so no need to use the new one. */ 5505 return (B_FALSE); 5506 } 5507 5508 /* 5509 * Find a multicast-capable ipif given an IP instance and zoneid. 5510 * The ipif must be up, and its ill must be multicast-capable, not 5511 * condemned, not an underlying interface in an IPMP group, and 5512 * not a VNI interface. Order of preference: 5513 * 5514 * 1a. normal 5515 * 1b. normal, but deprecated 5516 * 2a. point to point 5517 * 2b. point to point, but deprecated 5518 * 3a. link local 5519 * 3b. link local, but deprecated 5520 * 4. loopback.
5521  */
5522 ipif_t *
5523 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
5524 {
5525 	ill_t	*ill;
5526 	ill_walk_context_t ctx;
5527 	ipif_t	*ipif;
5528 	ipif_t	*saved_ipif = NULL;
5529 	ipif_t	*dep_ipif = NULL;
5530
5531 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5532 	if (isv6)
5533 		ill = ILL_START_WALK_V6(&ctx, ipst);
5534 	else
5535 		ill = ILL_START_WALK_V4(&ctx, ipst);
5536
5537 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5538 		mutex_enter(&ill->ill_lock);
5539 		if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) ||
5540 		    !(ill->ill_flags & ILLF_MULTICAST)) {
5541 			mutex_exit(&ill->ill_lock);
5542 			continue;
5543 		}
5544 		for (ipif = ill->ill_ipif; ipif != NULL;
5545 		    ipif = ipif->ipif_next) {
5546 			if (zoneid != ipif->ipif_zoneid &&
5547 			    zoneid != ALL_ZONES &&
5548 			    ipif->ipif_zoneid != ALL_ZONES) {
5549 				continue;
5550 			}
5551 			if (!(ipif->ipif_flags & IPIF_UP) ||
5552 			    !IPIF_CAN_LOOKUP(ipif)) {
5553 				continue;
5554 			}
5555
5556 			/*
5557 			 * Found one candidate. If it is deprecated,
5558 			 * remember it in dep_ipif. If it is not deprecated,
5559 			 * remember it in saved_ipif.
5560 			 */
5561 			if (ipif->ipif_flags & IPIF_DEPRECATED) {
5562 				if (dep_ipif == NULL) {
5563 					dep_ipif = ipif;
5564 				} else if (ipif_comp_multi(dep_ipif, ipif,
5565 				    isv6)) {
5566 					/*
5567 					 * If the previous dep_ipif does not
5568 					 * belong to the same ill, we've done
5569 					 * an ipif_refhold() on it. So we need
5570 					 * to release it.
5571 					 */
5572 					if (dep_ipif->ipif_ill != ill)
5573 						ipif_refrele(dep_ipif);
5574 					dep_ipif = ipif;
5575 				}
5576 				continue;
5577 			}
5578 			if (saved_ipif == NULL) {
5579 				saved_ipif = ipif;
5580 			} else {
5581 				if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
5582 					if (saved_ipif->ipif_ill != ill)
5583 						ipif_refrele(saved_ipif);
5584 					saved_ipif = ipif;
5585 				}
5586 			}
5587 		}
5588 		/*
5589 		 * Before going to the next ill, do an ipif_refhold() on the
5590 		 * saved ones.
5591 		 */
5592 		if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
5593 			ipif_refhold_locked(saved_ipif);
5594 		if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
5595 			ipif_refhold_locked(dep_ipif);
5596 		mutex_exit(&ill->ill_lock);
5597 	}
5598 	rw_exit(&ipst->ips_ill_g_lock);
5599
5600 	/*
5601 	 * If we have only the saved_ipif, return it. But if we have both
5602 	 * saved_ipif and dep_ipif, check to see which one is better.
5603 	 */
5604 	if (saved_ipif != NULL) {
5605 		if (dep_ipif != NULL) {
5606 			if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
5607 				ipif_refrele(saved_ipif);
5608 				return (dep_ipif);
5609 			} else {
5610 				ipif_refrele(dep_ipif);
5611 				return (saved_ipif);
5612 			}
5613 		}
5614 		return (saved_ipif);
5615 	} else {
5616 		return (dep_ipif);
5617 	}
5618 }
5619
5620 /*
5621  * This function is called when an application does not specify an interface
5622  * to be used for multicast traffic (joining a group/sending data). It
5623  * calls ire_lookup_multi() to look for an interface route for the
5624  * specified multicast group. Doing this allows the administrator to add
5625  * prefix routes for multicast to indicate which interface should be used for
5626  * multicast traffic in the above scenario. The route could be for all
5627  * multicast (224.0/4), for a single multicast group (a /32 route) or
5628  * anything in between. If there is no such multicast route, we just find
5629  * any multicast capable interface and return it. The returned ipif
5630  * is refhold'ed.
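 *
 * For example (hypothetical routes): with a prefix route for 224.0.0.0/4
 * over if0 and a /32 route for 239.1.1.1 over if1, a join of 239.1.1.1
 * resolves to if1 (longest match) and any other group to if0; with no
 * multicast route at all, the ipif_lookup_multicast() fallback below
 * picks an interface by the preference order documented above.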
5631 */ 5632 ipif_t * 5633 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) 5634 { 5635 ire_t *ire; 5636 ipif_t *ipif; 5637 5638 ire = ire_lookup_multi(group, zoneid, ipst); 5639 if (ire != NULL) { 5640 ipif = ire->ire_ipif; 5641 ipif_refhold(ipif); 5642 ire_refrele(ire); 5643 return (ipif); 5644 } 5645 5646 return (ipif_lookup_multicast(ipst, zoneid, B_FALSE)); 5647 } 5648 5649 /* 5650 * Look for an ipif with the specified interface address and destination. 5651 * The destination address is used only for matching point-to-point interfaces. 5652 */ 5653 ipif_t * 5654 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5655 ipsq_func_t func, int *error, ip_stack_t *ipst) 5656 { 5657 ipif_t *ipif; 5658 ill_t *ill; 5659 ill_walk_context_t ctx; 5660 ipsq_t *ipsq; 5661 5662 if (error != NULL) 5663 *error = 0; 5664 5665 /* 5666 * First match all the point-to-point interfaces 5667 * before looking at non-point-to-point interfaces. 5668 * This is done to avoid returning non-point-to-point 5669 * ipif instead of unnumbered point-to-point ipif. 5670 */ 5671 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5672 ill = ILL_START_WALK_V4(&ctx, ipst); 5673 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5674 GRAB_CONN_LOCK(q); 5675 mutex_enter(&ill->ill_lock); 5676 for (ipif = ill->ill_ipif; ipif != NULL; 5677 ipif = ipif->ipif_next) { 5678 /* Allow the ipif to be down */ 5679 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5680 (ipif->ipif_lcl_addr == if_addr) && 5681 (ipif->ipif_pp_dst_addr == dst)) { 5682 /* 5683 * The block comment at the start of ipif_down 5684 * explains the use of the macros used below 5685 */ 5686 if (IPIF_CAN_LOOKUP(ipif)) { 5687 ipif_refhold_locked(ipif); 5688 mutex_exit(&ill->ill_lock); 5689 RELEASE_CONN_LOCK(q); 5690 rw_exit(&ipst->ips_ill_g_lock); 5691 return (ipif); 5692 } else if (IPIF_CAN_WAIT(ipif, q)) { 5693 ipsq = ill->ill_phyint->phyint_ipsq; 5694 mutex_enter(&ipsq->ipsq_lock); 5695 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5696 mutex_exit(&ill->ill_lock); 5697 rw_exit(&ipst->ips_ill_g_lock); 5698 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5699 ill); 5700 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5701 mutex_exit(&ipsq->ipsq_lock); 5702 RELEASE_CONN_LOCK(q); 5703 if (error != NULL) 5704 *error = EINPROGRESS; 5705 return (NULL); 5706 } 5707 } 5708 } 5709 mutex_exit(&ill->ill_lock); 5710 RELEASE_CONN_LOCK(q); 5711 } 5712 rw_exit(&ipst->ips_ill_g_lock); 5713 5714 /* lookup the ipif based on interface address */ 5715 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error, 5716 ipst); 5717 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5718 return (ipif); 5719 } 5720 5721 /* 5722 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 5723 */ 5724 static ipif_t * 5725 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, 5726 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, 5727 ip_stack_t *ipst) 5728 { 5729 ipif_t *ipif; 5730 ill_t *ill; 5731 boolean_t ptp = B_FALSE; 5732 ipsq_t *ipsq; 5733 ill_walk_context_t ctx; 5734 5735 if (error != NULL) 5736 *error = 0; 5737 5738 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5739 /* 5740 * Repeat twice, first based on local addresses and 5741 * next time for pointopoint. 
5742  */
5743 repeat:
5744 	ill = ILL_START_WALK_V4(&ctx, ipst);
5745 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5746 		if (match_ill != NULL && ill != match_ill &&
5747 		    (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
5748 			continue;
5749 		}
5750 		GRAB_CONN_LOCK(q);
5751 		mutex_enter(&ill->ill_lock);
5752 		for (ipif = ill->ill_ipif; ipif != NULL;
5753 		    ipif = ipif->ipif_next) {
5754 			if (zoneid != ALL_ZONES &&
5755 			    zoneid != ipif->ipif_zoneid &&
5756 			    ipif->ipif_zoneid != ALL_ZONES)
5757 				continue;
5758 			/* Allow the ipif to be down */
5759 			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
5760 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
5761 			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
5762 			    (ipif->ipif_pp_dst_addr == addr))) {
5763 				/*
5764 				 * The block comment at the start of ipif_down
5765 				 * explains the use of the macros used below
5766 				 */
5767 				if (IPIF_CAN_LOOKUP(ipif)) {
5768 					ipif_refhold_locked(ipif);
5769 					mutex_exit(&ill->ill_lock);
5770 					RELEASE_CONN_LOCK(q);
5771 					rw_exit(&ipst->ips_ill_g_lock);
5772 					return (ipif);
5773 				} else if (IPIF_CAN_WAIT(ipif, q)) {
5774 					ipsq = ill->ill_phyint->phyint_ipsq;
5775 					mutex_enter(&ipsq->ipsq_lock);
5776 					mutex_enter(&ipsq->ipsq_xop->ipx_lock);
5777 					mutex_exit(&ill->ill_lock);
5778 					rw_exit(&ipst->ips_ill_g_lock);
5779 					ipsq_enq(ipsq, q, mp, func, NEW_OP,
5780 					    ill);
5781 					mutex_exit(&ipsq->ipsq_xop->ipx_lock);
5782 					mutex_exit(&ipsq->ipsq_lock);
5783 					RELEASE_CONN_LOCK(q);
5784 					if (error != NULL)
5785 						*error = EINPROGRESS;
5786 					return (NULL);
5787 				}
5788 			}
5789 		}
5790 		mutex_exit(&ill->ill_lock);
5791 		RELEASE_CONN_LOCK(q);
5792 	}
5793
5794 	/* If we already did the ptp case, then we are done */
5795 	if (ptp) {
5796 		rw_exit(&ipst->ips_ill_g_lock);
5797 		if (error != NULL)
5798 			*error = ENXIO;
5799 		return (NULL);
5800 	}
5801 	ptp = B_TRUE;
5802 	goto repeat;
5803 }
5804
5805 /*
5806  * Check if the address exists in the system.
5807  * We don't hold the conn_lock as we will not perform a deferred ipsq
5808  * operation.
5809  */
5810 boolean_t
5811 ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
5812 {
5813 	ipif_t	*ipif;
5814 	ill_t	*ill;
5815 	ill_walk_context_t	ctx;
5816
5817 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5818
5819 	ill = ILL_START_WALK_V4(&ctx, ipst);
5820 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5821 		mutex_enter(&ill->ill_lock);
5822 		for (ipif = ill->ill_ipif; ipif != NULL;
5823 		    ipif = ipif->ipif_next) {
5824 			if (zoneid != ALL_ZONES &&
5825 			    zoneid != ipif->ipif_zoneid &&
5826 			    ipif->ipif_zoneid != ALL_ZONES)
5827 				continue;
5828 			/* Allow the ipif to be down */
5829 			/*
5830 			 * XXX Different from ipif_lookup_addr(), we don't do
5831 			 * the lookup twice. From bind()'s point of view, we
5832 			 * may return once we find a match.
5833 			 */
5834 			if (((ipif->ipif_lcl_addr == addr) &&
5835 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
5836 			    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
5837 			    (ipif->ipif_pp_dst_addr == addr))) {
5838 				/*
5839 				 * Allow bind() to be successful even if the
5840 				 * ipif has the IPIF_CHANGING bit set.
5841 				 */
5842 				mutex_exit(&ill->ill_lock);
5843 				rw_exit(&ipst->ips_ill_g_lock);
5844 				return (B_TRUE);
5845 			}
5846 		}
5847 		mutex_exit(&ill->ill_lock);
5848 	}
5849
5850 	rw_exit(&ipst->ips_ill_g_lock);
5851 	return (B_FALSE);
5852 }
5853
5854 /*
5855  * Lookup an ipif with the specified address. For point-to-point links we
5856  * look for matches on either the destination address or the local address,
5857  * but we skip the local address check if IPIF_UNNUMBERED is set.
If the
5858  * `match_ill' argument is non-NULL, the lookup is restricted to that ill
5859  * (or illgrp if `match_ill' is in an IPMP group).
5860  */
5861 ipif_t *
5862 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
5863     mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
5864 {
5865 	return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp,
5866 	    func, error, ipst));
5867 }
5868
5869 /*
5870  * Special abbreviated version of ipif_lookup_addr() that doesn't match
5871  * `match_ill' across the IPMP group. This function is only needed in some
5872  * corner-cases; almost everything should use ipif_lookup_addr().
5873  */
5874 static ipif_t *
5875 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
5876 {
5877 	ASSERT(match_ill != NULL);
5878 	return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES,
5879 	    NULL, NULL, NULL, NULL, ipst));
5880 }
5881
5882 /*
5883  * Look for an ipif with the specified address. For point-to-point links
5884  * we look for matches on either the destination address or the local
5885  * address, but we ignore the check on the local address if IPIF_UNNUMBERED
5886  * is set.
5887  * If the `match_ill' argument is non-NULL, the lookup is restricted to that
5888  * ill (or illgrp if `match_ill' is in an IPMP group).
5889  * Return the zoneid for the ipif that matches; ALL_ZONES if no match.
5890  */
5891 zoneid_t
5892 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
5893 {
5894 	zoneid_t zoneid;
5895 	ipif_t	*ipif;
5896 	ill_t	*ill;
5897 	boolean_t ptp = B_FALSE;
5898 	ill_walk_context_t	ctx;
5899
5900 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5901 	/*
5902 	 * Repeat twice, first based on local addresses and
5903 	 * next time for pointopoint.
5904 	 */
5905 repeat:
5906 	ill = ILL_START_WALK_V4(&ctx, ipst);
5907 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
5908 		if (match_ill != NULL && ill != match_ill &&
5909 		    !IS_IN_SAME_ILLGRP(ill, match_ill)) {
5910 			continue;
5911 		}
5912 		mutex_enter(&ill->ill_lock);
5913 		for (ipif = ill->ill_ipif; ipif != NULL;
5914 		    ipif = ipif->ipif_next) {
5915 			/* Allow the ipif to be down */
5916 			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
5917 			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
5918 			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
5919 			    (ipif->ipif_pp_dst_addr == addr)) &&
5920 			    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
5921 				zoneid = ipif->ipif_zoneid;
5922 				mutex_exit(&ill->ill_lock);
5923 				rw_exit(&ipst->ips_ill_g_lock);
5924 				/*
5925 				 * If ipif_zoneid was ALL_ZONES then we have
5926 				 * a trusted extensions shared IP address.
5927 				 * In that case GLOBAL_ZONEID works to send.
5928 				 */
5929 				if (zoneid == ALL_ZONES)
5930 					zoneid = GLOBAL_ZONEID;
5931 				return (zoneid);
5932 			}
5933 		}
5934 		mutex_exit(&ill->ill_lock);
5935 	}
5936
5937 	/* If we already did the ptp case, then we are done */
5938 	if (ptp) {
5939 		rw_exit(&ipst->ips_ill_g_lock);
5940 		return (ALL_ZONES);
5941 	}
5942 	ptp = B_TRUE;
5943 	goto repeat;
5944 }
5945
5946 /*
5947  * Look for an ipif that matches the specified remote address, i.e., the
5948  * ipif that would receive the specified packet.
5949  * First look for directly connected interfaces and then do a recursive
5950  * IRE lookup and pick the first ipif corresponding to the source address in the
5951  * ire.
5952  * Returns: held ipif
5953  */
5954 ipif_t *
5955 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
5956 {
5957 	ipif_t	*ipif;
5958 	ire_t	*ire;
5959 	ip_stack_t	*ipst = ill->ill_ipst;
5960
5961 	ASSERT(!ill->ill_isv6);
5962
5963 	/*
5964 	 * Someone could be changing this ipif currently, or change it
5965 	 * after we return this. Thus a few packets could use the old
5966 	 * values. However, structure updates/creates (ire, ilg, ilm, etc.)
5967 	 * will atomically be updated or cleaned up with the new value.
5968 	 * Thus we don't need a lock to check the flags or other attrs below.
5969 	 */
5970 	mutex_enter(&ill->ill_lock);
5971 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
5972 		if (!IPIF_CAN_LOOKUP(ipif))
5973 			continue;
5974 		if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
5975 		    ipif->ipif_zoneid != ALL_ZONES)
5976 			continue;
5977 		/* Allow the ipif to be down */
5978 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
5979 			if ((ipif->ipif_pp_dst_addr == addr) ||
5980 			    (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
5981 			    ipif->ipif_lcl_addr == addr)) {
5982 				ipif_refhold_locked(ipif);
5983 				mutex_exit(&ill->ill_lock);
5984 				return (ipif);
5985 			}
5986 		} else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
5987 			ipif_refhold_locked(ipif);
5988 			mutex_exit(&ill->ill_lock);
5989 			return (ipif);
5990 		}
5991 	}
5992 	mutex_exit(&ill->ill_lock);
5993 	ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid,
5994 	    NULL, MATCH_IRE_RECURSIVE, ipst);
5995 	if (ire != NULL) {
5996 		/*
5997 		 * The callers of this function want to know the
5998 		 * interface on which they have to send the replies
5999 		 * back. For IREs that have ire_stq and ire_ipif
6000 		 * derived from different ills, we really don't care
6001 		 * what we return here.
6002 		 */
6003 		ipif = ire->ire_ipif;
6004 		if (ipif != NULL) {
6005 			ipif_refhold(ipif);
6006 			ire_refrele(ire);
6007 			return (ipif);
6008 		}
6009 		ire_refrele(ire);
6010 	}
6011 	/* Pick the first interface */
6012 	ipif = ipif_get_next_ipif(NULL, ill);
6013 	return (ipif);
6014 }
6015
6016 /*
6017  * This func does not prevent refcnt from increasing. But if
6018  * the caller has taken steps to that effect, then this func
6019  * can be used to determine whether the ill has become quiescent.
6020  */
6021 static boolean_t
6022 ill_is_quiescent(ill_t *ill)
6023 {
6024 	ipif_t	*ipif;
6025
6026 	ASSERT(MUTEX_HELD(&ill->ill_lock));
6027
6028 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
6029 		if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) {
6030 			return (B_FALSE);
6031 		}
6032 	}
6033 	if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
6034 		return (B_FALSE);
6035 	}
6036 	return (B_TRUE);
6037 }
6038
6039 boolean_t
6040 ill_is_freeable(ill_t *ill)
6041 {
6042 	ipif_t	*ipif;
6043
6044 	ASSERT(MUTEX_HELD(&ill->ill_lock));
6045
6046 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
6047 		if (ipif->ipif_refcnt != 0 || !IPIF_FREE_OK(ipif)) {
6048 			return (B_FALSE);
6049 		}
6050 	}
6051 	if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
6052 		return (B_FALSE);
6053 	}
6054 	return (B_TRUE);
6055 }
6056
6057 /*
6058  * This func does not prevent refcnt from increasing.
But if
6059  * the caller has taken steps to that effect, then this func
6060  * can be used to determine whether the ipif has become quiescent.
6061  */
6062 static boolean_t
6063 ipif_is_quiescent(ipif_t *ipif)
6064 {
6065 	ill_t *ill;
6066
6067 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6068
6069 	if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) {
6070 		return (B_FALSE);
6071 	}
6072
6073 	ill = ipif->ipif_ill;
6074 	if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
6075 	    ill->ill_logical_down) {
6076 		return (B_TRUE);
6077 	}
6078
6079 	/* This is the last ipif going down or being deleted on this ill */
6080 	if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
6081 		return (B_FALSE);
6082 	}
6083
6084 	return (B_TRUE);
6085 }
6086
6087 /*
6088  * Return true if the ipif can be destroyed: the ipif has to be quiescent
6089  * with zero references from ire/nce/ilm to it.
6090  */
6091 static boolean_t
6092 ipif_is_freeable(ipif_t *ipif)
6093 {
6094 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6095 	ASSERT(ipif->ipif_id != 0);
6096 	return (ipif->ipif_refcnt == 0 && IPIF_FREE_OK(ipif));
6097 }
6098
6099 /*
6100  * The ipif/ill/ire has been refrele'd. Do the tail processing.
6101  * Determine if the ipif or ill in question has become quiescent and, if so,
6102  * wake up the close and/or restart any queued pending ioctl that is waiting
6103  * for the ipif_down (or ill_down) to complete.
6104  */
6105 void
6106 ipif_ill_refrele_tail(ill_t *ill)
6107 {
6108 	mblk_t	*mp;
6109 	conn_t	*connp;
6110 	ipsq_t	*ipsq;
6111 	ipxop_t	*ipx;
6112 	ipif_t	*ipif;
6113 	dl_notify_ind_t *dlindp;
6114
6115 	ASSERT(MUTEX_HELD(&ill->ill_lock));
6116
6117 	if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
6118 		/* ip_modclose() may be waiting */
6119 		cv_broadcast(&ill->ill_cv);
6120 	}
6121
6122 	ipsq = ill->ill_phyint->phyint_ipsq;
6123 	mutex_enter(&ipsq->ipsq_lock);
6124 	ipx = ipsq->ipsq_xop;
6125 	mutex_enter(&ipx->ipx_lock);
6126 	if (ipx->ipx_waitfor == 0)	/* no one's waiting; bail */
6127 		goto unlock;
6128
6129 	ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
6130
6131 	ipif = ipx->ipx_pending_ipif;
6132 	if (ipif->ipif_ill != ill)	/* wait is for another ill; bail */
6133 		goto unlock;
6134
6135 	switch (ipx->ipx_waitfor) {
6136 	case IPIF_DOWN:
6137 		if (!ipif_is_quiescent(ipif))
6138 			goto unlock;
6139 		break;
6140 	case IPIF_FREE:
6141 		if (!ipif_is_freeable(ipif))
6142 			goto unlock;
6143 		break;
6144 	case ILL_DOWN:
6145 		if (!ill_is_quiescent(ill))
6146 			goto unlock;
6147 		break;
6148 	case ILL_FREE:
6149 		/*
6150 		 * ILL_FREE is only for loopback; normal ill teardown waits
6151 		 * synchronously in ip_modclose() without using ipx_waitfor,
6152 		 * handled by the cv_broadcast() at the top of this function.
6153 		 */
6154 		if (!ill_is_freeable(ill))
6155 			goto unlock;
6156 		break;
6157 	default:
6158 		cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
6159 		    (void *)ipsq, ipx->ipx_waitfor);
6160 	}
6161
6162 	ill_refhold_locked(ill);	/* for qwriter_ip() call below */
6163 	mutex_exit(&ipx->ipx_lock);
6164 	mp = ipsq_pending_mp_get(ipsq, &connp);
6165 	mutex_exit(&ipsq->ipsq_lock);
6166 	mutex_exit(&ill->ill_lock);
6167
6168 	ASSERT(mp != NULL);
6169 	/*
6170 	 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
6171 	 * we can only get here when the current operation decides it
6172 	 * needs to quiesce via ipsq_pending_mp_add().
6173 	 */
6174 	switch (mp->b_datap->db_type) {
6175 	case M_PCPROTO:
6176 	case M_PROTO:
6177 		/*
6178 		 * For now, only DL_NOTIFY_IND messages can use this facility.
6179 */ 6180 dlindp = (dl_notify_ind_t *)mp->b_rptr; 6181 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 6182 6183 switch (dlindp->dl_notification) { 6184 case DL_NOTE_PHYS_ADDR: 6185 qwriter_ip(ill, ill->ill_rq, mp, 6186 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 6187 return; 6188 case DL_NOTE_REPLUMB: 6189 qwriter_ip(ill, ill->ill_rq, mp, 6190 ill_replumb_tail, CUR_OP, B_TRUE); 6191 return; 6192 default: 6193 ASSERT(0); 6194 ill_refrele(ill); 6195 } 6196 break; 6197 6198 case M_ERROR: 6199 case M_HANGUP: 6200 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 6201 B_TRUE); 6202 return; 6203 6204 case M_IOCTL: 6205 case M_IOCDATA: 6206 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) : 6207 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 6208 return; 6209 6210 default: 6211 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 6212 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 6213 } 6214 return; 6215 unlock: 6216 mutex_exit(&ipsq->ipsq_lock); 6217 mutex_exit(&ipx->ipx_lock); 6218 mutex_exit(&ill->ill_lock); 6219 } 6220 6221 #ifdef DEBUG 6222 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 6223 static void 6224 th_trace_rrecord(th_trace_t *th_trace) 6225 { 6226 tr_buf_t *tr_buf; 6227 uint_t lastref; 6228 6229 lastref = th_trace->th_trace_lastref; 6230 lastref++; 6231 if (lastref == TR_BUF_MAX) 6232 lastref = 0; 6233 th_trace->th_trace_lastref = lastref; 6234 tr_buf = &th_trace->th_trbuf[lastref]; 6235 tr_buf->tr_time = lbolt; 6236 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 6237 } 6238 6239 static void 6240 th_trace_free(void *value) 6241 { 6242 th_trace_t *th_trace = value; 6243 6244 ASSERT(th_trace->th_refcnt == 0); 6245 kmem_free(th_trace, sizeof (*th_trace)); 6246 } 6247 6248 /* 6249 * Find or create the per-thread hash table used to track object references. 6250 * The ipst argument is NULL if we shouldn't allocate. 6251 * 6252 * Accesses per-thread data, so there's no need to lock here. 6253 */ 6254 static mod_hash_t * 6255 th_trace_gethash(ip_stack_t *ipst) 6256 { 6257 th_hash_t *thh; 6258 6259 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 6260 mod_hash_t *mh; 6261 char name[256]; 6262 size_t objsize, rshift; 6263 int retv; 6264 6265 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 6266 return (NULL); 6267 (void) snprintf(name, sizeof (name), "th_trace_%p", 6268 (void *)curthread); 6269 6270 /* 6271 * We use mod_hash_create_extended here rather than the more 6272 * obvious mod_hash_create_ptrhash because the latter has a 6273 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 6274 * block. 6275 */ 6276 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 6277 MAX(sizeof (ire_t), sizeof (nce_t))); 6278 rshift = highbit(objsize); 6279 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 6280 th_trace_free, mod_hash_byptr, (void *)rshift, 6281 mod_hash_ptrkey_cmp, KM_NOSLEEP); 6282 if (mh == NULL) { 6283 kmem_free(thh, sizeof (*thh)); 6284 return (NULL); 6285 } 6286 thh->thh_hash = mh; 6287 thh->thh_ipst = ipst; 6288 /* 6289 * We trace ills, ipifs, ires, and nces. All of these are 6290 * per-IP-stack, so the lock on the thread list is as well. 6291 */ 6292 rw_enter(&ip_thread_rwlock, RW_WRITER); 6293 list_insert_tail(&ip_thread_list, thh); 6294 rw_exit(&ip_thread_rwlock); 6295 retv = tsd_set(ip_thread_data, thh); 6296 ASSERT(retv == 0); 6297 } 6298 return (thh != NULL ? 
thh->thh_hash : NULL);
6299 }
6300
6301 boolean_t
6302 th_trace_ref(const void *obj, ip_stack_t *ipst)
6303 {
6304 	th_trace_t *th_trace;
6305 	mod_hash_t *mh;
6306 	mod_hash_val_t val;
6307
6308 	if ((mh = th_trace_gethash(ipst)) == NULL)
6309 		return (B_FALSE);
6310
6311 	/*
6312 	 * Attempt to locate the trace buffer for this obj and thread.
6313 	 * If it does not exist, then allocate a new trace buffer and
6314 	 * insert into the hash.
6315 	 */
6316 	if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
6317 		th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
6318 		if (th_trace == NULL)
6319 			return (B_FALSE);
6320
6321 		th_trace->th_id = curthread;
6322 		if (mod_hash_insert(mh, (mod_hash_key_t)obj,
6323 		    (mod_hash_val_t)th_trace) != 0) {
6324 			kmem_free(th_trace, sizeof (th_trace_t));
6325 			return (B_FALSE);
6326 		}
6327 	} else {
6328 		th_trace = (th_trace_t *)val;
6329 	}
6330
6331 	ASSERT(th_trace->th_refcnt >= 0 &&
6332 	    th_trace->th_refcnt < TR_BUF_MAX - 1);
6333
6334 	th_trace->th_refcnt++;
6335 	th_trace_rrecord(th_trace);
6336 	return (B_TRUE);
6337 }
6338
6339 /*
6340  * For the purpose of tracing a reference release, we assume that global
6341  * tracing is always on and that the same thread that initiated the
6342  * reference hold is releasing it.
6343  */
6344 void
6345 th_trace_unref(const void *obj)
6346 {
6347 	int retv;
6348 	mod_hash_t *mh;
6349 	th_trace_t *th_trace;
6350 	mod_hash_val_t val;
6351
6352 	mh = th_trace_gethash(NULL);
6353 	retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
6354 	ASSERT(retv == 0);
6355 	th_trace = (th_trace_t *)val;
6356
6357 	ASSERT(th_trace->th_refcnt > 0);
6358 	th_trace->th_refcnt--;
6359 	th_trace_rrecord(th_trace);
6360 }
6361
6362 /*
6363  * If tracing has been disabled, then we assume that the reference counts are
6364  * now useless, and we clear them out before destroying the entries.
6365  */
6366 void
6367 th_trace_cleanup(const void *obj, boolean_t trace_disable)
6368 {
6369 	th_hash_t *thh;
6370 	mod_hash_t *mh;
6371 	mod_hash_val_t val;
6372 	th_trace_t *th_trace;
6373 	int retv;
6374
6375 	rw_enter(&ip_thread_rwlock, RW_READER);
6376 	for (thh = list_head(&ip_thread_list); thh != NULL;
6377 	    thh = list_next(&ip_thread_list, thh)) {
6378 		if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
6379 		    &val) == 0) {
6380 			th_trace = (th_trace_t *)val;
6381 			if (trace_disable)
6382 				th_trace->th_refcnt = 0;
6383 			retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
6384 			ASSERT(retv == 0);
6385 		}
6386 	}
6387 	rw_exit(&ip_thread_rwlock);
6388 }
6389
6390 void
6391 ipif_trace_ref(ipif_t *ipif)
6392 {
6393 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6394
6395 	if (ipif->ipif_trace_disable)
6396 		return;
6397
6398 	if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
6399 		ipif->ipif_trace_disable = B_TRUE;
6400 		ipif_trace_cleanup(ipif);
6401 	}
6402 }
6403
6404 void
6405 ipif_untrace_ref(ipif_t *ipif)
6406 {
6407 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6408
6409 	if (!ipif->ipif_trace_disable)
6410 		th_trace_unref(ipif);
6411 }
6412
6413 void
6414 ill_trace_ref(ill_t *ill)
6415 {
6416 	ASSERT(MUTEX_HELD(&ill->ill_lock));
6417
6418 	if (ill->ill_trace_disable)
6419 		return;
6420
6421 	if (!th_trace_ref(ill, ill->ill_ipst)) {
6422 		ill->ill_trace_disable = B_TRUE;
6423 		ill_trace_cleanup(ill);
6424 	}
6425 }
6426
6427 void
6428 ill_untrace_ref(ill_t *ill)
6429 {
6430 	ASSERT(MUTEX_HELD(&ill->ill_lock));
6431
6432 	if (!ill->ill_trace_disable)
6433 		th_trace_unref(ill);
6434 }
6435
6436 /*
6437  * Called when ipif is unplumbed or when memory alloc fails.
Note that on
6438  * failure, ipif_trace_disable is set.
6439  */
6440 static void
6441 ipif_trace_cleanup(const ipif_t *ipif)
6442 {
6443 	th_trace_cleanup(ipif, ipif->ipif_trace_disable);
6444 }
6445
6446 /*
6447  * Called when ill is unplumbed or when memory alloc fails. Note that on
6448  * failure, ill_trace_disable is set.
6449  */
6450 static void
6451 ill_trace_cleanup(const ill_t *ill)
6452 {
6453 	th_trace_cleanup(ill, ill->ill_trace_disable);
6454 }
6455 #endif /* DEBUG */
6456
6457 void
6458 ipif_refhold_locked(ipif_t *ipif)
6459 {
6460 	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6461 	ipif->ipif_refcnt++;
6462 	IPIF_TRACE_REF(ipif);
6463 }
6464
6465 void
6466 ipif_refhold(ipif_t *ipif)
6467 {
6468 	ill_t *ill;
6469
6470 	ill = ipif->ipif_ill;
6471 	mutex_enter(&ill->ill_lock);
6472 	ipif->ipif_refcnt++;
6473 	IPIF_TRACE_REF(ipif);
6474 	mutex_exit(&ill->ill_lock);
6475 }
6476
6477 /*
6478  * Must not be called while holding any locks. Otherwise if this is
6479  * the last reference to be released there is a chance of recursive mutex
6480  * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
6481  * to restart an ioctl.
6482  */
6483 void
6484 ipif_refrele(ipif_t *ipif)
6485 {
6486 	ill_t *ill;
6487
6488 	ill = ipif->ipif_ill;
6489
6490 	mutex_enter(&ill->ill_lock);
6491 	ASSERT(ipif->ipif_refcnt != 0);
6492 	ipif->ipif_refcnt--;
6493 	IPIF_UNTRACE_REF(ipif);
6494 	if (ipif->ipif_refcnt != 0) {
6495 		mutex_exit(&ill->ill_lock);
6496 		return;
6497 	}
6498
6499 	/* Drops the ill_lock */
6500 	ipif_ill_refrele_tail(ill);
6501 }
6502
6503 ipif_t *
6504 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
6505 {
6506 	ipif_t *ipif;
6507
6508 	mutex_enter(&ill->ill_lock);
6509 	for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
6510 	    ipif != NULL; ipif = ipif->ipif_next) {
6511 		if (!IPIF_CAN_LOOKUP(ipif))
6512 			continue;
6513 		ipif_refhold_locked(ipif);
6514 		mutex_exit(&ill->ill_lock);
6515 		return (ipif);
6516 	}
6517 	mutex_exit(&ill->ill_lock);
6518 	return (NULL);
6519 }
6520
6521 /*
6522  * TODO: make this table extendible at run time
6523  * Return a pointer to the mac type info for 'mac_type'
6524  */
6525 static ip_m_t *
6526 ip_m_lookup(t_uscalar_t mac_type)
6527 {
6528 	ip_m_t	*ipm;
6529
6530 	for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
6531 		if (ipm->ip_m_mac_type == mac_type)
6532 			return (ipm);
6533 	return (NULL);
6534 }
6535
6536 /*
6537  * ip_rt_add is called to add an IPv4 route to the forwarding table.
6538  * ipif_arg is passed in to associate it with the correct interface.
6539  * We may need to restart this operation if the ipif cannot be looked up
6540  * due to an exclusive operation that is currently in progress. The restart
6541  * entry point is specified by 'func'.
6542  */
6543 int
6544 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
6545     ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg,
6546     boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func,
6547     struct rtsa_s *sp, ip_stack_t *ipst)
6548 {
6549 	ire_t	*ire;
6550 	ire_t	*gw_ire = NULL;
6551 	ipif_t	*ipif = NULL;
6552 	boolean_t	ipif_refheld = B_FALSE;
6553 	uint_t	type;
6554 	int	match_flags = MATCH_IRE_TYPE;
6555 	int	error;
6556 	tsol_gc_t *gc = NULL;
6557 	tsol_gcgrp_t *gcgrp = NULL;
6558 	boolean_t gcgrp_xtraref = B_FALSE;
6559
6560 	ip1dbg(("ip_rt_add:"));
6561
6562 	if (ire_arg != NULL)
6563 		*ire_arg = NULL;
6564
6565 	/*
6566 	 * If this is the case of RTF_HOST being set, then we set the netmask
6567 	 * to all ones (regardless of whether one was supplied).
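 *
 * For example (hypothetical): a request to add a host route to 192.0.2.7
 * is stored with mask 255.255.255.255 (IP_HOST_MASK) even if the caller
 * supplied 255.255.255.0.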
6568 	 */
6569 	if (flags & RTF_HOST)
6570 		mask = IP_HOST_MASK;
6571
6572 	/*
6573 	 * Prevent routes with a zero gateway from being created (since
6574 	 * interfaces can currently be plumbed and brought up with no
6575 	 * assigned address).
6576 	 */
6577 	if (gw_addr == 0)
6578 		return (ENETUNREACH);
6579 	/*
6580 	 * Get the ipif, if any, corresponding to the gw_addr
6581 	 */
6582 	ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error,
6583 	    ipst);
6584 	if (ipif != NULL) {
6585 		if (IS_VNI(ipif->ipif_ill)) {
6586 			ipif_refrele(ipif);
6587 			return (EINVAL);
6588 		}
6589 		ipif_refheld = B_TRUE;
6590 	} else if (error == EINPROGRESS) {
6591 		ip1dbg(("ip_rt_add: null and EINPROGRESS"));
6592 		return (EINPROGRESS);
6593 	} else {
6594 		error = 0;
6595 	}
6596
6597 	if (ipif != NULL) {
6598 		ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull"));
6599 		ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
6600 	} else {
6601 		ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null"));
6602 	}
6603
6604 	/*
6605 	 * GateD will attempt to create routes with a loopback interface
6606 	 * address as the gateway and with RTF_GATEWAY set. We allow
6607 	 * these routes to be added, but create them as interface routes
6608 	 * since the gateway is an interface address.
6609 	 */
6610 	if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
6611 		flags &= ~RTF_GATEWAY;
6612 		if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
6613 		    mask == IP_HOST_MASK) {
6614 			ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
6615 			    ALL_ZONES, NULL, match_flags, ipst);
6616 			if (ire != NULL) {
6617 				ire_refrele(ire);
6618 				if (ipif_refheld)
6619 					ipif_refrele(ipif);
6620 				return (EEXIST);
6621 			}
6622 			ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
6623 			    "for 0x%x\n", (void *)ipif,
6624 			    ipif->ipif_ire_type,
6625 			    ntohl(ipif->ipif_lcl_addr)));
6626 			ire = ire_create(
6627 			    (uchar_t *)&dst_addr,	/* dest address */
6628 			    (uchar_t *)&mask,		/* mask */
6629 			    (uchar_t *)&ipif->ipif_src_addr,
6630 			    NULL,			/* no gateway */
6631 			    &ipif->ipif_mtu,
6632 			    NULL,
6633 			    ipif->ipif_rq,		/* recv-from queue */
6634 			    NULL,			/* no send-to queue */
6635 			    ipif->ipif_ire_type,	/* LOOPBACK */
6636 			    ipif,
6637 			    0,
6638 			    0,
6639 			    0,
6640 			    (ipif->ipif_flags & IPIF_PRIVATE) ?
6641 			    RTF_PRIVATE : 0,
6642 			    &ire_uinfo_null,
6643 			    NULL,
6644 			    NULL,
6645 			    ipst);
6646
6647 			if (ire == NULL) {
6648 				if (ipif_refheld)
6649 					ipif_refrele(ipif);
6650 				return (ENOMEM);
6651 			}
6652 			error = ire_add(&ire, q, mp, func, B_FALSE);
6653 			if (error == 0)
6654 				goto save_ire;
6655 			if (ipif_refheld)
6656 				ipif_refrele(ipif);
6657 			return (error);
6658
6659 		}
6660 	}
6661
6662 	/*
6663 	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
6664 	 * and the gateway address provided is one of the system's interface
6665 	 * addresses. By using the routing socket interface and supplying an
6666 	 * RTA_IFP sockaddr with an interface index, an alternate method of
6667 	 * specifying an interface route to be created is available which uses
6668 	 * the interface index that specifies the outgoing interface rather than
6669 	 * the address of an outgoing interface (which may not be able to
6670 	 * uniquely identify an interface). When coupled with the RTF_GATEWAY
6671 	 * flag, routes can be specified which not only specify the next-hop to
6672 	 * be used when routing to a certain prefix, but also which outgoing
6673 	 * interface should be used.
6674 * 6675 * Previously, interfaces would have unique addresses assigned to them 6676 * and so the address assigned to a particular interface could be used 6677 * to identify a particular interface. One exception to this was the 6678 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6679 * 6680 * With the advent of IPv6 and its link-local addresses, this 6681 * restriction was relaxed and interfaces could share addresses between 6682 * themselves. In fact, typically all of the link-local interfaces on 6683 * an IPv6 node or router will have the same link-local address. In 6684 * order to differentiate between these interfaces, the use of an 6685 * interface index is necessary and this index can be carried inside a 6686 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6687 * of using the interface index, however, is that all of the ipif's that 6688 * are part of an ill have the same index and so the RTA_IFP sockaddr 6689 * cannot be used to differentiate between ipif's (or logical 6690 * interfaces) that belong to the same ill (physical interface). 6691 * 6692 * For example, in the following case involving IPv4 interfaces and 6693 * logical interfaces 6694 * 6695 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6696 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6697 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6698 * 6699 * the ipif's corresponding to each of these interface routes can be 6700 * uniquely identified by the "gateway" (actually interface address). 6701 * 6702 * In this case involving multiple IPv6 default routes to a particular 6703 * link-local gateway, the use of RTA_IFP is necessary to specify which 6704 * default route is of interest: 6705 * 6706 * default fe80::123:4567:89ab:cdef U if0 6707 * default fe80::123:4567:89ab:cdef U if1 6708 */ 6709 6710 /* RTF_GATEWAY not set */ 6711 if (!(flags & RTF_GATEWAY)) { 6712 queue_t *stq; 6713 6714 if (sp != NULL) { 6715 ip2dbg(("ip_rt_add: gateway security attributes " 6716 "cannot be set with interface route\n")); 6717 if (ipif_refheld) 6718 ipif_refrele(ipif); 6719 return (EINVAL); 6720 } 6721 6722 /* 6723 * As the interface index specified with the RTA_IFP sockaddr is 6724 * the same for all ipif's off of an ill, the matching logic 6725 * below uses MATCH_IRE_ILL if such an index was specified. 6726 * This means that routes sharing the same prefix when added 6727 * using a RTA_IFP sockaddr must have distinct interface 6728 * indices (namely, they must be on distinct ill's). 6729 * 6730 * On the other hand, since the gateway address will usually be 6731 * different for each ipif on the system, the matching logic 6732 * uses MATCH_IRE_IPIF in the case of a traditional interface 6733 * route. This means that interface routes for the same prefix 6734 * can be created if they belong to distinct ipif's and if a 6735 * RTA_IFP sockaddr is not present. 6736 */ 6737 if (ipif_arg != NULL) { 6738 if (ipif_refheld) { 6739 ipif_refrele(ipif); 6740 ipif_refheld = B_FALSE; 6741 } 6742 ipif = ipif_arg; 6743 match_flags |= MATCH_IRE_ILL; 6744 } else { 6745 /* 6746 * Check the ipif corresponding to the gw_addr 6747 */ 6748 if (ipif == NULL) 6749 return (ENETUNREACH); 6750 match_flags |= MATCH_IRE_IPIF; 6751 } 6752 ASSERT(ipif != NULL); 6753 6754 /* 6755 * We check for an existing entry at this point. 6756 * 6757 * Since a netmask isn't passed in via the ioctl interface 6758 * (SIOCADDRT), we don't check for a matching netmask in that 6759 * case. 
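 *
 * To illustrate (hypothetical): a routing-socket request to add
 * 192.0.2.0 mask 255.255.255.0 only collides with an existing entry of
 * the same mask, while an SIOCADDRT request for 192.0.2.0 collides with
 * any existing interface route for that prefix, whatever its mask.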
6760 		 */
6761 		if (!ioctl_msg)
6762 			match_flags |= MATCH_IRE_MASK;
6763 		ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif,
6764 		    NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
6765 		if (ire != NULL) {
6766 			ire_refrele(ire);
6767 			if (ipif_refheld)
6768 				ipif_refrele(ipif);
6769 			return (EEXIST);
6770 		}
6771
6772 		stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
6773 		    ? ipif->ipif_rq : ipif->ipif_wq;
6774
6775 		/*
6776 		 * Create a copy of the IRE_LOOPBACK,
6777 		 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with
6778 		 * the modified address and netmask.
6779 		 */
6780 		ire = ire_create(
6781 		    (uchar_t *)&dst_addr,
6782 		    (uint8_t *)&mask,
6783 		    (uint8_t *)&ipif->ipif_src_addr,
6784 		    NULL,
6785 		    &ipif->ipif_mtu,
6786 		    NULL,
6787 		    NULL,
6788 		    stq,
6789 		    ipif->ipif_net_type,
6790 		    ipif,
6791 		    0,
6792 		    0,
6793 		    0,
6794 		    flags,
6795 		    &ire_uinfo_null,
6796 		    NULL,
6797 		    NULL,
6798 		    ipst);
6799 		if (ire == NULL) {
6800 			if (ipif_refheld)
6801 				ipif_refrele(ipif);
6802 			return (ENOMEM);
6803 		}
6804
6805 		/*
6806 		 * Some software (for example, GateD and Sun Cluster) attempts
6807 		 * to create (what amount to) IRE_PREFIX routes with the
6808 		 * loopback address as the gateway. This is primarily done to
6809 		 * set up prefixes with the RTF_REJECT flag set (for example,
6810 		 * when generating aggregate routes.)
6811 		 *
6812 		 * If the IRE type (as defined by ipif->ipif_net_type) is
6813 		 * IRE_LOOPBACK, then we map the request into a
6814 		 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
6815 		 * these interface routes, by definition, can only be that.
6816 		 *
6817 		 * Needless to say, the real IRE_LOOPBACK is NOT created by this
6818 		 * routine, but rather using ire_create() directly.
6819 		 *
6820 		 */
6821 		if (ipif->ipif_net_type == IRE_LOOPBACK) {
6822 			ire->ire_type = IRE_IF_NORESOLVER;
6823 			ire->ire_flags |= RTF_BLACKHOLE;
6824 		}
6825
6826 		error = ire_add(&ire, q, mp, func, B_FALSE);
6827 		if (error == 0)
6828 			goto save_ire;
6829
6830 		/*
6831 		 * In the event of failure, ire_add() will have already
6832 		 * deleted the ire in question, so there is no need to
6833 		 * do that here.
6834 		 */
6835 		if (ipif_refheld)
6836 			ipif_refrele(ipif);
6837 		return (error);
6838 	}
6839 	if (ipif_refheld) {
6840 		ipif_refrele(ipif);
6841 		ipif_refheld = B_FALSE;
6842 	}
6843
6844 	/*
6845 	 * Get an interface IRE for the specified gateway.
6846 	 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
6847 	 * gateway, it is currently unreachable and we fail the request
6848 	 * accordingly.
6849 	 */
6850 	ipif = ipif_arg;
6851 	if (ipif_arg != NULL)
6852 		match_flags |= MATCH_IRE_ILL;
6853 again:
6854 	gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL,
6855 	    ALL_ZONES, 0, NULL, match_flags, ipst);
6856 	if (gw_ire == NULL) {
6857 		/*
6858 		 * With IPMP, we allow host routes to influence in.mpathd's
6859 		 * target selection. However, if the test addresses are on
6860 		 * their own network, the above lookup will fail since the
6861 		 * underlying IRE_INTERFACEs are marked hidden. So allow
6862 		 * hidden test IREs to be found and try again.
6863 		 */
6864 		if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) {
6865 			match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
6866 			goto again;
6867 		}
6868 		return (ENETUNREACH);
6869 	}
6870
6871 	/*
6872 	 * We create one of three types of IREs as a result of this request
6873 	 * based on the netmask. A netmask of all ones (which is automatically
6874 	 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
6875 	 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
6876 	 * created.
Otherwise, an IRE_PREFIX route is created for the
6877  * destination prefix.
6878  */
6879 	if (mask == IP_HOST_MASK)
6880 		type = IRE_HOST;
6881 	else if (mask == 0)
6882 		type = IRE_DEFAULT;
6883 	else
6884 		type = IRE_PREFIX;
6885
6886 	/* check for a duplicate entry */
6887 	ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
6888 	    NULL, ALL_ZONES, 0, NULL,
6889 	    match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst);
6890 	if (ire != NULL) {
6891 		ire_refrele(gw_ire);
6892 		ire_refrele(ire);
6893 		return (EEXIST);
6894 	}
6895
6896 	/* Security attribute exists */
6897 	if (sp != NULL) {
6898 		tsol_gcgrp_addr_t ga;
6899
6900 		/* find or create the gateway credentials group */
6901 		ga.ga_af = AF_INET;
6902 		IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);
6903
6904 		/* we hold reference to it upon success */
6905 		gcgrp = gcgrp_lookup(&ga, B_TRUE);
6906 		if (gcgrp == NULL) {
6907 			ire_refrele(gw_ire);
6908 			return (ENOMEM);
6909 		}
6910
6911 		/*
6912 		 * Create and add the security attribute to the group; a
6913 		 * reference to the group is made upon allocating a new
6914 		 * entry successfully. If it finds an already-existing
6915 		 * entry for the security attribute in the group, it simply
6916 		 * returns it and no new reference is made to the group.
6917 		 */
6918 		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
6919 		if (gc == NULL) {
6920 			/* release reference held by gcgrp_lookup */
6921 			GCGRP_REFRELE(gcgrp);
6922 			ire_refrele(gw_ire);
6923 			return (ENOMEM);
6924 		}
6925 	}
6926
6927 	/* Create the IRE. */
6928 	ire = ire_create(
6929 	    (uchar_t *)&dst_addr,		/* dest address */
6930 	    (uchar_t *)&mask,			/* mask */
6931 	    /* src address assigned by the caller? */
6932 	    (uchar_t *)(((src_addr != INADDR_ANY) &&
6933 	    (flags & RTF_SETSRC)) ? &src_addr : NULL),
6934 	    (uchar_t *)&gw_addr,		/* gateway address */
6935 	    &gw_ire->ire_max_frag,
6936 	    NULL,				/* no src nce */
6937 	    NULL,				/* no recv-from queue */
6938 	    NULL,				/* no send-to queue */
6939 	    (ushort_t)type,			/* IRE type */
6940 	    ipif_arg,
6941 	    0,
6942 	    0,
6943 	    0,
6944 	    flags,
6945 	    &gw_ire->ire_uinfo,			/* Inherit ULP info from gw */
6946 	    gc,					/* security attribute */
6947 	    NULL,
6948 	    ipst);
6949
6950 	/*
6951 	 * The ire holds a reference to the 'gc' and the 'gc' holds a
6952 	 * reference to the 'gcgrp'. We can now release the extra reference
6953 	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
6954 	 */
6955 	if (gcgrp_xtraref)
6956 		GCGRP_REFRELE(gcgrp);
6957 	if (ire == NULL) {
6958 		if (gc != NULL)
6959 			GC_REFRELE(gc);
6960 		ire_refrele(gw_ire);
6961 		return (ENOMEM);
6962 	}
6963
6964 	/*
6965 	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
6966 	 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
6967 	 */
6968
6969 	/* Add the new IRE. */
6970 	error = ire_add(&ire, q, mp, func, B_FALSE);
6971 	if (error != 0) {
6972 		/*
6973 		 * In the event of failure, ire_add() will have already
6974 		 * deleted the ire in question, so there is no need to
6975 		 * do that here.
6976 		 */
6977 		ire_refrele(gw_ire);
6978 		return (error);
6979 	}
6980
6981 	if (flags & RTF_MULTIRT) {
6982 		/*
6983 		 * Invoke the CGTP (multirouting) filtering module
6984 		 * to add the dst address in the filtering database.
6985 		 * Replicated inbound packets coming from that address
6986 		 * will be filtered to discard the duplicates.
6987 		 * It is not necessary to call the CGTP filter hook
6988 		 * when the dst address is a broadcast or multicast,
6989 		 * because an IP source address cannot be a broadcast
6990 		 * or a multicast.
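		 *
		 * For example (hypothetical): an RTF_MULTIRT route to
		 * 192.0.2.9/32 registers 192.0.2.9 with the CGTP filter
		 * below, so replicated inbound copies arriving from that
		 * address are discarded as duplicates.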
6991 */ 6992 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 6993 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 6994 if (ire_dst != NULL) { 6995 ip_cgtp_bcast_add(ire, ire_dst, ipst); 6996 ire_refrele(ire_dst); 6997 goto save_ire; 6998 } 6999 if (ipst->ips_ip_cgtp_filter_ops != NULL && 7000 !CLASSD(ire->ire_addr)) { 7001 int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4( 7002 ipst->ips_netstack->netstack_stackid, 7003 ire->ire_addr, 7004 ire->ire_gateway_addr, 7005 ire->ire_src_addr, 7006 gw_ire->ire_src_addr); 7007 if (res != 0) { 7008 ire_refrele(gw_ire); 7009 ire_delete(ire); 7010 return (res); 7011 } 7012 } 7013 } 7014 7015 /* 7016 * Now that the prefix IRE entry has been created, delete any 7017 * existing gateway IRE cache entries as well as any IRE caches 7018 * using the gateway, and force them to be created through 7019 * ip_newroute. 7020 */ 7021 if (gc != NULL) { 7022 ASSERT(gcgrp != NULL); 7023 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst); 7024 } 7025 7026 save_ire: 7027 if (gw_ire != NULL) { 7028 ire_refrele(gw_ire); 7029 } 7030 if (ipif != NULL) { 7031 /* 7032 * Save enough information so that we can recreate the IRE if 7033 * the interface goes down and then up. The metrics associated 7034 * with the route will be saved as well when rts_setmetrics() is 7035 * called after the IRE has been created. In the case where 7036 * memory cannot be allocated, none of this information will be 7037 * saved. 7038 */ 7039 ipif_save_ire(ipif, ire); 7040 } 7041 if (ioctl_msg) 7042 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 7043 if (ire_arg != NULL) { 7044 /* 7045 * Store the ire that was successfully added into where ire_arg 7046 * points to so that callers don't have to look it up 7047 * themselves (but they are responsible for ire_refrele()ing 7048 * the ire when they are finished with it). 7049 */ 7050 *ire_arg = ire; 7051 } else { 7052 ire_refrele(ire); /* Held in ire_add */ 7053 } 7054 if (ipif_refheld) 7055 ipif_refrele(ipif); 7056 return (0); 7057 } 7058 7059 /* 7060 * ip_rt_delete is called to delete an IPv4 route. 7061 * ipif_arg is passed in to associate it with the correct interface. 7062 * We may need to restart this operation if the ipif cannot be looked up 7063 * due to an exclusive operation that is currently in progress. The restart 7064 * entry point is specified by 'func' 7065 */ 7066 /* ARGSUSED4 */ 7067 int 7068 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 7069 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg, 7070 queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) 7071 { 7072 ire_t *ire = NULL; 7073 ipif_t *ipif; 7074 boolean_t ipif_refheld = B_FALSE; 7075 uint_t type; 7076 uint_t match_flags = MATCH_IRE_TYPE; 7077 int err = 0; 7078 7079 ip1dbg(("ip_rt_delete:")); 7080 /* 7081 * If this is the case of RTF_HOST being set, then we set the netmask 7082 * to all ones. Otherwise, we use the netmask if one was supplied. 7083 */ 7084 if (flags & RTF_HOST) { 7085 mask = IP_HOST_MASK; 7086 match_flags |= MATCH_IRE_MASK; 7087 } else if (rtm_addrs & RTA_NETMASK) { 7088 match_flags |= MATCH_IRE_MASK; 7089 } 7090 7091 /* 7092 * Note that RTF_GATEWAY is never set on a delete, therefore 7093 * we check if the gateway address is one of our interfaces first, 7094 * and fall back on RTF_GATEWAY routes. 7095 * 7096 * This makes it possible to delete an original 7097 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 
7098  *
7099  * As the interface index specified with the RTA_IFP sockaddr is the
7100  * same for all ipif's off of an ill, the matching logic below uses
7101  * MATCH_IRE_ILL if such an index was specified. This means a route
7102  * sharing the same prefix and interface index as the route
7103  * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
7104  * is specified in the request.
7105  *
7106  * On the other hand, since the gateway address will usually be
7107  * different for each ipif on the system, the matching logic
7108  * uses MATCH_IRE_IPIF in the case of a traditional interface
7109  * route. This means that interface routes for the same prefix can be
7110  * uniquely identified if they belong to distinct ipif's and if a
7111  * RTA_IFP sockaddr is not present.
7112  *
7113  * For more detail on specifying routes by gateway address and by
7114  * interface index, see the comments in ip_rt_add().
7115  */
7116 	ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err,
7117 	    ipst);
7118 	if (ipif != NULL)
7119 		ipif_refheld = B_TRUE;
7120 	else if (err == EINPROGRESS)
7121 		return (err);
7122 	else
7123 		err = 0;
7124 	if (ipif != NULL) {
7125 		if (ipif_arg != NULL) {
7126 			if (ipif_refheld) {
7127 				ipif_refrele(ipif);
7128 				ipif_refheld = B_FALSE;
7129 			}
7130 			ipif = ipif_arg;
7131 			match_flags |= MATCH_IRE_ILL;
7132 		} else {
7133 			match_flags |= MATCH_IRE_IPIF;
7134 		}
7135 		if (ipif->ipif_ire_type == IRE_LOOPBACK) {
7136 			ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
7137 			    ALL_ZONES, NULL, match_flags, ipst);
7138 		}
7139 		if (ire == NULL) {
7140 			ire = ire_ftable_lookup(dst_addr, mask, 0,
7141 			    IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
7142 			    match_flags, ipst);
7143 		}
7144 	}
7145
7146 	if (ire == NULL) {
7147 		/*
7148 		 * At this point, the gateway address is not one of our own
7149 		 * addresses or a matching interface route was not found. We
7150 		 * set the IRE type to look up based on whether
7151 		 * this is a host route, a default route or just a prefix.
7152 		 *
7153 		 * If an ipif_arg was passed in, then the lookup is based on an
7154 		 * interface index so MATCH_IRE_ILL is added to match_flags.
7155 		 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is
7156 		 * set as the route being looked up is not a traditional
7157 		 * interface route.
7158 		 */
7159 		match_flags &= ~MATCH_IRE_IPIF;
7160 		match_flags |= MATCH_IRE_GW;
7161 		if (ipif_arg != NULL)
7162 			match_flags |= MATCH_IRE_ILL;
7163 		if (mask == IP_HOST_MASK)
7164 			type = IRE_HOST;
7165 		else if (mask == 0)
7166 			type = IRE_DEFAULT;
7167 		else
7168 			type = IRE_PREFIX;
7169 		ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
7170 		    NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
7171 	}
7172
7173 	if (ipif_refheld)
7174 		ipif_refrele(ipif);
7175
7176 	/* ipif is not refheld anymore */
7177 	if (ire == NULL)
7178 		return (ESRCH);
7179
7180 	if (ire->ire_flags & RTF_MULTIRT) {
7181 		/*
7182 		 * Invoke the CGTP (multirouting) filtering module
7183 		 * to remove the dst address from the filtering database.
7184 		 * Packets coming from that address will no longer be
7185 		 * filtered to remove duplicates.
7186 */ 7187 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 7188 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 7189 ipst->ips_netstack->netstack_stackid, 7190 ire->ire_addr, ire->ire_gateway_addr); 7191 } 7192 ip_cgtp_bcast_delete(ire, ipst); 7193 } 7194 7195 ipif = ire->ire_ipif; 7196 if (ipif != NULL) 7197 ipif_remove_ire(ipif, ire); 7198 if (ioctl_msg) 7199 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 7200 ire_delete(ire); 7201 ire_refrele(ire); 7202 return (err); 7203 } 7204 7205 /* 7206 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 7207 */ 7208 /* ARGSUSED */ 7209 int 7210 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7211 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7212 { 7213 ipaddr_t dst_addr; 7214 ipaddr_t gw_addr; 7215 ipaddr_t mask; 7216 int error = 0; 7217 mblk_t *mp1; 7218 struct rtentry *rt; 7219 ipif_t *ipif = NULL; 7220 ip_stack_t *ipst; 7221 7222 ASSERT(q->q_next == NULL); 7223 ipst = CONNQ_TO_IPST(q); 7224 7225 ip1dbg(("ip_siocaddrt:")); 7226 /* Existence of mp1 verified in ip_wput_nondata */ 7227 mp1 = mp->b_cont->b_cont; 7228 rt = (struct rtentry *)mp1->b_rptr; 7229 7230 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7231 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7232 7233 /* 7234 * If the RTF_HOST flag is on, this is a request to assign a gateway 7235 * to a particular host address. In this case, we set the netmask to 7236 * all ones for the particular destination address. Otherwise, 7237 * determine the netmask to be used based on dst_addr and the interfaces 7238 * in use. 7239 */ 7240 if (rt->rt_flags & RTF_HOST) { 7241 mask = IP_HOST_MASK; 7242 } else { 7243 /* 7244 * Note that ip_subnet_mask returns a zero mask in the case of 7245 * default (an all-zeroes address). 7246 */ 7247 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7248 } 7249 7250 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7251 B_TRUE, q, mp, ip_process_ioctl, NULL, ipst); 7252 if (ipif != NULL) 7253 ipif_refrele(ipif); 7254 return (error); 7255 } 7256 7257 /* 7258 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 7259 */ 7260 /* ARGSUSED */ 7261 int 7262 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7263 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7264 { 7265 ipaddr_t dst_addr; 7266 ipaddr_t gw_addr; 7267 ipaddr_t mask; 7268 int error; 7269 mblk_t *mp1; 7270 struct rtentry *rt; 7271 ipif_t *ipif = NULL; 7272 ip_stack_t *ipst; 7273 7274 ASSERT(q->q_next == NULL); 7275 ipst = CONNQ_TO_IPST(q); 7276 7277 ip1dbg(("ip_siocdelrt:")); 7278 /* Existence of mp1 verified in ip_wput_nondata */ 7279 mp1 = mp->b_cont->b_cont; 7280 rt = (struct rtentry *)mp1->b_rptr; 7281 7282 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7283 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7284 7285 /* 7286 * If the RTF_HOST flag is on, this is a request to delete a gateway 7287 * to a particular host address. In this case, we set the netmask to 7288 * all ones for the particular destination address. Otherwise, 7289 * determine the netmask to be used based on dst_addr and the interfaces 7290 * in use. 7291 */ 7292 if (rt->rt_flags & RTF_HOST) { 7293 mask = IP_HOST_MASK; 7294 } else { 7295 /* 7296 * Note that ip_subnet_mask returns a zero mask in the case of 7297 * default (an all-zeroes address). 
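 *
 * For example (hypothetical): deleting the default route passes dst_addr
 * 0.0.0.0 without RTF_HOST, ip_subnet_mask() then yields a zero mask, and
 * ip_rt_delete() consequently looks up an IRE_DEFAULT entry.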
7298 */ 7299 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7300 } 7301 7302 error = ip_rt_delete(dst_addr, mask, gw_addr, 7303 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q, 7304 mp, ip_process_ioctl, ipst); 7305 if (ipif != NULL) 7306 ipif_refrele(ipif); 7307 return (error); 7308 } 7309 7310 /* 7311 * Enqueue the mp onto the ipsq, chained by b_next. 7312 * b_prev stores the function to be executed later, and b_queue the queue 7313 * where this mp originated. 7314 */ 7315 void 7316 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7317 ill_t *pending_ill) 7318 { 7319 conn_t *connp; 7320 ipxop_t *ipx = ipsq->ipsq_xop; 7321 7322 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7323 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 7324 ASSERT(func != NULL); 7325 7326 mp->b_queue = q; 7327 mp->b_prev = (void *)func; 7328 mp->b_next = NULL; 7329 7330 switch (type) { 7331 case CUR_OP: 7332 if (ipx->ipx_mptail != NULL) { 7333 ASSERT(ipx->ipx_mphead != NULL); 7334 ipx->ipx_mptail->b_next = mp; 7335 } else { 7336 ASSERT(ipx->ipx_mphead == NULL); 7337 ipx->ipx_mphead = mp; 7338 } 7339 ipx->ipx_mptail = mp; 7340 break; 7341 7342 case NEW_OP: 7343 if (ipsq->ipsq_xopq_mptail != NULL) { 7344 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7345 ipsq->ipsq_xopq_mptail->b_next = mp; 7346 } else { 7347 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7348 ipsq->ipsq_xopq_mphead = mp; 7349 } 7350 ipsq->ipsq_xopq_mptail = mp; 7351 ipx->ipx_ipsq_queued = B_TRUE; 7352 break; 7353 7354 case SWITCH_OP: 7355 ASSERT(ipsq->ipsq_swxop != NULL); 7356 /* only one switch operation is currently allowed */ 7357 ASSERT(ipsq->ipsq_switch_mp == NULL); 7358 ipsq->ipsq_switch_mp = mp; 7359 ipx->ipx_ipsq_queued = B_TRUE; 7360 break; 7361 default: 7362 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7363 } 7364 7365 if (CONN_Q(q) && pending_ill != NULL) { 7366 connp = Q_TO_CONN(q); 7367 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7368 connp->conn_oper_pending_ill = pending_ill; 7369 } 7370 } 7371 7372 /* 7373 * Dequeue the next message that requested exclusive access to this IPSQ's 7374 * xop. Specifically: 7375 * 7376 * 1. If we're still processing the current operation on `ipsq', then 7377 * dequeue the next message for the operation (from ipx_mphead), or 7378 * return NULL if there are no queued messages for the operation. 7379 * These messages are queued via CUR_OP to qwriter_ip() and friends. 7380 * 7381 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 7382 * not set) see if the ipsq has requested an xop switch. If so, switch 7383 * `ipsq' to a different xop. Xop switches only happen when joining or 7384 * leaving IPMP groups and require a careful dance -- see the comments 7385 * in-line below for details. If we're leaving a group xop or if we're 7386 * joining a group xop and become writer on it, then we proceed to (3). 7387 * Otherwise, we return NULL and exit the xop. 7388 * 7389 * 3. For each IPSQ in the xop, return any switch operation stored on 7390 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 7391 * any other messages queued on the IPSQ. Otherwise, dequeue the next 7392 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 7393 * Note that if the phyint tied to `ipsq' is not using IPMP there will 7394 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 7395 * each phyint in the group, including the IPMP meta-interface phyint. 
7396 */ 7397 static mblk_t * 7398 ipsq_dq(ipsq_t *ipsq) 7399 { 7400 ill_t *illv4, *illv6; 7401 mblk_t *mp; 7402 ipsq_t *xopipsq; 7403 ipsq_t *leftipsq = NULL; 7404 ipxop_t *ipx; 7405 phyint_t *phyi = ipsq->ipsq_phyint; 7406 ip_stack_t *ipst = ipsq->ipsq_ipst; 7407 boolean_t emptied = B_FALSE; 7408 7409 /* 7410 * Grab all the locks we need in the defined order (ill_g_lock -> 7411 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 7412 */ 7413 rw_enter(&ipst->ips_ill_g_lock, 7414 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER); 7415 mutex_enter(&ipsq->ipsq_lock); 7416 ipx = ipsq->ipsq_xop; 7417 mutex_enter(&ipx->ipx_lock); 7418 7419 /* 7420 * Dequeue the next message associated with the current exclusive 7421 * operation, if any. 7422 */ 7423 if ((mp = ipx->ipx_mphead) != NULL) { 7424 ipx->ipx_mphead = mp->b_next; 7425 if (ipx->ipx_mphead == NULL) 7426 ipx->ipx_mptail = NULL; 7427 mp->b_next = (void *)ipsq; 7428 goto out; 7429 } 7430 7431 if (ipx->ipx_current_ipif != NULL) 7432 goto empty; 7433 7434 if (ipsq->ipsq_swxop != NULL) { 7435 /* 7436 * The exclusive operation that is now being completed has 7437 * requested a switch to a different xop. This happens 7438 * when an interface joins or leaves an IPMP group. Joins 7439 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 7440 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 7441 * (phyint_free()), or interface plumb for an ill type 7442 * not in the IPMP group (ip_rput_dlpi_writer()). 7443 * 7444 * Xop switches are not allowed on the IPMP meta-interface. 7445 */ 7446 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 7447 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 7448 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 7449 7450 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 7451 /* 7452 * We're switching back to our own xop, so we have two 7453 * xop's to drain/exit: our own, and the group xop 7454 * that we are leaving. 7455 * 7456 * First, pull ourselves out of the group ipsq list. 7457 * This is safe since we're writer on ill_g_lock. 7458 */ 7459 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 7460 7461 xopipsq = ipx->ipx_ipsq; 7462 while (xopipsq->ipsq_next != ipsq) 7463 xopipsq = xopipsq->ipsq_next; 7464 7465 xopipsq->ipsq_next = ipsq->ipsq_next; 7466 ipsq->ipsq_next = ipsq; 7467 ipsq->ipsq_xop = ipsq->ipsq_swxop; 7468 ipsq->ipsq_swxop = NULL; 7469 7470 /* 7471 * Second, prepare to exit the group xop. The actual 7472 * ipsq_exit() is done at the end of this function 7473 * since we cannot hold any locks across ipsq_exit(). 7474 * Note that although we drop the group's ipx_lock, no 7475 * threads can proceed since we're still ipx_writer. 7476 */ 7477 leftipsq = xopipsq; 7478 mutex_exit(&ipx->ipx_lock); 7479 7480 /* 7481 * Third, set ipx to point to our own xop (which was 7482 * inactive and therefore can be entered). 7483 */ 7484 ipx = ipsq->ipsq_xop; 7485 mutex_enter(&ipx->ipx_lock); 7486 ASSERT(ipx->ipx_writer == NULL); 7487 ASSERT(ipx->ipx_current_ipif == NULL); 7488 } else { 7489 /* 7490 * We're switching from our own xop to a group xop. 7491 * The requestor of the switch must ensure that the 7492 * group xop cannot go away (e.g. by ensuring the 7493 * phyint associated with the xop cannot go away). 7494 * 7495 * If we can become writer on our new xop, then we'll 7496 * do the drain. Otherwise, the current writer of our 7497 * new xop will do the drain when it exits. 7498 * 7499 * First, splice ourselves into the group IPSQ list. 7500 * This is safe since we're writer on ill_g_lock. 
7501 */ 7502 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 7503 7504 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 7505 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 7506 xopipsq = xopipsq->ipsq_next; 7507 7508 xopipsq->ipsq_next = ipsq; 7509 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 7510 ipsq->ipsq_xop = ipsq->ipsq_swxop; 7511 ipsq->ipsq_swxop = NULL; 7512 7513 /* 7514 * Second, exit our own xop, since it's now unused. 7515 * This is safe since we've got the only reference. 7516 */ 7517 ASSERT(ipx->ipx_writer == curthread); 7518 ipx->ipx_writer = NULL; 7519 VERIFY(--ipx->ipx_reentry_cnt == 0); 7520 ipx->ipx_ipsq_queued = B_FALSE; 7521 mutex_exit(&ipx->ipx_lock); 7522 7523 /* 7524 * Third, set ipx to point to our new xop, and check 7525 * if we can become writer on it. If we cannot, then 7526 * the current writer will drain the IPSQ group when 7527 * it exits. Our ipsq_xop is guaranteed to be stable 7528 * because we're still holding ipsq_lock. 7529 */ 7530 ipx = ipsq->ipsq_xop; 7531 mutex_enter(&ipx->ipx_lock); 7532 if (ipx->ipx_writer != NULL || 7533 ipx->ipx_current_ipif != NULL) { 7534 goto out; 7535 } 7536 } 7537 7538 /* 7539 * Fourth, become writer on our new ipx before we continue 7540 * with the drain. Note that we never dropped ipsq_lock 7541 * above, so no other thread could've raced with us to 7542 * become writer first. Also, we're holding ipx_lock, so 7543 * no other thread can examine the ipx right now. 7544 */ 7545 ASSERT(ipx->ipx_current_ipif == NULL); 7546 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 7547 VERIFY(ipx->ipx_reentry_cnt++ == 0); 7548 ipx->ipx_writer = curthread; 7549 ipx->ipx_forced = B_FALSE; 7550 #ifdef DEBUG 7551 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7552 #endif 7553 } 7554 7555 xopipsq = ipsq; 7556 do { 7557 /* 7558 * So that other operations operate on a consistent and 7559 * complete phyint, a switch message on an IPSQ must be 7560 * handled prior to any other operations on that IPSQ. 7561 */ 7562 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 7563 xopipsq->ipsq_switch_mp = NULL; 7564 ASSERT(mp->b_next == NULL); 7565 mp->b_next = (void *)xopipsq; 7566 goto out; 7567 } 7568 7569 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 7570 xopipsq->ipsq_xopq_mphead = mp->b_next; 7571 if (xopipsq->ipsq_xopq_mphead == NULL) 7572 xopipsq->ipsq_xopq_mptail = NULL; 7573 mp->b_next = (void *)xopipsq; 7574 goto out; 7575 } 7576 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 7577 empty: 7578 /* 7579 * There are no messages. Further, we are holding ipx_lock, hence no 7580 * new messages can end up on any IPSQ in the xop. 7581 */ 7582 ipx->ipx_writer = NULL; 7583 ipx->ipx_forced = B_FALSE; 7584 VERIFY(--ipx->ipx_reentry_cnt == 0); 7585 ipx->ipx_ipsq_queued = B_FALSE; 7586 emptied = B_TRUE; 7587 #ifdef DEBUG 7588 ipx->ipx_depth = 0; 7589 #endif 7590 out: 7591 mutex_exit(&ipx->ipx_lock); 7592 mutex_exit(&ipsq->ipsq_lock); 7593 7594 /* 7595 * If we completely emptied the xop, then wake up any threads waiting 7596 * to enter any of the IPSQ's associated with it. 
7597 */ 7598 if (emptied) { 7599 xopipsq = ipsq; 7600 do { 7601 if ((phyi = xopipsq->ipsq_phyint) == NULL) 7602 continue; 7603 7604 illv4 = phyi->phyint_illv4; 7605 illv6 = phyi->phyint_illv6; 7606 7607 GRAB_ILL_LOCKS(illv4, illv6); 7608 if (illv4 != NULL) 7609 cv_broadcast(&illv4->ill_cv); 7610 if (illv6 != NULL) 7611 cv_broadcast(&illv6->ill_cv); 7612 RELEASE_ILL_LOCKS(illv4, illv6); 7613 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 7614 } 7615 rw_exit(&ipst->ips_ill_g_lock); 7616 7617 /* 7618 * Now that all locks are dropped, exit the IPSQ we left. 7619 */ 7620 if (leftipsq != NULL) 7621 ipsq_exit(leftipsq); 7622 7623 return (mp); 7624 } 7625 7626 /* 7627 * Return completion status of previously initiated DLPI operations on 7628 * ills in the purview of an ipsq. 7629 */ 7630 static boolean_t 7631 ipsq_dlpi_done(ipsq_t *ipsq) 7632 { 7633 ipsq_t *ipsq_start; 7634 phyint_t *phyi; 7635 ill_t *ill; 7636 7637 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock)); 7638 ipsq_start = ipsq; 7639 7640 do { 7641 /* 7642 * The only current users of this function are ipsq_try_enter 7643 * and ipsq_enter, which have made sure that ipsq_writer is 7644 * NULL before we reach here. ill_dlpi_pending is modified 7645 * only by an ipsq writer. 7646 */ 7647 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL); 7648 phyi = ipsq->ipsq_phyint; 7649 /* 7650 * phyi could be NULL if a phyint that is part of an 7651 * IPMP group is being unplumbed. A more detailed 7652 * comment is in ipmp_grp_update_kstats(). 7653 */ 7654 if (phyi != NULL) { 7655 ill = phyi->phyint_illv4; 7656 if (ill != NULL && 7657 ill->ill_dlpi_pending != DL_PRIM_INVAL) 7658 return (B_FALSE); 7659 7660 ill = phyi->phyint_illv6; 7661 if (ill != NULL && 7662 ill->ill_dlpi_pending != DL_PRIM_INVAL) 7663 return (B_FALSE); 7664 } 7665 7666 } while ((ipsq = ipsq->ipsq_next) != ipsq_start); 7667 7668 return (B_TRUE); 7669 } 7670 7671 /* 7672 * Enter the ipsq corresponding to ill, by waiting synchronously till 7673 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq 7674 * will have to drain completely before ipsq_enter returns success. 7675 * ipx_current_ipif will be set if some exclusive op is in progress, 7676 * and the ipsq_exit logic will start the next enqueued op after 7677 * completion of the current op. If 'force' is used, we don't wait 7678 * for the enqueued ops. This is needed when a conn_close wants to 7679 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb 7680 * of an ill can also use this option, but we don't use it currently. 7681 */ 7682 #define ENTER_SQ_WAIT_TICKS 100 7683 boolean_t 7684 ipsq_enter(ill_t *ill, boolean_t force, int type) 7685 { 7686 ipsq_t *ipsq; 7687 ipxop_t *ipx; 7688 boolean_t waited_enough = B_FALSE; 7689 ip_stack_t *ipst = ill->ill_ipst; 7690 7691 /* 7692 * Note that the relationship between ill and ipsq is fixed as long as 7693 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 7694 * relationship between the IPSQ and xop cannot change. However, 7695 * since we cannot hold ipsq_lock across the cv_wait(), it may change 7696 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 7697 * waking up all ills in the xop when it becomes available.
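 *
 * Hence the retry loop below: after each wakeup we re-derive the ipsq
 * from ill_phyint and the xop from the ipsq, under the locks, before
 * re-testing whether we can become writer.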
7698 */ 7699 for (;;) { 7700 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7701 mutex_enter(&ill->ill_lock); 7702 if (ill->ill_state_flags & ILL_CONDEMNED) { 7703 mutex_exit(&ill->ill_lock); 7704 rw_exit(&ipst->ips_ill_g_lock); 7705 return (B_FALSE); 7706 } 7707 7708 ipsq = ill->ill_phyint->phyint_ipsq; 7709 mutex_enter(&ipsq->ipsq_lock); 7710 ipx = ipsq->ipsq_xop; 7711 mutex_enter(&ipx->ipx_lock); 7712 7713 if (ipx->ipx_writer == NULL && (type == CUR_OP || 7714 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 7715 waited_enough)) 7716 break; 7717 7718 rw_exit(&ipst->ips_ill_g_lock); 7719 7720 if (!force || ipx->ipx_writer != NULL) { 7721 mutex_exit(&ipx->ipx_lock); 7722 mutex_exit(&ipsq->ipsq_lock); 7723 cv_wait(&ill->ill_cv, &ill->ill_lock); 7724 } else { 7725 mutex_exit(&ipx->ipx_lock); 7726 mutex_exit(&ipsq->ipsq_lock); 7727 (void) cv_timedwait(&ill->ill_cv, 7728 &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS); 7729 waited_enough = B_TRUE; 7730 } 7731 mutex_exit(&ill->ill_lock); 7732 } 7733 7734 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 7735 ASSERT(ipx->ipx_reentry_cnt == 0); 7736 ipx->ipx_writer = curthread; 7737 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 7738 ipx->ipx_reentry_cnt++; 7739 #ifdef DEBUG 7740 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7741 #endif 7742 mutex_exit(&ipx->ipx_lock); 7743 mutex_exit(&ipsq->ipsq_lock); 7744 mutex_exit(&ill->ill_lock); 7745 rw_exit(&ipst->ips_ill_g_lock); 7746 7747 return (B_TRUE); 7748 } 7749 7750 /* 7751 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 7752 * across the call to the core interface ipsq_try_enter() and hence calls this 7753 * function directly. This is explained more fully in ipif_set_values(). 7754 * In order to support the above constraint, ipsq_try_enter is implemented as 7755 * a wrapper that grabs the ips_ill_g_lock and then calls this function. 7756 */ 7757 static ipsq_t * 7758 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 7759 int type, boolean_t reentry_ok) 7760 { 7761 ipsq_t *ipsq; 7762 ipxop_t *ipx; 7763 ip_stack_t *ipst = ill->ill_ipst; 7764 7765 /* 7766 * lock ordering: 7767 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 7768 * 7769 * ipx of an ipsq can't change when ipsq_lock is held. 7770 */ 7771 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 7772 GRAB_CONN_LOCK(q); 7773 mutex_enter(&ill->ill_lock); 7774 ipsq = ill->ill_phyint->phyint_ipsq; 7775 mutex_enter(&ipsq->ipsq_lock); 7776 ipx = ipsq->ipsq_xop; 7777 mutex_enter(&ipx->ipx_lock); 7778 7779 /* 7780 * 1. Enter the ipsq if we are already writer and reentry is ok. 7781 * (Note: If the caller does not specify reentry_ok then neither 7782 * 'func' nor any of its callees may ever attempt to enter the ipsq 7783 * again; otherwise it can lead to an infinite loop.) 7784 * 2. Enter the ipsq if there is no current writer and this attempted 7785 * entry is part of the current operation. 7786 * 3. Enter the ipsq if there is no current writer and this is a new 7787 * operation and the operation queue is empty and there is no 7788 * operation currently in progress and if all previously initiated 7789 * DLPI operations have completed. 7790 */ 7791 if ((ipx->ipx_writer == curthread && reentry_ok) || 7792 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 7793 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 7794 ipsq_dlpi_done(ipsq))))) { 7795 /* Success.
*/ 7796 ipx->ipx_reentry_cnt++; 7797 ipx->ipx_writer = curthread; 7798 ipx->ipx_forced = B_FALSE; 7799 mutex_exit(&ipx->ipx_lock); 7800 mutex_exit(&ipsq->ipsq_lock); 7801 mutex_exit(&ill->ill_lock); 7802 RELEASE_CONN_LOCK(q); 7803 #ifdef DEBUG 7804 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7805 #endif 7806 return (ipsq); 7807 } 7808 7809 if (func != NULL) 7810 ipsq_enq(ipsq, q, mp, func, type, ill); 7811 7812 mutex_exit(&ipx->ipx_lock); 7813 mutex_exit(&ipsq->ipsq_lock); 7814 mutex_exit(&ill->ill_lock); 7815 RELEASE_CONN_LOCK(q); 7816 return (NULL); 7817 } 7818 7819 /* 7820 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7821 * certain critical operations like plumbing (i.e. most set ioctls), multicast 7822 * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq 7823 * serializes exclusive ioctls issued by applications on a per ipsq basis in 7824 * ipsq_xopq_mphead. It also protects against multiple threads executing in 7825 * the ipsq. Responses from the driver pertain to the current ioctl (say a 7826 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing 7827 * up the interface) and are enqueued in ipx_mphead. 7828 * 7829 * If a thread does not want to reenter the ipsq when it is already writer, 7830 * it must make sure that neither the specified reentry point (to be called 7831 * later, when the ipsq is empty), nor any code path starting from that 7832 * reentry point, ever tries to enter the ipsq again. Otherwise it can lead 7833 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 7834 * When the thread that is currently exclusive finishes, it (ipsq_exit) 7835 * dequeues the requests waiting to become exclusive in ipx_mphead and calls 7836 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit 7837 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 7838 * ioctl if the current ioctl has completed. If the current ioctl is still 7839 * in progress it simply returns. The current ioctl could be waiting for 7840 * a response from another module (arp or the driver), or could be waiting for 7841 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp 7842 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the 7843 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 7844 * ipx_current_ipif is NULL which happens only once the ioctl is complete and 7845 * all associated DLPI operations have completed. 7846 */ 7847 7848 /* 7849 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' 7850 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ 7851 * on success, or NULL on failure. The caller ensures ipif/ill is valid by 7852 * refholding it as necessary. If the IPSQ cannot be entered and `func' is 7853 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ 7854 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
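 *
 * Typical usage, as a hedged sketch (the callback and operation type
 * shown are illustrative, not a fixed contract):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		(the operation is requeued for later)
 *	... perform the exclusive operation ...
 *	ipsq_exit(ipsq);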
7855 */ 7856 ipsq_t * 7857 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7858 ipsq_func_t func, int type, boolean_t reentry_ok) 7859 { 7860 ip_stack_t *ipst; 7861 ipsq_t *ipsq; 7862 7863 /* Only 1 of ipif or ill can be specified */ 7864 ASSERT((ipif != NULL) ^ (ill != NULL)); 7865 7866 if (ipif != NULL) 7867 ill = ipif->ipif_ill; 7868 ipst = ill->ill_ipst; 7869 7870 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7871 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 7872 rw_exit(&ipst->ips_ill_g_lock); 7873 7874 return (ipsq); 7875 } 7876 7877 /* 7878 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 7879 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 7880 * cannot be entered, the mp is queued for completion. 7881 */ 7882 void 7883 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7884 boolean_t reentry_ok) 7885 { 7886 ipsq_t *ipsq; 7887 7888 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 7889 7890 /* 7891 * Drop the caller's refhold on the ill. This is safe since we either 7892 * entered the IPSQ (and thus are exclusive), or failed to enter the 7893 * IPSQ, in which case we return without accessing ill anymore. This 7894 * is needed because func needs to see the correct refcount; 7895 * e.g., removeif can work only then. 7896 */ 7897 ill_refrele(ill); 7898 if (ipsq != NULL) { 7899 (*func)(ipsq, q, mp, NULL); 7900 ipsq_exit(ipsq); 7901 } 7902 } 7903 7904 /* 7905 * Exit the specified IPSQ. If this is the final exit on it then drain it 7906 * prior to exiting. Caller must be writer on the specified IPSQ. 7907 */ 7908 void 7909 ipsq_exit(ipsq_t *ipsq) 7910 { 7911 mblk_t *mp; 7912 ipsq_t *mp_ipsq; 7913 queue_t *q; 7914 phyint_t *phyi; 7915 ipsq_func_t func; 7916 7917 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7918 7919 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 7920 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 7921 ipsq->ipsq_xop->ipx_reentry_cnt--; 7922 return; 7923 } 7924 7925 for (;;) { 7926 phyi = ipsq->ipsq_phyint; 7927 mp = ipsq_dq(ipsq); 7928 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 7929 7930 /* 7931 * If we've changed to a new IPSQ, and the phyint associated 7932 * with the old one has gone away, free the old IPSQ. Note 7933 * that this cannot happen while the IPSQ is in a group. 7934 */ 7935 if (mp_ipsq != ipsq && phyi == NULL) { 7936 ASSERT(ipsq->ipsq_next == ipsq); 7937 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 7938 ipsq_delete(ipsq); 7939 } 7940 7941 if (mp == NULL) 7942 break; 7943 7944 q = mp->b_queue; 7945 func = (ipsq_func_t)mp->b_prev; 7946 ipsq = mp_ipsq; 7947 mp->b_next = mp->b_prev = NULL; 7948 mp->b_queue = NULL; 7949 7950 /* 7951 * If 'q' is a conn queue, it is valid, since we did a 7952 * refhold on the conn at the start of the ioctl. 7953 * If 'q' is an ill queue, it is valid, since close of an 7954 * ill will clean up its IPSQ. 7955 */ 7956 (*func)(ipsq, q, mp, NULL); 7957 } 7958 } 7959 7960 /* 7961 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 7962 * and `ioccmd'.
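 *
 * A hedged sketch of the intended pairing (SIOCSLIFADDR is only an
 * example of an `ioccmd'):
 *
 *	ipsq_current_start(ipsq, ipif, SIOCSLIFADDR);
 *	... exclusive work, possibly spanning DLPI round-trips ...
 *	ipsq_current_finish(ipsq);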
7963 */ 7964 void 7965 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 7966 { 7967 ill_t *ill = ipif->ipif_ill; 7968 ipxop_t *ipx = ipsq->ipsq_xop; 7969 7970 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7971 ASSERT(ipx->ipx_current_ipif == NULL); 7972 ASSERT(ipx->ipx_current_ioctl == 0); 7973 7974 ipx->ipx_current_done = B_FALSE; 7975 ipx->ipx_current_ioctl = ioccmd; 7976 mutex_enter(&ipx->ipx_lock); 7977 ipx->ipx_current_ipif = ipif; 7978 mutex_exit(&ipx->ipx_lock); 7979 7980 /* 7981 * Set IPIF_CHANGING on one or more ipifs associated with the 7982 * current exclusive operation. IPIF_CHANGING prevents any new 7983 * references to the ipif (so that the references will eventually 7984 * drop to zero) and also prevents any "get" operations (e.g., 7985 * SIOCGLIFFLAGS) from being able to access the ipif until the 7986 * operation has completed and the ipif is again in a stable state. 7987 * 7988 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 7989 * ioctl. For internal operations (where ioccmd is zero), all ipifs 7990 * on the ill are marked with IPIF_CHANGING since it's unclear which 7991 * ipifs will be affected. 7992 * 7993 * Note that SIOCLIFREMOVEIF is a special case as it sets 7994 * IPIF_CONDEMNED internally after identifying the right ipif to 7995 * operate on. 7996 */ 7997 switch (ioccmd) { 7998 case SIOCLIFREMOVEIF: 7999 break; 8000 case 0: 8001 mutex_enter(&ill->ill_lock); 8002 ipif = ipif->ipif_ill->ill_ipif; 8003 for (; ipif != NULL; ipif = ipif->ipif_next) 8004 ipif->ipif_state_flags |= IPIF_CHANGING; 8005 mutex_exit(&ill->ill_lock); 8006 break; 8007 default: 8008 mutex_enter(&ill->ill_lock); 8009 ipif->ipif_state_flags |= IPIF_CHANGING; 8010 mutex_exit(&ill->ill_lock); 8011 } 8012 } 8013 8014 /* 8015 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 8016 * the next exclusive operation to begin once we ipsq_exit(). However, if 8017 * pending DLPI operations remain, then we will wait for the queue to drain 8018 * before allowing the next exclusive operation to begin. This ensures that 8019 * DLPI operations from one exclusive operation are never improperly processed 8020 * as part of a subsequent exclusive operation. 8021 */ 8022 void 8023 ipsq_current_finish(ipsq_t *ipsq) 8024 { 8025 ipxop_t *ipx = ipsq->ipsq_xop; 8026 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 8027 ipif_t *ipif = ipx->ipx_current_ipif; 8028 8029 ASSERT(IAM_WRITER_IPSQ(ipsq)); 8030 8031 /* 8032 * For SIOCLIFREMOVEIF, the ipif has already been blown away 8033 * (but in that case, IPIF_CHANGING will already be clear and no 8034 * pending DLPI messages can remain). 8035 */ 8036 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 8037 ill_t *ill = ipif->ipif_ill; 8038 8039 mutex_enter(&ill->ill_lock); 8040 dlpi_pending = ill->ill_dlpi_pending; 8041 if (ipx->ipx_current_ioctl == 0) { 8042 ipif = ill->ill_ipif; 8043 for (; ipif != NULL; ipif = ipif->ipif_next) 8044 ipif->ipif_state_flags &= ~IPIF_CHANGING; 8045 } else { 8046 ipif->ipif_state_flags &= ~IPIF_CHANGING; 8047 } 8048 mutex_exit(&ill->ill_lock); 8049 } 8050 8051 ASSERT(!ipx->ipx_current_done); 8052 ipx->ipx_current_done = B_TRUE; 8053 ipx->ipx_current_ioctl = 0; 8054 if (dlpi_pending == DL_PRIM_INVAL) { 8055 mutex_enter(&ipx->ipx_lock); 8056 ipx->ipx_current_ipif = NULL; 8057 mutex_exit(&ipx->ipx_lock); 8058 } 8059 } 8060 8061 /* 8062 * The ill is closing. Flush all messages on the ipsq that originated 8063 * from this ill.
Usually there won't be any messages on the ipsq_xopq_mphead 8064 * for this ill since ipsq_enter could not have entered until then. 8065 * New messages can't be queued since the CONDEMNED flag is set. 8066 */ 8067 static void 8068 ipsq_flush(ill_t *ill) 8069 { 8070 queue_t *q; 8071 mblk_t *prev; 8072 mblk_t *mp; 8073 mblk_t *mp_next; 8074 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 8075 8076 ASSERT(IAM_WRITER_ILL(ill)); 8077 8078 /* 8079 * Flush any messages sent up by the driver. 8080 */ 8081 mutex_enter(&ipx->ipx_lock); 8082 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 8083 mp_next = mp->b_next; 8084 q = mp->b_queue; 8085 if (q == ill->ill_rq || q == ill->ill_wq) { 8086 /* dequeue mp */ 8087 if (prev == NULL) 8088 ipx->ipx_mphead = mp->b_next; 8089 else 8090 prev->b_next = mp->b_next; 8091 if (ipx->ipx_mptail == mp) { 8092 ASSERT(mp_next == NULL); 8093 ipx->ipx_mptail = prev; 8094 } 8095 inet_freemsg(mp); 8096 } else { 8097 prev = mp; 8098 } 8099 } 8100 mutex_exit(&ipx->ipx_lock); 8101 (void) ipsq_pending_mp_cleanup(ill, NULL); 8102 ipsq_xopq_mp_cleanup(ill, NULL); 8103 ill_pending_mp_cleanup(ill); 8104 } 8105 8106 /* 8107 * Parse an ifreq or lifreq struct coming down ioctls and refhold 8108 * and return the associated ipif. 8109 * Return value: 8110 * Non-zero: An error has occurred. ci may not be filled out. 8111 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and 8112 * a held ipif in ci.ci_ipif. 8113 */ 8114 int 8115 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8116 cmd_info_t *ci, ipsq_func_t func) 8117 { 8118 char *name; 8119 struct ifreq *ifr; 8120 struct lifreq *lifr; 8121 ipif_t *ipif = NULL; 8122 ill_t *ill; 8123 conn_t *connp; 8124 boolean_t isv6; 8125 boolean_t exists; 8126 int err; 8127 mblk_t *mp1; 8128 zoneid_t zoneid; 8129 ip_stack_t *ipst; 8130 8131 if (q->q_next != NULL) { 8132 ill = (ill_t *)q->q_ptr; 8133 isv6 = ill->ill_isv6; 8134 connp = NULL; 8135 zoneid = ALL_ZONES; 8136 ipst = ill->ill_ipst; 8137 } else { 8138 ill = NULL; 8139 connp = Q_TO_CONN(q); 8140 isv6 = connp->conn_af_isv6; 8141 zoneid = connp->conn_zoneid; 8142 if (zoneid == GLOBAL_ZONEID) { 8143 /* global zone can access ipifs in all zones */ 8144 zoneid = ALL_ZONES; 8145 } 8146 ipst = connp->conn_netstack->netstack_ip; 8147 } 8148 8149 /* Has been checked in ip_wput_nondata */ 8150 mp1 = mp->b_cont->b_cont; 8151 8152 if (ipip->ipi_cmd_type == IF_CMD) { 8153 /* This is an old-style SIOC[GS]IF* command */ 8154 ifr = (struct ifreq *)mp1->b_rptr; 8155 /* 8156 * Null terminate the string to protect against buffer 8157 * overrun. String was generated by user code and may not 8158 * be trusted. 8159 */ 8160 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 8161 name = ifr->ifr_name; 8162 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 8163 ci->ci_sin6 = NULL; 8164 ci->ci_lifr = (struct lifreq *)ifr; 8165 } else { 8166 /* This is a new-style SIOC[GS]LIF* command */ 8167 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 8168 lifr = (struct lifreq *)mp1->b_rptr; 8169 /* 8170 * Null terminate the string to protect against buffer 8171 * overrun. String was generated by user code and may not 8172 * be trusted.
8173 */ 8174 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 8175 name = lifr->lifr_name; 8176 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 8177 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 8178 ci->ci_lifr = lifr; 8179 } 8180 8181 if (ipip->ipi_cmd == SIOCSLIFNAME) { 8182 /* 8183 * The ioctl will fail if it comes down 8184 * a conn stream. 8185 */ 8186 if (ill == NULL) { 8187 /* 8188 * Not an ill queue; fail with ENXIO. 8189 */ 8190 return (ENXIO); 8191 } 8192 8193 ipif = ill->ill_ipif; 8194 ipif_refhold(ipif); 8195 } else { 8196 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 8197 &exists, isv6, zoneid, 8198 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err, 8199 ipst); 8200 if (ipif == NULL) { 8201 if (err == EINPROGRESS) 8202 return (err); 8203 err = 0; /* Ensure we don't use it below */ 8204 } 8205 } 8206 8207 /* 8208 * An old-style [GS]IFCMD does not admit an IPv6 ipif. 8209 */ 8210 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 8211 ipif_refrele(ipif); 8212 return (ENXIO); 8213 } 8214 8215 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 8216 name[0] == '\0') { 8217 /* 8218 * Handle a SIOC?IF* ioctl with a null name 8219 * during plumb (on the ill queue before the I_PLINK). 8220 */ 8221 ipif = ill->ill_ipif; 8222 ipif_refhold(ipif); 8223 } 8224 8225 if (ipif == NULL) 8226 return (ENXIO); 8227 8228 ci->ci_ipif = ipif; 8229 return (0); 8230 } 8231 8232 /* 8233 * Return the total number of IPv4 ipifs visible to `zoneid'. 8234 */ 8235 static uint_t 8236 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 8237 { 8238 uint_t numifs = 0; 8239 ill_t *ill; 8240 ill_walk_context_t ctx; 8241 ipif_t *ipif; 8242 8243 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8244 ill = ILL_START_WALK_V4(&ctx, ipst); 8245 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8246 if (IS_UNDER_IPMP(ill)) 8247 continue; 8248 for (ipif = ill->ill_ipif; ipif != NULL; 8249 ipif = ipif->ipif_next) { 8250 if (ipif->ipif_zoneid == zoneid || 8251 ipif->ipif_zoneid == ALL_ZONES) 8252 numifs++; 8253 } 8254 } 8255 rw_exit(&ipst->ips_ill_g_lock); 8256 return (numifs); 8257 } 8258 8259 /* 8260 * Return the number of ipifs matching `family', `lifn_flags', and `zoneid'.
8261 */ 8262 static uint_t 8263 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 8264 { 8265 uint_t numifs = 0; 8266 ill_t *ill; 8267 ipif_t *ipif; 8268 ill_walk_context_t ctx; 8269 8270 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 8271 8272 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8273 if (family == AF_INET) 8274 ill = ILL_START_WALK_V4(&ctx, ipst); 8275 else if (family == AF_INET6) 8276 ill = ILL_START_WALK_V6(&ctx, ipst); 8277 else 8278 ill = ILL_START_WALK_ALL(&ctx, ipst); 8279 8280 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8281 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 8282 continue; 8283 8284 for (ipif = ill->ill_ipif; ipif != NULL; 8285 ipif = ipif->ipif_next) { 8286 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8287 !(lifn_flags & LIFC_NOXMIT)) 8288 continue; 8289 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8290 !(lifn_flags & LIFC_TEMPORARY)) 8291 continue; 8292 if (((ipif->ipif_flags & 8293 (IPIF_NOXMIT|IPIF_NOLOCAL| 8294 IPIF_DEPRECATED)) || 8295 IS_LOOPBACK(ill) || 8296 !(ipif->ipif_flags & IPIF_UP)) && 8297 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8298 continue; 8299 8300 if (zoneid != ipif->ipif_zoneid && 8301 ipif->ipif_zoneid != ALL_ZONES && 8302 (zoneid != GLOBAL_ZONEID || 8303 !(lifn_flags & LIFC_ALLZONES))) 8304 continue; 8305 8306 numifs++; 8307 } 8308 } 8309 rw_exit(&ipst->ips_ill_g_lock); 8310 return (numifs); 8311 } 8312 8313 uint_t 8314 ip_get_lifsrcofnum(ill_t *ill) 8315 { 8316 uint_t numifs = 0; 8317 ill_t *ill_head = ill; 8318 ip_stack_t *ipst = ill->ill_ipst; 8319 8320 /* 8321 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 8322 * other thread may be trying to relink the ILLs in this usesrc group 8323 * and adjusting the ill_usesrc_grp_next pointers 8324 */ 8325 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8326 if ((ill->ill_usesrc_ifindex == 0) && 8327 (ill->ill_usesrc_grp_next != NULL)) { 8328 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8329 ill = ill->ill_usesrc_grp_next) 8330 numifs++; 8331 } 8332 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8333 8334 return (numifs); 8335 } 8336 8337 /* Null values are passed in for ipif, sin, and ifreq */ 8338 /* ARGSUSED */ 8339 int 8340 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8341 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8342 { 8343 int *nump; 8344 conn_t *connp = Q_TO_CONN(q); 8345 8346 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8347 8348 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8349 nump = (int *)mp->b_cont->b_cont->b_rptr; 8350 8351 *nump = ip_get_numifs(connp->conn_zoneid, 8352 connp->conn_netstack->netstack_ip); 8353 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8354 return (0); 8355 } 8356 8357 /* Null values are passed in for ipif, sin, and ifreq */ 8358 /* ARGSUSED */ 8359 int 8360 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8361 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8362 { 8363 struct lifnum *lifn; 8364 mblk_t *mp1; 8365 conn_t *connp = Q_TO_CONN(q); 8366 8367 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8368 8369 /* Existence checked in ip_wput_nondata */ 8370 mp1 = mp->b_cont->b_cont; 8371 8372 lifn = (struct lifnum *)mp1->b_rptr; 8373 switch (lifn->lifn_family) { 8374 case AF_UNSPEC: 8375 case AF_INET: 8376 case AF_INET6: 8377 break; 8378 default: 8379 return (EAFNOSUPPORT); 8380 } 8381 8382 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, 
lifn->lifn_flags, 8383 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 8384 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8385 return (0); 8386 } 8387 8388 /* ARGSUSED */ 8389 int 8390 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8391 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8392 { 8393 STRUCT_HANDLE(ifconf, ifc); 8394 mblk_t *mp1; 8395 struct iocblk *iocp; 8396 struct ifreq *ifr; 8397 ill_walk_context_t ctx; 8398 ill_t *ill; 8399 ipif_t *ipif; 8400 struct sockaddr_in *sin; 8401 int32_t ifclen; 8402 zoneid_t zoneid; 8403 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8404 8405 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8406 8407 ip1dbg(("ip_sioctl_get_ifconf")); 8408 /* Existence verified in ip_wput_nondata */ 8409 mp1 = mp->b_cont->b_cont; 8410 iocp = (struct iocblk *)mp->b_rptr; 8411 zoneid = Q_TO_CONN(q)->conn_zoneid; 8412 8413 /* 8414 * The original SIOCGIFCONF passed in a struct ifconf which specified 8415 * the user buffer address and length into which the list of struct 8416 * ifreqs was to be copied. Since AT&T Streams does not seem to 8417 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8418 * the SIOCGIFCONF operation was redefined to simply provide 8419 * a large output buffer into which we are supposed to jam the ifreq 8420 * array. The same ioctl command code was used, despite the fact that 8421 * both the applications and the kernel code had to change, thus making 8422 * it impossible to support both interfaces. 8423 * 8424 * For reasons not good enough to try to explain, the following 8425 * algorithm is used for deciding what to do with one of these: 8426 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8427 * form with the output buffer coming down as the continuation message. 8428 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8429 * and we have to copy in the ifconf structure to find out how big the 8430 * output buffer is and where to copy out to. Sure no problem... 8431 * 8432 */ 8433 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8434 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8435 int numifs = 0; 8436 size_t ifc_bufsize; 8437 8438 /* 8439 * Must be (better be!) continuation of a TRANSPARENT 8440 * IOCTL. We just copied in the ifconf structure. 8441 */ 8442 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8443 (struct ifconf *)mp1->b_rptr); 8444 8445 /* 8446 * Allocate a buffer to hold requested information. 8447 * 8448 * If ifc_len is larger than what is needed, we only 8449 * allocate what we will use. 8450 * 8451 * If ifc_len is smaller than what is needed, return 8452 * EINVAL. 8453 * 8454 * XXX: the ill_t structure can have 2 counters, for 8455 * v4 and v6 (not just ill_ipif_up_count) to store the 8456 * number of interfaces for a device, so we don't need 8457 * to count them here...
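 *
 * (For orientation, a hedged sketch of the TRANSPARENT caller being
 * serviced here; the socket `s' and the buffer size are hypothetical:
 *
 *	struct ifconf ifc;
 *	char buf[8192];
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, (caddr_t)&ifc) == 0)
 *		nifs = ifc.ifc_len / sizeof (struct ifreq);
 *
 * On return, ifc_len holds the number of bytes actually filled in.)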
8458 */ 8459 numifs = ip_get_numifs(zoneid, ipst); 8460 8461 ifclen = STRUCT_FGET(ifc, ifc_len); 8462 ifc_bufsize = numifs * sizeof (struct ifreq); 8463 if (ifc_bufsize > ifclen) { 8464 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8465 /* old behaviour */ 8466 return (EINVAL); 8467 } else { 8468 ifc_bufsize = ifclen; 8469 } 8470 } 8471 8472 mp1 = mi_copyout_alloc(q, mp, 8473 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8474 if (mp1 == NULL) 8475 return (ENOMEM); 8476 8477 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8478 } 8479 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8480 /* 8481 * the SIOCGIFCONF ioctl only knows about 8482 * IPv4 addresses, so don't try to tell 8483 * it about interfaces with IPv6-only 8484 * addresses. (Last parm 'isv6' is B_FALSE) 8485 */ 8486 8487 ifr = (struct ifreq *)mp1->b_rptr; 8488 8489 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8490 ill = ILL_START_WALK_V4(&ctx, ipst); 8491 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8492 if (IS_UNDER_IPMP(ill)) 8493 continue; 8494 for (ipif = ill->ill_ipif; ipif != NULL; 8495 ipif = ipif->ipif_next) { 8496 if (zoneid != ipif->ipif_zoneid && 8497 ipif->ipif_zoneid != ALL_ZONES) 8498 continue; 8499 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8500 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8501 /* old behaviour */ 8502 rw_exit(&ipst->ips_ill_g_lock); 8503 return (EINVAL); 8504 } else { 8505 goto if_copydone; 8506 } 8507 } 8508 ipif_get_name(ipif, ifr->ifr_name, 8509 sizeof (ifr->ifr_name)); 8510 sin = (sin_t *)&ifr->ifr_addr; 8511 *sin = sin_null; 8512 sin->sin_family = AF_INET; 8513 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8514 ifr++; 8515 } 8516 } 8517 if_copydone: 8518 rw_exit(&ipst->ips_ill_g_lock); 8519 mp1->b_wptr = (uchar_t *)ifr; 8520 8521 if (STRUCT_BUF(ifc) != NULL) { 8522 STRUCT_FSET(ifc, ifc_len, 8523 (int)((uchar_t *)ifr - mp1->b_rptr)); 8524 } 8525 return (0); 8526 } 8527 8528 /* 8529 * Get the interfaces using the address hosted on the interface passed in 8530 * as a source address. 8531 */ 8532 /* ARGSUSED */ 8533 int 8534 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8535 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8536 { 8537 mblk_t *mp1; 8538 ill_t *ill, *ill_head; 8539 ipif_t *ipif, *orig_ipif; 8540 int numlifs = 0; 8541 size_t lifs_bufsize, lifsmaxlen; 8542 struct lifreq *lifr; 8543 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8544 uint_t ifindex; 8545 zoneid_t zoneid; 8546 int err = 0; 8547 boolean_t isv6 = B_FALSE; 8548 struct sockaddr_in *sin; 8549 struct sockaddr_in6 *sin6; 8550 STRUCT_HANDLE(lifsrcof, lifs); 8551 ip_stack_t *ipst; 8552 8553 ipst = CONNQ_TO_IPST(q); 8554 8555 ASSERT(q->q_next == NULL); 8556 8557 zoneid = Q_TO_CONN(q)->conn_zoneid; 8558 8559 /* Existence verified in ip_wput_nondata */ 8560 mp1 = mp->b_cont->b_cont; 8561 8562 /* 8563 * Must be (better be!) continuation of a TRANSPARENT 8564 * IOCTL. We just copied in the lifsrcof structure.
8565 */ 8566 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8567 (struct lifsrcof *)mp1->b_rptr); 8568 8569 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8570 return (EINVAL); 8571 8572 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8573 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8574 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8575 ip_process_ioctl, &err, ipst); 8576 if (ipif == NULL) { 8577 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8578 ifindex)); 8579 return (err); 8580 } 8581 8582 /* Allocate a buffer to hold requested information */ 8583 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8584 lifs_bufsize = numlifs * sizeof (struct lifreq); 8585 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8586 /* The actual size needed is always returned in lifs_len */ 8587 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8588 8589 /* If the amount we need is more than what is passed in, abort */ 8590 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8591 ipif_refrele(ipif); 8592 return (0); 8593 } 8594 8595 mp1 = mi_copyout_alloc(q, mp, 8596 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 8597 if (mp1 == NULL) { 8598 ipif_refrele(ipif); 8599 return (ENOMEM); 8600 } 8601 8602 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8603 bzero(mp1->b_rptr, lifs_bufsize); 8604 8605 lifr = (struct lifreq *)mp1->b_rptr; 8606 8607 ill = ill_head = ipif->ipif_ill; 8608 orig_ipif = ipif; 8609 8610 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8611 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8612 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8613 8614 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8615 for (; (ill != NULL) && (ill != ill_head); 8616 ill = ill->ill_usesrc_grp_next) { 8617 8618 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8619 break; 8620 8621 ipif = ill->ill_ipif; 8622 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 8623 if (ipif->ipif_isv6) { 8624 sin6 = (sin6_t *)&lifr->lifr_addr; 8625 *sin6 = sin6_null; 8626 sin6->sin6_family = AF_INET6; 8627 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8628 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8629 &ipif->ipif_v6net_mask); 8630 } else { 8631 sin = (sin_t *)&lifr->lifr_addr; 8632 *sin = sin_null; 8633 sin->sin_family = AF_INET; 8634 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8635 lifr->lifr_addrlen = ip_mask_to_plen( 8636 ipif->ipif_net_mask); 8637 } 8638 lifr++; 8639 } 8640 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8641 rw_exit(&ipst->ips_ill_g_lock); 8642 ipif_refrele(orig_ipif); 8643 mp1->b_wptr = (uchar_t *)lifr; 8644 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8645 8646 return (0); 8647 } 8648 8649 /* ARGSUSED */ 8650 int 8651 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8652 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8653 { 8654 mblk_t *mp1; 8655 int list; 8656 ill_t *ill; 8657 ipif_t *ipif; 8658 int flags; 8659 int numlifs = 0; 8660 size_t lifc_bufsize; 8661 struct lifreq *lifr; 8662 sa_family_t family; 8663 struct sockaddr_in *sin; 8664 struct sockaddr_in6 *sin6; 8665 ill_walk_context_t ctx; 8666 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8667 int32_t lifclen; 8668 zoneid_t zoneid; 8669 STRUCT_HANDLE(lifconf, lifc); 8670 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8671 8672 ip1dbg(("ip_sioctl_get_lifconf")); 8673 8674 ASSERT(q->q_next == NULL); 8675 8676 zoneid = Q_TO_CONN(q)->conn_zoneid; 8677 8678 /* Existence verified in ip_wput_nondata */ 8679 mp1 = mp->b_cont->b_cont; 8680 8681 /* 8682 * An extended version of SIOCGIFCONF that takes an 8683 * additional address 
family and flags field. 8684 * AF_UNSPEC retrieves both IPv4 and IPv6 interfaces. 8685 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8686 * interfaces are omitted. 8687 * Similarly, IPIF_TEMPORARY interfaces are omitted 8688 * unless LIFC_TEMPORARY is specified. 8689 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8690 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED, and 8691 * non-IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8692 * has priority over LIFC_NOXMIT. 8693 */ 8694 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8695 8696 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8697 return (EINVAL); 8698 8699 /* 8700 * Must be (better be!) continuation of a TRANSPARENT 8701 * IOCTL. We just copied in the lifconf structure. 8702 */ 8703 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8704 8705 family = STRUCT_FGET(lifc, lifc_family); 8706 flags = STRUCT_FGET(lifc, lifc_flags); 8707 8708 switch (family) { 8709 case AF_UNSPEC: 8710 /* 8711 * walk all ILL's. 8712 */ 8713 list = MAX_G_HEADS; 8714 break; 8715 case AF_INET: 8716 /* 8717 * walk only IPV4 ILL's. 8718 */ 8719 list = IP_V4_G_HEAD; 8720 break; 8721 case AF_INET6: 8722 /* 8723 * walk only IPV6 ILL's. 8724 */ 8725 list = IP_V6_G_HEAD; 8726 break; 8727 default: 8728 return (EAFNOSUPPORT); 8729 } 8730 8731 /* 8732 * Allocate a buffer to hold requested information. 8733 * 8734 * If lifc_len is larger than what is needed, we only 8735 * allocate what we will use. 8736 * 8737 * If lifc_len is smaller than what is needed, return 8738 * EINVAL. 8739 */ 8740 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 8741 lifc_bufsize = numlifs * sizeof (struct lifreq); 8742 lifclen = STRUCT_FGET(lifc, lifc_len); 8743 if (lifc_bufsize > lifclen) { 8744 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8745 return (EINVAL); 8746 else 8747 lifc_bufsize = lifclen; 8748 } 8749 8750 mp1 = mi_copyout_alloc(q, mp, 8751 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8752 if (mp1 == NULL) 8753 return (ENOMEM); 8754 8755 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8756 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8757 8758 lifr = (struct lifreq *)mp1->b_rptr; 8759 8760 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8761 ill = ill_first(list, list, &ctx, ipst); 8762 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8763 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 8764 continue; 8765 8766 for (ipif = ill->ill_ipif; ipif != NULL; 8767 ipif = ipif->ipif_next) { 8768 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8769 !(flags & LIFC_NOXMIT)) 8770 continue; 8771 8772 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8773 !(flags & LIFC_TEMPORARY)) 8774 continue; 8775 8776 if (((ipif->ipif_flags & 8777 (IPIF_NOXMIT|IPIF_NOLOCAL| 8778 IPIF_DEPRECATED)) || 8779 IS_LOOPBACK(ill) || 8780 !(ipif->ipif_flags & IPIF_UP)) && 8781 (flags & LIFC_EXTERNAL_SOURCE)) 8782 continue; 8783 8784 if (zoneid != ipif->ipif_zoneid && 8785 ipif->ipif_zoneid != ALL_ZONES && 8786 (zoneid != GLOBAL_ZONEID || 8787 !(flags & LIFC_ALLZONES))) 8788 continue; 8789 8790 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8791 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8792 rw_exit(&ipst->ips_ill_g_lock); 8793 return (EINVAL); 8794 } else { 8795 goto lif_copydone; 8796 } 8797 } 8798 8799 ipif_get_name(ipif, lifr->lifr_name, 8800 sizeof (lifr->lifr_name)); 8801 lifr->lifr_type = ill->ill_type; 8802 if (ipif->ipif_isv6) { 8803 sin6 = (sin6_t *)&lifr->lifr_addr; 8804 *sin6 = sin6_null; 8805 sin6->sin6_family = AF_INET6; 8806 sin6->sin6_addr = 8807 ipif->ipif_v6lcl_addr; 8808 lifr->lifr_addrlen =
8809 ip_mask_to_plen_v6( 8810 &ipif->ipif_v6net_mask); 8811 } else { 8812 sin = (sin_t *)&lifr->lifr_addr; 8813 *sin = sin_null; 8814 sin->sin_family = AF_INET; 8815 sin->sin_addr.s_addr = 8816 ipif->ipif_lcl_addr; 8817 lifr->lifr_addrlen = 8818 ip_mask_to_plen( 8819 ipif->ipif_net_mask); 8820 } 8821 lifr++; 8822 } 8823 } 8824 lif_copydone: 8825 rw_exit(&ipst->ips_ill_g_lock); 8826 8827 mp1->b_wptr = (uchar_t *)lifr; 8828 if (STRUCT_BUF(lifc) != NULL) { 8829 STRUCT_FSET(lifc, lifc_len, 8830 (int)((uchar_t *)lifr - mp1->b_rptr)); 8831 } 8832 return (0); 8833 } 8834 8835 static void 8836 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8837 { 8838 ip6_asp_t *table; 8839 size_t table_size; 8840 mblk_t *data_mp; 8841 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8842 ip_stack_t *ipst; 8843 8844 if (q->q_next == NULL) 8845 ipst = CONNQ_TO_IPST(q); 8846 else 8847 ipst = ILLQ_TO_IPST(q); 8848 8849 /* These two ioctls are I_STR only */ 8850 if (iocp->ioc_count == TRANSPARENT) { 8851 miocnak(q, mp, 0, EINVAL); 8852 return; 8853 } 8854 8855 data_mp = mp->b_cont; 8856 if (data_mp == NULL) { 8857 /* The user passed us a NULL argument */ 8858 table = NULL; 8859 table_size = iocp->ioc_count; 8860 } else { 8861 /* 8862 * The user provided a table. The stream head 8863 * may have copied in the user data in chunks, 8864 * so make sure everything is pulled up 8865 * properly. 8866 */ 8867 if (MBLKL(data_mp) < iocp->ioc_count) { 8868 mblk_t *new_data_mp; 8869 if ((new_data_mp = msgpullup(data_mp, -1)) == 8870 NULL) { 8871 miocnak(q, mp, 0, ENOMEM); 8872 return; 8873 } 8874 freemsg(data_mp); 8875 data_mp = new_data_mp; 8876 mp->b_cont = data_mp; 8877 } 8878 table = (ip6_asp_t *)data_mp->b_rptr; 8879 table_size = iocp->ioc_count; 8880 } 8881 8882 switch (iocp->ioc_cmd) { 8883 case SIOCGIP6ADDRPOLICY: 8884 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 8885 if (iocp->ioc_rval == -1) 8886 iocp->ioc_error = EINVAL; 8887 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8888 else if (table != NULL && 8889 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8890 ip6_asp_t *src = table; 8891 ip6_asp32_t *dst = (void *)table; 8892 int count = table_size / sizeof (ip6_asp_t); 8893 int i; 8894 8895 /* 8896 * We need to do an in-place shrink of the array 8897 * to match the alignment attributes of the 8898 * 32-bit ABI looking at it. 8899 */ 8900 /* LINTED: logical expression always true: op "||" */ 8901 ASSERT(sizeof (*src) > sizeof (*dst)); 8902 for (i = 1; i < count; i++) 8903 bcopy(src + i, dst + i, sizeof (*dst)); 8904 } 8905 #endif 8906 break; 8907 8908 case SIOCSIP6ADDRPOLICY: 8909 ASSERT(mp->b_prev == NULL); 8910 mp->b_prev = (void *)q; 8911 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8912 /* 8913 * We pass in the datamodel here so that the ip6_asp_replace() 8914 * routine can handle converting from 32-bit to native formats 8915 * where necessary. 8916 * 8917 * A better way to handle this might be to convert the inbound 8918 * data structure here, and hang it off a new 'mp'; thus the 8919 * ip6_asp_replace() logic would always be dealing with native 8920 * format data structures.. 8921 * 8922 * (An even simpler way to handle these ioctls is to just 8923 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8924 * and just recompile everything that depends on it.) 8925 */ 8926 #endif 8927 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 8928 iocp->ioc_flag & IOC_MODELS); 8929 return; 8930 } 8931 8932 DB_TYPE(mp) = (iocp->ioc_error == 0) ? 
M_IOCACK : M_IOCNAK; 8933 qreply(q, mp); 8934 } 8935 8936 static void 8937 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8938 { 8939 mblk_t *data_mp; 8940 struct dstinforeq *dir; 8941 uint8_t *end, *cur; 8942 in6_addr_t *daddr, *saddr; 8943 ipaddr_t v4daddr; 8944 ire_t *ire; 8945 char *slabel, *dlabel; 8946 boolean_t isipv4; 8947 int match_ire; 8948 ill_t *dst_ill; 8949 ipif_t *src_ipif, *ire_ipif; 8950 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8951 zoneid_t zoneid; 8952 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8953 8954 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8955 zoneid = Q_TO_CONN(q)->conn_zoneid; 8956 8957 /* 8958 * This ioctl is I_STR only, and must have a 8959 * data mblk following the M_IOCTL mblk. 8960 */ 8961 data_mp = mp->b_cont; 8962 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8963 miocnak(q, mp, 0, EINVAL); 8964 return; 8965 } 8966 8967 if (MBLKL(data_mp) < iocp->ioc_count) { 8968 mblk_t *new_data_mp; 8969 8970 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8971 miocnak(q, mp, 0, ENOMEM); 8972 return; 8973 } 8974 freemsg(data_mp); 8975 data_mp = new_data_mp; 8976 mp->b_cont = data_mp; 8977 } 8978 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8979 8980 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8981 end - cur >= sizeof (struct dstinforeq); 8982 cur += sizeof (struct dstinforeq)) { 8983 dir = (struct dstinforeq *)cur; 8984 daddr = &dir->dir_daddr; 8985 saddr = &dir->dir_saddr; 8986 8987 /* 8988 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8989 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8990 * and ipif_select_source[_v6]() do not. 8991 */ 8992 dir->dir_dscope = ip_addr_scope_v6(daddr); 8993 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 8994 8995 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 8996 if (isipv4) { 8997 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 8998 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 8999 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9000 } else { 9001 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 9002 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9003 } 9004 if (ire == NULL) { 9005 dir->dir_dreachable = 0; 9006 9007 /* move on to next dst addr */ 9008 continue; 9009 } 9010 dir->dir_dreachable = 1; 9011 9012 ire_ipif = ire->ire_ipif; 9013 if (ire_ipif == NULL) 9014 goto next_dst; 9015 9016 /* 9017 * We expect to get back an interface ire or a 9018 * gateway ire cache entry. For both types, the 9019 * output interface is ire_ipif->ipif_ill. 9020 */ 9021 dst_ill = ire_ipif->ipif_ill; 9022 dir->dir_dmactype = dst_ill->ill_mactype; 9023 9024 if (isipv4) { 9025 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 9026 } else { 9027 src_ipif = ipif_select_source_v6(dst_ill, 9028 daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); 9029 } 9030 if (src_ipif == NULL) 9031 goto next_dst; 9032 9033 *saddr = src_ipif->ipif_v6lcl_addr; 9034 dir->dir_sscope = ip_addr_scope_v6(saddr); 9035 slabel = ip6_asp_lookup(saddr, NULL, ipst); 9036 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 9037 dir->dir_sdeprecated = 9038 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 9039 ipif_refrele(src_ipif); 9040 next_dst: 9041 ire_refrele(ire); 9042 } 9043 miocack(q, mp, iocp->ioc_count, 0); 9044 } 9045 9046 /* 9047 * Check if this is an address assigned to this machine. 9048 * Skips interfaces that are down by using ire checks. 
9049 * Translates mapped addresses to v4 addresses and then 9050 * treats them as such, returning true if the v4 address 9051 * associated with this mapped address is configured. 9052 * Note: Applications will have to be careful what they do 9053 * with the response; use of mapped addresses limits 9054 * what can be done with the socket, especially with 9055 * respect to socket options and ioctls - neither IPv4 9056 * options nor IPv6 sticky options/ancillary data options 9057 * may be used. 9058 */ 9059 /* ARGSUSED */ 9060 int 9061 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9062 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9063 { 9064 struct sioc_addrreq *sia; 9065 sin_t *sin; 9066 ire_t *ire; 9067 mblk_t *mp1; 9068 zoneid_t zoneid; 9069 ip_stack_t *ipst; 9070 9071 ip1dbg(("ip_sioctl_tmyaddr")); 9072 9073 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9074 zoneid = Q_TO_CONN(q)->conn_zoneid; 9075 ipst = CONNQ_TO_IPST(q); 9076 9077 /* Existence verified in ip_wput_nondata */ 9078 mp1 = mp->b_cont->b_cont; 9079 sia = (struct sioc_addrreq *)mp1->b_rptr; 9080 sin = (sin_t *)&sia->sa_addr; 9081 switch (sin->sin_family) { 9082 case AF_INET6: { 9083 sin6_t *sin6 = (sin6_t *)sin; 9084 9085 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9086 ipaddr_t v4_addr; 9087 9088 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9089 v4_addr); 9090 ire = ire_ctable_lookup(v4_addr, 0, 9091 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9092 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9093 } else { 9094 in6_addr_t v6addr; 9095 9096 v6addr = sin6->sin6_addr; 9097 ire = ire_ctable_lookup_v6(&v6addr, 0, 9098 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9099 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9100 } 9101 break; 9102 } 9103 case AF_INET: { 9104 ipaddr_t v4addr; 9105 9106 v4addr = sin->sin_addr.s_addr; 9107 ire = ire_ctable_lookup(v4addr, 0, 9108 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9109 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9110 break; 9111 } 9112 default: 9113 return (EAFNOSUPPORT); 9114 } 9115 if (ire != NULL) { 9116 sia->sa_res = 1; 9117 ire_refrele(ire); 9118 } else { 9119 sia->sa_res = 0; 9120 } 9121 return (0); 9122 } 9123 9124 /* 9125 * Checks if this is an address assigned on-link, i.e., a neighbor, 9126 * and makes sure it's reachable from the current zone. 9127 * Returns true for my addresses as well. 9128 * Translates mapped addresses to v4 addresses and then 9129 * treats them as such, returning true if the v4 address 9130 * associated with this mapped address is configured. 9131 * Note: Applications will have to be careful what they do 9132 * with the response; use of mapped addresses limits 9133 * what can be done with the socket, especially with 9134 * respect to socket options and ioctls - neither IPv4 9135 * options nor IPv6 sticky options/ancillary data options 9136 * may be used.
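 *
 * For reference, a hedged sketch of the userland usage that lands here
 * (the socket `s' and the address are assumptions for illustration):
 *
 *	struct sioc_addrreq sar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sar.sa_addr;
 *
 *	bzero(&sar, sizeof (sar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.168.1.7");
 *	if (ioctl(s, SIOCTONLINK, (caddr_t)&sar) == 0 && sar.sa_res != 0)
 *		the address is on-link (sa_res carries the answer)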
9137 */ 9138 /* ARGSUSED */ 9139 int 9140 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9141 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9142 { 9143 struct sioc_addrreq *sia; 9144 sin_t *sin; 9145 mblk_t *mp1; 9146 ire_t *ire = NULL; 9147 zoneid_t zoneid; 9148 ip_stack_t *ipst; 9149 9150 ip1dbg(("ip_sioctl_tonlink")); 9151 9152 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9153 zoneid = Q_TO_CONN(q)->conn_zoneid; 9154 ipst = CONNQ_TO_IPST(q); 9155 9156 /* Existence verified in ip_wput_nondata */ 9157 mp1 = mp->b_cont->b_cont; 9158 sia = (struct sioc_addrreq *)mp1->b_rptr; 9159 sin = (sin_t *)&sia->sa_addr; 9160 9161 /* 9162 * Match addresses with a zero gateway field to avoid 9163 * routes going through a router. 9164 * Exclude broadcast and multicast addresses. 9165 */ 9166 switch (sin->sin_family) { 9167 case AF_INET6: { 9168 sin6_t *sin6 = (sin6_t *)sin; 9169 9170 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9171 ipaddr_t v4_addr; 9172 9173 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9174 v4_addr); 9175 if (!CLASSD(v4_addr)) { 9176 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9177 NULL, NULL, zoneid, NULL, 9178 MATCH_IRE_GW, ipst); 9179 } 9180 } else { 9181 in6_addr_t v6addr; 9182 in6_addr_t v6gw; 9183 9184 v6addr = sin6->sin6_addr; 9185 v6gw = ipv6_all_zeros; 9186 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9187 ire = ire_route_lookup_v6(&v6addr, 0, 9188 &v6gw, 0, NULL, NULL, zoneid, 9189 NULL, MATCH_IRE_GW, ipst); 9190 } 9191 } 9192 break; 9193 } 9194 case AF_INET: { 9195 ipaddr_t v4addr; 9196 9197 v4addr = sin->sin_addr.s_addr; 9198 if (!CLASSD(v4addr)) { 9199 ire = ire_route_lookup(v4addr, 0, 0, 0, 9200 NULL, NULL, zoneid, NULL, 9201 MATCH_IRE_GW, ipst); 9202 } 9203 break; 9204 } 9205 default: 9206 return (EAFNOSUPPORT); 9207 } 9208 sia->sa_res = 0; 9209 if (ire != NULL) { 9210 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9211 IRE_LOCAL|IRE_LOOPBACK)) { 9212 sia->sa_res = 1; 9213 } 9214 ire_refrele(ire); 9215 } 9216 return (0); 9217 } 9218 9219 /* 9220 * TBD: implement when kernel maintains a list of site prefixes. 9221 */ 9222 /* ARGSUSED */ 9223 int 9224 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9225 ip_ioctl_cmd_t *ipip, void *ifreq) 9226 { 9227 return (ENXIO); 9228 } 9229 9230 /* 9231 * ARP IOCTLs. 9232 * How does IP get in the business of fronting ARP configuration/queries? 9233 * Well, it's like this: the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) 9234 * are by tradition passed in through a datagram socket. That lands in IP. 9235 * As it happens, this is just as well since the interface is quite crude in 9236 * that it passes in no information about protocol or hardware types, or 9237 * interface association. After making the protocol assumption, IP is in 9238 * the position to look up the name of the ILL, which ARP will need, and 9239 * format a request that can be handled by ARP. The request is passed up 9240 * stream to ARP, and the original IOCTL is completed by IP when ARP passes 9241 * back a response. ARP supports its own set of more general IOCTLs, in 9242 * case anyone is interested.
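 *
 * For reference, a hedged sketch of the traditional usage (the datagram
 * socket `s' and the address are assumptions for illustration):
 *
 *	struct arpreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.arp_pa;
 *
 *	bzero(&ar, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.168.1.1");
 *	if (ioctl(s, SIOCGARP, (caddr_t)&ar) == 0)
 *		ar.arp_ha.sa_data now holds the hardware address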
9243 */ 9244 /* ARGSUSED */ 9245 int 9246 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9247 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9248 { 9249 mblk_t *mp1; 9250 mblk_t *mp2; 9251 mblk_t *pending_mp; 9252 ipaddr_t ipaddr; 9253 area_t *area; 9254 struct iocblk *iocp; 9255 conn_t *connp; 9256 struct arpreq *ar; 9257 struct xarpreq *xar; 9258 int flags, alength; 9259 uchar_t *lladdr; 9260 ire_t *ire; 9261 ip_stack_t *ipst; 9262 ill_t *ill = ipif->ipif_ill; 9263 ill_t *proxy_ill = NULL; 9264 ipmp_arpent_t *entp = NULL; 9265 boolean_t if_arp_ioctl = B_FALSE; 9266 boolean_t proxyarp = B_FALSE; 9267 9268 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9269 connp = Q_TO_CONN(q); 9270 ipst = connp->conn_netstack->netstack_ip; 9271 9272 if (ipip->ipi_cmd_type == XARP_CMD) { 9273 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ 9274 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; 9275 ar = NULL; 9276 9277 flags = xar->xarp_flags; 9278 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); 9279 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); 9280 /* 9281 * Validate against user's link layer address length 9282 * input and name and addr length limits. 9283 */ 9284 alength = ill->ill_phys_addr_length; 9285 if (ipip->ipi_cmd == SIOCSXARP) { 9286 if (alength != xar->xarp_ha.sdl_alen || 9287 (alength + xar->xarp_ha.sdl_nlen > 9288 sizeof (xar->xarp_ha.sdl_data))) 9289 return (EINVAL); 9290 } 9291 } else { 9292 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */ 9293 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; 9294 xar = NULL; 9295 9296 flags = ar->arp_flags; 9297 lladdr = (uchar_t *)ar->arp_ha.sa_data; 9298 /* 9299 * Theoretically, the sa_family could tell us what link 9300 * layer type this operation is trying to deal with. By 9301 * common usage AF_UNSPEC means ethernet. We'll assume 9302 * any attempt to use the SIOC?ARP ioctls is for ethernet, 9303 * for now. Our new SIOC*XARP ioctls can be used more 9304 * generally. 9305 * 9306 * If the underlying media happens to have a non 6 byte 9307 * address, arp module will fail set/get, but the del 9308 * operation will succeed. 9309 */ 9310 alength = 6; 9311 if ((ipip->ipi_cmd != SIOCDARP) && 9312 (alength != ill->ill_phys_addr_length)) { 9313 return (EINVAL); 9314 } 9315 } 9316 9317 ipaddr = sin->sin_addr.s_addr; 9318 9319 /* 9320 * IPMP ARP special handling: 9321 * 9322 * 1. Since ARP mappings must appear consistent across the group, 9323 * prohibit changing ARP mappings on the underlying interfaces. 9324 * 9325 * 2. Since ARP mappings for IPMP data addresses are maintained by 9326 * IP itself, prohibit changing them. 9327 * 9328 * 3. For proxy ARP, use a functioning hardware address in the group, 9329 * provided one exists. If one doesn't, just add the entry as-is; 9330 * ipmp_illgrp_refresh_arpent() will refresh it if things change. 
9331 */ 9332 if (IS_UNDER_IPMP(ill)) { 9333 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP) 9334 return (EPERM); 9335 } 9336 if (IS_IPMP(ill)) { 9337 ipmp_illgrp_t *illg = ill->ill_grp; 9338 9339 switch (ipip->ipi_cmd) { 9340 case SIOCSARP: 9341 case SIOCSXARP: 9342 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength); 9343 if (proxy_ill != NULL) { 9344 proxyarp = B_TRUE; 9345 if (!ipmp_ill_is_active(proxy_ill)) 9346 proxy_ill = ipmp_illgrp_next_ill(illg); 9347 if (proxy_ill != NULL) 9348 lladdr = proxy_ill->ill_phys_addr; 9349 } 9350 /* FALLTHRU */ 9351 case SIOCDARP: 9352 case SIOCDXARP: 9353 ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, 9354 ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 9355 if (ire != NULL) { 9356 ire_refrele(ire); 9357 return (EPERM); 9358 } 9359 } 9360 } 9361 9362 /* 9363 * We are going to pass up to ARP a packet chain that looks 9364 * like: 9365 * 9366 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 9367 * 9368 * Get a copy of the original IOCTL mblk to head the chain, 9369 * to be sent up (in mp1). Also get another copy to store 9370 * in the ill_pending_mp list, for matching the response 9371 * when it comes back from ARP. 9372 */ 9373 mp1 = copyb(mp); 9374 pending_mp = copymsg(mp); 9375 if (mp1 == NULL || pending_mp == NULL) { 9376 if (mp1 != NULL) 9377 freeb(mp1); 9378 if (pending_mp != NULL) 9379 inet_freemsg(pending_mp); 9380 return (ENOMEM); 9381 } 9382 9383 mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, 9384 (caddr_t)&ipaddr); 9385 if (mp2 == NULL) { 9386 freeb(mp1); 9387 inet_freemsg(pending_mp); 9388 return (ENOMEM); 9389 } 9390 /* Put together the chain. */ 9391 mp1->b_cont = mp2; 9392 mp1->b_datap->db_type = M_IOCTL; 9393 mp2->b_cont = mp; 9394 mp2->b_datap->db_type = M_DATA; 9395 9396 iocp = (struct iocblk *)mp1->b_rptr; 9397 9398 /* 9399 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an 9400 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a 9401 * cp_private field (or cp_rval on 32-bit systems) in place of the 9402 * ioc_count field; set ioc_count to be correct. 9403 */ 9404 iocp->ioc_count = MBLKL(mp1->b_cont); 9405 9406 /* 9407 * Set the proper command in the ARP message. 9408 * Convert the SIOC{G|S|D}ARP calls into our 9409 * AR_ENTRY_xxx calls. 9410 */ 9411 area = (area_t *)mp2->b_rptr; 9412 switch (iocp->ioc_cmd) { 9413 case SIOCDARP: 9414 case SIOCDXARP: 9415 /* 9416 * We defer deleting the corresponding IRE until 9417 * we return from arp. 9418 */ 9419 area->area_cmd = AR_ENTRY_DELETE; 9420 area->area_proto_mask_offset = 0; 9421 break; 9422 case SIOCGARP: 9423 case SIOCGXARP: 9424 area->area_cmd = AR_ENTRY_SQUERY; 9425 area->area_proto_mask_offset = 0; 9426 break; 9427 case SIOCSARP: 9428 case SIOCSXARP: 9429 /* 9430 * Delete the corresponding ire to make sure IP will 9431 * pick up any change from arp. 9432 */ 9433 if (!if_arp_ioctl) { 9434 (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst); 9435 } else { 9436 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 9437 if (ipif != NULL) { 9438 (void) ip_ire_clookup_and_delete(ipaddr, ipif, 9439 ipst); 9440 ipif_refrele(ipif); 9441 } 9442 } 9443 break; 9444 } 9445 iocp->ioc_cmd = area->area_cmd; 9446 9447 /* 9448 * Fill in the rest of the ARP operation fields. 9449 */ 9450 area->area_hw_addr_length = alength; 9451 bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength); 9452 9453 /* Translate the flags. 
*/ 9454 if (flags & ATF_PERM) 9455 area->area_flags |= ACE_F_PERMANENT; 9456 if (flags & ATF_PUBL) 9457 area->area_flags |= ACE_F_PUBLISH; 9458 if (flags & ATF_AUTHORITY) 9459 area->area_flags |= ACE_F_AUTHORITY; 9460 9461 /* 9462 * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it 9463 * so that IP can update ARP as the active ills in the group change. 9464 */ 9465 if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && 9466 (area->area_flags & ACE_F_PERMANENT)) { 9467 entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); 9468 9469 /* 9470 * The second part of the conditional below handles a corner 9471 * case: if this is proxy ARP and the IPMP group has no active 9472 * interfaces, we can't send the request to ARP now since it 9473 * won't be able to build an ACE. So we return success and 9474 * notify ARP about the proxy ARP entry once an interface 9475 * becomes active. 9476 */ 9477 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 9478 mp2->b_cont = NULL; 9479 inet_freemsg(mp1); 9480 inet_freemsg(pending_mp); 9481 return (entp == NULL ? ENOMEM : 0); 9482 } 9483 } 9484 9485 /* 9486 * Before sending 'mp' to ARP, we have to clear the b_next 9487 * and b_prev. Otherwise, if STREAMS encounters such a message 9488 * in freemsg() (because ARP can close at any time), it can cause 9489 * a panic. But the mi code needs the b_next and b_prev values of 9490 * mp->b_cont to complete the ioctl. So we stash them here 9491 * in pending_mp->b_cont, and restore them in ip_sioctl_iocack() 9492 * when the response comes down from ARP. 9493 */ 9494 pending_mp->b_cont->b_next = mp->b_cont->b_next; 9495 pending_mp->b_cont->b_prev = mp->b_cont->b_prev; 9496 mp->b_cont->b_next = NULL; 9497 mp->b_cont->b_prev = NULL; 9498 9499 mutex_enter(&connp->conn_lock); 9500 mutex_enter(&ill->ill_lock); 9501 /* conn has not yet started closing, hence this can't fail */ 9502 if (ipip->ipi_flags & IPI_WR) { 9503 VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9504 pending_mp, 0) != 0); 9505 } else { 9506 VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); 9507 } 9508 mutex_exit(&ill->ill_lock); 9509 mutex_exit(&connp->conn_lock); 9510 9511 /* 9512 * Up to ARP it goes. The response will come back in ip_wput() as an 9513 * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. 9514 */ 9515 putnext(ill->ill_rq, mp1); 9516 9517 /* 9518 * If we created an IPMP ARP entry, mark that we've notified ARP. 9519 */ 9520 if (entp != NULL) 9521 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 9522 9523 return (EINPROGRESS); 9524 } 9525 9526 /* 9527 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 9528 * the associated sin, and refhold and return the associated ipif via `ci'. 
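 *
 * The SIOC*XARP flavor adds a sockaddr_dl so callers can scope the
 * request to an interface by name.  A hedged userland sketch ("bge0"
 * is a made-up interface name; this code is not part of this file):
 *
 *	struct xarpreq xar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&xar.xarp_pa;
 *
 *	bzero(&xar, sizeof (xar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.1");
 *	xar.xarp_ha.sdl_family = AF_LINK;
 *	(void) strlcpy(xar.xarp_ha.sdl_data, "bge0",
 *	    sizeof (xar.xarp_ha.sdl_data));
 *	xar.xarp_ha.sdl_nlen = strlen("bge0");
 *	if (ioctl(s, SIOCGXARP, &xar) == 0)
 *		... LLADDR(&xar.xarp_ha) points at the hardware address
 *
 * With sdl_nlen of 0 the lookup below falls back to the IP address.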
9529 */ 9530 int 9531 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 9532 cmd_info_t *ci, ipsq_func_t func) 9533 { 9534 mblk_t *mp1; 9535 int err; 9536 sin_t *sin; 9537 conn_t *connp; 9538 ipif_t *ipif; 9539 ire_t *ire = NULL; 9540 ill_t *ill = NULL; 9541 boolean_t exists; 9542 ip_stack_t *ipst; 9543 struct arpreq *ar; 9544 struct xarpreq *xar; 9545 struct sockaddr_dl *sdl; 9546 9547 /* ioctl comes down on a conn */ 9548 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9549 connp = Q_TO_CONN(q); 9550 if (connp->conn_af_isv6) 9551 return (ENXIO); 9552 9553 ipst = connp->conn_netstack->netstack_ip; 9554 9555 /* Verified in ip_wput_nondata */ 9556 mp1 = mp->b_cont->b_cont; 9557 9558 if (ipip->ipi_cmd_type == XARP_CMD) { 9559 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 9560 xar = (struct xarpreq *)mp1->b_rptr; 9561 sin = (sin_t *)&xar->xarp_pa; 9562 sdl = &xar->xarp_ha; 9563 9564 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 9565 return (ENXIO); 9566 if (sdl->sdl_nlen >= LIFNAMSIZ) 9567 return (EINVAL); 9568 } else { 9569 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 9570 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 9571 ar = (struct arpreq *)mp1->b_rptr; 9572 sin = (sin_t *)&ar->arp_pa; 9573 } 9574 9575 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 9576 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 9577 B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp), 9578 mp, func, &err, ipst); 9579 if (ipif == NULL) 9580 return (err); 9581 if (ipif->ipif_id != 0) { 9582 ipif_refrele(ipif); 9583 return (ENXIO); 9584 } 9585 } else { 9586 /* 9587 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 9588 * of 0: use the IP address to find the ipif. If the IP 9589 * address is an IPMP test address, ire_ftable_lookup() will 9590 * find the wrong ill, so we first do an ipif_lookup_addr(). 9591 */ 9592 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 9593 CONNP_TO_WQ(connp), mp, func, &err, ipst); 9594 if (ipif == NULL) { 9595 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, 9596 IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, 9597 MATCH_IRE_TYPE, ipst); 9598 if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { 9599 if (ire != NULL) 9600 ire_refrele(ire); 9601 return (ENXIO); 9602 } 9603 ipif = ill->ill_ipif; 9604 ipif_refhold(ipif); 9605 ire_refrele(ire); 9606 } 9607 } 9608 9609 if (ipif->ipif_net_type != IRE_IF_RESOLVER) { 9610 ipif_refrele(ipif); 9611 return (ENXIO); 9612 } 9613 9614 ci->ci_sin = sin; 9615 ci->ci_ipif = ipif; 9616 return (0); 9617 } 9618 9619 /* 9620 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 9621 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 9622 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 9623 * up and thus an ill can join that illgrp. 9624 * 9625 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 9626 * open()/close() primarily because close() is not allowed to fail or block 9627 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 9628 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 9629 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the 9630 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 9631 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 9632 * state if I_UNLINK didn't occur. 
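 *
 * For orientation, the STREAMS calls involved look roughly like this
 * from a utility's point of view (editor's sketch; descriptors and
 * devices are illustrative only):
 *
 *	int muxfd = open("/dev/udp", O_RDWR);	(the mux)
 *	int ipfd = open("/dev/ip", O_RDWR);	(lower IP stream)
 *	int muxid = ioctl(muxfd, I_PLINK, ipfd);
 *	...
 *	(void) ioctl(muxfd, I_PUNLINK, muxid);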
9633 * 9634 * Note that for each plumb/unplumb operation, we may end up here more than 9635 * once because of the way ifconfig works. However, it's OK to link the same 9636 * illgrp more than once, or unlink an illgrp that's already unlinked. 9637 */ 9638 static int 9639 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 9640 { 9641 int err; 9642 ip_stack_t *ipst = ill->ill_ipst; 9643 9644 ASSERT(IS_IPMP(ill)); 9645 ASSERT(IAM_WRITER_ILL(ill)); 9646 9647 switch (ioccmd) { 9648 case I_LINK: 9649 return (ENOTSUP); 9650 9651 case I_PLINK: 9652 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 9653 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 9654 rw_exit(&ipst->ips_ipmp_lock); 9655 break; 9656 9657 case I_PUNLINK: 9658 /* 9659 * Require all UP ipifs be brought down prior to unlinking the 9660 * illgrp so any associated IREs (and other state) is torched. 9661 */ 9662 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 9663 return (EBUSY); 9664 9665 /* 9666 * NOTE: We hold ipmp_lock across the unlink to prevent a race 9667 * with an SIOCSLIFGROUPNAME request from an ill trying to 9668 * join this group. Specifically: ills trying to join grab 9669 * ipmp_lock and bump a "pending join" counter checked by 9670 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 9671 * joins can occur (since we have ipmp_lock). Once we drop 9672 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 9673 * find the illgrp (since we unlinked it) and will return 9674 * EAFNOSUPPORT. This will then take them back through the 9675 * IPMP meta-interface plumbing logic in ifconfig, and thus 9676 * back through I_PLINK above. 9677 */ 9678 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 9679 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 9680 rw_exit(&ipst->ips_ipmp_lock); 9681 return (err); 9682 default: 9683 break; 9684 } 9685 return (0); 9686 } 9687 9688 /* 9689 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9690 * atomically set/clear the muxids. Also complete the ioctl by acking or 9691 * naking it. Note that the code is structured such that the link type, 9692 * whether it's persistent or not, is treated equally. ifconfig(1M) and 9693 * its clones use the persistent link, while pppd(1M) and perhaps many 9694 * other daemons may use non-persistent link. When combined with some 9695 * ill_t states, linking and unlinking lower streams may be used as 9696 * indicators of dynamic re-plumbing events [see PSARC/1999/348]. 9697 */ 9698 /* ARGSUSED */ 9699 void 9700 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 9701 { 9702 mblk_t *mp1, *mp2; 9703 struct linkblk *li; 9704 struct ipmx_s *ipmxp; 9705 ill_t *ill; 9706 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; 9707 int err = 0; 9708 boolean_t entered_ipsq = B_FALSE; 9709 boolean_t islink; 9710 ip_stack_t *ipst; 9711 9712 if (CONN_Q(q)) 9713 ipst = CONNQ_TO_IPST(q); 9714 else 9715 ipst = ILLQ_TO_IPST(q); 9716 9717 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || 9718 ioccmd == I_LINK || ioccmd == I_UNLINK); 9719 9720 islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9721 9722 mp1 = mp->b_cont; /* This is the linkblk info */ 9723 li = (struct linkblk *)mp1->b_rptr; 9724 9725 /* 9726 * ARP has added this special mblk, and the utility is asking us 9727 * to perform consistency checks, and also atomically set the 9728 * muxid. Ifconfig is an example. 
It achieves this by using 9729 * /dev/arp as the mux to plink the arp stream, and pushes arp on 9730 * to /dev/udp[6] stream for use as the mux when plinking the IP 9731 * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c 9732 * and other comments in this routine for more details. 9733 */ 9734 mp2 = mp1->b_cont; /* This is added by ARP */ 9735 9736 /* 9737 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than 9738 * ifconfig which didn't push ARP on top of the dummy mux, we won't 9739 * get the special mblk above. For backward compatibility, we 9740 * request ip_sioctl_plink_ipmod() to skip the consistency checks. 9741 * The utility will use SIOCSLIFMUXID to store the muxids. This is 9742 * not atomic, and can leave the streams unplumbable if the utility 9743 * is interrupted before it does the SIOCSLIFMUXID. 9744 */ 9745 if (mp2 == NULL) { 9746 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE); 9747 if (err == EINPROGRESS) 9748 return; 9749 goto done; 9750 } 9751 9752 /* 9753 * This is an I_{P}LINK sent down by ifconfig through the ARP module; 9754 * ARP has appended this last mblk to tell us whether the lower stream 9755 * is an arp-dev stream or an IP module stream. 9756 */ 9757 ipmxp = (struct ipmx_s *)mp2->b_rptr; 9758 if (ipmxp->ipmx_arpdev_stream) { 9759 /* 9760 * The lower stream is the arp-dev stream. 9761 */ 9762 ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, 9763 q, mp, ip_sioctl_plink, &err, NULL, ipst); 9764 if (ill == NULL) { 9765 if (err == EINPROGRESS) 9766 return; 9767 err = EINVAL; 9768 goto done; 9769 } 9770 9771 if (ipsq == NULL) { 9772 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9773 NEW_OP, B_FALSE); 9774 if (ipsq == NULL) { 9775 ill_refrele(ill); 9776 return; 9777 } 9778 entered_ipsq = B_TRUE; 9779 } 9780 ASSERT(IAM_WRITER_ILL(ill)); 9781 ill_refrele(ill); 9782 9783 /* 9784 * To ensure consistency between IP and ARP, the following 9785 * LIFO scheme is used in plink/punlink. (IP first, ARP last). 9786 * This is because the muxid's are stored in the IP stream on 9787 * the ill. 9788 * 9789 * I_{P}LINK: ifconfig plinks the IP stream before plinking 9790 * the ARP stream. On an arp-dev stream, IP checks that it is 9791 * not yet plinked, and it also checks that the corresponding 9792 * IP stream is already plinked. 9793 * 9794 * I_{P}UNLINK: ifconfig punlinks the ARP stream before 9795 * punlinking the IP stream. IP does not allow punlink of the 9796 * IP stream unless the arp stream has been punlinked. 9797 */ 9798 if ((islink && 9799 (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || 9800 (!islink && ill->ill_arp_muxid != li->l_index)) { 9801 err = EINVAL; 9802 goto done; 9803 } 9804 9805 if (IS_IPMP(ill) && 9806 (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 9807 goto done; 9808 9809 ill->ill_arp_muxid = islink ? li->l_index : 0; 9810 } else { 9811 /* 9812 * The lower stream is probably an IP module stream. Do 9813 * consistency checking. 
9814 */ 9815 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE); 9816 if (err == EINPROGRESS) 9817 return; 9818 } 9819 done: 9820 if (err == 0) 9821 miocack(q, mp, 0, 0); 9822 else 9823 miocnak(q, mp, 0, err); 9824 9825 /* Conn was refheld in ip_sioctl_copyin_setup */ 9826 if (CONN_Q(q)) 9827 CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); 9828 if (entered_ipsq) 9829 ipsq_exit(ipsq); 9830 } 9831 9832 /* 9833 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to 9834 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP 9835 * module stream). If `doconsist' is set, then do the extended consistency 9836 * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here. 9837 * Returns zero on success, EINPROGRESS if the operation is still pending, or 9838 * an error code on failure. 9839 */ 9840 static int 9841 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 9842 struct linkblk *li, boolean_t doconsist) 9843 { 9844 int err = 0; 9845 ill_t *ill; 9846 queue_t *ipwq, *dwq; 9847 const char *name; 9848 struct qinit *qinfo; 9849 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 9850 boolean_t entered_ipsq = B_FALSE; 9851 9852 /* 9853 * Walk the lower stream to verify it's the IP module stream. 9854 * The IP module is identified by its name, wput function, 9855 * and non-NULL q_next. STREAMS ensures that the lower stream 9856 * (li->l_qbot) will not vanish until this ioctl completes. 9857 */ 9858 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 9859 qinfo = ipwq->q_qinfo; 9860 name = qinfo->qi_minfo->mi_idname; 9861 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 9862 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 9863 break; 9864 } 9865 } 9866 9867 /* 9868 * If this isn't an IP module stream, bail. 9869 */ 9870 if (ipwq == NULL) 9871 return (0); 9872 9873 ill = ipwq->q_ptr; 9874 ASSERT(ill != NULL); 9875 9876 if (ipsq == NULL) { 9877 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 9878 NEW_OP, B_FALSE); 9879 if (ipsq == NULL) 9880 return (EINPROGRESS); 9881 entered_ipsq = B_TRUE; 9882 } 9883 ASSERT(IAM_WRITER_ILL(ill)); 9884 9885 if (doconsist) { 9886 /* 9887 * Consistency checking requires that I_{P}LINK occurs 9888 * prior to setting ill_ip_muxid, and that I_{P}UNLINK 9889 * occurs prior to clearing ill_arp_muxid. 9890 */ 9891 if ((islink && ill->ill_ip_muxid != 0) || 9892 (!islink && ill->ill_arp_muxid != 0)) { 9893 err = EINVAL; 9894 goto done; 9895 } 9896 } 9897 9898 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 9899 goto done; 9900 9901 /* 9902 * As part of I_{P}LINKing, stash the number of downstream modules and 9903 * the read queue of the module immediately below IP in the ill. 9904 * These are used during the capability negotiation below. 9905 */ 9906 ill->ill_lmod_rq = NULL; 9907 ill->ill_lmod_cnt = 0; 9908 if (islink && ((dwq = ipwq->q_next) != NULL)) { 9909 ill->ill_lmod_rq = RD(dwq); 9910 for (; dwq != NULL; dwq = dwq->q_next) 9911 ill->ill_lmod_cnt++; 9912 } 9913 9914 if (doconsist) 9915 ill->ill_ip_muxid = islink ? li->l_index : 0; 9916 9917 /* 9918 * Mark the ipsq busy until the capability operations initiated below 9919 * complete. The PLINK/UNLINK ioctl itself completes when our caller 9920 * returns, but the capability operation may complete asynchronously 9921 * much later. 
9922 */ 9923 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 9924 /* 9925 * If there's at least one up ipif on this ill, then we're bound to 9926 * the underlying driver via DLPI. In that case, renegotiate 9927 * capabilities to account for any possible change in modules 9928 * interposed between IP and the driver. 9929 */ 9930 if (ill->ill_ipif_up_count > 0) { 9931 if (islink) 9932 ill_capability_probe(ill); 9933 else 9934 ill_capability_reset(ill, B_FALSE); 9935 } 9936 ipsq_current_finish(ipsq); 9937 done: 9938 if (entered_ipsq) 9939 ipsq_exit(ipsq); 9940 9941 return (err); 9942 } 9943 9944 /* 9945 * Search for the ioctl command in the ioctl tables and return a pointer 9946 * to the ioctl command information. The ioctl command tables are 9947 * static and fully populated at compile time. 9948 */ 9949 ip_ioctl_cmd_t * 9950 ip_sioctl_lookup(int ioc_cmd) 9951 { 9952 int index; 9953 ip_ioctl_cmd_t *ipip; 9954 ip_ioctl_cmd_t *ipip_end; 9955 9956 if (ioc_cmd == IPI_DONTCARE) 9957 return (NULL); 9958 9959 /* 9960 * Do a two-step search. First search the indexed table 9961 * based on the least significant byte of the ioctl cmd. 9962 * If we don't find a match, then search the misc table 9963 * serially. 9964 */ 9965 index = ioc_cmd & 0xFF; 9966 if (index < ip_ndx_ioctl_count) { 9967 ipip = &ip_ndx_ioctl_table[index]; 9968 if (ipip->ipi_cmd == ioc_cmd) { 9969 /* Found a match in the ndx table */ 9970 return (ipip); 9971 } 9972 } 9973 9974 /* Search the misc table */ 9975 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count]; 9976 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) { 9977 if (ipip->ipi_cmd == ioc_cmd) 9978 /* Found a match in the misc table */ 9979 return (ipip); 9980 } 9981 9982 return (NULL); 9983 } 9984 9985 /* 9986 * Wrapper function for resuming deferred ioctl processing. 9987 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER, 9988 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently. 9989 */ 9990 /* ARGSUSED */ 9991 void 9992 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, 9993 void *dummy_arg) 9994 { 9995 ip_sioctl_copyin_setup(q, mp); 9996 } 9997 9998 /* 9999 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message 10000 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle 10001 * in either I_STR or TRANSPARENT form, using the mi_copy facility. 10002 * We establish here the size of the block to be copied in. mi_copyin 10003 * arranges for this to happen, and processing continues in ip_wput with 10004 * an M_IOCDATA message. 10005 */ 10006 void 10007 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) 10008 { 10009 int copyin_size; 10010 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 10011 ip_ioctl_cmd_t *ipip; 10012 cred_t *cr; 10013 ip_stack_t *ipst; 10014 10015 if (CONN_Q(q)) 10016 ipst = CONNQ_TO_IPST(q); 10017 else 10018 ipst = ILLQ_TO_IPST(q); 10019 10020 ipip = ip_sioctl_lookup(iocp->ioc_cmd); 10021 if (ipip == NULL) { 10022 /* 10023 * The ioctl is not one we understand or own. 10024 * Pass it along to be processed down stream, 10025 * if this is a module instance of IP, else nak 10026 * the ioctl. 10027 */ 10028 if (q->q_next == NULL) { 10029 goto nak; 10030 } else { 10031 putnext(q, mp); 10032 return; 10033 } 10034 } 10035 10036 /* 10037 * If this is deferred, then we will do all the checks when we 10038 * come back. 
10039 */ 10040 if ((iocp->ioc_cmd == SIOCGDSTINFO || 10041 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 10042 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 10043 return; 10044 } 10045 10046 /* 10047 * Only allow a very small subset of IP ioctls on this stream if 10048 * IP is a module and not a driver. Allowing ioctls to be processed 10049 * in this case may cause assert failures or data corruption. 10050 * Typically, G[L]IFFLAGS and SLIFNAME/IF_UNITSEL are among the few 10051 * ioctls allowed on an IP module stream, after which this stream 10052 * normally becomes a multiplexor (at which time the stream head 10053 * will fail all ioctls). 10054 */ 10055 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 10056 if (ipip->ipi_flags & IPI_PASS_DOWN) { 10057 /* 10058 * Pass common Streams ioctls which the IP 10059 * module does not own or consume along to 10060 * be processed down stream. 10061 */ 10062 putnext(q, mp); 10063 return; 10064 } else { 10065 goto nak; 10066 } 10067 } 10068 10069 /* Make sure we have ioctl data to process. */ 10070 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 10071 goto nak; 10072 10073 /* 10074 * Prefer dblk credential over ioctl credential; some synthesized 10075 * ioctls have kcred set because there's no way to crhold() 10076 * a credential in some contexts. (ioc_cr is not crfree()d by 10077 * the framework; the caller of the ioctl needs to hold the reference 10078 * for the duration of the call). 10079 */ 10080 cr = msg_getcred(mp, NULL); 10081 if (cr == NULL) 10082 cr = iocp->ioc_cr; 10083 10084 /* Make sure normal users don't send down privileged ioctls */ 10085 if ((ipip->ipi_flags & IPI_PRIV) && 10086 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 10087 /* We checked the privilege earlier but log it here */ 10088 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 10089 return; 10090 } 10091 10092 /* 10093 * The ioctl command tables can only encode fixed length 10094 * ioctl data. If the length is variable, the table will 10095 * encode the length as zero. Such special cases are handled 10096 * below in the switch. 10097 */ 10098 if (ipip->ipi_copyin_size != 0) { 10099 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 10100 return; 10101 } 10102 10103 switch (iocp->ioc_cmd) { 10104 case O_SIOCGIFCONF: 10105 case SIOCGIFCONF: 10106 /* 10107 * This IOCTL is hilarious. See comments in 10108 * ip_sioctl_get_ifconf for the story. 
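 *
 * In short: I_STR callers pass the buffer size in ioc_count, while
 * TRANSPARENT callers pass a struct ifconf that must be copied in
 * first.  A typical userland sequence (editor's sketch, not code from
 * this file) is:
 *
 *	struct ifconf ifc;
 *	int n;
 *
 *	(void) ioctl(s, SIOCGIFNUM, &n);
 *	ifc.ifc_len = n * sizeof (struct ifreq);
 *	ifc.ifc_buf = malloc(ifc.ifc_len);
 *	if (ioctl(s, SIOCGIFCONF, &ifc) == 0)
 *		... ifc.ifc_req then holds one struct ifreq per
 *		... plumbed IPv4 interface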
10109 */ 10110 if (iocp->ioc_count == TRANSPARENT) 10111 copyin_size = SIZEOF_STRUCT(ifconf, 10112 iocp->ioc_flag); 10113 else 10114 copyin_size = iocp->ioc_count; 10115 mi_copyin(q, mp, NULL, copyin_size); 10116 return; 10117 10118 case O_SIOCGLIFCONF: 10119 case SIOCGLIFCONF: 10120 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 10121 mi_copyin(q, mp, NULL, copyin_size); 10122 return; 10123 10124 case SIOCGLIFSRCOF: 10125 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 10126 mi_copyin(q, mp, NULL, copyin_size); 10127 return; 10128 case SIOCGIP6ADDRPOLICY: 10129 ip_sioctl_ip6addrpolicy(q, mp); 10130 ip6_asp_table_refrele(ipst); 10131 return; 10132 10133 case SIOCSIP6ADDRPOLICY: 10134 ip_sioctl_ip6addrpolicy(q, mp); 10135 return; 10136 10137 case SIOCGDSTINFO: 10138 ip_sioctl_dstinfo(q, mp); 10139 ip6_asp_table_refrele(ipst); 10140 return; 10141 10142 case I_PLINK: 10143 case I_PUNLINK: 10144 case I_LINK: 10145 case I_UNLINK: 10146 /* 10147 * We treat non-persistent link similarly as the persistent 10148 * link case, in terms of plumbing/unplumbing, as well as 10149 * dynamic re-plumbing events indicator. See comments 10150 * in ip_sioctl_plink() for more. 10151 * 10152 * Request can be enqueued in the 'ipsq' while waiting 10153 * to become exclusive. So bump up the conn ref. 10154 */ 10155 if (CONN_Q(q)) 10156 CONN_INC_REF(Q_TO_CONN(q)); 10157 ip_sioctl_plink(NULL, q, mp, NULL); 10158 return; 10159 10160 case ND_GET: 10161 case ND_SET: 10162 /* 10163 * Use of the nd table requires holding the reader lock. 10164 * Modifying the nd table thru nd_load/nd_unload requires 10165 * the writer lock. 10166 */ 10167 rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER); 10168 if (nd_getset(q, ipst->ips_ip_g_nd, mp)) { 10169 rw_exit(&ipst->ips_ip_g_nd_lock); 10170 10171 if (iocp->ioc_error) 10172 iocp->ioc_count = 0; 10173 mp->b_datap->db_type = M_IOCACK; 10174 qreply(q, mp); 10175 return; 10176 } 10177 rw_exit(&ipst->ips_ip_g_nd_lock); 10178 /* 10179 * We don't understand this subioctl of ND_GET / ND_SET. 
10180 * Maybe intended for some driver / module below us 10181 */ 10182 if (q->q_next) { 10183 putnext(q, mp); 10184 } else { 10185 iocp->ioc_error = ENOENT; 10186 mp->b_datap->db_type = M_IOCNAK; 10187 iocp->ioc_count = 0; 10188 qreply(q, mp); 10189 } 10190 return; 10191 10192 case IP_IOCTL: 10193 ip_wput_ioctl(q, mp); 10194 return; 10195 default: 10196 cmn_err(CE_PANIC, "should not happen "); 10197 } 10198 nak: 10199 if (mp->b_cont != NULL) { 10200 freemsg(mp->b_cont); 10201 mp->b_cont = NULL; 10202 } 10203 iocp->ioc_error = EINVAL; 10204 mp->b_datap->db_type = M_IOCNAK; 10205 iocp->ioc_count = 0; 10206 qreply(q, mp); 10207 } 10208 10209 /* ip_wput hands off ARP IOCTL responses to us */ 10210 /* ARGSUSED3 */ 10211 void 10212 ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 10213 { 10214 struct arpreq *ar; 10215 struct xarpreq *xar; 10216 area_t *area; 10217 mblk_t *area_mp; 10218 struct iocblk *iocp; 10219 mblk_t *orig_ioc_mp, *tmp; 10220 struct iocblk *orig_iocp; 10221 ill_t *ill; 10222 conn_t *connp = NULL; 10223 mblk_t *pending_mp; 10224 int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; 10225 int *flagsp; 10226 char *storage = NULL; 10227 sin_t *sin; 10228 ipaddr_t addr; 10229 int err; 10230 ip_stack_t *ipst; 10231 10232 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); 10233 ill = q->q_ptr; 10234 ASSERT(ill != NULL); 10235 ipst = ill->ill_ipst; 10236 10237 /* 10238 * We should get back from ARP a packet chain that looks like: 10239 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK 10240 */ 10241 if (!(area_mp = mp->b_cont) || 10242 (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || 10243 !(orig_ioc_mp = area_mp->b_cont) || 10244 !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { 10245 freemsg(mp); 10246 return; 10247 } 10248 10249 orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; 10250 10251 tmp = (orig_ioc_mp->b_cont)->b_cont; 10252 if ((orig_iocp->ioc_cmd == SIOCGXARP) || 10253 (orig_iocp->ioc_cmd == SIOCSXARP) || 10254 (orig_iocp->ioc_cmd == SIOCDXARP)) { 10255 x_arp_ioctl = B_TRUE; 10256 xar = (struct xarpreq *)tmp->b_rptr; 10257 sin = (sin_t *)&xar->xarp_pa; 10258 flagsp = &xar->xarp_flags; 10259 storage = xar->xarp_ha.sdl_data; 10260 if (xar->xarp_ha.sdl_nlen != 0) 10261 ifx_arp_ioctl = B_TRUE; 10262 } else { 10263 ar = (struct arpreq *)tmp->b_rptr; 10264 sin = (sin_t *)&ar->arp_pa; 10265 flagsp = &ar->arp_flags; 10266 storage = ar->arp_ha.sa_data; 10267 } 10268 10269 iocp = (struct iocblk *)mp->b_rptr; 10270 10271 /* 10272 * Find the pending message; if we're exclusive, it'll be on our IPSQ. 10273 * Otherwise, we can find it from our ioc_id. 10274 */ 10275 if (ipsq != NULL) 10276 pending_mp = ipsq_pending_mp_get(ipsq, &connp); 10277 else 10278 pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); 10279 10280 if (pending_mp == NULL) { 10281 ASSERT(connp == NULL); 10282 inet_freemsg(mp); 10283 return; 10284 } 10285 ASSERT(connp != NULL); 10286 q = CONNP_TO_WQ(connp); 10287 10288 /* Uncouple the internally generated IOCTL from the original one */ 10289 area = (area_t *)area_mp->b_rptr; 10290 area_mp->b_cont = NULL; 10291 10292 /* 10293 * Restore the b_next and b_prev used by mi code. This is needed 10294 * to complete the ioctl using mi* functions. We stored them in 10295 * the pending mp prior to sending the request to ARP. 
10296 */ 10297 orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; 10298 orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; 10299 inet_freemsg(pending_mp); 10300 10301 /* 10302 * We're done if there was an error or if this is not an SIOCG{X}ARP. 10303 * Catch the case where there is an IRE_CACHE but no entry in the 10304 * arp table. 10305 */ 10306 addr = sin->sin_addr.s_addr; 10307 if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { 10308 ire_t *ire; 10309 dl_unitdata_req_t *dlup; 10310 mblk_t *llmp; 10311 int addr_len; 10312 ill_t *ipsqill = NULL; 10313 10314 if (ifx_arp_ioctl) { 10315 /* 10316 * There's no need to look up the ill, since 10317 * we've already done that when we started 10318 * processing the ioctl and sent the message 10319 * to ARP on that ill. So use the ill that 10320 * is stored in q->q_ptr. 10321 */ 10322 ipsqill = ill; 10323 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10324 ipsqill->ill_ipif, ALL_ZONES, 10325 NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); 10326 } else { 10327 ire = ire_ctable_lookup(addr, 0, IRE_CACHE, 10328 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 10329 if (ire != NULL) 10330 ipsqill = ire_to_ill(ire); 10331 } 10332 10333 if ((x_arp_ioctl) && (ipsqill != NULL)) 10334 storage += ill_xarp_info(&xar->xarp_ha, ipsqill); 10335 10336 if (ire != NULL) { 10337 /* 10338 * Since the ire obtained from cachetable is used for 10339 * mac addr copying below, treat an incomplete ire as 10340 * if we never found it. 10341 */ 10342 if (ire->ire_nce != NULL && 10343 ire->ire_nce->nce_state != ND_REACHABLE) { 10344 ire_refrele(ire); 10345 ire = NULL; 10346 ipsqill = NULL; 10347 goto errack; 10348 } 10349 *flagsp = ATF_INUSE; 10350 llmp = (ire->ire_nce != NULL ? 10351 ire->ire_nce->nce_res_mp : NULL); 10352 if (llmp != NULL && ipsqill != NULL) { 10353 uchar_t *macaddr; 10354 10355 addr_len = ipsqill->ill_phys_addr_length; 10356 if (x_arp_ioctl && ((addr_len + 10357 ipsqill->ill_name_length) > 10358 sizeof (xar->xarp_ha.sdl_data))) { 10359 ire_refrele(ire); 10360 freemsg(mp); 10361 ip_ioctl_finish(q, orig_ioc_mp, 10362 EINVAL, NO_COPYOUT, ipsq); 10363 return; 10364 } 10365 *flagsp |= ATF_COM; 10366 dlup = (dl_unitdata_req_t *)llmp->b_rptr; 10367 if (ipsqill->ill_sap_length < 0) 10368 macaddr = llmp->b_rptr + 10369 dlup->dl_dest_addr_offset; 10370 else 10371 macaddr = llmp->b_rptr + 10372 dlup->dl_dest_addr_offset + 10373 ipsqill->ill_sap_length; 10374 /* 10375 * For SIOCGARP, MAC address length 10376 * validation has already been done 10377 * before the ioctl was issued to ARP to 10378 * allow it to progress only on 6-byte- 10379 * addressable (ethernet-like) media. Thus 10380 * the mac address copying cannot overwrite 10381 * the sa_data area below. 10382 */ 10383 bcopy(macaddr, storage, addr_len); 10384 } 10385 /* Ditch the internal IOCTL. */ 10386 freemsg(mp); 10387 ire_refrele(ire); 10388 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); 10389 return; 10390 } 10391 } 10392 10393 /* 10394 * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE 10395 * on the IPMP meta-interface, ensure any ARP entries added in 10396 * ip_sioctl_arp() are deleted. 
10397 */ 10398 if (IS_IPMP(ill) && 10399 ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || 10400 ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { 10401 ipmp_illgrp_t *illg = ill->ill_grp; 10402 ipmp_arpent_t *entp; 10403 10404 if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) 10405 ipmp_illgrp_destroy_arpent(illg, entp); 10406 } 10407 10408 /* 10409 * Delete the coresponding IRE_CACHE if any. 10410 * Reset the error if there was one (in case there was no entry 10411 * in arp.) 10412 */ 10413 if (iocp->ioc_cmd == AR_ENTRY_DELETE) { 10414 ipif_t *ipintf = NULL; 10415 10416 if (ifx_arp_ioctl) { 10417 /* 10418 * There's no need to lookup the ill, since 10419 * we've already done that when we started 10420 * processing the ioctl and sent the message 10421 * to ARP on that ill. So use the ill that 10422 * is stored in q->q_ptr. 10423 */ 10424 ipintf = ill->ill_ipif; 10425 } 10426 if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) { 10427 /* 10428 * The address in "addr" may be an entry for a 10429 * router. If that's true, then any off-net 10430 * IRE_CACHE entries that go through the router 10431 * with address "addr" must be clobbered. Use 10432 * ire_walk to achieve this goal. 10433 */ 10434 if (ifx_arp_ioctl) 10435 ire_walk_ill_v4(MATCH_IRE_ILL, 0, 10436 ire_delete_cache_gw, (char *)&addr, ill); 10437 else 10438 ire_walk_v4(ire_delete_cache_gw, (char *)&addr, 10439 ALL_ZONES, ipst); 10440 iocp->ioc_error = 0; 10441 } 10442 } 10443 errack: 10444 if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { 10445 err = iocp->ioc_error; 10446 freemsg(mp); 10447 ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); 10448 return; 10449 } 10450 10451 /* 10452 * Completion of an SIOCG{X}ARP. Translate the information from 10453 * the area_t into the struct {x}arpreq. 10454 */ 10455 if (x_arp_ioctl) { 10456 storage += ill_xarp_info(&xar->xarp_ha, ill); 10457 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 10458 sizeof (xar->xarp_ha.sdl_data)) { 10459 freemsg(mp); 10460 ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, 10461 ipsq); 10462 return; 10463 } 10464 } 10465 *flagsp = ATF_INUSE; 10466 if (area->area_flags & ACE_F_PERMANENT) 10467 *flagsp |= ATF_PERM; 10468 if (area->area_flags & ACE_F_PUBLISH) 10469 *flagsp |= ATF_PUBL; 10470 if (area->area_flags & ACE_F_AUTHORITY) 10471 *flagsp |= ATF_AUTHORITY; 10472 if (area->area_hw_addr_length != 0) { 10473 *flagsp |= ATF_COM; 10474 /* 10475 * For SIOCGARP, MAC address length validation has 10476 * already been done before the ioctl was issued to ARP 10477 * to allow it to progress only on 6 byte addressable 10478 * (ethernet like) media. Thus the mac address copying 10479 * can not overwrite the sa_data area below. 10480 */ 10481 bcopy((char *)area + area->area_hw_addr_offset, 10482 storage, area->area_hw_addr_length); 10483 } 10484 10485 /* Ditch the internal IOCTL. */ 10486 freemsg(mp); 10487 /* Complete the original. */ 10488 ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); 10489 } 10490 10491 /* 10492 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 10493 * interface) create the next available logical interface for this 10494 * physical interface. 10495 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10496 * ipif with the specified name. 10497 * 10498 * If the address family is not AF_UNSPEC then set the address as well. 
10499 * 10500 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10501 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer. 10502 * 10503 * Executed as a writer on the ill. 10504 * So no lock is needed to traverse the ipif chain, or examine the 10505 * phyint flags. 10506 */ 10507 /* ARGSUSED */ 10508 int 10509 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10510 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10511 { 10512 mblk_t *mp1; 10513 struct lifreq *lifr; 10514 boolean_t isv6; 10515 boolean_t exists; 10516 char *name; 10517 char *endp; 10518 char *cp; 10519 int namelen; 10520 ipif_t *ipif; 10521 long id; 10522 ipsq_t *ipsq; 10523 ill_t *ill; 10524 sin_t *sin; 10525 int err = 0; 10526 boolean_t found_sep = B_FALSE; 10527 conn_t *connp; 10528 zoneid_t zoneid; 10529 ip_stack_t *ipst = CONNQ_TO_IPST(q); 10530 10531 ASSERT(q->q_next == NULL); 10532 ip1dbg(("ip_sioctl_addif\n")); 10533 /* Existence of mp1 has been checked in ip_wput_nondata */ 10534 mp1 = mp->b_cont->b_cont; 10535 /* 10536 * Null-terminate the string to protect against buffer 10537 * overrun. The string was generated by user code and may not 10538 * be trusted. 10539 */ 10540 lifr = (struct lifreq *)mp1->b_rptr; 10541 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10542 name = lifr->lifr_name; 10543 ASSERT(CONN_Q(q)); 10544 connp = Q_TO_CONN(q); 10545 isv6 = connp->conn_af_isv6; 10546 zoneid = connp->conn_zoneid; 10547 namelen = mi_strlen(name); 10548 if (namelen == 0) 10549 return (EINVAL); 10550 10551 exists = B_FALSE; 10552 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10553 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10554 /* 10555 * Allow creating lo0 using SIOCLIFADDIF. There can't be 10556 * any other writer thread, so we can pass NULL below 10557 * for the last four args to ipif_lookup_on_name. 10558 */ 10559 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 10560 &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); 10561 /* Prevent any further action */ 10562 if (ipif == NULL) { 10563 return (ENOBUFS); 10564 } else if (!exists) { 10565 /* We created the ipif now and as writer */ 10566 ipif_refrele(ipif); 10567 return (0); 10568 } else { 10569 ill = ipif->ipif_ill; 10570 ill_refhold(ill); 10571 ipif_refrele(ipif); 10572 } 10573 } else { 10574 /* Look for a colon in the name. */ 10575 endp = &name[namelen]; 10576 for (cp = endp; --cp > name; ) { 10577 if (*cp == IPIF_SEPARATOR_CHAR) { 10578 found_sep = B_TRUE; 10579 /* 10580 * Reject any non-decimal aliases for plumbing 10581 * of logical interfaces. Aliases with leading 10582 * zeroes are also rejected as they introduce 10583 * ambiguity in the naming of the interfaces. 10584 * Comparing with "0" takes care of all such 10585 * cases. 
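 * For example, "bge0:7" would be accepted here while "bge0:07" and
 * "bge0:007" are rejected, since all three would otherwise name the
 * same logical interface.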
10586 */ 10587 if ((strncmp("0", cp+1, 1)) == 0) 10588 return (EINVAL); 10589 10590 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10591 id <= 0 || *endp != '\0') { 10592 return (EINVAL); 10593 } 10594 *cp = '\0'; 10595 break; 10596 } 10597 } 10598 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10599 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); 10600 if (found_sep) 10601 *cp = IPIF_SEPARATOR_CHAR; 10602 if (ill == NULL) 10603 return (err); 10604 } 10605 10606 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10607 B_TRUE); 10608 10609 /* 10610 * Release the refhold due to the lookup, now that we are excl 10611 * or we are just returning 10612 */ 10613 ill_refrele(ill); 10614 10615 if (ipsq == NULL) 10616 return (EINPROGRESS); 10617 10618 /* We are now exclusive on the IPSQ */ 10619 ASSERT(IAM_WRITER_ILL(ill)); 10620 10621 if (found_sep) { 10622 /* Now see if there is an IPIF with this unit number. */ 10623 for (ipif = ill->ill_ipif; ipif != NULL; 10624 ipif = ipif->ipif_next) { 10625 if (ipif->ipif_id == id) { 10626 err = EEXIST; 10627 goto done; 10628 } 10629 } 10630 } 10631 10632 /* 10633 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10634 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 10635 * instead. 10636 */ 10637 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 10638 B_TRUE, B_TRUE)) == NULL) { 10639 err = ENOBUFS; 10640 goto done; 10641 } 10642 10643 /* Return created name with ioctl */ 10644 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10645 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10646 ip1dbg(("created %s\n", lifr->lifr_name)); 10647 10648 /* Set address */ 10649 sin = (sin_t *)&lifr->lifr_addr; 10650 if (sin->sin_family != AF_UNSPEC) { 10651 err = ip_sioctl_addr(ipif, sin, q, mp, 10652 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10653 } 10654 10655 done: 10656 ipsq_exit(ipsq); 10657 return (err); 10658 } 10659 10660 /* 10661 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10662 * interface) delete it based on the IP address (on this physical interface). 10663 * Otherwise delete it based on the ipif_id. 10664 * Also, special handling to allow a removeif of lo0. 10665 */ 10666 /* ARGSUSED */ 10667 int 10668 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10669 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10670 { 10671 conn_t *connp; 10672 ill_t *ill = ipif->ipif_ill; 10673 boolean_t success; 10674 ip_stack_t *ipst; 10675 10676 ipst = CONNQ_TO_IPST(q); 10677 10678 ASSERT(q->q_next == NULL); 10679 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10680 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10681 ASSERT(IAM_WRITER_IPIF(ipif)); 10682 10683 connp = Q_TO_CONN(q); 10684 /* 10685 * Special case for unplumbing lo0 (the loopback physical interface). 10686 * If unplumbing lo0, the incoming address structure has been 10687 * initialized to all zeros. When unplumbing lo0, all its logical 10688 * interfaces must be removed too. 10689 * 10690 * Note that this interface may be called to remove a specific 10691 * loopback logical interface (eg, lo0:1). But in that case 10692 * ipif->ipif_id != 0 so that the code path for that case is the 10693 * same as any other interface (meaning it skips the code directly 10694 * below). 10695 */ 10696 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10697 if (sin->sin_family == AF_UNSPEC && 10698 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10699 /* 10700 * Mark it condemned. No new ref. 
will be made to ill. 10701 */ 10702 mutex_enter(&ill->ill_lock); 10703 ill->ill_state_flags |= ILL_CONDEMNED; 10704 for (ipif = ill->ill_ipif; ipif != NULL; 10705 ipif = ipif->ipif_next) { 10706 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10707 } 10708 mutex_exit(&ill->ill_lock); 10709 10710 ipif = ill->ill_ipif; 10711 /* unplumb the loopback interface */ 10712 ill_delete(ill); 10713 mutex_enter(&connp->conn_lock); 10714 mutex_enter(&ill->ill_lock); 10715 10716 /* Are any references to this ill active */ 10717 if (ill_is_freeable(ill)) { 10718 mutex_exit(&ill->ill_lock); 10719 mutex_exit(&connp->conn_lock); 10720 ill_delete_tail(ill); 10721 mi_free(ill); 10722 return (0); 10723 } 10724 success = ipsq_pending_mp_add(connp, ipif, 10725 CONNP_TO_WQ(connp), mp, ILL_FREE); 10726 mutex_exit(&connp->conn_lock); 10727 mutex_exit(&ill->ill_lock); 10728 if (success) 10729 return (EINPROGRESS); 10730 else 10731 return (EINTR); 10732 } 10733 } 10734 10735 if (ipif->ipif_id == 0) { 10736 ipsq_t *ipsq; 10737 10738 /* Find based on address */ 10739 if (ipif->ipif_isv6) { 10740 sin6_t *sin6; 10741 10742 if (sin->sin_family != AF_INET6) 10743 return (EAFNOSUPPORT); 10744 10745 sin6 = (sin6_t *)sin; 10746 /* We are a writer, so we should be able to lookup */ 10747 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 10748 ipst); 10749 } else { 10750 if (sin->sin_family != AF_INET) 10751 return (EAFNOSUPPORT); 10752 10753 /* We are a writer, so we should be able to lookup */ 10754 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 10755 ipst); 10756 } 10757 if (ipif == NULL) { 10758 return (EADDRNOTAVAIL); 10759 } 10760 10761 /* 10762 * It is possible for a user to send an SIOCLIFREMOVEIF with 10763 * lifr_name of the physical interface but with an ip address 10764 * lifr_addr of a logical interface plumbed over it. 10765 * So update ipx_current_ipif now that ipif points to the 10766 * correct one. 10767 */ 10768 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 10769 ipsq->ipsq_xop->ipx_current_ipif = ipif; 10770 10771 /* This is a writer */ 10772 ipif_refrele(ipif); 10773 } 10774 10775 /* 10776 * Can not delete instance zero since it is tied to the ill. 10777 */ 10778 if (ipif->ipif_id == 0) 10779 return (EBUSY); 10780 10781 mutex_enter(&ill->ill_lock); 10782 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10783 mutex_exit(&ill->ill_lock); 10784 10785 ipif_free(ipif); 10786 10787 mutex_enter(&connp->conn_lock); 10788 mutex_enter(&ill->ill_lock); 10789 10790 /* Are any references to this ipif active */ 10791 if (ipif_is_freeable(ipif)) { 10792 mutex_exit(&ill->ill_lock); 10793 mutex_exit(&connp->conn_lock); 10794 ipif_non_duplicate(ipif); 10795 ipif_down_tail(ipif); 10796 ipif_free_tail(ipif); /* frees ipif */ 10797 return (0); 10798 } 10799 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10800 IPIF_FREE); 10801 mutex_exit(&ill->ill_lock); 10802 mutex_exit(&connp->conn_lock); 10803 if (success) 10804 return (EINPROGRESS); 10805 else 10806 return (EINTR); 10807 } 10808 10809 /* 10810 * Restart the removeif ioctl. The refcnt has gone down to 0. 10811 * The ipif is already condemned. So can't find it thru lookups. 
10812 */ 10813 /* ARGSUSED */ 10814 int 10815 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10816 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10817 { 10818 ill_t *ill = ipif->ipif_ill; 10819 10820 ASSERT(IAM_WRITER_IPIF(ipif)); 10821 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10822 10823 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10824 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10825 10826 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10827 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 10828 ill_delete_tail(ill); 10829 mi_free(ill); 10830 return (0); 10831 } 10832 10833 ipif_non_duplicate(ipif); 10834 ipif_down_tail(ipif); 10835 ipif_free_tail(ipif); 10836 10837 ILL_UNMARK_CHANGING(ill); 10838 return (0); 10839 } 10840 10841 /* 10842 * Set the local interface address. 10843 * Allow an address of all zero when the interface is down. 10844 */ 10845 /* ARGSUSED */ 10846 int 10847 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10848 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10849 { 10850 int err = 0; 10851 in6_addr_t v6addr; 10852 boolean_t need_up = B_FALSE; 10853 10854 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 10855 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10856 10857 ASSERT(IAM_WRITER_IPIF(ipif)); 10858 10859 if (ipif->ipif_isv6) { 10860 sin6_t *sin6; 10861 ill_t *ill; 10862 phyint_t *phyi; 10863 10864 if (sin->sin_family != AF_INET6) 10865 return (EAFNOSUPPORT); 10866 10867 sin6 = (sin6_t *)sin; 10868 v6addr = sin6->sin6_addr; 10869 ill = ipif->ipif_ill; 10870 phyi = ill->ill_phyint; 10871 10872 /* 10873 * Enforce that true multicast interfaces have a link-local 10874 * address for logical unit 0. 10875 */ 10876 if (ipif->ipif_id == 0 && 10877 (ill->ill_flags & ILLF_MULTICAST) && 10878 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 10879 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 10880 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 10881 return (EADDRNOTAVAIL); 10882 } 10883 10884 /* 10885 * up interfaces shouldn't have the unspecified address 10886 * unless they also have the IPIF_NOLOCAL flags set and 10887 * have a subnet assigned. 10888 */ 10889 if ((ipif->ipif_flags & IPIF_UP) && 10890 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 10891 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 10892 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 10893 return (EADDRNOTAVAIL); 10894 } 10895 10896 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10897 return (EADDRNOTAVAIL); 10898 } else { 10899 ipaddr_t addr; 10900 10901 if (sin->sin_family != AF_INET) 10902 return (EAFNOSUPPORT); 10903 10904 addr = sin->sin_addr.s_addr; 10905 10906 /* Allow 0 as the local address. */ 10907 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10908 return (EADDRNOTAVAIL); 10909 10910 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10911 } 10912 10913 /* 10914 * Even if there is no change we redo things just to rerun 10915 * ipif_set_default. 10916 */ 10917 if (ipif->ipif_flags & IPIF_UP) { 10918 /* 10919 * Setting a new local address, make sure 10920 * we have net and subnet bcast ire's for 10921 * the old address if we need them. 10922 */ 10923 if (!ipif->ipif_isv6) 10924 ipif_check_bcast_ires(ipif); 10925 /* 10926 * If the interface is already marked up, 10927 * we call ipif_down which will take care 10928 * of ditching any IREs that have been set 10929 * up based on the old interface address. 
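 * The overall sequence for changing the address on an up interface
 * is thus: ipif_logical_down(), ipif_down_tail(), install the new
 * address in ip_sioctl_addr_tail(), then ipif_up() (driven by the
 * need_up flag below).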
10930 */ 10931 err = ipif_logical_down(ipif, q, mp); 10932 if (err == EINPROGRESS) 10933 return (err); 10934 ipif_down_tail(ipif); 10935 need_up = 1; 10936 } 10937 10938 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 10939 return (err); 10940 } 10941 10942 int 10943 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10944 boolean_t need_up) 10945 { 10946 in6_addr_t v6addr; 10947 in6_addr_t ov6addr; 10948 ipaddr_t addr; 10949 sin6_t *sin6; 10950 int sinlen; 10951 int err = 0; 10952 ill_t *ill = ipif->ipif_ill; 10953 boolean_t need_dl_down; 10954 boolean_t need_arp_down; 10955 struct iocblk *iocp; 10956 10957 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 10958 10959 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 10960 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10961 ASSERT(IAM_WRITER_IPIF(ipif)); 10962 10963 /* Must cancel any pending timer before taking the ill_lock */ 10964 if (ipif->ipif_recovery_id != 0) 10965 (void) untimeout(ipif->ipif_recovery_id); 10966 ipif->ipif_recovery_id = 0; 10967 10968 if (ipif->ipif_isv6) { 10969 sin6 = (sin6_t *)sin; 10970 v6addr = sin6->sin6_addr; 10971 sinlen = sizeof (struct sockaddr_in6); 10972 } else { 10973 addr = sin->sin_addr.s_addr; 10974 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10975 sinlen = sizeof (struct sockaddr_in); 10976 } 10977 mutex_enter(&ill->ill_lock); 10978 ov6addr = ipif->ipif_v6lcl_addr; 10979 ipif->ipif_v6lcl_addr = v6addr; 10980 sctp_update_ipif_addr(ipif, ov6addr); 10981 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 10982 ipif->ipif_v6src_addr = ipv6_all_zeros; 10983 } else { 10984 ipif->ipif_v6src_addr = v6addr; 10985 } 10986 ipif->ipif_addr_ready = 0; 10987 10988 /* 10989 * If the interface was previously marked as a duplicate, then since 10990 * we've now got a "new" address, it should no longer be considered a 10991 * duplicate -- even if the "new" address is the same as the old one. 10992 * Note that if all ipifs are down, we may have a pending ARP down 10993 * event to handle. This is because we want to recover from duplicates 10994 * and thus delay tearing down ARP until the duplicates have been 10995 * removed or disabled. 10996 */ 10997 need_dl_down = need_arp_down = B_FALSE; 10998 if (ipif->ipif_flags & IPIF_DUPLICATE) { 10999 need_arp_down = !need_up; 11000 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11001 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11002 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11003 need_dl_down = B_TRUE; 11004 } 11005 } 11006 11007 ipif_set_default(ipif); 11008 11009 /* 11010 * If we've just manually set the IPv6 link-local address (0th ipif), 11011 * tag the ill so that future updates to the interface ID don't result 11012 * in this address getting automatically reconfigured from under the 11013 * administrator. 11014 */ 11015 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 11016 ill->ill_manual_linklocal = 1; 11017 11018 /* 11019 * When publishing an interface address change event, we only notify 11020 * the event listeners of the new address. It is assumed that if they 11021 * actively care about the addresses assigned that they will have 11022 * already discovered the previous address assigned (if there was one.) 11023 * 11024 * Don't attach nic event message for SIOCLIFADDIF ioctl. 
11025 */ 11026 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 11027 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 11028 NE_ADDRESS_CHANGE, sin, sinlen); 11029 } 11030 11031 mutex_exit(&ill->ill_lock); 11032 11033 if (need_up) { 11034 /* 11035 * Now bring the interface back up. If this 11036 * is the only IPIF for the ILL, ipif_up 11037 * will have to re-bind to the device, so 11038 * we may get back EINPROGRESS, in which 11039 * case, this IOCTL will get completed in 11040 * ip_rput_dlpi when we see the DL_BIND_ACK. 11041 */ 11042 err = ipif_up(ipif, q, mp); 11043 } 11044 11045 if (need_dl_down) 11046 ill_dl_down(ill); 11047 if (need_arp_down) 11048 ipif_resolver_down(ipif); 11049 11050 return (err); 11051 } 11052 11053 /* 11054 * Restart entry point to restart the address set operation after the 11055 * refcounts have dropped to zero. 11056 */ 11057 /* ARGSUSED */ 11058 int 11059 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11060 ip_ioctl_cmd_t *ipip, void *ifreq) 11061 { 11062 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11063 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11064 ASSERT(IAM_WRITER_IPIF(ipif)); 11065 ipif_down_tail(ipif); 11066 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11067 } 11068 11069 /* ARGSUSED */ 11070 int 11071 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11072 ip_ioctl_cmd_t *ipip, void *if_req) 11073 { 11074 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11075 struct lifreq *lifr = (struct lifreq *)if_req; 11076 11077 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11078 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11079 /* 11080 * The net mask and address can't change since we have a 11081 * reference to the ipif. So no lock is necessary. 11082 */ 11083 if (ipif->ipif_isv6) { 11084 *sin6 = sin6_null; 11085 sin6->sin6_family = AF_INET6; 11086 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11087 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11088 lifr->lifr_addrlen = 11089 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11090 } else { 11091 *sin = sin_null; 11092 sin->sin_family = AF_INET; 11093 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11094 if (ipip->ipi_cmd_type == LIF_CMD) { 11095 lifr->lifr_addrlen = 11096 ip_mask_to_plen(ipif->ipif_net_mask); 11097 } 11098 } 11099 return (0); 11100 } 11101 11102 /* 11103 * Set the destination address for a pt-pt interface. 
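 *
 * A hedged userland sketch (editor's illustration; the interface name
 * and address are made up):
 *
 *	struct lifreq lifr;
 *	struct sockaddr_in *sin =
 *	    (struct sockaddr_in *)&lifr.lifr_dstaddr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "ipdptp0",
 *	    sizeof (lifr.lifr_name));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.2");
 *	(void) ioctl(s, SIOCSLIFDSTADDR, &lifr);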
11104 */ 11105 /* ARGSUSED */ 11106 int 11107 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11108 ip_ioctl_cmd_t *ipip, void *if_req) 11109 { 11110 int err = 0; 11111 in6_addr_t v6addr; 11112 boolean_t need_up = B_FALSE; 11113 11114 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11115 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11116 ASSERT(IAM_WRITER_IPIF(ipif)); 11117 11118 if (ipif->ipif_isv6) { 11119 sin6_t *sin6; 11120 11121 if (sin->sin_family != AF_INET6) 11122 return (EAFNOSUPPORT); 11123 11124 sin6 = (sin6_t *)sin; 11125 v6addr = sin6->sin6_addr; 11126 11127 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11128 return (EADDRNOTAVAIL); 11129 } else { 11130 ipaddr_t addr; 11131 11132 if (sin->sin_family != AF_INET) 11133 return (EAFNOSUPPORT); 11134 11135 addr = sin->sin_addr.s_addr; 11136 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11137 return (EADDRNOTAVAIL); 11138 11139 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11140 } 11141 11142 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11143 return (0); /* No change */ 11144 11145 if (ipif->ipif_flags & IPIF_UP) { 11146 /* 11147 * If the interface is already marked up, 11148 * we call ipif_down which will take care 11149 * of ditching any IREs that have been set 11150 * up based on the old pp dst address. 11151 */ 11152 err = ipif_logical_down(ipif, q, mp); 11153 if (err == EINPROGRESS) 11154 return (err); 11155 ipif_down_tail(ipif); 11156 need_up = B_TRUE; 11157 } 11158 /* 11159 * could return EINPROGRESS. If so ioctl will complete in 11160 * ip_rput_dlpi_writer 11161 */ 11162 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 11163 return (err); 11164 } 11165 11166 static int 11167 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11168 boolean_t need_up) 11169 { 11170 in6_addr_t v6addr; 11171 ill_t *ill = ipif->ipif_ill; 11172 int err = 0; 11173 boolean_t need_dl_down; 11174 boolean_t need_arp_down; 11175 11176 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 11177 ipif->ipif_id, (void *)ipif)); 11178 11179 /* Must cancel any pending timer before taking the ill_lock */ 11180 if (ipif->ipif_recovery_id != 0) 11181 (void) untimeout(ipif->ipif_recovery_id); 11182 ipif->ipif_recovery_id = 0; 11183 11184 if (ipif->ipif_isv6) { 11185 sin6_t *sin6; 11186 11187 sin6 = (sin6_t *)sin; 11188 v6addr = sin6->sin6_addr; 11189 } else { 11190 ipaddr_t addr; 11191 11192 addr = sin->sin_addr.s_addr; 11193 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11194 } 11195 mutex_enter(&ill->ill_lock); 11196 /* Set point to point destination address. */ 11197 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11198 /* 11199 * Allow this as a means of creating logical 11200 * pt-pt interfaces on top of e.g. an Ethernet. 11201 * XXX Undocumented HACK for testing. 11202 * pt-pt interfaces are created with NUD disabled. 11203 */ 11204 ipif->ipif_flags |= IPIF_POINTOPOINT; 11205 ipif->ipif_flags &= ~IPIF_BROADCAST; 11206 if (ipif->ipif_isv6) 11207 ill->ill_flags |= ILLF_NONUD; 11208 } 11209 11210 /* 11211 * If the interface was previously marked as a duplicate, then since 11212 * we've now got a "new" address, it should no longer be considered a 11213 * duplicate -- even if the "new" address is the same as the old one. 11214 * Note that if all ipifs are down, we may have a pending ARP down 11215 * event to handle. 
11216 */ 11217 need_dl_down = need_arp_down = B_FALSE; 11218 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11219 need_arp_down = !need_up; 11220 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11221 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11222 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11223 need_dl_down = B_TRUE; 11224 } 11225 } 11226 11227 /* Set the new address. */ 11228 ipif->ipif_v6pp_dst_addr = v6addr; 11229 /* Make sure subnet tracks pp_dst */ 11230 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 11231 mutex_exit(&ill->ill_lock); 11232 11233 if (need_up) { 11234 /* 11235 * Now bring the interface back up. If this 11236 * is the only IPIF for the ILL, ipif_up 11237 * will have to re-bind to the device, so 11238 * we may get back EINPROGRESS, in which 11239 * case, this IOCTL will get completed in 11240 * ip_rput_dlpi when we see the DL_BIND_ACK. 11241 */ 11242 err = ipif_up(ipif, q, mp); 11243 } 11244 11245 if (need_dl_down) 11246 ill_dl_down(ill); 11247 if (need_arp_down) 11248 ipif_resolver_down(ipif); 11249 11250 return (err); 11251 } 11252 11253 /* 11254 * Restart entry point to restart the destination address set operation 11255 * after the refcounts have dropped to zero. 11256 */ 11257 /* ARGSUSED */ 11258 int 11259 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11260 ip_ioctl_cmd_t *ipip, void *ifreq) 11261 { 11262 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 11263 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11264 ipif_down_tail(ipif); 11265 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 11266 } 11267 11268 /* ARGSUSED */ 11269 int 11270 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11271 ip_ioctl_cmd_t *ipip, void *if_req) 11272 { 11273 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11274 11275 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 11276 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11277 /* 11278 * Get point to point destination address. The addresses can't 11279 * change since we hold a reference to the ipif. 11280 */ 11281 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 11282 return (EADDRNOTAVAIL); 11283 11284 if (ipif->ipif_isv6) { 11285 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11286 *sin6 = sin6_null; 11287 sin6->sin6_family = AF_INET6; 11288 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 11289 } else { 11290 *sin = sin_null; 11291 sin->sin_family = AF_INET; 11292 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 11293 } 11294 return (0); 11295 } 11296 11297 /* 11298 * Set interface flags. Many flags require special handling (e.g., 11299 * bringing the interface down); see below for details. 11300 * 11301 * NOTE: We really don't enforce that ipif_id zero should be used 11302 * for setting any flags other than IFF_LOGINT_FLAGS. This 11303 * is because applications generally do SIOCGLIFFLAGS and 11304 * OR in the new flags (that affect the logical interface) and do a 11305 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 11306 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 11307 * flags that will be turned on - is correct with respect to 11308 * ipif_id 0. For backward compatibility reasons, it is not done.
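 *
 * For illustration, that read-modify-write sequence is roughly the
 * following userland sketch (hypothetical interface name and flag,
 * error handling omitted):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	(void) strlcpy(lifr.lifr_name, "hme0:1", sizeof (lifr.lifr_name));
 *	(void) ioctl(s, SIOCGLIFFLAGS, &lifr);	-- fetch all 64 flag bits
 *	lifr.lifr_flags |= IFF_PRIVATE;		-- OR in the new flag
 *	(void) ioctl(s, SIOCSLIFFLAGS, &lifr);	-- store everything back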
11309 */ 11310 /* ARGSUSED */ 11311 int 11312 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11313 ip_ioctl_cmd_t *ipip, void *if_req) 11314 { 11315 uint64_t turn_on; 11316 uint64_t turn_off; 11317 int err = 0; 11318 phyint_t *phyi; 11319 ill_t *ill; 11320 uint64_t intf_flags, cantchange_flags; 11321 boolean_t phyint_flags_modified = B_FALSE; 11322 uint64_t flags; 11323 struct ifreq *ifr; 11324 struct lifreq *lifr; 11325 boolean_t set_linklocal = B_FALSE; 11326 boolean_t zero_source = B_FALSE; 11327 11328 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 11329 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11330 11331 ASSERT(IAM_WRITER_IPIF(ipif)); 11332 11333 ill = ipif->ipif_ill; 11334 phyi = ill->ill_phyint; 11335 11336 if (ipip->ipi_cmd_type == IF_CMD) { 11337 ifr = (struct ifreq *)if_req; 11338 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 11339 } else { 11340 lifr = (struct lifreq *)if_req; 11341 flags = lifr->lifr_flags; 11342 } 11343 11344 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11345 11346 /* 11347 * Have the flags been set correctly until now? 11348 */ 11349 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11350 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11351 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11352 /* 11353 * Compare the new flags to the old, and partition 11354 * into those coming on and those going off. 11355 * For the 16 bit command keep the bits above bit 16 unchanged. 11356 */ 11357 if (ipip->ipi_cmd == SIOCSIFFLAGS) 11358 flags |= intf_flags & ~0xFFFF; 11359 11360 /* 11361 * Explicitly fail attempts to change flags that are always invalid on 11362 * an IPMP meta-interface. 11363 */ 11364 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 11365 return (EINVAL); 11366 11367 /* 11368 * Check which flags will change; silently ignore flags which userland 11369 * is not allowed to control. (Because these flags may change between 11370 * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's 11371 * control, we need to silently ignore them rather than fail.) 11372 */ 11373 cantchange_flags = IFF_CANTCHANGE; 11374 if (IS_IPMP(ill)) 11375 cantchange_flags |= IFF_IPMP_CANTCHANGE; 11376 11377 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 11378 if (turn_on == 0) 11379 return (0); /* No change */ 11380 11381 turn_off = intf_flags & turn_on; 11382 turn_on ^= turn_off; 11383 11384 /* 11385 * All test addresses must be IFF_DEPRECATED (to ensure source address 11386 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 11387 * allow it to be turned off. 11388 */ 11389 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 11390 (turn_on|intf_flags) & IFF_NOFAILOVER) 11391 return (EINVAL); 11392 11393 if (turn_on & IFF_NOFAILOVER) { 11394 turn_on |= IFF_DEPRECATED; 11395 flags |= IFF_DEPRECATED; 11396 } 11397 11398 /* 11399 * On underlying interfaces, only allow applications to manage test 11400 * addresses -- otherwise, they may get confused when the address 11401 * moves as part of being brought up. Likewise, prevent an 11402 * application-managed test address from being converted to a data 11403 * address. To prevent migration of administratively up addresses in 11404 * the kernel, we don't allow them to be converted either. 
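 *
 * To illustrate the two checks below with concrete cases (a sketch,
 * using appflags = IFF_DHCPRUNNING | IFF_ADDRCONF as defined below):
 *
 *	turning on IFF_DHCPRUNNING while IFF_NOFAILOVER is clear
 *		-> EINVAL (a data address may not become app-managed)
 *	turning off IFF_NOFAILOVER while IFF_UP is still set
 *		-> EINVAL (an up test address may not become a data
 *		   address)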
11405 */ 11406 if (IS_UNDER_IPMP(ill)) { 11407 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 11408 11409 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 11410 return (EINVAL); 11411 11412 if ((turn_off & IFF_NOFAILOVER) && 11413 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 11414 return (EINVAL); 11415 } 11416 11417 /* 11418 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 11419 * IPv6 interfaces. 11420 */ 11421 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 11422 return (EINVAL); 11423 11424 /* 11425 * cannot turn off IFF_NOXMIT on VNI interfaces. 11426 */ 11427 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 11428 return (EINVAL); 11429 11430 /* 11431 * Don't allow the IFF_ROUTER flag to be turned on on loopback 11432 * interfaces. It makes no sense in that context. 11433 */ 11434 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11435 return (EINVAL); 11436 11437 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11438 zero_source = B_TRUE; 11439 11440 /* 11441 * For IPv6 ipif_id 0, don't allow the interface to be up without 11442 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11443 * If the link local address isn't set, and can be set, it will get 11444 * set later on in this function. 11445 */ 11446 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11447 (flags & IFF_UP) && !zero_source && 11448 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11449 if (ipif_cant_setlinklocal(ipif)) 11450 return (EINVAL); 11451 set_linklocal = B_TRUE; 11452 } 11453 11454 /* 11455 * If we modify physical interface flags, we'll potentially need to 11456 * send up two routing socket messages for the changes (one for the 11457 * IPv4 ill, and another for the IPv6 ill). Note that here. 11458 */ 11459 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11460 phyint_flags_modified = B_TRUE; 11461 11462 /* 11463 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 11464 * (otherwise, we'd immediately use them, defeating standby). Also, 11465 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 11466 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 11467 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We 11468 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 11469 * will not be honored. 11470 */ 11471 if (turn_on & PHYI_STANDBY) { 11472 /* 11473 * No need to grab ill_g_usesrc_lock here; see the 11474 * synchronization notes in ip.c. 11475 */ 11476 if (ill->ill_usesrc_grp_next != NULL || 11477 intf_flags & PHYI_INACTIVE) 11478 return (EINVAL); 11479 if (!(flags & PHYI_FAILED)) { 11480 flags |= PHYI_INACTIVE; 11481 turn_on |= PHYI_INACTIVE; 11482 } 11483 } 11484 11485 if (turn_off & PHYI_STANDBY) { 11486 flags &= ~PHYI_INACTIVE; 11487 turn_off |= PHYI_INACTIVE; 11488 } 11489 11490 /* 11491 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 11492 * would end up on. 11493 */ 11494 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 11495 (PHYI_FAILED | PHYI_INACTIVE)) 11496 return (EINVAL); 11497 11498 /* 11499 * If ILLF_ROUTER changes, we need to change the ip forwarding 11500 * status of the interface. 11501 */ 11502 if ((turn_on | turn_off) & ILLF_ROUTER) 11503 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 11504 11505 /* 11506 * If the interface is not UP and we are not going to 11507 * bring it UP, record the flags and return. When the 11508 * interface comes UP later, the right actions will be 11509 * taken. 
11510 */ 11511 if (!(ipif->ipif_flags & IPIF_UP) && 11512 !(turn_on & IPIF_UP)) { 11513 /* Record new flags in their respective places. */ 11514 mutex_enter(&ill->ill_lock); 11515 mutex_enter(&ill->ill_phyint->phyint_lock); 11516 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11517 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11518 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11519 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11520 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11521 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11522 mutex_exit(&ill->ill_lock); 11523 mutex_exit(&ill->ill_phyint->phyint_lock); 11524 11525 /* 11526 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the 11527 * same to the kernel: if any of them has been set by 11528 * userland, the interface cannot be used for data traffic. 11529 */ 11530 if ((turn_on|turn_off) & 11531 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 11532 ASSERT(!IS_IPMP(ill)); 11533 /* 11534 * It's possible the ill is part of an "anonymous" 11535 * IPMP group rather than a real group. In that case, 11536 * there are no other interfaces in the group and thus 11537 * no need to call ipmp_phyint_refresh_active(). 11538 */ 11539 if (IS_UNDER_IPMP(ill)) 11540 ipmp_phyint_refresh_active(phyi); 11541 } 11542 11543 if (phyint_flags_modified) { 11544 if (phyi->phyint_illv4 != NULL) { 11545 ip_rts_ifmsg(phyi->phyint_illv4-> 11546 ill_ipif, RTSQ_DEFAULT); 11547 } 11548 if (phyi->phyint_illv6 != NULL) { 11549 ip_rts_ifmsg(phyi->phyint_illv6-> 11550 ill_ipif, RTSQ_DEFAULT); 11551 } 11552 } 11553 return (0); 11554 } else if (set_linklocal || zero_source) { 11555 mutex_enter(&ill->ill_lock); 11556 if (set_linklocal) 11557 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 11558 if (zero_source) 11559 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 11560 mutex_exit(&ill->ill_lock); 11561 } 11562 11563 /* 11564 * Disallow IPv6 interfaces coming up that have the unspecified address, 11565 * or point-to-point interfaces with an unspecified destination. We do 11566 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 11567 * have a subnet assigned, which is how in.ndpd currently manages its 11568 * onlink prefix list when no addresses are configured with those 11569 * prefixes. 11570 */ 11571 if (ipif->ipif_isv6 && 11572 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 11573 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 11574 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 11575 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11576 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 11577 return (EINVAL); 11578 } 11579 11580 /* 11581 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 11582 * from being brought up. 11583 */ 11584 if (!ipif->ipif_isv6 && 11585 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11586 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 11587 return (EINVAL); 11588 } 11589 11590 /* 11591 * The only flag changes that we currently take specific action on are 11592 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, 11593 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and 11594 * IPIF_NOFAILOVER. This is done by bringing the ipif down, changing the 11595 * flags and bringing it back up again. For IPIF_NOFAILOVER, the act 11596 * of bringing it back up will trigger the address to be moved.
11597 */ 11598 if ((turn_on|turn_off) & 11599 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 11600 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 11601 IPIF_NOFAILOVER)) { 11602 /* 11603 * Taking this ipif down, make sure we have 11604 * valid net and subnet bcast ire's for other 11605 * logical interfaces, if we need them. 11606 */ 11607 if (!ipif->ipif_isv6) 11608 ipif_check_bcast_ires(ipif); 11609 11610 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11611 !(turn_off & IPIF_UP)) { 11612 if (ipif->ipif_flags & IPIF_UP) 11613 ill->ill_logical_down = 1; 11614 turn_on &= ~IPIF_UP; 11615 } 11616 err = ipif_down(ipif, q, mp); 11617 ip1dbg(("ipif_down returns %d err ", err)); 11618 if (err == EINPROGRESS) 11619 return (err); 11620 ipif_down_tail(ipif); 11621 } 11622 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 11623 } 11624 11625 static int 11626 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 11627 { 11628 ill_t *ill; 11629 phyint_t *phyi; 11630 uint64_t turn_on, turn_off; 11631 uint64_t intf_flags, cantchange_flags; 11632 boolean_t phyint_flags_modified = B_FALSE; 11633 int err = 0; 11634 boolean_t set_linklocal = B_FALSE; 11635 boolean_t zero_source = B_FALSE; 11636 11637 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 11638 ipif->ipif_ill->ill_name, ipif->ipif_id)); 11639 11640 ASSERT(IAM_WRITER_IPIF(ipif)); 11641 11642 ill = ipif->ipif_ill; 11643 phyi = ill->ill_phyint; 11644 11645 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11646 cantchange_flags = IFF_CANTCHANGE | IFF_UP; 11647 if (IS_IPMP(ill)) 11648 cantchange_flags |= IFF_IPMP_CANTCHANGE; 11649 11650 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 11651 turn_off = intf_flags & turn_on; 11652 turn_on ^= turn_off; 11653 11654 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11655 phyint_flags_modified = B_TRUE; 11656 11657 /* 11658 * Now we change the flags. Track current value of 11659 * other flags in their respective places. 11660 */ 11661 mutex_enter(&ill->ill_lock); 11662 mutex_enter(&phyi->phyint_lock); 11663 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11664 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11665 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11666 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11667 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11668 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11669 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 11670 set_linklocal = B_TRUE; 11671 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 11672 } 11673 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 11674 zero_source = B_TRUE; 11675 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 11676 } 11677 mutex_exit(&ill->ill_lock); 11678 mutex_exit(&phyi->phyint_lock); 11679 11680 if (set_linklocal) 11681 (void) ipif_setlinklocal(ipif); 11682 11683 if (zero_source) 11684 ipif->ipif_v6src_addr = ipv6_all_zeros; 11685 else 11686 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 11687 11688 /* 11689 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 11690 * the kernel: if any of them has been set by userland, the interface 11691 * cannot be used for data traffic. 11692 */ 11693 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 11694 ASSERT(!IS_IPMP(ill)); 11695 /* 11696 * It's possible the ill is part of an "anonymous" IPMP group 11697 * rather than a real group. In that case, there are no other 11698 * interfaces in the group and thus no need for us to call 11699 * ipmp_phyint_refresh_active(). 
11700 */ 11701 if (IS_UNDER_IPMP(ill)) 11702 ipmp_phyint_refresh_active(phyi); 11703 } 11704 11705 if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 11706 /* 11707 * XXX ipif_up really does not know whether the phyint flags 11708 * were modified or not. So, it sends up information in 11709 * only one routing socket message. As we don't bring up 11710 * the interface and also set PHYI_ flags simultaneously, 11711 * it should be okay. 11712 */ 11713 err = ipif_up(ipif, q, mp); 11714 } else { 11715 /* 11716 * Make sure routing socket sees all changes to the flags. 11717 * ipif_up_done* handles this when we use ipif_up. 11718 */ 11719 if (phyint_flags_modified) { 11720 if (phyi->phyint_illv4 != NULL) { 11721 ip_rts_ifmsg(phyi->phyint_illv4-> 11722 ill_ipif, RTSQ_DEFAULT); 11723 } 11724 if (phyi->phyint_illv6 != NULL) { 11725 ip_rts_ifmsg(phyi->phyint_illv6-> 11726 ill_ipif, RTSQ_DEFAULT); 11727 } 11728 } else { 11729 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 11730 } 11731 /* 11732 * Update the flags in SCTP's IPIF list, ipif_up() will do 11733 * this in need_up case. 11734 */ 11735 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11736 } 11737 return (err); 11738 } 11739 11740 /* 11741 * Restart the flags operation now that the refcounts have dropped to zero. 11742 */ 11743 /* ARGSUSED */ 11744 int 11745 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11746 ip_ioctl_cmd_t *ipip, void *if_req) 11747 { 11748 uint64_t flags; 11749 struct ifreq *ifr = if_req; 11750 struct lifreq *lifr = if_req; 11751 11752 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 11753 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11754 11755 ipif_down_tail(ipif); 11756 if (ipip->ipi_cmd_type == IF_CMD) { 11757 /* cast to uint16_t prevents unwanted sign extension */ 11758 flags = (uint16_t)ifr->ifr_flags; 11759 } else { 11760 flags = lifr->lifr_flags; 11761 } 11762 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 11763 } 11764 11765 /* 11766 * Can operate on either a module or a driver queue. 11767 */ 11768 /* ARGSUSED */ 11769 int 11770 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11771 ip_ioctl_cmd_t *ipip, void *if_req) 11772 { 11773 /* 11774 * Have the flags been set correctly till now? 11775 */ 11776 ill_t *ill = ipif->ipif_ill; 11777 phyint_t *phyi = ill->ill_phyint; 11778 11779 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 11780 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11781 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11782 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11783 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11784 11785 /* 11786 * Need a lock since some flags can be set even when there are 11787 * references to the ipif. 11788 */ 11789 mutex_enter(&ill->ill_lock); 11790 if (ipip->ipi_cmd_type == IF_CMD) { 11791 struct ifreq *ifr = (struct ifreq *)if_req; 11792 11793 /* Get interface flags (low 16 only). */ 11794 ifr->ifr_flags = ((ipif->ipif_flags | 11795 ill->ill_flags | phyi->phyint_flags) & 0xffff); 11796 } else { 11797 struct lifreq *lifr = (struct lifreq *)if_req; 11798 11799 /* Get interface flags.
*/ 11800 lifr->lifr_flags = ipif->ipif_flags | 11801 ill->ill_flags | phyi->phyint_flags; 11802 } 11803 mutex_exit(&ill->ill_lock); 11804 return (0); 11805 } 11806 11807 /* ARGSUSED */ 11808 int 11809 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11810 ip_ioctl_cmd_t *ipip, void *if_req) 11811 { 11812 int mtu; 11813 int ip_min_mtu; 11814 struct ifreq *ifr; 11815 struct lifreq *lifr; 11816 ire_t *ire; 11817 ip_stack_t *ipst; 11818 11819 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 11820 ipif->ipif_id, (void *)ipif)); 11821 if (ipip->ipi_cmd_type == IF_CMD) { 11822 ifr = (struct ifreq *)if_req; 11823 mtu = ifr->ifr_metric; 11824 } else { 11825 lifr = (struct lifreq *)if_req; 11826 mtu = lifr->lifr_mtu; 11827 } 11828 11829 if (ipif->ipif_isv6) 11830 ip_min_mtu = IPV6_MIN_MTU; 11831 else 11832 ip_min_mtu = IP_MIN_MTU; 11833 11834 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 11835 return (EINVAL); 11836 11837 /* 11838 * Change the MTU size in all relevant ire's. 11839 * Mtu change Vs. new ire creation - protocol below. 11840 * First change ipif_mtu and the ire_max_frag of the 11841 * interface ire. Then do an ire walk and change the 11842 * ire_max_frag of all affected ires. During ire_add 11843 * under the bucket lock, set the ire_max_frag of the 11844 * new ire being created from the ipif/ire from which 11845 * it is being derived. If an mtu change happens after 11846 * the ire is added, the new ire will be cleaned up. 11847 * Conversely if the mtu change happens before the ire 11848 * is added, ire_add will see the new value of the mtu. 11849 */ 11850 ipif->ipif_mtu = mtu; 11851 ipif->ipif_flags |= IPIF_FIXEDMTU; 11852 11853 if (ipif->ipif_isv6) 11854 ire = ipif_to_ire_v6(ipif); 11855 else 11856 ire = ipif_to_ire(ipif); 11857 if (ire != NULL) { 11858 ire->ire_max_frag = ipif->ipif_mtu; 11859 ire_refrele(ire); 11860 } 11861 ipst = ipif->ipif_ill->ill_ipst; 11862 if (ipif->ipif_flags & IPIF_UP) { 11863 if (ipif->ipif_isv6) 11864 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES, 11865 ipst); 11866 else 11867 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES, 11868 ipst); 11869 } 11870 /* Update the MTU in SCTP's list */ 11871 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11872 return (0); 11873 } 11874 11875 /* Get interface MTU. */ 11876 /* ARGSUSED */ 11877 int 11878 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11879 ip_ioctl_cmd_t *ipip, void *if_req) 11880 { 11881 struct ifreq *ifr; 11882 struct lifreq *lifr; 11883 11884 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 11885 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11886 if (ipip->ipi_cmd_type == IF_CMD) { 11887 ifr = (struct ifreq *)if_req; 11888 ifr->ifr_metric = ipif->ipif_mtu; 11889 } else { 11890 lifr = (struct lifreq *)if_req; 11891 lifr->lifr_mtu = ipif->ipif_mtu; 11892 } 11893 return (0); 11894 } 11895 11896 /* Set interface broadcast address. 
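 *
 * For illustration, a minimal userland sketch of setting the broadcast
 * address via SIOCSLIFBRDADDR (hypothetical interface name, no error
 * handling):
 *
 *	struct lifreq lifr;
 *	struct sockaddr_in *sin;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	sin = (struct sockaddr_in *)&lifr.lifr_broadaddr;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("10.0.0.255");
 *	(void) ioctl(s, SIOCSLIFBRDADDR, &lifr);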
*/ 11897 /* ARGSUSED2 */ 11898 int 11899 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11900 ip_ioctl_cmd_t *ipip, void *if_req) 11901 { 11902 ipaddr_t addr; 11903 ire_t *ire; 11904 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11905 11906 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, 11907 ipif->ipif_id)); 11908 11909 ASSERT(IAM_WRITER_IPIF(ipif)); 11910 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 11911 return (EADDRNOTAVAIL); 11912 11913 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 11914 11915 if (sin->sin_family != AF_INET) 11916 return (EAFNOSUPPORT); 11917 11918 addr = sin->sin_addr.s_addr; 11919 if (ipif->ipif_flags & IPIF_UP) { 11920 /* 11921 * If we are already up, make sure the new 11922 * broadcast address makes sense. If it does, 11923 * there should be an IRE for it already. 11924 * Don't match on ipif, only on the ill 11925 * since we are sharing these now. 11926 */ 11927 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, 11928 ipif, ALL_ZONES, NULL, 11929 (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst); 11930 if (ire == NULL) { 11931 return (EINVAL); 11932 } else { 11933 ire_refrele(ire); 11934 } 11935 } 11936 /* 11937 * Changing the broadcast addr for this ipif. 11938 * Make sure we have valid net and subnet bcast 11939 * ire's for other logical interfaces, if needed. 11940 */ 11941 if (addr != ipif->ipif_brd_addr) 11942 ipif_check_bcast_ires(ipif); 11943 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 11944 return (0); 11945 } 11946 11947 /* Get interface broadcast address. */ 11948 /* ARGSUSED */ 11949 int 11950 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11951 ip_ioctl_cmd_t *ipip, void *if_req) 11952 { 11953 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 11954 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11955 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 11956 return (EADDRNOTAVAIL); 11957 11958 /* IPIF_BROADCAST not possible with IPv6 */ 11959 ASSERT(!ipif->ipif_isv6); 11960 *sin = sin_null; 11961 sin->sin_family = AF_INET; 11962 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 11963 return (0); 11964 } 11965 11966 /* 11967 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 11968 */ 11969 /* ARGSUSED */ 11970 int 11971 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11972 ip_ioctl_cmd_t *ipip, void *if_req) 11973 { 11974 int err = 0; 11975 in6_addr_t v6mask; 11976 11977 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 11978 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11979 11980 ASSERT(IAM_WRITER_IPIF(ipif)); 11981 11982 if (ipif->ipif_isv6) { 11983 sin6_t *sin6; 11984 11985 if (sin->sin_family != AF_INET6) 11986 return (EAFNOSUPPORT); 11987 11988 sin6 = (sin6_t *)sin; 11989 v6mask = sin6->sin6_addr; 11990 } else { 11991 ipaddr_t mask; 11992 11993 if (sin->sin_family != AF_INET) 11994 return (EAFNOSUPPORT); 11995 11996 mask = sin->sin_addr.s_addr; 11997 V4MASK_TO_V6(mask, v6mask); 11998 } 11999 12000 /* 12001 * No big deal if the interface isn't already up, or the mask 12002 * isn't really changing, or this is pt-pt. 
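 *
 * For example (an illustrative sketch): if the current local address
 * is 10.1.2.3 and the new mask is 255.255.255.0, the fast path below
 * just records the mask and rederives the subnet as
 * 10.1.2.3 & 255.255.255.0 == 10.1.2.0, with no down/up of the ipif.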
12003 */ 12004 if (!(ipif->ipif_flags & IPIF_UP) || 12005 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12006 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12007 ipif->ipif_v6net_mask = v6mask; 12008 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12009 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12010 ipif->ipif_v6net_mask, 12011 ipif->ipif_v6subnet); 12012 } 12013 return (0); 12014 } 12015 /* 12016 * Make sure we have valid net and subnet broadcast ire's 12017 * for the old netmask, if needed by other logical interfaces. 12018 */ 12019 if (!ipif->ipif_isv6) 12020 ipif_check_bcast_ires(ipif); 12021 12022 err = ipif_logical_down(ipif, q, mp); 12023 if (err == EINPROGRESS) 12024 return (err); 12025 ipif_down_tail(ipif); 12026 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12027 return (err); 12028 } 12029 12030 static int 12031 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12032 { 12033 in6_addr_t v6mask; 12034 int err = 0; 12035 12036 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12037 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12038 12039 if (ipif->ipif_isv6) { 12040 sin6_t *sin6; 12041 12042 sin6 = (sin6_t *)sin; 12043 v6mask = sin6->sin6_addr; 12044 } else { 12045 ipaddr_t mask; 12046 12047 mask = sin->sin_addr.s_addr; 12048 V4MASK_TO_V6(mask, v6mask); 12049 } 12050 12051 ipif->ipif_v6net_mask = v6mask; 12052 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12053 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12054 ipif->ipif_v6subnet); 12055 } 12056 err = ipif_up(ipif, q, mp); 12057 12058 if (err == 0 || err == EINPROGRESS) { 12059 /* 12060 * The interface must be DL_BOUND if this packet has to 12061 * go out on the wire. Since we only go through a logical 12062 * down and are bound with the driver during an internal 12063 * down/up that is satisfied. 12064 */ 12065 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12066 /* Potentially broadcast an address mask reply. */ 12067 ipif_mask_reply(ipif); 12068 } 12069 } 12070 return (err); 12071 } 12072 12073 /* ARGSUSED */ 12074 int 12075 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12076 ip_ioctl_cmd_t *ipip, void *if_req) 12077 { 12078 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12079 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12080 ipif_down_tail(ipif); 12081 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12082 } 12083 12084 /* Get interface net mask. */ 12085 /* ARGSUSED */ 12086 int 12087 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12088 ip_ioctl_cmd_t *ipip, void *if_req) 12089 { 12090 struct lifreq *lifr = (struct lifreq *)if_req; 12091 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12092 12093 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12094 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12095 12096 /* 12097 * net mask can't change since we have a reference to the ipif. 
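 *
 * For illustration: the getter below reports the mask both as an
 * address and as a prefix length, e.g. a v4 mask of 255.255.255.0
 * comes back with lifr_addrlen == 24, and a v6 mask of
 * ffff:ffff:ffff:ffff:: with lifr_addrlen == 64.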
12098 */ 12099 if (ipif->ipif_isv6) { 12100 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12101 *sin6 = sin6_null; 12102 sin6->sin6_family = AF_INET6; 12103 sin6->sin6_addr = ipif->ipif_v6net_mask; 12104 lifr->lifr_addrlen = 12105 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12106 } else { 12107 *sin = sin_null; 12108 sin->sin_family = AF_INET; 12109 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12110 if (ipip->ipi_cmd_type == LIF_CMD) { 12111 lifr->lifr_addrlen = 12112 ip_mask_to_plen(ipif->ipif_net_mask); 12113 } 12114 } 12115 return (0); 12116 } 12117 12118 /* ARGSUSED */ 12119 int 12120 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12121 ip_ioctl_cmd_t *ipip, void *if_req) 12122 { 12123 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12124 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12125 12126 /* 12127 * Since no applications should ever be setting metrics on underlying 12128 * interfaces, we explicitly fail to smoke 'em out. 12129 */ 12130 if (IS_UNDER_IPMP(ipif->ipif_ill)) 12131 return (EINVAL); 12132 12133 /* 12134 * Set interface metric. We don't use this for 12135 * anything but we keep track of it in case it is 12136 * important to routing applications or such. 12137 */ 12138 if (ipip->ipi_cmd_type == IF_CMD) { 12139 struct ifreq *ifr; 12140 12141 ifr = (struct ifreq *)if_req; 12142 ipif->ipif_metric = ifr->ifr_metric; 12143 } else { 12144 struct lifreq *lifr; 12145 12146 lifr = (struct lifreq *)if_req; 12147 ipif->ipif_metric = lifr->lifr_metric; 12148 } 12149 return (0); 12150 } 12151 12152 /* ARGSUSED */ 12153 int 12154 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12155 ip_ioctl_cmd_t *ipip, void *if_req) 12156 { 12157 /* Get interface metric. */ 12158 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12159 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12160 12161 if (ipip->ipi_cmd_type == IF_CMD) { 12162 struct ifreq *ifr; 12163 12164 ifr = (struct ifreq *)if_req; 12165 ifr->ifr_metric = ipif->ipif_metric; 12166 } else { 12167 struct lifreq *lifr; 12168 12169 lifr = (struct lifreq *)if_req; 12170 lifr->lifr_metric = ipif->ipif_metric; 12171 } 12172 12173 return (0); 12174 } 12175 12176 /* ARGSUSED */ 12177 int 12178 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12179 ip_ioctl_cmd_t *ipip, void *if_req) 12180 { 12181 12182 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12183 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12184 /* 12185 * Set the muxid returned from I_PLINK. 12186 */ 12187 if (ipip->ipi_cmd_type == IF_CMD) { 12188 struct ifreq *ifr = (struct ifreq *)if_req; 12189 12190 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12191 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12192 } else { 12193 struct lifreq *lifr = (struct lifreq *)if_req; 12194 12195 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12196 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12197 } 12198 return (0); 12199 } 12200 12201 /* ARGSUSED */ 12202 int 12203 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12204 ip_ioctl_cmd_t *ipip, void *if_req) 12205 { 12206 12207 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12208 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12209 /* 12210 * Get the muxid saved in ill for I_PUNLINK. 
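 *
 * For illustration, a simplified sketch of what ifconfig-style
 * unplumb code does with these values (hypothetical descriptor names,
 * no error handling; the real sequence lives in userland):
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "hme0", sizeof (lifr.lifr_name));
 *	(void) ioctl(ip_fd, SIOCGLIFMUXID, &lifr);
 *	(void) ioctl(ip_fd, I_PUNLINK, lifr.lifr_arp_muxid);
 *	(void) ioctl(ip_fd, I_PUNLINK, lifr.lifr_ip_muxid);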
12211 */ 12212 if (ipip->ipi_cmd_type == IF_CMD) { 12213 struct ifreq *ifr = (struct ifreq *)if_req; 12214 12215 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12216 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12217 } else { 12218 struct lifreq *lifr = (struct lifreq *)if_req; 12219 12220 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12221 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12222 } 12223 return (0); 12224 } 12225 12226 /* 12227 * Set the subnet prefix. Does not modify the broadcast address. 12228 */ 12229 /* ARGSUSED */ 12230 int 12231 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12232 ip_ioctl_cmd_t *ipip, void *if_req) 12233 { 12234 int err = 0; 12235 in6_addr_t v6addr; 12236 in6_addr_t v6mask; 12237 boolean_t need_up = B_FALSE; 12238 int addrlen; 12239 12240 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12241 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12242 12243 ASSERT(IAM_WRITER_IPIF(ipif)); 12244 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12245 12246 if (ipif->ipif_isv6) { 12247 sin6_t *sin6; 12248 12249 if (sin->sin_family != AF_INET6) 12250 return (EAFNOSUPPORT); 12251 12252 sin6 = (sin6_t *)sin; 12253 v6addr = sin6->sin6_addr; 12254 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12255 return (EADDRNOTAVAIL); 12256 } else { 12257 ipaddr_t addr; 12258 12259 if (sin->sin_family != AF_INET) 12260 return (EAFNOSUPPORT); 12261 12262 addr = sin->sin_addr.s_addr; 12263 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12264 return (EADDRNOTAVAIL); 12265 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12266 /* Add 96 bits */ 12267 addrlen += IPV6_ABITS - IP_ABITS; 12268 } 12269 12270 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12271 return (EINVAL); 12272 12273 /* Check if any bits in the address are set past the mask */ 12274 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12275 return (EINVAL); 12276 12277 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12278 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12279 return (0); /* No change */ 12280 12281 if (ipif->ipif_flags & IPIF_UP) { 12282 /* 12283 * If the interface is already marked up, 12284 * we call ipif_down which will take care 12285 * of ditching any IREs that have been set 12286 * up based on the old interface address. 12287 */ 12288 err = ipif_logical_down(ipif, q, mp); 12289 if (err == EINPROGRESS) 12290 return (err); 12291 ipif_down_tail(ipif); 12292 need_up = B_TRUE; 12293 } 12294 12295 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12296 return (err); 12297 } 12298 12299 static int 12300 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12301 queue_t *q, mblk_t *mp, boolean_t need_up) 12302 { 12303 ill_t *ill = ipif->ipif_ill; 12304 int err = 0; 12305 12306 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12307 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12308 12309 /* Set the new address. */ 12310 mutex_enter(&ill->ill_lock); 12311 ipif->ipif_v6net_mask = v6mask; 12312 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12313 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12314 ipif->ipif_v6subnet); 12315 } 12316 mutex_exit(&ill->ill_lock); 12317 12318 if (need_up) { 12319 /* 12320 * Now bring the interface back up. If this 12321 * is the only IPIF for the ILL, ipif_up 12322 * will have to re-bind to the device, so 12323 * we may get back EINPROGRESS, in which 12324 * case, this IOCTL will get completed in 12325 * ip_rput_dlpi when we see the DL_BIND_ACK.
12326 */ 12327 err = ipif_up(ipif, q, mp); 12328 if (err == EINPROGRESS) 12329 return (err); 12330 } 12331 return (err); 12332 } 12333 12334 /* ARGSUSED */ 12335 int 12336 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12337 ip_ioctl_cmd_t *ipip, void *if_req) 12338 { 12339 int addrlen; 12340 in6_addr_t v6addr; 12341 in6_addr_t v6mask; 12342 struct lifreq *lifr = (struct lifreq *)if_req; 12343 12344 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12345 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12346 ipif_down_tail(ipif); 12347 12348 addrlen = lifr->lifr_addrlen; 12349 if (ipif->ipif_isv6) { 12350 sin6_t *sin6; 12351 12352 sin6 = (sin6_t *)sin; 12353 v6addr = sin6->sin6_addr; 12354 } else { 12355 ipaddr_t addr; 12356 12357 addr = sin->sin_addr.s_addr; 12358 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12359 addrlen += IPV6_ABITS - IP_ABITS; 12360 } 12361 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12362 12363 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12364 } 12365 12366 /* ARGSUSED */ 12367 int 12368 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12369 ip_ioctl_cmd_t *ipip, void *if_req) 12370 { 12371 struct lifreq *lifr = (struct lifreq *)if_req; 12372 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12373 12374 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12375 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12376 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12377 12378 if (ipif->ipif_isv6) { 12379 *sin6 = sin6_null; 12380 sin6->sin6_family = AF_INET6; 12381 sin6->sin6_addr = ipif->ipif_v6subnet; 12382 lifr->lifr_addrlen = 12383 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12384 } else { 12385 *sin = sin_null; 12386 sin->sin_family = AF_INET; 12387 sin->sin_addr.s_addr = ipif->ipif_subnet; 12388 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12389 } 12390 return (0); 12391 } 12392 12393 /* 12394 * Set the IPv6 address token. 12395 */ 12396 /* ARGSUSED */ 12397 int 12398 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12399 ip_ioctl_cmd_t *ipi, void *if_req) 12400 { 12401 ill_t *ill = ipif->ipif_ill; 12402 int err; 12403 in6_addr_t v6addr; 12404 in6_addr_t v6mask; 12405 boolean_t need_up = B_FALSE; 12406 int i; 12407 sin6_t *sin6 = (sin6_t *)sin; 12408 struct lifreq *lifr = (struct lifreq *)if_req; 12409 int addrlen; 12410 12411 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12412 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12413 ASSERT(IAM_WRITER_IPIF(ipif)); 12414 12415 addrlen = lifr->lifr_addrlen; 12416 /* Only allow for logical unit zero, i.e., not on "le0:17" */ 12417 if (ipif->ipif_id != 0) 12418 return (EINVAL); 12419 12420 if (!ipif->ipif_isv6) 12421 return (EINVAL); 12422 12423 if (addrlen > IPV6_ABITS) 12424 return (EINVAL); 12425 12426 v6addr = sin6->sin6_addr; 12427 12428 /* 12429 * The length of the token is the length from the end. To get 12430 * the proper mask for this, compute the mask of the bits not 12431 * in the token; i.e., the prefix, and then xor to get the mask.
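 *
 * A worked example (illustrative): for addrlen == 64,
 * ip_plen_to_mask_v6(128 - 64) yields the prefix mask
 * ffff:ffff:ffff:ffff::; xoring each 32-bit word with 0xffffffff
 * inverts it to ::ffff:ffff:ffff:ffff, so only the low (token) 64
 * bits of the supplied address are kept.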
12432 */ 12433 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12434 return (EINVAL); 12435 for (i = 0; i < 4; i++) { 12436 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12437 } 12438 12439 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12440 ill->ill_token_length == addrlen) 12441 return (0); /* No change */ 12442 12443 if (ipif->ipif_flags & IPIF_UP) { 12444 err = ipif_logical_down(ipif, q, mp); 12445 if (err == EINPROGRESS) 12446 return (err); 12447 ipif_down_tail(ipif); 12448 need_up = B_TRUE; 12449 } 12450 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12451 return (err); 12452 } 12453 12454 static int 12455 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12456 mblk_t *mp, boolean_t need_up) 12457 { 12458 in6_addr_t v6addr; 12459 in6_addr_t v6mask; 12460 ill_t *ill = ipif->ipif_ill; 12461 int i; 12462 int err = 0; 12463 12464 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12465 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12466 v6addr = sin6->sin6_addr; 12467 /* 12468 * The length of the token is the length from the end. To get 12469 * the proper mask for this, compute the mask of the bits not 12470 * in the token; i.e., the prefix, and then xor to get the mask. 12471 */ 12472 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12473 for (i = 0; i < 4; i++) 12474 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12475 12476 mutex_enter(&ill->ill_lock); 12477 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12478 ill->ill_token_length = addrlen; 12479 ill->ill_manual_token = 1; 12480 12481 /* Reconfigure the link-local address based on this new token */ 12482 ipif_setlinklocal(ill->ill_ipif); 12483 12484 mutex_exit(&ill->ill_lock); 12485 12486 if (need_up) { 12487 /* 12488 * Now bring the interface back up. If this 12489 * is the only IPIF for the ILL, ipif_up 12490 * will have to re-bind to the device, so 12491 * we may get back EINPROGRESS, in which 12492 * case, this IOCTL will get completed in 12493 * ip_rput_dlpi when we see the DL_BIND_ACK. 12494 */ 12495 err = ipif_up(ipif, q, mp); 12496 if (err == EINPROGRESS) 12497 return (err); 12498 } 12499 return (err); 12500 } 12501 12502 /* ARGSUSED */ 12503 int 12504 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12505 ip_ioctl_cmd_t *ipi, void *if_req) 12506 { 12507 ill_t *ill; 12508 sin6_t *sin6 = (sin6_t *)sin; 12509 struct lifreq *lifr = (struct lifreq *)if_req; 12510 12511 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 12512 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12513 if (ipif->ipif_id != 0) 12514 return (EINVAL); 12515 12516 ill = ipif->ipif_ill; 12517 if (!ill->ill_isv6) 12518 return (ENXIO); 12519 12520 *sin6 = sin6_null; 12521 sin6->sin6_family = AF_INET6; 12522 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 12523 sin6->sin6_addr = ill->ill_token; 12524 lifr->lifr_addrlen = ill->ill_token_length; 12525 return (0); 12526 } 12527 12528 /* 12529 * Set (hardware) link-specific information that might override 12530 * what was acquired through the DL_INFO_ACK. 12531 * The logic is as follows. 12532 * 12533 * become exclusive 12534 * set CHANGING flag 12535 * change mtu on affected IREs 12536 * clear CHANGING flag 12537 * 12538 * An ire add that occurs before the CHANGING flag is set will have its mtu 12539 * changed by ip_sioctl_lnkinfo. 12540 * 12541 * During the time the CHANGING flag is set, no new ires will be added to the 12542 * bucket, and ire add will fail (due to the CHANGING flag).
12543 * 12544 * An ire add that occurs after the CHANGING flag is set will have the right mtu 12545 * before it is added to the bucket. 12546 * 12547 * Obviously only 1 thread can set the CHANGING flag and we need to become 12548 * exclusive to set the flag. 12549 */ 12550 /* ARGSUSED */ 12551 int 12552 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12553 ip_ioctl_cmd_t *ipi, void *if_req) 12554 { 12555 ill_t *ill = ipif->ipif_ill; 12556 ipif_t *nipif; 12557 int ip_min_mtu; 12558 boolean_t mtu_walk = B_FALSE; 12559 struct lifreq *lifr = (struct lifreq *)if_req; 12560 lif_ifinfo_req_t *lir; 12561 ire_t *ire; 12562 12563 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 12564 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12565 lir = &lifr->lifr_ifinfo; 12566 ASSERT(IAM_WRITER_IPIF(ipif)); 12567 12568 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12569 if (ipif->ipif_id != 0) 12570 return (EINVAL); 12571 12572 /* Set interface MTU. */ 12573 if (ipif->ipif_isv6) 12574 ip_min_mtu = IPV6_MIN_MTU; 12575 else 12576 ip_min_mtu = IP_MIN_MTU; 12577 12578 /* 12579 * Verify values before we set anything. Allow zero to 12580 * mean unspecified. 12581 */ 12582 if (lir->lir_maxmtu != 0 && 12583 (lir->lir_maxmtu > ill->ill_max_frag || 12584 lir->lir_maxmtu < ip_min_mtu)) 12585 return (EINVAL); 12586 if (lir->lir_reachtime != 0 && 12587 lir->lir_reachtime > ND_MAX_REACHTIME) 12588 return (EINVAL); 12589 if (lir->lir_reachretrans != 0 && 12590 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12591 return (EINVAL); 12592 12593 mutex_enter(&ill->ill_lock); 12594 ill->ill_state_flags |= ILL_CHANGING; 12595 for (nipif = ill->ill_ipif; nipif != NULL; 12596 nipif = nipif->ipif_next) { 12597 nipif->ipif_state_flags |= IPIF_CHANGING; 12598 } 12599 12600 if (lir->lir_maxmtu != 0) { 12601 ill->ill_max_mtu = lir->lir_maxmtu; 12602 ill->ill_user_mtu = lir->lir_maxmtu; 12603 mtu_walk = B_TRUE; 12604 } 12605 mutex_exit(&ill->ill_lock); 12606 12607 if (lir->lir_reachtime != 0) 12608 ill->ill_reachable_time = lir->lir_reachtime; 12609 12610 if (lir->lir_reachretrans != 0) 12611 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12612 12613 ill->ill_max_hops = lir->lir_maxhops; 12614 12615 ill->ill_max_buf = ND_MAX_Q; 12616 12617 if (mtu_walk) { 12618 /* 12619 * Set the MTU on all ipifs associated with this ill except 12620 * for those whose MTU was fixed via SIOCSLIFMTU. 12621 */ 12622 for (nipif = ill->ill_ipif; nipif != NULL; 12623 nipif = nipif->ipif_next) { 12624 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12625 continue; 12626 12627 nipif->ipif_mtu = ill->ill_max_mtu; 12628 12629 if (!(nipif->ipif_flags & IPIF_UP)) 12630 continue; 12631 12632 if (nipif->ipif_isv6) 12633 ire = ipif_to_ire_v6(nipif); 12634 else 12635 ire = ipif_to_ire(nipif); 12636 if (ire != NULL) { 12637 ire->ire_max_frag = ipif->ipif_mtu; 12638 ire_refrele(ire); 12639 } 12640 12641 ire_walk_ill(MATCH_IRE_ILL, 0, ipif_mtu_change, 12642 nipif, ill); 12643 } 12644 } 12645 12646 mutex_enter(&ill->ill_lock); 12647 for (nipif = ill->ill_ipif; nipif != NULL; 12648 nipif = nipif->ipif_next) { 12649 nipif->ipif_state_flags &= ~IPIF_CHANGING; 12650 } 12651 ILL_UNMARK_CHANGING(ill); 12652 mutex_exit(&ill->ill_lock); 12653 12654 /* 12655 * Refresh IPMP meta-interface MTU if necessary. 
12656 */ 12657 if (IS_UNDER_IPMP(ill)) 12658 ipmp_illgrp_refresh_mtu(ill->ill_grp); 12659 12660 return (0); 12661 } 12662 12663 /* ARGSUSED */ 12664 int 12665 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12666 ip_ioctl_cmd_t *ipi, void *if_req) 12667 { 12668 struct lif_ifinfo_req *lir; 12669 ill_t *ill = ipif->ipif_ill; 12670 12671 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 12672 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12673 if (ipif->ipif_id != 0) 12674 return (EINVAL); 12675 12676 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 12677 lir->lir_maxhops = ill->ill_max_hops; 12678 lir->lir_reachtime = ill->ill_reachable_time; 12679 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 12680 lir->lir_maxmtu = ill->ill_max_mtu; 12681 12682 return (0); 12683 } 12684 12685 /* 12686 * Return best guess as to the subnet mask for the specified address. 12687 * Based on the subnet masks for all the configured interfaces. 12688 * 12689 * We end up returning a zero mask in the case of default, multicast or 12690 * experimental. 12691 */ 12692 static ipaddr_t 12693 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 12694 { 12695 ipaddr_t net_mask; 12696 ill_t *ill; 12697 ipif_t *ipif; 12698 ill_walk_context_t ctx; 12699 ipif_t *fallback_ipif = NULL; 12700 12701 net_mask = ip_net_mask(addr); 12702 if (net_mask == 0) { 12703 *ipifp = NULL; 12704 return (0); 12705 } 12706 12707 /* Let's check to see if this is maybe a local subnet route. */ 12708 /* this function only applies to IPv4 interfaces */ 12709 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 12710 ill = ILL_START_WALK_V4(&ctx, ipst); 12711 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 12712 mutex_enter(&ill->ill_lock); 12713 for (ipif = ill->ill_ipif; ipif != NULL; 12714 ipif = ipif->ipif_next) { 12715 if (!IPIF_CAN_LOOKUP(ipif)) 12716 continue; 12717 if (!(ipif->ipif_flags & IPIF_UP)) 12718 continue; 12719 if ((ipif->ipif_subnet & net_mask) == 12720 (addr & net_mask)) { 12721 /* 12722 * Don't trust pt-pt interfaces if there are 12723 * other interfaces. 12724 */ 12725 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 12726 if (fallback_ipif == NULL) { 12727 ipif_refhold_locked(ipif); 12728 fallback_ipif = ipif; 12729 } 12730 continue; 12731 } 12732 12733 /* 12734 * Fine. Just assume the same net mask as the 12735 * directly attached subnet interface is using. 12736 */ 12737 ipif_refhold_locked(ipif); 12738 mutex_exit(&ill->ill_lock); 12739 rw_exit(&ipst->ips_ill_g_lock); 12740 if (fallback_ipif != NULL) 12741 ipif_refrele(fallback_ipif); 12742 *ipifp = ipif; 12743 return (ipif->ipif_net_mask); 12744 } 12745 } 12746 mutex_exit(&ill->ill_lock); 12747 } 12748 rw_exit(&ipst->ips_ill_g_lock); 12749 12750 *ipifp = fallback_ipif; 12751 return ((fallback_ipif != NULL) ? 12752 fallback_ipif->ipif_net_mask : net_mask); 12753 } 12754 12755 /* 12756 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 
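 *
 * For illustration, ip_ioctl_ftbl (defined elsewhere in this file) is a
 * table of { command, handler, minimum message size, flags } entries,
 * roughly of this shape (contents here are a hypothetical sketch, not
 * the actual table):
 *
 *	static ipft_t ip_ioctl_ftbl[] = {
 *		{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
 *		...
 *		{ 0 }
 *	};
 *
 * The loop below walks this table until it either matches ipllc_cmd or
 * reaches the terminating entry, whose NULL ipft_pfi means the command
 * is unknown.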
12757 */ 12758 static void 12759 ip_wput_ioctl(queue_t *q, mblk_t *mp) 12760 { 12761 IOCP iocp; 12762 ipft_t *ipft; 12763 ipllc_t *ipllc; 12764 mblk_t *mp1; 12765 cred_t *cr; 12766 int error = 0; 12767 conn_t *connp; 12768 12769 ip1dbg(("ip_wput_ioctl")); 12770 iocp = (IOCP)mp->b_rptr; 12771 mp1 = mp->b_cont; 12772 if (mp1 == NULL) { 12773 iocp->ioc_error = EINVAL; 12774 mp->b_datap->db_type = M_IOCNAK; 12775 iocp->ioc_count = 0; 12776 qreply(q, mp); 12777 return; 12778 } 12779 12780 /* 12781 * These IOCTLs provide various control capabilities to 12782 * upstream agents such as ULPs and processes. There 12783 * are currently two such IOCTLs implemented. They 12784 * are used by TCP to provide update information for 12785 * existing IREs and to forcibly delete an IRE for a 12786 * host that is not responding, thereby forcing an 12787 * attempt at a new route. 12788 */ 12789 iocp->ioc_error = EINVAL; 12790 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 12791 goto done; 12792 12793 ipllc = (ipllc_t *)mp1->b_rptr; 12794 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 12795 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 12796 break; 12797 } 12798 /* 12799 * prefer credential from mblk over ioctl; 12800 * see ip_sioctl_copyin_setup 12801 */ 12802 cr = msg_getcred(mp, NULL); 12803 if (cr == NULL) 12804 cr = iocp->ioc_cr; 12805 12806 /* 12807 * Refhold the conn in case the request gets queued up in some lookup 12808 */ 12809 ASSERT(CONN_Q(q)); 12810 connp = Q_TO_CONN(q); 12811 CONN_INC_REF(connp); 12812 if (ipft->ipft_pfi && 12813 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 12814 pullupmsg(mp1, ipft->ipft_min_size))) { 12815 error = (*ipft->ipft_pfi)(q, 12816 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 12817 } 12818 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 12819 /* 12820 * CONN_OPER_PENDING_DONE happens in the function called 12821 * through ipft_pfi above. 12822 */ 12823 return; 12824 } 12825 12826 CONN_OPER_PENDING_DONE(connp); 12827 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 12828 freemsg(mp); 12829 return; 12830 } 12831 iocp->ioc_error = error; 12832 12833 done: 12834 mp->b_datap->db_type = M_IOCACK; 12835 if (iocp->ioc_error) 12836 iocp->ioc_count = 0; 12837 qreply(q, mp); 12838 } 12839 12840 /* 12841 * Lookup an ipif using the sequence id (ipif_seqid) 12842 */ 12843 ipif_t * 12844 ipif_lookup_seqid(ill_t *ill, uint_t seqid) 12845 { 12846 ipif_t *ipif; 12847 12848 ASSERT(MUTEX_HELD(&ill->ill_lock)); 12849 12850 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12851 if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) 12852 return (ipif); 12853 } 12854 return (NULL); 12855 } 12856 12857 /* 12858 * Assign a unique id for the ipif. This is used later when we send 12859 * IRES to ARP for resolution where we initialize ire_ipif_seqid 12860 * to the value pointed by ire_ipif->ipif_seqid. Later when the 12861 * IRE is added, we verify that ipif has not disappeared. 12862 */ 12863 12864 static void 12865 ipif_assign_seqid(ipif_t *ipif) 12866 { 12867 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12868 12869 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 12870 } 12871 12872 /* 12873 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are 12874 * administratively down (i.e., no DAD), of the same type, and locked. Note 12875 * that the clone is complete -- including the seqid -- and the expectation is 12876 * that the caller will either free or overwrite `sipif' before it's unlocked. 
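 *
 * For illustration, the calling pattern in ipif_transfer() below is,
 * in outline (locks elided):
 *
 *	ipif_clone(sipif, dipif);	-- dipif takes over sipif's identity
 *	if (virgipif != NULL) {
 *		ipif_clone(virgipif, sipif);	-- sipif reset to virgin state
 *		mi_free(virgipif);
 *	}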
12877 */ 12878 static void 12879 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 12880 { 12881 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 12882 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 12883 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 12884 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 12885 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 12886 ASSERT(sipif->ipif_arp_del_mp == NULL); 12887 ASSERT(dipif->ipif_arp_del_mp == NULL); 12888 ASSERT(sipif->ipif_igmp_rpt == NULL); 12889 ASSERT(dipif->ipif_igmp_rpt == NULL); 12890 ASSERT(sipif->ipif_multicast_up == 0); 12891 ASSERT(dipif->ipif_multicast_up == 0); 12892 ASSERT(sipif->ipif_joined_allhosts == 0); 12893 ASSERT(dipif->ipif_joined_allhosts == 0); 12894 12895 dipif->ipif_mtu = sipif->ipif_mtu; 12896 dipif->ipif_flags = sipif->ipif_flags; 12897 dipif->ipif_metric = sipif->ipif_metric; 12898 dipif->ipif_zoneid = sipif->ipif_zoneid; 12899 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 12900 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 12901 dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; 12902 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 12903 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 12904 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 12905 12906 /* 12907 * While dipif is down right now, it might've been up before. Since 12908 * it's changing identity, its packet counters need to be reset. 12909 */ 12910 dipif->ipif_ib_pkt_count = 0; 12911 dipif->ipif_ob_pkt_count = 0; 12912 dipif->ipif_fo_pkt_count = 0; 12913 12914 /* 12915 * As per the comment atop the function, we assume that these sipif 12916 * fields will be changed before sipif is unlocked. 12917 */ 12918 dipif->ipif_seqid = sipif->ipif_seqid; 12919 dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; 12920 dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; 12921 dipif->ipif_state_flags = sipif->ipif_state_flags; 12922 } 12923 12924 /* 12925 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 12926 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 12927 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 12928 * transfer the xop to `dipif'. Requires that all ipifs are administratively 12929 * down (i.e., no DAD), of the same type, and unlocked. 12930 */ 12931 static void 12932 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 12933 { 12934 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 12935 ipxop_t *ipx = ipsq->ipsq_xop; 12936 12937 ASSERT(sipif != dipif); 12938 ASSERT(sipif != virgipif); 12939 12940 /* 12941 * Grab all of the locks that protect the ipif in a defined order. 12942 */ 12943 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 12944 if (sipif > dipif) { 12945 mutex_enter(&sipif->ipif_saved_ire_lock); 12946 mutex_enter(&dipif->ipif_saved_ire_lock); 12947 } else { 12948 mutex_enter(&dipif->ipif_saved_ire_lock); 12949 mutex_enter(&sipif->ipif_saved_ire_lock); 12950 } 12951 12952 ipif_clone(sipif, dipif); 12953 if (virgipif != NULL) { 12954 ipif_clone(virgipif, sipif); 12955 mi_free(virgipif); 12956 } 12957 12958 mutex_exit(&sipif->ipif_saved_ire_lock); 12959 mutex_exit(&dipif->ipif_saved_ire_lock); 12960 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 12961 12962 /* 12963 * Transfer ownership of the current xop, if necessary. 
12964 */ 12965 if (ipx->ipx_current_ipif == sipif) { 12966 ASSERT(ipx->ipx_pending_ipif == NULL); 12967 mutex_enter(&ipx->ipx_lock); 12968 ipx->ipx_current_ipif = dipif; 12969 mutex_exit(&ipx->ipx_lock); 12970 } 12971 12972 if (virgipif == NULL) 12973 mi_free(sipif); 12974 } 12975 12976 /* 12977 * Insert the ipif, so that the list of ipifs on the ill will be sorted 12978 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 12979 * be inserted into the first space available in the list. The value of 12980 * ipif_id will then be set to the appropriate value for its position. 12981 */ 12982 static int 12983 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) 12984 { 12985 ill_t *ill; 12986 ipif_t *tipif; 12987 ipif_t **tipifp; 12988 int id; 12989 ip_stack_t *ipst; 12990 12991 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 12992 IAM_WRITER_IPIF(ipif)); 12993 12994 ill = ipif->ipif_ill; 12995 ASSERT(ill != NULL); 12996 ipst = ill->ill_ipst; 12997 12998 /* 12999 * In the case of lo0:0 we already hold the ill_g_lock. 13000 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13001 * ipif_insert. 13002 */ 13003 if (acquire_g_lock) 13004 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13005 mutex_enter(&ill->ill_lock); 13006 id = ipif->ipif_id; 13007 tipifp = &(ill->ill_ipif); 13008 if (id == -1) { /* need to find a real id */ 13009 id = 0; 13010 while ((tipif = *tipifp) != NULL) { 13011 ASSERT(tipif->ipif_id >= id); 13012 if (tipif->ipif_id != id) 13013 break; /* non-consecutive id */ 13014 id++; 13015 tipifp = &(tipif->ipif_next); 13016 } 13017 /* limit number of logical interfaces */ 13018 if (id >= ipst->ips_ip_addrs_per_if) { 13019 mutex_exit(&ill->ill_lock); 13020 if (acquire_g_lock) 13021 rw_exit(&ipst->ips_ill_g_lock); 13022 return (-1); 13023 } 13024 ipif->ipif_id = id; /* assign new id */ 13025 } else if (id < ipst->ips_ip_addrs_per_if) { 13026 /* we have a real id; insert ipif in the right place */ 13027 while ((tipif = *tipifp) != NULL) { 13028 ASSERT(tipif->ipif_id != id); 13029 if (tipif->ipif_id > id) 13030 break; /* found correct location */ 13031 tipifp = &(tipif->ipif_next); 13032 } 13033 } else { 13034 mutex_exit(&ill->ill_lock); 13035 if (acquire_g_lock) 13036 rw_exit(&ipst->ips_ill_g_lock); 13037 return (-1); 13038 } 13039 13040 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 13041 13042 ipif->ipif_next = tipif; 13043 *tipifp = ipif; 13044 mutex_exit(&ill->ill_lock); 13045 if (acquire_g_lock) 13046 rw_exit(&ipst->ips_ill_g_lock); 13047 13048 return (0); 13049 } 13050 13051 static void 13052 ipif_remove(ipif_t *ipif) 13053 { 13054 ipif_t **ipifp; 13055 ill_t *ill = ipif->ipif_ill; 13056 13057 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 13058 13059 mutex_enter(&ill->ill_lock); 13060 ipifp = &ill->ill_ipif; 13061 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 13062 if (*ipifp == ipif) { 13063 *ipifp = ipif->ipif_next; 13064 break; 13065 } 13066 } 13067 mutex_exit(&ill->ill_lock); 13068 } 13069 13070 /* 13071 * Allocate and initialize a new interface control structure. (Always 13072 * called as writer.) 13073 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 13074 * is not part of the global linked list of ills. ipif_seqid is unique 13075 * in the system and to preserve the uniqueness, it is assigned only 13076 * when ill becomes part of the global list. At that point ill will 13077 * have a name. 
If it doesn't get assigned here, it will get assigned
13078 * in ipif_set_values() as part of SIOCSLIFNAME processing.
13079 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
13080 * the interface flags or any other information from the DL_INFO_ACK for
13081 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
13082 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
13083 * second DL_INFO_ACK comes in from the driver.
13084 */
13085 static ipif_t *
13086 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
13087 boolean_t insert)
13088 {
13089 ipif_t *ipif;
13090 ip_stack_t *ipst = ill->ill_ipst;
13091
13092 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
13093 ill->ill_name, id, (void *)ill));
13094 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
13095
13096 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL)
13097 return (NULL);
13098 *ipif = ipif_zero; /* start clean */
13099
13100 ipif->ipif_ill = ill;
13101 ipif->ipif_id = id; /* could be -1 */
13102 /*
13103 * Inherit the zoneid from the ill; for the shared stack instance
13104 * this is always the global zone
13105 */
13106 ipif->ipif_zoneid = ill->ill_zoneid;
13107
13108 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
13109
13110 ipif->ipif_refcnt = 0;
13111 ipif->ipif_saved_ire_cnt = 0;
13112
13113 if (insert) {
13114 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) {
13115 mi_free(ipif);
13116 return (NULL);
13117 }
13118 /* -1 id should have been replaced by real id */
13119 id = ipif->ipif_id;
13120 ASSERT(id >= 0);
13121 }
13122
13123 if (ill->ill_name[0] != '\0')
13124 ipif_assign_seqid(ipif);
13125
13126 /*
13127 * If this is the zeroth ipif on the IPMP ill, create the illgrp
13128 * (which must not exist yet because the zeroth ipif is created once
13129 * per ill). However, do not link it to the ipmp_grp_t until
13130 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
13131 */
13132 if (id == 0 && IS_IPMP(ill)) {
13133 if (ipmp_illgrp_create(ill) == NULL) {
13134 if (insert) {
13135 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13136 ipif_remove(ipif);
13137 rw_exit(&ipst->ips_ill_g_lock);
13138 }
13139 mi_free(ipif);
13140 return (NULL);
13141 }
13142 }
13143
13144 /*
13145 * We grab ill_lock to protect the flag changes. The ipif is still
13146 * not up and can't be looked up until the ioctl completes and the
13147 * IPIF_CHANGING flag is cleared.
13148 */
13149 mutex_enter(&ill->ill_lock);
13150
13151 ipif->ipif_ire_type = ire_type;
13152
13153 if (ipif->ipif_isv6) {
13154 ill->ill_flags |= ILLF_IPV6;
13155 } else {
13156 ipaddr_t inaddr_any = INADDR_ANY;
13157
13158 ill->ill_flags |= ILLF_IPV4;
13159
13160 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
13161 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13162 &ipif->ipif_v6lcl_addr);
13163 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13164 &ipif->ipif_v6src_addr);
13165 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13166 &ipif->ipif_v6subnet);
13167 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13168 &ipif->ipif_v6net_mask);
13169 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13170 &ipif->ipif_v6brd_addr);
13171 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
13172 &ipif->ipif_v6pp_dst_addr);
13173 }
13174
13175 /*
13176 * Don't set the interface flags etc. now, will do it in
13177 * ip_ll_subnet_defaults.
13178 */
13179 if (!initialize)
13180 goto out;
13181
13182 ipif->ipif_mtu = ill->ill_max_mtu;
13183
13184 /*
13185 * NOTE: The IPMP meta-interface is special-cased because it starts
13186 * with no underlying interfaces (and thus an unknown broadcast
13187 * address length), but all interfaces that can be placed into an IPMP
13188 * group are required to be broadcast-capable.
13189 */
13190 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
13191 /*
13192 * Later detect lack of DLPI driver multicast capability by
13193 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
13194 */
13195 ill->ill_flags |= ILLF_MULTICAST;
13196 if (!ipif->ipif_isv6)
13197 ipif->ipif_flags |= IPIF_BROADCAST;
13198 } else {
13199 if (ill->ill_net_type != IRE_LOOPBACK) {
13200 if (ipif->ipif_isv6)
13201 /*
13202 * Note: xresolv interfaces will eventually need
13203 * NOARP set here as well, but that will require
13204 * those external resolvers to have some
13205 * knowledge of that flag and act appropriately.
13206 * Not to be changed at present.
13207 */
13208 ill->ill_flags |= ILLF_NONUD;
13209 else
13210 ill->ill_flags |= ILLF_NOARP;
13211 }
13212 if (ill->ill_phys_addr_length == 0) {
13213 if (IS_VNI(ill)) {
13214 ipif->ipif_flags |= IPIF_NOXMIT;
13215 } else {
13216 /* pt-pt supports multicast. */
13217 ill->ill_flags |= ILLF_MULTICAST;
13218 if (ill->ill_net_type != IRE_LOOPBACK)
13219 ipif->ipif_flags |= IPIF_POINTOPOINT;
13220 }
13221 }
13222 }
13223 out:
13224 mutex_exit(&ill->ill_lock);
13225 return (ipif);
13226 }
13227
13228 /*
13229 * If appropriate, send a message up to the resolver to delete the entry
13230 * for the address of this interface which is going out of business.
13231 * (Always called as writer).
13232 *
13233 * NOTE: We need to check for NULL mps as some of the fields are
13234 * initialized only for some interface types. See ipif_resolver_up()
13235 * for details.
13236 */
13237 void
13238 ipif_resolver_down(ipif_t *ipif)
13239 {
13240 mblk_t *mp;
13241 ill_t *ill = ipif->ipif_ill;
13242
13243 ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13244 ASSERT(IAM_WRITER_IPIF(ipif));
13245
13246 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
13247 return;
13248
13249 /* Delete the mapping for the local address */
13250 mp = ipif->ipif_arp_del_mp;
13251 if (mp != NULL) {
13252 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
13253 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13254 putnext(ill->ill_rq, mp);
13255 ipif->ipif_arp_del_mp = NULL;
13256 }
13257
13258 /*
13259 * Make IPMP aware of the deleted data address.
13260 */
13261 if (IS_IPMP(ill))
13262 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
13263
13264 /*
13265 * If this is the last ipif that is going down and there are no
13266 * duplicate addresses we may yet attempt to re-probe, then we need to
13267 * clean up ARP completely.
13268 */
13269 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
13270 /*
13271 * If this was the last ipif on an IPMP interface, purge any
13272 * IPMP ARP entries associated with it.
13273 */
13274 if (IS_IPMP(ill))
13275 ipmp_illgrp_refresh_arpent(ill->ill_grp);
13276
13277 /* Send up AR_INTERFACE_DOWN message */
13278 mp = ill->ill_arp_down_mp;
13279 if (mp != NULL) {
13280 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
13281 *(unsigned *)mp->b_rptr, ill->ill_name,
13282 ipif->ipif_id));
13283 putnext(ill->ill_rq, mp);
13284 ill->ill_arp_down_mp = NULL;
13285 }
13286
13287 /* Tell ARP to delete the multicast mappings */
13288 mp = ill->ill_arp_del_mapping_mp;
13289 if (mp != NULL) {
13290 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
13291 *(unsigned *)mp->b_rptr, ill->ill_name,
13292 ipif->ipif_id));
13293 putnext(ill->ill_rq, mp);
13294 ill->ill_arp_del_mapping_mp = NULL;
13295 }
13296 }
13297 }
13298
13299 /*
13300 * Set up the multicast mappings for `ipif' in ARP. If `arp_add_mapping_mp'
13301 * is non-NULL, then upon success it will contain an mblk that can be passed
13302 * to ARP to create the mapping. Otherwise, if it's NULL, upon success ARP
13303 * will have already been notified to create the mapping. Returns zero on
13304 * success, -1 upon failure.
13305 */
13306 int
13307 ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
13308 {
13309 mblk_t *del_mp = NULL;
13310 mblk_t *add_mp = NULL;
13311 mblk_t *mp;
13312 ill_t *ill = ipif->ipif_ill;
13313 phyint_t *phyi = ill->ill_phyint;
13314 ipaddr_t addr, mask, extract_mask = 0;
13315 arma_t *arma;
13316 uint8_t *maddr, *bphys_addr;
13317 uint32_t hw_start;
13318 dl_unitdata_req_t *dlur;
13319
13320 ASSERT(IAM_WRITER_IPIF(ipif));
13321 if (ipif->ipif_flags & IPIF_POINTOPOINT)
13322 return (0);
13323
13324 /*
13325 * IPMP meta-interfaces don't have any inherent multicast mappings,
13326 * and instead use the ones on the underlying interfaces.
13327 */
13328 if (IS_IPMP(ill))
13329 return (0);
13330
13331 /*
13332 * Delete the existing mapping from ARP. Normally, ipif_down() ->
13333 * ipif_resolver_down() will send this up to ARP, but it may be that
13334 * we are enabling PHYI_MULTI_BCAST via ip_rput_dlpi_writer().
13335 */
13336 mp = ill->ill_arp_del_mapping_mp;
13337 if (mp != NULL) {
13338 ip1dbg(("ipif_arp_setup_multicast: arp cmd %x for %s:%u\n",
13339 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
13340 putnext(ill->ill_rq, mp);
13341 ill->ill_arp_del_mapping_mp = NULL;
13342 }
13343
13344 if (arp_add_mapping_mp != NULL)
13345 *arp_add_mapping_mp = NULL;
13346
13347 /*
13348 * Check that the address is not too long for the constant
13349 * length reserved in the template arma_t.
13350 */
13351 if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
13352 return (-1);
13353
13354 /* Add mapping mblk */
13355 addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
13356 mask = (ipaddr_t)htonl(IN_CLASSD_NET);
13357 add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
13358 (caddr_t)&addr);
13359 if (add_mp == NULL)
13360 return (-1);
13361 arma = (arma_t *)add_mp->b_rptr;
13362 maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
13363 bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
13364 arma->arma_hw_addr_length = ill->ill_phys_addr_length;
13365
13366 /*
13367 * Determine the broadcast address.
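 * The sign of ill_sap_length follows the DLPI convention: a negative
 * value (e.g. Ethernet) means the SAP trails the link-layer address,
 * so the broadcast address begins right at dl_dest_addr_offset; a
 * positive value means the SAP leads and is skipped below.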
13368 */ 13369 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 13370 if (ill->ill_sap_length < 0) 13371 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 13372 else 13373 bphys_addr = (uchar_t *)dlur + 13374 dlur->dl_dest_addr_offset + ill->ill_sap_length; 13375 /* 13376 * Check PHYI_MULTI_BCAST and length of physical 13377 * address to determine if we use the mapping or the 13378 * broadcast address. 13379 */ 13380 if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) 13381 if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, 13382 bphys_addr, maddr, &hw_start, &extract_mask)) 13383 phyi->phyint_flags |= PHYI_MULTI_BCAST; 13384 13385 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || 13386 (ill->ill_flags & ILLF_MULTICAST)) { 13387 /* Make sure this will not match the "exact" entry. */ 13388 addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); 13389 del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 13390 (caddr_t)&addr); 13391 if (del_mp == NULL) { 13392 freemsg(add_mp); 13393 return (-1); 13394 } 13395 bcopy(&extract_mask, (char *)arma + 13396 arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); 13397 if (phyi->phyint_flags & PHYI_MULTI_BCAST) { 13398 /* Use link-layer broadcast address for MULTI_BCAST */ 13399 bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); 13400 ip2dbg(("ipif_arp_setup_multicast: adding" 13401 " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); 13402 } else { 13403 arma->arma_hw_mapping_start = hw_start; 13404 ip2dbg(("ipif_arp_setup_multicast: adding multicast" 13405 " ARP setup for %s\n", ill->ill_name)); 13406 } 13407 } else { 13408 freemsg(add_mp); 13409 ASSERT(del_mp == NULL); 13410 /* It is neither MULTICAST nor MULTI_BCAST */ 13411 return (0); 13412 } 13413 ASSERT(add_mp != NULL && del_mp != NULL); 13414 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13415 ill->ill_arp_del_mapping_mp = del_mp; 13416 if (arp_add_mapping_mp != NULL) { 13417 /* The caller just wants the mblks allocated */ 13418 *arp_add_mapping_mp = add_mp; 13419 } else { 13420 /* The caller wants us to send it to arp */ 13421 putnext(ill->ill_rq, add_mp); 13422 } 13423 return (0); 13424 } 13425 13426 /* 13427 * Get the resolver set up for a new IP address. (Always called as writer.) 13428 * Called both for IPv4 and IPv6 interfaces, though it only sets up the 13429 * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. 13430 * 13431 * The enumerated value res_act tunes the behavior: 13432 * * Res_act_initial: set up all the resolver structures for a new 13433 * IP address. 13434 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 13435 * ARP message in defense of the address. 13436 * * Res_act_rebind: tell ARP to change the hardware address for an IP 13437 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 13438 * 13439 * Returns zero on success, or an errno upon failure. 
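 *
 * A minimal caller sketch (illustrative only; names as in this file):
 *
 *	err = ipif_resolver_up(ipif, Res_act_initial);
 *	if (err == EINPROGRESS)
 *		ARP bringup (and DAD, with extended ARP) completes
 *		asynchronously; ipif_addr_ready is set later
 *	else if (err != 0)
 *		fail the bringup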
13440 */ 13441 int 13442 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 13443 { 13444 mblk_t *arp_up_mp = NULL; 13445 mblk_t *arp_down_mp = NULL; 13446 mblk_t *arp_add_mp = NULL; 13447 mblk_t *arp_del_mp = NULL; 13448 mblk_t *arp_add_mapping_mp = NULL; 13449 mblk_t *arp_del_mapping_mp = NULL; 13450 ill_t *ill = ipif->ipif_ill; 13451 int err = ENOMEM; 13452 boolean_t added_ipif = B_FALSE; 13453 boolean_t publish; 13454 boolean_t was_dup; 13455 13456 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 13457 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 13458 ASSERT(IAM_WRITER_IPIF(ipif)); 13459 13460 was_dup = B_FALSE; 13461 if (res_act == Res_act_initial) { 13462 ipif->ipif_addr_ready = 0; 13463 /* 13464 * We're bringing an interface up here. There's no way that we 13465 * should need to shut down ARP now. 13466 */ 13467 mutex_enter(&ill->ill_lock); 13468 if (ipif->ipif_flags & IPIF_DUPLICATE) { 13469 ipif->ipif_flags &= ~IPIF_DUPLICATE; 13470 ill->ill_ipif_dup_count--; 13471 was_dup = B_TRUE; 13472 } 13473 mutex_exit(&ill->ill_lock); 13474 } 13475 if (ipif->ipif_recovery_id != 0) 13476 (void) untimeout(ipif->ipif_recovery_id); 13477 ipif->ipif_recovery_id = 0; 13478 if (ill->ill_net_type != IRE_IF_RESOLVER) { 13479 ipif->ipif_addr_ready = 1; 13480 return (0); 13481 } 13482 /* NDP will set the ipif_addr_ready flag when it's ready */ 13483 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 13484 return (0); 13485 13486 if (ill->ill_isv6) { 13487 /* 13488 * External resolver for IPv6 13489 */ 13490 ASSERT(res_act == Res_act_initial); 13491 publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); 13492 } else { 13493 /* 13494 * IPv4 arp case. If the ARP stream has already started 13495 * closing, fail this request for ARP bringup. Else 13496 * record the fact that an ARP bringup is pending. 13497 */ 13498 mutex_enter(&ill->ill_lock); 13499 if (ill->ill_arp_closing) { 13500 mutex_exit(&ill->ill_lock); 13501 err = EINVAL; 13502 goto failed; 13503 } else { 13504 if (ill->ill_ipif_up_count == 0 && 13505 ill->ill_ipif_dup_count == 0 && !was_dup) 13506 ill->ill_arp_bringup_pending = 1; 13507 mutex_exit(&ill->ill_lock); 13508 } 13509 publish = (ipif->ipif_lcl_addr != INADDR_ANY); 13510 } 13511 13512 if (IS_IPMP(ill) && publish) { 13513 /* 13514 * If we're here via ipif_up(), then the ipif won't be bound 13515 * yet -- add it to the group, which will bind it if possible. 13516 * (We would add it in ipif_up(), but deleting on failure 13517 * there is gruesome.) If we're here via ipmp_ill_bind_ipif(), 13518 * then the ipif has already been added to the group and we 13519 * just need to use the binding. 13520 */ 13521 if (ipmp_ipif_bound_ill(ipif) == NULL) { 13522 if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { 13523 /* 13524 * We couldn't bind the ipif to an ill yet, 13525 * so we have nothing to publish. 13526 */ 13527 publish = B_FALSE; 13528 } 13529 added_ipif = B_TRUE; 13530 } 13531 } 13532 13533 /* 13534 * Add an entry for the local address in ARP only if it 13535 * is not UNNUMBERED and it is suitable for publishing. 13536 */ 13537 if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { 13538 if (res_act == Res_act_defend) { 13539 arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); 13540 if (arp_add_mp == NULL) 13541 goto failed; 13542 /* 13543 * If we're just defending our address now, then 13544 * there's no need to set up ARP multicast mappings. 13545 * The publish command is enough. 
13546 */ 13547 goto done; 13548 } 13549 13550 /* 13551 * Allocate an ARP add message and an ARP delete message (the 13552 * latter is saved for use when the address goes down). 13553 */ 13554 if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) 13555 goto failed; 13556 13557 if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) 13558 goto failed; 13559 13560 if (res_act != Res_act_initial) 13561 goto arp_setup_multicast; 13562 } else { 13563 if (res_act != Res_act_initial) 13564 goto done; 13565 } 13566 /* 13567 * Need to bring up ARP or setup multicast mapping only 13568 * when the first interface is coming UP. 13569 */ 13570 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) 13571 goto done; 13572 13573 /* 13574 * Allocate an ARP down message (to be saved) and an ARP up message. 13575 */ 13576 arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); 13577 if (arp_down_mp == NULL) 13578 goto failed; 13579 13580 arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); 13581 if (arp_up_mp == NULL) 13582 goto failed; 13583 13584 if (ipif->ipif_flags & IPIF_POINTOPOINT) 13585 goto done; 13586 13587 arp_setup_multicast: 13588 /* 13589 * Setup the multicast mappings. This function initializes 13590 * ill_arp_del_mapping_mp also. This does not need to be done for 13591 * IPv6, or for the IPMP interface (since it has no link-layer). 13592 */ 13593 if (!ill->ill_isv6 && !IS_IPMP(ill)) { 13594 err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); 13595 if (err != 0) 13596 goto failed; 13597 ASSERT(ill->ill_arp_del_mapping_mp != NULL); 13598 ASSERT(arp_add_mapping_mp != NULL); 13599 } 13600 done: 13601 if (arp_up_mp != NULL) { 13602 ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", 13603 ill->ill_name, ipif->ipif_id)); 13604 putnext(ill->ill_rq, arp_up_mp); 13605 arp_up_mp = NULL; 13606 } 13607 if (arp_add_mp != NULL) { 13608 ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", 13609 ill->ill_name, ipif->ipif_id)); 13610 /* 13611 * If it's an extended ARP implementation, then we'll wait to 13612 * hear that DAD has finished before using the interface. 13613 */ 13614 if (!ill->ill_arp_extend) 13615 ipif->ipif_addr_ready = 1; 13616 putnext(ill->ill_rq, arp_add_mp); 13617 arp_add_mp = NULL; 13618 } else { 13619 ipif->ipif_addr_ready = 1; 13620 } 13621 if (arp_add_mapping_mp != NULL) { 13622 ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", 13623 ill->ill_name, ipif->ipif_id)); 13624 putnext(ill->ill_rq, arp_add_mapping_mp); 13625 arp_add_mapping_mp = NULL; 13626 } 13627 13628 if (res_act == Res_act_initial) { 13629 if (ill->ill_flags & ILLF_NOARP) 13630 err = ill_arp_off(ill); 13631 else 13632 err = ill_arp_on(ill); 13633 if (err != 0) { 13634 ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", 13635 err)); 13636 goto failed; 13637 } 13638 } 13639 13640 if (arp_del_mp != NULL) { 13641 ASSERT(ipif->ipif_arp_del_mp == NULL); 13642 ipif->ipif_arp_del_mp = arp_del_mp; 13643 } 13644 if (arp_down_mp != NULL) { 13645 ASSERT(ill->ill_arp_down_mp == NULL); 13646 ill->ill_arp_down_mp = arp_down_mp; 13647 } 13648 if (arp_del_mapping_mp != NULL) { 13649 ASSERT(ill->ill_arp_del_mapping_mp == NULL); 13650 ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; 13651 } 13652 13653 return ((ill->ill_ipif_up_count != 0 || was_dup || 13654 ill->ill_ipif_dup_count != 0) ? 
0 : EINPROGRESS); 13655 failed: 13656 ip1dbg(("ipif_resolver_up: FAILED\n")); 13657 if (added_ipif) 13658 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 13659 freemsg(arp_add_mp); 13660 freemsg(arp_del_mp); 13661 freemsg(arp_add_mapping_mp); 13662 freemsg(arp_up_mp); 13663 freemsg(arp_down_mp); 13664 ill->ill_arp_bringup_pending = 0; 13665 return (err); 13666 } 13667 13668 /* 13669 * This routine restarts IPv4 duplicate address detection (DAD) when a link has 13670 * just gone back up. 13671 */ 13672 static void 13673 ipif_arp_start_dad(ipif_t *ipif) 13674 { 13675 ill_t *ill = ipif->ipif_ill; 13676 mblk_t *arp_add_mp; 13677 13678 /* ACE_F_UNVERIFIED restarts DAD */ 13679 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 13680 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13681 ipif->ipif_lcl_addr == INADDR_ANY || 13682 (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { 13683 /* 13684 * If we can't contact ARP for some reason, that's not really a 13685 * problem. Just send out the routing socket notification that 13686 * DAD completion would have done, and continue. 13687 */ 13688 ipif_mask_reply(ipif); 13689 ipif_up_notify(ipif); 13690 ipif->ipif_addr_ready = 1; 13691 return; 13692 } 13693 13694 putnext(ill->ill_rq, arp_add_mp); 13695 } 13696 13697 static void 13698 ipif_ndp_start_dad(ipif_t *ipif) 13699 { 13700 nce_t *nce; 13701 13702 nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, 13703 B_FALSE); 13704 if (nce == NULL) 13705 return; 13706 13707 if (!ndp_restart_dad(nce)) { 13708 /* 13709 * If we can't restart DAD for some reason, that's not really a 13710 * problem. Just send out the routing socket notification that 13711 * DAD completion would have done, and continue. 13712 */ 13713 ipif_up_notify(ipif); 13714 ipif->ipif_addr_ready = 1; 13715 } 13716 NCE_REFRELE(nce); 13717 } 13718 13719 /* 13720 * Restart duplicate address detection on all interfaces on the given ill. 13721 * 13722 * This is called when an interface transitions from down to up 13723 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 13724 * 13725 * Note that since the underlying physical link has transitioned, we must cause 13726 * at least one routing socket message to be sent here, either via DAD 13727 * completion or just by default on the first ipif. (If we don't do this, then 13728 * in.mpathd will see long delays when doing link-based failure recovery.) 13729 */ 13730 void 13731 ill_restart_dad(ill_t *ill, boolean_t went_up) 13732 { 13733 ipif_t *ipif; 13734 13735 if (ill == NULL) 13736 return; 13737 13738 /* 13739 * If layer two doesn't support duplicate address detection, then just 13740 * send the routing socket message now and be done with it. 13741 */ 13742 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 13743 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 13744 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 13745 return; 13746 } 13747 13748 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13749 if (went_up) { 13750 if (ipif->ipif_flags & IPIF_UP) { 13751 if (ill->ill_isv6) 13752 ipif_ndp_start_dad(ipif); 13753 else 13754 ipif_arp_start_dad(ipif); 13755 } else if (ill->ill_isv6 && 13756 (ipif->ipif_flags & IPIF_DUPLICATE)) { 13757 /* 13758 * For IPv4, the ARP module itself will 13759 * automatically start the DAD process when it 13760 * sees DL_NOTE_LINK_UP. We respond to the 13761 * AR_CN_READY at the completion of that task. 13762 * For IPv6, we must kick off the bring-up 13763 * process now. 
13764 */
13765 ndp_do_recovery(ipif);
13766 } else {
13767 /*
13768 * Unfortunately, the first ipif is "special"
13769 * and represents the underlying ill in the
13770 * routing socket messages. Thus, when this
13771 * one ipif is down, we must still notify so
13772 * that the user knows the IFF_RUNNING status
13773 * change. (If the first ipif is up, then
13774 * we'll handle eventual routing socket
13775 * notification via DAD completion.)
13776 */
13777 if (ipif == ill->ill_ipif) {
13778 ip_rts_ifmsg(ill->ill_ipif,
13779 RTSQ_DEFAULT);
13780 }
13781 }
13782 } else {
13783 /*
13784 * After link down, we'll need to send a new routing
13785 * message when the link comes back, so clear
13786 * ipif_addr_ready.
13787 */
13788 ipif->ipif_addr_ready = 0;
13789 }
13790 }
13791
13792 /*
13793 * If we've torn down links, then notify the user right away.
13794 */
13795 if (!went_up)
13796 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
13797 }
13798
13799 static void
13800 ipsq_delete(ipsq_t *ipsq)
13801 {
13802 ipxop_t *ipx = ipsq->ipsq_xop;
13803
13804 ipsq->ipsq_ipst = NULL;
13805 ASSERT(ipsq->ipsq_phyint == NULL);
13806 ASSERT(ipsq->ipsq_xop != NULL);
13807 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
13808 ASSERT(ipx->ipx_pending_mp == NULL);
13809 kmem_free(ipsq, sizeof (ipsq_t));
13810 }
13811
13812 static int
13813 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
13814 {
13815 int err = 0;
13816 ipif_t *ipif;
13817
13818 if (ill == NULL)
13819 return (0);
13820
13821 ASSERT(IAM_WRITER_ILL(ill));
13822 ill->ill_up_ipifs = B_TRUE;
13823 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13824 if (ipif->ipif_was_up) {
13825 if (!(ipif->ipif_flags & IPIF_UP))
13826 err = ipif_up(ipif, q, mp);
13827 ipif->ipif_was_up = B_FALSE;
13828 if (err != 0) {
13829 ASSERT(err == EINPROGRESS);
13830 return (err);
13831 }
13832 }
13833 }
13834 mutex_enter(&ill->ill_lock);
13835 ill->ill_state_flags &= ~ILL_CHANGING;
13836 mutex_exit(&ill->ill_lock);
13837 ill->ill_up_ipifs = B_FALSE;
13838 return (0);
13839 }
13840
13841 /*
13842 * This function is called to bring up all the ipifs that were up before
13843 * bringing the ill down via ill_down_ipifs().
13844 */
13845 int
13846 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
13847 {
13848 int err;
13849
13850 ASSERT(IAM_WRITER_ILL(ill));
13851
13852 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
13853 if (err != 0)
13854 return (err);
13855
13856 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
13857 }
13858
13859 /*
13860 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
13861 * down the ipifs without sending DL_UNBIND_REQ to the driver.
13862 */
13863 static void
13864 ill_down_ipifs(ill_t *ill, boolean_t logical)
13865 {
13866 ipif_t *ipif;
13867
13868 ASSERT(IAM_WRITER_ILL(ill));
13869
13870 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13871 /*
13872 * We go through the ipif_down logic even if the ipif
13873 * is already down, since routes can be added based
13874 * on down ipifs. Going through ipif_down once again
13875 * will delete any IREs created based on these routes.
13876 */
13877 if (ipif->ipif_flags & IPIF_UP)
13878 ipif->ipif_was_up = B_TRUE;
13879
13880 /*
13881 * Need to re-create net/subnet bcast ires if
13882 * they are dependent on ipif.
13883 */
13884 if (!ipif->ipif_isv6)
13885 ipif_check_bcast_ires(ipif);
13886 if (logical) {
13887 (void) ipif_logical_down(ipif, NULL, NULL);
13888 ipif_non_duplicate(ipif);
13889 ipif_down_tail(ipif);
13890 } else {
13891 (void) ipif_down(ipif, NULL, NULL);
13892 }
13893 }
13894 }
13895
13896 /*
13897 * Redo source address selection. This is called when a
13898 * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up.
13899 */
13900 void
13901 ill_update_source_selection(ill_t *ill)
13902 {
13903 ipif_t *ipif;
13904
13905 ASSERT(IAM_WRITER_ILL(ill));
13906
13907 /*
13908 * Underlying interfaces are only used for test traffic and thus
13909 * should always send with their (deprecated) source addresses.
13910 */
13911 if (IS_UNDER_IPMP(ill))
13912 return;
13913
13914 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13915 if (ill->ill_isv6)
13916 ipif_recreate_interface_routes_v6(NULL, ipif);
13917 else
13918 ipif_recreate_interface_routes(NULL, ipif);
13919 }
13920 }
13921
13922 /*
13923 * Finish the group join started in ip_sioctl_groupname().
13924 */
13925 /* ARGSUSED */
13926 static void
13927 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
13928 {
13929 ill_t *ill = q->q_ptr;
13930 phyint_t *phyi = ill->ill_phyint;
13931 ipmp_grp_t *grp = phyi->phyint_grp;
13932 ip_stack_t *ipst = ill->ill_ipst;
13933
13934 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
13935 ASSERT(!IS_IPMP(ill) && grp != NULL);
13936 ASSERT(IAM_WRITER_IPSQ(ipsq));
13937
13938 if (phyi->phyint_illv4 != NULL) {
13939 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
13940 VERIFY(grp->gr_pendv4-- > 0);
13941 rw_exit(&ipst->ips_ipmp_lock);
13942 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
13943 }
13944 if (phyi->phyint_illv6 != NULL) {
13945 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
13946 VERIFY(grp->gr_pendv6-- > 0);
13947 rw_exit(&ipst->ips_ipmp_lock);
13948 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
13949 }
13950 freemsg(mp);
13951 }
13952
13953 /*
13954 * Process an SIOCSLIFGROUPNAME request.
13955 */
13956 /* ARGSUSED */
13957 int
13958 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
13959 ip_ioctl_cmd_t *ipip, void *ifreq)
13960 {
13961 struct lifreq *lifr = ifreq;
13962 ill_t *ill = ipif->ipif_ill;
13963 ip_stack_t *ipst = ill->ill_ipst;
13964 phyint_t *phyi = ill->ill_phyint;
13965 ipmp_grp_t *grp = phyi->phyint_grp;
13966 mblk_t *ipsq_mp;
13967 int err = 0;
13968
13969 /*
13970 * Note that phyint_grp can only change here, where we're exclusive.
13971 */
13972 ASSERT(IAM_WRITER_ILL(ill));
13973
13974 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
13975 (phyi->phyint_flags & PHYI_VIRTUAL))
13976 return (EINVAL);
13977
13978 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';
13979
13980 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
13981
13982 /*
13983 * If the name hasn't changed, there's nothing to do.
13984 */
13985 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
13986 goto unlock;
13987
13988 /*
13989 * Handle requests to rename an IPMP meta-interface.
13990 *
13991 * Note that creation of the IPMP meta-interface is handled in
13992 * userland through the standard plumbing sequence. As part of
13993 * plumbing the IPMP meta-interface, its initial groupname is set to
13994 * the name of the interface (see ipif_set_values_tail()).
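 *
 * For illustration, the userland requests that typically land here are
 * of the form (assuming a group "ipmp0" and an underlying interface
 * net0):
 *
 *	ifconfig net0 group ipmp0	add net0 to the group
 *	ifconfig net0 group ""		remove net0 from its group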
13995 */ 13996 if (IS_IPMP(ill)) { 13997 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 13998 goto unlock; 13999 } 14000 14001 /* 14002 * Handle requests to add or remove an IP interface from a group. 14003 */ 14004 if (lifr->lifr_groupname[0] != '\0') { /* add */ 14005 /* 14006 * Moves are handled by first removing the interface from 14007 * its existing group, and then adding it to another group. 14008 * So, fail if it's already in a group. 14009 */ 14010 if (IS_UNDER_IPMP(ill)) { 14011 err = EALREADY; 14012 goto unlock; 14013 } 14014 14015 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 14016 if (grp == NULL) { 14017 err = ENOENT; 14018 goto unlock; 14019 } 14020 14021 /* 14022 * Check if the phyint and its ills are suitable for 14023 * inclusion into the group. 14024 */ 14025 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 14026 goto unlock; 14027 14028 /* 14029 * Checks pass; join the group, and enqueue the remaining 14030 * illgrp joins for when we've become part of the group xop 14031 * and are exclusive across its IPSQs. Since qwriter_ip() 14032 * requires an mblk_t to scribble on, and since `mp' will be 14033 * freed as part of completing the ioctl, allocate another. 14034 */ 14035 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 14036 err = ENOMEM; 14037 goto unlock; 14038 } 14039 14040 /* 14041 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 14042 * IPMP meta-interface ills needed by `phyi' cannot go away 14043 * before ip_join_illgrps() is called back. See the comments 14044 * in ip_sioctl_plink_ipmp() for more. 14045 */ 14046 if (phyi->phyint_illv4 != NULL) 14047 grp->gr_pendv4++; 14048 if (phyi->phyint_illv6 != NULL) 14049 grp->gr_pendv6++; 14050 14051 rw_exit(&ipst->ips_ipmp_lock); 14052 14053 ipmp_phyint_join_grp(phyi, grp); 14054 ill_refhold(ill); 14055 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 14056 SWITCH_OP, B_FALSE); 14057 return (0); 14058 } else { 14059 /* 14060 * Request to remove the interface from a group. If the 14061 * interface is not in a group, this trivially succeeds. 14062 */ 14063 rw_exit(&ipst->ips_ipmp_lock); 14064 if (IS_UNDER_IPMP(ill)) 14065 ipmp_phyint_leave_grp(phyi); 14066 return (0); 14067 } 14068 unlock: 14069 rw_exit(&ipst->ips_ipmp_lock); 14070 return (err); 14071 } 14072 14073 /* 14074 * Process an SIOCGLIFBINDING request. 14075 */ 14076 /* ARGSUSED */ 14077 int 14078 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 14079 ip_ioctl_cmd_t *ipip, void *ifreq) 14080 { 14081 ill_t *ill; 14082 struct lifreq *lifr = ifreq; 14083 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 14084 14085 if (!IS_IPMP(ipif->ipif_ill)) 14086 return (EINVAL); 14087 14088 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 14089 if ((ill = ipif->ipif_bound_ill) == NULL) 14090 lifr->lifr_binding[0] = '\0'; 14091 else 14092 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 14093 rw_exit(&ipst->ips_ipmp_lock); 14094 return (0); 14095 } 14096 14097 /* 14098 * Process an SIOCGLIFGROUPNAME request. 
14099 */
14100 /* ARGSUSED */
14101 int
14102 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
14103 ip_ioctl_cmd_t *ipip, void *ifreq)
14104 {
14105 ipmp_grp_t *grp;
14106 struct lifreq *lifr = ifreq;
14107 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
14108
14109 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
14110 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
14111 lifr->lifr_groupname[0] = '\0';
14112 else
14113 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
14114 rw_exit(&ipst->ips_ipmp_lock);
14115 return (0);
14116 }
14117
14118 /*
14119 * Process an SIOCGLIFGROUPINFO request.
14120 */
14121 /* ARGSUSED */
14122 int
14123 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
14124 ip_ioctl_cmd_t *ipip, void *dummy)
14125 {
14126 ipmp_grp_t *grp;
14127 lifgroupinfo_t *lifgr;
14128 ip_stack_t *ipst = CONNQ_TO_IPST(q);
14129
14130 /* ip_wput_nondata() verified mp->b_cont->b_cont */
14131 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
14132 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
14133
14134 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
14135 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
14136 rw_exit(&ipst->ips_ipmp_lock);
14137 return (ENOENT);
14138 }
14139 ipmp_grp_info(grp, lifgr);
14140 rw_exit(&ipst->ips_ipmp_lock);
14141 return (0);
14142 }
14143
14144 static void
14145 ill_dl_down(ill_t *ill)
14146 {
14147 /*
14148 * The ill is down; unbind but stay attached since we're still
14149 * associated with a PPA. If we have negotiated DLPI capabilities
14150 * with the data link service provider (IDCS_OK) then reset them.
14151 * The interval between unbinding and rebinding is potentially
14152 * unbounded hence we cannot assume things will be the same.
14153 * The DLPI capabilities will be probed again when the data link
14154 * is brought up.
14155 */
14156 mblk_t *mp = ill->ill_unbind_mp;
14157
14158 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
14159
14160 ill->ill_unbind_mp = NULL;
14161 if (mp != NULL) {
14162 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
14163 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
14164 ill->ill_name));
14165 mutex_enter(&ill->ill_lock);
14166 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
14167 mutex_exit(&ill->ill_lock);
14168 /*
14169 * ip_rput does not pass up normal (M_PROTO) DLPI messages
14170 * after ILL_CONDEMNED is set. So in the unplumb case, we call
14171 * ill_capability_dld_disable right away. If this is not
14172 * an unplumb operation then the disable happens on receipt of
14173 * the capab ack via ip_rput_dlpi_writer ->
14174 * ill_capability_ack_thr. In both cases the order of
14175 * the operations seen by DLD is capability disable followed
14176 * by DL_UNBIND. Also the DLD capability disable needs a
14177 * cv_wait'able context.
14178 */
14179 if (ill->ill_state_flags & ILL_CONDEMNED)
14180 ill_capability_dld_disable(ill);
14181 ill_capability_reset(ill, B_FALSE);
14182 ill_dlpi_send(ill, mp);
14183 }
14184
14185 /*
14186 * Toss all of our multicast memberships. We could keep them, but
14187 * then we'd have to do bookkeeping of any joins and leaves performed
14188 * by the application while the interface is down (we can't just
14189 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
14190 * on a downed interface).
14191 */
14192 ill_leave_multicast(ill);
14193
14194 mutex_enter(&ill->ill_lock);
14195 ill->ill_dl_up = 0;
14196 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
14197 mutex_exit(&ill->ill_lock);
14198 }
14199
14200 static void
14201 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
14202 {
14203 union DL_primitives *dlp;
14204 t_uscalar_t prim;
14205 boolean_t waitack = B_FALSE;
14206
14207 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
14208
14209 dlp = (union DL_primitives *)mp->b_rptr;
14210 prim = dlp->dl_primitive;
14211
14212 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
14213 dl_primstr(prim), prim, ill->ill_name));
14214
14215 switch (prim) {
14216 case DL_PHYS_ADDR_REQ:
14217 {
14218 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
14219 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
14220 break;
14221 }
14222 case DL_BIND_REQ:
14223 mutex_enter(&ill->ill_lock);
14224 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
14225 mutex_exit(&ill->ill_lock);
14226 break;
14227 }
14228
14229 /*
14230 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
14231 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
14232 * we only wait for the ACK of the DL_UNBIND_REQ.
14233 */
14234 mutex_enter(&ill->ill_lock);
14235 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
14236 (prim == DL_UNBIND_REQ)) {
14237 ill->ill_dlpi_pending = prim;
14238 waitack = B_TRUE;
14239 }
14240
14241 mutex_exit(&ill->ill_lock);
14242 putnext(ill->ill_wq, mp);
14243
14244 /*
14245 * There is no ack for DL_NOTIFY_CONF messages
14246 */
14247 if (waitack && prim == DL_NOTIFY_CONF)
14248 ill_dlpi_done(ill, prim);
14249 }
14250
14251 /*
14252 * Helper function for ill_dlpi_send().
14253 */
14254 /* ARGSUSED */
14255 static void
14256 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
14257 {
14258 ill_dlpi_send(q->q_ptr, mp);
14259 }
14260
14261 /*
14262 * Send a DLPI control message to the driver but make sure there
14263 * is only one outstanding message. Uses ill_dlpi_pending to tell
14264 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
14265 * when an ACK or a NAK is received to process the next queued message.
14266 */
14267 void
14268 ill_dlpi_send(ill_t *ill, mblk_t *mp)
14269 {
14270 mblk_t **mpp;
14271
14272 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
14273
14274 /*
14275 * To ensure that any DLPI requests for the current exclusive operation
14276 * are always completely sent before any DLPI messages for other
14277 * operations, require writer access before enqueuing.
14278 */
14279 if (!IAM_WRITER_ILL(ill)) {
14280 ill_refhold(ill);
14281 /* qwriter_ip() does the ill_refrele() */
14282 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
14283 NEW_OP, B_TRUE);
14284 return;
14285 }
14286
14287 mutex_enter(&ill->ill_lock);
14288 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
14289 /* Must queue message.
Tail insertion */ 14290 mpp = &ill->ill_dlpi_deferred; 14291 while (*mpp != NULL) 14292 mpp = &((*mpp)->b_next); 14293 14294 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 14295 ill->ill_name)); 14296 14297 *mpp = mp; 14298 mutex_exit(&ill->ill_lock); 14299 return; 14300 } 14301 mutex_exit(&ill->ill_lock); 14302 ill_dlpi_dispatch(ill, mp); 14303 } 14304 14305 static void 14306 ill_capability_send(ill_t *ill, mblk_t *mp) 14307 { 14308 ill->ill_capab_pending_cnt++; 14309 ill_dlpi_send(ill, mp); 14310 } 14311 14312 void 14313 ill_capability_done(ill_t *ill) 14314 { 14315 ASSERT(ill->ill_capab_pending_cnt != 0); 14316 14317 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 14318 14319 ill->ill_capab_pending_cnt--; 14320 if (ill->ill_capab_pending_cnt == 0 && 14321 ill->ill_dlpi_capab_state == IDCS_OK) 14322 ill_capability_reset_alloc(ill); 14323 } 14324 14325 /* 14326 * Send all deferred DLPI messages without waiting for their ACKs. 14327 */ 14328 void 14329 ill_dlpi_send_deferred(ill_t *ill) 14330 { 14331 mblk_t *mp, *nextmp; 14332 14333 /* 14334 * Clear ill_dlpi_pending so that the message is not queued in 14335 * ill_dlpi_send(). 14336 */ 14337 mutex_enter(&ill->ill_lock); 14338 ill->ill_dlpi_pending = DL_PRIM_INVAL; 14339 mp = ill->ill_dlpi_deferred; 14340 ill->ill_dlpi_deferred = NULL; 14341 mutex_exit(&ill->ill_lock); 14342 14343 for (; mp != NULL; mp = nextmp) { 14344 nextmp = mp->b_next; 14345 mp->b_next = NULL; 14346 ill_dlpi_send(ill, mp); 14347 } 14348 } 14349 14350 /* 14351 * Check if the DLPI primitive `prim' is pending; print a warning if not. 14352 */ 14353 boolean_t 14354 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 14355 { 14356 t_uscalar_t pending; 14357 14358 mutex_enter(&ill->ill_lock); 14359 if (ill->ill_dlpi_pending == prim) { 14360 mutex_exit(&ill->ill_lock); 14361 return (B_TRUE); 14362 } 14363 14364 /* 14365 * During teardown, ill_dlpi_dispatch() will send DLPI requests 14366 * without waiting, so don't print any warnings in that case. 14367 */ 14368 if (ill->ill_state_flags & ILL_CONDEMNED) { 14369 mutex_exit(&ill->ill_lock); 14370 return (B_FALSE); 14371 } 14372 pending = ill->ill_dlpi_pending; 14373 mutex_exit(&ill->ill_lock); 14374 14375 if (pending == DL_PRIM_INVAL) { 14376 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14377 "received unsolicited ack for %s on %s\n", 14378 dl_primstr(prim), ill->ill_name); 14379 } else { 14380 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14381 "received unexpected ack for %s on %s (expecting %s)\n", 14382 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 14383 } 14384 return (B_FALSE); 14385 } 14386 14387 /* 14388 * Complete the current DLPI operation associated with `prim' on `ill' and 14389 * start the next queued DLPI operation (if any). If there are no queued DLPI 14390 * operations and the ill's current exclusive IPSQ operation has finished 14391 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to 14392 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See 14393 * the comments above ipsq_current_finish() for details. 
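 *
 * A sketch of the serialization ill_dlpi_send()/ill_dlpi_done() provide
 * (mp1, mp2 and prim1 are illustrative names):
 *
 *	ill_dlpi_send(ill, mp1);	dispatched; ill_dlpi_pending set
 *	ill_dlpi_send(ill, mp2);	queued on ill_dlpi_deferred
 *	... ACK/NAK for mp1 arrives; ip_rput_dlpi_writer() runs ...
 *	ill_dlpi_done(ill, prim1);	dispatches mp2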
14394 */
14395 void
14396 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
14397 {
14398 mblk_t *mp;
14399 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14400 ipxop_t *ipx = ipsq->ipsq_xop;
14401
14402 ASSERT(IAM_WRITER_IPSQ(ipsq));
14403 mutex_enter(&ill->ill_lock);
14404
14405 ASSERT(prim != DL_PRIM_INVAL);
14406 ASSERT(ill->ill_dlpi_pending == prim);
14407
14408 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
14409 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
14410
14411 if ((mp = ill->ill_dlpi_deferred) == NULL) {
14412 ill->ill_dlpi_pending = DL_PRIM_INVAL;
14413 if (ipx->ipx_current_done) {
14414 mutex_enter(&ipx->ipx_lock);
14415 ipx->ipx_current_ipif = NULL;
14416 mutex_exit(&ipx->ipx_lock);
14417 }
14418 cv_signal(&ill->ill_cv);
14419 mutex_exit(&ill->ill_lock);
14420 return;
14421 }
14422
14423 ill->ill_dlpi_deferred = mp->b_next;
14424 mp->b_next = NULL;
14425 mutex_exit(&ill->ill_lock);
14426
14427 ill_dlpi_dispatch(ill, mp);
14428 }
14429
14430 void
14431 conn_delete_ire(conn_t *connp, caddr_t arg)
14432 {
14433 ipif_t *ipif = (ipif_t *)arg;
14434 ire_t *ire;
14435
14436 /*
14437 * Look at the cached ires on conns which have pointers to ipifs.
14438 * We just call ire_refrele which clears up the reference
14439 * to ire. Called when a conn closes. Also called from ipif_free
14440 * to cleanup indirect references to the stale ipif via the cached ire.
14441 */
14442 mutex_enter(&connp->conn_lock);
14443 ire = connp->conn_ire_cache;
14444 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
14445 connp->conn_ire_cache = NULL;
14446 mutex_exit(&connp->conn_lock);
14447 IRE_REFRELE_NOTR(ire);
14448 return;
14449 }
14450 mutex_exit(&connp->conn_lock);
14451
14452 }
14453
14454 /*
14455 * Some operations (e.g., ipif_down()) conditionally delete a number
14456 * of IREs. Those IREs may have been previously cached in the conn structure.
14457 * This ipcl_walk() walker function releases all references to such IREs based
14458 * on the condemned flag.
14459 */
14460 /* ARGSUSED */
14461 void
14462 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
14463 {
14464 ire_t *ire;
14465
14466 mutex_enter(&connp->conn_lock);
14467 ire = connp->conn_ire_cache;
14468 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
14469 connp->conn_ire_cache = NULL;
14470 mutex_exit(&connp->conn_lock);
14471 IRE_REFRELE_NOTR(ire);
14472 return;
14473 }
14474 mutex_exit(&connp->conn_lock);
14475 }
14476
14477 /*
14478 * Take down a specific interface, but don't lose any information about it.
14479 * (Always called as writer.)
14480 * This function goes through the down sequence even if the interface is
14481 * already down. There are 2 reasons.
14482 * a. Currently we permit interface routes that depend on down interfaces
14483 * to be added. This behaviour itself is questionable. However it appears
14484 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
14485 * time. We go thru the cleanup in order to remove these routes.
14486 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
14487 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
14488 * down, but we need to clean up, i.e. do ill_dl_down and
14489 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
14490 *
14491 * IP-MT notes:
14492 *
14493 * Model of reference to interfaces.
14494 *
14495 * The following members in ipif_t track references to the ipif.
14496 * int ipif_refcnt; Active reference count
14497 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif
14498 * uint_t ipif_ilm_cnt; Number of ilms referencing this ipif.
14499 *
14500 * The following members in ill_t track references to the ill.
14501 * int ill_refcnt; active refcnt
14502 * uint_t ill_ire_cnt; Number of ires referencing ill
14503 * uint_t ill_nce_cnt; Number of nces referencing ill
14504 * uint_t ill_ilm_cnt; Number of ilms referencing ill
14505 *
14506 * Reference to an ipif or ill can be obtained in any of the following ways.
14507 *
14508 * Through the lookup functions ipif_lookup_* / ill_lookup_*
14509 * Pointers to ipif / ill from other data structures viz ire and conn.
14510 * Implicit reference to the ipif / ill by holding a reference to the ire.
14511 *
14512 * The ipif/ill lookup functions return a reference-held ipif / ill.
14513 * ipif_refcnt and ill_refcnt track the reference counts respectively.
14514 * This is a purely dynamic reference count associated with threads holding
14515 * references to the ipif / ill. Pointers from other structures do not
14516 * count towards this reference count.
14517 *
14518 * ipif_ire_cnt/ill_ire_cnt is the number of ire's
14519 * associated with the ipif/ill. This is incremented whenever a new
14520 * ire is created referencing the ipif/ill. This is done atomically inside
14521 * ire_add_v[46] where the ire is actually added to the ire hash table.
14522 * The count is decremented in ire_inactive where the ire is destroyed.
14523 *
14524 * nce's reference ill's thru nce_ill and the count of nce's associated with
14525 * an ill is recorded in ill_nce_cnt. This is incremented atomically in
14526 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
14527 * table. Similarly it is decremented in ndp_inactive() where the nce
14528 * is destroyed.
14529 *
14530 * ilm's reference to the ipif (for IPv4 ilm's) or the ill (for IPv6 ilm's)
14531 * is incremented in ilm_add_v6() and decremented before the ilm is freed
14532 * in ilm_walker_cleanup() or ilm_delete().
14533 *
14534 * Flow of ioctls involving interface down/up
14535 *
14536 * The following is the sequence of an attempt to set some critical flags on an
14537 * up interface.
14538 * ip_sioctl_flags
14539 * ipif_down
14540 * wait for ipif to be quiescent
14541 * ipif_down_tail
14542 * ip_sioctl_flags_tail
14543 *
14544 * All set ioctls that involve a down/up sequence have a skeleton similar
14545 * to the above. All the *tail functions are called after the refcounts have
14546 * dropped to the appropriate values.
14547 *
14548 * The mechanism to quiesce an ipif is as follows.
14549 *
14550 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
14551 * on the ipif. Callers either pass a flag requesting wait, or the lookup
14552 * functions will return NULL.
14553 *
14554 * Delete all ires referencing this ipif
14555 *
14556 * Any thread attempting to do an ipif_refhold on an ipif that has been
14557 * obtained thru a cached pointer will first make sure that
14558 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
14559 * increment the refcount.
14560 *
14561 * The above guarantees that the ipif refcount will eventually come down to
14562 * zero and the ipif will quiesce, once all threads that currently hold a
14563 * reference to the ipif refrelease the ipif. The ipif is quiescent after the
14564 * ipif_refcount has dropped to zero and all ire's associated with this ipif
14565 * have also been ire_inactive'd.
i.e. when ipif_{ire, ill}_cnt and
14566 * ipif_refcnt both drop to zero. See also: comments above IPIF_DOWN_OK()
14567 * in ip.h
14568 *
14569 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
14570 *
14571 * Threads trying to lookup an ipif or ill can pass a flag requesting
14572 * wait and restart if the ipif / ill cannot be looked up currently.
14573 * E.g. bind and route operations (e.g. route add / delete) cannot return
14574 * failure if the ipif is currently undergoing an exclusive operation, and
14575 * hence pass the flag. The mblk is then enqueued in the ipsq and the operation
14576 * is restarted by ipsq_exit() when the current exclusive operation completes.
14577 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
14578 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
14579 * change while the ill_lock is held. Before dropping the ill_lock we acquire
14580 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
14581 * until we release the ipsq_lock, even though the ill/ipif state flags
14582 * can change after we drop the ill_lock.
14583 *
14584 * An attempt to send out a packet using an ipif that is currently
14585 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
14586 * operation and restart it later when the exclusive condition on the ipif ends.
14587 * This is an example of not passing the wait flag to the lookup functions. For
14588 * example an attempt to refhold and use conn->conn_multicast_ipif and send
14589 * out a multicast packet on that ipif will fail while the ipif is
14590 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
14591 * currently IPIF_CHANGING will also fail.
14592 */
14593 int
14594 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
14595 {
14596 ill_t *ill = ipif->ipif_ill;
14597 conn_t *connp;
14598 boolean_t success;
14599 boolean_t ipif_was_up = B_FALSE;
14600 ip_stack_t *ipst = ill->ill_ipst;
14601
14602 ASSERT(IAM_WRITER_IPIF(ipif));
14603
14604 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
14605
14606 if (ipif->ipif_flags & IPIF_UP) {
14607 mutex_enter(&ill->ill_lock);
14608 ipif->ipif_flags &= ~IPIF_UP;
14609 ASSERT(ill->ill_ipif_up_count > 0);
14610 --ill->ill_ipif_up_count;
14611 mutex_exit(&ill->ill_lock);
14612 ipif_was_up = B_TRUE;
14613 /* Update status in SCTP's list */
14614 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
14615 ill_nic_event_dispatch(ipif->ipif_ill,
14616 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
14617 }
14618
14619 /*
14620 * Blow away memberships we established in ipif_multicast_up().
14621 */
14622 ipif_multicast_down(ipif);
14623
14624 /*
14625 * Remove from the mapping for __sin6_src_id. We insert only
14626 * when the address is not INADDR_ANY. As IPv4 addresses are
14627 * stored as mapped addresses, we need to check for mapped
14628 * INADDR_ANY also.
14629 */
14630 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
14631 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
14632 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14633 int err;
14634
14635 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
14636 ipif->ipif_zoneid, ipst);
14637 if (err != 0) {
14638 ip0dbg(("ipif_down: srcid_remove %d\n", err));
14639 }
14640 }
14641
14642 /*
14643 * Delete all IRE's pointing at this ipif or its source address.
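 * (Both direct ire_ipif matches and IRE_CACHE entries whose source
 * address matches the ipif are removed; see ipif_down_delete_ire().)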
14644 */
14645 if (ipif->ipif_isv6) {
14646 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
14647 ipst);
14648 } else {
14649 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
14650 ipst);
14651 }
14652
14653 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
14654 /*
14655 * Since the interface is now down, it may have just become
14656 * inactive. Note that this needs to be done even for an
14657 * ipif_logical_down(), or ARP entries will not get correctly
14658 * restored when the interface comes back up.
14659 */
14660 if (IS_UNDER_IPMP(ill))
14661 ipmp_ill_refresh_active(ill);
14662 }
14663
14664 /*
14665 * Cleaning up the conn_ire_cache or conns must be done only after the
14666 * ires have been deleted above. Otherwise a thread could end up
14667 * caching an ire in a conn after we have finished the cleanup of the
14668 * conn. The caching is done after making sure that the ire is not yet
14669 * condemned. Also documented in the block comment above ip_output
14670 */
14671 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
14672 /* Also, delete the ires cached in SCTP */
14673 sctp_ire_cache_flush(ipif);
14674
14675 /*
14676 * Update any other ipifs which have used "our" local address as
14677 * a source address. This entails removing and recreating IRE_INTERFACE
14678 * entries for such ipifs.
14679 */
14680 if (ipif->ipif_isv6)
14681 ipif_update_other_ipifs_v6(ipif);
14682 else
14683 ipif_update_other_ipifs(ipif);
14684
14685 /*
14686 * Take down the neighbor-discovery or arp entries for this interface.
14687 */
14688 ipif_ndp_down(ipif);
14689
14690 /*
14691 * If mp is NULL the caller will wait for the appropriate refcnt.
14692 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
14693 * and ill_delete -> ipif_free -> ipif_down
14694 */
14695 if (mp == NULL) {
14696 ASSERT(q == NULL);
14697 return (0);
14698 }
14699
14700 if (CONN_Q(q)) {
14701 connp = Q_TO_CONN(q);
14702 mutex_enter(&connp->conn_lock);
14703 } else {
14704 connp = NULL;
14705 }
14706 mutex_enter(&ill->ill_lock);
14707 /*
14708 * Are there any ire's pointing to this ipif that are still active?
14709 * If this is the last ipif going down, are there any ire's pointing
14710 * to this ill that are still active?
14711 */
14712 if (ipif_is_quiescent(ipif)) {
14713 mutex_exit(&ill->ill_lock);
14714 if (connp != NULL)
14715 mutex_exit(&connp->conn_lock);
14716 return (0);
14717 }
14718
14719 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
14720 ill->ill_name, (void *)ill));
14721 /*
14722 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
14723 * drops down, the operation will be restarted by ipif_ill_refrele_tail
14724 * which in turn is called by the last refrele on the ipif/ill/ire.
14725 */
14726 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
14727 if (!success) {
14728 /* The conn is closing. So just return */
14729 ASSERT(connp != NULL);
14730 mutex_exit(&ill->ill_lock);
14731 mutex_exit(&connp->conn_lock);
14732 return (EINTR);
14733 }
14734
14735 mutex_exit(&ill->ill_lock);
14736 if (connp != NULL)
14737 mutex_exit(&connp->conn_lock);
14738 return (EINPROGRESS);
14739 }
14740
14741 void
14742 ipif_down_tail(ipif_t *ipif)
14743 {
14744 ill_t *ill = ipif->ipif_ill;
14745
14746 /*
14747 * Skip any loopback interface (null wq).
14748 * If this is the last logical interface on the ill,
14749 * have ill_dl_down tell the driver we are gone (unbind).
14750 * Note that lun 0 can ipif_down even though
14751 * there are other logical units that are up.
14752 * This occurs e.g.
14740 14741 void 14742 ipif_down_tail(ipif_t *ipif) 14743 { 14744 ill_t *ill = ipif->ipif_ill; 14745 14746 /* 14747 * Skip any loopback interface (null wq). 14748 * If this is the last logical interface on the ill, 14749 * have ill_dl_down tell the driver we are gone (unbind). 14750 * Note that lun 0 can ipif_down even though 14751 * there are other logical units that are up. 14752 * This occurs e.g. when we change a "significant" IFF_ flag. 14753 */ 14754 if (ill->ill_wq != NULL && !ill->ill_logical_down && 14755 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 14756 ill->ill_dl_up) { 14757 ill_dl_down(ill); 14758 } 14759 ill->ill_logical_down = 0; 14760 14761 /* 14762 * Has to be after removing the routes in ipif_down_delete_ire. 14763 */ 14764 ipif_resolver_down(ipif); 14765 14766 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 14767 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); 14768 } 14769 14770 /* 14771 * Bring the interface logically down without bringing the physical interface 14772 * down, e.g. when the netmask is changed. This avoids long-lasting link 14773 * negotiations between an Ethernet interface and certain switches. 14774 */ 14775 static int 14776 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 14777 { 14778 /* 14779 * The ill_logical_down flag is a transient flag. It is set here 14780 * and is cleared once the down has completed in ipif_down_tail. 14781 * This flag does not indicate whether the ill stream is in the 14782 * DL_BOUND state with the driver. Instead this flag is used by 14783 * ipif_down_tail to determine whether to DL_UNBIND the stream with 14784 * the driver. The state of the ill stream, i.e. whether it is 14785 * DL_BOUND with the driver or not, is indicated by the ill_dl_up flag. 14786 */ 14787 ipif->ipif_ill->ill_logical_down = 1; 14788 return (ipif_down(ipif, q, mp)); 14789 } 14790 14791 /* 14792 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 14793 * Whether or not the usesrc client ILL is already part of a usesrc group, 14794 * an ire_stq with the matching usesrc client ILL will 14795 * locate the IREs that need to be deleted. We want IREs to be created 14796 * with the new source address. 14797 */ 14798 static void 14799 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 14800 { 14801 ill_t *ucill = (ill_t *)ill_arg; 14802 14803 ASSERT(IAM_WRITER_ILL(ucill)); 14804 14805 if (ire->ire_stq == NULL) 14806 return; 14807 14808 if ((ire->ire_type == IRE_CACHE) && 14809 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 14810 ire_delete(ire); 14811 } 14812 14813 /* 14814 * ire_walk routine to delete every IRE dependent on the interface 14815 * address that is going down. (Always called as writer.) 14816 * Works for both v4 and v6. 14817 * In addition to checking for ire_ipif matches, it also checks for 14818 * IRE_CACHE entries which have the same source address as the 14819 * disappearing ipif, since ipif_select_source might have picked 14820 * that source. Note that ipif_down/ipif_update_other_ipifs takes 14821 * care of any IRE_INTERFACE with the disappearing source address. 14822 */ 14823 static void 14824 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 14825 { 14826 ipif_t *ipif = (ipif_t *)ipif_arg; 14827 14828 ASSERT(IAM_WRITER_IPIF(ipif)); 14829 if (ire->ire_ipif == NULL) 14830 return; 14831 14832 if (ire->ire_ipif != ipif) { 14833 /* 14834 * Look for a matching source address.
14835 */ 14836 if (ire->ire_type != IRE_CACHE) 14837 return; 14838 if (ipif->ipif_flags & IPIF_NOLOCAL) 14839 return; 14840 14841 if (ire->ire_ipversion == IPV4_VERSION) { 14842 if (ire->ire_src_addr != ipif->ipif_src_addr) 14843 return; 14844 } else { 14845 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 14846 &ipif->ipif_v6lcl_addr)) 14847 return; 14848 } 14849 ire_delete(ire); 14850 return; 14851 } 14852 /* 14853 * ire_delete() will do an ire_flush_cache which will delete 14854 * all ire_ipif matches. 14855 */ 14856 ire_delete(ire); 14857 } 14858 14859 /* 14860 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 14861 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flag, or 14862 * 2) when an interface is brought up or down (on that ill). 14863 * This ensures that the IRE_CACHE entries don't retain stale source 14864 * address selection results. 14865 */ 14866 void 14867 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 14868 { 14869 ill_t *ill = (ill_t *)ill_arg; 14870 14871 ASSERT(IAM_WRITER_ILL(ill)); 14872 ASSERT(ire->ire_type == IRE_CACHE); 14873 14874 /* 14875 * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches 14876 * ill, but we only want to delete the IRE if ire_ipif matches. 14877 */ 14878 ASSERT(ire->ire_ipif != NULL); 14879 if (ill == ire->ire_ipif->ipif_ill) 14880 ire_delete(ire); 14881 } 14882 14883 /* 14884 * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this 14885 * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references 14886 * the IPMP ill. 14887 */ 14888 void 14889 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 14890 { 14891 ill_t *ill = (ill_t *)ill_arg; 14892 14893 ASSERT(IAM_WRITER_ILL(ill)); 14894 ASSERT(ire->ire_type == IRE_CACHE); 14895 14896 /* 14897 * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches 14898 * ill, but we only want to delete the IRE if ire_stq matches. 14899 */ 14900 if (ire->ire_stq->q_ptr == ill_arg) 14901 ire_delete(ire); 14902 } 14903 14904 /* 14905 * Delete all the IREs whose ire_stq's reference any ill in the same IPMP 14906 * group as `ill_arg'. Used by ipmp_ill_deactivate() to flush all IRE_CACHE 14907 * entries for the illgrp. 14908 */ 14909 void 14910 ill_grp_cache_delete(ire_t *ire, char *ill_arg) 14911 { 14912 ill_t *ill = (ill_t *)ill_arg; 14913 14914 ASSERT(IAM_WRITER_ILL(ill)); 14915 14916 if (ire->ire_type == IRE_CACHE && 14917 IS_IN_SAME_ILLGRP((ill_t *)ire->ire_stq->q_ptr, ill)) { 14918 ire_delete(ire); 14919 } 14920 } 14921 14922 /* 14923 * Delete all broadcast IREs with a source address on `ill_arg'. 14924 */ 14925 static void 14926 ill_broadcast_delete(ire_t *ire, char *ill_arg) 14927 { 14928 ill_t *ill = (ill_t *)ill_arg; 14929 14930 ASSERT(IAM_WRITER_ILL(ill)); 14931 ASSERT(ire->ire_type == IRE_BROADCAST); 14932 14933 if (ire->ire_ipif->ipif_ill == ill) 14934 ire_delete(ire); 14935 }
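/*
 * Editor's note (illustrative addition, not original source): the
 * walker callbacks above are driven through the ire_walk machinery
 * rather than called directly. A minimal sketch, patterned on the
 * invocation that appears later in ipif_up_done():
 *
 *	ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
 *	    ill_ipif_cache_delete, (char *)ill, ill);
 *
 * The walk hands each matching IRE_CACHE to the callback, which then
 * applies its own ire_ipif/ire_stq filter before calling ire_delete().
 */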
14936 14937 /* 14938 * Initiate deallocate of an IPIF. Always called as writer. Called by 14939 * ill_delete or ip_sioctl_removeif. 14940 */ 14941 static void 14942 ipif_free(ipif_t *ipif) 14943 { 14944 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 14945 14946 ASSERT(IAM_WRITER_IPIF(ipif)); 14947 14948 if (ipif->ipif_recovery_id != 0) 14949 (void) untimeout(ipif->ipif_recovery_id); 14950 ipif->ipif_recovery_id = 0; 14951 14952 /* Remove conn references */ 14953 reset_conn_ipif(ipif); 14954 14955 /* 14956 * Make sure we have valid net and subnet broadcast ire's for the 14957 * other ipifs which share them with this ipif. 14958 */ 14959 if (!ipif->ipif_isv6) 14960 ipif_check_bcast_ires(ipif); 14961 14962 /* 14963 * Take down the interface. We can be called either from ill_delete 14964 * or from ip_sioctl_removeif. 14965 */ 14966 (void) ipif_down(ipif, NULL, NULL); 14967 14968 /* 14969 * Now that the interface is down, there's no chance it can still 14970 * become a duplicate. Cancel any timer that may have been set while 14971 * tearing down. 14972 */ 14973 if (ipif->ipif_recovery_id != 0) 14974 (void) untimeout(ipif->ipif_recovery_id); 14975 ipif->ipif_recovery_id = 0; 14976 14977 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 14978 /* Remove pointers to this ill in the multicast routing tables */ 14979 reset_mrt_vif_ipif(ipif); 14980 /* If necessary, clear the cached source ipif rotor. */ 14981 if (ipif->ipif_ill->ill_src_ipif == ipif) 14982 ipif->ipif_ill->ill_src_ipif = NULL; 14983 rw_exit(&ipst->ips_ill_g_lock); 14984 } 14985 14986 static void 14987 ipif_free_tail(ipif_t *ipif) 14988 { 14989 mblk_t *mp; 14990 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 14991 14992 /* 14993 * Free state for additional IRE_IF_[NO]RESOLVER ire's. 14994 */ 14995 mutex_enter(&ipif->ipif_saved_ire_lock); 14996 mp = ipif->ipif_saved_ire_mp; 14997 ipif->ipif_saved_ire_mp = NULL; 14998 mutex_exit(&ipif->ipif_saved_ire_lock); 14999 freemsg(mp); 15000 15001 /* 15002 * Need to hold both ill_g_lock and ill_lock while 15003 * inserting or removing an ipif from the linked list 15004 * of ipifs hanging off the ill. 15005 */ 15006 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15007 15008 ASSERT(ilm_walk_ipif(ipif) == 0); 15009 15010 #ifdef DEBUG 15011 ipif_trace_cleanup(ipif); 15012 #endif 15013 15014 /* Ask SCTP to take it out of its list */ 15015 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 15016 15017 /* Get it out of the ILL interface list. */ 15018 ipif_remove(ipif); 15019 rw_exit(&ipst->ips_ill_g_lock); 15020 15021 mutex_destroy(&ipif->ipif_saved_ire_lock); 15022 15023 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 15024 ASSERT(ipif->ipif_recovery_id == 0); 15025 15026 /* Free the memory. */ 15027 mi_free(ipif); 15028 } 15029 15030 /* 15031 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 15032 * is zero. 15033 */ 15034 void 15035 ipif_get_name(const ipif_t *ipif, char *buf, int len) 15036 { 15037 char lbuf[LIFNAMSIZ]; 15038 char *name; 15039 size_t name_len; 15040 15041 buf[0] = '\0'; 15042 name = ipif->ipif_ill->ill_name; 15043 name_len = ipif->ipif_ill->ill_name_length; 15044 if (ipif->ipif_id != 0) { 15045 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 15046 ipif->ipif_id); 15047 name = lbuf; 15048 name_len = mi_strlen(name) + 1; 15049 } 15050 len -= 1; 15051 buf[len] = '\0'; 15052 len = MIN(len, name_len); 15053 bcopy(name, buf, len); 15054 }
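/*
 * Editor's example (illustrative addition, not original source): for
 * an ill named "hme0", ipif_get_name() above yields "hme0" when
 * ipif_id is 0 and "hme0:2" when ipif_id is 2, truncated to fit the
 * caller's buffer:
 *
 *	char name[LIFNAMSIZ];
 *	ipif_get_name(ipif, name, sizeof (name));
 *
 * The inverse mapping, from a name back to an ipif, is performed by
 * ipif_lookup_on_name() below.
 */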
15055 15056 /* 15057 * Find an IPIF based on the name passed in. Names can be of the form <phys> 15058 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the 15059 * implied unit id is zero. <phys> must correspond to the name of an ILL. 15060 * (May be called as writer.) 15061 */ 15062 static ipif_t * 15063 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 15064 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 15065 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 15066 { 15067 char *cp; 15068 char *endp; 15069 long id; 15070 ill_t *ill; 15071 ipif_t *ipif; 15072 uint_t ire_type; 15073 boolean_t did_alloc = B_FALSE; 15074 ipsq_t *ipsq; 15075 15076 if (error != NULL) 15077 *error = 0; 15078 15079 /* 15080 * If the caller wants us to create the ipif, make sure we have a 15081 * valid zoneid. 15082 */ 15083 ASSERT(!do_alloc || zoneid != ALL_ZONES); 15084 15085 if (namelen == 0) { 15086 if (error != NULL) 15087 *error = ENXIO; 15088 return (NULL); 15089 } 15090 15091 *exists = B_FALSE; 15092 /* Look for a colon in the name. */ 15093 endp = &name[namelen]; 15094 for (cp = endp; --cp > name; ) { 15095 if (*cp == IPIF_SEPARATOR_CHAR) 15096 break; 15097 } 15098 15099 if (*cp == IPIF_SEPARATOR_CHAR) { 15100 /* 15101 * Reject any non-decimal aliases for logical 15102 * interfaces. Aliases with leading zeroes 15103 * are also rejected as they introduce ambiguity 15104 * in the naming of the interfaces. 15105 * In order to conform to existing semantics, 15106 * and to not break any programs/scripts relying 15107 * on that behaviour, if<0>:0 is considered to be 15108 * a valid interface. 15109 * 15110 * If the alias has two or more digits and the first 15111 * is zero, fail. 15112 */ 15113 if (&cp[2] < endp && cp[1] == '0') { 15114 if (error != NULL) 15115 *error = EINVAL; 15116 return (NULL); 15117 } 15118 } 15119 15120 if (cp <= name) { 15121 cp = endp; 15122 } else { 15123 *cp = '\0'; 15124 } 15125 15126 /* 15127 * Look up the ILL, based on the portion of the name 15128 * before the colon. ill_lookup_on_name returns a held ill. 15129 * Temporary to check whether ill exists already. If so 15130 * ill_lookup_on_name will clear it. 15131 */ 15132 ill = ill_lookup_on_name(name, do_alloc, isv6, 15133 q, mp, func, error, &did_alloc, ipst); 15134 if (cp != endp) 15135 *cp = IPIF_SEPARATOR_CHAR; 15136 if (ill == NULL) 15137 return (NULL); 15138 15139 /* Establish the unit number in the name. */ 15140 id = 0; 15141 if (cp < endp && *endp == '\0') { 15142 /* If there was a colon, the unit number follows. */ 15143 cp++; 15144 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 15145 ill_refrele(ill); 15146 if (error != NULL) 15147 *error = ENXIO; 15148 return (NULL); 15149 } 15150 } 15151 15152 GRAB_CONN_LOCK(q); 15153 mutex_enter(&ill->ill_lock); 15154 /* Now see if there is an IPIF with this unit number. */ 15155 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15156 if (ipif->ipif_id == id) { 15157 if (zoneid != ALL_ZONES && 15158 zoneid != ipif->ipif_zoneid && 15159 ipif->ipif_zoneid != ALL_ZONES) { 15160 mutex_exit(&ill->ill_lock); 15161 RELEASE_CONN_LOCK(q); 15162 ill_refrele(ill); 15163 if (error != NULL) 15164 *error = ENXIO; 15165 return (NULL); 15166 } 15167 /* 15168 * The block comment at the start of ipif_down 15169 * explains the use of the macros used below. 15170 */ 15171 if (IPIF_CAN_LOOKUP(ipif)) { 15172 ipif_refhold_locked(ipif); 15173 mutex_exit(&ill->ill_lock); 15174 if (!did_alloc) 15175 *exists = B_TRUE; 15176 /* 15177 * Drop locks before calling ill_refrele 15178 * since it can potentially call into 15179 * ipif_ill_refrele_tail which can end up 15180 * in trying to acquire any lock.
15181 */ 15182 RELEASE_CONN_LOCK(q); 15183 ill_refrele(ill); 15184 return (ipif); 15185 } else if (IPIF_CAN_WAIT(ipif, q)) { 15186 ipsq = ill->ill_phyint->phyint_ipsq; 15187 mutex_enter(&ipsq->ipsq_lock); 15188 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 15189 mutex_exit(&ill->ill_lock); 15190 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 15191 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 15192 mutex_exit(&ipsq->ipsq_lock); 15193 RELEASE_CONN_LOCK(q); 15194 ill_refrele(ill); 15195 if (error != NULL) 15196 *error = EINPROGRESS; 15197 return (NULL); 15198 } 15199 } 15200 } 15201 RELEASE_CONN_LOCK(q); 15202 15203 if (!do_alloc) { 15204 mutex_exit(&ill->ill_lock); 15205 ill_refrele(ill); 15206 if (error != NULL) 15207 *error = ENXIO; 15208 return (NULL); 15209 } 15210 15211 /* 15212 * If none found, atomically allocate and return a new one. 15213 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 15214 * to support "receive only" use of lo0:1 etc. as is still done 15215 * below as an initial guess. 15216 * However, this is now likely to be overridden later in ipif_up_done() 15217 * when we know for sure what address has been configured on the 15218 * interface, since we might have more than one loopback interface 15219 * with a loopback address, e.g. in the case of zones, and all the 15220 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 15221 */ 15222 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 15223 ire_type = IRE_LOOPBACK; 15224 else 15225 ire_type = IRE_LOCAL; 15226 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); 15227 if (ipif != NULL) 15228 ipif_refhold_locked(ipif); 15229 else if (error != NULL) 15230 *error = ENOMEM; 15231 mutex_exit(&ill->ill_lock); 15232 ill_refrele(ill); 15233 return (ipif); 15234 } 15235 15236 /* 15237 * This routine is called whenever a new address comes up on an ipif. If 15238 * we are configured to respond to address mask requests, then we are supposed 15239 * to broadcast an address mask reply at this time. This routine is also 15240 * called if we are already up, but a netmask change is made. This is legal 15241 * but might not make the system manager very popular. (May be called 15242 * as writer.)
15243 */ 15244 void 15245 ipif_mask_reply(ipif_t *ipif) 15246 { 15247 icmph_t *icmph; 15248 ipha_t *ipha; 15249 mblk_t *mp; 15250 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15251 15252 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 15253 15254 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 15255 return; 15256 15257 /* ICMP mask reply is IPv4 only */ 15258 ASSERT(!ipif->ipif_isv6); 15259 /* ICMP mask reply is not for a loopback interface */ 15260 ASSERT(ipif->ipif_ill->ill_wq != NULL); 15261 15262 mp = allocb(REPLY_LEN, BPRI_HI); 15263 if (mp == NULL) 15264 return; 15265 mp->b_wptr = mp->b_rptr + REPLY_LEN; 15266 15267 ipha = (ipha_t *)mp->b_rptr; 15268 bzero(ipha, REPLY_LEN); 15269 *ipha = icmp_ipha; 15270 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 15271 ipha->ipha_src = ipif->ipif_src_addr; 15272 ipha->ipha_dst = ipif->ipif_brd_addr; 15273 ipha->ipha_length = htons(REPLY_LEN); 15274 ipha->ipha_ident = 0; 15275 15276 icmph = (icmph_t *)&ipha[1]; 15277 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 15278 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 15279 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 15280 15281 put(ipif->ipif_wq, mp); 15282 15283 #undef REPLY_LEN 15284 } 15285 15286 /* 15287 * When the mtu in the ipif changes, we call this routine through ire_walk 15288 * to update all the relevant IREs. 15289 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 15290 */ 15291 static void 15292 ipif_mtu_change(ire_t *ire, char *ipif_arg) 15293 { 15294 ipif_t *ipif = (ipif_t *)ipif_arg; 15295 15296 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 15297 return; 15298 15299 mutex_enter(&ire->ire_lock); 15300 if (ire->ire_marks & IRE_MARK_PMTU) { 15301 /* Avoid increasing the PMTU */ 15302 ire->ire_max_frag = MIN(ipif->ipif_mtu, ire->ire_max_frag); 15303 if (ire->ire_max_frag == ipif->ipif_mtu) 15304 ire->ire_marks &= ~IRE_MARK_PMTU; 15305 } else { 15306 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 15307 } 15308 mutex_exit(&ire->ire_lock); 15309 } 15310 15311 /* 15312 * When the mtu in the ill changes, we call this routine through ire_walk 15313 * to update all the relevant IREs. 15314 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 15315 */ 15316 void 15317 ill_mtu_change(ire_t *ire, char *ill_arg) 15318 { 15319 ill_t *ill = (ill_t *)ill_arg; 15320 15321 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 15322 return; 15323 15324 mutex_enter(&ire->ire_lock); 15325 if (ire->ire_marks & IRE_MARK_PMTU) { 15326 /* Avoid increasing the PMTU */ 15327 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 15328 ire->ire_max_frag); 15329 if (ire->ire_max_frag == ire->ire_ipif->ipif_mtu) { 15330 ire->ire_marks &= ~IRE_MARK_PMTU; 15331 } 15332 } else { 15333 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, IP_MAXPACKET); 15334 } 15335 mutex_exit(&ire->ire_lock); 15336 } 15337 15338 /* 15339 * Join the ipif specific multicast groups. 15340 * Must be called after a mapping has been set up in the resolver. (Always 15341 * called as writer.) 
15342 */ 15343 void 15344 ipif_multicast_up(ipif_t *ipif) 15345 { 15346 int err; 15347 ill_t *ill; 15348 15349 ASSERT(IAM_WRITER_IPIF(ipif)); 15350 15351 ill = ipif->ipif_ill; 15352 15353 ip1dbg(("ipif_multicast_up\n")); 15354 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 15355 return; 15356 15357 if (ipif->ipif_isv6) { 15358 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 15359 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 15360 15361 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 15362 15363 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 15364 return; 15365 15366 ip1dbg(("ipif_multicast_up - addmulti\n")); 15367 15368 /* 15369 * Join the all hosts multicast address. We skip this for 15370 * underlying IPMP interfaces since they should be invisible. 15371 */ 15372 if (!IS_UNDER_IPMP(ill)) { 15373 err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, 15374 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15375 if (err != 0) { 15376 ip0dbg(("ipif_multicast_up: " 15377 "all_hosts_mcast failed %d\n", err)); 15378 return; 15379 } 15380 ipif->ipif_joined_allhosts = 1; 15381 } 15382 15383 /* 15384 * Enable multicast for the solicited node multicast address 15385 */ 15386 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 15387 err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, 15388 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15389 if (err != 0) { 15390 ip0dbg(("ipif_multicast_up: solicited MC" 15391 " failed %d\n", err)); 15392 if (ipif->ipif_joined_allhosts) { 15393 (void) ip_delmulti_v6(&v6allmc, ill, 15394 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15395 ipif->ipif_joined_allhosts = 0; 15396 } 15397 return; 15398 } 15399 } 15400 } else { 15401 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 15402 return; 15403 15404 /* Join the all hosts multicast address */ 15405 ip1dbg(("ipif_multicast_up - addmulti\n")); 15406 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 15407 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15408 if (err) { 15409 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 15410 return; 15411 } 15412 } 15413 ipif->ipif_multicast_up = 1; 15414 } 15415 15416 /* 15417 * Blow away any multicast groups that we joined in ipif_multicast_up(). 15418 * (Explicit memberships are blown away in ill_leave_multicast() when the 15419 * ill is brought down.) 15420 */ 15421 void 15422 ipif_multicast_down(ipif_t *ipif) 15423 { 15424 int err; 15425 15426 ASSERT(IAM_WRITER_IPIF(ipif)); 15427 15428 ip1dbg(("ipif_multicast_down\n")); 15429 if (!ipif->ipif_multicast_up) 15430 return; 15431 15432 ip1dbg(("ipif_multicast_down - delmulti\n")); 15433 15434 if (!ipif->ipif_isv6) { 15435 err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE, 15436 B_TRUE); 15437 if (err != 0) 15438 ip0dbg(("ipif_multicast_down: failed %d\n", err)); 15439 15440 ipif->ipif_multicast_up = 0; 15441 return; 15442 } 15443 15444 /* 15445 * Leave the all-hosts multicast address. 
15446 */ 15447 if (ipif->ipif_joined_allhosts) { 15448 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 15449 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15450 if (err != 0) { 15451 ip0dbg(("ipif_multicast_down: all_hosts_mcast " 15452 "failed %d\n", err)); 15453 } 15454 ipif->ipif_joined_allhosts = 0; 15455 } 15456 15457 /* 15458 * Disable multicast for the solicited node multicast address 15459 */ 15460 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 15461 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 15462 15463 ipv6_multi.s6_addr32[3] |= 15464 ipif->ipif_v6lcl_addr.s6_addr32[3]; 15465 15466 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 15467 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15468 if (err != 0) { 15469 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 15470 err)); 15471 } 15472 } 15473 15474 ipif->ipif_multicast_up = 0; 15475 } 15476 15477 /* 15478 * Used when an interface comes up to recreate any extra routes on this 15479 * interface. 15480 */ 15481 static ire_t ** 15482 ipif_recover_ire(ipif_t *ipif) 15483 { 15484 mblk_t *mp; 15485 ire_t **ipif_saved_irep; 15486 ire_t **irep; 15487 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15488 15489 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 15490 ipif->ipif_id)); 15491 15492 mutex_enter(&ipif->ipif_saved_ire_lock); 15493 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 15494 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 15495 if (ipif_saved_irep == NULL) { 15496 mutex_exit(&ipif->ipif_saved_ire_lock); 15497 return (NULL); 15498 } 15499 15500 irep = ipif_saved_irep; 15501 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 15502 ire_t *ire; 15503 queue_t *rfq; 15504 queue_t *stq; 15505 ifrt_t *ifrt; 15506 uchar_t *src_addr; 15507 uchar_t *gateway_addr; 15508 ushort_t type; 15509 15510 /* 15511 * When the ire was initially created and then added in 15512 * ip_rt_add(), it was created either using ipif->ipif_net_type 15513 * in the case of a traditional interface route, or as one of 15514 * the IRE_OFFSUBNET types (with the exception of the 15515 * IRE_HOST type ire which is created by icmp_redirect() and 15516 * which we don't need to save or recover). In the case where 15517 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 15518 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 15519 * to satisfy software like GateD and Sun Cluster which creates 15520 * routes using the loopback interface's address as a 15521 * gateway. 15522 * 15523 * As ifrt->ifrt_type reflects the already updated ire_type, 15524 * ire_create() will be called in the same way here as 15525 * in ip_rt_add(), namely using ipif->ipif_net_type when 15526 * the route looks like a traditional interface route (where 15527 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 15528 * the saved ifrt->ifrt_type. This means that in the case where 15529 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 15530 * ire_create() will be an IRE_LOOPBACK; it will then be turned 15531 * into an IRE_IF_NORESOLVER and then added by ire_add(). 15532 */ 15533 ifrt = (ifrt_t *)mp->b_rptr; 15534 ASSERT(ifrt->ifrt_type != IRE_CACHE); 15535 if (ifrt->ifrt_type & IRE_INTERFACE) { 15536 rfq = NULL; 15537 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 15538 ? ipif->ipif_rq : ipif->ipif_wq; 15539 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15540 ?
(uint8_t *)&ifrt->ifrt_src_addr 15541 : (uint8_t *)&ipif->ipif_src_addr; 15542 gateway_addr = NULL; 15543 type = ipif->ipif_net_type; 15544 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 15545 /* Recover multiroute broadcast IRE. */ 15546 rfq = ipif->ipif_rq; 15547 stq = ipif->ipif_wq; 15548 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15549 ? (uint8_t *)&ifrt->ifrt_src_addr 15550 : (uint8_t *)&ipif->ipif_src_addr; 15551 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 15552 type = ifrt->ifrt_type; 15553 } else { 15554 rfq = NULL; 15555 stq = NULL; 15556 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15557 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 15558 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 15559 type = ifrt->ifrt_type; 15560 } 15561 15562 /* 15563 * Create a copy of the IRE with the saved address and netmask. 15564 */ 15565 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 15566 "0x%x/0x%x\n", 15567 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 15568 ntohl(ifrt->ifrt_addr), 15569 ntohl(ifrt->ifrt_mask))); 15570 ire = ire_create( 15571 (uint8_t *)&ifrt->ifrt_addr, 15572 (uint8_t *)&ifrt->ifrt_mask, 15573 src_addr, 15574 gateway_addr, 15575 &ifrt->ifrt_max_frag, 15576 NULL, 15577 rfq, 15578 stq, 15579 type, 15580 ipif, 15581 0, 15582 0, 15583 0, 15584 ifrt->ifrt_flags, 15585 &ifrt->ifrt_iulp_info, 15586 NULL, 15587 NULL, 15588 ipst); 15589 15590 if (ire == NULL) { 15591 mutex_exit(&ipif->ipif_saved_ire_lock); 15592 kmem_free(ipif_saved_irep, 15593 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 15594 return (NULL); 15595 } 15596 15597 /* 15598 * Some software (for example, GateD and Sun Cluster) attempts 15599 * to create (what amount to) IRE_PREFIX routes with the 15600 * loopback address as the gateway. This is primarily done to 15601 * set up prefixes with the RTF_REJECT flag set (for example, 15602 * when generating aggregate routes.) 15603 * 15604 * If the IRE type (as defined by ipif->ipif_net_type) is 15605 * IRE_LOOPBACK, then we map the request into an 15606 * IRE_IF_NORESOLVER. 15607 */ 15608 if (ipif->ipif_net_type == IRE_LOOPBACK) 15609 ire->ire_type = IRE_IF_NORESOLVER; 15610 /* 15611 * Held by ire_add; will be refrele'd towards the 15612 * end of ipif_up_done. 15613 */ 15614 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 15615 *irep = ire; 15616 irep++; 15617 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 15618 } 15619 mutex_exit(&ipif->ipif_saved_ire_lock); 15620 return (ipif_saved_irep); 15621 }
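/*
 * Editor's note (illustrative addition, not original source): the
 * saved-route state consumed above is a chain of ifrt_t records, one
 * mblk per route snapshotted by ip_rt_add(), hanging off
 * ipif_saved_ire_mp:
 *
 *	for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
 *		ifrt_t *ifrt = (ifrt_t *)mp->b_rptr;
 *		- recreate an IRE from ifrt_addr/ifrt_mask as above
 *	}
 *
 * Routes added with RTF_SETSRC keep their saved ifrt_src_addr; all
 * others are re-sourced from the current ipif_src_addr.
 */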
15622 15623 /* 15624 * Used to set the netmask and broadcast address to default values when the 15625 * interface is brought up. (Always called as writer.) 15626 */ 15627 static void 15628 ipif_set_default(ipif_t *ipif) 15629 { 15630 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 15631 15632 if (!ipif->ipif_isv6) { 15633 /* 15634 * Interface holds an IPv4 address. Default 15635 * mask is the natural netmask. 15636 */ 15637 if (!ipif->ipif_net_mask) { 15638 ipaddr_t v4mask; 15639 15640 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 15641 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 15642 } 15643 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 15644 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 15645 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 15646 } else { 15647 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 15648 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 15649 } 15650 /* 15651 * NOTE: SunOS 4.X does this even if the broadcast address 15652 * has already been set; thus we do the same here. 15653 */ 15654 if (ipif->ipif_flags & IPIF_BROADCAST) { 15655 ipaddr_t v4addr; 15656 15657 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 15658 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 15659 } 15660 } else { 15661 /* 15662 * Interface holds an IPv6-only address. Default 15663 * mask is all-ones. 15664 */ 15665 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 15666 ipif->ipif_v6net_mask = ipv6_all_ones; 15667 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 15668 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 15669 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 15670 } else { 15671 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 15672 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 15673 } 15674 } 15675 } 15676 15677 /* 15678 * Return 0 if this address can be used as a local address without causing 15679 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 15680 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 15681 * Note that the same IPv6 link-local address is allowed as long as the ills 15682 * are not on the same link. 15683 */ 15684 int 15685 ip_addr_availability_check(ipif_t *new_ipif) 15686 { 15687 in6_addr_t our_v6addr; 15688 ill_t *ill; 15689 ipif_t *ipif; 15690 ill_walk_context_t ctx; 15691 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 15692 15693 ASSERT(IAM_WRITER_IPIF(new_ipif)); 15694 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 15695 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 15696 15697 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 15698 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 15699 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 15700 return (0); 15701 15702 our_v6addr = new_ipif->ipif_v6lcl_addr; 15703 15704 if (new_ipif->ipif_isv6) 15705 ill = ILL_START_WALK_V6(&ctx, ipst); 15706 else 15707 ill = ILL_START_WALK_V4(&ctx, ipst); 15708 15709 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 15710 for (ipif = ill->ill_ipif; ipif != NULL; 15711 ipif = ipif->ipif_next) { 15712 if ((ipif == new_ipif) || 15713 !(ipif->ipif_flags & IPIF_UP) || 15714 (ipif->ipif_flags & IPIF_UNNUMBERED) || 15715 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 15716 &our_v6addr)) 15717 continue; 15718 15719 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 15720 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 15721 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 15722 ipif->ipif_flags |= IPIF_UNNUMBERED; 15723 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 15724 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 15725 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 15726 continue; 15727 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 15728 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 15729 continue; 15730 else if (new_ipif->ipif_ill == ill) 15731 return (EADDRINUSE); 15732 else 15733 return (EADDRNOTAVAIL); 15734 } 15735 } 15736 15737 return (0); 15738 } 15739 15740 /* 15741 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 15742 * IREs for the ipif. 15743 * When the routine returns EINPROGRESS then mp has been consumed and 15744 * the ioctl will be acked from ip_rput_dlpi. 15745 */ 15746 int 15747 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 15748 { 15749 ill_t *ill = ipif->ipif_ill; 15750 boolean_t isv6 = ipif->ipif_isv6; 15751 int err = 0; 15752 boolean_t success; 15753 uint_t ipif_orig_id; 15754 ip_stack_t *ipst = ill->ill_ipst; 15755 15756 ASSERT(IAM_WRITER_IPIF(ipif)); 15757 15758 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 15759 15760 /* Shouldn't get here if it is already up.
*/ 15761 if (ipif->ipif_flags & IPIF_UP) 15762 return (EALREADY); 15763 15764 /* 15765 * If this is a request to bring up a data address on an interface 15766 * under IPMP, then move the address to its IPMP meta-interface and 15767 * try to bring it up. One complication is that the zeroth ipif for 15768 * an ill is special, in that every ill always has one, and that code 15769 * throughout IP dereferences ill->ill_ipif without holding any locks. 15770 */ 15771 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && 15772 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { 15773 ipif_t *stubipif = NULL, *moveipif = NULL; 15774 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 15775 15776 /* 15777 * The ipif being brought up should be quiesced. If it's not, 15778 * something has gone amiss and we need to bail out. (If it's 15779 * quiesced, we know it will remain so via IPIF_CHANGING.) 15780 */ 15781 mutex_enter(&ill->ill_lock); 15782 if (!ipif_is_quiescent(ipif)) { 15783 mutex_exit(&ill->ill_lock); 15784 return (EINVAL); 15785 } 15786 mutex_exit(&ill->ill_lock); 15787 15788 /* 15789 * If we're going to need to allocate ipifs, do it prior 15790 * to starting the move (and grabbing locks). 15791 */ 15792 if (ipif->ipif_id == 0) { 15793 moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 15794 B_FALSE); 15795 stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 15796 B_FALSE); 15797 if (moveipif == NULL || stubipif == NULL) { 15798 mi_free(moveipif); 15799 mi_free(stubipif); 15800 return (ENOMEM); 15801 } 15802 } 15803 15804 /* 15805 * Grab or transfer the ipif to move. During the move, keep 15806 * ill_g_lock held to prevent any ill walker threads from 15807 * seeing things in an inconsistent state. 15808 */ 15809 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15810 if (ipif->ipif_id != 0) { 15811 ipif_remove(ipif); 15812 } else { 15813 ipif_transfer(ipif, moveipif, stubipif); 15814 ipif = moveipif; 15815 } 15816 15817 /* 15818 * Place the ipif on the IPMP ill. If the zeroth ipif on 15819 * the IPMP ill is a stub (0.0.0.0 down address) then we 15820 * replace that one. Otherwise, pick the next available slot. 15821 */ 15822 ipif->ipif_ill = ipmp_ill; 15823 ipif_orig_id = ipif->ipif_id; 15824 15825 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { 15826 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); 15827 ipif = ipmp_ill->ill_ipif; 15828 } else { 15829 ipif->ipif_id = -1; 15830 if (ipif_insert(ipif, B_FALSE) != 0) { 15831 /* 15832 * No more available ipif_id's -- put it back 15833 * on the original ill and fail the operation. 15834 * Since we're writer on the ill, we can be 15835 * sure our old slot is still available. 15836 */ 15837 ipif->ipif_id = ipif_orig_id; 15838 ipif->ipif_ill = ill; 15839 if (ipif_orig_id == 0) { 15840 ipif_transfer(ipif, ill->ill_ipif, 15841 NULL); 15842 } else { 15843 VERIFY(ipif_insert(ipif, B_FALSE) == 0); 15844 } 15845 rw_exit(&ipst->ips_ill_g_lock); 15846 return (ENOMEM); 15847 } 15848 } 15849 rw_exit(&ipst->ips_ill_g_lock); 15850 15851 /* 15852 * Tell SCTP that the ipif has moved. Note that even if we 15853 * had to allocate a new ipif, the original sequence id was 15854 * preserved and therefore SCTP won't know. 15855 */ 15856 sctp_move_ipif(ipif, ill, ipmp_ill); 15857 15858 /* 15859 * If the ipif being brought up was on slot zero, then we 15860 * first need to bring up the placeholder we stuck there.
In 15861 * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call 15862 * to ipif_up() itself, if we successfully bring up the 15863 * placeholder, we'll check ill_move_ipif and bring it up too. 15864 */ 15865 if (ipif_orig_id == 0) { 15866 ASSERT(ill->ill_move_ipif == NULL); 15867 ill->ill_move_ipif = ipif; 15868 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) 15869 ASSERT(ill->ill_move_ipif == NULL); 15870 if (err != EINPROGRESS) 15871 ill->ill_move_ipif = NULL; 15872 return (err); 15873 } 15874 15875 /* 15876 * Bring it up on the IPMP ill. 15877 */ 15878 return (ipif_up(ipif, q, mp)); 15879 } 15880 15881 /* Skip arp/ndp for any loopback interface. */ 15882 if (ill->ill_wq != NULL) { 15883 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 15884 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 15885 15886 if (!ill->ill_dl_up) { 15887 /* 15888 * ill_dl_up is not yet set, i.e. we are yet to 15889 * DL_BIND with the driver and this is the first 15890 * logical interface on the ill to become "up". 15891 * Tell the driver to get going (via DL_BIND_REQ). 15892 * Note that changing "significant" IFF_ flags 15893 * (address/netmask etc.) causes a down/up dance, but 15894 * does not cause an unbind (DL_UNBIND) with the driver. 15895 */ 15896 return (ill_dl_up(ill, ipif, mp, q)); 15897 } 15898 15899 /* 15900 * ipif_resolver_up may end up sending an 15901 * AR_INTERFACE_UP message to ARP, which would, in 15902 * turn, send a DLPI message to the driver. ioctls are 15903 * serialized and so we cannot send more than one 15904 * interface up message at a time. If ipif_resolver_up 15905 * does send an interface up message to ARP, we get 15906 * EINPROGRESS and we will complete in ip_arp_done. 15907 */ 15908 15909 ASSERT(connp != NULL || !CONN_Q(q)); 15910 if (connp != NULL) 15911 mutex_enter(&connp->conn_lock); 15912 mutex_enter(&ill->ill_lock); 15913 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 15914 mutex_exit(&ill->ill_lock); 15915 if (connp != NULL) 15916 mutex_exit(&connp->conn_lock); 15917 if (!success) 15918 return (EINTR); 15919 15920 /* 15921 * Crank up the resolver. For IPv6, this cranks up the 15922 * external resolver if one is configured, but even if an 15923 * external resolver isn't configured, it must be called to 15924 * reset DAD state. For IPv6, if an external resolver is not 15925 * being used, ipif_resolver_up() will never return 15926 * EINPROGRESS, so we can always call ipif_ndp_up() here. 15927 * Note that if an external resolver is being used, there's no 15928 * need to call ipif_ndp_up() since it will do nothing. 15929 */ 15930 err = ipif_resolver_up(ipif, Res_act_initial); 15931 if (err == EINPROGRESS) { 15932 /* We will complete it in ip_arp_done() */ 15933 return (err); 15934 } 15935 15936 if (isv6 && err == 0) 15937 err = ipif_ndp_up(ipif, B_TRUE); 15938 15939 ASSERT(err != EINPROGRESS); 15940 mp = ipsq_pending_mp_get(ipsq, &connp); 15941 ASSERT(mp != NULL); 15942 if (err != 0) 15943 return (err); 15944 } else { 15945 /* 15946 * Interfaces without underlying hardware don't do duplicate 15947 * address detection. 15948 */ 15949 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 15950 ipif->ipif_addr_ready = 1; 15951 } 15952 15953 err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif); 15954 if (err == 0 && ill->ill_move_ipif != NULL) { 15955 ipif = ill->ill_move_ipif; 15956 ill->ill_move_ipif = NULL; 15957 return (ipif_up(ipif, q, mp)); 15958 } 15959 return (err); 15960 } 15961 15962 /* 15963 * Perform a bind for the physical device.
15964 * When the routine returns EINPROGRESS then mp has been consumed and 15965 * the ioctl will be acked from ip_rput_dlpi. 15966 * Allocate an unbind message and save it until ipif_down. 15967 */ 15968 static int 15969 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 15970 { 15971 areq_t *areq; 15972 mblk_t *areq_mp = NULL; 15973 mblk_t *bind_mp = NULL; 15974 mblk_t *unbind_mp = NULL; 15975 conn_t *connp; 15976 boolean_t success; 15977 uint16_t sap_addr; 15978 15979 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 15980 ASSERT(IAM_WRITER_ILL(ill)); 15981 ASSERT(mp != NULL); 15982 15983 /* Create a resolver cookie for ARP */ 15984 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 15985 areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); 15986 if (areq_mp == NULL) 15987 return (ENOMEM); 15988 15989 freemsg(ill->ill_resolver_mp); 15990 ill->ill_resolver_mp = areq_mp; 15991 areq = (areq_t *)areq_mp->b_rptr; 15992 sap_addr = ill->ill_sap; 15993 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 15994 } 15995 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 15996 DL_BIND_REQ); 15997 if (bind_mp == NULL) 15998 goto bad; 15999 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 16000 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 16001 16002 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 16003 if (unbind_mp == NULL) 16004 goto bad; 16005 16006 /* 16007 * Record state needed to complete this operation when the 16008 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 16009 */ 16010 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 16011 ASSERT(connp != NULL || !CONN_Q(q)); 16012 GRAB_CONN_LOCK(q); 16013 mutex_enter(&ipif->ipif_ill->ill_lock); 16014 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 16015 mutex_exit(&ipif->ipif_ill->ill_lock); 16016 RELEASE_CONN_LOCK(q); 16017 if (!success) 16018 goto bad; 16019 16020 /* 16021 * Save the unbind message for ill_dl_down(); it will be consumed when 16022 * the interface goes down. 16023 */ 16024 ASSERT(ill->ill_unbind_mp == NULL); 16025 ill->ill_unbind_mp = unbind_mp; 16026 16027 ill_dlpi_send(ill, bind_mp); 16028 /* Send down link-layer capabilities probe if not already done. */ 16029 ill_capability_probe(ill); 16030 16031 /* 16032 * Sysid used to rely on the fact that netboots set domainname 16033 * and the like. Now that miniroot boots aren't strictly netboots 16034 * and miniroot network configuration is driven from userland 16035 * these things still need to be set. This situation can be detected 16036 * by comparing the interface being configured here to the one 16037 * dhcifname was set to reference by the boot loader. Once sysid is 16038 * converted to use dhcp_ipc_getinfo() this call can go away. 16039 */ 16040 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 16041 (strcmp(ill->ill_name, dhcifname) == 0) && 16042 (strlen(srpc_domain) == 0)) { 16043 if (dhcpinit() != 0) 16044 cmn_err(CE_WARN, "no cached dhcp response"); 16045 } 16046 16047 /* 16048 * This operation will complete in ip_rput_dlpi with either 16049 * a DL_BIND_ACK or DL_ERROR_ACK. 16050 */ 16051 return (EINPROGRESS); 16052 bad: 16053 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 16054 16055 freemsg(bind_mp); 16056 freemsg(unbind_mp); 16057 return (ENOMEM); 16058 }
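/*
 * Editor's note (illustrative addition, not original source): the
 * DLPI bind life-cycle set up by ill_dl_up() above, in sequence:
 *
 *	ill_dl_up()   --- DL_BIND_REQ --->   driver
 *	driver        --- DL_BIND_ACK --->   ip_rput_dlpi()
 *	ill_dl_down() --- DL_UNBIND_REQ ->   driver
 *
 * ip_rput_dlpi() retrieves the mp parked in ipsq_pending_mp and acks
 * the original ioctl, while the pre-allocated DL_UNBIND_REQ waits in
 * ill_unbind_mp until the last logical interface goes down.
 */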
16059 16060 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 16061 16062 /* 16063 * DLPI and ARP are up. 16064 * Create all the IREs associated with an interface, bring up multicast, 16065 * set the interface flag, and finish other initialization 16066 * that potentially had to be deferred until after the DL_BIND_ACK. 16067 */ 16068 int 16069 ipif_up_done(ipif_t *ipif) 16070 { 16071 ire_t *ire_array[20]; 16072 ire_t **irep = ire_array; 16073 ire_t **irep1; 16074 ipaddr_t net_mask = 0; 16075 ipaddr_t subnet_mask, route_mask; 16076 ill_t *ill = ipif->ipif_ill; 16077 queue_t *stq; 16078 ipif_t *src_ipif; 16079 ipif_t *tmp_ipif; 16080 boolean_t flush_ire_cache = B_TRUE; 16081 int err = 0; 16082 ire_t **ipif_saved_irep = NULL; 16083 int ipif_saved_ire_cnt; 16084 int cnt; 16085 boolean_t src_ipif_held = B_FALSE; 16086 boolean_t loopback = B_FALSE; 16087 ip_stack_t *ipst = ill->ill_ipst; 16088 16089 ip1dbg(("ipif_up_done(%s:%u)\n", 16090 ipif->ipif_ill->ill_name, ipif->ipif_id)); 16091 /* Check if this is a loopback interface */ 16092 if (ipif->ipif_ill->ill_wq == NULL) 16093 loopback = B_TRUE; 16094 16095 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 16096 /* 16097 * If all other interfaces for this ill are down or DEPRECATED, 16098 * or otherwise unsuitable for source address selection, remove 16099 * any IRE_CACHE entries for this ill to make sure source 16100 * address selection gets to take this new ipif into account. 16101 * No need to hold ill_lock while traversing the ipif list since 16102 * we are writer. 16103 */ 16104 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 16105 tmp_ipif = tmp_ipif->ipif_next) { 16106 if (((tmp_ipif->ipif_flags & 16107 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 16108 !(tmp_ipif->ipif_flags & IPIF_UP)) || 16109 (tmp_ipif == ipif)) 16110 continue; 16111 /* first usable pre-existing interface */ 16112 flush_ire_cache = B_FALSE; 16113 break; 16114 } 16115 if (flush_ire_cache) 16116 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 16117 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 16118 16119 /* 16120 * Figure out which way the send-to queue should go. Only 16121 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 16122 * should show up here. 16123 */ 16124 switch (ill->ill_net_type) { 16125 case IRE_IF_RESOLVER: 16126 stq = ill->ill_rq; 16127 break; 16128 case IRE_IF_NORESOLVER: 16129 case IRE_LOOPBACK: 16130 stq = ill->ill_wq; 16131 break; 16132 default: 16133 return (EINVAL); 16134 } 16135 16136 if (IS_LOOPBACK(ill)) { 16137 /* 16138 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 16139 * ipif_lookup_on_name(), but in the case of zones we can have 16140 * several loopback addresses on lo0. So all the interfaces with 16141 * loopback addresses need to be marked IRE_LOOPBACK. 16142 */ 16143 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 16144 htonl(INADDR_LOOPBACK)) 16145 ipif->ipif_ire_type = IRE_LOOPBACK; 16146 else 16147 ipif->ipif_ire_type = IRE_LOCAL; 16148 } 16149 16150 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || 16151 ((ipif->ipif_flags & IPIF_DEPRECATED) && 16152 !(ipif->ipif_flags & IPIF_NOFAILOVER))) { 16153 /* 16154 * Can't use our source address.
Select a different 16155 * source address for the IRE_INTERFACE and IRE_LOCAL 16156 */ 16157 src_ipif = ipif_select_source(ipif->ipif_ill, 16158 ipif->ipif_subnet, ipif->ipif_zoneid); 16159 if (src_ipif == NULL) 16160 src_ipif = ipif; /* Last resort */ 16161 else 16162 src_ipif_held = B_TRUE; 16163 } else { 16164 src_ipif = ipif; 16165 } 16166 16167 /* Create all the IREs associated with this interface */ 16168 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 16169 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 16170 16171 /* 16172 * If we're on a labeled system then make sure that zone- 16173 * private addresses have proper remote host database entries. 16174 */ 16175 if (is_system_labeled() && 16176 ipif->ipif_ire_type != IRE_LOOPBACK && 16177 !tsol_check_interface_address(ipif)) 16178 return (EINVAL); 16179 16180 /* Register the source address for __sin6_src_id */ 16181 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 16182 ipif->ipif_zoneid, ipst); 16183 if (err != 0) { 16184 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 16185 return (err); 16186 } 16187 16188 /* If the interface address is set, create the local IRE. */ 16189 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 16190 (void *)ipif, 16191 ipif->ipif_ire_type, 16192 ntohl(ipif->ipif_lcl_addr))); 16193 *irep++ = ire_create( 16194 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 16195 (uchar_t *)&ip_g_all_ones, /* mask */ 16196 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 16197 NULL, /* no gateway */ 16198 &ip_loopback_mtuplus, /* max frag size */ 16199 NULL, 16200 ipif->ipif_rq, /* recv-from queue */ 16201 NULL, /* no send-to queue */ 16202 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 16203 ipif, 16204 0, 16205 0, 16206 0, 16207 (ipif->ipif_flags & IPIF_PRIVATE) ? 16208 RTF_PRIVATE : 0, 16209 &ire_uinfo_null, 16210 NULL, 16211 NULL, 16212 ipst); 16213 } else { 16214 ip1dbg(( 16215 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 16216 ipif->ipif_ire_type, 16217 ntohl(ipif->ipif_lcl_addr), 16218 (uint_t)ipif->ipif_flags)); 16219 } 16220 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 16221 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 16222 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 16223 } else { 16224 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 16225 } 16226 16227 subnet_mask = ipif->ipif_net_mask; 16228 16229 /* 16230 * If mask was not specified, use natural netmask of 16231 * interface address. Also, store this mask back into the 16232 * ipif struct. 16233 */ 16234 if (subnet_mask == 0) { 16235 subnet_mask = net_mask; 16236 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 16237 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 16238 ipif->ipif_v6subnet); 16239 } 16240 16241 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
*/ 16242 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 16243 ipif->ipif_subnet != INADDR_ANY) { 16244 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 16245 16246 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 16247 route_mask = IP_HOST_MASK; 16248 } else { 16249 route_mask = subnet_mask; 16250 } 16251 16252 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 16253 "creating if IRE ill_net_type 0x%x for 0x%x\n", 16254 (void *)ipif, (void *)ill, 16255 ill->ill_net_type, 16256 ntohl(ipif->ipif_subnet))); 16257 *irep++ = ire_create( 16258 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 16259 (uchar_t *)&route_mask, /* mask */ 16260 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 16261 NULL, /* no gateway */ 16262 &ipif->ipif_mtu, /* max frag */ 16263 NULL, 16264 NULL, /* no recv queue */ 16265 stq, /* send-to queue */ 16266 ill->ill_net_type, /* IF_[NO]RESOLVER */ 16267 ipif, 16268 0, 16269 0, 16270 0, 16271 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 16272 &ire_uinfo_null, 16273 NULL, 16274 NULL, 16275 ipst); 16276 } 16277 16278 /* 16279 * Create any necessary broadcast IREs. 16280 */ 16281 if (ipif->ipif_flags & IPIF_BROADCAST) 16282 irep = ipif_create_bcast_ires(ipif, irep); 16283 16284 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 16285 16286 /* If an earlier ire_create failed, get out now */ 16287 for (irep1 = irep; irep1 > ire_array; ) { 16288 irep1--; 16289 if (*irep1 == NULL) { 16290 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 16291 err = ENOMEM; 16292 goto bad; 16293 } 16294 } 16295 16296 /* 16297 * Need to atomically check for IP address availability under 16298 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 16299 * ills or new ipifs can be added while we are checking availability. 16300 */ 16301 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16302 mutex_enter(&ipst->ips_ip_addr_avail_lock); 16303 /* Mark it up, and increment counters. */ 16304 ipif->ipif_flags |= IPIF_UP; 16305 ill->ill_ipif_up_count++; 16306 err = ip_addr_availability_check(ipif); 16307 mutex_exit(&ipst->ips_ip_addr_avail_lock); 16308 rw_exit(&ipst->ips_ill_g_lock); 16309 16310 if (err != 0) { 16311 /* 16312 * Our address may already be up on the same ill. In this case, 16313 * the ARP entry for our ipif replaced the one for the other 16314 * ipif. So we don't want to delete it (otherwise the other ipif 16315 * would be unable to send packets). 16316 * ip_addr_availability_check() identifies this case for us and 16317 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 16318 * which is the expected error code. 16319 */ 16320 if (err == EADDRINUSE) { 16321 freemsg(ipif->ipif_arp_del_mp); 16322 ipif->ipif_arp_del_mp = NULL; 16323 err = EADDRNOTAVAIL; 16324 } 16325 ill->ill_ipif_up_count--; 16326 ipif->ipif_flags &= ~IPIF_UP; 16327 goto bad; 16328 } 16329 16330 /* 16331 * Add in all newly created IREs. ire_create_bcast() has 16332 * already checked for duplicates of the IRE_BROADCAST type. 16333 */ 16334 for (irep1 = irep; irep1 > ire_array; ) { 16335 irep1--; 16336 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 16337 /* 16338 * refheld by ire_add. 
Refrele'd towards the end of the function. 16339 */ 16340 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 16341 } 16342 16343 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 16344 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 16345 ipif_saved_irep = ipif_recover_ire(ipif); 16346 16347 if (!loopback) { 16348 /* 16349 * If the broadcast address has been set, make sure it makes 16350 * sense based on the interface address. 16351 * Only match on ill since we are sharing broadcast addresses. 16352 */ 16353 if ((ipif->ipif_brd_addr != INADDR_ANY) && 16354 (ipif->ipif_flags & IPIF_BROADCAST)) { 16355 ire_t *ire; 16356 16357 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 16358 IRE_BROADCAST, ipif, ALL_ZONES, 16359 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); 16360 16361 if (ire == NULL) { 16362 /* 16363 * If there isn't a matching broadcast IRE, 16364 * revert to the default for this netmask. 16365 */ 16366 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16367 mutex_enter(&ipif->ipif_ill->ill_lock); 16368 ipif_set_default(ipif); 16369 mutex_exit(&ipif->ipif_ill->ill_lock); 16370 } else { 16371 ire_refrele(ire); 16372 } 16373 } 16374 16375 } 16376 16377 if (ill->ill_need_recover_multicast) { 16378 /* 16379 * Need to recover all multicast memberships in the driver. 16380 * This had to be deferred until we had attached. The same 16381 * code exists in ipif_up_done_v6() to recover IPv6 16382 * memberships. 16383 * 16384 * Note that it would be preferable to unconditionally do the 16385 * ill_recover_multicast() in ill_dl_up(), but we cannot do 16386 * that since ill_join_allmulti() depends on ill_dl_up being 16387 * set, and it is not set until we receive a DL_BIND_ACK after 16388 * having called ill_dl_up(). 16389 */ 16390 ill_recover_multicast(ill); 16391 } 16392 16393 if (ill->ill_ipif_up_count == 1) { 16394 /* 16395 * Since the interface is now up, it may now be active. 16396 */ 16397 if (IS_UNDER_IPMP(ill)) 16398 ipmp_ill_refresh_active(ill); 16399 16400 /* 16401 * If this is an IPMP interface, we may now be able to 16402 * establish ARP entries. 16403 */ 16404 if (IS_IPMP(ill)) 16405 ipmp_illgrp_refresh_arpent(ill->ill_grp); 16406 } 16407 16408 /* Join the allhosts multicast address */ 16409 ipif_multicast_up(ipif); 16410 16411 /* 16412 * See if anybody else would benefit from our new ipif. 16413 */ 16414 if (!loopback && 16415 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 16416 ill_update_source_selection(ill); 16417 } 16418 16419 for (irep1 = irep; irep1 > ire_array; ) { 16420 irep1--; 16421 if (*irep1 != NULL) { 16422 /* was held in ire_add */ 16423 ire_refrele(*irep1); 16424 } 16425 } 16426 16427 cnt = ipif_saved_ire_cnt; 16428 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 16429 if (*irep1 != NULL) { 16430 /* was held in ire_add */ 16431 ire_refrele(*irep1); 16432 } 16433 } 16434 16435 if (!loopback && ipif->ipif_addr_ready) { 16436 /* Broadcast an address mask reply. */ 16437 ipif_mask_reply(ipif); 16438 } 16439 if (ipif_saved_irep != NULL) { 16440 kmem_free(ipif_saved_irep, 16441 ipif_saved_ire_cnt * sizeof (ire_t *)); 16442 } 16443 if (src_ipif_held) 16444 ipif_refrele(src_ipif); 16445 16446 /* 16447 * This had to be deferred until we had bound. Tell routing sockets and 16448 * others that this interface is up if it looks like the address has 16449 * been validated. Otherwise, if it isn't ready yet, wait for 16450 * duplicate address detection to do its thing.
16451 */ 16452 if (ipif->ipif_addr_ready) 16453 ipif_up_notify(ipif); 16454 return (0); 16455 16456 bad: 16457 ip1dbg(("ipif_up_done: FAILED \n")); 16458 16459 while (irep > ire_array) { 16460 irep--; 16461 if (*irep != NULL) 16462 ire_delete(*irep); 16463 } 16464 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 16465 16466 if (ipif_saved_irep != NULL) { 16467 kmem_free(ipif_saved_irep, 16468 ipif_saved_ire_cnt * sizeof (ire_t *)); 16469 } 16470 if (src_ipif_held) 16471 ipif_refrele(src_ipif); 16472 16473 ipif_resolver_down(ipif); 16474 return (err); 16475 } 16476 16477 /* 16478 * Turn off ARP, as requested by the ILLF_NOARP flag. 16479 */ 16480 static int 16481 ill_arp_off(ill_t *ill) 16482 { 16483 mblk_t *arp_off_mp = NULL; 16484 mblk_t *arp_on_mp = NULL; 16485 16486 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 16487 16488 ASSERT(IAM_WRITER_ILL(ill)); 16489 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 16490 16491 /* 16492 * If the on message is still around we've already done 16493 * an arp_off without doing an arp_on, thus there is no 16494 * work needed. 16495 */ 16496 if (ill->ill_arp_on_mp != NULL) 16497 return (0); 16498 16499 /* 16500 * Allocate an ARP on message (to be saved) and an ARP off message. 16501 */ 16502 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 16503 if (!arp_off_mp) 16504 return (ENOMEM); 16505 16506 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 16507 if (!arp_on_mp) 16508 goto failed; 16509 16510 ASSERT(ill->ill_arp_on_mp == NULL); 16511 ill->ill_arp_on_mp = arp_on_mp; 16512 16513 /* Send an AR_INTERFACE_OFF request */ 16514 putnext(ill->ill_rq, arp_off_mp); 16515 return (0); 16516 failed: 16517 16518 if (arp_off_mp) 16519 freemsg(arp_off_mp); 16520 return (ENOMEM); 16521 } 16522 16523 /* 16524 * Turn ARP back on when the ILLF_NOARP flag is cleared. 16525 */ 16526 static int 16527 ill_arp_on(ill_t *ill) 16528 { 16529 mblk_t *mp; 16530 16531 ip1dbg(("ill_arp_on(%s)\n", ill->ill_name)); 16532 16533 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 16534 16535 ASSERT(IAM_WRITER_ILL(ill)); 16536 /* 16537 * Send an AR_INTERFACE_ON request if we have already done 16538 * an arp_off (which allocated the message). 16539 */ 16540 if (ill->ill_arp_on_mp != NULL) { 16541 mp = ill->ill_arp_on_mp; 16542 ill->ill_arp_on_mp = NULL; 16543 putnext(ill->ill_rq, mp); 16544 } 16545 return (0); 16546 }
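/*
 * Editor's note (illustrative addition, not original source):
 * ill_arp_off() and ill_arp_on() form a strict pair. The off path
 * pre-allocates the AR_INTERFACE_ON message and parks it in
 * ill_arp_on_mp, so that turning ARP back on cannot fail for want of
 * memory:
 *
 *	(void) ill_arp_off(ill);	- ARP stops serving the ill
 *	...
 *	(void) ill_arp_on(ill);		- replays the saved on-message
 *
 * A second ill_arp_off() while ill_arp_on_mp is still set is a no-op,
 * as the check at the top of ill_arp_off() shows.
 */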
16552 */ 16553 boolean_t 16554 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 16555 { 16556 uint_t ifindex; 16557 ipif_t *ipif = NULL; 16558 ill_t *uill; 16559 boolean_t isv6; 16560 ip_stack_t *ipst = ill->ill_ipst; 16561 16562 ASSERT(ill != NULL); 16563 16564 isv6 = ill->ill_isv6; 16565 ifindex = ill->ill_usesrc_ifindex; 16566 if (ifindex != 0) { 16567 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 16568 NULL, ipst); 16569 if (uill == NULL) 16570 return (B_FALSE); 16571 mutex_enter(&uill->ill_lock); 16572 for (ipif = uill->ill_ipif; ipif != NULL; 16573 ipif = ipif->ipif_next) { 16574 if (!IPIF_CAN_LOOKUP(ipif)) 16575 continue; 16576 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 16577 continue; 16578 if (!(ipif->ipif_flags & IPIF_UP)) 16579 continue; 16580 if (ipif->ipif_zoneid != zoneid) 16581 continue; 16582 if ((isv6 && 16583 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 16584 (ipif->ipif_lcl_addr == INADDR_ANY)) 16585 continue; 16586 mutex_exit(&uill->ill_lock); 16587 ill_refrele(uill); 16588 return (B_TRUE); 16589 } 16590 mutex_exit(&uill->ill_lock); 16591 ill_refrele(uill); 16592 } 16593 return (B_FALSE); 16594 } 16595 16596 /* 16597 * IP source address type, sorted from worst to best. For a given type, 16598 * always prefer IP addresses on the same subnet. All-zones addresses are 16599 * suboptimal because they pose problems with unlabeled destinations. 16600 */ 16601 typedef enum { 16602 IPIF_NONE, 16603 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 16604 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 16605 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 16606 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 16607 IPIF_DIFFNET, /* normal and different subnet */ 16608 IPIF_SAMENET /* normal and same subnet */ 16609 } ipif_type_t; 16610 16611 /* 16612 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 16613 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 16614 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 16615 * the first one, unless IPMP is used in which case we round-robin among them; 16616 * see below for more. 16617 * 16618 * Returns NULL if there is no suitable source address for the ill, 16619 * which occurs only when the ill has no valid source address at all. 16620 */ 16621 ipif_t * 16622 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 16623 { 16624 ill_t *usill = NULL; 16625 ill_t *ipmp_ill = NULL; 16626 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 16627 ipif_type_t type, best_type; 16628 tsol_tpc_t *src_rhtp, *dst_rhtp; 16629 ip_stack_t *ipst = ill->ill_ipst; 16630 boolean_t samenet; 16631 16632 if (ill->ill_usesrc_ifindex != 0) { 16633 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 16634 B_FALSE, NULL, NULL, NULL, NULL, ipst); 16635 if (usill != NULL) 16636 ill = usill; /* Select source from usesrc ILL */ 16637 else 16638 return (NULL); 16639 } 16640 16641 /* 16642 * Test addresses should never be used for source address selection, 16643 * so if we were passed one, switch to the IPMP meta-interface. 16644 */ 16645 if (IS_UNDER_IPMP(ill)) { 16646 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 16647 ill = ipmp_ill; /* Select source from IPMP ill */ 16648 else 16649 return (NULL); 16650 } 16651 16652 /* 16653 * If we're dealing with an unlabeled destination on a labeled system, 16654 * make sure that we ignore source addresses that are incompatible with 16655 * the destination's default label.
That destination's default label 16656 * must dominate the minimum label on the source address. 16657 */ 16658 dst_rhtp = NULL; 16659 if (is_system_labeled()) { 16660 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 16661 if (dst_rhtp == NULL) 16662 return (NULL); 16663 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 16664 TPC_RELE(dst_rhtp); 16665 dst_rhtp = NULL; 16666 } 16667 } 16668 16669 /* 16670 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 16671 * can be deleted. But an ipif/ill can get CONDEMNED any time. 16672 * After selecting the right ipif, under ill_lock make sure ipif is 16673 * not condemned, and increment refcnt. If ipif is CONDEMNED, 16674 * we retry. Inside the loop we still need to check for CONDEMNED, 16675 * but not under a lock. 16676 */ 16677 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16678 retry: 16679 /* 16680 * For source address selection, we treat the ipif list as circular 16681 * and continue until we get back to where we started. This allows 16682 * IPMP to vary source address selection (which improves inbound load 16683 * spreading) by caching its last ending point and starting from 16684 * there. NOTE: we don't have to worry about ill_src_ipif changing 16685 * ills since that can't happen on the IPMP ill. 16686 */ 16687 start_ipif = ill->ill_ipif; 16688 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 16689 start_ipif = ill->ill_src_ipif; 16690 16691 ipif = start_ipif; 16692 best_ipif = NULL; 16693 best_type = IPIF_NONE; 16694 do { 16695 if ((next_ipif = ipif->ipif_next) == NULL) 16696 next_ipif = ill->ill_ipif; 16697 16698 if (!IPIF_CAN_LOOKUP(ipif)) 16699 continue; 16700 /* Always skip NOLOCAL and ANYCAST interfaces */ 16701 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 16702 continue; 16703 if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) 16704 continue; 16705 if (ipif->ipif_zoneid != zoneid && 16706 ipif->ipif_zoneid != ALL_ZONES) 16707 continue; 16708 16709 /* 16710 * Interfaces with 0.0.0.0 address are allowed to be UP, but 16711 * are not valid as source addresses. 16712 */ 16713 if (ipif->ipif_lcl_addr == INADDR_ANY) 16714 continue; 16715 16716 /* 16717 * Check compatibility of local address for destination's 16718 * default label if we're on a labeled system. Incompatible 16719 * addresses can't be used at all. 16720 */ 16721 if (dst_rhtp != NULL) { 16722 boolean_t incompat; 16723 16724 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 16725 IPV4_VERSION, B_FALSE); 16726 if (src_rhtp == NULL) 16727 continue; 16728 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || 16729 src_rhtp->tpc_tp.tp_doi != 16730 dst_rhtp->tpc_tp.tp_doi || 16731 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 16732 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 16733 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 16734 src_rhtp->tpc_tp.tp_sl_set_cipso)); 16735 TPC_RELE(src_rhtp); 16736 if (incompat) 16737 continue; 16738 } 16739 16740 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); 16741 16742 if (ipif->ipif_flags & IPIF_DEPRECATED) { 16743 type = samenet ? IPIF_SAMENET_DEPRECATED : 16744 IPIF_DIFFNET_DEPRECATED; 16745 } else if (ipif->ipif_zoneid == ALL_ZONES) { 16746 type = samenet ? IPIF_SAMENET_ALLZONES : 16747 IPIF_DIFFNET_ALLZONES; 16748 } else { 16749 type = samenet ? 
IPIF_SAMENET : IPIF_DIFFNET; 16750 } 16751 16752 if (type > best_type) { 16753 best_type = type; 16754 best_ipif = ipif; 16755 if (best_type == IPIF_SAMENET) 16756 break; /* can't get better */ 16757 } 16758 } while ((ipif = next_ipif) != start_ipif); 16759 16760 if ((ipif = best_ipif) != NULL) { 16761 mutex_enter(&ipif->ipif_ill->ill_lock); 16762 if (!IPIF_CAN_LOOKUP(ipif)) { 16763 mutex_exit(&ipif->ipif_ill->ill_lock); 16764 goto retry; 16765 } 16766 ipif_refhold_locked(ipif); 16767 16768 /* 16769 * For IPMP, update the source ipif rotor to the next ipif, 16770 * provided we can look it up. (We must not use it if it's 16771 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after 16772 * ipif_free() checked ill_src_ipif.) 16773 */ 16774 if (IS_IPMP(ill) && ipif != NULL) { 16775 next_ipif = ipif->ipif_next; 16776 if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) 16777 ill->ill_src_ipif = next_ipif; 16778 else 16779 ill->ill_src_ipif = NULL; 16780 } 16781 mutex_exit(&ipif->ipif_ill->ill_lock); 16782 } 16783 16784 rw_exit(&ipst->ips_ill_g_lock); 16785 if (usill != NULL) 16786 ill_refrele(usill); 16787 if (ipmp_ill != NULL) 16788 ill_refrele(ipmp_ill); 16789 if (dst_rhtp != NULL) 16790 TPC_RELE(dst_rhtp); 16791 16792 #ifdef DEBUG 16793 if (ipif == NULL) { 16794 char buf1[INET6_ADDRSTRLEN]; 16795 16796 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 16797 ill->ill_name, 16798 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 16799 } else { 16800 char buf1[INET6_ADDRSTRLEN]; 16801 char buf2[INET6_ADDRSTRLEN]; 16802 16803 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 16804 ipif->ipif_ill->ill_name, 16805 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 16806 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 16807 buf2, sizeof (buf2)))); 16808 } 16809 #endif /* DEBUG */ 16810 return (ipif); 16811 } 16812 16813 /* 16814 * If old_ipif is not NULL, see if ipif was derived from old 16815 * ipif and if so, recreate the interface route by re-doing 16816 * source address selection. This happens when ipif_down -> 16817 * ipif_update_other_ipifs calls us. 16818 * 16819 * If old_ipif is NULL, just redo the source address selection 16820 * if needed. This happens when ipif_up_done calls us. 16821 */ 16822 static void 16823 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 16824 { 16825 ire_t *ire; 16826 ire_t *ipif_ire; 16827 queue_t *stq; 16828 ipif_t *nipif; 16829 ill_t *ill; 16830 boolean_t need_rele = B_FALSE; 16831 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16832 16833 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 16834 ASSERT(IAM_WRITER_IPIF(ipif)); 16835 16836 ill = ipif->ipif_ill; 16837 if (!(ipif->ipif_flags & 16838 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 16839 /* 16840 * Can't possibly have borrowed the source 16841 * from old_ipif. 16842 */ 16843 return; 16844 } 16845 16846 /* 16847 * Is there any work to be done? No work if the address 16848 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 16849 * ipif_select_source() does not borrow addresses from 16850 * NOLOCAL and ANYCAST interfaces). 16851 */ 16852 if ((old_ipif != NULL) && 16853 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 16854 (old_ipif->ipif_ill->ill_wq == NULL) || 16855 (old_ipif->ipif_flags & 16856 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 16857 return; 16858 } 16859 16860 /* 16861 * Perform the same checks as when creating the 16862 * IRE_INTERFACE in ipif_up_done. 
16863 */ 16864 if (!(ipif->ipif_flags & IPIF_UP)) 16865 return; 16866 16867 if ((ipif->ipif_flags & IPIF_NOXMIT) || 16868 (ipif->ipif_subnet == INADDR_ANY)) 16869 return; 16870 16871 ipif_ire = ipif_to_ire(ipif); 16872 if (ipif_ire == NULL) 16873 return; 16874 16875 /* 16876 * We know that ipif uses some other source for its 16877 * IRE_INTERFACE. Is it using the source of this 16878 * old_ipif? 16879 */ 16880 if (old_ipif != NULL && 16881 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 16882 ire_refrele(ipif_ire); 16883 return; 16884 } 16885 if (ip_debug > 2) { 16886 /* ip1dbg */ 16887 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 16888 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 16889 } 16890 16891 stq = ipif_ire->ire_stq; 16892 16893 /* 16894 * Can't use our source address. Select a different 16895 * source address for the IRE_INTERFACE. 16896 */ 16897 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 16898 if (nipif == NULL) { 16899 /* Last resort - all ipif's have IPIF_NOLOCAL */ 16900 nipif = ipif; 16901 } else { 16902 need_rele = B_TRUE; 16903 } 16904 16905 ire = ire_create( 16906 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 16907 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 16908 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 16909 NULL, /* no gateway */ 16910 &ipif->ipif_mtu, /* max frag */ 16911 NULL, /* no src nce */ 16912 NULL, /* no recv from queue */ 16913 stq, /* send-to queue */ 16914 ill->ill_net_type, /* IF_[NO]RESOLVER */ 16915 ipif, 16916 0, 16917 0, 16918 0, 16919 0, 16920 &ire_uinfo_null, 16921 NULL, 16922 NULL, 16923 ipst); 16924 16925 if (ire != NULL) { 16926 ire_t *ret_ire; 16927 int error; 16928 16929 /* 16930 * We don't need ipif_ire anymore. We need to delete 16931 * before we add so that ire_add does not detect 16932 * duplicates. 16933 */ 16934 ire_delete(ipif_ire); 16935 ret_ire = ire; 16936 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); 16937 ASSERT(error == 0); 16938 ASSERT(ire == ret_ire); 16939 /* Held in ire_add */ 16940 ire_refrele(ret_ire); 16941 } 16942 /* 16943 * Either we are falling through from above or could not 16944 * allocate a replacement. 16945 */ 16946 ire_refrele(ipif_ire); 16947 if (need_rele) 16948 ipif_refrele(nipif); 16949 } 16950 16951 /* 16952 * This old_ipif is going away. 16953 * 16954 * Determine if any other ipif's are using our address as 16955 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 16956 * IPIF_DEPRECATED). 16957 * Find the IRE_INTERFACE for such ipifs and recreate them 16958 * to use a different source address following the rules in 16959 * ipif_up_done. 16960 */ 16961 static void 16962 ipif_update_other_ipifs(ipif_t *old_ipif) 16963 { 16964 ipif_t *ipif; 16965 ill_t *ill; 16966 char buf[INET6_ADDRSTRLEN]; 16967 16968 ASSERT(IAM_WRITER_IPIF(old_ipif)); 16969 16970 ill = old_ipif->ipif_ill; 16971 16972 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, 16973 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); 16974 16975 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 16976 if (ipif == old_ipif) 16977 continue; 16978 ipif_recreate_interface_routes(old_ipif, ipif); 16979 } 16980 } 16981 16982 /* ARGSUSED */ 16983 int 16984 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 16985 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 16986 { 16987 /* 16988 * ill_phyint_reinit merged the v4 and v6 into a single 16989 * ipsq.
We might not have been able to complete the 16990 * operation in ipif_set_values, if we could not become 16991 * exclusive. If so restart it here. 16992 */ 16993 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 16994 } 16995 16996 /* 16997 * Can operate on either a module or a driver queue. 16998 * Returns an error if not a module queue. 16999 */ 17000 /* ARGSUSED */ 17001 int 17002 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 17003 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 17004 { 17005 queue_t *q1 = q; 17006 char *cp; 17007 char interf_name[LIFNAMSIZ]; 17008 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 17009 17010 if (q->q_next == NULL) { 17011 ip1dbg(( 17012 "if_unitsel: IF_UNITSEL: no q_next\n")); 17013 return (EINVAL); 17014 } 17015 17016 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 17017 return (EALREADY); 17018 17019 do { 17020 q1 = q1->q_next; 17021 } while (q1->q_next); 17022 cp = q1->q_qinfo->qi_minfo->mi_idname; 17023 (void) sprintf(interf_name, "%s%d", cp, ppa); 17024 17025 /* 17026 * Here we are not going to delay the ioack until after 17027 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 17028 * original ioctl message before sending the requests. 17029 */ 17030 return (ipif_set_values(q, mp, interf_name, &ppa)); 17031 } 17032 17033 /* ARGSUSED */ 17034 int 17035 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 17036 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 17037 { 17038 return (ENXIO); 17039 } 17040 17041 /* 17042 * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the 17043 * minimum (but complete) set exist. This is necessary when adding or 17044 * removing an interface to/from an IPMP group, since interfaces in an 17045 * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever 17046 * its test address subnets overlap with IPMP data addresses). It's also 17047 * used to refresh the IRE_BROADCAST entries associated with the IPMP 17048 * interface when the nominated broadcast interface changes. 17049 */ 17050 void 17051 ill_refresh_bcast(ill_t *ill) 17052 { 17053 ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ 17054 ire_t **irep; 17055 ipif_t *ipif; 17056 17057 ASSERT(!ill->ill_isv6); 17058 ASSERT(IAM_WRITER_ILL(ill)); 17059 17060 /* 17061 * Remove any old broadcast IREs. 17062 */ 17063 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, 17064 ill_broadcast_delete, ill, ill); 17065 17066 /* 17067 * Create new ones for any ipifs that are up and broadcast-capable. 17068 */ 17069 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17070 if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != 17071 (IPIF_UP|IPIF_BROADCAST)) 17072 continue; 17073 17074 irep = ipif_create_bcast_ires(ipif, ire_array); 17075 while (irep-- > ire_array) { 17076 (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); 17077 if (*irep != NULL) 17078 ire_refrele(*irep); 17079 } 17080 } 17081 } 17082 17083 /* 17084 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 17085 * `irep'. Returns a pointer to the next free `irep' entry (just like 17086 * ire_check_and_create_bcast()). 
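 *
 * Usage sketch, for illustration only (it mirrors the loop in
 * ill_refresh_bcast() above; each IRE left non-NULL by ire_add() was
 * held there and must be released):
 *
 *	ire_t *ire_array[12], **irep;
 *
 *	irep = ipif_create_bcast_ires(ipif, ire_array);
 *	while (irep-- > ire_array) {
 *		(void) ire_add(irep, NULL, NULL, NULL, B_FALSE);
 *		if (*irep != NULL)
 *			ire_refrele(*irep);
 *	}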
17087 */ 17088 static ire_t ** 17089 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 17090 { 17091 ipaddr_t addr; 17092 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 17093 ipaddr_t subnetmask = ipif->ipif_net_mask; 17094 int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; 17095 17096 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 17097 17098 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 17099 17100 if (ipif->ipif_lcl_addr == INADDR_ANY || 17101 (ipif->ipif_flags & IPIF_NOLOCAL)) 17102 netmask = htonl(IN_CLASSA_NET); /* fallback */ 17103 17104 irep = ire_check_and_create_bcast(ipif, 0, irep, flags); 17105 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags); 17106 17107 /* 17108 * For backward compatibility, we create net broadcast IREs based on 17109 * the old "IP address class system", since some old machines only 17110 * respond to these class-derived net broadcasts. However, we must not 17111 * create these net broadcast IREs if the subnetmask is shorter than 17112 * the netmask derived from the IP address class. Otherwise, we may 17113 * create a net broadcast address which is the same as an IP address 17114 * on the subnet -- and then TCP will refuse to talk to that address. 17115 */ 17116 if (netmask < subnetmask) { 17117 addr = netmask & ipif->ipif_subnet; 17118 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 17119 irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep, 17120 flags); 17121 } 17122 17123 /* 17124 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 17125 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 17126 * created. Creating these broadcast IREs will only create confusion 17127 * as `addr' will be the same as the IP address. 17128 */ 17129 if (subnetmask != 0xFFFFFFFF) { 17130 addr = ipif->ipif_subnet; 17131 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 17132 irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr, 17133 irep, flags); 17134 } 17135 17136 return (irep); 17137 } 17138 17139 /* 17140 * Broadcast IRE info structure used in the functions below. Since we 17141 * allocate BCAST_COUNT of them on the stack, keep the bit layout compact. 17142 */ 17143 typedef struct bcast_ireinfo { 17144 uchar_t bi_type; /* BCAST_* value from below */ 17145 uchar_t bi_willdie:1, /* will this IRE be going away? */ 17146 bi_needrep:1, /* do we need to replace it? */ 17147 bi_haverep:1, /* have we replaced it? */ 17148 bi_pad:5; 17149 ipaddr_t bi_addr; /* IRE address */ 17150 ipif_t *bi_backup; /* last-ditch ipif to replace it on */ 17151 } bcast_ireinfo_t; 17152 17153 enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT }; 17154 17155 /* 17156 * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and 17157 * return B_TRUE if it should immediately be used to recreate the IRE.
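 *
 * Worked example (illustrative): suppose the dying IRE is BCAST_NET with
 * bi_addr 10.0.0.0 and `ipif' is 10.1.2.3 with a /16 netmask. Then
 * ipif_subnet is 10.1.0.0, ip_net_mask() yields the class A mask
 * 255.0.0.0, and 10.1.0.0 & 255.0.0.0 == 10.0.0.0 matches bi_addr, so
 * `ipif' still needs the IRE. It is used to recreate the IRE right away
 * unless it is DEPRECATED/NOLOCAL/ANYCAST, in which case it is only
 * recorded in bi_backup.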
17158 */ 17159 static boolean_t 17160 ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop) 17161 { 17162 ipaddr_t addr; 17163 17164 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie); 17165 17166 switch (bireinfop->bi_type) { 17167 case BCAST_NET: 17168 addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet); 17169 if (addr != bireinfop->bi_addr) 17170 return (B_FALSE); 17171 break; 17172 case BCAST_SUBNET: 17173 if (ipif->ipif_subnet != bireinfop->bi_addr) 17174 return (B_FALSE); 17175 break; 17176 } 17177 17178 bireinfop->bi_needrep = 1; 17179 if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) { 17180 if (bireinfop->bi_backup == NULL) 17181 bireinfop->bi_backup = ipif; 17182 return (B_FALSE); 17183 } 17184 return (B_TRUE); 17185 } 17186 17187 /* 17188 * Create the broadcast IREs described by `bireinfop' on `ipif', and return 17189 * them ala ire_check_and_create_bcast(). 17190 */ 17191 static ire_t ** 17192 ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep) 17193 { 17194 ipaddr_t mask, addr; 17195 17196 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep); 17197 17198 addr = bireinfop->bi_addr; 17199 irep = ire_create_bcast(ipif, addr, irep); 17200 17201 switch (bireinfop->bi_type) { 17202 case BCAST_NET: 17203 mask = ip_net_mask(ipif->ipif_subnet); 17204 irep = ire_create_bcast(ipif, addr | ~mask, irep); 17205 break; 17206 case BCAST_SUBNET: 17207 mask = ipif->ipif_net_mask; 17208 irep = ire_create_bcast(ipif, addr | ~mask, irep); 17209 break; 17210 } 17211 17212 bireinfop->bi_haverep = 1; 17213 return (irep); 17214 } 17215 17216 /* 17217 * Walk through all of the ipifs on `ill' that will be affected by `test_ipif' 17218 * going away, and determine if any of the broadcast IREs (named by `bireinfo') 17219 * that are going away are still needed. If so, have ipif_create_bcast() 17220 * recreate them (except for the deprecated case, as explained below). 17221 */ 17222 static ire_t ** 17223 ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo, 17224 ire_t **irep) 17225 { 17226 int i; 17227 ipif_t *ipif; 17228 17229 ASSERT(!ill->ill_isv6); 17230 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17231 /* 17232 * Skip this ipif if it's (a) the one being taken down, (b) 17233 * not in the same zone, or (c) has no valid local address. 17234 */ 17235 if (ipif == test_ipif || 17236 ipif->ipif_zoneid != test_ipif->ipif_zoneid || 17237 ipif->ipif_subnet == 0 || 17238 (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) != 17239 (IPIF_UP|IPIF_BROADCAST)) 17240 continue; 17241 17242 /* 17243 * For each dying IRE that hasn't yet been replaced, see if 17244 * `ipif' needs it and whether the IRE should be recreated on 17245 * `ipif'. If `ipif' is deprecated, ipif_consider_bcast() 17246 * will return B_FALSE even if `ipif' needs the IRE in the 17247 * hopes that we'll later find a needy non-deprecated ipif. 17248 * However, the ipif is recorded in bi_backup for possible 17249 * subsequent use by ipif_check_bcast_ires(). 17250 */ 17251 for (i = 0; i < BCAST_COUNT; i++) { 17252 if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep) 17253 continue; 17254 if (!ipif_consider_bcast(ipif, &bireinfo[i])) 17255 continue; 17256 irep = ipif_create_bcast(ipif, &bireinfo[i], irep); 17257 } 17258 17259 /* 17260 * If we've replaced all of the broadcast IREs that are going 17261 * to be taken down, we know we're done.
17262 */ 17263 for (i = 0; i < BCAST_COUNT; i++) { 17264 if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep) 17265 break; 17266 } 17267 if (i == BCAST_COUNT) 17268 break; 17269 } 17270 return (irep); 17271 } 17272 17273 /* 17274 * Check if `test_ipif' (which is going away) is associated with any existing 17275 * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were 17276 * using those broadcast IREs. If so, recreate the broadcast IREs on one or 17277 * more of those other ipifs. (The old IREs will be deleted in ipif_down().) 17278 * 17279 * This is necessary because broadcast IREs are shared. In particular, a 17280 * given ill has one set of all-zeroes and all-ones broadcast IREs (for every 17281 * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones, 17282 * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP 17283 * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the 17284 * same zone, they will share the same set of broadcast IREs. 17285 * 17286 * Note: the upper bound of 12 IREs comes from the worst case of replacing all 17287 * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes, 17288 * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones). 17289 */ 17290 static void 17291 ipif_check_bcast_ires(ipif_t *test_ipif) 17292 { 17293 ill_t *ill = test_ipif->ipif_ill; 17294 ire_t *ire, *ire_array[12]; /* see note above */ 17295 ire_t **irep1, **irep = &ire_array[0]; 17296 uint_t i, willdie; 17297 ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet); 17298 bcast_ireinfo_t bireinfo[BCAST_COUNT]; 17299 17300 ASSERT(!test_ipif->ipif_isv6); 17301 ASSERT(IAM_WRITER_IPIF(test_ipif)); 17302 17303 /* 17304 * No broadcast IREs for the LOOPBACK interface 17305 * or others such as point to point and IPIF_NOXMIT. 17306 */ 17307 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || 17308 (test_ipif->ipif_flags & IPIF_NOXMIT)) 17309 return; 17310 17311 bzero(bireinfo, sizeof (bireinfo)); 17312 bireinfo[0].bi_type = BCAST_ALLZEROES; 17313 bireinfo[0].bi_addr = 0; 17314 17315 bireinfo[1].bi_type = BCAST_ALLONES; 17316 bireinfo[1].bi_addr = INADDR_BROADCAST; 17317 17318 bireinfo[2].bi_type = BCAST_NET; 17319 bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask; 17320 17321 if (test_ipif->ipif_net_mask != 0) 17322 mask = test_ipif->ipif_net_mask; 17323 bireinfo[3].bi_type = BCAST_SUBNET; 17324 bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask; 17325 17326 /* 17327 * Figure out what (if any) broadcast IREs will die as a result of 17328 * `test_ipif' going away. If none will die, we're done. 17329 */ 17330 for (i = 0, willdie = 0; i < BCAST_COUNT; i++) { 17331 ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST, 17332 test_ipif, ALL_ZONES, NULL, 17333 (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst); 17334 if (ire != NULL) { 17335 willdie++; 17336 bireinfo[i].bi_willdie = 1; 17337 ire_refrele(ire); 17338 } 17339 } 17340 17341 if (willdie == 0) 17342 return; 17343 17344 /* 17345 * Walk through all the ipifs that will be affected by the dying IREs, 17346 * and recreate the IREs as necessary. Note that all interfaces in an 17347 * IPMP illgrp share the same broadcast IREs, and thus the entire 17348 * illgrp must be walked, starting with the IPMP meta-interface (so 17349 * that broadcast IREs end up on it whenever possible). 
17350 */ 17351 if (IS_UNDER_IPMP(ill)) 17352 ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 17353 17354 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 17355 17356 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 17357 ipmp_illgrp_t *illg = ill->ill_grp; 17358 17359 ill = list_head(&illg->ig_if); 17360 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { 17361 for (i = 0; i < BCAST_COUNT; i++) { 17362 if (bireinfo[i].bi_willdie && 17363 !bireinfo[i].bi_haverep) 17364 break; 17365 } 17366 if (i == BCAST_COUNT) 17367 break; 17368 17369 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 17370 } 17371 } 17372 17373 /* 17374 * Scan through the set of broadcast IREs and see if there are any 17375 * that we need to replace that have not yet been replaced. If so, 17376 * replace them using the appropriate backup ipif. 17377 */ 17378 for (i = 0; i < BCAST_COUNT; i++) { 17379 if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep) 17380 irep = ipif_create_bcast(bireinfo[i].bi_backup, 17381 &bireinfo[i], irep); 17382 } 17383 17384 /* 17385 * If we can't create all of them, don't add any of them. (Code in 17386 * ip_wput_ire() and ire_to_ill() assumes that we always have a 17387 * non-loopback copy and loopback copy for a given address.) 17388 */ 17389 for (irep1 = irep; irep1 > ire_array; ) { 17390 irep1--; 17391 if (*irep1 == NULL) { 17392 ip0dbg(("ipif_check_bcast_ires: can't create " 17393 "IRE_BROADCAST, memory allocation failure\n")); 17394 while (irep > ire_array) { 17395 irep--; 17396 if (*irep != NULL) 17397 ire_delete(*irep); 17398 } 17399 return; 17400 } 17401 } 17402 17403 for (irep1 = irep; irep1 > ire_array; ) { 17404 irep1--; 17405 if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0) 17406 ire_refrele(*irep1); /* Held in ire_add */ 17407 } 17408 } 17409 17410 /* 17411 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 17412 * from lifr_flags and the name from lifr_name. 17413 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 17414 * since ipif_lookup_on_name uses the _isv6 flags when matching. 17415 * Returns EINPROGRESS when mp has been consumed by queueing it on 17416 * ill_pending_mp and the ioctl will complete in ip_rput. 17417 * 17418 * Can operate on either a module or a driver queue. 17419 * Returns an error if not a module queue. 17420 */ 17421 /* ARGSUSED */ 17422 int 17423 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17424 ip_ioctl_cmd_t *ipip, void *if_req) 17425 { 17426 ill_t *ill = q->q_ptr; 17427 phyint_t *phyi; 17428 ip_stack_t *ipst; 17429 struct lifreq *lifr = if_req; 17430 uint64_t new_flags; 17431 17432 ASSERT(ipif != NULL); 17433 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 17434 17435 if (q->q_next == NULL) { 17436 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 17437 return (EINVAL); 17438 } 17439 17440 /* 17441 * If we are not writer on 'q' then this interface exists already 17442 * and previous lookups (ip_extract_lifreq()) found this ipif -- 17443 * so return EALREADY. 17444 */ 17445 if (ill != ipif->ipif_ill) 17446 return (EALREADY); 17447 17448 if (ill->ill_name[0] != '\0') 17449 return (EALREADY); 17450 17451 /* 17452 * If there's another ill already with the requested name, ensure 17453 * that it's of the same type. Otherwise, ill_phyint_reinit() will 17454 * fuse together two unrelated ills, which will cause chaos. 
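 *
 * For instance (an illustrative scenario): if `bge0' is already plumbed
 * as an Ethernet ill and a SIOCSLIFNAME arrives trying to create `bge0'
 * over a stream with a different ip_m_mac_type, the check below must
 * reject it; otherwise both would end up merged onto one phyint.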
17455 */ 17456 ipst = ill->ill_ipst; 17457 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 17458 lifr->lifr_name, NULL); 17459 if (phyi != NULL) { 17460 ill_t *ill_mate = phyi->phyint_illv4; 17461 17462 if (ill_mate == NULL) 17463 ill_mate = phyi->phyint_illv6; 17464 ASSERT(ill_mate != NULL); 17465 17466 if (ill_mate->ill_media->ip_m_mac_type != 17467 ill->ill_media->ip_m_mac_type) { 17468 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to " 17469 "use the same ill name on differing media\n")); 17470 return (EINVAL); 17471 } 17472 } 17473 17474 /* 17475 * We start off as IFF_IPV4 in ipif_allocate and become 17476 * IFF_IPV4 or IFF_IPV6 here depending on the lifr_flags value. 17477 * The only flags that we read from user space are IFF_IPV4, 17478 * IFF_IPV6, IFF_XRESOLV and IFF_BROADCAST. 17479 * 17480 * This ill has not been inserted into the global list. 17481 * So we are still single threaded and don't need any lock. 17482 * 17483 * Sanity check the flags. 17484 */ 17485 17486 if ((lifr->lifr_flags & IFF_BROADCAST) && 17487 ((lifr->lifr_flags & IFF_IPV6) || 17488 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 17489 ip1dbg(("ip_sioctl_slifname: link not broadcast capable " 17490 "or IPv6 i.e., no broadcast\n")); 17491 return (EINVAL); 17492 } 17493 17494 new_flags = 17495 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_XRESOLV|IFF_BROADCAST); 17496 17497 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { 17498 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " 17499 "IFF_IPV4 or IFF_IPV6\n")); 17500 return (EINVAL); 17501 } 17502 /* 17503 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 17504 */ 17505 if ((new_flags & IFF_XRESOLV) && !(new_flags & IFF_IPV6) && 17506 !(ipif->ipif_isv6)) { 17507 ip1dbg(("ip_sioctl_slifname: XRESOLV only allowed on " 17508 "IPv6 interface\n")); 17509 return (EINVAL); 17510 } 17511 17512 /* 17513 * We always start off as IPv4, so we only need to check for IPv6. 17514 */ 17515 if ((new_flags & IFF_IPV6) != 0) { 17516 ill->ill_flags |= ILLF_IPV6; 17517 ill->ill_flags &= ~ILLF_IPV4; 17518 } 17519 17520 if ((new_flags & IFF_BROADCAST) != 0) 17521 ipif->ipif_flags |= IPIF_BROADCAST; 17522 else 17523 ipif->ipif_flags &= ~IPIF_BROADCAST; 17524 17525 if ((new_flags & IFF_XRESOLV) != 0) 17526 ill->ill_flags |= ILLF_XRESOLV; 17527 else 17528 ill->ill_flags &= ~ILLF_XRESOLV; 17529 17530 /* We started off as V4. */ 17531 if (ill->ill_flags & ILLF_IPV6) { 17532 ill->ill_phyint->phyint_illv6 = ill; 17533 ill->ill_phyint->phyint_illv4 = NULL; 17534 } 17535 17536 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 17537 } 17538 17539 /* ARGSUSED */ 17540 int 17541 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17542 ip_ioctl_cmd_t *ipip, void *if_req) 17543 { 17544 /* 17545 * ill_phyint_reinit merged the v4 and v6 into a single 17546 * ipsq. We might not have been able to complete the 17547 * slifname in ipif_set_values, if we could not become 17548 * exclusive. If so, restart it here. 17549 */ 17550 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 17551 } 17552 17553 /* 17554 * Return a pointer to the ipif which matches the index, IP version type, and 17555 * zoneid.
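 *
 * Usage sketch (illustrative; the all-NULL q/mp/func/err form is the
 * non-blocking variant permitted by the ASSERT below, and the caller
 * owns the reference on success):
 *
 *	ipif_t *ipif;
 *
 *	ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE, zoneid,
 *	    NULL, NULL, NULL, NULL, ipst);
 *	if (ipif != NULL) {
 *		... use the ipif ...
 *		ipif_refrele(ipif);
 *	}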
17556 */ 17557 ipif_t * 17558 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 17559 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) 17560 { 17561 ill_t *ill; 17562 ipif_t *ipif = NULL; 17563 17564 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 17565 (q != NULL && mp != NULL && func != NULL && err != NULL)); 17566 17567 if (err != NULL) 17568 *err = 0; 17569 17570 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 17571 if (ill != NULL) { 17572 mutex_enter(&ill->ill_lock); 17573 for (ipif = ill->ill_ipif; ipif != NULL; 17574 ipif = ipif->ipif_next) { 17575 if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES || 17576 zoneid == ipif->ipif_zoneid || 17577 ipif->ipif_zoneid == ALL_ZONES)) { 17578 ipif_refhold_locked(ipif); 17579 break; 17580 } 17581 } 17582 mutex_exit(&ill->ill_lock); 17583 ill_refrele(ill); 17584 if (ipif == NULL && err != NULL) 17585 *err = ENXIO; 17586 } 17587 return (ipif); 17588 } 17589 17590 /* 17591 * Change an existing physical interface's index. If the new index 17592 * is acceptable, we update the index and the phyint_list_avl_by_index tree. 17593 * Finally, we update other systems which may have a dependence on the 17594 * index value. 17595 */ 17596 /* ARGSUSED */ 17597 int 17598 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17599 ip_ioctl_cmd_t *ipip, void *ifreq) 17600 { 17601 ill_t *ill; 17602 phyint_t *phyi; 17603 struct ifreq *ifr = (struct ifreq *)ifreq; 17604 struct lifreq *lifr = (struct lifreq *)ifreq; 17605 uint_t old_index, index; 17606 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 17607 avl_index_t where; 17608 17609 if (ipip->ipi_cmd_type == IF_CMD) 17610 index = ifr->ifr_index; 17611 else 17612 index = lifr->lifr_index; 17613 17614 /* 17615 * Only allow on the physical interface. Also, index zero is illegal. 17616 */ 17617 ill = ipif->ipif_ill; 17618 phyi = ill->ill_phyint; 17619 if (ipif->ipif_id != 0 || index == 0) { 17620 return (EINVAL); 17621 } 17622 17623 /* If the index is not changing, no work to do */ 17624 if (phyi->phyint_ifindex == index) 17625 return (0); 17626 17627 /* 17628 * Use phyint_exists() to determine if the new interface index 17629 * is already in use. If the index is unused then we need to 17630 * change the phyint's position in the phyint_list_avl_by_index 17631 * tree. If we do not do this, subsequent lookups (using the new 17632 * index value) will not find the phyint. 17633 */ 17634 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 17635 if (phyint_exists(index, ipst)) { 17636 rw_exit(&ipst->ips_ill_g_lock); 17637 return (EEXIST); 17638 } 17639 17640 /* 17641 * The new index is unused. Set it in the phyint. However, we must not 17642 * forget to trigger the NE_IFINDEX_CHANGE event before the ifindex 17643 * changes, since the event must be bound to the old ifindex value.
17644 */ 17645 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 17646 &index, sizeof (index)); 17647 17648 old_index = phyi->phyint_ifindex; 17649 phyi->phyint_ifindex = index; 17650 17651 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 17652 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17653 &index, &where); 17654 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17655 phyi, where); 17656 rw_exit(&ipst->ips_ill_g_lock); 17657 17658 /* Update SCTP's ILL list */ 17659 sctp_ill_reindex(ill, old_index); 17660 17661 /* Send the routing sockets message */ 17662 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 17663 if (ILL_OTHER(ill)) 17664 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 17665 17666 return (0); 17667 } 17668 17669 /* ARGSUSED */ 17670 int 17671 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17672 ip_ioctl_cmd_t *ipip, void *ifreq) 17673 { 17674 struct ifreq *ifr = (struct ifreq *)ifreq; 17675 struct lifreq *lifr = (struct lifreq *)ifreq; 17676 17677 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 17678 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17679 /* Get the interface index */ 17680 if (ipip->ipi_cmd_type == IF_CMD) { 17681 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 17682 } else { 17683 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 17684 } 17685 return (0); 17686 } 17687 17688 /* ARGSUSED */ 17689 int 17690 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17691 ip_ioctl_cmd_t *ipip, void *ifreq) 17692 { 17693 struct lifreq *lifr = (struct lifreq *)ifreq; 17694 17695 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 17696 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17697 /* Get the interface zone */ 17698 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17699 lifr->lifr_zoneid = ipif->ipif_zoneid; 17700 return (0); 17701 } 17702 17703 /* 17704 * Set the zoneid of an interface. 17705 */ 17706 /* ARGSUSED */ 17707 int 17708 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17709 ip_ioctl_cmd_t *ipip, void *ifreq) 17710 { 17711 struct lifreq *lifr = (struct lifreq *)ifreq; 17712 int err = 0; 17713 boolean_t need_up = B_FALSE; 17714 zone_t *zptr; 17715 zone_status_t status; 17716 zoneid_t zoneid; 17717 17718 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17719 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 17720 if (!is_system_labeled()) 17721 return (ENOTSUP); 17722 zoneid = GLOBAL_ZONEID; 17723 } 17724 17725 /* cannot assign instance zero to a non-global zone */ 17726 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 17727 return (ENOTSUP); 17728 17729 /* 17730 * Cannot assign to a zone that doesn't exist or is shutting down. In 17731 * the event of a race with the zone shutdown processing, since IP 17732 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 17733 * interface will be cleaned up even if the zone is shut down 17734 * immediately after the status check. If the interface can't be brought 17735 * down right away, and the zone is shut down before the restart 17736 * function is called, we resolve the possible races by rechecking the 17737 * zone status in the restart function. 
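 *
 * (Illustrative example: `ifconfig hme0:1 zone myzone' issues
 * SIOCSLIFZONE and arrives here; if hme0:1 is up, it is first logically
 * downed, re-zoned in ip_sioctl_slifzone_tail(), and then brought back
 * up.)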
17738 */ 17739 if ((zptr = zone_find_by_id(zoneid)) == NULL) 17740 return (EINVAL); 17741 status = zone_status_get(zptr); 17742 zone_rele(zptr); 17743 17744 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 17745 return (EINVAL); 17746 17747 if (ipif->ipif_flags & IPIF_UP) { 17748 /* 17749 * If the interface is already marked up, 17750 * we call ipif_down which will take care 17751 * of ditching any IREs that have been set 17752 * up based on the old interface address. 17753 */ 17754 err = ipif_logical_down(ipif, q, mp); 17755 if (err == EINPROGRESS) 17756 return (err); 17757 ipif_down_tail(ipif); 17758 need_up = B_TRUE; 17759 } 17760 17761 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 17762 return (err); 17763 } 17764 17765 static int 17766 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 17767 queue_t *q, mblk_t *mp, boolean_t need_up) 17768 { 17769 int err = 0; 17770 ip_stack_t *ipst; 17771 17772 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 17773 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17774 17775 if (CONN_Q(q)) 17776 ipst = CONNQ_TO_IPST(q); 17777 else 17778 ipst = ILLQ_TO_IPST(q); 17779 17780 /* 17781 * For exclusive stacks we don't allow a different zoneid than 17782 * global. 17783 */ 17784 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 17785 zoneid != GLOBAL_ZONEID) 17786 return (EINVAL); 17787 17788 /* Set the new zone id. */ 17789 ipif->ipif_zoneid = zoneid; 17790 17791 /* Update sctp list */ 17792 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 17793 17794 if (need_up) { 17795 /* 17796 * Now bring the interface back up. If this 17797 * is the only IPIF for the ILL, ipif_up 17798 * will have to re-bind to the device, so 17799 * we may get back EINPROGRESS, in which 17800 * case, this IOCTL will get completed in 17801 * ip_rput_dlpi when we see the DL_BIND_ACK. 17802 */ 17803 err = ipif_up(ipif, q, mp); 17804 } 17805 return (err); 17806 } 17807 17808 /* ARGSUSED */ 17809 int 17810 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17811 ip_ioctl_cmd_t *ipip, void *if_req) 17812 { 17813 struct lifreq *lifr = (struct lifreq *)if_req; 17814 zoneid_t zoneid; 17815 zone_t *zptr; 17816 zone_status_t status; 17817 17818 ASSERT(ipif->ipif_id != 0); 17819 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17820 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 17821 zoneid = GLOBAL_ZONEID; 17822 17823 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 17824 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17825 17826 /* 17827 * We recheck the zone status to resolve the following race condition: 17828 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 17829 * 2) hme0:1 is up and can't be brought down right away; 17830 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 17831 * 3) zone "myzone" is halted; the zone status switches to 17832 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 17833 * the interfaces to remove - hme0:1 is not returned because it's not 17834 * yet in "myzone", so it won't be removed; 17835 * 4) the restart function for SIOCSLIFZONE is called; without the 17836 * status check here, we would have hme0:1 in "myzone" after it's been 17837 * destroyed. 17838 * Note that if the status check fails, we need to bring the interface 17839 * back to its state prior to ip_sioctl_slifzone(), hence the call to 17840 * ipif_up_done[_v6](). 
17841 */ 17842 status = ZONE_IS_UNINITIALIZED; 17843 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 17844 status = zone_status_get(zptr); 17845 zone_rele(zptr); 17846 } 17847 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 17848 if (ipif->ipif_isv6) { 17849 (void) ipif_up_done_v6(ipif); 17850 } else { 17851 (void) ipif_up_done(ipif); 17852 } 17853 return (EINVAL); 17854 } 17855 17856 ipif_down_tail(ipif); 17857 17858 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 17859 B_TRUE)); 17860 } 17861 17862 /* 17863 * Return the number of addresses on `ill' with one or more of the values 17864 * in `set' set and all of the values in `clear' clear. 17865 */ 17866 static uint_t 17867 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) 17868 { 17869 ipif_t *ipif; 17870 uint_t cnt = 0; 17871 17872 ASSERT(IAM_WRITER_ILL(ill)); 17873 17874 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 17875 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) 17876 cnt++; 17877 17878 return (cnt); 17879 } 17880 17881 /* 17882 * Return the number of migratable addresses on `ill' that are under 17883 * application control. 17884 */ 17885 uint_t 17886 ill_appaddr_cnt(const ill_t *ill) 17887 { 17888 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, 17889 IPIF_NOFAILOVER)); 17890 } 17891 17892 /* 17893 * Return the number of point-to-point addresses on `ill'. 17894 */ 17895 uint_t 17896 ill_ptpaddr_cnt(const ill_t *ill) 17897 { 17898 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); 17899 } 17900 17901 /* ARGSUSED */ 17902 int 17903 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17904 ip_ioctl_cmd_t *ipip, void *ifreq) 17905 { 17906 struct lifreq *lifr = ifreq; 17907 17908 ASSERT(q->q_next == NULL); 17909 ASSERT(CONN_Q(q)); 17910 17911 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 17912 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17913 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 17914 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 17915 17916 return (0); 17917 } 17918 17919 /* Find the previous ILL in this usesrc group */ 17920 static ill_t * 17921 ill_prev_usesrc(ill_t *uill) 17922 { 17923 ill_t *ill; 17924 17925 for (ill = uill->ill_usesrc_grp_next; 17926 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 17927 ill = ill->ill_usesrc_grp_next) 17928 /* do nothing */; 17929 return (ill); 17930 } 17931 17932 /* 17933 * Release all members of the usesrc group. This routine is called 17934 * from ill_delete when the interface being unplumbed is the 17935 * group head. 
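 *
 * For reference, an illustrative picture of the group (assuming one
 * head and two clients): it is a circular list linked by
 * ill_usesrc_grp_next, and only the head has ill_usesrc_ifindex == 0:
 *
 *	head (usesrc ill, ill_usesrc_ifindex == 0)
 *	    -> client A (ill_usesrc_ifindex == head's ifindex)
 *	    -> client B (ill_usesrc_ifindex == head's ifindex)
 *	    -> back to head
 *
 * which is why the loop below can stop when it reaches an ill whose
 * ill_usesrc_ifindex is 0.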
17936 */ 17937 static void 17938 ill_disband_usesrc_group(ill_t *uill) 17939 { 17940 ill_t *next_ill, *tmp_ill; 17941 ip_stack_t *ipst = uill->ill_ipst; 17942 17943 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 17944 next_ill = uill->ill_usesrc_grp_next; 17945 17946 do { 17947 ASSERT(next_ill != NULL); 17948 tmp_ill = next_ill->ill_usesrc_grp_next; 17949 ASSERT(tmp_ill != NULL); 17950 next_ill->ill_usesrc_grp_next = NULL; 17951 next_ill->ill_usesrc_ifindex = 0; 17952 next_ill = tmp_ill; 17953 } while (next_ill->ill_usesrc_ifindex != 0); 17954 uill->ill_usesrc_grp_next = NULL; 17955 } 17956 17957 /* 17958 * Remove the client usesrc ILL from the list and relink it to a new list. 17959 */ 17960 int 17961 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 17962 { 17963 ill_t *ill, *tmp_ill; 17964 ip_stack_t *ipst = ucill->ill_ipst; 17965 17966 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 17967 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 17968 17969 /* 17970 * Sanity check the linkage: the usesrc client ILL passed in 17971 * must currently be a usesrc client (its ill_usesrc_ifindex is 17972 * set), and the usesrc ILL it is being relinked to must not 17973 * itself be a usesrc client. 17974 */ 17975 if ((ucill->ill_usesrc_ifindex == 0) || 17976 (uill->ill_usesrc_ifindex != 0)) { 17977 return (-1); 17978 } 17979 17980 ill = ill_prev_usesrc(ucill); 17981 ASSERT(ill->ill_usesrc_grp_next != NULL); 17982 17983 /* Remove from the current list */ 17984 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 17985 /* Only two elements in the list */ 17986 ASSERT(ill->ill_usesrc_ifindex == 0); 17987 ill->ill_usesrc_grp_next = NULL; 17988 } else { 17989 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 17990 } 17991 17992 if (ifindex == 0) { 17993 ucill->ill_usesrc_ifindex = 0; 17994 ucill->ill_usesrc_grp_next = NULL; 17995 return (0); 17996 } 17997 17998 ucill->ill_usesrc_ifindex = ifindex; 17999 tmp_ill = uill->ill_usesrc_grp_next; 18000 uill->ill_usesrc_grp_next = ucill; 18001 ucill->ill_usesrc_grp_next = 18002 (tmp_ill != NULL) ? tmp_ill : uill; 18003 return (0); 18004 } 18005 18006 /* 18007 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See 18008 * synchronization notes in ip.c for locking details.
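 *
 * Illustrative trigger (an assumption about the administrative
 * interface): `ifconfig vni0 usesrc hme0' sends SIOCSLIFUSESRC with
 * lifr_index set to hme0's ifindex, while `ifconfig vni0 usesrc none'
 * sends lifr_index == 0 to reset the association.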
18009 */ 18010 /* ARGSUSED */ 18011 int 18012 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 18013 ip_ioctl_cmd_t *ipip, void *ifreq) 18014 { 18015 struct lifreq *lifr = (struct lifreq *)ifreq; 18016 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, 18017 ill_flag_changed = B_FALSE; 18018 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 18019 int err = 0, ret; 18020 uint_t ifindex; 18021 ipsq_t *ipsq = NULL; 18022 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 18023 18024 ASSERT(IAM_WRITER_IPIF(ipif)); 18025 ASSERT(q->q_next == NULL); 18026 ASSERT(CONN_Q(q)); 18027 18028 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 18029 18030 ifindex = lifr->lifr_index; 18031 if (ifindex == 0) { 18032 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 18033 /* non usesrc group interface, nothing to reset */ 18034 return (0); 18035 } 18036 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 18037 /* valid reset request */ 18038 reset_flg = B_TRUE; 18039 } 18040 18041 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, 18042 ip_process_ioctl, &err, ipst); 18043 if (usesrc_ill == NULL) { 18044 return (err); 18045 } 18046 18047 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 18048 NEW_OP, B_TRUE); 18049 if (ipsq == NULL) { 18050 err = EINPROGRESS; 18051 /* Operation enqueued on the ipsq of the usesrc ILL */ 18052 goto done; 18053 } 18054 18055 /* USESRC isn't currently supported with IPMP */ 18056 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 18057 err = ENOTSUP; 18058 goto done; 18059 } 18060 18061 /* 18062 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 18063 * used by IPMP underlying interfaces, but someone might think it's 18064 * more general and try to use it independently with VNI.) 18065 */ 18066 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 18067 err = ENOTSUP; 18068 goto done; 18069 } 18070 18071 /* 18072 * If the client is already in use as a usesrc_ill or a usesrc_ill is 18073 * already a client, then return EINVAL 18074 */ 18075 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 18076 err = EINVAL; 18077 goto done; 18078 } 18079 18080 /* 18081 * If the ill_usesrc_ifindex field is already set to what it needs to 18082 * be then this is a duplicate operation. 18083 */ 18084 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 18085 err = 0; 18086 goto done; 18087 } 18088 18089 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 18090 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 18091 usesrc_ill->ill_isv6)); 18092 18093 /* 18094 * The next step ensures that no new ires will be created referencing 18095 * the client ill until the ILL_CHANGING flag is cleared. Then 18096 * we go through an ire walk deleting all ire caches that reference 18097 * the client ill. New ires referencing the client ill that are added 18098 * to the ire table before the ILL_CHANGING flag is set will be 18099 * cleaned up by the ire walk below. Attempts to add new ires referencing 18100 * the client ill while the ILL_CHANGING flag is set will fail 18101 * during the ire_add in ire_atomic_start. ire_atomic_start atomically 18102 * checks (under the ill_g_usesrc_lock) that the ire being added 18103 * is not stale, i.e., that the ire_stq and ire_ipif are consistent and 18104 * belong to the same usesrc group.
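 *
 * In outline (a restatement of the code below, for illustration):
 *
 *	1. set ILL_CHANGING on the client ill
 *	2. ire_walk_v4/v6() deletes cached ires referencing the client
 *	3. relink the usesrc lists under ill_g_usesrc_lock
 *	4. clear ILL_CHANGING (at the `done' label)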
18105 */ 18106 mutex_enter(&usesrc_cli_ill->ill_lock); 18107 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; 18108 mutex_exit(&usesrc_cli_ill->ill_lock); 18109 ill_flag_changed = B_TRUE; 18110 18111 if (ipif->ipif_isv6) 18112 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 18113 ALL_ZONES, ipst); 18114 else 18115 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 18116 ALL_ZONES, ipst); 18117 18118 /* 18119 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 18120 * and the ill_usesrc_ifindex fields 18121 */ 18122 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 18123 18124 if (reset_flg) { 18125 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 18126 if (ret != 0) { 18127 err = EINVAL; 18128 } 18129 rw_exit(&ipst->ips_ill_g_usesrc_lock); 18130 goto done; 18131 } 18132 18133 /* 18134 * Four possibilities to consider: 18135 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 18136 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 18137 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 18138 * 4. Both are part of their respective usesrc groups 18139 */ 18140 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 18141 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 18142 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 18143 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 18144 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 18145 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 18146 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 18147 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 18148 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 18149 /* Insert at head of list */ 18150 usesrc_cli_ill->ill_usesrc_grp_next = 18151 usesrc_ill->ill_usesrc_grp_next; 18152 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 18153 } else { 18154 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 18155 ifindex); 18156 if (ret != 0) 18157 err = EINVAL; 18158 } 18159 rw_exit(&ipst->ips_ill_g_usesrc_lock); 18160 18161 done: 18162 if (ill_flag_changed) { 18163 mutex_enter(&usesrc_cli_ill->ill_lock); 18164 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; 18165 mutex_exit(&usesrc_cli_ill->ill_lock); 18166 } 18167 if (ipsq != NULL) 18168 ipsq_exit(ipsq); 18169 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 18170 ill_refrele(usesrc_ill); 18171 return (err); 18172 } 18173 18174 /* 18175 * comparison function used by avl. 18176 */ 18177 static int 18178 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 18179 { 18180 18181 uint_t index; 18182 18183 ASSERT(phyip != NULL && index_ptr != NULL); 18184 18185 index = *((uint_t *)index_ptr); 18186 /* 18187 * let the phyint with the lowest index be on top. 18188 */ 18189 if (((phyint_t *)phyip)->phyint_ifindex < index) 18190 return (1); 18191 if (((phyint_t *)phyip)->phyint_ifindex > index) 18192 return (-1); 18193 return (0); 18194 } 18195 18196 /* 18197 * comparison function used by avl. 
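 *
 * Note the asymmetric arguments: the first is always the search key (a
 * name here, an ifindex above), because lookups pass the bare key to
 * avl_find() and insertions reuse the `where' obtained from a prior
 * avl_find(). A sketch of how such a comparator is wired up at tree
 * creation (hypothetical call; the avl_node_t field name is assumed):
 *
 *	avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
 *	    ill_phyint_compare_name, sizeof (phyint_t),
 *	    offsetof(struct phyint, phyint_avl_by_name));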
18198 */ 18199 static int 18200 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 18201 { 18202 ill_t *ill; 18203 int res = 0; 18204 18205 ASSERT(phyip != NULL && name_ptr != NULL); 18206 18207 if (((phyint_t *)phyip)->phyint_illv4) 18208 ill = ((phyint_t *)phyip)->phyint_illv4; 18209 else 18210 ill = ((phyint_t *)phyip)->phyint_illv6; 18211 ASSERT(ill != NULL); 18212 18213 res = strcmp(ill->ill_name, (char *)name_ptr); 18214 if (res > 0) 18215 return (1); 18216 else if (res < 0) 18217 return (-1); 18218 return (0); 18219 } 18220 18221 /* 18222 * This function is called on the unplumb path via ill_glist_delete() when 18223 * there are no ills left on the phyint and thus the phyint can be freed. 18224 */ 18225 static void 18226 phyint_free(phyint_t *phyi) 18227 { 18228 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 18229 18230 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 18231 18232 /* 18233 * If this phyint was an IPMP meta-interface, blow away the group. 18234 * This is safe to do because all of the illgrps have already been 18235 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 18236 * If we're cleaning up as a result of failed initialization, 18237 * phyint_grp may be NULL. 18238 */ 18239 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 18240 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 18241 ipmp_grp_destroy(phyi->phyint_grp); 18242 phyi->phyint_grp = NULL; 18243 rw_exit(&ipst->ips_ipmp_lock); 18244 } 18245 18246 /* 18247 * If this interface was under IPMP, take it out of the group. 18248 */ 18249 if (phyi->phyint_grp != NULL) 18250 ipmp_phyint_leave_grp(phyi); 18251 18252 /* 18253 * Delete the phyint and disassociate its ipsq. The ipsq itself 18254 * will be freed in ipsq_exit(). 18255 */ 18256 phyi->phyint_ipsq->ipsq_phyint = NULL; 18257 phyi->phyint_name[0] = '\0'; 18258 18259 mi_free(phyi); 18260 } 18261 18262 /* 18263 * Attach the ill to the phyint structure, which can be shared by both 18264 * the IPv4 and IPv6 ills. ill_init allocates a phyint to just hold flags. This 18265 * function is called from ipif_set_values and ill_lookup_on_name (for 18266 * loopback), where we know the name of the ill. We look up the ill, and if 18267 * one is already present with that name, we use its phyint. Otherwise we 18268 * reuse the one allocated by ill_init. 18269 */ 18270 static void 18271 ill_phyint_reinit(ill_t *ill) 18272 { 18273 boolean_t isv6 = ill->ill_isv6; 18274 phyint_t *phyi_old; 18275 phyint_t *phyi; 18276 avl_index_t where = 0; 18277 ill_t *ill_other = NULL; 18278 ip_stack_t *ipst = ill->ill_ipst; 18279 18280 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 18281 18282 phyi_old = ill->ill_phyint; 18283 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 18284 phyi_old->phyint_illv6 == NULL)); 18285 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 18286 phyi_old->phyint_illv4 == NULL)); 18287 ASSERT(phyi_old->phyint_ifindex == 0); 18288 18289 /* 18290 * Now that our ill has a name, set it in the phyint. 18291 */ 18292 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 18293 18294 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18295 ill->ill_name, &where); 18296 18297 /* 18298 * 1. We grabbed the ill_g_lock before inserting this ill into 18299 * the global list of ills. So no other thread could have located 18300 * this ill and hence the ipsq of this ill is guaranteed to be empty. 18301 * 2. Now locate the other protocol instance of this ill. 18302 * 3.
Now grab both ill locks in the right order, and the phyint lock of 18303 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 18304 * of neither ill can change. 18305 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 18306 * other ill. 18307 * 5. Release all locks. 18308 */ 18309 18310 /* 18311 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 18312 * we are initializing IPv4. 18313 */ 18314 if (phyi != NULL) { 18315 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 18316 ASSERT(ill_other->ill_phyint != NULL); 18317 ASSERT((isv6 && !ill_other->ill_isv6) || 18318 (!isv6 && ill_other->ill_isv6)); 18319 GRAB_ILL_LOCKS(ill, ill_other); 18320 /* 18321 * We are potentially throwing away phyint_flags which 18322 * could be different from the one that we obtain from 18323 * ill_other->ill_phyint. But it is okay as we are assuming 18324 * that the state maintained within IP is correct. 18325 */ 18326 mutex_enter(&phyi->phyint_lock); 18327 if (isv6) { 18328 ASSERT(phyi->phyint_illv6 == NULL); 18329 phyi->phyint_illv6 = ill; 18330 } else { 18331 ASSERT(phyi->phyint_illv4 == NULL); 18332 phyi->phyint_illv4 = ill; 18333 } 18334 18335 /* 18336 * Delete the old phyint and make its ipsq eligible 18337 * to be freed in ipsq_exit(). 18338 */ 18339 phyi_old->phyint_illv4 = NULL; 18340 phyi_old->phyint_illv6 = NULL; 18341 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 18342 phyi_old->phyint_name[0] = '\0'; 18343 mi_free(phyi_old); 18344 } else { 18345 mutex_enter(&ill->ill_lock); 18346 /* 18347 * We don't need to acquire any lock, since 18348 * the ill is not yet visible globally and we 18349 * have not yet released the ill_g_lock. 18350 */ 18351 phyi = phyi_old; 18352 mutex_enter(&phyi->phyint_lock); 18353 /* XXX We need a recovery strategy here. */ 18354 if (!phyint_assign_ifindex(phyi, ipst)) 18355 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 18356 18357 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18358 (void *)phyi, where); 18359 18360 (void) avl_find(&ipst->ips_phyint_g_list-> 18361 phyint_list_avl_by_index, 18362 &phyi->phyint_ifindex, &where); 18363 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 18364 (void *)phyi, where); 18365 } 18366 18367 /* 18368 * Reassigning ill_phyint automatically reassigns the ipsq also. 18369 * pending mp is not affected because that is per ill basis. 18370 */ 18371 ill->ill_phyint = phyi; 18372 18373 /* 18374 * Now that the phyint's ifindex has been assigned, complete the 18375 * remaining 18376 */ 18377 18378 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 18379 if (ill->ill_isv6) { 18380 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 18381 ill->ill_phyint->phyint_ifindex; 18382 ill->ill_mcast_type = ipst->ips_mld_max_version; 18383 } else { 18384 ill->ill_mcast_type = ipst->ips_igmp_max_version; 18385 } 18386 18387 /* 18388 * Generate an event within the hooks framework to indicate that 18389 * a new interface has just been added to IP. For this event to 18390 * be generated, the network interface must, at least, have an 18391 * ifindex assigned to it. (We don't generate the event for 18392 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 18393 * 18394 * This needs to be run inside the ill_g_lock perimeter to ensure 18395 * that the ordering of delivered events to listeners matches the 18396 * order of them in the kernel. 
18397 */ 18398 if (!IS_LOOPBACK(ill)) { 18399 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 18400 ill->ill_name_length); 18401 } 18402 RELEASE_ILL_LOCKS(ill, ill_other); 18403 mutex_exit(&phyi->phyint_lock); 18404 } 18405 18406 /* 18407 * Notify any downstream modules of the name of this interface. 18408 * An M_IOCTL is used even though we don't expect a successful reply. 18409 * Any reply message from the driver (presumably an M_IOCNAK) will 18410 * eventually get discarded somewhere upstream. The message format is 18411 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 18412 * to IP. 18413 */ 18414 static void 18415 ip_ifname_notify(ill_t *ill, queue_t *q) 18416 { 18417 mblk_t *mp1, *mp2; 18418 struct iocblk *iocp; 18419 struct lifreq *lifr; 18420 18421 mp1 = mkiocb(SIOCSLIFNAME); 18422 if (mp1 == NULL) 18423 return; 18424 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 18425 if (mp2 == NULL) { 18426 freeb(mp1); 18427 return; 18428 } 18429 18430 mp1->b_cont = mp2; 18431 iocp = (struct iocblk *)mp1->b_rptr; 18432 iocp->ioc_count = sizeof (struct lifreq); 18433 18434 lifr = (struct lifreq *)mp2->b_rptr; 18435 mp2->b_wptr += sizeof (struct lifreq); 18436 bzero(lifr, sizeof (struct lifreq)); 18437 18438 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 18439 lifr->lifr_ppa = ill->ill_ppa; 18440 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 18441 18442 putnext(q, mp1); 18443 } 18444 18445 static int 18446 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 18447 { 18448 int err; 18449 ip_stack_t *ipst = ill->ill_ipst; 18450 phyint_t *phyi = ill->ill_phyint; 18451 18452 /* Set the obsolete NDD per-interface forwarding name. */ 18453 err = ill_set_ndd_name(ill); 18454 if (err != 0) { 18455 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 18456 err); 18457 } 18458 18459 /* 18460 * Now that ill_name is set, the configuration for the IPMP 18461 * meta-interface can be performed. 18462 */ 18463 if (IS_IPMP(ill)) { 18464 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 18465 /* 18466 * If phyi->phyint_grp is NULL, then this is the first IPMP 18467 * meta-interface and we need to create the IPMP group. 18468 */ 18469 if (phyi->phyint_grp == NULL) { 18470 /* 18471 * If someone has renamed another IPMP group to have 18472 * the same name as our interface, bail. 18473 */ 18474 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 18475 rw_exit(&ipst->ips_ipmp_lock); 18476 return (EEXIST); 18477 } 18478 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 18479 if (phyi->phyint_grp == NULL) { 18480 rw_exit(&ipst->ips_ipmp_lock); 18481 return (ENOMEM); 18482 } 18483 } 18484 rw_exit(&ipst->ips_ipmp_lock); 18485 } 18486 18487 /* Tell downstream modules where they are. */ 18488 ip_ifname_notify(ill, q); 18489 18490 /* 18491 * ill_dl_phys returns EINPROGRESS in the usual case. 18492 * Error cases are ENOMEM ... 18493 */ 18494 err = ill_dl_phys(ill, ipif, mp, q); 18495 18496 /* 18497 * If there is no IRE expiration timer running, get one started. 18498 * igmp and mld timers will be triggered by the first multicast 18499 */ 18500 if (ipst->ips_ip_ire_expire_id == 0) { 18501 /* 18502 * acquire the lock and check again. 
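 * (This is the common unlocked-check/lock/recheck idiom: the unlocked
 * test above keeps every plumb from contending on
 * ips_ip_trash_timer_lock, and the second test under the lock guarantees
 * that at most one thread ever starts the timer.)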
18503 */
18504 mutex_enter(&ipst->ips_ip_trash_timer_lock);
18505 if (ipst->ips_ip_ire_expire_id == 0) {
18506 ipst->ips_ip_ire_expire_id = timeout(
18507 ip_trash_timer_expire, ipst,
18508 MSEC_TO_TICK(ipst->ips_ip_timer_interval));
18509 }
18510 mutex_exit(&ipst->ips_ip_trash_timer_lock);
18511 }
18512
18513 if (ill->ill_isv6) {
18514 mutex_enter(&ipst->ips_mld_slowtimeout_lock);
18515 if (ipst->ips_mld_slowtimeout_id == 0) {
18516 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
18517 (void *)ipst,
18518 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
18519 }
18520 mutex_exit(&ipst->ips_mld_slowtimeout_lock);
18521 } else {
18522 mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
18523 if (ipst->ips_igmp_slowtimeout_id == 0) {
18524 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
18525 (void *)ipst,
18526 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
18527 }
18528 mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
18529 }
18530
18531 return (err);
18532 }
18533
18534 /*
18535 * Common routine for ppa and ifname setting. Should be called exclusive.
18536 *
18537 * Returns EINPROGRESS when mp has been consumed by queueing it on
18538 * ill_pending_mp and the ioctl will complete in ip_rput.
18539 *
18540 * NOTE: If ppa is UINT_MAX, we assign the next valid ppa and return
18541 * the new name and new ppa in lifr_name and lifr_ppa respectively.
18542 * For SIOCSLIFNAME, we pass these values back to userland.
18543 */
18544 static int
18545 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
18546 {
18547 ill_t *ill;
18548 ipif_t *ipif;
18549 ipsq_t *ipsq;
18550 char *ppa_ptr;
18551 char *old_ptr;
18552 char old_char;
18553 int error;
18554 ip_stack_t *ipst;
18555
18556 ip1dbg(("ipif_set_values: interface %s\n", interf_name));
18557 ASSERT(q->q_next != NULL);
18558 ASSERT(interf_name != NULL);
18559
18560 ill = (ill_t *)q->q_ptr;
18561 ipst = ill->ill_ipst;
18562
18563 ASSERT(ill->ill_ipst != NULL);
18564 ASSERT(ill->ill_name[0] == '\0');
18565 ASSERT(IAM_WRITER_ILL(ill));
18566 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
18567 ASSERT(ill->ill_ppa == UINT_MAX);
18568
18569 /* The ppa is sent down by ifconfig or is chosen */
18570 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
18571 return (EINVAL);
18572 }
18573
18574 /*
18575 * Make sure the ppa passed in is the same as the ppa in the name.
18576 * This check is not made when ppa == UINT_MAX; in that case the ppa
18577 * in the name could be anything. The system will choose a ppa and
18578 * update new_ppa_ptr and interf_name to contain the chosen ppa.
18579 */
18580 if (*new_ppa_ptr != UINT_MAX) {
18581 /* stoi changes the pointer */
18582 old_ptr = ppa_ptr;
18583 /*
18584 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
18585 * (they don't have an externally visible ppa). We assign one
18586 * here so that we can manage the interface. Note that in
18587 * the past this value was always 0 for DLPI 1 drivers.
18588 */
18589 if (*new_ppa_ptr == 0)
18590 *new_ppa_ptr = stoi(&old_ptr);
18591 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
18592 return (EINVAL);
18593 }
18594 /*
18595 * Terminate the string before the ppa and
18596 * save the char at that location.
18597 */
18598 old_char = ppa_ptr[0];
18599 ppa_ptr[0] = '\0';
18600
18601 ill->ill_ppa = *new_ppa_ptr;
18602 /*
18603 * Finish as much work now as possible before calling ill_glist_insert
18604 * which makes the ill globally visible and also merges it with the
18605 * other protocol instance of this phyint.
The remaining work is
18606 * done after entering the ipsq, which may happen sometime later.
18607 * ill_set_ndd_name occurs after the ill has been made globally visible.
18608 */
18609 ipif = ill->ill_ipif;
18610
18611 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
18612 ipif_assign_seqid(ipif);
18613
18614 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
18615 ill->ill_flags |= ILLF_IPV4;
18616
18617 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */
18618 ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
18619
18620 if (ill->ill_flags & ILLF_IPV6) {
18621
18622 ill->ill_isv6 = B_TRUE;
18623 if (ill->ill_rq != NULL) {
18624 ill->ill_rq->q_qinfo = &iprinitv6;
18625 ill->ill_wq->q_qinfo = &ipwinitv6;
18626 }
18627
18628 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
18629 ipif->ipif_v6lcl_addr = ipv6_all_zeros;
18630 ipif->ipif_v6src_addr = ipv6_all_zeros;
18631 ipif->ipif_v6subnet = ipv6_all_zeros;
18632 ipif->ipif_v6net_mask = ipv6_all_zeros;
18633 ipif->ipif_v6brd_addr = ipv6_all_zeros;
18634 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
18635 /*
18636 * Point-to-point or non-multicast-capable
18637 * interfaces won't do NUD unless explicitly
18638 * configured to do so.
18639 */
18640 if (ipif->ipif_flags & IPIF_POINTOPOINT ||
18641 !(ill->ill_flags & ILLF_MULTICAST)) {
18642 ill->ill_flags |= ILLF_NONUD;
18643 }
18644 /* Make sure the IPv4-specific flag is not set on an IPv6 if */
18645 if (ill->ill_flags & ILLF_NOARP) {
18646 /*
18647 * Note: xresolv interfaces will eventually need
18648 * NOARP set here as well, but that will require
18649 * those external resolvers to have some
18650 * knowledge of that flag and act appropriately.
18651 * Not to be changed at present.
18652 */
18653 ill->ill_flags &= ~ILLF_NOARP;
18654 }
18655 /*
18656 * Set the ILLF_ROUTER flag according to the global
18657 * IPv6 forwarding policy.
18658 */
18659 if (ipst->ips_ipv6_forward != 0)
18660 ill->ill_flags |= ILLF_ROUTER;
18661 } else if (ill->ill_flags & ILLF_IPV4) {
18662 ill->ill_isv6 = B_FALSE;
18663 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
18664 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr);
18665 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
18666 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
18667 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
18668 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
18669 /*
18670 * Set the ILLF_ROUTER flag according to the global
18671 * IPv4 forwarding policy.
18672 */
18673 if (ipst->ips_ip_g_forward != 0)
18674 ill->ill_flags |= ILLF_ROUTER;
18675 }
18676
18677 ASSERT(ill->ill_phyint != NULL);
18678
18679 /*
18680 * The ipIfStatsIfIndex and ipv6IfIcmpIfIndex assignments will
18681 * be completed in ill_glist_insert -> ill_phyint_reinit
18682 */
18683 if (!ill_allocate_mibs(ill))
18684 return (ENOMEM);
18685
18686 /*
18687 * Pick a default sap until we get the DL_INFO_ACK back from
18688 * the driver.
18689 */
18690 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
18691 ill->ill_media->ip_m_ipv4sap;
18692
18693 ill->ill_ifname_pending = 1;
18694 ill->ill_ifname_pending_err = 0;
18695
18696 /*
18697 * When the first ipif comes up in ipif_up_done(), multicast groups
18698 * that were joined while this ill was not bound to the DLPI link need
18699 * to be recovered by ill_recover_multicast().
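 * (While the ill is unbound, the driver cannot be told about those
 * memberships -- presumably via DL_ENABMULTI_REQ -- so they exist only
 * in IP's own state and must be replayed against the link once bound.)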
18700 */
18701 ill->ill_need_recover_multicast = 1;
18702
18703 ill_refhold(ill);
18704 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
18705 if ((error = ill_glist_insert(ill, interf_name,
18706 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
18707 ill->ill_ppa = UINT_MAX;
18708 ill->ill_name[0] = '\0';
18709 /*
18710 * Undo null termination done above.
18711 */
18712 ppa_ptr[0] = old_char;
18713 rw_exit(&ipst->ips_ill_g_lock);
18714 ill_refrele(ill);
18715 return (error);
18716 }
18717
18718 ASSERT(ill->ill_name_length <= LIFNAMSIZ);
18719
18720 /*
18721 * When we return, the buffer pointed to by interf_name should contain
18722 * the same name as ill_name.
18723 * If a ppa was chosen by the system (the ppa passed in was UINT_MAX),
18724 * the buffer pointed to by new_ppa_ptr would not contain the right ppa,
18725 * so copy the full name and update the ppa ptr.
18726 * When the ppa passed in != UINT_MAX, all values are correct; just undo
18727 * the null termination. This saves a bcopy.
18728 */
18729 if (*new_ppa_ptr == UINT_MAX) {
18730 bcopy(ill->ill_name, interf_name, ill->ill_name_length);
18731 *new_ppa_ptr = ill->ill_ppa;
18732 } else {
18733 /*
18734 * Undo null termination done above.
18735 */
18736 ppa_ptr[0] = old_char;
18737 }
18738
18739 /* Let SCTP know about this ILL */
18740 sctp_update_ill(ill, SCTP_ILL_INSERT);
18741
18742 /*
18743 * ill_glist_insert has made the ill visible globally, and
18744 * ill_phyint_reinit could have changed the ipsq. At this point,
18745 * we need to hold the ips_ill_g_lock across the call to enter the
18746 * ipsq to enforce atomicity and prevent reordering. In the event
18747 * the ipsq has changed, and if the new ipsq is currently busy,
18748 * we need to make sure that this half-completed ioctl is ahead of
18749 * any subsequent ioctl. We achieve this by not dropping the
18750 * ips_ill_g_lock which prevents any ill lookup itself thereby
18751 * ensuring that new ioctls can't start.
18752 */
18753 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
18754 B_TRUE);
18755
18756 rw_exit(&ipst->ips_ill_g_lock);
18757 ill_refrele(ill);
18758 if (ipsq == NULL)
18759 return (EINPROGRESS);
18760
18761 /*
18762 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
18763 */
18764 if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
18765 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
18766 else
18767 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);
18768
18769 error = ipif_set_values_tail(ill, ipif, mp, q);
18770 ipsq_exit(ipsq);
18771 if (error != 0 && error != EINPROGRESS) {
18772 /*
18773 * Restore previous values.
18774 */
18775 ill->ill_isv6 = B_FALSE;
18776 }
18777 return (error);
18778 }
18779
18780 void
18781 ipif_init(ip_stack_t *ipst)
18782 {
18783 int i;
18784
18785 for (i = 0; i < MAX_G_HEADS; i++) {
18786 ipst->ips_ill_g_heads[i].ill_g_list_head =
18787 (ill_if_t *)&ipst->ips_ill_g_heads[i];
18788 ipst->ips_ill_g_heads[i].ill_g_list_tail =
18789 (ill_if_t *)&ipst->ips_ill_g_heads[i];
18790 }
18791
18792 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
18793 ill_phyint_compare_index,
18794 sizeof (phyint_t),
18795 offsetof(struct phyint, phyint_avl_by_index));
18796 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
18797 ill_phyint_compare_name,
18798 sizeof (phyint_t),
18799 offsetof(struct phyint, phyint_avl_by_name));
18800 }
18801
18802 /*
18803 * Look up the ipif corresponding to the onlink destination address.
For 18804 * point-to-point interfaces, it matches with remote endpoint destination 18805 * address. For point-to-multipoint interfaces it only tries to match the 18806 * destination with the interface's subnet address. The longest, most specific 18807 * match is found to take care of such rare network configurations like - 18808 * le0: 129.146.1.1/16 18809 * le1: 129.146.2.2/24 18810 * 18811 * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are 18812 * supported on underlying interfaces in an IPMP group, underlying interfaces 18813 * are ignored when looking up a match. (If we didn't ignore them, we'd 18814 * risk using a test address as a source for outgoing traffic.) 18815 */ 18816 ipif_t * 18817 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 18818 { 18819 ipif_t *ipif, *best_ipif; 18820 ill_t *ill; 18821 ill_walk_context_t ctx; 18822 18823 ASSERT(zoneid != ALL_ZONES); 18824 best_ipif = NULL; 18825 18826 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18827 ill = ILL_START_WALK_V4(&ctx, ipst); 18828 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18829 if (IS_UNDER_IPMP(ill)) 18830 continue; 18831 mutex_enter(&ill->ill_lock); 18832 for (ipif = ill->ill_ipif; ipif != NULL; 18833 ipif = ipif->ipif_next) { 18834 if (!IPIF_CAN_LOOKUP(ipif)) 18835 continue; 18836 if (ipif->ipif_zoneid != zoneid && 18837 ipif->ipif_zoneid != ALL_ZONES) 18838 continue; 18839 /* 18840 * Point-to-point case. Look for exact match with 18841 * destination address. 18842 */ 18843 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 18844 if (ipif->ipif_pp_dst_addr == addr) { 18845 ipif_refhold_locked(ipif); 18846 mutex_exit(&ill->ill_lock); 18847 rw_exit(&ipst->ips_ill_g_lock); 18848 if (best_ipif != NULL) 18849 ipif_refrele(best_ipif); 18850 return (ipif); 18851 } 18852 } else if (ipif->ipif_subnet == (addr & 18853 ipif->ipif_net_mask)) { 18854 /* 18855 * Point-to-multipoint case. Looping through to 18856 * find the most specific match. If there are 18857 * multiple best match ipif's then prefer ipif's 18858 * that are UP. If there is only one best match 18859 * ipif and it is DOWN we must still return it. 18860 */ 18861 if ((best_ipif == NULL) || 18862 (ipif->ipif_net_mask > 18863 best_ipif->ipif_net_mask) || 18864 ((ipif->ipif_net_mask == 18865 best_ipif->ipif_net_mask) && 18866 ((ipif->ipif_flags & IPIF_UP) && 18867 (!(best_ipif->ipif_flags & IPIF_UP))))) { 18868 ipif_refhold_locked(ipif); 18869 mutex_exit(&ill->ill_lock); 18870 rw_exit(&ipst->ips_ill_g_lock); 18871 if (best_ipif != NULL) 18872 ipif_refrele(best_ipif); 18873 best_ipif = ipif; 18874 rw_enter(&ipst->ips_ill_g_lock, 18875 RW_READER); 18876 mutex_enter(&ill->ill_lock); 18877 } 18878 } 18879 } 18880 mutex_exit(&ill->ill_lock); 18881 } 18882 rw_exit(&ipst->ips_ill_g_lock); 18883 return (best_ipif); 18884 } 18885 18886 /* 18887 * Save enough information so that we can recreate the IRE if 18888 * the interface goes down and then up. 
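 *
 * The saved entries are chained through b_cont off ipif_saved_ire_mp, so
 * a later replay (sketch only; the actual recovery code is elsewhere)
 * amounts to walking the chain and rebuilding one IRE per ifrt_t:
 *
 *	for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
 *		ifrt_t *ifrt = (ifrt_t *)mp->b_rptr;
 *		(recreate an IRE from ifrt_addr, ifrt_mask,
 *		ifrt_gateway_addr, ifrt_flags, ...)
 *	}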
18889 */ 18890 static void 18891 ipif_save_ire(ipif_t *ipif, ire_t *ire) 18892 { 18893 mblk_t *save_mp; 18894 18895 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 18896 if (save_mp != NULL) { 18897 ifrt_t *ifrt; 18898 18899 save_mp->b_wptr += sizeof (ifrt_t); 18900 ifrt = (ifrt_t *)save_mp->b_rptr; 18901 bzero(ifrt, sizeof (ifrt_t)); 18902 ifrt->ifrt_type = ire->ire_type; 18903 ifrt->ifrt_addr = ire->ire_addr; 18904 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 18905 ifrt->ifrt_src_addr = ire->ire_src_addr; 18906 ifrt->ifrt_mask = ire->ire_mask; 18907 ifrt->ifrt_flags = ire->ire_flags; 18908 ifrt->ifrt_max_frag = ire->ire_max_frag; 18909 mutex_enter(&ipif->ipif_saved_ire_lock); 18910 save_mp->b_cont = ipif->ipif_saved_ire_mp; 18911 ipif->ipif_saved_ire_mp = save_mp; 18912 ipif->ipif_saved_ire_cnt++; 18913 mutex_exit(&ipif->ipif_saved_ire_lock); 18914 } 18915 } 18916 18917 static void 18918 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 18919 { 18920 mblk_t **mpp; 18921 mblk_t *mp; 18922 ifrt_t *ifrt; 18923 18924 /* Remove from ipif_saved_ire_mp list if it is there */ 18925 mutex_enter(&ipif->ipif_saved_ire_lock); 18926 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 18927 mpp = &(*mpp)->b_cont) { 18928 /* 18929 * On a given ipif, the triple of address, gateway and 18930 * mask is unique for each saved IRE (in the case of 18931 * ordinary interface routes, the gateway address is 18932 * all-zeroes). 18933 */ 18934 mp = *mpp; 18935 ifrt = (ifrt_t *)mp->b_rptr; 18936 if (ifrt->ifrt_addr == ire->ire_addr && 18937 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 18938 ifrt->ifrt_mask == ire->ire_mask) { 18939 *mpp = mp->b_cont; 18940 ipif->ipif_saved_ire_cnt--; 18941 freeb(mp); 18942 break; 18943 } 18944 } 18945 mutex_exit(&ipif->ipif_saved_ire_lock); 18946 } 18947 18948 /* 18949 * IP multirouting broadcast routes handling 18950 * Append CGTP broadcast IREs to regular ones created 18951 * at ifconfig time. 18952 */ 18953 static void 18954 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) 18955 { 18956 ire_t *ire_prim; 18957 18958 ASSERT(ire != NULL); 18959 ASSERT(ire_dst != NULL); 18960 18961 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 18962 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 18963 if (ire_prim != NULL) { 18964 /* 18965 * We are in the special case of broadcasts for 18966 * CGTP. We add an IRE_BROADCAST that holds 18967 * the RTF_MULTIRT flag, the destination 18968 * address of ire_dst and the low level 18969 * info of ire_prim. In other words, CGTP 18970 * broadcast is added to the redundant ipif. 
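 * (RTF_MULTIRT is the multirouting flag: routes so marked are treated as
 * parallel routes to the same destination, so the broadcast is in effect
 * replicated over the redundant CGTP links rather than sent just once.)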
18971 */ 18972 ipif_t *ipif_prim; 18973 ire_t *bcast_ire; 18974 18975 ipif_prim = ire_prim->ire_ipif; 18976 18977 ip2dbg(("ip_cgtp_filter_bcast_add: " 18978 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 18979 (void *)ire_dst, (void *)ire_prim, 18980 (void *)ipif_prim)); 18981 18982 bcast_ire = ire_create( 18983 (uchar_t *)&ire->ire_addr, 18984 (uchar_t *)&ip_g_all_ones, 18985 (uchar_t *)&ire_dst->ire_src_addr, 18986 (uchar_t *)&ire->ire_gateway_addr, 18987 &ipif_prim->ipif_mtu, 18988 NULL, 18989 ipif_prim->ipif_rq, 18990 ipif_prim->ipif_wq, 18991 IRE_BROADCAST, 18992 ipif_prim, 18993 0, 18994 0, 18995 0, 18996 ire->ire_flags, 18997 &ire_uinfo_null, 18998 NULL, 18999 NULL, 19000 ipst); 19001 19002 if (bcast_ire != NULL) { 19003 19004 if (ire_add(&bcast_ire, NULL, NULL, NULL, 19005 B_FALSE) == 0) { 19006 ip2dbg(("ip_cgtp_filter_bcast_add: " 19007 "added bcast_ire %p\n", 19008 (void *)bcast_ire)); 19009 19010 ipif_save_ire(bcast_ire->ire_ipif, 19011 bcast_ire); 19012 ire_refrele(bcast_ire); 19013 } 19014 } 19015 ire_refrele(ire_prim); 19016 } 19017 } 19018 19019 /* 19020 * IP multirouting broadcast routes handling 19021 * Remove the broadcast ire 19022 */ 19023 static void 19024 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 19025 { 19026 ire_t *ire_dst; 19027 19028 ASSERT(ire != NULL); 19029 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 19030 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19031 if (ire_dst != NULL) { 19032 ire_t *ire_prim; 19033 19034 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 19035 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19036 if (ire_prim != NULL) { 19037 ipif_t *ipif_prim; 19038 ire_t *bcast_ire; 19039 19040 ipif_prim = ire_prim->ire_ipif; 19041 19042 ip2dbg(("ip_cgtp_filter_bcast_delete: " 19043 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 19044 (void *)ire_dst, (void *)ire_prim, 19045 (void *)ipif_prim)); 19046 19047 bcast_ire = ire_ctable_lookup(ire->ire_addr, 19048 ire->ire_gateway_addr, 19049 IRE_BROADCAST, 19050 ipif_prim, ALL_ZONES, 19051 NULL, 19052 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 19053 MATCH_IRE_MASK, ipst); 19054 19055 if (bcast_ire != NULL) { 19056 ip2dbg(("ip_cgtp_filter_bcast_delete: " 19057 "looked up bcast_ire %p\n", 19058 (void *)bcast_ire)); 19059 ipif_remove_ire(bcast_ire->ire_ipif, 19060 bcast_ire); 19061 ire_delete(bcast_ire); 19062 ire_refrele(bcast_ire); 19063 } 19064 ire_refrele(ire_prim); 19065 } 19066 ire_refrele(ire_dst); 19067 } 19068 } 19069 19070 /* 19071 * IPsec hardware acceleration capabilities related functions. 19072 */ 19073 19074 /* 19075 * Free a per-ill IPsec capabilities structure. 19076 */ 19077 static void 19078 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 19079 { 19080 if (capab->auth_hw_algs != NULL) 19081 kmem_free(capab->auth_hw_algs, capab->algs_size); 19082 if (capab->encr_hw_algs != NULL) 19083 kmem_free(capab->encr_hw_algs, capab->algs_size); 19084 if (capab->encr_algparm != NULL) 19085 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 19086 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 19087 } 19088 19089 /* 19090 * Allocate a new per-ill IPsec capabilities structure. This structure 19091 * is specific to an IPsec protocol (AH or ESP). It is implemented as 19092 * an array which specifies, for each algorithm, whether this algorithm 19093 * is supported by the ill or not. 
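 *
 * Illustrative sizing (hypothetical values): with a 32-bit
 * ipsec_capab_elem_t and MAX_IPSEC_ALGS of 256, BITS() yields 32, so each
 * of the auth and encr bitmaps below is 256 / 32 = 8 elements (32 bytes).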
19094 */ 19095 static ill_ipsec_capab_t * 19096 ill_ipsec_capab_alloc(void) 19097 { 19098 ill_ipsec_capab_t *capab; 19099 uint_t nelems; 19100 19101 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 19102 if (capab == NULL) 19103 return (NULL); 19104 19105 /* we need one bit per algorithm */ 19106 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 19107 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 19108 19109 /* allocate memory to store algorithm flags */ 19110 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 19111 if (capab->encr_hw_algs == NULL) 19112 goto nomem; 19113 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 19114 if (capab->auth_hw_algs == NULL) 19115 goto nomem; 19116 /* 19117 * Leave encr_algparm NULL for now since we won't need it half 19118 * the time 19119 */ 19120 return (capab); 19121 19122 nomem: 19123 ill_ipsec_capab_free(capab); 19124 return (NULL); 19125 } 19126 19127 /* 19128 * Resize capability array. Since we're exclusive, this is OK. 19129 */ 19130 static boolean_t 19131 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 19132 { 19133 ipsec_capab_algparm_t *nalp, *oalp; 19134 uint32_t olen, nlen; 19135 19136 oalp = capab->encr_algparm; 19137 olen = capab->encr_algparm_size; 19138 19139 if (oalp != NULL) { 19140 if (algid < capab->encr_algparm_end) 19141 return (B_TRUE); 19142 } 19143 19144 nlen = (algid + 1) * sizeof (*nalp); 19145 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 19146 if (nalp == NULL) 19147 return (B_FALSE); 19148 19149 if (oalp != NULL) { 19150 bcopy(oalp, nalp, olen); 19151 kmem_free(oalp, olen); 19152 } 19153 capab->encr_algparm = nalp; 19154 capab->encr_algparm_size = nlen; 19155 capab->encr_algparm_end = algid + 1; 19156 19157 return (B_TRUE); 19158 } 19159 19160 /* 19161 * Compare the capabilities of the specified ill with the protocol 19162 * and algorithms specified by the SA passed as argument. 19163 * If they match, returns B_TRUE, B_FALSE if they do not match. 19164 * 19165 * The ill can be passed as a pointer to it, or by specifying its index 19166 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 19167 * 19168 * Called by ipsec_out_is_accelerated() do decide whether an outbound 19169 * packet is eligible for hardware acceleration, and by 19170 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down 19171 * to a particular ill. 19172 */ 19173 boolean_t 19174 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 19175 ipsa_t *sa, netstack_t *ns) 19176 { 19177 boolean_t sa_isv6; 19178 uint_t algid; 19179 struct ill_ipsec_capab_s *cpp; 19180 boolean_t need_refrele = B_FALSE; 19181 ip_stack_t *ipst = ns->netstack_ip; 19182 19183 if (ill == NULL) { 19184 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 19185 NULL, NULL, NULL, ipst); 19186 if (ill == NULL) { 19187 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 19188 return (B_FALSE); 19189 } 19190 need_refrele = B_TRUE; 19191 } 19192 19193 /* 19194 * Use the address length specified by the SA to determine 19195 * if it corresponds to a IPv6 address, and fail the matching 19196 * if the isv6 flag passed as argument does not match. 19197 * Note: this check is used for SADB capability checking before 19198 * sending SA information to an ill. 
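 * For example, an SA with ipsa_addrfam of AF_INET6 offered against an
 * IPv4 ill fails here immediately, before any algorithm or key-length
 * checks are made.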
19199 */ 19200 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 19201 if (sa_isv6 != ill_isv6) 19202 /* protocol mismatch */ 19203 goto done; 19204 19205 /* 19206 * Check if the ill supports the protocol, algorithm(s) and 19207 * key size(s) specified by the SA, and get the pointers to 19208 * the algorithms supported by the ill. 19209 */ 19210 switch (sa->ipsa_type) { 19211 19212 case SADB_SATYPE_ESP: 19213 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 19214 /* ill does not support ESP acceleration */ 19215 goto done; 19216 cpp = ill->ill_ipsec_capab_esp; 19217 algid = sa->ipsa_auth_alg; 19218 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 19219 goto done; 19220 algid = sa->ipsa_encr_alg; 19221 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 19222 goto done; 19223 if (algid < cpp->encr_algparm_end) { 19224 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 19225 if (sa->ipsa_encrkeybits < alp->minkeylen) 19226 goto done; 19227 if (sa->ipsa_encrkeybits > alp->maxkeylen) 19228 goto done; 19229 } 19230 break; 19231 19232 case SADB_SATYPE_AH: 19233 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 19234 /* ill does not support AH acceleration */ 19235 goto done; 19236 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 19237 ill->ill_ipsec_capab_ah->auth_hw_algs)) 19238 goto done; 19239 break; 19240 } 19241 19242 if (need_refrele) 19243 ill_refrele(ill); 19244 return (B_TRUE); 19245 done: 19246 if (need_refrele) 19247 ill_refrele(ill); 19248 return (B_FALSE); 19249 } 19250 19251 /* 19252 * Add a new ill to the list of IPsec capable ills. 19253 * Called from ill_capability_ipsec_ack() when an ACK was received 19254 * indicating that IPsec hardware processing was enabled for an ill. 19255 * 19256 * ill must point to the ill for which acceleration was enabled. 19257 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 19258 */ 19259 static void 19260 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 19261 { 19262 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 19263 uint_t sa_type; 19264 uint_t ipproto; 19265 ip_stack_t *ipst = ill->ill_ipst; 19266 19267 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 19268 (dl_cap == DL_CAPAB_IPSEC_ESP)); 19269 19270 switch (dl_cap) { 19271 case DL_CAPAB_IPSEC_AH: 19272 sa_type = SADB_SATYPE_AH; 19273 ills = &ipst->ips_ipsec_capab_ills_ah; 19274 ipproto = IPPROTO_AH; 19275 break; 19276 case DL_CAPAB_IPSEC_ESP: 19277 sa_type = SADB_SATYPE_ESP; 19278 ills = &ipst->ips_ipsec_capab_ills_esp; 19279 ipproto = IPPROTO_ESP; 19280 break; 19281 } 19282 19283 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 19284 19285 /* 19286 * Add ill index to list of hardware accelerators. If 19287 * already in list, do nothing. 
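 * Entries are keyed by the (ill_index, ill_isv6) pair -- matching the
 * walk below -- so the v4 and v6 ills plumbed over one physical device
 * each get their own entry.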
19288 */ 19289 for (cur_ill = *ills; cur_ill != NULL && 19290 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 19291 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 19292 ; 19293 19294 if (cur_ill == NULL) { 19295 /* if this is a new entry for this ill */ 19296 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 19297 if (new_ill == NULL) { 19298 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19299 return; 19300 } 19301 19302 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 19303 new_ill->ill_isv6 = ill->ill_isv6; 19304 new_ill->next = *ills; 19305 *ills = new_ill; 19306 } else if (!sadb_resync) { 19307 /* not resync'ing SADB and an entry exists for this ill */ 19308 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19309 return; 19310 } 19311 19312 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19313 19314 if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 19315 /* 19316 * IPsec module for protocol loaded, initiate dump 19317 * of the SADB to this ill. 19318 */ 19319 sadb_ill_download(ill, sa_type); 19320 } 19321 19322 /* 19323 * Remove an ill from the list of IPsec capable ills. 19324 */ 19325 static void 19326 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 19327 { 19328 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 19329 ip_stack_t *ipst = ill->ill_ipst; 19330 19331 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 19332 dl_cap == DL_CAPAB_IPSEC_ESP); 19333 19334 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah : 19335 &ipst->ips_ipsec_capab_ills_esp; 19336 19337 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 19338 19339 prev_ill = NULL; 19340 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 19341 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 19342 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 19343 ; 19344 if (cur_ill == NULL) { 19345 /* entry not found */ 19346 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19347 return; 19348 } 19349 if (prev_ill == NULL) { 19350 /* entry at front of list */ 19351 *ills = NULL; 19352 } else { 19353 prev_ill->next = cur_ill->next; 19354 } 19355 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 19356 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19357 } 19358 19359 /* 19360 * Called by SADB to send a DL_CONTROL_REQ message to every ill 19361 * supporting the specified IPsec protocol acceleration. 19362 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 19363 * We free the mblk and, if sa is non-null, release the held referece. 19364 */ 19365 void 19366 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, 19367 netstack_t *ns) 19368 { 19369 ipsec_capab_ill_t *ici, *cur_ici; 19370 ill_t *ill; 19371 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 19372 ip_stack_t *ipst = ns->netstack_ip; 19373 19374 ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah : 19375 ipst->ips_ipsec_capab_ills_esp; 19376 19377 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER); 19378 19379 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 19380 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 19381 cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst); 19382 19383 /* 19384 * Handle the case where the ill goes away while the SADB is 19385 * attempting to send messages. If it's going away, it's 19386 * nuking its shadow SADB, so we don't care.. 19387 */ 19388 19389 if (ill == NULL) 19390 continue; 19391 19392 if (sa != NULL) { 19393 /* 19394 * Make sure capabilities match before 19395 * sending SA to ill. 
19396 */ 19397 if (!ipsec_capab_match(ill, cur_ici->ill_index, 19398 cur_ici->ill_isv6, sa, ipst->ips_netstack)) { 19399 ill_refrele(ill); 19400 continue; 19401 } 19402 19403 mutex_enter(&sa->ipsa_lock); 19404 sa->ipsa_flags |= IPSA_F_HW; 19405 mutex_exit(&sa->ipsa_lock); 19406 } 19407 19408 /* 19409 * Copy template message, and add it to the front 19410 * of the mblk ship list. We want to avoid holding 19411 * the ipsec_capab_ills_lock while sending the 19412 * message to the ills. 19413 * 19414 * The b_next and b_prev are temporarily used 19415 * to build a list of mblks to be sent down, and to 19416 * save the ill to which they must be sent. 19417 */ 19418 nmp = copymsg(mp); 19419 if (nmp == NULL) { 19420 ill_refrele(ill); 19421 continue; 19422 } 19423 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 19424 nmp->b_next = mp_ship_list; 19425 mp_ship_list = nmp; 19426 nmp->b_prev = (mblk_t *)ill; 19427 } 19428 19429 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19430 19431 for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) { 19432 /* restore the mblk to a sane state */ 19433 next_mp = nmp->b_next; 19434 nmp->b_next = NULL; 19435 ill = (ill_t *)nmp->b_prev; 19436 nmp->b_prev = NULL; 19437 19438 ill_dlpi_send(ill, nmp); 19439 ill_refrele(ill); 19440 } 19441 19442 if (sa != NULL) 19443 IPSA_REFRELE(sa); 19444 freemsg(mp); 19445 } 19446 19447 /* 19448 * Derive an interface id from the link layer address. 19449 * Knows about IEEE 802 and IEEE EUI-64 mappings. 19450 */ 19451 static void 19452 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19453 { 19454 char *addr; 19455 19456 /* 19457 * Note that some IPv6 interfaces get plumbed over links that claim to 19458 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 19459 * PPP links). The ETHERADDRL check here ensures that we only set the 19460 * interface ID on IPv6 interfaces above links that actually have real 19461 * Ethernet addresses. 19462 */ 19463 if (ill->ill_phys_addr_length == ETHERADDRL) { 19464 /* Form EUI-64 like address */ 19465 addr = (char *)&v6addr->s6_addr32[2]; 19466 bcopy(ill->ill_phys_addr, addr, 3); 19467 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 19468 addr[3] = (char)0xff; 19469 addr[4] = (char)0xfe; 19470 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 19471 } 19472 } 19473 19474 /* ARGSUSED */ 19475 static void 19476 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19477 { 19478 } 19479 19480 typedef struct ipmp_ifcookie { 19481 uint32_t ic_hostid; 19482 char ic_ifname[LIFNAMSIZ]; 19483 char ic_zonename[ZONENAME_MAX]; 19484 } ipmp_ifcookie_t; 19485 19486 /* 19487 * Construct a pseudo-random interface ID for the IPMP interface that's both 19488 * predictable and (almost) guaranteed to be unique. 19489 */ 19490 static void 19491 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19492 { 19493 zone_t *zp; 19494 uint8_t *addr; 19495 uchar_t hash[16]; 19496 ulong_t hostid; 19497 MD5_CTX ctx; 19498 ipmp_ifcookie_t ic = { 0 }; 19499 19500 ASSERT(IS_IPMP(ill)); 19501 19502 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 19503 ic.ic_hostid = htonl((uint32_t)hostid); 19504 19505 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 19506 19507 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 19508 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 19509 zone_rele(zp); 19510 } 19511 19512 MD5Init(&ctx); 19513 MD5Update(&ctx, &ic, sizeof (ic)); 19514 MD5Final(hash, &ctx); 19515 19516 /* 19517 * Map the hash to an interface ID per the basic approach in RFC3041. 
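 * The low 8 bytes of the hash become the interface ID, and the 0x02
 * universal/local bit of the leading byte is cleared to mark the ID as
 * locally administered rather than derived from a globally unique MAC.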
19518 */ 19519 addr = &v6addr->s6_addr8[8]; 19520 bcopy(hash + 8, addr, sizeof (uint64_t)); 19521 addr[0] &= ~0x2; /* set local bit */ 19522 } 19523 19524 /* ARGSUSED */ 19525 static boolean_t 19526 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19527 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19528 { 19529 /* 19530 * Multicast address mappings used over Ethernet/802.X. 19531 * This address is used as a base for mappings. 19532 */ 19533 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 19534 0x00, 0x00, 0x00}; 19535 19536 /* 19537 * Extract low order 32 bits from IPv6 multicast address. 19538 * Or that into the link layer address, starting from the 19539 * second byte. 19540 */ 19541 *hw_start = 2; 19542 v6_extract_mask->s6_addr32[0] = 0; 19543 v6_extract_mask->s6_addr32[1] = 0; 19544 v6_extract_mask->s6_addr32[2] = 0; 19545 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 19546 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 19547 return (B_TRUE); 19548 } 19549 19550 /* 19551 * Indicate by return value whether multicast is supported. If not, 19552 * this code should not touch/change any parameters. 19553 */ 19554 /* ARGSUSED */ 19555 static boolean_t 19556 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19557 uint32_t *hw_start, ipaddr_t *extract_mask) 19558 { 19559 /* 19560 * Multicast address mappings used over Ethernet/802.X. 19561 * This address is used as a base for mappings. 19562 */ 19563 static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, 19564 0x00, 0x00, 0x00 }; 19565 19566 if (phys_length != ETHERADDRL) 19567 return (B_FALSE); 19568 19569 *extract_mask = htonl(0x007fffff); 19570 *hw_start = 2; 19571 bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); 19572 return (B_TRUE); 19573 } 19574 19575 /* ARGSUSED */ 19576 static boolean_t 19577 ip_nodef_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19578 uint32_t *hw_start, ipaddr_t *extract_mask) 19579 { 19580 return (B_FALSE); 19581 } 19582 19583 /* ARGSUSED */ 19584 static boolean_t 19585 ip_nodef_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19586 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19587 { 19588 return (B_FALSE); 19589 } 19590 19591 /* 19592 * Derive IPoIB interface id from the link layer address. 19593 */ 19594 static void 19595 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19596 { 19597 char *addr; 19598 19599 ASSERT(ill->ill_phys_addr_length == 20); 19600 addr = (char *)&v6addr->s6_addr32[2]; 19601 bcopy(ill->ill_phys_addr + 12, addr, 8); 19602 /* 19603 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 19604 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 19605 * rules. In these cases, the IBA considers these GUIDs to be in 19606 * "Modified EUI-64" format, and thus toggling the u/l bit is not 19607 * required; vendors are required not to assign global EUI-64's 19608 * that differ only in u/l bit values, thus guaranteeing uniqueness 19609 * of the interface identifier. Whether the GUID is in modified 19610 * or proper EUI-64 format, the ipv6 identifier must have the u/l 19611 * bit set to 1. 19612 */ 19613 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 19614 } 19615 19616 /* 19617 * Note on mapping from multicast IP addresses to IPoIB multicast link 19618 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 19619 * The format of an IPoIB multicast address is: 19620 * 19621 * 4 byte QPN Scope Sign. 
Pkey 19622 * +--------------------------------------------+ 19623 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 19624 * +--------------------------------------------+ 19625 * 19626 * The Scope and Pkey components are properties of the IBA port and 19627 * network interface. They can be ascertained from the broadcast address. 19628 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 19629 */ 19630 19631 static boolean_t 19632 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19633 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19634 { 19635 /* 19636 * Base IPoIB IPv6 multicast address used for mappings. 19637 * Does not contain the IBA scope/Pkey values. 19638 */ 19639 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 19640 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 19641 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 19642 19643 /* 19644 * Extract low order 80 bits from IPv6 multicast address. 19645 * Or that into the link layer address, starting from the 19646 * sixth byte. 19647 */ 19648 *hw_start = 6; 19649 bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); 19650 19651 /* 19652 * Now fill in the IBA scope/Pkey values from the broadcast address. 19653 */ 19654 *(maddr + 5) = *(bphys_addr + 5); 19655 *(maddr + 8) = *(bphys_addr + 8); 19656 *(maddr + 9) = *(bphys_addr + 9); 19657 19658 v6_extract_mask->s6_addr32[0] = 0; 19659 v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); 19660 v6_extract_mask->s6_addr32[2] = 0xffffffffU; 19661 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 19662 return (B_TRUE); 19663 } 19664 19665 static boolean_t 19666 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19667 uint32_t *hw_start, ipaddr_t *extract_mask) 19668 { 19669 /* 19670 * Base IPoIB IPv4 multicast address used for mappings. 19671 * Does not contain the IBA scope/Pkey values. 19672 */ 19673 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 19674 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 19675 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 19676 19677 if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) 19678 return (B_FALSE); 19679 19680 /* 19681 * Extract low order 28 bits from IPv4 multicast address. 19682 * Or that into the link layer address, starting from the 19683 * sixteenth byte. 19684 */ 19685 *extract_mask = htonl(0x0fffffff); 19686 *hw_start = 16; 19687 bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); 19688 19689 /* 19690 * Now fill in the IBA scope/Pkey values from the broadcast address. 19691 */ 19692 *(maddr + 5) = *(bphys_addr + 5); 19693 *(maddr + 8) = *(bphys_addr + 8); 19694 *(maddr + 9) = *(bphys_addr + 9); 19695 return (B_TRUE); 19696 } 19697 19698 /* 19699 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4 19700 * tunnel). The IPv4 address simply get placed in the lower 4 bytes of the 19701 * IPv6 interface id. This is a suggested mechanism described in section 3.7 19702 * of RFC4213. 19703 */ 19704 static void 19705 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 19706 { 19707 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t)); 19708 v6addr->s6_addr32[2] = 0; 19709 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t)); 19710 } 19711 19712 /* 19713 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6 19714 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface 19715 * id. 
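 * For example (hypothetical address), an IPv6 link-layer address of
 * fe80::123:4567:89ab:cdef would yield the interface id
 * 0123:4567:89ab:cdef.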
19716 */ 19717 static void 19718 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 19719 { 19720 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr; 19721 19722 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t)); 19723 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8); 19724 } 19725 19726 static void 19727 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19728 { 19729 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr); 19730 } 19731 19732 static void 19733 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 19734 { 19735 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr); 19736 } 19737 19738 static void 19739 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19740 { 19741 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr); 19742 } 19743 19744 static void 19745 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 19746 { 19747 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr); 19748 } 19749 19750 /* 19751 * Returns B_TRUE if an ipif is present in the given zone, matching some flags 19752 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. 19753 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with 19754 * the link-local address is preferred. 19755 */ 19756 boolean_t 19757 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 19758 { 19759 ipif_t *ipif; 19760 ipif_t *maybe_ipif = NULL; 19761 19762 mutex_enter(&ill->ill_lock); 19763 if (ill->ill_state_flags & ILL_CONDEMNED) { 19764 mutex_exit(&ill->ill_lock); 19765 if (ipifp != NULL) 19766 *ipifp = NULL; 19767 return (B_FALSE); 19768 } 19769 19770 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19771 if (!IPIF_CAN_LOOKUP(ipif)) 19772 continue; 19773 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 19774 ipif->ipif_zoneid != ALL_ZONES) 19775 continue; 19776 if ((ipif->ipif_flags & flags) != flags) 19777 continue; 19778 19779 if (ipifp == NULL) { 19780 mutex_exit(&ill->ill_lock); 19781 ASSERT(maybe_ipif == NULL); 19782 return (B_TRUE); 19783 } 19784 if (!ill->ill_isv6 || 19785 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { 19786 ipif_refhold_locked(ipif); 19787 mutex_exit(&ill->ill_lock); 19788 *ipifp = ipif; 19789 return (B_TRUE); 19790 } 19791 if (maybe_ipif == NULL) 19792 maybe_ipif = ipif; 19793 } 19794 if (ipifp != NULL) { 19795 if (maybe_ipif != NULL) 19796 ipif_refhold_locked(maybe_ipif); 19797 *ipifp = maybe_ipif; 19798 } 19799 mutex_exit(&ill->ill_lock); 19800 return (maybe_ipif != NULL); 19801 } 19802 19803 /* 19804 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 19805 * If a pointer to an ipif_t is returned then the caller will need to do 19806 * an ill_refrele(). 
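 * (More precisely: the ipif is returned held and the caller releases it
 * with ipif_refrele(); the ill reference taken during the lookup is
 * dropped internally before return.)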
19807 */
19808 ipif_t *
19809 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
19810 ip_stack_t *ipst)
19811 {
19812 ipif_t *ipif;
19813 ill_t *ill;
19814
19815 ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
19816 ipst);
19817 if (ill == NULL)
19818 return (NULL);
19819
19820 mutex_enter(&ill->ill_lock);
19821 if (ill->ill_state_flags & ILL_CONDEMNED) {
19822 mutex_exit(&ill->ill_lock);
19823 ill_refrele(ill);
19824 return (NULL);
19825 }
19826
19827 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
19828 if (!IPIF_CAN_LOOKUP(ipif))
19829 continue;
19830 if (lifidx == ipif->ipif_id) {
19831 ipif_refhold_locked(ipif);
19832 break;
19833 }
19834 }
19835
19836 mutex_exit(&ill->ill_lock);
19837 ill_refrele(ill);
19838 return (ipif);
19839 }
19840
19841 /*
19842 * Flush the fastpath by deleting any nce's that are waiting for the
19843 * fastpath. There is one exception: IRE_BROADCAST entries are difficult
19844 * to recreate, so instead we just nuke their nce_fp_mp's; see
19845 * ndp_fastpath_flush() for details.
19846 */
19847 void
19848 ill_fastpath_flush(ill_t *ill)
19849 {
19850 ip_stack_t *ipst = ill->ill_ipst;
19851
19852 nce_fastpath_list_dispatch(ill, NULL, NULL);
19853 ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4),
19854 ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE);
19855 }
19856
19857 /*
19858 * Set the physical address information for `ill' to the contents of the
19859 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
19860 * asynchronous if `ill' cannot immediately be quiesced -- in which case
19861 * EINPROGRESS will be returned.
19862 */
19863 int
19864 ill_set_phys_addr(ill_t *ill, mblk_t *mp)
19865 {
19866 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
19867 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;
19868
19869 ASSERT(IAM_WRITER_IPSQ(ipsq));
19870
19871 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
19872 dlindp->dl_data != DL_CURR_DEST_ADDR &&
19873 dlindp->dl_data != DL_CURR_PHYS_ADDR) {
19874 /* Changing DL_IPV6_TOKEN is not yet supported */
19875 return (0);
19876 }
19877
19878 /*
19879 * We need to store up to two copies of `mp' in `ill'. Due to the
19880 * design of ipsq_pending_mp_add(), we can't pass them as separate
19881 * arguments to ill_set_phys_addr_tail(). Instead, chain them
19882 * together here, then pull 'em apart in ill_set_phys_addr_tail().
19883 */
19884 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
19885 freemsg(mp);
19886 return (ENOMEM);
19887 }
19888
19889 ipsq_current_start(ipsq, ill->ill_ipif, 0);
19890
19891 /*
19892 * If we can quiesce the ill, then set the address. If not, then
19893 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
19894 */
19895 ill_down_ipifs(ill, B_TRUE);
19896 mutex_enter(&ill->ill_lock);
19897 if (!ill_is_quiescent(ill)) {
19898 /* call cannot fail since `conn_t *' argument is NULL */
19899 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
19900 mp, ILL_DOWN);
19901 mutex_exit(&ill->ill_lock);
19902 return (EINPROGRESS);
19903 }
19904 mutex_exit(&ill->ill_lock);
19905
19906 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
19907 return (0);
19908 }
19909
19910 /*
19911 * Once the ill associated with `q' has quiesced, set its physical address
19912 * information to the values in `addrmp'.
Note that two copies of `addrmp' 19913 * are passed (linked by b_cont), since we sometimes need to save two distinct 19914 * copies in the ill_t, and our context doesn't permit sleeping or allocation 19915 * failure (we'll free the other copy if it's not needed). Since the ill_t 19916 * is quiesced, we know any stale IREs with the old address information have 19917 * already been removed, so we don't need to call ill_fastpath_flush(). 19918 */ 19919 /* ARGSUSED */ 19920 static void 19921 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 19922 { 19923 ill_t *ill = q->q_ptr; 19924 mblk_t *addrmp2 = unlinkb(addrmp); 19925 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 19926 uint_t addrlen, addroff; 19927 19928 ASSERT(IAM_WRITER_IPSQ(ipsq)); 19929 19930 addroff = dlindp->dl_addr_offset; 19931 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 19932 19933 switch (dlindp->dl_data) { 19934 case DL_IPV6_LINK_LAYER_ADDR: 19935 ill_set_ndmp(ill, addrmp, addroff, addrlen); 19936 freemsg(addrmp2); 19937 break; 19938 19939 case DL_CURR_DEST_ADDR: 19940 freemsg(ill->ill_dest_addr_mp); 19941 ill->ill_dest_addr = addrmp->b_rptr + addroff; 19942 ill->ill_dest_addr_mp = addrmp; 19943 if (ill->ill_isv6) { 19944 ill_setdesttoken(ill); 19945 ipif_setdestlinklocal(ill->ill_ipif); 19946 } 19947 freemsg(addrmp2); 19948 break; 19949 19950 case DL_CURR_PHYS_ADDR: 19951 freemsg(ill->ill_phys_addr_mp); 19952 ill->ill_phys_addr = addrmp->b_rptr + addroff; 19953 ill->ill_phys_addr_mp = addrmp; 19954 ill->ill_phys_addr_length = addrlen; 19955 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 19956 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 19957 else 19958 freemsg(addrmp2); 19959 if (ill->ill_isv6) { 19960 ill_setdefaulttoken(ill); 19961 ipif_setlinklocal(ill->ill_ipif); 19962 } 19963 break; 19964 default: 19965 ASSERT(0); 19966 } 19967 19968 /* 19969 * If there are ipifs to bring up, ill_up_ipifs() will return 19970 * EINPROGRESS, and ipsq_current_finish() will be called by 19971 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is 19972 * brought up. 19973 */ 19974 if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS) 19975 ipsq_current_finish(ipsq); 19976 } 19977 19978 /* 19979 * Helper routine for setting the ill_nd_lla fields. 19980 */ 19981 void 19982 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 19983 { 19984 freemsg(ill->ill_nd_lla_mp); 19985 ill->ill_nd_lla = ndmp->b_rptr + addroff; 19986 ill->ill_nd_lla_mp = ndmp; 19987 ill->ill_nd_lla_len = addrlen; 19988 } 19989 19990 /* 19991 * Replumb the ill. 19992 */ 19993 int 19994 ill_replumb(ill_t *ill, mblk_t *mp) 19995 { 19996 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19997 19998 ASSERT(IAM_WRITER_IPSQ(ipsq)); 19999 20000 ipsq_current_start(ipsq, ill->ill_ipif, 0); 20001 20002 /* 20003 * If we can quiesce the ill, then continue. If not, then 20004 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
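 * (This is the same quiesce-or-defer pattern as ill_set_phys_addr()
 * above: if the ill is not yet quiescent, the mblk is parked via
 * ipsq_pending_mp_add() and EINPROGRESS tells the caller that
 * ill_replumb_tail() will run asynchronously once the last refs drop.)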
20005 */ 20006 ill_down_ipifs(ill, B_FALSE); 20007 20008 mutex_enter(&ill->ill_lock); 20009 if (!ill_is_quiescent(ill)) { 20010 /* call cannot fail since `conn_t *' argument is NULL */ 20011 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 20012 mp, ILL_DOWN); 20013 mutex_exit(&ill->ill_lock); 20014 return (EINPROGRESS); 20015 } 20016 mutex_exit(&ill->ill_lock); 20017 20018 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 20019 return (0); 20020 } 20021 20022 /* ARGSUSED */ 20023 static void 20024 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 20025 { 20026 ill_t *ill = q->q_ptr; 20027 20028 ASSERT(IAM_WRITER_IPSQ(ipsq)); 20029 20030 ill_down_ipifs_tail(ill); 20031 20032 freemsg(ill->ill_replumb_mp); 20033 ill->ill_replumb_mp = copyb(mp); 20034 20035 /* 20036 * Successfully quiesced and brought down the interface, now we send 20037 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 20038 * DL_NOTE_REPLUMB message. 20039 */ 20040 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 20041 DL_NOTIFY_CONF); 20042 ASSERT(mp != NULL); 20043 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 20044 DL_NOTE_REPLUMB_DONE; 20045 ill_dlpi_send(ill, mp); 20046 20047 /* 20048 * If there are ipifs to bring up, ill_up_ipifs() will return 20049 * EINPROGRESS, and ipsq_current_finish() will be called by 20050 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is 20051 * brought up. 20052 */ 20053 if (ill->ill_replumb_mp == NULL || 20054 ill_up_ipifs(ill, q, ill->ill_replumb_mp) != EINPROGRESS) { 20055 ipsq_current_finish(ipsq); 20056 } 20057 } 20058 20059 /* 20060 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 20061 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 20062 * as per the ioctl. On failure, an errno is returned. 20063 */ 20064 static int 20065 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 20066 { 20067 int rval; 20068 struct strioctl iocb; 20069 20070 iocb.ic_cmd = cmd; 20071 iocb.ic_timout = 15; 20072 iocb.ic_len = bufsize; 20073 iocb.ic_dp = buf; 20074 20075 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 20076 } 20077 20078 /* 20079 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 20080 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 20081 */ 20082 static int 20083 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 20084 uint_t *bufsizep, cred_t *cr) 20085 { 20086 int err; 20087 struct lifnum lifn; 20088 20089 bzero(&lifn, sizeof (lifn)); 20090 lifn.lifn_family = af; 20091 lifn.lifn_flags = LIFC_UNDER_IPMP; 20092 20093 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 20094 return (err); 20095 20096 /* 20097 * Pad the interface count to account for additional interfaces that 20098 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 20099 */ 20100 lifn.lifn_count += 4; 20101 bzero(lifcp, sizeof (*lifcp)); 20102 lifcp->lifc_flags = LIFC_UNDER_IPMP; 20103 lifcp->lifc_family = af; 20104 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 20105 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 20106 20107 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 20108 if (err != 0) { 20109 kmem_free(lifcp->lifc_buf, *bufsizep); 20110 return (err); 20111 } 20112 20113 return (0); 20114 } 20115 20116 /* 20117 * Helper for ip_interface_cleanup() that removes the loopback interface. 
/*
 * Issue an SIOCGLIFCONF for address family `af' and store the result into
 * `lifcp'; on success its lifc_buf is dynamically allocated and `bufsizep'
 * bytes.
 */
static int
ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
    uint_t *bufsizep, cred_t *cr)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (lifn));
	lifn.lifn_family = af;
	lifn.lifn_flags = LIFC_UNDER_IPMP;

	if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;
	bzero(lifcp, sizeof (*lifcp));
	lifcp->lifc_flags = LIFC_UNDER_IPMP;
	lifcp->lifc_family = af;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}

	return (0);
}

/*
 * Helper for ip_interface_cleanup() that removes the loopback interface.
 */
static void
ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
{
	int err;
	struct lifreq lifr;

	bzero(&lifr, sizeof (lifr));
	(void) strcpy(lifr.lifr_name, ipif_loopback_name);

	err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
	if (err != 0) {
		ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
		    "error %d\n", isv6 ? "v6" : "v4", err));
	}
}

/*
 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
 * groups and that IPMP data addresses are down.  These conditions must be met
 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
 */
static void
ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
{
	int af = isv6 ? AF_INET6 : AF_INET;
	int i, nifs;
	int err;
	uint_t bufsize;
	uint_t lifrsize = sizeof (struct lifreq);
	struct lifconf lifc;
	struct lifreq *lifrp;

	if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
		cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
		    "(error %d); any IPMP interfaces cannot be shut down",
		    err);
		return;
	}

	nifs = lifc.lifc_len / lifrsize;
	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
			    "flags: error %d", lifrp->lifr_name, err);
			continue;
		}

		if (lifrp->lifr_flags & IFF_IPMP) {
			if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
				continue;

			lifrp->lifr_flags &= ~IFF_UP;
			err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
			if (err != 0) {
				cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
				    "bring down (error %d); IPMP interface "
				    "may not be shut down", lifrp->lifr_name,
				    err);
			}

			/*
			 * Check if IFF_DUPLICATE is still set -- and if so,
			 * reset the address to clear it.
			 */
			err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
			if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
				continue;

			err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
			if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
			    lifrp, lifrsize, cr)) != 0) {
				cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
				    "reset DAD (error %d); IPMP interface may "
				    "not be shut down", lifrp->lifr_name, err);
			}
			continue;
		}

		lifrp->lifr_groupname[0] = '\0';
		err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, lifrsize, cr);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot leave "
			    "IPMP group (error %d); associated IPMP interface "
			    "may not be shut down", lifrp->lifr_name, err);
			continue;
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
}
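/*
 * Note on the IFF_DUPLICATE handling above: reading the current address with
 * SIOCGLIFADDR and writing it back with SIOCSLIFADDR restarts duplicate
 * address detection, which clears IFF_DUPLICATE if the conflict has gone
 * away.  As a worked example of the sizing in ip_lifconf_ioctl(): if
 * SIOCGLIFNUM reports 3 interfaces, the buffer is sized for 3 + 4 = 7
 * lifreqs; SIOCGLIFCONF then rewrites lifc_len to the bytes actually filled
 * in, so `nifs' above reflects the interfaces actually returned.
 */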
#define	UDPDEV		"/devices/pseudo/udp@0:udp"
#define	UDP6DEV		"/devices/pseudo/udp6@0:udp6"

/*
 * Remove the loopback interfaces and prep the IPMP interfaces to be torn
 * down.  Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former
 * go away when the user-level processes in the zone are killed and the latter
 * are cleaned up by str_stack_shutdown().
 */
void
ip_interface_cleanup(ip_stack_t *ipst)
{
	ldi_handle_t lh;
	ldi_ident_t li;
	cred_t *cr;
	int err;
	int i;
	char *devs[] = { UDP6DEV, UDPDEV };
	netstackid_t stackid = ipst->ips_netstack->netstack_stackid;

	if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
		cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
		    " error %d", err);
		return;
	}

	cr = zone_get_kcred(netstackid_to_zoneid(stackid));
	ASSERT(cr != NULL);

	/*
	 * NOTE: loop executes exactly twice and is hardcoded to know that the
	 * first iteration is IPv6.  (Unrolling yields repetitious code, hence
	 * the loop.)
	 */
	for (i = 0; i < 2; i++) {
		err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
		if (err != 0) {
			cmn_err(CE_WARN, "ip_interface_cleanup: cannot open "
			    "%s: error %d", devs[i], err);
			continue;
		}

		ip_loopback_removeif(lh, i == 0, cr);
		ip_ipmp_cleanup(lh, i == 0, cr);

		(void) ldi_close(lh, FREAD|FWRITE, cr);
	}

	ldi_ident_release(li);
	crfree(cr);
}

/*
 * This needs to be kept in sync with the nic_event_t definition.
 */
static const char *
ill_hook_event2str(nic_event_t event)
{
	switch (event) {
	case NE_PLUMB:
		return ("PLUMB");
	case NE_UNPLUMB:
		return ("UNPLUMB");
	case NE_UP:
		return ("UP");
	case NE_DOWN:
		return ("DOWN");
	case NE_ADDRESS_CHANGE:
		return ("ADDRESS_CHANGE");
	case NE_LIF_UP:
		return ("LIF_UP");
	case NE_LIF_DOWN:
		return ("LIF_DOWN");
	case NE_IFINDEX_CHANGE:
		return ("IFINDEX_CHANGE");
	default:
		return ("UNKNOWN");
	}
}
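/*
 * Illustrative call to ill_nic_event_dispatch() (a sketch; actual call sites
 * vary): announcing that a physical interface has been plumbed:
 *
 *	ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
 *	    ill->ill_name_length);
 *
 * The event payload is copied, so callers need not preserve `data' after the
 * call; delivery happens asynchronously via the eventq_queue_nic taskq.
 */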
void
ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
    nic_event_data_t data, size_t datalen)
{
	ip_stack_t *ipst = ill->ill_ipst;
	hook_nic_event_int_t *info;
	const char *str = NULL;

	/* create a new nic event info */
	if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
		goto fail;

	info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
	info->hnei_event.hne_lif = lif;
	info->hnei_event.hne_event = event;
	info->hnei_event.hne_protocol = ill->ill_isv6 ?
	    ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
	info->hnei_event.hne_data = NULL;
	info->hnei_event.hne_datalen = 0;
	info->hnei_stackid = ipst->ips_netstack->netstack_stackid;

	if (data != NULL && datalen != 0) {
		info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
		if (info->hnei_event.hne_data == NULL)
			goto fail;
		bcopy(data, info->hnei_event.hne_data, datalen);
		info->hnei_event.hne_datalen = datalen;
	}

	if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
	    DDI_NOSLEEP) == DDI_SUCCESS)
		return;

fail:
	if (info != NULL) {
		if (info->hnei_event.hne_data != NULL) {
			kmem_free(info->hnei_event.hne_data,
			    info->hnei_event.hne_datalen);
		}
		kmem_free(info, sizeof (*info));
	}
	str = ill_hook_event2str(event);
	ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
	    "information for %s (ENOMEM)\n", str, ill->ill_name));
}

void
ipif_up_notify(ipif_t *ipif)
{
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
	sctp_update_ipif(ipif, SCTP_IPIF_UP);
	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
	    NE_LIF_UP, NULL, 0);
}
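/*
 * ipif_up_notify() is invoked once an ipif has finished coming up (e.g. at
 * the end of duplicate address detection), and fires the notifications in a
 * fixed order: routing socket interface and address messages first, then the
 * SCTP ipif table update, then the NE_LIF_UP hook event.
 */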