1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 /* 28 * This file contains the interface control functions for IP. 
29 */ 30 31 #include <sys/types.h> 32 #include <sys/stream.h> 33 #include <sys/dlpi.h> 34 #include <sys/stropts.h> 35 #include <sys/strsun.h> 36 #include <sys/sysmacros.h> 37 #include <sys/strsubr.h> 38 #include <sys/strlog.h> 39 #include <sys/ddi.h> 40 #include <sys/sunddi.h> 41 #include <sys/cmn_err.h> 42 #include <sys/kstat.h> 43 #include <sys/debug.h> 44 #include <sys/zone.h> 45 #include <sys/sunldi.h> 46 #include <sys/file.h> 47 #include <sys/bitmap.h> 48 #include <sys/cpuvar.h> 49 #include <sys/time.h> 50 #include <sys/ctype.h> 51 #include <sys/kmem.h> 52 #include <sys/systm.h> 53 #include <sys/param.h> 54 #include <sys/socket.h> 55 #include <sys/isa_defs.h> 56 #include <net/if.h> 57 #include <net/if_arp.h> 58 #include <net/if_types.h> 59 #include <net/if_dl.h> 60 #include <net/route.h> 61 #include <sys/sockio.h> 62 #include <netinet/in.h> 63 #include <netinet/ip6.h> 64 #include <netinet/icmp6.h> 65 #include <netinet/igmp_var.h> 66 #include <sys/policy.h> 67 #include <sys/ethernet.h> 68 #include <sys/callb.h> 69 #include <sys/md5.h> 70 71 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ 72 #include <inet/mi.h> 73 #include <inet/nd.h> 74 #include <inet/arp.h> 75 #include <inet/mib2.h> 76 #include <inet/ip.h> 77 #include <inet/ip6.h> 78 #include <inet/ip6_asp.h> 79 #include <inet/tcp.h> 80 #include <inet/ip_multi.h> 81 #include <inet/ip_ire.h> 82 #include <inet/ip_ftable.h> 83 #include <inet/ip_rts.h> 84 #include <inet/ip_ndp.h> 85 #include <inet/ip_if.h> 86 #include <inet/ip_impl.h> 87 #include <inet/tun.h> 88 #include <inet/sctp_ip.h> 89 #include <inet/ip_netinfo.h> 90 91 #include <net/pfkeyv2.h> 92 #include <inet/ipsec_info.h> 93 #include <inet/sadb.h> 94 #include <inet/ipsec_impl.h> 95 #include <sys/iphada.h> 96 97 #include <netinet/igmp.h> 98 #include <inet/ip_listutils.h> 99 #include <inet/ipclassifier.h> 100 #include <sys/mac_client.h> 101 #include <sys/dld.h> 102 103 #include <sys/systeminfo.h> 104 #include <sys/bootconf.h> 105 
106 #include <sys/tsol/tndb.h> 107 #include <sys/tsol/tnet.h> 108 109 /* The character which tells where the ill_name ends */ 110 #define IPIF_SEPARATOR_CHAR ':' 111 112 /* IP ioctl function table entry */ 113 typedef struct ipft_s { 114 int ipft_cmd; 115 pfi_t ipft_pfi; 116 int ipft_min_size; 117 int ipft_flags; 118 } ipft_t; 119 #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ 120 #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ 121 122 typedef struct ip_sock_ar_s { 123 union { 124 area_t ip_sock_area; 125 ared_t ip_sock_ared; 126 areq_t ip_sock_areq; 127 } ip_sock_ar_u; 128 queue_t *ip_sock_ar_q; 129 } ip_sock_ar_t; 130 131 static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); 132 static int nd_ill_forward_set(queue_t *q, mblk_t *mp, 133 char *value, caddr_t cp, cred_t *ioc_cr); 134 135 static boolean_t ill_is_quiescent(ill_t *); 136 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask); 137 static ip_m_t *ip_m_lookup(t_uscalar_t mac_type); 138 static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 139 mblk_t *mp, boolean_t need_up); 140 static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 141 mblk_t *mp, boolean_t need_up); 142 static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 143 queue_t *q, mblk_t *mp, boolean_t need_up); 144 static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, 145 mblk_t *mp); 146 static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, 147 mblk_t *mp); 148 static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, 149 queue_t *q, mblk_t *mp, boolean_t need_up); 150 static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, 151 int ioccmd, struct linkblk *li, boolean_t doconsist); 152 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); 153 static void ip_wput_ioctl(queue_t *q, mblk_t *mp); 154 static void ipsq_flush(ill_t *ill); 155 156 static 
int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, 157 queue_t *q, mblk_t *mp, boolean_t need_up); 158 static void ipsq_delete(ipsq_t *); 159 160 static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, 161 boolean_t initialize, boolean_t insert); 162 static void ipif_check_bcast_ires(ipif_t *test_ipif); 163 static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); 164 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, 165 boolean_t isv6); 166 static void ipif_down_delete_ire(ire_t *ire, char *ipif); 167 static void ipif_delete_cache_ire(ire_t *, char *); 168 static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); 169 static void ipif_free(ipif_t *ipif); 170 static void ipif_free_tail(ipif_t *ipif); 171 static void ipif_mtu_change(ire_t *ire, char *ipif_arg); 172 static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); 173 static void ipif_set_default(ipif_t *ipif); 174 static int ipif_set_values(queue_t *q, mblk_t *mp, 175 char *interf_name, uint_t *ppa); 176 static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, 177 queue_t *q); 178 static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, 179 boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid, 180 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *); 181 static void ipif_update_other_ipifs(ipif_t *old_ipif); 182 183 static int ill_alloc_ppa(ill_if_t *, ill_t *); 184 static int ill_arp_off(ill_t *ill); 185 static int ill_arp_on(ill_t *ill); 186 static void ill_delete_interface_type(ill_if_t *); 187 static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); 188 static void ill_dl_down(ill_t *ill); 189 static void ill_down(ill_t *ill); 190 static void ill_downi(ire_t *ire, char *ill_arg); 191 static void ill_free_mib(ill_t *ill); 192 static void ill_glist_delete(ill_t *); 193 static void ill_phyint_reinit(ill_t *ill); 194 static void ill_set_nce_router_flags(ill_t *, 
boolean_t); 195 static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *); 196 static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *); 197 198 static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; 199 static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; 200 static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; 201 static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; 202 static void ipif_save_ire(ipif_t *, ire_t *); 203 static void ipif_remove_ire(ipif_t *, ire_t *); 204 static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *); 205 static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); 206 static void phyint_free(phyint_t *); 207 208 /* 209 * Per-ill IPsec capabilities management. 210 */ 211 static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); 212 static void ill_ipsec_capab_free(ill_ipsec_capab_t *); 213 static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); 214 static void ill_ipsec_capab_delete(ill_t *, uint_t); 215 static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); 216 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, 217 boolean_t); 218 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 219 static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 220 static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *); 221 static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 222 static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *); 223 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); 224 static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); 225 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, 226 dl_capability_sub_t *); 227 static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); 228 static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *, 229 int *); 230 static void 
ill_capability_dld_reset_fill(ill_t *, mblk_t *); 231 static void ill_capability_dld_ack(ill_t *, mblk_t *, 232 dl_capability_sub_t *); 233 static void ill_capability_dld_enable(ill_t *); 234 static void ill_capability_ack_thr(void *); 235 static void ill_capability_lso_enable(ill_t *); 236 static void ill_capability_send(ill_t *, mblk_t *); 237 238 static ill_t *ill_prev_usesrc(ill_t *); 239 static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); 240 static void ill_disband_usesrc_group(ill_t *); 241 static void conn_cleanup_stale_ire(conn_t *, caddr_t); 242 243 #ifdef DEBUG 244 static void ill_trace_cleanup(const ill_t *); 245 static void ipif_trace_cleanup(const ipif_t *); 246 #endif 247 248 /* 249 * if we go over the memory footprint limit more than once in this msec 250 * interval, we'll start pruning aggressively. 251 */ 252 int ip_min_frag_prune_time = 0; 253 254 /* 255 * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY 256 * and the IPsec DOI 257 */ 258 #define MAX_IPSEC_ALGS 256 259 260 #define BITSPERBYTE 8 261 #define BITS(type) (BITSPERBYTE * (long)sizeof (type)) 262 263 #define IPSEC_ALG_ENABLE(algs, algid) \ 264 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ 265 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 266 267 #define IPSEC_ALG_IS_ENABLED(algid, algs) \ 268 ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ 269 (1 << ((algid) % BITS(ipsec_capab_elem_t)))) 270 271 typedef uint8_t ipsec_capab_elem_t; 272 273 /* 274 * Per-algorithm parameters. Note that at present, only encryption 275 * algorithms have variable keysize (IKE does not provide a way to negotiate 276 * auth algorithm keysize). 277 * 278 * All sizes here are in bits. 279 */ 280 typedef struct 281 { 282 uint16_t minkeylen; 283 uint16_t maxkeylen; 284 } ipsec_capab_algparm_t; 285 286 /* 287 * Per-ill capabilities. 
288 */ 289 struct ill_ipsec_capab_s { 290 ipsec_capab_elem_t *encr_hw_algs; 291 ipsec_capab_elem_t *auth_hw_algs; 292 uint32_t algs_size; /* size of _hw_algs in bytes */ 293 /* algorithm key lengths */ 294 ipsec_capab_algparm_t *encr_algparm; 295 uint32_t encr_algparm_size; 296 uint32_t encr_algparm_end; 297 }; 298 299 /* 300 * The field values are larger than strictly necessary for simple 301 * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls. 302 */ 303 static area_t ip_area_template = { 304 AR_ENTRY_ADD, /* area_cmd */ 305 sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), 306 /* area_name_offset */ 307 /* area_name_length temporarily holds this structure length */ 308 sizeof (area_t), /* area_name_length */ 309 IP_ARP_PROTO_TYPE, /* area_proto */ 310 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 311 IP_ADDR_LEN, /* area_proto_addr_length */ 312 sizeof (ip_sock_ar_t) + IP_ADDR_LEN, 313 /* area_proto_mask_offset */ 314 0, /* area_flags */ 315 sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, 316 /* area_hw_addr_offset */ 317 /* Zero length hw_addr_length means 'use your idea of the address' */ 318 0 /* area_hw_addr_length */ 319 }; 320 321 /* 322 * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver 323 * support 324 */ 325 static area_t ip6_area_template = { 326 AR_ENTRY_ADD, /* area_cmd */ 327 sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), 328 /* area_name_offset */ 329 /* area_name_length temporarily holds this structure length */ 330 sizeof (area_t), /* area_name_length */ 331 IP_ARP_PROTO_TYPE, /* area_proto */ 332 sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ 333 IPV6_ADDR_LEN, /* area_proto_addr_length */ 334 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, 335 /* area_proto_mask_offset */ 336 0, /* area_flags */ 337 sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, 338 /* area_hw_addr_offset */ 339 /* Zero length hw_addr_length means 'use your idea of the address' */ 340 0 /* 
area_hw_addr_length */ 341 }; 342 343 static ared_t ip_ared_template = { 344 AR_ENTRY_DELETE, 345 sizeof (ared_t) + IP_ADDR_LEN, 346 sizeof (ared_t), 347 IP_ARP_PROTO_TYPE, 348 sizeof (ared_t), 349 IP_ADDR_LEN, 350 0 351 }; 352 353 static ared_t ip6_ared_template = { 354 AR_ENTRY_DELETE, 355 sizeof (ared_t) + IPV6_ADDR_LEN, 356 sizeof (ared_t), 357 IP_ARP_PROTO_TYPE, 358 sizeof (ared_t), 359 IPV6_ADDR_LEN, 360 0 361 }; 362 363 /* 364 * A template for an IPv6 AR_ENTRY_QUERY template has not been created, as 365 * as the areq doesn't include an IP address in ill_dl_up() (the only place a 366 * areq is used). 367 */ 368 static areq_t ip_areq_template = { 369 AR_ENTRY_QUERY, /* cmd */ 370 sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ 371 sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ 372 IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ 373 sizeof (areq_t), /* target addr offset */ 374 IP_ADDR_LEN, /* target addr_length */ 375 0, /* flags */ 376 sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ 377 IP_ADDR_LEN, /* sender addr length */ 378 AR_EQ_DEFAULT_XMIT_COUNT, /* xmit_count */ 379 AR_EQ_DEFAULT_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */ 380 AR_EQ_DEFAULT_MAX_BUFFERED /* max # of requests to buffer */ 381 /* anything else filled in by the code */ 382 }; 383 384 static arc_t ip_aru_template = { 385 AR_INTERFACE_UP, 386 sizeof (arc_t), /* Name offset */ 387 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 388 }; 389 390 static arc_t ip_ard_template = { 391 AR_INTERFACE_DOWN, 392 sizeof (arc_t), /* Name offset */ 393 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 394 }; 395 396 static arc_t ip_aron_template = { 397 AR_INTERFACE_ON, 398 sizeof (arc_t), /* Name offset */ 399 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 400 }; 401 402 static arc_t ip_aroff_template = { 403 AR_INTERFACE_OFF, 404 sizeof (arc_t), /* Name offset */ 405 sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ 406 }; 407 408 
static arma_t ip_arma_multi_template = { 409 AR_MAPPING_ADD, 410 sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, 411 /* Name offset */ 412 sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ 413 IP_ARP_PROTO_TYPE, 414 sizeof (arma_t), /* proto_addr_offset */ 415 IP_ADDR_LEN, /* proto_addr_length */ 416 sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ 417 sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ 418 ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */ 419 sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ 420 IP_MAX_HW_LEN, /* hw_addr_length */ 421 0, /* hw_mapping_start */ 422 }; 423 424 static ipft_t ip_ioctl_ftbl[] = { 425 { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, 426 { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), 427 IPFT_F_NO_REPLY }, 428 { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), 429 IPFT_F_NO_REPLY }, 430 { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, 431 { 0 } 432 }; 433 434 /* Simple ICMP IP Header Template */ 435 static ipha_t icmp_ipha = { 436 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP 437 }; 438 439 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; 440 441 static ip_m_t ip_m_tbl[] = { 442 { DL_ETHER, IFT_ETHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 443 ip_ether_v6intfid }, 444 { DL_CSMACD, IFT_ISO88023, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 445 ip_nodef_v6intfid }, 446 { DL_TPB, IFT_ISO88024, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 447 ip_nodef_v6intfid }, 448 { DL_TPR, IFT_ISO88025, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 449 ip_nodef_v6intfid }, 450 { DL_FDDI, IFT_FDDI, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 451 ip_ether_v6intfid }, 452 { DL_IB, IFT_IB, ip_ib_v4mapinfo, ip_ib_v6mapinfo, 453 ip_ib_v6intfid }, 454 { SUNW_DL_VNI, IFT_OTHER, NULL, NULL, NULL }, 455 { SUNW_DL_IPMP, IFT_OTHER, NULL, NULL, ip_ipmp_v6intfid }, 456 { DL_OTHER, IFT_OTHER, ip_ether_v4mapinfo, ip_ether_v6mapinfo, 457 ip_nodef_v6intfid } 458 }; 
459 460 static ill_t ill_null; /* Empty ILL for init. */ 461 char ipif_loopback_name[] = "lo0"; 462 static char *ipv4_forward_suffix = ":ip_forwarding"; 463 static char *ipv6_forward_suffix = ":ip6_forwarding"; 464 static sin6_t sin6_null; /* Zero address for quick clears */ 465 static sin_t sin_null; /* Zero address for quick clears */ 466 467 /* When set search for unused ipif_seqid */ 468 static ipif_t ipif_zero; 469 470 /* 471 * ppa arena is created after these many 472 * interfaces have been plumbed. 473 */ 474 uint_t ill_no_arena = 12; /* Setable in /etc/system */ 475 476 /* 477 * Allocate per-interface mibs. 478 * Returns true if ok. False otherwise. 479 * ipsq may not yet be allocated (loopback case ). 480 */ 481 static boolean_t 482 ill_allocate_mibs(ill_t *ill) 483 { 484 /* Already allocated? */ 485 if (ill->ill_ip_mib != NULL) { 486 if (ill->ill_isv6) 487 ASSERT(ill->ill_icmp6_mib != NULL); 488 return (B_TRUE); 489 } 490 491 ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib), 492 KM_NOSLEEP); 493 if (ill->ill_ip_mib == NULL) { 494 return (B_FALSE); 495 } 496 497 /* Setup static information */ 498 SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize, 499 sizeof (mib2_ipIfStatsEntry_t)); 500 if (ill->ill_isv6) { 501 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6; 502 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 503 sizeof (mib2_ipv6AddrEntry_t)); 504 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 505 sizeof (mib2_ipv6RouteEntry_t)); 506 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 507 sizeof (mib2_ipv6NetToMediaEntry_t)); 508 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 509 sizeof (ipv6_member_t)); 510 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 511 sizeof (ipv6_grpsrc_t)); 512 } else { 513 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4; 514 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize, 515 sizeof (mib2_ipAddrEntry_t)); 516 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize, 517 sizeof 
(mib2_ipRouteEntry_t)); 518 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize, 519 sizeof (mib2_ipNetToMediaEntry_t)); 520 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize, 521 sizeof (ip_member_t)); 522 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize, 523 sizeof (ip_grpsrc_t)); 524 525 /* 526 * For a v4 ill, we are done at this point, because per ill 527 * icmp mibs are only used for v6. 528 */ 529 return (B_TRUE); 530 } 531 532 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib), 533 KM_NOSLEEP); 534 if (ill->ill_icmp6_mib == NULL) { 535 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 536 ill->ill_ip_mib = NULL; 537 return (B_FALSE); 538 } 539 /* static icmp info */ 540 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize = 541 sizeof (mib2_ipv6IfIcmpEntry_t); 542 /* 543 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later 544 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert 545 * -> ill_phyint_reinit 546 */ 547 return (B_TRUE); 548 } 549 550 /* 551 * Common code for preparation of ARP commands. Two points to remember: 552 * 1) The ill_name is tacked on at the end of the allocated space so 553 * the templates name_offset field must contain the total space 554 * to allocate less the name length. 555 * 556 * 2) The templates name_length field should contain the *template* 557 * length. We use it as a parameter to bcopy() and then write 558 * the real ill_name_length into the name_length field of the copy. 559 * (Always called as writer.) 
560 */ 561 mblk_t * 562 ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) 563 { 564 arc_t *arc = (arc_t *)template; 565 char *cp; 566 int len; 567 mblk_t *mp; 568 uint_t name_length = ill->ill_name_length; 569 uint_t template_len = arc->arc_name_length; 570 571 len = arc->arc_name_offset + name_length; 572 mp = allocb(len, BPRI_HI); 573 if (mp == NULL) 574 return (NULL); 575 cp = (char *)mp->b_rptr; 576 mp->b_wptr = (uchar_t *)&cp[len]; 577 if (template_len) 578 bcopy(template, cp, template_len); 579 if (len > template_len) 580 bzero(&cp[template_len], len - template_len); 581 mp->b_datap->db_type = M_PROTO; 582 583 arc = (arc_t *)cp; 584 arc->arc_name_length = name_length; 585 cp = (char *)arc + arc->arc_name_offset; 586 bcopy(ill->ill_name, cp, name_length); 587 588 if (addr) { 589 area_t *area = (area_t *)mp->b_rptr; 590 591 cp = (char *)area + area->area_proto_addr_offset; 592 bcopy(addr, cp, area->area_proto_addr_length); 593 if (area->area_cmd == AR_ENTRY_ADD) { 594 cp = (char *)area; 595 len = area->area_proto_addr_length; 596 if (area->area_proto_mask_offset) 597 cp += area->area_proto_mask_offset; 598 else 599 cp += area->area_proto_addr_offset + len; 600 while (len-- > 0) 601 *cp++ = (char)~0; 602 } 603 } 604 return (mp); 605 } 606 607 mblk_t * 608 ipif_area_alloc(ipif_t *ipif, uint_t optflags) 609 { 610 caddr_t addr; 611 mblk_t *mp; 612 area_t *area; 613 uchar_t *areap; 614 ill_t *ill = ipif->ipif_ill; 615 616 if (ill->ill_isv6) { 617 ASSERT(ill->ill_flags & ILLF_XRESOLV); 618 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 619 areap = (uchar_t *)&ip6_area_template; 620 } else { 621 addr = (caddr_t)&ipif->ipif_lcl_addr; 622 areap = (uchar_t *)&ip_area_template; 623 } 624 625 if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) 626 return (NULL); 627 628 /* 629 * IPMP requires that the hardware address be included in all 630 * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. 
631 * If there are no active underlying ills in the group (and thus no 632 * hardware address, DAD will be deferred until an underlying ill 633 * becomes active. 634 */ 635 if (IS_IPMP(ill)) { 636 if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { 637 freemsg(mp); 638 return (NULL); 639 } 640 } else { 641 ill_refhold(ill); 642 } 643 644 area = (area_t *)mp->b_rptr; 645 area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; 646 area->area_flags |= optflags; 647 area->area_hw_addr_length = ill->ill_phys_addr_length; 648 bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, 649 area->area_hw_addr_length); 650 651 ill_refrele(ill); 652 return (mp); 653 } 654 655 mblk_t * 656 ipif_ared_alloc(ipif_t *ipif) 657 { 658 caddr_t addr; 659 uchar_t *aredp; 660 661 if (ipif->ipif_ill->ill_isv6) { 662 ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); 663 addr = (caddr_t)&ipif->ipif_v6lcl_addr; 664 aredp = (uchar_t *)&ip6_ared_template; 665 } else { 666 addr = (caddr_t)&ipif->ipif_lcl_addr; 667 aredp = (uchar_t *)&ip_ared_template; 668 } 669 670 return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); 671 } 672 673 mblk_t * 674 ill_ared_alloc(ill_t *ill, ipaddr_t addr) 675 { 676 return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, 677 (char *)&addr)); 678 } 679 680 mblk_t * 681 ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) 682 { 683 mblk_t *mp = ill_arp_alloc(ill, template, 0); 684 arie_t *arie; 685 686 if (mp != NULL) { 687 arie = (arie_t *)mp->b_rptr; 688 (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); 689 } 690 return (mp); 691 } 692 693 /* 694 * Completely vaporize a lower level tap and all associated interfaces. 695 * ill_delete is called only out of ip_close when the device control 696 * stream is being closed. 697 */ 698 void 699 ill_delete(ill_t *ill) 700 { 701 ipif_t *ipif; 702 ill_t *prev_ill; 703 ip_stack_t *ipst = ill->ill_ipst; 704 705 /* 706 * ill_delete may be forcibly entering the ipsq. 
The previous 707 * ioctl may not have completed and may need to be aborted. 708 * ipsq_flush takes care of it. If we don't need to enter the 709 * the ipsq forcibly, the 2nd invocation of ipsq_flush in 710 * ill_delete_tail is sufficient. 711 */ 712 ipsq_flush(ill); 713 714 /* 715 * Nuke all interfaces. ipif_free will take down the interface, 716 * remove it from the list, and free the data structure. 717 * Walk down the ipif list and remove the logical interfaces 718 * first before removing the main ipif. We can't unplumb 719 * zeroth interface first in the case of IPv6 as reset_conn_ill 720 * -> ip_ll_delmulti_v6 de-references ill_ipif for checking 721 * POINTOPOINT. 722 * 723 * If ill_ipif was not properly initialized (i.e low on memory), 724 * then no interfaces to clean up. In this case just clean up the 725 * ill. 726 */ 727 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 728 ipif_free(ipif); 729 730 /* 731 * Used only by ill_arp_on and ill_arp_off, which are writers. 732 * So nobody can be using this mp now. Free the mp allocated for 733 * honoring ILLF_NOARP 734 */ 735 freemsg(ill->ill_arp_on_mp); 736 ill->ill_arp_on_mp = NULL; 737 738 /* Clean up msgs on pending upcalls for mrouted */ 739 reset_mrt_ill(ill); 740 741 /* 742 * ipif_free -> reset_conn_ipif will remove all multicast 743 * references for IPv4. For IPv6, we need to do it here as 744 * it points only at ills. 745 */ 746 reset_conn_ill(ill); 747 748 /* 749 * Remove multicast references added as a result of calls to 750 * ip_join_allmulti(). 751 */ 752 ip_purge_allmulti(ill); 753 754 /* 755 * If the ill being deleted is under IPMP, boot it out of the illgrp. 756 */ 757 if (IS_UNDER_IPMP(ill)) 758 ipmp_ill_leave_illgrp(ill); 759 760 /* 761 * ill_down will arrange to blow off any IRE's dependent on this 762 * ILL, and shut down fragmentation reassembly. 763 */ 764 ill_down(ill); 765 766 /* Let SCTP know, so that it can remove this from its list. 
*/ 767 sctp_update_ill(ill, SCTP_ILL_REMOVE); 768 769 /* 770 * If an address on this ILL is being used as a source address then 771 * clear out the pointers in other ILLs that point to this ILL. 772 */ 773 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 774 if (ill->ill_usesrc_grp_next != NULL) { 775 if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */ 776 ill_disband_usesrc_group(ill); 777 } else { /* consumer of the usesrc ILL */ 778 prev_ill = ill_prev_usesrc(ill); 779 prev_ill->ill_usesrc_grp_next = 780 ill->ill_usesrc_grp_next; 781 } 782 } 783 rw_exit(&ipst->ips_ill_g_usesrc_lock); 784 } 785 786 static void 787 ipif_non_duplicate(ipif_t *ipif) 788 { 789 ill_t *ill = ipif->ipif_ill; 790 mutex_enter(&ill->ill_lock); 791 if (ipif->ipif_flags & IPIF_DUPLICATE) { 792 ipif->ipif_flags &= ~IPIF_DUPLICATE; 793 ASSERT(ill->ill_ipif_dup_count > 0); 794 ill->ill_ipif_dup_count--; 795 } 796 mutex_exit(&ill->ill_lock); 797 } 798 799 /* 800 * ill_delete_tail is called from ip_modclose after all references 801 * to the closing ill are gone. The wait is done in ip_modclose 802 */ 803 void 804 ill_delete_tail(ill_t *ill) 805 { 806 mblk_t **mpp; 807 ipif_t *ipif; 808 ip_stack_t *ipst = ill->ill_ipst; 809 810 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 811 ipif_non_duplicate(ipif); 812 ipif_down_tail(ipif); 813 } 814 815 ASSERT(ill->ill_ipif_dup_count == 0 && 816 ill->ill_arp_down_mp == NULL && 817 ill->ill_arp_del_mapping_mp == NULL); 818 819 /* 820 * If polling capability is enabled (which signifies direct 821 * upcall into IP and driver has ill saved as a handle), 822 * we need to make sure that unbind has completed before we 823 * let the ill disappear and driver no longer has any reference 824 * to this ill. 
825 */ 826 mutex_enter(&ill->ill_lock); 827 while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) 828 cv_wait(&ill->ill_cv, &ill->ill_lock); 829 mutex_exit(&ill->ill_lock); 830 ASSERT(!(ill->ill_capabilities & 831 (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); 832 833 if (ill->ill_net_type != IRE_LOOPBACK) 834 qprocsoff(ill->ill_rq); 835 836 /* 837 * We do an ipsq_flush once again now. New messages could have 838 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls 839 * could also have landed up if an ioctl thread had looked up 840 * the ill before we set the ILL_CONDEMNED flag, but not yet 841 * enqueued the ioctl when we did the ipsq_flush last time. 842 */ 843 ipsq_flush(ill); 844 845 /* 846 * Free capabilities. 847 */ 848 if (ill->ill_ipsec_capab_ah != NULL) { 849 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); 850 ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); 851 ill->ill_ipsec_capab_ah = NULL; 852 } 853 854 if (ill->ill_ipsec_capab_esp != NULL) { 855 ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); 856 ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); 857 ill->ill_ipsec_capab_esp = NULL; 858 } 859 860 if (ill->ill_mdt_capab != NULL) { 861 kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); 862 ill->ill_mdt_capab = NULL; 863 } 864 865 if (ill->ill_hcksum_capab != NULL) { 866 kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); 867 ill->ill_hcksum_capab = NULL; 868 } 869 870 if (ill->ill_zerocopy_capab != NULL) { 871 kmem_free(ill->ill_zerocopy_capab, 872 sizeof (ill_zerocopy_capab_t)); 873 ill->ill_zerocopy_capab = NULL; 874 } 875 876 if (ill->ill_lso_capab != NULL) { 877 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 878 ill->ill_lso_capab = NULL; 879 } 880 881 if (ill->ill_dld_capab != NULL) { 882 kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); 883 ill->ill_dld_capab = NULL; 884 } 885 886 while (ill->ill_ipif != NULL) 887 ipif_free_tail(ill->ill_ipif); 888 889 /* 890 * We have removed all 
references to ilm from conn and the ones joined 891 * within the kernel. 892 * 893 * We don't walk conns, mrts and ires because 894 * 895 * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts. 896 * 2) ill_down ->ill_downi walks all the ires and cleans up 897 * ill references. 898 */ 899 ASSERT(ilm_walk_ill(ill) == 0); 900 901 /* 902 * If this ill is an IPMP meta-interface, blow away the illgrp. This 903 * is safe to do because the illgrp has already been unlinked from the 904 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it. 905 */ 906 if (IS_IPMP(ill)) { 907 ipmp_illgrp_destroy(ill->ill_grp); 908 ill->ill_grp = NULL; 909 } 910 911 /* 912 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free 913 * could free the phyint. No more reference to the phyint after this 914 * point. 915 */ 916 (void) ill_glist_delete(ill); 917 918 rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER); 919 if (ill->ill_ndd_name != NULL) 920 nd_unload(&ipst->ips_ip_g_nd, ill->ill_ndd_name); 921 rw_exit(&ipst->ips_ip_g_nd_lock); 922 923 if (ill->ill_frag_ptr != NULL) { 924 uint_t count; 925 926 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 927 mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock); 928 } 929 mi_free(ill->ill_frag_ptr); 930 ill->ill_frag_ptr = NULL; 931 ill->ill_frag_hash_tbl = NULL; 932 } 933 934 freemsg(ill->ill_nd_lla_mp); 935 /* Free all retained control messages. 
*/ 936 mpp = &ill->ill_first_mp_to_free; 937 do { 938 while (mpp[0]) { 939 mblk_t *mp; 940 mblk_t *mp1; 941 942 mp = mpp[0]; 943 mpp[0] = mp->b_next; 944 for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { 945 mp1->b_next = NULL; 946 mp1->b_prev = NULL; 947 } 948 freemsg(mp); 949 } 950 } while (mpp++ != &ill->ill_last_mp_to_free); 951 952 ill_free_mib(ill); 953 954 #ifdef DEBUG 955 ill_trace_cleanup(ill); 956 #endif 957 958 /* Drop refcnt here */ 959 netstack_rele(ill->ill_ipst->ips_netstack); 960 ill->ill_ipst = NULL; 961 } 962 963 static void 964 ill_free_mib(ill_t *ill) 965 { 966 ip_stack_t *ipst = ill->ill_ipst; 967 968 /* 969 * MIB statistics must not be lost, so when an interface 970 * goes away the counter values will be added to the global 971 * MIBs. 972 */ 973 if (ill->ill_ip_mib != NULL) { 974 if (ill->ill_isv6) { 975 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib, 976 ill->ill_ip_mib); 977 } else { 978 ip_mib2_add_ip_stats(&ipst->ips_ip_mib, 979 ill->ill_ip_mib); 980 } 981 982 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib)); 983 ill->ill_ip_mib = NULL; 984 } 985 if (ill->ill_icmp6_mib != NULL) { 986 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib, 987 ill->ill_icmp6_mib); 988 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib)); 989 ill->ill_icmp6_mib = NULL; 990 } 991 } 992 993 /* 994 * Concatenate together a physical address and a sap. 
995 * 996 * Sap_lengths are interpreted as follows: 997 * sap_length == 0 ==> no sap 998 * sap_length > 0 ==> sap is at the head of the dlpi address 999 * sap_length < 0 ==> sap is at the tail of the dlpi address 1000 */ 1001 static void 1002 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, 1003 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst) 1004 { 1005 uint16_t sap_addr = (uint16_t)sap_src; 1006 1007 if (sap_length == 0) { 1008 if (phys_src == NULL) 1009 bzero(dst, phys_length); 1010 else 1011 bcopy(phys_src, dst, phys_length); 1012 } else if (sap_length < 0) { 1013 if (phys_src == NULL) 1014 bzero(dst, phys_length); 1015 else 1016 bcopy(phys_src, dst, phys_length); 1017 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr)); 1018 } else { 1019 bcopy(&sap_addr, dst, sizeof (sap_addr)); 1020 if (phys_src == NULL) 1021 bzero((char *)dst + sap_length, phys_length); 1022 else 1023 bcopy(phys_src, (char *)dst + sap_length, phys_length); 1024 } 1025 } 1026 1027 /* 1028 * Generate a dl_unitdata_req mblk for the device and address given. 1029 * addr_length is the length of the physical portion of the address. 1030 * If addr is NULL include an all zero address of the specified length. 1031 * TRUE? In any case, addr_length is taken to be the entire length of the 1032 * dlpi address, including the absolute value of sap_length. 
1033 */ 1034 mblk_t * 1035 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, 1036 t_scalar_t sap_length) 1037 { 1038 dl_unitdata_req_t *dlur; 1039 mblk_t *mp; 1040 t_scalar_t abs_sap_length; /* absolute value */ 1041 1042 abs_sap_length = ABS(sap_length); 1043 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length, 1044 DL_UNITDATA_REQ); 1045 if (mp == NULL) 1046 return (NULL); 1047 dlur = (dl_unitdata_req_t *)mp->b_rptr; 1048 /* HACK: accomodate incompatible DLPI drivers */ 1049 if (addr_length == 8) 1050 addr_length = 6; 1051 dlur->dl_dest_addr_length = addr_length + abs_sap_length; 1052 dlur->dl_dest_addr_offset = sizeof (*dlur); 1053 dlur->dl_priority.dl_min = 0; 1054 dlur->dl_priority.dl_max = 0; 1055 ill_dlur_copy_address(addr, addr_length, sap, sap_length, 1056 (uchar_t *)&dlur[1]); 1057 return (mp); 1058 } 1059 1060 /* 1061 * Add the 'mp' to the list of pending mp's headed by ill_pending_mp 1062 * Return an error if we already have 1 or more ioctls in progress. 1063 * This is used only for non-exclusive ioctls. Currently this is used 1064 * for SIOC*ARP and SIOCGTUNPARAM ioctls. Most set ioctls are exclusive 1065 * and thus need to use ipsq_pending_mp_add. 1066 */ 1067 boolean_t 1068 ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) 1069 { 1070 ASSERT(MUTEX_HELD(&ill->ill_lock)); 1071 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1072 /* 1073 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls. 1074 */ 1075 ASSERT((add_mp->b_datap->db_type == M_IOCDATA) || 1076 (add_mp->b_datap->db_type == M_IOCTL)); 1077 1078 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1079 /* 1080 * Return error if the conn has started closing. The conn 1081 * could have finished cleaning up the pending mp list, 1082 * If so we should not add another mp to the list negating 1083 * the cleanup. 
1084 */ 1085 if (connp->conn_state_flags & CONN_CLOSING) 1086 return (B_FALSE); 1087 /* 1088 * Add the pending mp to the head of the list, chained by b_next. 1089 * Note down the conn on which the ioctl request came, in b_prev. 1090 * This will be used to later get the conn, when we get a response 1091 * on the ill queue, from some other module (typically arp) 1092 */ 1093 add_mp->b_next = (void *)ill->ill_pending_mp; 1094 add_mp->b_queue = CONNP_TO_WQ(connp); 1095 ill->ill_pending_mp = add_mp; 1096 if (connp != NULL) 1097 connp->conn_oper_pending_ill = ill; 1098 return (B_TRUE); 1099 } 1100 1101 /* 1102 * Retrieve the ill_pending_mp and return it. We have to walk the list 1103 * of mblks starting at ill_pending_mp, and match based on the ioc_id. 1104 */ 1105 mblk_t * 1106 ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) 1107 { 1108 mblk_t *prev = NULL; 1109 mblk_t *curr = NULL; 1110 uint_t id; 1111 conn_t *connp; 1112 1113 /* 1114 * When the conn closes, conn_ioctl_cleanup needs to clean 1115 * up the pending mp, but it does not know the ioc_id and 1116 * passes in a zero for it. 1117 */ 1118 mutex_enter(&ill->ill_lock); 1119 if (ioc_id != 0) 1120 *connpp = NULL; 1121 1122 /* Search the list for the appropriate ioctl based on ioc_id */ 1123 for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; 1124 prev = curr, curr = curr->b_next) { 1125 id = ((struct iocblk *)curr->b_rptr)->ioc_id; 1126 connp = Q_TO_CONN(curr->b_queue); 1127 /* Match based on the ioc_id or based on the conn */ 1128 if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) 1129 break; 1130 } 1131 1132 if (curr != NULL) { 1133 /* Unlink the mblk from the pending mp list */ 1134 if (prev != NULL) { 1135 prev->b_next = curr->b_next; 1136 } else { 1137 ASSERT(ill->ill_pending_mp == curr); 1138 ill->ill_pending_mp = curr->b_next; 1139 } 1140 1141 /* 1142 * conn refcnt must have been bumped up at the start of 1143 * the ioctl. So we can safely access the conn. 
1144 */ 1145 ASSERT(CONN_Q(curr->b_queue)); 1146 *connpp = Q_TO_CONN(curr->b_queue); 1147 curr->b_next = NULL; 1148 curr->b_queue = NULL; 1149 } 1150 1151 mutex_exit(&ill->ill_lock); 1152 1153 return (curr); 1154 } 1155 1156 /* 1157 * Add the pending mp to the list. There can be only 1 pending mp 1158 * in the list. Any exclusive ioctl that needs to wait for a response 1159 * from another module or driver needs to use this function to set 1160 * the ipx_pending_mp to the ioctl mblk and wait for the response from 1161 * the other module/driver. This is also used while waiting for the 1162 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif. 1163 */ 1164 boolean_t 1165 ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp, 1166 int waitfor) 1167 { 1168 ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop; 1169 1170 ASSERT(IAM_WRITER_IPIF(ipif)); 1171 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 1172 ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); 1173 ASSERT(ipx->ipx_pending_mp == NULL); 1174 /* 1175 * The caller may be using a different ipif than the one passed into 1176 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4 1177 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT 1178 * that `ipx_current_ipif == ipif'. 1179 */ 1180 ASSERT(ipx->ipx_current_ipif != NULL); 1181 1182 /* 1183 * M_IOCDATA from ioctls, M_IOCTL from tunnel ioctls, 1184 * M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the driver. 1185 */ 1186 ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_IOCTL) || 1187 (DB_TYPE(add_mp) == M_ERROR) || (DB_TYPE(add_mp) == M_HANGUP) || 1188 (DB_TYPE(add_mp) == M_PROTO) || (DB_TYPE(add_mp) == M_PCPROTO)); 1189 1190 if (connp != NULL) { 1191 ASSERT(MUTEX_HELD(&connp->conn_lock)); 1192 /* 1193 * Return error if the conn has started closing. 
The conn 1194 * could have finished cleaning up the pending mp list, 1195 * If so we should not add another mp to the list negating 1196 * the cleanup. 1197 */ 1198 if (connp->conn_state_flags & CONN_CLOSING) 1199 return (B_FALSE); 1200 } 1201 mutex_enter(&ipx->ipx_lock); 1202 ipx->ipx_pending_ipif = ipif; 1203 /* 1204 * Note down the queue in b_queue. This will be returned by 1205 * ipsq_pending_mp_get. Caller will then use these values to restart 1206 * the processing 1207 */ 1208 add_mp->b_next = NULL; 1209 add_mp->b_queue = q; 1210 ipx->ipx_pending_mp = add_mp; 1211 ipx->ipx_waitfor = waitfor; 1212 mutex_exit(&ipx->ipx_lock); 1213 1214 if (connp != NULL) 1215 connp->conn_oper_pending_ill = ipif->ipif_ill; 1216 1217 return (B_TRUE); 1218 } 1219 1220 /* 1221 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp 1222 * queued in the list. 1223 */ 1224 mblk_t * 1225 ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp) 1226 { 1227 mblk_t *curr = NULL; 1228 ipxop_t *ipx = ipsq->ipsq_xop; 1229 1230 *connpp = NULL; 1231 mutex_enter(&ipx->ipx_lock); 1232 if (ipx->ipx_pending_mp == NULL) { 1233 mutex_exit(&ipx->ipx_lock); 1234 return (NULL); 1235 } 1236 1237 /* There can be only 1 such excl message */ 1238 curr = ipx->ipx_pending_mp; 1239 ASSERT(curr->b_next == NULL); 1240 ipx->ipx_pending_ipif = NULL; 1241 ipx->ipx_pending_mp = NULL; 1242 ipx->ipx_waitfor = 0; 1243 mutex_exit(&ipx->ipx_lock); 1244 1245 if (CONN_Q(curr->b_queue)) { 1246 /* 1247 * This mp did a refhold on the conn, at the start of the ioctl. 1248 * So we can safely return a pointer to the conn to the caller. 1249 */ 1250 *connpp = Q_TO_CONN(curr->b_queue); 1251 } else { 1252 *connpp = NULL; 1253 } 1254 curr->b_next = NULL; 1255 curr->b_prev = NULL; 1256 return (curr); 1257 } 1258 1259 /* 1260 * Cleanup the ioctl mp queued in ipx_pending_mp 1261 * - Called in the ill_delete path 1262 * - Called in the M_ERROR or M_HANGUP path on the ill. 1263 * - Called in the conn close path. 
1264 */ 1265 boolean_t 1266 ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) 1267 { 1268 mblk_t *mp; 1269 ipxop_t *ipx; 1270 queue_t *q; 1271 ipif_t *ipif; 1272 1273 ASSERT(IAM_WRITER_ILL(ill)); 1274 ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 1275 1276 /* 1277 * If connp is null, unconditionally clean up the ipx_pending_mp. 1278 * This happens in M_ERROR/M_HANGUP. We need to abort the current ioctl 1279 * even if it is meant for another ill, since we have to enqueue 1280 * a new mp now in ipx_pending_mp to complete the ipif_down. 1281 * If connp is non-null we are called from the conn close path. 1282 */ 1283 mutex_enter(&ipx->ipx_lock); 1284 mp = ipx->ipx_pending_mp; 1285 if (mp == NULL || (connp != NULL && 1286 mp->b_queue != CONNP_TO_WQ(connp))) { 1287 mutex_exit(&ipx->ipx_lock); 1288 return (B_FALSE); 1289 } 1290 /* Now remove from the ipx_pending_mp */ 1291 ipx->ipx_pending_mp = NULL; 1292 q = mp->b_queue; 1293 mp->b_next = NULL; 1294 mp->b_prev = NULL; 1295 mp->b_queue = NULL; 1296 1297 ipif = ipx->ipx_pending_ipif; 1298 ipx->ipx_pending_ipif = NULL; 1299 ipx->ipx_waitfor = 0; 1300 ipx->ipx_current_ipif = NULL; 1301 ipx->ipx_current_ioctl = 0; 1302 ipx->ipx_current_done = B_TRUE; 1303 mutex_exit(&ipx->ipx_lock); 1304 1305 if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { 1306 if (connp == NULL) { 1307 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 1308 } else { 1309 ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL); 1310 mutex_enter(&ipif->ipif_ill->ill_lock); 1311 ipif->ipif_state_flags &= ~IPIF_CHANGING; 1312 mutex_exit(&ipif->ipif_ill->ill_lock); 1313 } 1314 } else { 1315 /* 1316 * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't 1317 * be just inet_freemsg. we have to restart it 1318 * otherwise the thread will be stuck. 1319 */ 1320 inet_freemsg(mp); 1321 } 1322 return (B_TRUE); 1323 } 1324 1325 /* 1326 * The ill is closing. Cleanup all the pending mps. Called exclusively 1327 * towards the end of ill_delete. 
The refcount has gone to 0. So nobody 1328 * knows this ill, and hence nobody can add an mp to this list 1329 */ 1330 static void 1331 ill_pending_mp_cleanup(ill_t *ill) 1332 { 1333 mblk_t *mp; 1334 queue_t *q; 1335 1336 ASSERT(IAM_WRITER_ILL(ill)); 1337 1338 mutex_enter(&ill->ill_lock); 1339 /* 1340 * Every mp on the pending mp list originating from an ioctl 1341 * added 1 to the conn refcnt, at the start of the ioctl. 1342 * So bump it down now. See comments in ip_wput_nondata() 1343 */ 1344 while (ill->ill_pending_mp != NULL) { 1345 mp = ill->ill_pending_mp; 1346 ill->ill_pending_mp = mp->b_next; 1347 mutex_exit(&ill->ill_lock); 1348 1349 q = mp->b_queue; 1350 ASSERT(CONN_Q(q)); 1351 mp->b_next = NULL; 1352 mp->b_prev = NULL; 1353 mp->b_queue = NULL; 1354 ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); 1355 mutex_enter(&ill->ill_lock); 1356 } 1357 ill->ill_pending_ipif = NULL; 1358 1359 mutex_exit(&ill->ill_lock); 1360 } 1361 1362 /* 1363 * Called in the conn close path and ill delete path 1364 */ 1365 static void 1366 ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) 1367 { 1368 ipsq_t *ipsq; 1369 mblk_t *prev; 1370 mblk_t *curr; 1371 mblk_t *next; 1372 queue_t *q; 1373 mblk_t *tmp_list = NULL; 1374 1375 ASSERT(IAM_WRITER_ILL(ill)); 1376 if (connp != NULL) 1377 q = CONNP_TO_WQ(connp); 1378 else 1379 q = ill->ill_wq; 1380 1381 ipsq = ill->ill_phyint->phyint_ipsq; 1382 /* 1383 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any. 1384 * In the case of ioctl from a conn, there can be only 1 mp 1385 * queued on the ipsq. If an ill is being unplumbed, only messages 1386 * related to this ill are flushed, like M_ERROR or M_HANGUP message. 1387 * ioctls meant for this ill form conn's are not flushed. They will 1388 * be processed during ipsq_exit and will not find the ill and will 1389 * return error. 
1390 */ 1391 mutex_enter(&ipsq->ipsq_lock); 1392 for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL; 1393 curr = next) { 1394 next = curr->b_next; 1395 if (curr->b_queue == q || curr->b_queue == RD(q)) { 1396 /* Unlink the mblk from the pending mp list */ 1397 if (prev != NULL) { 1398 prev->b_next = curr->b_next; 1399 } else { 1400 ASSERT(ipsq->ipsq_xopq_mphead == curr); 1401 ipsq->ipsq_xopq_mphead = curr->b_next; 1402 } 1403 if (ipsq->ipsq_xopq_mptail == curr) 1404 ipsq->ipsq_xopq_mptail = prev; 1405 /* 1406 * Create a temporary list and release the ipsq lock 1407 * New elements are added to the head of the tmp_list 1408 */ 1409 curr->b_next = tmp_list; 1410 tmp_list = curr; 1411 } else { 1412 prev = curr; 1413 } 1414 } 1415 mutex_exit(&ipsq->ipsq_lock); 1416 1417 while (tmp_list != NULL) { 1418 curr = tmp_list; 1419 tmp_list = curr->b_next; 1420 curr->b_next = NULL; 1421 curr->b_prev = NULL; 1422 curr->b_queue = NULL; 1423 if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { 1424 ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? 1425 CONN_CLOSE : NO_COPYOUT, NULL); 1426 } else { 1427 /* 1428 * IP-MT XXX In the case of TLI/XTI bind / optmgmt 1429 * this can't be just inet_freemsg. we have to 1430 * restart it otherwise the thread will be stuck. 1431 */ 1432 inet_freemsg(curr); 1433 } 1434 } 1435 } 1436 1437 /* 1438 * This conn has started closing. Cleanup any pending ioctl from this conn. 1439 * STREAMS ensures that there can be at most 1 ioctl pending on a stream. 1440 */ 1441 void 1442 conn_ioctl_cleanup(conn_t *connp) 1443 { 1444 mblk_t *curr; 1445 ipsq_t *ipsq; 1446 ill_t *ill; 1447 boolean_t refheld; 1448 1449 /* 1450 * Is any exclusive ioctl pending ? If so clean it up. If the 1451 * ioctl has not yet started, the mp is pending in the list headed by 1452 * ipsq_xopq_head. If the ioctl has started the mp could be present in 1453 * ipx_pending_mp. 
If the ioctl timed out in the streamhead but 1454 * is currently executing now the mp is not queued anywhere but 1455 * conn_oper_pending_ill is null. The conn close will wait 1456 * till the conn_ref drops to zero. 1457 */ 1458 mutex_enter(&connp->conn_lock); 1459 ill = connp->conn_oper_pending_ill; 1460 if (ill == NULL) { 1461 mutex_exit(&connp->conn_lock); 1462 return; 1463 } 1464 1465 curr = ill_pending_mp_get(ill, &connp, 0); 1466 if (curr != NULL) { 1467 mutex_exit(&connp->conn_lock); 1468 CONN_DEC_REF(connp); 1469 inet_freemsg(curr); 1470 return; 1471 } 1472 /* 1473 * We may not be able to refhold the ill if the ill/ipif 1474 * is changing. But we need to make sure that the ill will 1475 * not vanish. So we just bump up the ill_waiter count. 1476 */ 1477 refheld = ill_waiter_inc(ill); 1478 mutex_exit(&connp->conn_lock); 1479 if (refheld) { 1480 if (ipsq_enter(ill, B_TRUE, NEW_OP)) { 1481 ill_waiter_dcr(ill); 1482 /* 1483 * Check whether this ioctl has started and is 1484 * pending. If it is not found there then check 1485 * whether this ioctl has not even started and is in 1486 * the ipsq_xopq list. 1487 */ 1488 if (!ipsq_pending_mp_cleanup(ill, connp)) 1489 ipsq_xopq_mp_cleanup(ill, connp); 1490 ipsq = ill->ill_phyint->phyint_ipsq; 1491 ipsq_exit(ipsq); 1492 return; 1493 } 1494 } 1495 1496 /* 1497 * The ill is also closing and we could not bump up the 1498 * ill_waiter_count or we could not enter the ipsq. Leave 1499 * the cleanup to ill_delete 1500 */ 1501 mutex_enter(&connp->conn_lock); 1502 while (connp->conn_oper_pending_ill != NULL) 1503 cv_wait(&connp->conn_refcv, &connp->conn_lock); 1504 mutex_exit(&connp->conn_lock); 1505 if (refheld) 1506 ill_waiter_dcr(ill); 1507 } 1508 1509 /* 1510 * ipcl_walk function for cleaning up conn_*_ill fields. 
1511 */ 1512 static void 1513 conn_cleanup_ill(conn_t *connp, caddr_t arg) 1514 { 1515 ill_t *ill = (ill_t *)arg; 1516 ire_t *ire; 1517 1518 mutex_enter(&connp->conn_lock); 1519 if (connp->conn_multicast_ill == ill) { 1520 /* Revert to late binding */ 1521 connp->conn_multicast_ill = NULL; 1522 } 1523 if (connp->conn_incoming_ill == ill) 1524 connp->conn_incoming_ill = NULL; 1525 if (connp->conn_outgoing_ill == ill) 1526 connp->conn_outgoing_ill = NULL; 1527 if (connp->conn_dhcpinit_ill == ill) { 1528 connp->conn_dhcpinit_ill = NULL; 1529 ASSERT(ill->ill_dhcpinit != 0); 1530 atomic_dec_32(&ill->ill_dhcpinit); 1531 } 1532 if (connp->conn_ire_cache != NULL) { 1533 ire = connp->conn_ire_cache; 1534 /* 1535 * Source address selection makes it possible for IRE_CACHE 1536 * entries to be created with ire_stq coming from interface X 1537 * and ipif coming from interface Y. Thus whenever interface 1538 * X goes down, remove all references to it by checking both 1539 * on ire_ipif and ire_stq. 1540 */ 1541 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1542 (ire->ire_type == IRE_CACHE && 1543 ire->ire_stq == ill->ill_wq)) { 1544 connp->conn_ire_cache = NULL; 1545 mutex_exit(&connp->conn_lock); 1546 ire_refrele_notr(ire); 1547 return; 1548 } 1549 } 1550 mutex_exit(&connp->conn_lock); 1551 } 1552 1553 static void 1554 ill_down_ipifs_tail(ill_t *ill) 1555 { 1556 ipif_t *ipif; 1557 1558 ASSERT(IAM_WRITER_ILL(ill)); 1559 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1560 ipif_non_duplicate(ipif); 1561 ipif_down_tail(ipif); 1562 } 1563 } 1564 1565 /* ARGSUSED */ 1566 void 1567 ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 1568 { 1569 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1570 ill_down_ipifs_tail(q->q_ptr); 1571 freemsg(mp); 1572 ipsq_current_finish(ipsq); 1573 } 1574 1575 /* 1576 * ill_down_start is called when we want to down this ill and bring it up again 1577 * It is called when we receive an M_ERROR / M_HANGUP. 
In this case we shut down 1578 * all interfaces, but don't tear down any plumbing. 1579 */ 1580 boolean_t 1581 ill_down_start(queue_t *q, mblk_t *mp) 1582 { 1583 ill_t *ill = q->q_ptr; 1584 ipif_t *ipif; 1585 1586 ASSERT(IAM_WRITER_ILL(ill)); 1587 1588 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1589 (void) ipif_down(ipif, NULL, NULL); 1590 1591 ill_down(ill); 1592 1593 (void) ipsq_pending_mp_cleanup(ill, NULL); 1594 1595 ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); 1596 1597 /* 1598 * Atomically test and add the pending mp if references are active. 1599 */ 1600 mutex_enter(&ill->ill_lock); 1601 if (!ill_is_quiescent(ill)) { 1602 /* call cannot fail since `conn_t *' argument is NULL */ 1603 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 1604 mp, ILL_DOWN); 1605 mutex_exit(&ill->ill_lock); 1606 return (B_FALSE); 1607 } 1608 mutex_exit(&ill->ill_lock); 1609 return (B_TRUE); 1610 } 1611 1612 static void 1613 ill_down(ill_t *ill) 1614 { 1615 ip_stack_t *ipst = ill->ill_ipst; 1616 1617 /* Blow off any IREs dependent on this ILL. */ 1618 ire_walk(ill_downi, ill, ipst); 1619 1620 /* Remove any conn_*_ill depending on this ill */ 1621 ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); 1622 } 1623 1624 /* 1625 * ire_walk routine used to delete every IRE that depends on queues 1626 * associated with 'ill'. (Always called as writer.) 1627 */ 1628 static void 1629 ill_downi(ire_t *ire, char *ill_arg) 1630 { 1631 ill_t *ill = (ill_t *)ill_arg; 1632 1633 /* 1634 * Source address selection makes it possible for IRE_CACHE 1635 * entries to be created with ire_stq coming from interface X 1636 * and ipif coming from interface Y. Thus whenever interface 1637 * X goes down, remove all references to it by checking both 1638 * on ire_ipif and ire_stq. 
1639 */ 1640 if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || 1641 (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { 1642 ire_delete(ire); 1643 } 1644 } 1645 1646 /* 1647 * Remove ire/nce from the fastpath list. 1648 */ 1649 void 1650 ill_fastpath_nack(ill_t *ill) 1651 { 1652 nce_fastpath_list_dispatch(ill, NULL, NULL); 1653 } 1654 1655 /* Consume an M_IOCACK of the fastpath probe. */ 1656 void 1657 ill_fastpath_ack(ill_t *ill, mblk_t *mp) 1658 { 1659 mblk_t *mp1 = mp; 1660 1661 /* 1662 * If this was the first attempt turn on the fastpath probing. 1663 */ 1664 mutex_enter(&ill->ill_lock); 1665 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) 1666 ill->ill_dlpi_fastpath_state = IDS_OK; 1667 mutex_exit(&ill->ill_lock); 1668 1669 /* Free the M_IOCACK mblk, hold on to the data */ 1670 mp = mp->b_cont; 1671 freeb(mp1); 1672 if (mp == NULL) 1673 return; 1674 if (mp->b_cont != NULL) { 1675 /* 1676 * Update all IRE's or NCE's that are waiting for 1677 * fastpath update. 1678 */ 1679 nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp); 1680 mp1 = mp->b_cont; 1681 freeb(mp); 1682 mp = mp1; 1683 } else { 1684 ip0dbg(("ill_fastpath_ack: no b_cont\n")); 1685 } 1686 1687 freeb(mp); 1688 } 1689 1690 /* 1691 * Throw an M_IOCTL message downstream asking "do you know fastpath?" 1692 * The data portion of the request is a dl_unitdata_req_t template for 1693 * what we would send downstream in the absence of a fastpath confirmation. 1694 */ 1695 int 1696 ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) 1697 { 1698 struct iocblk *ioc; 1699 mblk_t *mp; 1700 1701 if (dlur_mp == NULL) 1702 return (EINVAL); 1703 1704 mutex_enter(&ill->ill_lock); 1705 switch (ill->ill_dlpi_fastpath_state) { 1706 case IDS_FAILED: 1707 /* 1708 * Driver NAKed the first fastpath ioctl - assume it doesn't 1709 * support it. 
1710 */ 1711 mutex_exit(&ill->ill_lock); 1712 return (ENOTSUP); 1713 case IDS_UNKNOWN: 1714 /* This is the first probe */ 1715 ill->ill_dlpi_fastpath_state = IDS_INPROGRESS; 1716 break; 1717 default: 1718 break; 1719 } 1720 mutex_exit(&ill->ill_lock); 1721 1722 if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL) 1723 return (EAGAIN); 1724 1725 mp->b_cont = copyb(dlur_mp); 1726 if (mp->b_cont == NULL) { 1727 freeb(mp); 1728 return (EAGAIN); 1729 } 1730 1731 ioc = (struct iocblk *)mp->b_rptr; 1732 ioc->ioc_count = msgdsize(mp->b_cont); 1733 1734 putnext(ill->ill_wq, mp); 1735 return (0); 1736 } 1737 1738 void 1739 ill_capability_probe(ill_t *ill) 1740 { 1741 mblk_t *mp; 1742 1743 ASSERT(IAM_WRITER_ILL(ill)); 1744 1745 if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && 1746 ill->ill_dlpi_capab_state != IDCS_FAILED) 1747 return; 1748 1749 /* 1750 * We are starting a new cycle of capability negotiation. 1751 * Free up the capab reset messages of any previous incarnation. 1752 * We will do a fresh allocation when we get the response to our probe 1753 */ 1754 if (ill->ill_capab_reset_mp != NULL) { 1755 freemsg(ill->ill_capab_reset_mp); 1756 ill->ill_capab_reset_mp = NULL; 1757 } 1758 1759 ip1dbg(("ill_capability_probe: starting capability negotiation\n")); 1760 1761 mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); 1762 if (mp == NULL) 1763 return; 1764 1765 ill_capability_send(ill, mp); 1766 ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; 1767 } 1768 1769 void 1770 ill_capability_reset(ill_t *ill, boolean_t reneg) 1771 { 1772 ASSERT(IAM_WRITER_ILL(ill)); 1773 1774 if (ill->ill_dlpi_capab_state != IDCS_OK) 1775 return; 1776 1777 ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; 1778 1779 ill_capability_send(ill, ill->ill_capab_reset_mp); 1780 ill->ill_capab_reset_mp = NULL; 1781 /* 1782 * We turn off all capabilities except those pertaining to 1783 * direct function call capabilities viz. 
ILL_CAPAB_DLD* 1784 * which will be turned off by the corresponding reset functions. 1785 */ 1786 ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | 1787 ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); 1788 } 1789 1790 static void 1791 ill_capability_reset_alloc(ill_t *ill) 1792 { 1793 mblk_t *mp; 1794 size_t size = 0; 1795 int err; 1796 dl_capability_req_t *capb; 1797 1798 ASSERT(IAM_WRITER_ILL(ill)); 1799 ASSERT(ill->ill_capab_reset_mp == NULL); 1800 1801 if (ILL_MDT_CAPABLE(ill)) 1802 size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 1803 1804 if (ILL_HCKSUM_CAPABLE(ill)) { 1805 size += sizeof (dl_capability_sub_t) + 1806 sizeof (dl_capab_hcksum_t); 1807 } 1808 1809 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { 1810 size += sizeof (dl_capability_sub_t) + 1811 sizeof (dl_capab_zerocopy_t); 1812 } 1813 1814 if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { 1815 size += sizeof (dl_capability_sub_t); 1816 size += ill_capability_ipsec_reset_size(ill, NULL, NULL, 1817 NULL, NULL); 1818 } 1819 1820 if (ill->ill_capabilities & ILL_CAPAB_DLD) { 1821 size += sizeof (dl_capability_sub_t) + 1822 sizeof (dl_capab_dld_t); 1823 } 1824 1825 mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, 1826 STR_NOSIG, &err); 1827 1828 mp->b_datap->db_type = M_PROTO; 1829 bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); 1830 1831 capb = (dl_capability_req_t *)mp->b_rptr; 1832 capb->dl_primitive = DL_CAPABILITY_REQ; 1833 capb->dl_sub_offset = sizeof (dl_capability_req_t); 1834 capb->dl_sub_length = size; 1835 1836 mp->b_wptr += sizeof (dl_capability_req_t); 1837 1838 /* 1839 * Each handler fills in the corresponding dl_capability_sub_t 1840 * inside the mblk, 1841 */ 1842 ill_capability_mdt_reset_fill(ill, mp); 1843 ill_capability_hcksum_reset_fill(ill, mp); 1844 ill_capability_zerocopy_reset_fill(ill, mp); 1845 ill_capability_ipsec_reset_fill(ill, mp); 1846 ill_capability_dld_reset_fill(ill, mp); 1847 1848 
ill->ill_capab_reset_mp = mp; 1849 } 1850 1851 static void 1852 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) 1853 { 1854 dl_capab_id_t *id_ic; 1855 uint_t sub_dl_cap = outers->dl_cap; 1856 dl_capability_sub_t *inners; 1857 uint8_t *capend; 1858 1859 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER); 1860 1861 /* 1862 * Note: range checks here are not absolutely sufficient to 1863 * make us robust against malformed messages sent by drivers; 1864 * this is in keeping with the rest of IP's dlpi handling. 1865 * (Remember, it's coming from something else in the kernel 1866 * address space) 1867 */ 1868 1869 capend = (uint8_t *)(outers + 1) + outers->dl_length; 1870 if (capend > mp->b_wptr) { 1871 cmn_err(CE_WARN, "ill_capability_id_ack: " 1872 "malformed sub-capability too long for mblk"); 1873 return; 1874 } 1875 1876 id_ic = (dl_capab_id_t *)(outers + 1); 1877 1878 if (outers->dl_length < sizeof (*id_ic) || 1879 (inners = &id_ic->id_subcap, 1880 inners->dl_length > (outers->dl_length - sizeof (*inners)))) { 1881 cmn_err(CE_WARN, "ill_capability_id_ack: malformed " 1882 "encapsulated capab type %d too long for mblk", 1883 inners->dl_cap); 1884 return; 1885 } 1886 1887 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) { 1888 ip1dbg(("ill_capability_id_ack: mid token for capab type %d " 1889 "isn't as expected; pass-thru module(s) detected, " 1890 "discarding capability\n", inners->dl_cap)); 1891 return; 1892 } 1893 1894 /* Process the encapsulated sub-capability */ 1895 ill_capability_dispatch(ill, mp, inners, B_TRUE); 1896 } 1897 1898 /* 1899 * Process Multidata Transmit capability negotiation ack received from a 1900 * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a 1901 * DL_CAPABILITY_ACK message. 
1902 */ 1903 static void 1904 ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1905 { 1906 mblk_t *nmp = NULL; 1907 dl_capability_req_t *oc; 1908 dl_capab_mdt_t *mdt_ic, *mdt_oc; 1909 ill_mdt_capab_t **ill_mdt_capab; 1910 uint_t sub_dl_cap = isub->dl_cap; 1911 uint8_t *capend; 1912 1913 ASSERT(sub_dl_cap == DL_CAPAB_MDT); 1914 1915 ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; 1916 1917 /* 1918 * Note: range checks here are not absolutely sufficient to 1919 * make us robust against malformed messages sent by drivers; 1920 * this is in keeping with the rest of IP's dlpi handling. 1921 * (Remember, it's coming from something else in the kernel 1922 * address space) 1923 */ 1924 1925 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1926 if (capend > mp->b_wptr) { 1927 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1928 "malformed sub-capability too long for mblk"); 1929 return; 1930 } 1931 1932 mdt_ic = (dl_capab_mdt_t *)(isub + 1); 1933 1934 if (mdt_ic->mdt_version != MDT_VERSION_2) { 1935 cmn_err(CE_CONT, "ill_capability_mdt_ack: " 1936 "unsupported MDT sub-capability (version %d, expected %d)", 1937 mdt_ic->mdt_version, MDT_VERSION_2); 1938 return; 1939 } 1940 1941 if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { 1942 ip1dbg(("ill_capability_mdt_ack: mid token for MDT " 1943 "capability isn't as expected; pass-thru module(s) " 1944 "detected, discarding capability\n")); 1945 return; 1946 } 1947 1948 if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { 1949 1950 if (*ill_mdt_capab == NULL) { 1951 *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), 1952 KM_NOSLEEP); 1953 if (*ill_mdt_capab == NULL) { 1954 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1955 "could not enable MDT version %d " 1956 "for %s (ENOMEM)\n", MDT_VERSION_2, 1957 ill->ill_name); 1958 return; 1959 } 1960 } 1961 1962 ip1dbg(("ill_capability_mdt_ack: interface %s supports " 1963 "MDT version %d (%d bytes leading, %d bytes trailing " 1964 "header spaces, %d 
max pld bufs, %d span limit)\n", 1965 ill->ill_name, MDT_VERSION_2, 1966 mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, 1967 mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); 1968 1969 (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; 1970 (*ill_mdt_capab)->ill_mdt_on = 1; 1971 /* 1972 * Round the following values to the nearest 32-bit; ULP 1973 * may further adjust them to accomodate for additional 1974 * protocol headers. We pass these values to ULP during 1975 * bind time. 1976 */ 1977 (*ill_mdt_capab)->ill_mdt_hdr_head = 1978 roundup(mdt_ic->mdt_hdr_head, 4); 1979 (*ill_mdt_capab)->ill_mdt_hdr_tail = 1980 roundup(mdt_ic->mdt_hdr_tail, 4); 1981 (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; 1982 (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; 1983 1984 ill->ill_capabilities |= ILL_CAPAB_MDT; 1985 } else { 1986 uint_t size; 1987 uchar_t *rptr; 1988 1989 size = sizeof (dl_capability_req_t) + 1990 sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); 1991 1992 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 1993 cmn_err(CE_WARN, "ill_capability_mdt_ack: " 1994 "could not enable MDT for %s (ENOMEM)\n", 1995 ill->ill_name); 1996 return; 1997 } 1998 1999 rptr = nmp->b_rptr; 2000 /* initialize dl_capability_req_t */ 2001 oc = (dl_capability_req_t *)nmp->b_rptr; 2002 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2003 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2004 sizeof (dl_capab_mdt_t); 2005 nmp->b_rptr += sizeof (dl_capability_req_t); 2006 2007 /* initialize dl_capability_sub_t */ 2008 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2009 nmp->b_rptr += sizeof (*isub); 2010 2011 /* initialize dl_capab_mdt_t */ 2012 mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; 2013 bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); 2014 2015 nmp->b_rptr = rptr; 2016 2017 ip1dbg(("ill_capability_mdt_ack: asking interface %s " 2018 "to enable MDT version %d\n", ill->ill_name, 2019 MDT_VERSION_2)); 2020 2021 /* set ENABLE flag */ 2022 mdt_oc->mdt_flags |= 
DL_CAPAB_MDT_ENABLE; 2023 2024 /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ 2025 ill_capability_send(ill, nmp); 2026 } 2027 } 2028 2029 static void 2030 ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) 2031 { 2032 dl_capab_mdt_t *mdt_subcap; 2033 dl_capability_sub_t *dl_subcap; 2034 2035 if (!ILL_MDT_CAPABLE(ill)) 2036 return; 2037 2038 ASSERT(ill->ill_mdt_capab != NULL); 2039 2040 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2041 dl_subcap->dl_cap = DL_CAPAB_MDT; 2042 dl_subcap->dl_length = sizeof (*mdt_subcap); 2043 2044 mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); 2045 mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; 2046 mdt_subcap->mdt_flags = 0; 2047 mdt_subcap->mdt_hdr_head = 0; 2048 mdt_subcap->mdt_hdr_tail = 0; 2049 2050 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); 2051 } 2052 2053 static void 2054 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) 2055 { 2056 dl_capability_sub_t *dl_subcap; 2057 2058 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2059 return; 2060 2061 /* 2062 * The dl_capab_dld_t that follows the dl_capability_sub_t is not 2063 * initialized below since it is not used by DLD. 2064 */ 2065 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2066 dl_subcap->dl_cap = DL_CAPAB_DLD; 2067 dl_subcap->dl_length = sizeof (dl_capab_dld_t); 2068 2069 mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); 2070 } 2071 2072 /* 2073 * Send a DL_NOTIFY_REQ to the specified ill to enable 2074 * DL_NOTE_PROMISC_ON/OFF_PHYS notifications. 2075 * Invoked by ill_capability_ipsec_ack() before enabling IPsec hardware 2076 * acceleration. 2077 * Returns B_TRUE on success, B_FALSE if the message could not be sent. 
2078 */ 2079 static boolean_t 2080 ill_enable_promisc_notify(ill_t *ill) 2081 { 2082 mblk_t *mp; 2083 dl_notify_req_t *req; 2084 2085 IPSECHW_DEBUG(IPSECHW_PKT, ("ill_enable_promisc_notify:\n")); 2086 2087 mp = ip_dlpi_alloc(sizeof (dl_notify_req_t), DL_NOTIFY_REQ); 2088 if (mp == NULL) 2089 return (B_FALSE); 2090 2091 req = (dl_notify_req_t *)mp->b_rptr; 2092 req->dl_notifications = DL_NOTE_PROMISC_ON_PHYS | 2093 DL_NOTE_PROMISC_OFF_PHYS; 2094 2095 ill_dlpi_send(ill, mp); 2096 2097 return (B_TRUE); 2098 } 2099 2100 /* 2101 * Allocate an IPsec capability request which will be filled by our 2102 * caller to turn on support for one or more algorithms. 2103 */ 2104 static mblk_t * 2105 ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) 2106 { 2107 mblk_t *nmp; 2108 dl_capability_req_t *ocap; 2109 dl_capab_ipsec_t *ocip; 2110 dl_capab_ipsec_t *icip; 2111 uint8_t *ptr; 2112 icip = (dl_capab_ipsec_t *)(isub + 1); 2113 2114 /* 2115 * The first time around, we send a DL_NOTIFY_REQ to enable 2116 * PROMISC_ON/OFF notification from the provider. We need to 2117 * do this before enabling the algorithms to avoid leakage of 2118 * cleartext packets. 2119 */ 2120 2121 if (!ill_enable_promisc_notify(ill)) 2122 return (NULL); 2123 2124 /* 2125 * Allocate new mblk which will contain a new capability 2126 * request to enable the capabilities. 
2127 */ 2128 2129 nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + 2130 sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); 2131 if (nmp == NULL) 2132 return (NULL); 2133 2134 ptr = nmp->b_rptr; 2135 2136 /* initialize dl_capability_req_t */ 2137 ocap = (dl_capability_req_t *)ptr; 2138 ocap->dl_sub_offset = sizeof (dl_capability_req_t); 2139 ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; 2140 ptr += sizeof (dl_capability_req_t); 2141 2142 /* initialize dl_capability_sub_t */ 2143 bcopy(isub, ptr, sizeof (*isub)); 2144 ptr += sizeof (*isub); 2145 2146 /* initialize dl_capab_ipsec_t */ 2147 ocip = (dl_capab_ipsec_t *)ptr; 2148 bcopy(icip, ocip, sizeof (*icip)); 2149 2150 nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); 2151 return (nmp); 2152 } 2153 2154 /* 2155 * Process an IPsec capability negotiation ack received from a DLS Provider. 2156 * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or 2157 * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. 2158 */ 2159 static void 2160 ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2161 { 2162 dl_capab_ipsec_t *icip; 2163 dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ 2164 dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. 
*/ 2165 uint_t cipher, nciphers; 2166 mblk_t *nmp; 2167 uint_t alg_len; 2168 boolean_t need_sadb_dump; 2169 uint_t sub_dl_cap = isub->dl_cap; 2170 ill_ipsec_capab_t **ill_capab; 2171 uint64_t ill_capab_flag; 2172 uint8_t *capend, *ciphend; 2173 boolean_t sadb_resync; 2174 2175 ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || 2176 sub_dl_cap == DL_CAPAB_IPSEC_ESP); 2177 2178 if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { 2179 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; 2180 ill_capab_flag = ILL_CAPAB_AH; 2181 } else { 2182 ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; 2183 ill_capab_flag = ILL_CAPAB_ESP; 2184 } 2185 2186 /* 2187 * If the ill capability structure exists, then this incoming 2188 * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. 2189 * If this is so, then we'd need to resynchronize the SADB 2190 * after re-enabling the offloaded ciphers. 2191 */ 2192 sadb_resync = (*ill_capab != NULL); 2193 2194 /* 2195 * Note: range checks here are not absolutely sufficient to 2196 * make us robust against malformed messages sent by drivers; 2197 * this is in keeping with the rest of IP's dlpi handling. 2198 * (Remember, it's coming from something else in the kernel 2199 * address space) 2200 */ 2201 2202 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2203 if (capend > mp->b_wptr) { 2204 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2205 "malformed sub-capability too long for mblk"); 2206 return; 2207 } 2208 2209 /* 2210 * There are two types of acks we process here: 2211 * 1. acks in reply to a (first form) generic capability req 2212 * (no ENABLE flag set) 2213 * 2. acks in reply to a ENABLE capability req. 
2214 * (ENABLE flag set) 2215 * 2216 * We process the subcapability passed as argument as follows: 2217 * 1 do initializations 2218 * 1.1 initialize nmp = NULL 2219 * 1.2 set need_sadb_dump to B_FALSE 2220 * 2 for each cipher in subcapability: 2221 * 2.1 if ENABLE flag is set: 2222 * 2.1.1 update per-ill ipsec capabilities info 2223 * 2.1.2 set need_sadb_dump to B_TRUE 2224 * 2.2 if ENABLE flag is not set: 2225 * 2.2.1 if nmp is NULL: 2226 * 2.2.1.1 allocate and initialize nmp 2227 * 2.2.1.2 init current pos in nmp 2228 * 2.2.2 copy current cipher to current pos in nmp 2229 * 2.2.3 set ENABLE flag in nmp 2230 * 2.2.4 update current pos 2231 * 3 if nmp is not equal to NULL, send enable request 2232 * 3.1 send capability request 2233 * 4 if need_sadb_dump is B_TRUE 2234 * 4.1 enable promiscuous on/off notifications 2235 * 4.2 call ill_dlpi_send(isub->dlcap) to send all 2236 * AH or ESP SA's to interface. 2237 */ 2238 2239 nmp = NULL; 2240 oalg = NULL; 2241 need_sadb_dump = B_FALSE; 2242 icip = (dl_capab_ipsec_t *)(isub + 1); 2243 ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); 2244 2245 nciphers = icip->cip_nciphers; 2246 ciphend = (uint8_t *)(ialg + icip->cip_nciphers); 2247 2248 if (ciphend > capend) { 2249 cmn_err(CE_WARN, "ill_capability_ipsec_ack: " 2250 "too many ciphers for sub-capability len"); 2251 return; 2252 } 2253 2254 for (cipher = 0; cipher < nciphers; cipher++) { 2255 alg_len = sizeof (dl_capab_ipsec_alg_t); 2256 2257 if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { 2258 /* 2259 * TBD: when we provide a way to disable capabilities 2260 * from above, need to manage the request-pending state 2261 * and fail if we were not expecting this ACK. 
2262 */ 2263 IPSECHW_DEBUG(IPSECHW_CAPAB, 2264 ("ill_capability_ipsec_ack: got ENABLE ACK\n")); 2265 2266 /* 2267 * Update IPsec capabilities for this ill 2268 */ 2269 2270 if (*ill_capab == NULL) { 2271 IPSECHW_DEBUG(IPSECHW_CAPAB, 2272 ("ill_capability_ipsec_ack: " 2273 "allocating ipsec_capab for ill\n")); 2274 *ill_capab = ill_ipsec_capab_alloc(); 2275 2276 if (*ill_capab == NULL) { 2277 cmn_err(CE_WARN, 2278 "ill_capability_ipsec_ack: " 2279 "could not enable IPsec Hardware " 2280 "acceleration for %s (ENOMEM)\n", 2281 ill->ill_name); 2282 return; 2283 } 2284 } 2285 2286 ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || 2287 ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); 2288 2289 if (ialg->alg_prim >= MAX_IPSEC_ALGS) { 2290 cmn_err(CE_WARN, 2291 "ill_capability_ipsec_ack: " 2292 "malformed IPsec algorithm id %d", 2293 ialg->alg_prim); 2294 continue; 2295 } 2296 2297 if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { 2298 IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, 2299 ialg->alg_prim); 2300 } else { 2301 ipsec_capab_algparm_t *alp; 2302 2303 IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, 2304 ialg->alg_prim); 2305 if (!ill_ipsec_capab_resize_algparm(*ill_capab, 2306 ialg->alg_prim)) { 2307 cmn_err(CE_WARN, 2308 "ill_capability_ipsec_ack: " 2309 "no space for IPsec alg id %d", 2310 ialg->alg_prim); 2311 continue; 2312 } 2313 alp = &((*ill_capab)->encr_algparm[ 2314 ialg->alg_prim]); 2315 alp->minkeylen = ialg->alg_minbits; 2316 alp->maxkeylen = ialg->alg_maxbits; 2317 } 2318 ill->ill_capabilities |= ill_capab_flag; 2319 /* 2320 * indicate that a capability was enabled, which 2321 * will be used below to kick off a SADB dump 2322 * to the ill. 
2323 */ 2324 need_sadb_dump = B_TRUE; 2325 } else { 2326 IPSECHW_DEBUG(IPSECHW_CAPAB, 2327 ("ill_capability_ipsec_ack: enabling alg 0x%x\n", 2328 ialg->alg_prim)); 2329 2330 if (nmp == NULL) { 2331 nmp = ill_alloc_ipsec_cap_req(ill, isub); 2332 if (nmp == NULL) { 2333 /* 2334 * Sending the PROMISC_ON/OFF 2335 * notification request failed. 2336 * We cannot enable the algorithms 2337 * since the Provider will not 2338 * notify IP of promiscous mode 2339 * changes, which could lead 2340 * to leakage of packets. 2341 */ 2342 cmn_err(CE_WARN, 2343 "ill_capability_ipsec_ack: " 2344 "could not enable IPsec Hardware " 2345 "acceleration for %s (ENOMEM)\n", 2346 ill->ill_name); 2347 return; 2348 } 2349 /* ptr to current output alg specifier */ 2350 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2351 } 2352 2353 /* 2354 * Copy current alg specifier, set ENABLE 2355 * flag, and advance to next output alg. 2356 * For now we enable all IPsec capabilities. 2357 */ 2358 ASSERT(oalg != NULL); 2359 bcopy(ialg, oalg, alg_len); 2360 oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; 2361 nmp->b_wptr += alg_len; 2362 oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; 2363 } 2364 2365 /* move to next input algorithm specifier */ 2366 ialg = (dl_capab_ipsec_alg_t *) 2367 ((char *)ialg + alg_len); 2368 } 2369 2370 if (nmp != NULL) 2371 /* 2372 * nmp points to a DL_CAPABILITY_REQ message to enable 2373 * IPsec hardware acceleration. 2374 */ 2375 ill_capability_send(ill, nmp); 2376 2377 if (need_sadb_dump) 2378 /* 2379 * An acknowledgement corresponding to a request to 2380 * enable acceleration was received, notify SADB. 2381 */ 2382 ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); 2383 } 2384 2385 /* 2386 * Given an mblk with enough space in it, create sub-capability entries for 2387 * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised 2388 * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, 2389 * in preparation for the reset the DL_CAPABILITY_REQ message. 
2390 */ 2391 static void 2392 ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, 2393 ill_ipsec_capab_t *ill_cap, mblk_t *mp) 2394 { 2395 dl_capab_ipsec_t *oipsec; 2396 dl_capab_ipsec_alg_t *oalg; 2397 dl_capability_sub_t *dl_subcap; 2398 int i, k; 2399 2400 ASSERT(nciphers > 0); 2401 ASSERT(ill_cap != NULL); 2402 ASSERT(mp != NULL); 2403 ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); 2404 2405 /* dl_capability_sub_t for "stype" */ 2406 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2407 dl_subcap->dl_cap = stype; 2408 dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; 2409 mp->b_wptr += sizeof (dl_capability_sub_t); 2410 2411 /* dl_capab_ipsec_t for "stype" */ 2412 oipsec = (dl_capab_ipsec_t *)mp->b_wptr; 2413 oipsec->cip_version = 1; 2414 oipsec->cip_nciphers = nciphers; 2415 mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; 2416 2417 /* create entries for "stype" AUTH ciphers */ 2418 for (i = 0; i < ill_cap->algs_size; i++) { 2419 for (k = 0; k < BITSPERBYTE; k++) { 2420 if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) 2421 continue; 2422 2423 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2424 bzero((void *)oalg, sizeof (*oalg)); 2425 oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; 2426 oalg->alg_prim = k + (BITSPERBYTE * i); 2427 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2428 } 2429 } 2430 /* create entries for "stype" ENCR ciphers */ 2431 for (i = 0; i < ill_cap->algs_size; i++) { 2432 for (k = 0; k < BITSPERBYTE; k++) { 2433 if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) 2434 continue; 2435 2436 oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; 2437 bzero((void *)oalg, sizeof (*oalg)); 2438 oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; 2439 oalg->alg_prim = k + (BITSPERBYTE * i); 2440 mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); 2441 } 2442 } 2443 } 2444 2445 /* 2446 * Macro to count number of 1s in a byte (8-bit word). The total count is 2447 * accumulated into the passed-in argument (sum). 
We could use SPARCv9's 2448 * POPC instruction, but our macro is more flexible for an arbitrary length 2449 * of bytes, such as {auth,encr}_hw_algs. These variables are currently 2450 * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length 2451 * stays that way, we can reduce the number of iterations required. 2452 */ 2453 #define COUNT_1S(val, sum) { \ 2454 uint8_t x = val & 0xff; \ 2455 x = (x & 0x55) + ((x >> 1) & 0x55); \ 2456 x = (x & 0x33) + ((x >> 2) & 0x33); \ 2457 sum += (x & 0xf) + ((x >> 4) & 0xf); \ 2458 } 2459 2460 /* ARGSUSED */ 2461 static int 2462 ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, 2463 int *esp_cntp, int *esp_lenp) 2464 { 2465 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2466 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2467 uint64_t ill_capabilities = ill->ill_capabilities; 2468 int ah_cnt = 0, esp_cnt = 0; 2469 int ah_len = 0, esp_len = 0; 2470 int i, size = 0; 2471 2472 if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) 2473 return (0); 2474 2475 ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); 2476 ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); 2477 2478 /* Find out the number of ciphers for AH */ 2479 if (cap_ah != NULL) { 2480 for (i = 0; i < cap_ah->algs_size; i++) { 2481 COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); 2482 COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); 2483 } 2484 if (ah_cnt > 0) { 2485 size += sizeof (dl_capability_sub_t) + 2486 sizeof (dl_capab_ipsec_t); 2487 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2488 ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2489 size += ah_len; 2490 } 2491 } 2492 2493 /* Find out the number of ciphers for ESP */ 2494 if (cap_esp != NULL) { 2495 for (i = 0; i < cap_esp->algs_size; i++) { 2496 COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); 2497 COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); 2498 } 2499 if (esp_cnt > 0) { 2500 size += sizeof (dl_capability_sub_t) + 2501 sizeof 
(dl_capab_ipsec_t); 2502 /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ 2503 esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); 2504 size += esp_len; 2505 } 2506 } 2507 2508 if (ah_cntp != NULL) 2509 *ah_cntp = ah_cnt; 2510 if (ah_lenp != NULL) 2511 *ah_lenp = ah_len; 2512 if (esp_cntp != NULL) 2513 *esp_cntp = esp_cnt; 2514 if (esp_lenp != NULL) 2515 *esp_lenp = esp_len; 2516 2517 return (size); 2518 } 2519 2520 /* ARGSUSED */ 2521 static void 2522 ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) 2523 { 2524 ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; 2525 ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; 2526 int ah_cnt = 0, esp_cnt = 0; 2527 int ah_len = 0, esp_len = 0; 2528 int size; 2529 2530 size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, 2531 &esp_cnt, &esp_len); 2532 if (size == 0) 2533 return; 2534 2535 /* 2536 * Clear the capability flags for IPsec HA but retain the ill 2537 * capability structures since it's possible that another thread 2538 * is still referring to them. The structures only get deallocated 2539 * when we destroy the ill. 2540 * 2541 * Various places check the flags to see if the ill is capable of 2542 * hardware acceleration, and by clearing them we ensure that new 2543 * outbound IPsec packets are sent down encrypted. 2544 */ 2545 2546 /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ 2547 if (ah_cnt > 0) { 2548 ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, 2549 cap_ah, mp); 2550 } 2551 2552 /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ 2553 if (esp_cnt > 0) { 2554 ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, 2555 cap_esp, mp); 2556 } 2557 2558 /* 2559 * At this point we've composed a bunch of sub-capabilities to be 2560 * encapsulated in a DL_CAPABILITY_REQ and later sent downstream 2561 * by the caller. 
Upon receiving this reset message, the driver 2562 * must stop inbound decryption (by destroying all inbound SAs) 2563 * and let the corresponding packets come in encrypted. 2564 */ 2565 } 2566 2567 static void 2568 ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, 2569 boolean_t encapsulated) 2570 { 2571 boolean_t legacy = B_FALSE; 2572 2573 /* 2574 * Note that only the following two sub-capabilities may be 2575 * considered as "legacy", since their original definitions 2576 * do not incorporate the dl_mid_t module ID token, and hence 2577 * may require the use of the wrapper sub-capability. 2578 */ 2579 switch (subp->dl_cap) { 2580 case DL_CAPAB_IPSEC_AH: 2581 case DL_CAPAB_IPSEC_ESP: 2582 legacy = B_TRUE; 2583 break; 2584 } 2585 2586 /* 2587 * For legacy sub-capabilities which don't incorporate a queue_t 2588 * pointer in their structures, discard them if we detect that 2589 * there are intermediate modules in between IP and the driver. 2590 */ 2591 if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { 2592 ip1dbg(("ill_capability_dispatch: unencapsulated capab type " 2593 "%d discarded; %d module(s) present below IP\n", 2594 subp->dl_cap, ill->ill_lmod_cnt)); 2595 return; 2596 } 2597 2598 switch (subp->dl_cap) { 2599 case DL_CAPAB_IPSEC_AH: 2600 case DL_CAPAB_IPSEC_ESP: 2601 ill_capability_ipsec_ack(ill, mp, subp); 2602 break; 2603 case DL_CAPAB_MDT: 2604 ill_capability_mdt_ack(ill, mp, subp); 2605 break; 2606 case DL_CAPAB_HCKSUM: 2607 ill_capability_hcksum_ack(ill, mp, subp); 2608 break; 2609 case DL_CAPAB_ZEROCOPY: 2610 ill_capability_zerocopy_ack(ill, mp, subp); 2611 break; 2612 case DL_CAPAB_DLD: 2613 ill_capability_dld_ack(ill, mp, subp); 2614 break; 2615 default: 2616 ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", 2617 subp->dl_cap)); 2618 } 2619 } 2620 2621 /* 2622 * Process a hardware checksum offload capability negotiation ack received 2623 * from a DLS Provider.isub must point to the sub-capability 
(DL_CAPAB_HCKSUM) 2624 * of a DL_CAPABILITY_ACK message. 2625 */ 2626 static void 2627 ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2628 { 2629 dl_capability_req_t *ocap; 2630 dl_capab_hcksum_t *ihck, *ohck; 2631 ill_hcksum_capab_t **ill_hcksum; 2632 mblk_t *nmp = NULL; 2633 uint_t sub_dl_cap = isub->dl_cap; 2634 uint8_t *capend; 2635 2636 ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM); 2637 2638 ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab; 2639 2640 /* 2641 * Note: range checks here are not absolutely sufficient to 2642 * make us robust against malformed messages sent by drivers; 2643 * this is in keeping with the rest of IP's dlpi handling. 2644 * (Remember, it's coming from something else in the kernel 2645 * address space) 2646 */ 2647 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2648 if (capend > mp->b_wptr) { 2649 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2650 "malformed sub-capability too long for mblk"); 2651 return; 2652 } 2653 2654 /* 2655 * There are two types of acks we process here: 2656 * 1. acks in reply to a (first form) generic capability req 2657 * (no ENABLE flag set) 2658 * 2. acks in reply to a ENABLE capability req. 
2659 * (ENABLE flag set) 2660 */ 2661 ihck = (dl_capab_hcksum_t *)(isub + 1); 2662 2663 if (ihck->hcksum_version != HCKSUM_VERSION_1) { 2664 cmn_err(CE_CONT, "ill_capability_hcksum_ack: " 2665 "unsupported hardware checksum " 2666 "sub-capability (version %d, expected %d)", 2667 ihck->hcksum_version, HCKSUM_VERSION_1); 2668 return; 2669 } 2670 2671 if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) { 2672 ip1dbg(("ill_capability_hcksum_ack: mid token for hardware " 2673 "checksum capability isn't as expected; pass-thru " 2674 "module(s) detected, discarding capability\n")); 2675 return; 2676 } 2677 2678 #define CURR_HCKSUM_CAPAB \ 2679 (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ 2680 HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) 2681 2682 if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && 2683 (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { 2684 /* do ENABLE processing */ 2685 if (*ill_hcksum == NULL) { 2686 *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t), 2687 KM_NOSLEEP); 2688 2689 if (*ill_hcksum == NULL) { 2690 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2691 "could not enable hcksum version %d " 2692 "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION, 2693 ill->ill_name); 2694 return; 2695 } 2696 } 2697 2698 (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version; 2699 (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags; 2700 ill->ill_capabilities |= ILL_CAPAB_HCKSUM; 2701 ip1dbg(("ill_capability_hcksum_ack: interface %s " 2702 "has enabled hardware checksumming\n ", 2703 ill->ill_name)); 2704 } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) { 2705 /* 2706 * Enabling hardware checksum offload 2707 * Currently IP supports {TCP,UDP}/IPv4 2708 * partial and full cksum offload and 2709 * IPv4 header checksum offload. 2710 * Allocate new mblk which will 2711 * contain a new capability request 2712 * to enable hardware checksum offload. 
2713 */ 2714 uint_t size; 2715 uchar_t *rptr; 2716 2717 size = sizeof (dl_capability_req_t) + 2718 sizeof (dl_capability_sub_t) + isub->dl_length; 2719 2720 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2721 cmn_err(CE_WARN, "ill_capability_hcksum_ack: " 2722 "could not enable hardware cksum for %s (ENOMEM)\n", 2723 ill->ill_name); 2724 return; 2725 } 2726 2727 rptr = nmp->b_rptr; 2728 /* initialize dl_capability_req_t */ 2729 ocap = (dl_capability_req_t *)nmp->b_rptr; 2730 ocap->dl_sub_offset = 2731 sizeof (dl_capability_req_t); 2732 ocap->dl_sub_length = 2733 sizeof (dl_capability_sub_t) + 2734 isub->dl_length; 2735 nmp->b_rptr += sizeof (dl_capability_req_t); 2736 2737 /* initialize dl_capability_sub_t */ 2738 bcopy(isub, nmp->b_rptr, sizeof (*isub)); 2739 nmp->b_rptr += sizeof (*isub); 2740 2741 /* initialize dl_capab_hcksum_t */ 2742 ohck = (dl_capab_hcksum_t *)nmp->b_rptr; 2743 bcopy(ihck, ohck, sizeof (*ihck)); 2744 2745 nmp->b_rptr = rptr; 2746 ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); 2747 2748 /* Set ENABLE flag */ 2749 ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB; 2750 ohck->hcksum_txflags |= HCKSUM_ENABLE; 2751 2752 /* 2753 * nmp points to a DL_CAPABILITY_REQ message to enable 2754 * hardware checksum acceleration. 
2755 */ 2756 ill_capability_send(ill, nmp); 2757 } else { 2758 ip1dbg(("ill_capability_hcksum_ack: interface %s has " 2759 "advertised %x hardware checksum capability flags\n", 2760 ill->ill_name, ihck->hcksum_txflags)); 2761 } 2762 } 2763 2764 static void 2765 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) 2766 { 2767 dl_capab_hcksum_t *hck_subcap; 2768 dl_capability_sub_t *dl_subcap; 2769 2770 if (!ILL_HCKSUM_CAPABLE(ill)) 2771 return; 2772 2773 ASSERT(ill->ill_hcksum_capab != NULL); 2774 2775 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2776 dl_subcap->dl_cap = DL_CAPAB_HCKSUM; 2777 dl_subcap->dl_length = sizeof (*hck_subcap); 2778 2779 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1); 2780 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; 2781 hck_subcap->hcksum_txflags = 0; 2782 2783 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); 2784 } 2785 2786 static void 2787 ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2788 { 2789 mblk_t *nmp = NULL; 2790 dl_capability_req_t *oc; 2791 dl_capab_zerocopy_t *zc_ic, *zc_oc; 2792 ill_zerocopy_capab_t **ill_zerocopy_capab; 2793 uint_t sub_dl_cap = isub->dl_cap; 2794 uint8_t *capend; 2795 2796 ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY); 2797 2798 ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab; 2799 2800 /* 2801 * Note: range checks here are not absolutely sufficient to 2802 * make us robust against malformed messages sent by drivers; 2803 * this is in keeping with the rest of IP's dlpi handling. 
2804 * (Remember, it's coming from something else in the kernel 2805 * address space) 2806 */ 2807 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2808 if (capend > mp->b_wptr) { 2809 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2810 "malformed sub-capability too long for mblk"); 2811 return; 2812 } 2813 2814 zc_ic = (dl_capab_zerocopy_t *)(isub + 1); 2815 if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) { 2816 cmn_err(CE_CONT, "ill_capability_zerocopy_ack: " 2817 "unsupported ZEROCOPY sub-capability (version %d, " 2818 "expected %d)", zc_ic->zerocopy_version, 2819 ZEROCOPY_VERSION_1); 2820 return; 2821 } 2822 2823 if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) { 2824 ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy " 2825 "capability isn't as expected; pass-thru module(s) " 2826 "detected, discarding capability\n")); 2827 return; 2828 } 2829 2830 if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) { 2831 if (*ill_zerocopy_capab == NULL) { 2832 *ill_zerocopy_capab = 2833 kmem_zalloc(sizeof (ill_zerocopy_capab_t), 2834 KM_NOSLEEP); 2835 2836 if (*ill_zerocopy_capab == NULL) { 2837 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2838 "could not enable Zero-copy version %d " 2839 "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1, 2840 ill->ill_name); 2841 return; 2842 } 2843 } 2844 2845 ip1dbg(("ill_capability_zerocopy_ack: interface %s " 2846 "supports Zero-copy version %d\n", ill->ill_name, 2847 ZEROCOPY_VERSION_1)); 2848 2849 (*ill_zerocopy_capab)->ill_zerocopy_version = 2850 zc_ic->zerocopy_version; 2851 (*ill_zerocopy_capab)->ill_zerocopy_flags = 2852 zc_ic->zerocopy_flags; 2853 2854 ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY; 2855 } else { 2856 uint_t size; 2857 uchar_t *rptr; 2858 2859 size = sizeof (dl_capability_req_t) + 2860 sizeof (dl_capability_sub_t) + 2861 sizeof (dl_capab_zerocopy_t); 2862 2863 if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { 2864 cmn_err(CE_WARN, "ill_capability_zerocopy_ack: " 2865 "could 
not enable zerocopy for %s (ENOMEM)\n", 2866 ill->ill_name); 2867 return; 2868 } 2869 2870 rptr = nmp->b_rptr; 2871 /* initialize dl_capability_req_t */ 2872 oc = (dl_capability_req_t *)rptr; 2873 oc->dl_sub_offset = sizeof (dl_capability_req_t); 2874 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 2875 sizeof (dl_capab_zerocopy_t); 2876 rptr += sizeof (dl_capability_req_t); 2877 2878 /* initialize dl_capability_sub_t */ 2879 bcopy(isub, rptr, sizeof (*isub)); 2880 rptr += sizeof (*isub); 2881 2882 /* initialize dl_capab_zerocopy_t */ 2883 zc_oc = (dl_capab_zerocopy_t *)rptr; 2884 *zc_oc = *zc_ic; 2885 2886 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 2887 "to enable zero-copy version %d\n", ill->ill_name, 2888 ZEROCOPY_VERSION_1)); 2889 2890 /* set VMSAFE_MEM flag */ 2891 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 2892 2893 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 2894 ill_capability_send(ill, nmp); 2895 } 2896 } 2897 2898 static void 2899 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 2900 { 2901 dl_capab_zerocopy_t *zerocopy_subcap; 2902 dl_capability_sub_t *dl_subcap; 2903 2904 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 2905 return; 2906 2907 ASSERT(ill->ill_zerocopy_capab != NULL); 2908 2909 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 2910 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 2911 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 2912 2913 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 2914 zerocopy_subcap->zerocopy_version = 2915 ill->ill_zerocopy_capab->ill_zerocopy_version; 2916 zerocopy_subcap->zerocopy_flags = 0; 2917 2918 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 2919 } 2920 2921 /* 2922 * DLD capability 2923 * Refer to dld.h for more information regarding the purpose and usage 2924 * of this capability. 
2925 */ 2926 static void 2927 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 2928 { 2929 dl_capab_dld_t *dld_ic, dld; 2930 uint_t sub_dl_cap = isub->dl_cap; 2931 uint8_t *capend; 2932 ill_dld_capab_t *idc; 2933 2934 ASSERT(IAM_WRITER_ILL(ill)); 2935 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 2936 2937 /* 2938 * Note: range checks here are not absolutely sufficient to 2939 * make us robust against malformed messages sent by drivers; 2940 * this is in keeping with the rest of IP's dlpi handling. 2941 * (Remember, it's coming from something else in the kernel 2942 * address space) 2943 */ 2944 capend = (uint8_t *)(isub + 1) + isub->dl_length; 2945 if (capend > mp->b_wptr) { 2946 cmn_err(CE_WARN, "ill_capability_dld_ack: " 2947 "malformed sub-capability too long for mblk"); 2948 return; 2949 } 2950 dld_ic = (dl_capab_dld_t *)(isub + 1); 2951 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 2952 cmn_err(CE_CONT, "ill_capability_dld_ack: " 2953 "unsupported DLD sub-capability (version %d, " 2954 "expected %d)", dld_ic->dld_version, 2955 DLD_CURRENT_VERSION); 2956 return; 2957 } 2958 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 2959 ip1dbg(("ill_capability_dld_ack: mid token for dld " 2960 "capability isn't as expected; pass-thru module(s) " 2961 "detected, discarding capability\n")); 2962 return; 2963 } 2964 2965 /* 2966 * Copy locally to ensure alignment. 
2967 */ 2968 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); 2969 2970 if ((idc = ill->ill_dld_capab) == NULL) { 2971 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); 2972 if (idc == NULL) { 2973 cmn_err(CE_WARN, "ill_capability_dld_ack: " 2974 "could not enable DLD version %d " 2975 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, 2976 ill->ill_name); 2977 return; 2978 } 2979 ill->ill_dld_capab = idc; 2980 } 2981 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; 2982 idc->idc_capab_dh = (void *)dld.dld_capab_handle; 2983 ip1dbg(("ill_capability_dld_ack: interface %s " 2984 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); 2985 2986 ill_capability_dld_enable(ill); 2987 } 2988 2989 /* 2990 * Typically capability negotiation between IP and the driver happens via 2991 * DLPI message exchange. However GLD also offers a direct function call 2992 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, 2993 * But arbitrary function calls into IP or GLD are not permitted, since both 2994 * of them are protected by their own perimeter mechanism. The perimeter can 2995 * be viewed as a coarse lock or serialization mechanism. The hierarchy of 2996 * these perimeters is IP -> MAC. Thus for example to enable the squeue 2997 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter 2998 * to enter the mac perimeter and then do the direct function calls into 2999 * GLD to enable squeue polling. The ring related callbacks from the mac into 3000 * the stack to add, bind, quiesce, restart or cleanup a ring are all 3001 * protected by the mac perimeter. 
3002 */ 3003 static void 3004 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) 3005 { 3006 ill_dld_capab_t *idc = ill->ill_dld_capab; 3007 int err; 3008 3009 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, 3010 DLD_ENABLE); 3011 ASSERT(err == 0); 3012 } 3013 3014 static void 3015 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) 3016 { 3017 ill_dld_capab_t *idc = ill->ill_dld_capab; 3018 int err; 3019 3020 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, 3021 DLD_DISABLE); 3022 ASSERT(err == 0); 3023 } 3024 3025 boolean_t 3026 ill_mac_perim_held(ill_t *ill) 3027 { 3028 ill_dld_capab_t *idc = ill->ill_dld_capab; 3029 3030 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, 3031 DLD_QUERY)); 3032 } 3033 3034 static void 3035 ill_capability_direct_enable(ill_t *ill) 3036 { 3037 ill_dld_capab_t *idc = ill->ill_dld_capab; 3038 ill_dld_direct_t *idd = &idc->idc_direct; 3039 dld_capab_direct_t direct; 3040 int rc; 3041 3042 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3043 3044 bzero(&direct, sizeof (direct)); 3045 direct.di_rx_cf = (uintptr_t)ip_input; 3046 direct.di_rx_ch = ill; 3047 3048 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, 3049 DLD_ENABLE); 3050 if (rc == 0) { 3051 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; 3052 idd->idd_tx_dh = direct.di_tx_dh; 3053 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; 3054 idd->idd_tx_cb_dh = direct.di_tx_cb_dh; 3055 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; 3056 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; 3057 ASSERT(idd->idd_tx_cb_df != NULL); 3058 ASSERT(idd->idd_tx_fctl_df != NULL); 3059 ASSERT(idd->idd_tx_df != NULL); 3060 /* 3061 * One time registration of flow enable callback function 3062 */ 3063 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, 3064 ill_flow_enable, ill); 3065 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; 3066 DTRACE_PROBE1(direct_on, (ill_t *), ill); 3067 } else { 3068 
cmn_err(CE_WARN, "warning: could not enable DIRECT " 3069 "capability, rc = %d\n", rc); 3070 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 3071 } 3072 } 3073 3074 static void 3075 ill_capability_poll_enable(ill_t *ill) 3076 { 3077 ill_dld_capab_t *idc = ill->ill_dld_capab; 3078 dld_capab_poll_t poll; 3079 int rc; 3080 3081 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3082 3083 bzero(&poll, sizeof (poll)); 3084 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 3085 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 3086 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 3087 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 3088 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 3089 poll.poll_ring_ch = ill; 3090 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 3091 DLD_ENABLE); 3092 if (rc == 0) { 3093 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 3094 DTRACE_PROBE1(poll_on, (ill_t *), ill); 3095 } else { 3096 ip1dbg(("warning: could not enable POLL " 3097 "capability, rc = %d\n", rc)); 3098 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 3099 } 3100 } 3101 3102 /* 3103 * Enable the LSO capability. 
3104 */ 3105 static void 3106 ill_capability_lso_enable(ill_t *ill) 3107 { 3108 ill_dld_capab_t *idc = ill->ill_dld_capab; 3109 dld_capab_lso_t lso; 3110 int rc; 3111 3112 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 3113 3114 if (ill->ill_lso_capab == NULL) { 3115 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 3116 KM_NOSLEEP); 3117 if (ill->ill_lso_capab == NULL) { 3118 cmn_err(CE_WARN, "ill_capability_lso_enable: " 3119 "could not enable LSO for %s (ENOMEM)\n", 3120 ill->ill_name); 3121 return; 3122 } 3123 } 3124 3125 bzero(&lso, sizeof (lso)); 3126 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 3127 DLD_ENABLE)) == 0) { 3128 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 3129 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 3130 ill->ill_capabilities |= ILL_CAPAB_DLD_LSO; 3131 ip1dbg(("ill_capability_lso_enable: interface %s " 3132 "has enabled LSO\n ", ill->ill_name)); 3133 } else { 3134 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 3135 ill->ill_lso_capab = NULL; 3136 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 3137 } 3138 } 3139 3140 static void 3141 ill_capability_dld_enable(ill_t *ill) 3142 { 3143 mac_perim_handle_t mph; 3144 3145 ASSERT(IAM_WRITER_ILL(ill)); 3146 3147 if (ill->ill_isv6) 3148 return; 3149 3150 ill_mac_perim_enter(ill, &mph); 3151 if (!ill->ill_isv6) { 3152 ill_capability_direct_enable(ill); 3153 ill_capability_poll_enable(ill); 3154 ill_capability_lso_enable(ill); 3155 } 3156 ill->ill_capabilities |= ILL_CAPAB_DLD; 3157 ill_mac_perim_exit(ill, mph); 3158 } 3159 3160 static void 3161 ill_capability_dld_disable(ill_t *ill) 3162 { 3163 ill_dld_capab_t *idc; 3164 ill_dld_direct_t *idd; 3165 mac_perim_handle_t mph; 3166 3167 ASSERT(IAM_WRITER_ILL(ill)); 3168 3169 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 3170 return; 3171 3172 ill_mac_perim_enter(ill, &mph); 3173 3174 idc = ill->ill_dld_capab; 3175 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 3176 /* 3177 * For performance 
we avoid locks in the transmit data path 3178 * and don't maintain a count of the number of threads using 3179 * direct calls. Thus some threads could be using direct 3180 * transmit calls to GLD, even after the capability mechanism 3181 * turns it off. This is still safe since the handles used in 3182 * the direct calls continue to be valid until the unplumb is 3183 * completed. Remove the callback that was added (1-time) at 3184 * capab enable time. 3185 */ 3186 mutex_enter(&ill->ill_lock); 3187 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; 3188 mutex_exit(&ill->ill_lock); 3189 if (ill->ill_flownotify_mh != NULL) { 3190 idd = &idc->idc_direct; 3191 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, 3192 ill->ill_flownotify_mh); 3193 ill->ill_flownotify_mh = NULL; 3194 } 3195 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, 3196 NULL, DLD_DISABLE); 3197 } 3198 3199 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { 3200 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; 3201 ip_squeue_clean_all(ill); 3202 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, 3203 NULL, DLD_DISABLE); 3204 } 3205 3206 if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) { 3207 ASSERT(ill->ill_lso_capab != NULL); 3208 /* 3209 * Clear the capability flag for LSO but retain the 3210 * ill_lso_capab structure since it's possible that another 3211 * thread is still referring to it. The structure only gets 3212 * deallocated when we destroy the ill. 3213 */ 3214 3215 ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO; 3216 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, 3217 NULL, DLD_DISABLE); 3218 } 3219 3220 ill->ill_capabilities &= ~ILL_CAPAB_DLD; 3221 ill_mac_perim_exit(ill, mph); 3222 } 3223 3224 /* 3225 * Capability Negotiation protocol 3226 * 3227 * We don't wait for DLPI capability operations to finish during interface 3228 * bringup or teardown. 
Doing so would introduce more asynchrony and the
 * interface up/down operations will need multiple returns and restarts.
 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
 * exclusive operation won't start until the DLPI operations of the previous
 * exclusive operation complete.
 *
 * The capability state machine is shown below.
 *
 * state		next state		event, action
 *
 * IDCS_UNKNOWN		IDCS_PROBE_SENT		ill_capability_probe
 * IDCS_PROBE_SENT	IDCS_OK			ill_capability_ack
 * IDCS_PROBE_SENT	IDCS_FAILED		ip_rput_dlpi_writer (nack)
 * IDCS_OK		IDCS_RENEG		Receipt of DL_NOTE_CAPAB_RENEG
 * IDCS_OK		IDCS_RESET_SENT		ill_capability_reset
 * IDCS_RESET_SENT	IDCS_UNKNOWN		ill_capability_ack_thr
 * IDCS_RENEG		IDCS_PROBE_SENT		ill_capability_ack_thr ->
 *						    ill_capability_probe.
 */

/*
 * Dedicated thread started from ip_stack_init that handles capability
 * disable. This thread ensures the taskq dispatch does not fail by waiting
 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
 * that direct calls to DLD are done in a cv_waitable context.
3254 */ 3255 void 3256 ill_taskq_dispatch(ip_stack_t *ipst) 3257 { 3258 callb_cpr_t cprinfo; 3259 char name[64]; 3260 mblk_t *mp; 3261 3262 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", 3263 ipst->ips_netstack->netstack_stackid); 3264 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, 3265 name); 3266 mutex_enter(&ipst->ips_capab_taskq_lock); 3267 3268 for (;;) { 3269 mp = list_head(&ipst->ips_capab_taskq_list); 3270 while (mp != NULL) { 3271 list_remove(&ipst->ips_capab_taskq_list, mp); 3272 mutex_exit(&ipst->ips_capab_taskq_lock); 3273 VERIFY(taskq_dispatch(system_taskq, 3274 ill_capability_ack_thr, mp, TQ_SLEEP) != 0); 3275 mutex_enter(&ipst->ips_capab_taskq_lock); 3276 mp = list_head(&ipst->ips_capab_taskq_list); 3277 } 3278 3279 if (ipst->ips_capab_taskq_quit) 3280 break; 3281 CALLB_CPR_SAFE_BEGIN(&cprinfo); 3282 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); 3283 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); 3284 } 3285 VERIFY(list_head(&ipst->ips_capab_taskq_list) == NULL); 3286 CALLB_CPR_EXIT(&cprinfo); 3287 thread_exit(); 3288 } 3289 3290 /* 3291 * Consume a new-style hardware capabilities negotiation ack. 3292 * Called via taskq on receipt of DL_CAPABBILITY_ACK. 3293 */ 3294 static void 3295 ill_capability_ack_thr(void *arg) 3296 { 3297 mblk_t *mp = arg; 3298 dl_capability_ack_t *capp; 3299 dl_capability_sub_t *subp, *endp; 3300 ill_t *ill; 3301 boolean_t reneg; 3302 3303 ill = (ill_t *)mp->b_prev; 3304 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); 3305 3306 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || 3307 ill->ill_dlpi_capab_state == IDCS_RENEG) { 3308 /* 3309 * We have received the ack for our DL_CAPAB reset request. 3310 * There isnt' anything in the message that needs processing. 3311 * All message based capabilities have been disabled, now 3312 * do the function call based capability disable. 
3313 */ 3314 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; 3315 ill_capability_dld_disable(ill); 3316 ill->ill_dlpi_capab_state = IDCS_UNKNOWN; 3317 if (reneg) 3318 ill_capability_probe(ill); 3319 goto done; 3320 } 3321 3322 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) 3323 ill->ill_dlpi_capab_state = IDCS_OK; 3324 3325 capp = (dl_capability_ack_t *)mp->b_rptr; 3326 3327 if (capp->dl_sub_length == 0) { 3328 /* no new-style capabilities */ 3329 goto done; 3330 } 3331 3332 /* make sure the driver supplied correct dl_sub_length */ 3333 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { 3334 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " 3335 "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); 3336 goto done; 3337 } 3338 3339 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) 3340 /* 3341 * There are sub-capabilities. Process the ones we know about. 3342 * Loop until we don't have room for another sub-cap header.. 3343 */ 3344 for (subp = SC(capp, capp->dl_sub_offset), 3345 endp = SC(subp, capp->dl_sub_length - sizeof (*subp)); 3346 subp <= endp; 3347 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) { 3348 3349 switch (subp->dl_cap) { 3350 case DL_CAPAB_ID_WRAPPER: 3351 ill_capability_id_ack(ill, mp, subp); 3352 break; 3353 default: 3354 ill_capability_dispatch(ill, mp, subp, B_FALSE); 3355 break; 3356 } 3357 } 3358 #undef SC 3359 done: 3360 inet_freemsg(mp); 3361 ill_capability_done(ill); 3362 ipsq_exit(ill->ill_phyint->phyint_ipsq); 3363 } 3364 3365 /* 3366 * This needs to be started in a taskq thread to provide a cv_waitable 3367 * context. 3368 */ 3369 void 3370 ill_capability_ack(ill_t *ill, mblk_t *mp) 3371 { 3372 ip_stack_t *ipst = ill->ill_ipst; 3373 3374 mp->b_prev = (mblk_t *)ill; 3375 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, 3376 TQ_NOSLEEP) != 0) 3377 return; 3378 3379 /* 3380 * The taskq dispatch failed. 
Signal the ill_taskq_dispatch thread 3381 * which will do the dispatch using TQ_SLEEP to guarantee success. 3382 */ 3383 mutex_enter(&ipst->ips_capab_taskq_lock); 3384 list_insert_tail(&ipst->ips_capab_taskq_list, mp); 3385 cv_signal(&ipst->ips_capab_taskq_cv); 3386 mutex_exit(&ipst->ips_capab_taskq_lock); 3387 } 3388 3389 /* 3390 * This routine is called to scan the fragmentation reassembly table for 3391 * the specified ILL for any packets that are starting to smell. 3392 * dead_interval is the maximum time in seconds that will be tolerated. It 3393 * will either be the value specified in ip_g_frag_timeout, or zero if the 3394 * ILL is shutting down and it is time to blow everything off. 3395 * 3396 * It returns the number of seconds (as a time_t) that the next frag timer 3397 * should be scheduled for, 0 meaning that the timer doesn't need to be 3398 * re-started. Note that the method of calculating next_timeout isn't 3399 * entirely accurate since time will flow between the time we grab 3400 * current_time and the time we schedule the next timeout. This isn't a 3401 * big problem since this is the timer for sending an ICMP reassembly time 3402 * exceeded messages, and it doesn't have to be exactly accurate. 3403 * 3404 * This function is 3405 * sometimes called as writer, although this is not required. 3406 */ 3407 time_t 3408 ill_frag_timeout(ill_t *ill, time_t dead_interval) 3409 { 3410 ipfb_t *ipfb; 3411 ipfb_t *endp; 3412 ipf_t *ipf; 3413 ipf_t *ipfnext; 3414 mblk_t *mp; 3415 time_t current_time = gethrestime_sec(); 3416 time_t next_timeout = 0; 3417 uint32_t hdr_length; 3418 mblk_t *send_icmp_head; 3419 mblk_t *send_icmp_head_v6; 3420 zoneid_t zoneid; 3421 ip_stack_t *ipst = ill->ill_ipst; 3422 3423 ipfb = ill->ill_frag_hash_tbl; 3424 if (ipfb == NULL) 3425 return (B_FALSE); 3426 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT]; 3427 /* Walk the frag hash table. 
*/ 3428 for (; ipfb < endp; ipfb++) { 3429 send_icmp_head = NULL; 3430 send_icmp_head_v6 = NULL; 3431 mutex_enter(&ipfb->ipfb_lock); 3432 while ((ipf = ipfb->ipfb_ipf) != 0) { 3433 time_t frag_time = current_time - ipf->ipf_timestamp; 3434 time_t frag_timeout; 3435 3436 if (frag_time < dead_interval) { 3437 /* 3438 * There are some outstanding fragments 3439 * that will timeout later. Make note of 3440 * the time so that we can reschedule the 3441 * next timeout appropriately. 3442 */ 3443 frag_timeout = dead_interval - frag_time; 3444 if (next_timeout == 0 || 3445 frag_timeout < next_timeout) { 3446 next_timeout = frag_timeout; 3447 } 3448 break; 3449 } 3450 /* Time's up. Get it out of here. */ 3451 hdr_length = ipf->ipf_nf_hdr_len; 3452 ipfnext = ipf->ipf_hash_next; 3453 if (ipfnext) 3454 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 3455 *ipf->ipf_ptphn = ipfnext; 3456 mp = ipf->ipf_mp->b_cont; 3457 for (; mp; mp = mp->b_cont) { 3458 /* Extra points for neatness. */ 3459 IP_REASS_SET_START(mp, 0); 3460 IP_REASS_SET_END(mp, 0); 3461 } 3462 mp = ipf->ipf_mp->b_cont; 3463 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 3464 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 3465 ipfb->ipfb_count -= ipf->ipf_count; 3466 ASSERT(ipfb->ipfb_frag_pkts > 0); 3467 ipfb->ipfb_frag_pkts--; 3468 /* 3469 * We do not send any icmp message from here because 3470 * we currently are holding the ipfb_lock for this 3471 * hash chain. If we try and send any icmp messages 3472 * from here we may end up via a put back into ip 3473 * trying to get the same lock, causing a recursive 3474 * mutex panic. Instead we build a list and send all 3475 * the icmp messages after we have dropped the lock. 
3476 */ 3477 if (ill->ill_isv6) { 3478 if (hdr_length != 0) { 3479 mp->b_next = send_icmp_head_v6; 3480 send_icmp_head_v6 = mp; 3481 } else { 3482 freemsg(mp); 3483 } 3484 } else { 3485 if (hdr_length != 0) { 3486 mp->b_next = send_icmp_head; 3487 send_icmp_head = mp; 3488 } else { 3489 freemsg(mp); 3490 } 3491 } 3492 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3493 freeb(ipf->ipf_mp); 3494 } 3495 mutex_exit(&ipfb->ipfb_lock); 3496 /* 3497 * Now need to send any icmp messages that we delayed from 3498 * above. 3499 */ 3500 while (send_icmp_head_v6 != NULL) { 3501 ip6_t *ip6h; 3502 3503 mp = send_icmp_head_v6; 3504 send_icmp_head_v6 = send_icmp_head_v6->b_next; 3505 mp->b_next = NULL; 3506 if (mp->b_datap->db_type == M_CTL) 3507 ip6h = (ip6_t *)mp->b_cont->b_rptr; 3508 else 3509 ip6h = (ip6_t *)mp->b_rptr; 3510 zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 3511 ill, ipst); 3512 if (zoneid == ALL_ZONES) { 3513 freemsg(mp); 3514 } else { 3515 icmp_time_exceeded_v6(ill->ill_wq, mp, 3516 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 3517 B_FALSE, zoneid, ipst); 3518 } 3519 } 3520 while (send_icmp_head != NULL) { 3521 ipaddr_t dst; 3522 3523 mp = send_icmp_head; 3524 send_icmp_head = send_icmp_head->b_next; 3525 mp->b_next = NULL; 3526 3527 if (mp->b_datap->db_type == M_CTL) 3528 dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; 3529 else 3530 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 3531 3532 zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); 3533 if (zoneid == ALL_ZONES) { 3534 freemsg(mp); 3535 } else { 3536 icmp_time_exceeded(ill->ill_wq, mp, 3537 ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, 3538 ipst); 3539 } 3540 } 3541 } 3542 /* 3543 * A non-dying ILL will use the return value to decide whether to 3544 * restart the frag timer, and for how long. 3545 */ 3546 return (next_timeout); 3547 } 3548 3549 /* 3550 * This routine is called when the approximate count of mblk memory used 3551 * for the specified ILL has exceeded max_count. 
3552 */ 3553 void 3554 ill_frag_prune(ill_t *ill, uint_t max_count) 3555 { 3556 ipfb_t *ipfb; 3557 ipf_t *ipf; 3558 size_t count; 3559 3560 /* 3561 * If we are here within ip_min_frag_prune_time msecs remove 3562 * ill_frag_free_num_pkts oldest packets from each bucket and increment 3563 * ill_frag_free_num_pkts. 3564 */ 3565 mutex_enter(&ill->ill_lock); 3566 if (TICK_TO_MSEC(lbolt - ill->ill_last_frag_clean_time) <= 3567 (ip_min_frag_prune_time != 0 ? 3568 ip_min_frag_prune_time : msec_per_tick)) { 3569 3570 ill->ill_frag_free_num_pkts++; 3571 3572 } else { 3573 ill->ill_frag_free_num_pkts = 0; 3574 } 3575 ill->ill_last_frag_clean_time = lbolt; 3576 mutex_exit(&ill->ill_lock); 3577 3578 /* 3579 * free ill_frag_free_num_pkts oldest packets from each bucket. 3580 */ 3581 if (ill->ill_frag_free_num_pkts != 0) { 3582 int ix; 3583 3584 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3585 ipfb = &ill->ill_frag_hash_tbl[ix]; 3586 mutex_enter(&ipfb->ipfb_lock); 3587 if (ipfb->ipfb_ipf != NULL) { 3588 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 3589 ill->ill_frag_free_num_pkts); 3590 } 3591 mutex_exit(&ipfb->ipfb_lock); 3592 } 3593 } 3594 /* 3595 * While the reassembly list for this ILL is too big, prune a fragment 3596 * queue by age, oldest first. 
3597 */ 3598 while (ill->ill_frag_count > max_count) { 3599 int ix; 3600 ipfb_t *oipfb = NULL; 3601 uint_t oldest = UINT_MAX; 3602 3603 count = 0; 3604 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) { 3605 ipfb = &ill->ill_frag_hash_tbl[ix]; 3606 mutex_enter(&ipfb->ipfb_lock); 3607 ipf = ipfb->ipfb_ipf; 3608 if (ipf != NULL && ipf->ipf_gen < oldest) { 3609 oldest = ipf->ipf_gen; 3610 oipfb = ipfb; 3611 } 3612 count += ipfb->ipfb_count; 3613 mutex_exit(&ipfb->ipfb_lock); 3614 } 3615 if (oipfb == NULL) 3616 break; 3617 3618 if (count <= max_count) 3619 return; /* Somebody beat us to it, nothing to do */ 3620 mutex_enter(&oipfb->ipfb_lock); 3621 ipf = oipfb->ipfb_ipf; 3622 if (ipf != NULL) { 3623 ill_frag_free_pkts(ill, oipfb, ipf, 1); 3624 } 3625 mutex_exit(&oipfb->ipfb_lock); 3626 } 3627 } 3628 3629 /* 3630 * free 'free_cnt' fragmented packets starting at ipf. 3631 */ 3632 void 3633 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 3634 { 3635 size_t count; 3636 mblk_t *mp; 3637 mblk_t *tmp; 3638 ipf_t **ipfp = ipf->ipf_ptphn; 3639 3640 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 3641 ASSERT(ipfp != NULL); 3642 ASSERT(ipf != NULL); 3643 3644 while (ipf != NULL && free_cnt-- > 0) { 3645 count = ipf->ipf_count; 3646 mp = ipf->ipf_mp; 3647 ipf = ipf->ipf_hash_next; 3648 for (tmp = mp; tmp; tmp = tmp->b_cont) { 3649 IP_REASS_SET_START(tmp, 0); 3650 IP_REASS_SET_END(tmp, 0); 3651 } 3652 atomic_add_32(&ill->ill_frag_count, -count); 3653 ASSERT(ipfb->ipfb_count >= count); 3654 ipfb->ipfb_count -= count; 3655 ASSERT(ipfb->ipfb_frag_pkts > 0); 3656 ipfb->ipfb_frag_pkts--; 3657 freemsg(mp); 3658 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 3659 } 3660 3661 if (ipf) 3662 ipf->ipf_ptphn = ipfp; 3663 ipfp[0] = ipf; 3664 } 3665 3666 #define ND_FORWARD_WARNING "The <if>:ip*_forwarding ndd variables are " \ 3667 "obsolete and may be removed in a future release of Solaris. Use " \ 3668 "ifconfig(1M) to manipulate the forwarding status of an interface." 
3669 3670 /* 3671 * For obsolete per-interface forwarding configuration; 3672 * called in response to ND_GET. 3673 */ 3674 /* ARGSUSED */ 3675 static int 3676 nd_ill_forward_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *ioc_cr) 3677 { 3678 ill_t *ill = (ill_t *)cp; 3679 3680 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3681 3682 (void) mi_mpprintf(mp, "%d", (ill->ill_flags & ILLF_ROUTER) != 0); 3683 return (0); 3684 } 3685 3686 /* 3687 * For obsolete per-interface forwarding configuration; 3688 * called in response to ND_SET. 3689 */ 3690 /* ARGSUSED */ 3691 static int 3692 nd_ill_forward_set(queue_t *q, mblk_t *mp, char *valuestr, caddr_t cp, 3693 cred_t *ioc_cr) 3694 { 3695 long value; 3696 int retval; 3697 ip_stack_t *ipst = CONNQ_TO_IPST(q); 3698 3699 cmn_err(CE_WARN, ND_FORWARD_WARNING); 3700 3701 if (ddi_strtol(valuestr, NULL, 10, &value) != 0 || 3702 value < 0 || value > 1) { 3703 return (EINVAL); 3704 } 3705 3706 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3707 retval = ill_forward_set((ill_t *)cp, (value != 0)); 3708 rw_exit(&ipst->ips_ill_g_lock); 3709 return (retval); 3710 } 3711 3712 /* 3713 * Helper function for ill_forward_set(). 3714 */ 3715 static void 3716 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 3717 { 3718 ip_stack_t *ipst = ill->ill_ipst; 3719 3720 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3721 3722 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 3723 (enable ? "Enabling" : "Disabling"), 3724 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 3725 mutex_enter(&ill->ill_lock); 3726 if (enable) 3727 ill->ill_flags |= ILLF_ROUTER; 3728 else 3729 ill->ill_flags &= ~ILLF_ROUTER; 3730 mutex_exit(&ill->ill_lock); 3731 if (ill->ill_isv6) 3732 ill_set_nce_router_flags(ill, enable); 3733 /* Notify routing socket listeners of this change. */ 3734 if (ill->ill_ipif != NULL) 3735 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 3736 } 3737 3738 /* 3739 * Set an ill's ILLF_ROUTER flag appropriately. 
Send up RTS_IFINFO routing 3740 * socket messages for each interface whose flags we change. 3741 */ 3742 int 3743 ill_forward_set(ill_t *ill, boolean_t enable) 3744 { 3745 ipmp_illgrp_t *illg; 3746 ip_stack_t *ipst = ill->ill_ipst; 3747 3748 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 3749 3750 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 3751 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 3752 return (0); 3753 3754 if (IS_LOOPBACK(ill)) 3755 return (EINVAL); 3756 3757 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 3758 /* 3759 * Update all of the interfaces in the group. 3760 */ 3761 illg = ill->ill_grp; 3762 ill = list_head(&illg->ig_if); 3763 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 3764 ill_forward_set_on_ill(ill, enable); 3765 3766 /* 3767 * Update the IPMP meta-interface. 3768 */ 3769 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 3770 return (0); 3771 } 3772 3773 ill_forward_set_on_ill(ill, enable); 3774 return (0); 3775 } 3776 3777 /* 3778 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 3779 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 3780 * set or clear. 3781 */ 3782 static void 3783 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 3784 { 3785 ipif_t *ipif; 3786 nce_t *nce; 3787 3788 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 3789 /* 3790 * NOTE: we match across the illgrp because nce's for 3791 * addresses on IPMP interfaces have an nce_ill that points to 3792 * the bound underlying ill. 3793 */ 3794 nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr, 3795 B_FALSE); 3796 if (nce != NULL) { 3797 mutex_enter(&nce->nce_lock); 3798 if (enable) 3799 nce->nce_flags |= NCE_F_ISROUTER; 3800 else 3801 nce->nce_flags &= ~NCE_F_ISROUTER; 3802 mutex_exit(&nce->nce_lock); 3803 NCE_REFRELE(nce); 3804 } 3805 } 3806 } 3807 3808 /* 3809 * Given an ill with a _valid_ name, add the ip_forwarding ndd variable 3810 * for this ill. 
Make sure the v6/v4 question has been answered about this
 * ill. The creation of this ndd variable is only for backwards compatibility.
 * The preferred way to control per-interface IP forwarding is through the
 * ILLF_ROUTER interface flag.
 *
 * Returns 0 on success, or ENOMEM (with ill_ndd_name reset to NULL) if
 * nd_load() fails.
 */
static int
ill_set_ndd_name(ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;
	char		*suffix;
	char		*ndd_name;

	ASSERT(IAM_WRITER_ILL(ill));

	suffix = ill->ill_isv6 ? ipv6_forward_suffix : ipv4_forward_suffix;

	/*
	 * The ndd name is built right behind the ill name (including its
	 * terminating NUL) in the same buffer: the ill name without its
	 * NUL, followed by the suffix.
	 */
	ndd_name = ill->ill_name + ill->ill_name_length;
	ill->ill_ndd_name = ndd_name;
	bcopy(ill->ill_name, ndd_name, ill->ill_name_length - 1);
	/*
	 * Copies over the '\0'.
	 * Note that strlen(suffix) is always bounded.
	 */
	bcopy(suffix, ndd_name + ill->ill_name_length - 1,
	    strlen(suffix) + 1);

	/*
	 * Use of the nd table requires holding the reader lock.
	 * Modifying the nd table thru nd_load/nd_unload requires
	 * the writer lock.
	 */
	rw_enter(&ipst->ips_ip_g_nd_lock, RW_WRITER);
	if (!nd_load(&ipst->ips_ip_g_nd, ill->ill_ndd_name, nd_ill_forward_get,
	    nd_ill_forward_set, (caddr_t)ill)) {
		/*
		 * If the nd_load failed, it only meant that it could not
		 * allocate a new bunch of room for further NDD expansion.
		 * Because of that, the ill_ndd_name will be set to 0, and
		 * this interface is at the mercy of the global ip_forwarding
		 * variable.
		 */
		rw_exit(&ipst->ips_ip_g_nd_lock);
		ill->ill_ndd_name = NULL;
		return (ENOMEM);
	}
	rw_exit(&ipst->ips_ip_g_nd_lock);
	return (0);
}

/*
 * Initializes the context structure and returns the first ill in the list.
 * Currently start_list and end_list can have values:
 * MAX_G_HEADS		Traverse both IPV4 and IPV6 lists.
 * IP_V4_G_HEAD		Traverse IPV4 list only.
 * IP_V6_G_HEAD		Traverse IPV6 list only.
3866 */ 3867 3868 /* 3869 * We don't check for CONDEMNED ills here. Caller must do that if 3870 * necessary under the ill lock. 3871 */ 3872 ill_t * 3873 ill_first(int start_list, int end_list, ill_walk_context_t *ctx, 3874 ip_stack_t *ipst) 3875 { 3876 ill_if_t *ifp; 3877 ill_t *ill; 3878 avl_tree_t *avl_tree; 3879 3880 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 3881 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0); 3882 3883 /* 3884 * setup the lists to search 3885 */ 3886 if (end_list != MAX_G_HEADS) { 3887 ctx->ctx_current_list = start_list; 3888 ctx->ctx_last_list = end_list; 3889 } else { 3890 ctx->ctx_last_list = MAX_G_HEADS - 1; 3891 ctx->ctx_current_list = 0; 3892 } 3893 3894 while (ctx->ctx_current_list <= ctx->ctx_last_list) { 3895 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 3896 if (ifp != (ill_if_t *) 3897 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 3898 avl_tree = &ifp->illif_avl_by_ppa; 3899 ill = avl_first(avl_tree); 3900 /* 3901 * ill is guaranteed to be non NULL or ifp should have 3902 * not existed. 3903 */ 3904 ASSERT(ill != NULL); 3905 return (ill); 3906 } 3907 ctx->ctx_current_list++; 3908 } 3909 3910 return (NULL); 3911 } 3912 3913 /* 3914 * returns the next ill in the list. ill_first() must have been called 3915 * before calling ill_next() or bad things will happen. 3916 */ 3917 3918 /* 3919 * We don't check for CONDEMNED ills here. Caller must do that if 3920 * necessary under the ill lock. 3921 */ 3922 ill_t * 3923 ill_next(ill_walk_context_t *ctx, ill_t *lastill) 3924 { 3925 ill_if_t *ifp; 3926 ill_t *ill; 3927 ip_stack_t *ipst = lastill->ill_ipst; 3928 3929 ASSERT(lastill->ill_ifptr != (ill_if_t *) 3930 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)); 3931 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill, 3932 AVL_AFTER)) != NULL) { 3933 return (ill); 3934 } 3935 3936 /* goto next ill_ifp in the list. 
*/ 3937 ifp = lastill->ill_ifptr->illif_next; 3938 3939 /* make sure not at end of circular list */ 3940 while (ifp == 3941 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) { 3942 if (++ctx->ctx_current_list > ctx->ctx_last_list) 3943 return (NULL); 3944 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst); 3945 } 3946 3947 return (avl_first(&ifp->illif_avl_by_ppa)); 3948 } 3949 3950 /* 3951 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+ 3952 * The final number (PPA) must not have any leading zeros. Upon success, a 3953 * pointer to the start of the PPA is returned; otherwise NULL is returned. 3954 */ 3955 static char * 3956 ill_get_ppa_ptr(char *name) 3957 { 3958 int namelen = strlen(name); 3959 int end_ndx = namelen - 1; 3960 int ppa_ndx, i; 3961 3962 /* 3963 * Check that the first character is [a-zA-Z], and that the last 3964 * character is [0-9]. 3965 */ 3966 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx])) 3967 return (NULL); 3968 3969 /* 3970 * Set `ppa_ndx' to the PPA start, and check for leading zeroes. 3971 */ 3972 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--) 3973 if (!isdigit(name[ppa_ndx - 1])) 3974 break; 3975 3976 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx) 3977 return (NULL); 3978 3979 /* 3980 * Check that the intermediate characters are [a-z0-9.] 3981 */ 3982 for (i = 1; i < ppa_ndx; i++) { 3983 if (!isalpha(name[i]) && !isdigit(name[i]) && 3984 name[i] != '.' && name[i] != '_') { 3985 return (NULL); 3986 } 3987 } 3988 3989 return (name + ppa_ndx); 3990 } 3991 3992 /* 3993 * use avl tree to locate the ill. 
3994 */ 3995 static ill_t * 3996 ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, 3997 ipsq_func_t func, int *error, ip_stack_t *ipst) 3998 { 3999 char *ppa_ptr = NULL; 4000 int len; 4001 uint_t ppa; 4002 ill_t *ill = NULL; 4003 ill_if_t *ifp; 4004 int list; 4005 ipsq_t *ipsq; 4006 4007 if (error != NULL) 4008 *error = 0; 4009 4010 /* 4011 * get ppa ptr 4012 */ 4013 if (isv6) 4014 list = IP_V6_G_HEAD; 4015 else 4016 list = IP_V4_G_HEAD; 4017 4018 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { 4019 if (error != NULL) 4020 *error = ENXIO; 4021 return (NULL); 4022 } 4023 4024 len = ppa_ptr - name + 1; 4025 4026 ppa = stoi(&ppa_ptr); 4027 4028 ifp = IP_VX_ILL_G_LIST(list, ipst); 4029 4030 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4031 /* 4032 * match is done on len - 1 as the name is not null 4033 * terminated it contains ppa in addition to the interface 4034 * name. 4035 */ 4036 if ((ifp->illif_name_len == len) && 4037 bcmp(ifp->illif_name, name, len - 1) == 0) { 4038 break; 4039 } else { 4040 ifp = ifp->illif_next; 4041 } 4042 } 4043 4044 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) { 4045 /* 4046 * Even the interface type does not exist. 
4047 */ 4048 if (error != NULL) 4049 *error = ENXIO; 4050 return (NULL); 4051 } 4052 4053 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); 4054 if (ill != NULL) { 4055 /* 4056 * The block comment at the start of ipif_down 4057 * explains the use of the macros used below 4058 */ 4059 GRAB_CONN_LOCK(q); 4060 mutex_enter(&ill->ill_lock); 4061 if (ILL_CAN_LOOKUP(ill)) { 4062 ill_refhold_locked(ill); 4063 mutex_exit(&ill->ill_lock); 4064 RELEASE_CONN_LOCK(q); 4065 return (ill); 4066 } else if (ILL_CAN_WAIT(ill, q)) { 4067 ipsq = ill->ill_phyint->phyint_ipsq; 4068 mutex_enter(&ipsq->ipsq_lock); 4069 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 4070 mutex_exit(&ill->ill_lock); 4071 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 4072 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 4073 mutex_exit(&ipsq->ipsq_lock); 4074 RELEASE_CONN_LOCK(q); 4075 if (error != NULL) 4076 *error = EINPROGRESS; 4077 return (NULL); 4078 } 4079 mutex_exit(&ill->ill_lock); 4080 RELEASE_CONN_LOCK(q); 4081 } 4082 if (error != NULL) 4083 *error = ENXIO; 4084 return (NULL); 4085 } 4086 4087 /* 4088 * comparison function for use with avl. 4089 */ 4090 static int 4091 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr) 4092 { 4093 uint_t ppa; 4094 uint_t ill_ppa; 4095 4096 ASSERT(ppa_ptr != NULL && ill_ptr != NULL); 4097 4098 ppa = *((uint_t *)ppa_ptr); 4099 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa; 4100 /* 4101 * We want the ill with the lowest ppa to be on the 4102 * top. 4103 */ 4104 if (ill_ppa < ppa) 4105 return (1); 4106 if (ill_ppa > ppa) 4107 return (-1); 4108 return (0); 4109 } 4110 4111 /* 4112 * remove an interface type from the global list. 
4113 */ 4114 static void 4115 ill_delete_interface_type(ill_if_t *interface) 4116 { 4117 ASSERT(interface != NULL); 4118 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0); 4119 4120 avl_destroy(&interface->illif_avl_by_ppa); 4121 if (interface->illif_ppa_arena != NULL) 4122 vmem_destroy(interface->illif_ppa_arena); 4123 4124 remque(interface); 4125 4126 mi_free(interface); 4127 } 4128 4129 /* 4130 * remove ill from the global list. 4131 */ 4132 static void 4133 ill_glist_delete(ill_t *ill) 4134 { 4135 ip_stack_t *ipst; 4136 phyint_t *phyi; 4137 4138 if (ill == NULL) 4139 return; 4140 ipst = ill->ill_ipst; 4141 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4142 4143 /* 4144 * If the ill was never inserted into the AVL tree 4145 * we skip the if branch. 4146 */ 4147 if (ill->ill_ifptr != NULL) { 4148 /* 4149 * remove from AVL tree and free ppa number 4150 */ 4151 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill); 4152 4153 if (ill->ill_ifptr->illif_ppa_arena != NULL) { 4154 vmem_free(ill->ill_ifptr->illif_ppa_arena, 4155 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4156 } 4157 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) { 4158 ill_delete_interface_type(ill->ill_ifptr); 4159 } 4160 4161 /* 4162 * Indicate ill is no longer in the list. 4163 */ 4164 ill->ill_ifptr = NULL; 4165 ill->ill_name_length = 0; 4166 ill->ill_name[0] = '\0'; 4167 ill->ill_ppa = UINT_MAX; 4168 } 4169 4170 /* Generate one last event for this ill. */ 4171 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name, 4172 ill->ill_name_length); 4173 4174 ASSERT(ill->ill_phyint != NULL); 4175 phyi = ill->ill_phyint; 4176 ill->ill_phyint = NULL; 4177 4178 /* 4179 * ill_init allocates a phyint always to store the copy 4180 * of flags relevant to phyint. At that point in time, we could 4181 * not assign the name and hence phyint_illv4/v6 could not be 4182 * initialized. Later in ipif_set_values, we assign the name to 4183 * the ill, at which point in time we assign phyint_illv4/v6. 
4184 * Thus we don't rely on phyint_illv6 to be initialized always. 4185 */ 4186 if (ill->ill_flags & ILLF_IPV6) 4187 phyi->phyint_illv6 = NULL; 4188 else 4189 phyi->phyint_illv4 = NULL; 4190 4191 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) { 4192 rw_exit(&ipst->ips_ill_g_lock); 4193 return; 4194 } 4195 4196 /* 4197 * There are no ills left on this phyint; pull it out of the phyint 4198 * avl trees, and free it. 4199 */ 4200 if (phyi->phyint_ifindex > 0) { 4201 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4202 phyi); 4203 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4204 phyi); 4205 } 4206 rw_exit(&ipst->ips_ill_g_lock); 4207 4208 phyint_free(phyi); 4209 } 4210 4211 /* 4212 * allocate a ppa, if the number of plumbed interfaces of this type are 4213 * less than ill_no_arena do a linear search to find a unused ppa. 4214 * When the number goes beyond ill_no_arena switch to using an arena. 4215 * Note: ppa value of zero cannot be allocated from vmem_arena as it 4216 * is the return value for an error condition, so allocation starts at one 4217 * and is decremented by one. 4218 */ 4219 static int 4220 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill) 4221 { 4222 ill_t *tmp_ill; 4223 uint_t start, end; 4224 int ppa; 4225 4226 if (ifp->illif_ppa_arena == NULL && 4227 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) { 4228 /* 4229 * Create an arena. 
4230 */ 4231 ifp->illif_ppa_arena = vmem_create(ifp->illif_name, 4232 (void *)1, UINT_MAX - 1, 1, NULL, NULL, 4233 NULL, 0, VM_SLEEP | VMC_IDENTIFIER); 4234 /* allocate what has already been assigned */ 4235 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa); 4236 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, 4237 tmp_ill, AVL_AFTER)) { 4238 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4239 1, /* size */ 4240 1, /* align/quantum */ 4241 0, /* phase */ 4242 0, /* nocross */ 4243 /* minaddr */ 4244 (void *)((uintptr_t)tmp_ill->ill_ppa + 1), 4245 /* maxaddr */ 4246 (void *)((uintptr_t)tmp_ill->ill_ppa + 2), 4247 VM_NOSLEEP|VM_FIRSTFIT); 4248 if (ppa == 0) { 4249 ip1dbg(("ill_alloc_ppa: ppa allocation" 4250 " failed while switching")); 4251 vmem_destroy(ifp->illif_ppa_arena); 4252 ifp->illif_ppa_arena = NULL; 4253 break; 4254 } 4255 } 4256 } 4257 4258 if (ifp->illif_ppa_arena != NULL) { 4259 if (ill->ill_ppa == UINT_MAX) { 4260 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena, 4261 1, VM_NOSLEEP|VM_FIRSTFIT); 4262 if (ppa == 0) 4263 return (EAGAIN); 4264 ill->ill_ppa = --ppa; 4265 } else { 4266 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena, 4267 1, /* size */ 4268 1, /* align/quantum */ 4269 0, /* phase */ 4270 0, /* nocross */ 4271 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */ 4272 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */ 4273 VM_NOSLEEP|VM_FIRSTFIT); 4274 /* 4275 * Most likely the allocation failed because 4276 * the requested ppa was in use. 4277 */ 4278 if (ppa == 0) 4279 return (EEXIST); 4280 } 4281 return (0); 4282 } 4283 4284 /* 4285 * No arena is in use and not enough (>ill_no_arena) interfaces have 4286 * been plumbed to create one. Do a linear search to get a unused ppa. 
4287 */ 4288 if (ill->ill_ppa == UINT_MAX) { 4289 end = UINT_MAX - 1; 4290 start = 0; 4291 } else { 4292 end = start = ill->ill_ppa; 4293 } 4294 4295 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL); 4296 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) { 4297 if (start++ >= end) { 4298 if (ill->ill_ppa == UINT_MAX) 4299 return (EAGAIN); 4300 else 4301 return (EEXIST); 4302 } 4303 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER); 4304 } 4305 ill->ill_ppa = start; 4306 return (0); 4307 } 4308 4309 /* 4310 * Insert ill into the list of configured ill's. Once this function completes, 4311 * the ill is globally visible and is available through lookups. More precisely 4312 * this happens after the caller drops the ill_g_lock. 4313 */ 4314 static int 4315 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6) 4316 { 4317 ill_if_t *ill_interface; 4318 avl_index_t where = 0; 4319 int error; 4320 int name_length; 4321 int index; 4322 boolean_t check_length = B_FALSE; 4323 ip_stack_t *ipst = ill->ill_ipst; 4324 4325 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 4326 4327 name_length = mi_strlen(name) + 1; 4328 4329 if (isv6) 4330 index = IP_V6_G_HEAD; 4331 else 4332 index = IP_V4_G_HEAD; 4333 4334 ill_interface = IP_VX_ILL_G_LIST(index, ipst); 4335 /* 4336 * Search for interface type based on name 4337 */ 4338 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4339 if ((ill_interface->illif_name_len == name_length) && 4340 (strcmp(ill_interface->illif_name, name) == 0)) { 4341 break; 4342 } 4343 ill_interface = ill_interface->illif_next; 4344 } 4345 4346 /* 4347 * Interface type not found, create one. 
4348 */ 4349 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) { 4350 ill_g_head_t ghead; 4351 4352 /* 4353 * allocate ill_if_t structure 4354 */ 4355 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t)); 4356 if (ill_interface == NULL) { 4357 return (ENOMEM); 4358 } 4359 4360 (void) strcpy(ill_interface->illif_name, name); 4361 ill_interface->illif_name_len = name_length; 4362 4363 avl_create(&ill_interface->illif_avl_by_ppa, 4364 ill_compare_ppa, sizeof (ill_t), 4365 offsetof(struct ill_s, ill_avl_byppa)); 4366 4367 /* 4368 * link the structure in the back to maintain order 4369 * of configuration for ifconfig output. 4370 */ 4371 ghead = ipst->ips_ill_g_heads[index]; 4372 insque(ill_interface, ghead.ill_g_list_tail); 4373 } 4374 4375 if (ill->ill_ppa == UINT_MAX) 4376 check_length = B_TRUE; 4377 4378 error = ill_alloc_ppa(ill_interface, ill); 4379 if (error != 0) { 4380 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4381 ill_delete_interface_type(ill->ill_ifptr); 4382 return (error); 4383 } 4384 4385 /* 4386 * When the ppa is choosen by the system, check that there is 4387 * enough space to insert ppa. if a specific ppa was passed in this 4388 * check is not required as the interface name passed in will have 4389 * the right ppa in it. 4390 */ 4391 if (check_length) { 4392 /* 4393 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars. 4394 */ 4395 char buf[sizeof (uint_t) * 3]; 4396 4397 /* 4398 * convert ppa to string to calculate the amount of space 4399 * required for it in the name. 4400 */ 4401 numtos(ill->ill_ppa, buf); 4402 4403 /* Do we have enough space to insert ppa ? 
*/ 4404 4405 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) { 4406 /* Free ppa and interface type struct */ 4407 if (ill_interface->illif_ppa_arena != NULL) { 4408 vmem_free(ill_interface->illif_ppa_arena, 4409 (void *)(uintptr_t)(ill->ill_ppa+1), 1); 4410 } 4411 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0) 4412 ill_delete_interface_type(ill->ill_ifptr); 4413 4414 return (EINVAL); 4415 } 4416 } 4417 4418 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa); 4419 ill->ill_name_length = mi_strlen(ill->ill_name) + 1; 4420 4421 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa, 4422 &where); 4423 ill->ill_ifptr = ill_interface; 4424 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where); 4425 4426 ill_phyint_reinit(ill); 4427 return (0); 4428 } 4429 4430 /* Initialize the per phyint ipsq used for serialization */ 4431 static boolean_t 4432 ipsq_init(ill_t *ill, boolean_t enter) 4433 { 4434 ipsq_t *ipsq; 4435 ipxop_t *ipx; 4436 4437 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL) 4438 return (B_FALSE); 4439 4440 ill->ill_phyint->phyint_ipsq = ipsq; 4441 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop; 4442 ipx->ipx_ipsq = ipsq; 4443 ipsq->ipsq_next = ipsq; 4444 ipsq->ipsq_phyint = ill->ill_phyint; 4445 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0); 4446 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0); 4447 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */ 4448 if (enter) { 4449 ipx->ipx_writer = curthread; 4450 ipx->ipx_forced = B_FALSE; 4451 ipx->ipx_reentry_cnt = 1; 4452 #ifdef DEBUG 4453 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 4454 #endif 4455 } 4456 return (B_TRUE); 4457 } 4458 4459 /* 4460 * ill_init is called by ip_open when a device control stream is opened. 4461 * It does a few initializations, and shoots a DL_INFO_REQ message down 4462 * to the driver. The response is later picked up in ip_rput_dlpi and 4463 * used to set up default mechanisms for talking to the driver. 
 * (Always called as writer.)
 *
 * If this function returns error, ip_open will call ip_close which in
 * turn will call ill_delete to clean up any memory allocated here that
 * is not yet freed.
 *
 * Returns 0 on success or ENOMEM if any of the allocations fails.
 *
 * NOTE(review): on the ENOMEM paths below frag_ptr (and in one case
 * ill->ill_phyint) is freed while ill->ill_frag_ptr, ill->ill_frag_hash_tbl,
 * ill->ill_name (and ill->ill_phyint) still point at the freed memory.
 * Confirm that the error-path cleanup described above never touches them.
 */
int
ill_init(queue_t *q, ill_t *ill)
{
	int	count;
	dl_info_req_t	*dlir;
	mblk_t	*info_mp;
	uchar_t *frag_ptr;

	/*
	 * The ill is initialized to zero by mi_alloc*(). In addition
	 * some fields already contain valid values, initialized in
	 * ip_open(), before we reach here.
	 */
	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);

	ill->ill_rq = q;
	ill->ill_wq = WR(q);

	/* Sized for the larger of the DL_INFO_REQ and the DL_INFO_ACK. */
	info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
	    BPRI_HI);
	if (info_mp == NULL)
		return (ENOMEM);

	/*
	 * Allocate sufficient space to contain our fragment hash table and
	 * the device name.
	 */
	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE +
	    2 * LIFNAMSIZ + 5 + strlen(ipv6_forward_suffix));
	if (frag_ptr == NULL) {
		freemsg(info_mp);
		return (ENOMEM);
	}
	ill->ill_frag_ptr = frag_ptr;
	ill->ill_frag_free_num_pkts = 0;
	ill->ill_last_frag_clean_time = 0;
	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
	/* The name buffer lives directly after the fragment hash table. */
	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}

	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
	if (ill->ill_phyint == NULL) {
		freemsg(info_mp);
		mi_free(frag_ptr);
		return (ENOMEM);
	}

	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
	/*
	 * For now pretend this is a v4 ill. We need to set phyint_ill*
	 * at this point because of the following reason. If we can't
	 * enter the ipsq at some point and cv_wait, the writer that
	 * wakes us up tries to locate us using the list of all phyints
	 * in an ipsq and the ills from the phyint thru the phyint_ill*.
	 * If we don't set it now, we risk a missed wakeup.
	 */
	ill->ill_phyint->phyint_illv4 = ill;
	/* UINT_MAX means "no ppa assigned yet"; see ill_alloc_ppa(). */
	ill->ill_ppa = UINT_MAX;
	ill->ill_fastpath_list = &ill->ill_fastpath_list;

	/* Create the ipsq with the current thread entered as writer. */
	if (!ipsq_init(ill, B_TRUE)) {
		freemsg(info_mp);
		mi_free(frag_ptr);
		mi_free(ill->ill_phyint);
		return (ENOMEM);
	}

	ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;

	/* Frag queue limit stuff */
	ill->ill_frag_count = 0;
	ill->ill_ipf_gen = 0;

	ill->ill_global_timer = INFINITY;
	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;

	/*
	 * Initialize IPv6 configuration variables.  The IP module is always
	 * opened as an IPv4 module.  Instead tracking down the cases where
	 * it switches to do ipv6, we'll just initialize the IPv6 configuration
	 * here for convenience, this has no effect until the ill is set to do
	 * IPv6.
	 */
	ill->ill_reachable_time = ND_REACHABLE_TIME;
	ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
	ill->ill_max_buf = ND_MAX_Q;
	ill->ill_refcnt = 0;

	/* Send down the Info Request to the driver. */
	info_mp->b_datap->db_type = M_PCPROTO;
	dlir = (dl_info_req_t *)info_mp->b_rptr;
	info_mp->b_wptr = (uchar_t *)&dlir[1];
	dlir->dl_primitive = DL_INFO_REQ;

	ill->ill_dlpi_pending = DL_PRIM_INVAL;

	/* Enable the queue's put/service procedures before sending. */
	qprocson(q);
	ill_dlpi_send(ill, info_mp);

	return (0);
}

/*
 * ill_dls_info
 * creates datalink socket info from the device.
4582 */ 4583 int 4584 ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) 4585 { 4586 size_t len; 4587 ill_t *ill = ipif->ipif_ill; 4588 4589 sdl->sdl_family = AF_LINK; 4590 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4591 sdl->sdl_type = ill->ill_type; 4592 ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4593 len = strlen(sdl->sdl_data); 4594 ASSERT(len < 256); 4595 sdl->sdl_nlen = (uchar_t)len; 4596 sdl->sdl_alen = ill->ill_phys_addr_length; 4597 sdl->sdl_slen = 0; 4598 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL) 4599 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen); 4600 4601 return (sizeof (struct sockaddr_dl)); 4602 } 4603 4604 /* 4605 * ill_xarp_info 4606 * creates xarp info from the device. 4607 */ 4608 static int 4609 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) 4610 { 4611 sdl->sdl_family = AF_LINK; 4612 sdl->sdl_index = ill->ill_phyint->phyint_ifindex; 4613 sdl->sdl_type = ill->ill_type; 4614 ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); 4615 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); 4616 sdl->sdl_alen = ill->ill_phys_addr_length; 4617 sdl->sdl_slen = 0; 4618 return (sdl->sdl_nlen); 4619 } 4620 4621 static int 4622 loopback_kstat_update(kstat_t *ksp, int rw) 4623 { 4624 kstat_named_t *kn; 4625 netstackid_t stackid; 4626 netstack_t *ns; 4627 ip_stack_t *ipst; 4628 4629 if (ksp == NULL || ksp->ks_data == NULL) 4630 return (EIO); 4631 4632 if (rw == KSTAT_WRITE) 4633 return (EACCES); 4634 4635 kn = KSTAT_NAMED_PTR(ksp); 4636 stackid = (zoneid_t)(uintptr_t)ksp->ks_private; 4637 4638 ns = netstack_find_by_stackid(stackid); 4639 if (ns == NULL) 4640 return (-1); 4641 4642 ipst = ns->netstack_ip; 4643 if (ipst == NULL) { 4644 netstack_rele(ns); 4645 return (-1); 4646 } 4647 kn[0].value.ui32 = ipst->ips_loopback_packets; 4648 kn[1].value.ui32 = ipst->ips_loopback_packets; 4649 netstack_rele(ns); 4650 return (0); 4651 } 4652 4653 /* 4654 * Has ifindex been plumbed 
already? 4655 */ 4656 boolean_t 4657 phyint_exists(uint_t index, ip_stack_t *ipst) 4658 { 4659 ASSERT(index != 0); 4660 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 4661 4662 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 4663 &index, NULL) != NULL); 4664 } 4665 4666 /* Pick a unique ifindex */ 4667 boolean_t 4668 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst) 4669 { 4670 uint_t starting_index; 4671 4672 if (!ipst->ips_ill_index_wrap) { 4673 *indexp = ipst->ips_ill_index++; 4674 if (ipst->ips_ill_index == 0) { 4675 /* Reached the uint_t limit Next time wrap */ 4676 ipst->ips_ill_index_wrap = B_TRUE; 4677 } 4678 return (B_TRUE); 4679 } 4680 4681 /* 4682 * Start reusing unused indexes. Note that we hold the ill_g_lock 4683 * at this point and don't want to call any function that attempts 4684 * to get the lock again. 4685 */ 4686 starting_index = ipst->ips_ill_index++; 4687 for (; ipst->ips_ill_index != starting_index; ipst->ips_ill_index++) { 4688 if (ipst->ips_ill_index != 0 && 4689 !phyint_exists(ipst->ips_ill_index, ipst)) { 4690 /* found unused index - use it */ 4691 *indexp = ipst->ips_ill_index; 4692 return (B_TRUE); 4693 } 4694 } 4695 4696 /* 4697 * all interface indicies are inuse. 4698 */ 4699 return (B_FALSE); 4700 } 4701 4702 /* 4703 * Assign a unique interface index for the phyint. 4704 */ 4705 static boolean_t 4706 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst) 4707 { 4708 ASSERT(phyi->phyint_ifindex == 0); 4709 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst)); 4710 } 4711 4712 /* 4713 * Initialize the flags on `phyi' as per the provided mactype. 4714 */ 4715 static void 4716 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) 4717 { 4718 uint64_t flags = 0; 4719 4720 /* 4721 * Initialize PHYI_RUNNING and PHYI_FAILED. 
For non-IPMP interfaces, 4722 * we always presume the underlying hardware is working and set 4723 * PHYI_RUNNING (if it's not, the driver will subsequently send a 4724 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization 4725 * there are no active interfaces in the group so we set PHYI_FAILED. 4726 */ 4727 if (mactype == SUNW_DL_IPMP) 4728 flags |= PHYI_FAILED; 4729 else 4730 flags |= PHYI_RUNNING; 4731 4732 switch (mactype) { 4733 case SUNW_DL_VNI: 4734 flags |= PHYI_VIRTUAL; 4735 break; 4736 case SUNW_DL_IPMP: 4737 flags |= PHYI_IPMP; 4738 break; 4739 case DL_LOOP: 4740 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL); 4741 break; 4742 } 4743 4744 mutex_enter(&phyi->phyint_lock); 4745 phyi->phyint_flags |= flags; 4746 mutex_exit(&phyi->phyint_lock); 4747 } 4748 4749 /* 4750 * Return a pointer to the ill which matches the supplied name. Note that 4751 * the ill name length includes the null termination character. (May be 4752 * called as writer.) 4753 * If do_alloc and the interface is "lo0" it will be automatically created. 4754 * Cannot bump up reference on condemned ills. So dup detect can't be done 4755 * using this func. 4756 */ 4757 ill_t * 4758 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, 4759 queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, 4760 ip_stack_t *ipst) 4761 { 4762 ill_t *ill; 4763 ipif_t *ipif; 4764 ipsq_t *ipsq; 4765 kstat_named_t *kn; 4766 boolean_t isloopback; 4767 in6_addr_t ov6addr; 4768 4769 isloopback = mi_strcmp(name, ipif_loopback_name) == 0; 4770 4771 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4772 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4773 rw_exit(&ipst->ips_ill_g_lock); 4774 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) 4775 return (ill); 4776 4777 /* 4778 * Couldn't find it. Does this happen to be a lookup for the 4779 * loopback device and are we allowed to allocate it? 
4780 */ 4781 if (!isloopback || !do_alloc) 4782 return (NULL); 4783 4784 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 4785 4786 ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); 4787 if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { 4788 rw_exit(&ipst->ips_ill_g_lock); 4789 return (ill); 4790 } 4791 4792 /* Create the loopback device on demand */ 4793 ill = (ill_t *)(mi_alloc(sizeof (ill_t) + 4794 sizeof (ipif_loopback_name), BPRI_MED)); 4795 if (ill == NULL) 4796 goto done; 4797 4798 *ill = ill_null; 4799 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); 4800 ill->ill_ipst = ipst; 4801 netstack_hold(ipst->ips_netstack); 4802 /* 4803 * For exclusive stacks we set the zoneid to zero 4804 * to make IP operate as if in the global zone. 4805 */ 4806 ill->ill_zoneid = GLOBAL_ZONEID; 4807 4808 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 4809 if (ill->ill_phyint == NULL) 4810 goto done; 4811 4812 if (isv6) 4813 ill->ill_phyint->phyint_illv6 = ill; 4814 else 4815 ill->ill_phyint->phyint_illv4 = ill; 4816 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 4817 phyint_flags_init(ill->ill_phyint, DL_LOOP); 4818 4819 ill->ill_max_frag = IP_LOOPBACK_MTU; 4820 /* Add room for tcp+ip headers */ 4821 if (isv6) { 4822 ill->ill_isv6 = B_TRUE; 4823 ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ 4824 } else { 4825 ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; 4826 } 4827 if (!ill_allocate_mibs(ill)) 4828 goto done; 4829 ill->ill_max_mtu = ill->ill_max_frag; 4830 /* 4831 * ipif_loopback_name can't be pointed at directly because its used 4832 * by both the ipv4 and ipv6 interfaces. When the ill is removed 4833 * from the glist, ill_glist_delete() sets the first character of 4834 * ill_name to '\0'. 
4835 */ 4836 ill->ill_name = (char *)ill + sizeof (*ill); 4837 (void) strcpy(ill->ill_name, ipif_loopback_name); 4838 ill->ill_name_length = sizeof (ipif_loopback_name); 4839 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ 4840 ill->ill_dlpi_pending = DL_PRIM_INVAL; 4841 4842 ill->ill_global_timer = INFINITY; 4843 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 4844 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 4845 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 4846 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 4847 4848 /* No resolver here. */ 4849 ill->ill_net_type = IRE_LOOPBACK; 4850 4851 /* Initialize the ipsq */ 4852 if (!ipsq_init(ill, B_FALSE)) 4853 goto done; 4854 4855 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE); 4856 if (ipif == NULL) 4857 goto done; 4858 4859 ill->ill_flags = ILLF_MULTICAST; 4860 4861 ov6addr = ipif->ipif_v6lcl_addr; 4862 /* Set up default loopback address and mask. */ 4863 if (!isv6) { 4864 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); 4865 4866 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); 4867 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4868 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); 4869 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4870 ipif->ipif_v6subnet); 4871 ill->ill_flags |= ILLF_IPV4; 4872 } else { 4873 ipif->ipif_v6lcl_addr = ipv6_loopback; 4874 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 4875 ipif->ipif_v6net_mask = ipv6_all_ones; 4876 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 4877 ipif->ipif_v6subnet); 4878 ill->ill_flags |= ILLF_IPV6; 4879 } 4880 4881 /* 4882 * Chain us in at the end of the ill list. hold the ill 4883 * before we make it globally visible. 1 for the lookup. 
4884 */ 4885 ill->ill_refcnt = 0; 4886 ill_refhold(ill); 4887 4888 ill->ill_frag_count = 0; 4889 ill->ill_frag_free_num_pkts = 0; 4890 ill->ill_last_frag_clean_time = 0; 4891 4892 ipsq = ill->ill_phyint->phyint_ipsq; 4893 4894 if (ill_glist_insert(ill, "lo", isv6) != 0) 4895 cmn_err(CE_PANIC, "cannot insert loopback interface"); 4896 4897 /* Let SCTP know so that it can add this to its list */ 4898 sctp_update_ill(ill, SCTP_ILL_INSERT); 4899 4900 /* 4901 * We have already assigned ipif_v6lcl_addr above, but we need to 4902 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 4903 * requires to be after ill_glist_insert() since we need the 4904 * ill_index set. Pass on ipv6_loopback as the old address. 4905 */ 4906 sctp_update_ipif_addr(ipif, ov6addr); 4907 4908 /* 4909 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 4910 * If so, free our original one. 4911 */ 4912 if (ipsq != ill->ill_phyint->phyint_ipsq) 4913 ipsq_delete(ipsq); 4914 4915 if (ipst->ips_loopback_ksp == NULL) { 4916 /* Export loopback interface statistics */ 4917 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 4918 ipif_loopback_name, "net", 4919 KSTAT_TYPE_NAMED, 2, 0, 4920 ipst->ips_netstack->netstack_stackid); 4921 if (ipst->ips_loopback_ksp != NULL) { 4922 ipst->ips_loopback_ksp->ks_update = 4923 loopback_kstat_update; 4924 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 4925 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 4926 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 4927 ipst->ips_loopback_ksp->ks_private = 4928 (void *)(uintptr_t)ipst->ips_netstack-> 4929 netstack_stackid; 4930 kstat_install(ipst->ips_loopback_ksp); 4931 } 4932 } 4933 4934 if (error != NULL) 4935 *error = 0; 4936 *did_alloc = B_TRUE; 4937 rw_exit(&ipst->ips_ill_g_lock); 4938 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 4939 NE_PLUMB, ill->ill_name, ill->ill_name_length); 4940 return (ill); 4941 done: 4942 if (ill != NULL) { 4943 if (ill->ill_phyint 
!= NULL) { 4944 ipsq = ill->ill_phyint->phyint_ipsq; 4945 if (ipsq != NULL) { 4946 ipsq->ipsq_phyint = NULL; 4947 ipsq_delete(ipsq); 4948 } 4949 mi_free(ill->ill_phyint); 4950 } 4951 ill_free_mib(ill); 4952 if (ill->ill_ipst != NULL) 4953 netstack_rele(ill->ill_ipst->ips_netstack); 4954 mi_free(ill); 4955 } 4956 rw_exit(&ipst->ips_ill_g_lock); 4957 if (error != NULL) 4958 *error = ENOMEM; 4959 return (NULL); 4960 } 4961 4962 /* 4963 * For IPP calls - use the ip_stack_t for global stack. 4964 */ 4965 ill_t * 4966 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, 4967 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) 4968 { 4969 ip_stack_t *ipst; 4970 ill_t *ill; 4971 4972 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 4973 if (ipst == NULL) { 4974 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 4975 return (NULL); 4976 } 4977 4978 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 4979 netstack_rele(ipst->ips_netstack); 4980 return (ill); 4981 } 4982 4983 /* 4984 * Return a pointer to the ill which matches the index and IP version type. 4985 */ 4986 ill_t * 4987 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, 4988 ipsq_func_t func, int *err, ip_stack_t *ipst) 4989 { 4990 ill_t *ill; 4991 ipsq_t *ipsq; 4992 phyint_t *phyi; 4993 4994 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 4995 (q != NULL && mp != NULL && func != NULL && err != NULL)); 4996 4997 if (err != NULL) 4998 *err = 0; 4999 5000 /* 5001 * Indexes are stored in the phyint - a common structure 5002 * to both IPv4 and IPv6. 5003 */ 5004 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5005 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5006 (void *) &index, NULL); 5007 if (phyi != NULL) { 5008 ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; 5009 if (ill != NULL) { 5010 /* 5011 * The block comment at the start of ipif_down 5012 * explains the use of the macros used below 5013 */ 5014 GRAB_CONN_LOCK(q); 5015 mutex_enter(&ill->ill_lock); 5016 if (ILL_CAN_LOOKUP(ill)) { 5017 ill_refhold_locked(ill); 5018 mutex_exit(&ill->ill_lock); 5019 RELEASE_CONN_LOCK(q); 5020 rw_exit(&ipst->ips_ill_g_lock); 5021 return (ill); 5022 } else if (ILL_CAN_WAIT(ill, q)) { 5023 ipsq = ill->ill_phyint->phyint_ipsq; 5024 mutex_enter(&ipsq->ipsq_lock); 5025 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5026 rw_exit(&ipst->ips_ill_g_lock); 5027 mutex_exit(&ill->ill_lock); 5028 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 5029 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5030 mutex_exit(&ipsq->ipsq_lock); 5031 RELEASE_CONN_LOCK(q); 5032 if (err != NULL) 5033 *err = EINPROGRESS; 5034 return (NULL); 5035 } 5036 RELEASE_CONN_LOCK(q); 5037 mutex_exit(&ill->ill_lock); 5038 } 5039 } 5040 rw_exit(&ipst->ips_ill_g_lock); 5041 if (err != NULL) 5042 *err = ENXIO; 5043 return (NULL); 5044 } 5045 5046 /* 5047 * Return the ifindex next in sequence after the passed in ifindex. 5048 * If there is no next ifindex for the given protocol, return 0. 
5049 */ 5050 uint_t 5051 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 5052 { 5053 phyint_t *phyi; 5054 phyint_t *phyi_initial; 5055 uint_t ifindex; 5056 5057 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5058 5059 if (index == 0) { 5060 phyi = avl_first( 5061 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 5062 } else { 5063 phyi = phyi_initial = avl_find( 5064 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5065 (void *) &index, NULL); 5066 } 5067 5068 for (; phyi != NULL; 5069 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 5070 phyi, AVL_AFTER)) { 5071 /* 5072 * If we're not returning the first interface in the tree 5073 * and we still haven't moved past the phyint_t that 5074 * corresponds to index, avl_walk needs to be called again 5075 */ 5076 if (!((index != 0) && (phyi == phyi_initial))) { 5077 if (isv6) { 5078 if ((phyi->phyint_illv6) && 5079 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 5080 (phyi->phyint_illv6->ill_isv6 == 1)) 5081 break; 5082 } else { 5083 if ((phyi->phyint_illv4) && 5084 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 5085 (phyi->phyint_illv4->ill_isv6 == 0)) 5086 break; 5087 } 5088 } 5089 } 5090 5091 rw_exit(&ipst->ips_ill_g_lock); 5092 5093 if (phyi != NULL) 5094 ifindex = phyi->phyint_ifindex; 5095 else 5096 ifindex = 0; 5097 5098 return (ifindex); 5099 } 5100 5101 /* 5102 * Return the ifindex for the named interface. 5103 * If there is no next ifindex for the interface, return 0. 
5104 */ 5105 uint_t 5106 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 5107 { 5108 phyint_t *phyi; 5109 avl_index_t where = 0; 5110 uint_t ifindex; 5111 5112 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5113 5114 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 5115 name, &where)) == NULL) { 5116 rw_exit(&ipst->ips_ill_g_lock); 5117 return (0); 5118 } 5119 5120 ifindex = phyi->phyint_ifindex; 5121 5122 rw_exit(&ipst->ips_ill_g_lock); 5123 5124 return (ifindex); 5125 } 5126 5127 /* 5128 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 5129 * that gives a running thread a reference to the ill. This reference must be 5130 * released by the thread when it is done accessing the ill and related 5131 * objects. ill_refcnt can not be used to account for static references 5132 * such as other structures pointing to an ill. Callers must generally 5133 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 5134 * or be sure that the ill is not being deleted or changing state before 5135 * calling the refhold functions. A non-zero ill_refcnt ensures that the 5136 * ill won't change any of its critical state such as address, netmask etc. 5137 */ 5138 void 5139 ill_refhold(ill_t *ill) 5140 { 5141 mutex_enter(&ill->ill_lock); 5142 ill->ill_refcnt++; 5143 ILL_TRACE_REF(ill); 5144 mutex_exit(&ill->ill_lock); 5145 } 5146 5147 void 5148 ill_refhold_locked(ill_t *ill) 5149 { 5150 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5151 ill->ill_refcnt++; 5152 ILL_TRACE_REF(ill); 5153 } 5154 5155 int 5156 ill_check_and_refhold(ill_t *ill) 5157 { 5158 mutex_enter(&ill->ill_lock); 5159 if (ILL_CAN_LOOKUP(ill)) { 5160 ill_refhold_locked(ill); 5161 mutex_exit(&ill->ill_lock); 5162 return (0); 5163 } 5164 mutex_exit(&ill->ill_lock); 5165 return (ILL_LOOKUP_FAILED); 5166 } 5167 5168 /* 5169 * Must not be called while holding any locks. 
Otherwise if this is 5170 * the last reference to be released, there is a chance of recursive mutex 5171 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5172 * to restart an ioctl. 5173 */ 5174 void 5175 ill_refrele(ill_t *ill) 5176 { 5177 mutex_enter(&ill->ill_lock); 5178 ASSERT(ill->ill_refcnt != 0); 5179 ill->ill_refcnt--; 5180 ILL_UNTRACE_REF(ill); 5181 if (ill->ill_refcnt != 0) { 5182 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 5183 mutex_exit(&ill->ill_lock); 5184 return; 5185 } 5186 5187 /* Drops the ill_lock */ 5188 ipif_ill_refrele_tail(ill); 5189 } 5190 5191 /* 5192 * Obtain a weak reference count on the ill. This reference ensures the 5193 * ill won't be freed, but the ill may change any of its critical state 5194 * such as netmask, address etc. Returns an error if the ill has started 5195 * closing. 5196 */ 5197 boolean_t 5198 ill_waiter_inc(ill_t *ill) 5199 { 5200 mutex_enter(&ill->ill_lock); 5201 if (ill->ill_state_flags & ILL_CONDEMNED) { 5202 mutex_exit(&ill->ill_lock); 5203 return (B_FALSE); 5204 } 5205 ill->ill_waiters++; 5206 mutex_exit(&ill->ill_lock); 5207 return (B_TRUE); 5208 } 5209 5210 void 5211 ill_waiter_dcr(ill_t *ill) 5212 { 5213 mutex_enter(&ill->ill_lock); 5214 ill->ill_waiters--; 5215 if (ill->ill_waiters == 0) 5216 cv_broadcast(&ill->ill_cv); 5217 mutex_exit(&ill->ill_lock); 5218 } 5219 5220 /* 5221 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 5222 * driver. We construct best guess defaults for lower level information that 5223 * we need. If an interface is brought up without injection of any overriding 5224 * information from outside, we have to be ready to go with these defaults. 5225 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 5226 * we primarely want the dl_provider_style. 5227 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 5228 * at which point we assume the other part of the information is valid. 
 */
void
ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
{
	uchar_t		*brdcst_addr;
	uint_t		brdcst_addr_length, phys_addr_length;
	t_scalar_t	sap_length;
	dl_info_ack_t	*dlia;
	ip_m_t		*ipm;
	dl_qos_cl_sel1_t *sel1;
	int		min_mtu;

	/* `mp' is consumed: it is freed on both return paths below. */
	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Till the ill is fully up ILL_CHANGING will be set and
	 * the ill is not globally visible. So no need for a lock.
	 */
	dlia = (dl_info_ack_t *)mp->b_rptr;
	ill->ill_mactype = dlia->dl_mac_type;

	/* Unknown MAC types fall back to the DL_OTHER media entry. */
	ipm = ip_m_lookup(dlia->dl_mac_type);
	if (ipm == NULL) {
		ipm = ip_m_lookup(DL_OTHER);
		ASSERT(ipm != NULL);
	}
	ill->ill_media = ipm;

	/*
	 * When the new DLPI stuff is ready we'll pull lengths
	 * from dlia.
	 */
	if (dlia->dl_version == DL_VERSION_2) {
		brdcst_addr_length = dlia->dl_brdcst_addr_length;
		brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
		    brdcst_addr_length);
		/* No usable broadcast address in the message */
		if (brdcst_addr == NULL) {
			brdcst_addr_length = 0;
		}
		sap_length = dlia->dl_sap_length;
		/* dl_addr_length covers both the physical address and the SAP */
		phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
		ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
		    brdcst_addr_length, sap_length, phys_addr_length));
	} else {
		/* Not DL_VERSION_2: use a 6-byte all-ones broadcast address */
		brdcst_addr_length = 6;
		brdcst_addr = ip_six_byte_all_ones;
		sap_length = -2;
		phys_addr_length = brdcst_addr_length;
	}

	ill->ill_bcast_addr_length = brdcst_addr_length;
	ill->ill_phys_addr_length = phys_addr_length;
	ill->ill_sap_length = sap_length;

	/*
	 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
	 * but we must ensure a minimum IP MTU is used since other bits of
	 * IP will fly apart otherwise.
	 */
	min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
	ill->ill_max_frag  = MAX(min_mtu, dlia->dl_max_sdu);
	ill->ill_max_mtu = ill->ill_max_frag;

	ill->ill_type = ipm->ip_m_type;

	if (!ill->ill_dlpi_style_set) {
		if (dlia->dl_provider_style == DL_STYLE2)
			ill->ill_needs_attach = 1;

		phyint_flags_init(ill->ill_phyint, ill->ill_mactype);

		/*
		 * Allocate the first ipif on this ill.  We don't delay it
		 * further as ioctl handling assumes at least one ipif exists.
		 *
		 * At this point we don't know whether the ill is v4 or v6.
		 * We will know this when the SIOCSLIFNAME happens and
		 * the correct value for ill_isv6 will be assigned in
		 * ipif_set_values(). We need to hold the ill lock and
		 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
		 * the wakeup.
		 */
		(void) ipif_allocate(ill, 0, IRE_LOCAL,
		    dlia->dl_provider_style != DL_STYLE2, B_TRUE);
		mutex_enter(&ill->ill_lock);
		ASSERT(ill->ill_dlpi_style_set == 0);
		ill->ill_dlpi_style_set = 1;
		ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
		cv_broadcast(&ill->ill_cv);
		mutex_exit(&ill->ill_lock);
		freemsg(mp);
		return;
	}
	ASSERT(ill->ill_ipif != NULL);
	/*
	 * We know whether it is IPv4 or IPv6 now, as this is the
	 * second DL_INFO_ACK we are receiving in response to the
	 * DL_INFO_REQ sent in ipif_set_values.
	 */
	if (ill->ill_isv6)
		ill->ill_sap = IP6_DL_SAP;
	else
		ill->ill_sap = IP_DL_SAP;
	/*
	 * Set ipif_mtu which is used to set the IRE's
	 * ire_max_frag value. The driver could have sent
	 * a different mtu from what it sent last time. No
	 * need to call ipif_mtu_change because IREs have
	 * not yet been created.
	 */
	ill->ill_ipif->ipif_mtu = ill->ill_max_mtu;
	/*
	 * Clear all the flags that were set based on ill_bcast_addr_length
	 * and ill_phys_addr_length (in ipif_set_values) as these could have
	 * changed now and we need to re-evaluate.
	 */
	ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
	ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);

	/*
	 * Free ill_resolver_mp and ill_bcast_mp as things could have
	 * changed now.
	 *
	 * NOTE: The IPMP meta-interface is special-cased because it starts
	 * with no underlying interfaces (and thus an unknown broadcast
	 * address length), but we enforce that an interface is broadcast-
	 * capable as part of allowing it to join a group.
	 */
	if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
		/* No broadcast address (and not IPMP) */
		if (ill->ill_resolver_mp != NULL)
			freemsg(ill->ill_resolver_mp);
		if (ill->ill_bcast_mp != NULL)
			freemsg(ill->ill_bcast_mp);
		if (ill->ill_flags & ILLF_XRESOLV)
			ill->ill_net_type = IRE_IF_RESOLVER;
		else
			ill->ill_net_type = IRE_IF_NORESOLVER;
		ill->ill_resolver_mp = ill_dlur_gen(NULL,
		    ill->ill_phys_addr_length,
		    ill->ill_sap,
		    ill->ill_sap_length);
		/* ill_bcast_mp is set to a copy of the resolver message */
		ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp);

		if (ill->ill_isv6)
			/*
			 * Note: xresolv interfaces will eventually need NOARP
			 * set here as well, but that will require those
			 * external resolvers to have some knowledge of
			 * that flag and act appropriately. Not to be changed
			 * at present.
			 */
			ill->ill_flags |= ILLF_NONUD;
		else
			ill->ill_flags |= ILLF_NOARP;

		if (ill->ill_phys_addr_length == 0) {
			if (ill->ill_media->ip_m_mac_type == SUNW_DL_VNI) {
				ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
			} else {
				/* pt-pt supports multicast. */
				ill->ill_flags |= ILLF_MULTICAST;
				ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
			}
		}
	} else {
		/* Broadcast-capable (or IPMP) */
		ill->ill_net_type = IRE_IF_RESOLVER;
		if (ill->ill_bcast_mp != NULL)
			freemsg(ill->ill_bcast_mp);
		ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
		    ill->ill_bcast_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		/*
		 * Later detect lack of DLPI driver multicast
		 * capability by catching DL_ENABMULTI errors in
		 * ip_rput_dlpi.
		 */
		ill->ill_flags |= ILLF_MULTICAST;
		if (!ill->ill_isv6)
			ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
	}

	/* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
	if (ill->ill_mactype == SUNW_DL_IPMP)
		ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);

	/* By default an interface does not support any CoS marking */
	ill->ill_flags &= ~ILLF_COS_ENABLED;

	/*
	 * If we get QoS information in DL_INFO_ACK, the device supports
	 * some form of CoS marking, set ILLF_COS_ENABLED.
	 */
	sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
	    dlia->dl_qos_length);
	if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
		ill->ill_flags |= ILLF_COS_ENABLED;
	}

	/* Clear any previous error indication. */
	ill->ill_error = 0;
	freemsg(mp);
}

/*
 * Perform various checks to verify that an address would make sense as a
 * local, remote, or subnet interface address.
 *
 * `addr' is the IPv4 address to validate. `subnet_mask', when non-zero,
 * replaces the mask derived from `addr' by ip_net_mask().
 * Returns B_TRUE if the address passes every check, B_FALSE otherwise.
 */
static boolean_t
ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
{
	ipaddr_t	net_mask;

	/*
	 * Don't allow all zeroes, or all ones, but allow
	 * all ones netmask.
	 */
	if ((net_mask = ip_net_mask(addr)) == 0)
		return (B_FALSE);
	/* A given netmask overrides the "guess" netmask */
	if (subnet_mask != 0)
		net_mask = subnet_mask;
	/* Reject a host part that is all zeroes or all ones under the mask */
	if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
	    (addr == (addr | ~net_mask)))) {
		return (B_FALSE);
	}

	/*
	 * Even if the netmask is all ones, we do not allow address to be
	 * 255.255.255.255
	 */
	if (addr == INADDR_BROADCAST)
		return (B_FALSE);

	/* Class D addresses are rejected */
	if (CLASSD(addr))
		return (B_FALSE);

	return (B_TRUE);
}

/* True if the ipif's IPv6 local address (ipif_v6lcl_addr) is link-local. */
#define	V6_IPIF_LINKLOCAL(p)	\
	IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)

/*
 * Compare two given ipifs and check if the second one is better than
 * the first one using the order of preference (not taking deprecated
 * into account) specified in ipif_lookup_multicast().
 *
 * Returns B_TRUE only when `new_ipif' is strictly preferred over
 * `old_ipif'; on a tie the first one is kept and B_FALSE is returned.
 * The link-local checks are only made when `isv6' is set.
 */
static boolean_t
ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
{
	/* Check the least preferred first. */
	if (IS_LOOPBACK(old_ipif->ipif_ill)) {
		/* If both ipifs are the same, use the first one. */
		if (IS_LOOPBACK(new_ipif->ipif_ill))
			return (B_FALSE);
		else
			return (B_TRUE);
	}

	/* For IPv6, check for link local address. */
	if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
		if (IS_LOOPBACK(new_ipif->ipif_ill) ||
		    V6_IPIF_LINKLOCAL(new_ipif)) {
			/* The second one is equal or less preferred. */
			return (B_FALSE);
		} else {
			return (B_TRUE);
		}
	}

	/* Then check for point to point interface. */
	if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
		if (IS_LOOPBACK(new_ipif->ipif_ill) ||
		    (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
		    (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
			return (B_FALSE);
		} else {
			return (B_TRUE);
		}
	}

	/* old_ipif is a normal interface, so no need to use the new one. */
	return (B_FALSE);
}

/*
 * Find a multicast-capable ipif given an IP instance and zoneid.
 * The ipif must be up, and its ill must multicast-capable, not
 * condemned, not an underlying interface in an IPMP group, and
 * not a VNI interface.  Order of preference:
 *
 * 	1a. normal
 * 	1b. normal, but deprecated
 * 	2a. point to point
 * 	2b. point to point, but deprecated
 * 	3a. link local
 * 	3b. link local, but deprecated
 * 	4. loopback.
 *
 * Walks the IPv6 ills when `isv6' is set and the IPv4 ills otherwise.
 * The ipif returned (if any) has been refheld by this function; NULL is
 * returned when no candidate was found.
 */
ipif_t *
ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
{
	ill_t	*ill;
	ill_walk_context_t ctx;
	ipif_t	*ipif;
	ipif_t	*saved_ipif = NULL;	/* best non-deprecated candidate */
	ipif_t	*dep_ipif = NULL;	/* best deprecated candidate */

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (isv6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_V4(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* Skip only if neither side is ALL_ZONES and they differ */
			if (zoneid != ipif->ipif_zoneid &&
			    zoneid != ALL_ZONES &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				continue;
			}
			if (!(ipif->ipif_flags & IPIF_UP) ||
			    !IPIF_CAN_LOOKUP(ipif)) {
				continue;
			}

			/*
			 * Found one candidate.  If it is deprecated,
			 * remember it in dep_ipif.  If it is not deprecated,
			 * remember it in saved_ipif.
			 */
			if (ipif->ipif_flags & IPIF_DEPRECATED) {
				if (dep_ipif == NULL) {
					dep_ipif = ipif;
				} else if (ipif_comp_multi(dep_ipif, ipif,
				    isv6)) {
					/*
					 * If the previous dep_ipif does not
					 * belong to the same ill, we've done
					 * a ipif_refhold() on it.  So we need
					 * to release it.
					 */
					if (dep_ipif->ipif_ill != ill)
						ipif_refrele(dep_ipif);
					dep_ipif = ipif;
				}
				continue;
			}
			if (saved_ipif == NULL) {
				saved_ipif = ipif;
			} else {
				if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
					/* Same refhold rule as for dep_ipif */
					if (saved_ipif->ipif_ill != ill)
						ipif_refrele(saved_ipif);
					saved_ipif = ipif;
				}
			}
		}
		/*
		 * Before going to the next ill, do a ipif_refhold() on the
		 * saved ones.
		 */
		if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
			ipif_refhold_locked(saved_ipif);
		if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
			ipif_refhold_locked(dep_ipif);
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * If we have only the saved_ipif, return it.  But if we have both
	 * saved_ipif and dep_ipif, check to see which one is better.
	 * The one that is not returned has its reference released.
	 */
	if (saved_ipif != NULL) {
		if (dep_ipif != NULL) {
			if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
				ipif_refrele(saved_ipif);
				return (dep_ipif);
			} else {
				ipif_refrele(dep_ipif);
				return (saved_ipif);
			}
		}
		return (saved_ipif);
	} else {
		return (dep_ipif);
	}
}

/*
 * This function is called when an application does not specify an interface
 * to be used for multicast traffic (joining a group/sending data).  It
 * calls ire_lookup_multi() to look for an interface route for the
 * specified multicast group.  Doing this allows the administrator to add
 * prefix routes for multicast to indicate which interface to be used for
 * multicast traffic in the above scenario.  The route could be for all
 * multicast (224.0/4), for a single multicast group (a /32 route) or
 * anything in between.  If there is no such multicast route, we just find
 * any multicast capable interface and return it.  The returned ipif
 * is refhold'ed.
5637 */ 5638 ipif_t * 5639 ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) 5640 { 5641 ire_t *ire; 5642 ipif_t *ipif; 5643 5644 ire = ire_lookup_multi(group, zoneid, ipst); 5645 if (ire != NULL) { 5646 ipif = ire->ire_ipif; 5647 ipif_refhold(ipif); 5648 ire_refrele(ire); 5649 return (ipif); 5650 } 5651 5652 return (ipif_lookup_multicast(ipst, zoneid, B_FALSE)); 5653 } 5654 5655 /* 5656 * Look for an ipif with the specified interface address and destination. 5657 * The destination address is used only for matching point-to-point interfaces. 5658 */ 5659 ipif_t * 5660 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, 5661 ipsq_func_t func, int *error, ip_stack_t *ipst) 5662 { 5663 ipif_t *ipif; 5664 ill_t *ill; 5665 ill_walk_context_t ctx; 5666 ipsq_t *ipsq; 5667 5668 if (error != NULL) 5669 *error = 0; 5670 5671 /* 5672 * First match all the point-to-point interfaces 5673 * before looking at non-point-to-point interfaces. 5674 * This is done to avoid returning non-point-to-point 5675 * ipif instead of unnumbered point-to-point ipif. 
5676 */ 5677 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5678 ill = ILL_START_WALK_V4(&ctx, ipst); 5679 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5680 GRAB_CONN_LOCK(q); 5681 mutex_enter(&ill->ill_lock); 5682 for (ipif = ill->ill_ipif; ipif != NULL; 5683 ipif = ipif->ipif_next) { 5684 /* Allow the ipif to be down */ 5685 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5686 (ipif->ipif_lcl_addr == if_addr) && 5687 (ipif->ipif_pp_dst_addr == dst)) { 5688 /* 5689 * The block comment at the start of ipif_down 5690 * explains the use of the macros used below 5691 */ 5692 if (IPIF_CAN_LOOKUP(ipif)) { 5693 ipif_refhold_locked(ipif); 5694 mutex_exit(&ill->ill_lock); 5695 RELEASE_CONN_LOCK(q); 5696 rw_exit(&ipst->ips_ill_g_lock); 5697 return (ipif); 5698 } else if (IPIF_CAN_WAIT(ipif, q)) { 5699 ipsq = ill->ill_phyint->phyint_ipsq; 5700 mutex_enter(&ipsq->ipsq_lock); 5701 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5702 mutex_exit(&ill->ill_lock); 5703 rw_exit(&ipst->ips_ill_g_lock); 5704 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5705 ill); 5706 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5707 mutex_exit(&ipsq->ipsq_lock); 5708 RELEASE_CONN_LOCK(q); 5709 if (error != NULL) 5710 *error = EINPROGRESS; 5711 return (NULL); 5712 } 5713 } 5714 } 5715 mutex_exit(&ill->ill_lock); 5716 RELEASE_CONN_LOCK(q); 5717 } 5718 rw_exit(&ipst->ips_ill_g_lock); 5719 5720 /* lookup the ipif based on interface address */ 5721 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error, 5722 ipst); 5723 ASSERT(ipif == NULL || !ipif->ipif_isv6); 5724 return (ipif); 5725 } 5726 5727 /* 5728 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 
5729 */ 5730 static ipif_t * 5731 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, 5732 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, 5733 ip_stack_t *ipst) 5734 { 5735 ipif_t *ipif; 5736 ill_t *ill; 5737 boolean_t ptp = B_FALSE; 5738 ipsq_t *ipsq; 5739 ill_walk_context_t ctx; 5740 5741 if (error != NULL) 5742 *error = 0; 5743 5744 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5745 /* 5746 * Repeat twice, first based on local addresses and 5747 * next time for pointopoint. 5748 */ 5749 repeat: 5750 ill = ILL_START_WALK_V4(&ctx, ipst); 5751 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5752 if (match_ill != NULL && ill != match_ill && 5753 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 5754 continue; 5755 } 5756 GRAB_CONN_LOCK(q); 5757 mutex_enter(&ill->ill_lock); 5758 for (ipif = ill->ill_ipif; ipif != NULL; 5759 ipif = ipif->ipif_next) { 5760 if (zoneid != ALL_ZONES && 5761 zoneid != ipif->ipif_zoneid && 5762 ipif->ipif_zoneid != ALL_ZONES) 5763 continue; 5764 /* Allow the ipif to be down */ 5765 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5766 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5767 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5768 (ipif->ipif_pp_dst_addr == addr))) { 5769 /* 5770 * The block comment at the start of ipif_down 5771 * explains the use of the macros used below 5772 */ 5773 if (IPIF_CAN_LOOKUP(ipif)) { 5774 ipif_refhold_locked(ipif); 5775 mutex_exit(&ill->ill_lock); 5776 RELEASE_CONN_LOCK(q); 5777 rw_exit(&ipst->ips_ill_g_lock); 5778 return (ipif); 5779 } else if (IPIF_CAN_WAIT(ipif, q)) { 5780 ipsq = ill->ill_phyint->phyint_ipsq; 5781 mutex_enter(&ipsq->ipsq_lock); 5782 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 5783 mutex_exit(&ill->ill_lock); 5784 rw_exit(&ipst->ips_ill_g_lock); 5785 ipsq_enq(ipsq, q, mp, func, NEW_OP, 5786 ill); 5787 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 5788 mutex_exit(&ipsq->ipsq_lock); 5789 RELEASE_CONN_LOCK(q); 5790 if (error != NULL) 5791 
*error = EINPROGRESS; 5792 return (NULL); 5793 } 5794 } 5795 } 5796 mutex_exit(&ill->ill_lock); 5797 RELEASE_CONN_LOCK(q); 5798 } 5799 5800 /* If we already did the ptp case, then we are done */ 5801 if (ptp) { 5802 rw_exit(&ipst->ips_ill_g_lock); 5803 if (error != NULL) 5804 *error = ENXIO; 5805 return (NULL); 5806 } 5807 ptp = B_TRUE; 5808 goto repeat; 5809 } 5810 5811 /* 5812 * Check if the address exists in the system. 5813 * We don't hold the conn_lock as we will not perform defered ipsqueue 5814 * operation. 5815 */ 5816 boolean_t 5817 ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 5818 { 5819 ipif_t *ipif; 5820 ill_t *ill; 5821 ill_walk_context_t ctx; 5822 5823 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5824 5825 ill = ILL_START_WALK_V4(&ctx, ipst); 5826 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5827 mutex_enter(&ill->ill_lock); 5828 for (ipif = ill->ill_ipif; ipif != NULL; 5829 ipif = ipif->ipif_next) { 5830 if (zoneid != ALL_ZONES && 5831 zoneid != ipif->ipif_zoneid && 5832 ipif->ipif_zoneid != ALL_ZONES) 5833 continue; 5834 /* Allow the ipif to be down */ 5835 /* 5836 * XXX Different from ipif_lookup_addr(), we don't do 5837 * twice lookups. As from bind()'s point of view, we 5838 * may return once we find a match. 5839 */ 5840 if (((ipif->ipif_lcl_addr == addr) && 5841 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5842 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 5843 (ipif->ipif_pp_dst_addr == addr))) { 5844 /* 5845 * Allow bind() to be successful even if the 5846 * ipif is with IPIF_CHANGING bit set. 5847 */ 5848 mutex_exit(&ill->ill_lock); 5849 rw_exit(&ipst->ips_ill_g_lock); 5850 return (B_TRUE); 5851 } 5852 } 5853 mutex_exit(&ill->ill_lock); 5854 } 5855 5856 rw_exit(&ipst->ips_ill_g_lock); 5857 return (B_FALSE); 5858 } 5859 5860 /* 5861 * Lookup an ipif with the specified address. 
For point-to-point links we 5862 * look for matches on either the destination address or the local address, 5863 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 5864 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 5865 * (or illgrp if `match_ill' is in an IPMP group). 5866 */ 5867 ipif_t * 5868 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, 5869 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 5870 { 5871 return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, 5872 func, error, ipst)); 5873 } 5874 5875 /* 5876 * Special abbreviated version of ipif_lookup_addr() that doesn't match 5877 * `match_ill' across the IPMP group. This function is only needed in some 5878 * corner-cases; almost everything should use ipif_lookup_addr(). 5879 */ 5880 static ipif_t * 5881 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 5882 { 5883 ASSERT(match_ill != NULL); 5884 return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, 5885 NULL, NULL, NULL, NULL, ipst)); 5886 } 5887 5888 /* 5889 * Look for an ipif with the specified address. For point-point links 5890 * we look for matches on either the destination address and the local 5891 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 5892 * is set. 5893 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 5894 * ill (or illgrp if `match_ill' is in an IPMP group). 5895 * Return the zoneid for the ipif which matches. ALL_ZONES if no match. 5896 */ 5897 zoneid_t 5898 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 5899 { 5900 zoneid_t zoneid; 5901 ipif_t *ipif; 5902 ill_t *ill; 5903 boolean_t ptp = B_FALSE; 5904 ill_walk_context_t ctx; 5905 5906 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5907 /* 5908 * Repeat twice, first based on local addresses and 5909 * next time for pointopoint. 
5910 */ 5911 repeat: 5912 ill = ILL_START_WALK_V4(&ctx, ipst); 5913 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 5914 if (match_ill != NULL && ill != match_ill && 5915 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 5916 continue; 5917 } 5918 mutex_enter(&ill->ill_lock); 5919 for (ipif = ill->ill_ipif; ipif != NULL; 5920 ipif = ipif->ipif_next) { 5921 /* Allow the ipif to be down */ 5922 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 5923 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 5924 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 5925 (ipif->ipif_pp_dst_addr == addr)) && 5926 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 5927 zoneid = ipif->ipif_zoneid; 5928 mutex_exit(&ill->ill_lock); 5929 rw_exit(&ipst->ips_ill_g_lock); 5930 /* 5931 * If ipif_zoneid was ALL_ZONES then we have 5932 * a trusted extensions shared IP address. 5933 * In that case GLOBAL_ZONEID works to send. 5934 */ 5935 if (zoneid == ALL_ZONES) 5936 zoneid = GLOBAL_ZONEID; 5937 return (zoneid); 5938 } 5939 } 5940 mutex_exit(&ill->ill_lock); 5941 } 5942 5943 /* If we already did the ptp case, then we are done */ 5944 if (ptp) { 5945 rw_exit(&ipst->ips_ill_g_lock); 5946 return (ALL_ZONES); 5947 } 5948 ptp = B_TRUE; 5949 goto repeat; 5950 } 5951 5952 /* 5953 * Look for an ipif that matches the specified remote address i.e. the 5954 * ipif that would receive the specified packet. 5955 * First look for directly connected interfaces and then do a recursive 5956 * IRE lookup and pick the first ipif corresponding to the source address in the 5957 * ire. 5958 * Returns: held ipif 5959 */ 5960 ipif_t * 5961 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 5962 { 5963 ipif_t *ipif; 5964 ire_t *ire; 5965 ip_stack_t *ipst = ill->ill_ipst; 5966 5967 ASSERT(!ill->ill_isv6); 5968 5969 /* 5970 * Someone could be changing this ipif currently or change it 5971 * after we return this. Thus a few packets could use the old 5972 * old values. 
However structure updates/creates (ire, ilg, ilm etc) 5973 * will atomically be updated or cleaned up with the new value 5974 * Thus we don't need a lock to check the flags or other attrs below. 5975 */ 5976 mutex_enter(&ill->ill_lock); 5977 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 5978 if (!IPIF_CAN_LOOKUP(ipif)) 5979 continue; 5980 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 5981 ipif->ipif_zoneid != ALL_ZONES) 5982 continue; 5983 /* Allow the ipif to be down */ 5984 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 5985 if ((ipif->ipif_pp_dst_addr == addr) || 5986 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 5987 ipif->ipif_lcl_addr == addr)) { 5988 ipif_refhold_locked(ipif); 5989 mutex_exit(&ill->ill_lock); 5990 return (ipif); 5991 } 5992 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 5993 ipif_refhold_locked(ipif); 5994 mutex_exit(&ill->ill_lock); 5995 return (ipif); 5996 } 5997 } 5998 mutex_exit(&ill->ill_lock); 5999 ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, 6000 NULL, MATCH_IRE_RECURSIVE, ipst); 6001 if (ire != NULL) { 6002 /* 6003 * The callers of this function wants to know the 6004 * interface on which they have to send the replies 6005 * back. For IREs that have ire_stq and ire_ipif 6006 * derived from different ills, we really don't care 6007 * what we return here. 6008 */ 6009 ipif = ire->ire_ipif; 6010 if (ipif != NULL) { 6011 ipif_refhold(ipif); 6012 ire_refrele(ire); 6013 return (ipif); 6014 } 6015 ire_refrele(ire); 6016 } 6017 /* Pick the first interface */ 6018 ipif = ipif_get_next_ipif(NULL, ill); 6019 return (ipif); 6020 } 6021 6022 /* 6023 * This func does not prevent refcnt from increasing. 
But if 6024 * the caller has taken steps to that effect, then this func 6025 * can be used to determine whether the ill has become quiescent 6026 */ 6027 static boolean_t 6028 ill_is_quiescent(ill_t *ill) 6029 { 6030 ipif_t *ipif; 6031 6032 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6033 6034 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6035 if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { 6036 return (B_FALSE); 6037 } 6038 } 6039 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 6040 return (B_FALSE); 6041 } 6042 return (B_TRUE); 6043 } 6044 6045 boolean_t 6046 ill_is_freeable(ill_t *ill) 6047 { 6048 ipif_t *ipif; 6049 6050 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6051 6052 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 6053 if (ipif->ipif_refcnt != 0 || !IPIF_FREE_OK(ipif)) { 6054 return (B_FALSE); 6055 } 6056 } 6057 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 6058 return (B_FALSE); 6059 } 6060 return (B_TRUE); 6061 } 6062 6063 /* 6064 * This func does not prevent refcnt from increasing. But if 6065 * the caller has taken steps to that effect, then this func 6066 * can be used to determine whether the ipif has become quiescent 6067 */ 6068 static boolean_t 6069 ipif_is_quiescent(ipif_t *ipif) 6070 { 6071 ill_t *ill; 6072 6073 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6074 6075 if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { 6076 return (B_FALSE); 6077 } 6078 6079 ill = ipif->ipif_ill; 6080 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 6081 ill->ill_logical_down) { 6082 return (B_TRUE); 6083 } 6084 6085 /* This is the last ipif going down or being deleted on this ill */ 6086 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 6087 return (B_FALSE); 6088 } 6089 6090 return (B_TRUE); 6091 } 6092 6093 /* 6094 * return true if the ipif can be destroyed: the ipif has to be quiescent 6095 * with zero references from ire/nce/ilm to it. 
 */
static boolean_t
ipif_is_freeable(ipif_t *ipif)
{
	/* Caller holds the ill_lock; this is never used on ipif id 0 */
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT(ipif->ipif_id != 0);
	return (ipif->ipif_refcnt == 0 && IPIF_FREE_OK(ipif));
}

/*
 * The ipif/ill/ire has been refreled. Do the tail processing.
 * Determine if the ipif or ill in question has become quiescent and if so
 * wakeup close and/or restart any queued pending ioctl that is waiting
 * for the ipif_down (or ill_down)
 *
 * Called with ill_lock held; the lock is dropped on every path before
 * returning.
 */
void
ipif_ill_refrele_tail(ill_t *ill)
{
	mblk_t	*mp;
	conn_t	*connp;
	ipsq_t	*ipsq;
	ipxop_t	*ipx;
	ipif_t	*ipif;
	dl_notify_ind_t *dlindp;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
		/* ip_modclose() may be waiting */
		cv_broadcast(&ill->ill_cv);
	}

	/* Lock order used here: ill_lock, then ipsq_lock, then ipx_lock */
	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_waitfor == 0)	/* no one's waiting; bail */
		goto unlock;

	ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);

	ipif = ipx->ipx_pending_ipif;
	if (ipif->ipif_ill != ill) 	/* wait is for another ill; bail */
		goto unlock;

	/* Bail unless the condition being waited for now holds */
	switch (ipx->ipx_waitfor) {
	case IPIF_DOWN:
		if (!ipif_is_quiescent(ipif))
			goto unlock;
		break;
	case IPIF_FREE:
		if (!ipif_is_freeable(ipif))
			goto unlock;
		break;
	case ILL_DOWN:
		if (!ill_is_quiescent(ill))
			goto unlock;
		break;
	case ILL_FREE:
		/*
		 * ILL_FREE is only for loopback; normal ill teardown waits
		 * synchronously in ip_modclose() without using ipx_waitfor,
		 * handled by the cv_broadcast() at the top of this function.
		 */
		if (!ill_is_freeable(ill))
			goto unlock;
		break;
	default:
		cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
		    (void *)ipsq, ipx->ipx_waitfor);
	}

	ill_refhold_locked(ill);	/* for qwriter_ip() call below */
	mutex_exit(&ipx->ipx_lock);
	mp = ipsq_pending_mp_get(ipsq, &connp);
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ill->ill_lock);

	ASSERT(mp != NULL);
	/*
	 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
	 * we can only get here when the current operation decides it
	 * it needs to quiesce via ipsq_pending_mp_add().
	 */
	switch (mp->b_datap->db_type) {
	case M_PCPROTO:
	case M_PROTO:
		/*
		 * For now, only DL_NOTIFY_IND messages can use this facility.
		 */
		dlindp = (dl_notify_ind_t *)mp->b_rptr;
		ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);

		switch (dlindp->dl_notification) {
		case DL_NOTE_PHYS_ADDR:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_set_phys_addr_tail, CUR_OP, B_TRUE);
			return;
		case DL_NOTE_REPLUMB:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_replumb_tail, CUR_OP, B_TRUE);
			return;
		default:
			/* Unexpected; release the hold taken above */
			ASSERT(0);
			ill_refrele(ill);
		}
		break;

	case M_ERROR:
	case M_HANGUP:
		qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
		    B_TRUE);
		return;

	case M_IOCTL:
	case M_IOCDATA:
		/* Use the conn's write queue if there is one, else ill_wq */
		qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
		    ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
		return;

	default:
		cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
		    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
	}
	return;
unlock:
	/* Nothing to restart: drop all three locks */
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ill->ill_lock);
}

#ifdef DEBUG
/* Reuse trace buffer from beginning (if reached the end) and record trace */
static void
th_trace_rrecord(th_trace_t *th_trace)
{
	tr_buf_t *tr_buf;
	uint_t lastref;

	/* Advance to the next slot, wrapping at TR_BUF_MAX */
	lastref = th_trace->th_trace_lastref;
	lastref++;
	if (lastref == TR_BUF_MAX)
		lastref = 0;
	th_trace->th_trace_lastref = lastref;
	tr_buf = &th_trace->th_trbuf[lastref];
	/* Record the current time and call stack in the slot */
	tr_buf->tr_time = lbolt;
	tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
}

/*
 * Free a th_trace_t; passed to mod_hash_create_extended() as the value
 * destructor in th_trace_gethash(). The entry must have no references
 * outstanding.
 */
static void
th_trace_free(void *value)
{
	th_trace_t *th_trace = value;

	ASSERT(th_trace->th_refcnt == 0);
	kmem_free(th_trace, sizeof (*th_trace));
}

/*
 * Find or create the per-thread hash table used to track object references.
 * The ipst argument is NULL if we shouldn't allocate.
 *
 * Accesses per-thread data, so there's no need to lock here.
 *
 * Returns NULL if the calling thread has no hash and either ipst is NULL
 * or one of the KM_NOSLEEP allocations fails.
 */
static mod_hash_t *
th_trace_gethash(ip_stack_t *ipst)
{
	th_hash_t *thh;

	if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
		mod_hash_t *mh;
		char name[256];
		size_t objsize, rshift;
		int retv;

		if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
			return (NULL);
		(void) snprintf(name, sizeof (name), "th_trace_%p",
		    (void *)curthread);

		/*
		 * We use mod_hash_create_extended here rather than the more
		 * obvious mod_hash_create_ptrhash because the latter has a
		 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
		 * block.
		 */
		/* Hash shift is derived from the largest traced object type */
		objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
		    MAX(sizeof (ire_t), sizeof (nce_t)));
		rshift = highbit(objsize);
		mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
		    th_trace_free, mod_hash_byptr, (void *)rshift,
		    mod_hash_ptrkey_cmp, KM_NOSLEEP);
		if (mh == NULL) {
			kmem_free(thh, sizeof (*thh));
			return (NULL);
		}
		thh->thh_hash = mh;
		thh->thh_ipst = ipst;
		/*
		 * We trace ills, ipifs, ires, and nces.  All of these are
		 * per-IP-stack, so the lock on the thread list is as well.
		 */
		rw_enter(&ip_thread_rwlock, RW_WRITER);
		list_insert_tail(&ip_thread_list, thh);
		rw_exit(&ip_thread_rwlock);
		retv = tsd_set(ip_thread_data, thh);
		ASSERT(retv == 0);
	}
	return (thh != NULL ? thh->thh_hash : NULL);
}

/*
 * Record a reference hold on `obj' by the current thread.
 * Returns B_FALSE if the per-thread hash or a new trace buffer could not
 * be obtained or inserted; B_TRUE once the hold has been recorded.
 */
boolean_t
th_trace_ref(const void *obj, ip_stack_t *ipst)
{
	th_trace_t *th_trace;
	mod_hash_t *mh;
	mod_hash_val_t val;

	if ((mh = th_trace_gethash(ipst)) == NULL)
		return (B_FALSE);

	/*
	 * Attempt to locate the trace buffer for this obj and thread.
	 * If it does not exist, then allocate a new trace buffer and
	 * insert into the hash.
	 */
	if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
		th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
		if (th_trace == NULL)
			return (B_FALSE);

		th_trace->th_id = curthread;
		if (mod_hash_insert(mh, (mod_hash_key_t)obj,
		    (mod_hash_val_t)th_trace) != 0) {
			kmem_free(th_trace, sizeof (th_trace_t));
			return (B_FALSE);
		}
	} else {
		th_trace = (th_trace_t *)val;
	}

	ASSERT(th_trace->th_refcnt >= 0 &&
	    th_trace->th_refcnt < TR_BUF_MAX - 1);

	th_trace->th_refcnt++;
	th_trace_rrecord(th_trace);
	return (B_TRUE);
}

/*
 * For the purpose of tracing a reference release, we assume that global
 * tracing is always on and that the same thread initiated the reference hold
 * is releasing.
 */
void
th_trace_unref(const void *obj)
{
	int retv;
	mod_hash_t *mh;
	th_trace_t *th_trace;
	mod_hash_val_t val;

	/*
	 * NULL ipst: never allocate here.
	 * NOTE(review): the result is passed to mod_hash_find() without a
	 * NULL check; this relies on a prior th_trace_ref() by this thread.
	 */
	mh = th_trace_gethash(NULL);
	retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
	ASSERT(retv == 0);
	th_trace = (th_trace_t *)val;

	ASSERT(th_trace->th_refcnt > 0);
	th_trace->th_refcnt--;
	th_trace_rrecord(th_trace);
}

/*
 * If tracing has been disabled, then we assume that the reference counts are
 * now useless, and we clear them out before destroying the entries.
6371 */ 6372 void 6373 th_trace_cleanup(const void *obj, boolean_t trace_disable) 6374 { 6375 th_hash_t *thh; 6376 mod_hash_t *mh; 6377 mod_hash_val_t val; 6378 th_trace_t *th_trace; 6379 int retv; 6380 6381 rw_enter(&ip_thread_rwlock, RW_READER); 6382 for (thh = list_head(&ip_thread_list); thh != NULL; 6383 thh = list_next(&ip_thread_list, thh)) { 6384 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 6385 &val) == 0) { 6386 th_trace = (th_trace_t *)val; 6387 if (trace_disable) 6388 th_trace->th_refcnt = 0; 6389 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 6390 ASSERT(retv == 0); 6391 } 6392 } 6393 rw_exit(&ip_thread_rwlock); 6394 } 6395 6396 void 6397 ipif_trace_ref(ipif_t *ipif) 6398 { 6399 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6400 6401 if (ipif->ipif_trace_disable) 6402 return; 6403 6404 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 6405 ipif->ipif_trace_disable = B_TRUE; 6406 ipif_trace_cleanup(ipif); 6407 } 6408 } 6409 6410 void 6411 ipif_untrace_ref(ipif_t *ipif) 6412 { 6413 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6414 6415 if (!ipif->ipif_trace_disable) 6416 th_trace_unref(ipif); 6417 } 6418 6419 void 6420 ill_trace_ref(ill_t *ill) 6421 { 6422 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6423 6424 if (ill->ill_trace_disable) 6425 return; 6426 6427 if (!th_trace_ref(ill, ill->ill_ipst)) { 6428 ill->ill_trace_disable = B_TRUE; 6429 ill_trace_cleanup(ill); 6430 } 6431 } 6432 6433 void 6434 ill_untrace_ref(ill_t *ill) 6435 { 6436 ASSERT(MUTEX_HELD(&ill->ill_lock)); 6437 6438 if (!ill->ill_trace_disable) 6439 th_trace_unref(ill); 6440 } 6441 6442 /* 6443 * Called when ipif is unplumbed or when memory alloc fails. Note that on 6444 * failure, ipif_trace_disable is set. 6445 */ 6446 static void 6447 ipif_trace_cleanup(const ipif_t *ipif) 6448 { 6449 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 6450 } 6451 6452 /* 6453 * Called when ill is unplumbed or when memory alloc fails. Note that on 6454 * failure, ill_trace_disable is set. 
6455 */ 6456 static void 6457 ill_trace_cleanup(const ill_t *ill) 6458 { 6459 th_trace_cleanup(ill, ill->ill_trace_disable); 6460 } 6461 #endif /* DEBUG */ 6462 6463 void 6464 ipif_refhold_locked(ipif_t *ipif) 6465 { 6466 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6467 ipif->ipif_refcnt++; 6468 IPIF_TRACE_REF(ipif); 6469 } 6470 6471 void 6472 ipif_refhold(ipif_t *ipif) 6473 { 6474 ill_t *ill; 6475 6476 ill = ipif->ipif_ill; 6477 mutex_enter(&ill->ill_lock); 6478 ipif->ipif_refcnt++; 6479 IPIF_TRACE_REF(ipif); 6480 mutex_exit(&ill->ill_lock); 6481 } 6482 6483 /* 6484 * Must not be called while holding any locks. Otherwise if this is 6485 * the last reference to be released there is a chance of recursive mutex 6486 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 6487 * to restart an ioctl. 6488 */ 6489 void 6490 ipif_refrele(ipif_t *ipif) 6491 { 6492 ill_t *ill; 6493 6494 ill = ipif->ipif_ill; 6495 6496 mutex_enter(&ill->ill_lock); 6497 ASSERT(ipif->ipif_refcnt != 0); 6498 ipif->ipif_refcnt--; 6499 IPIF_UNTRACE_REF(ipif); 6500 if (ipif->ipif_refcnt != 0) { 6501 mutex_exit(&ill->ill_lock); 6502 return; 6503 } 6504 6505 /* Drops the ill_lock */ 6506 ipif_ill_refrele_tail(ill); 6507 } 6508 6509 ipif_t * 6510 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 6511 { 6512 ipif_t *ipif; 6513 6514 mutex_enter(&ill->ill_lock); 6515 for (ipif = (curr == NULL ? 
ill->ill_ipif : curr->ipif_next); 6516 ipif != NULL; ipif = ipif->ipif_next) { 6517 if (!IPIF_CAN_LOOKUP(ipif)) 6518 continue; 6519 ipif_refhold_locked(ipif); 6520 mutex_exit(&ill->ill_lock); 6521 return (ipif); 6522 } 6523 mutex_exit(&ill->ill_lock); 6524 return (NULL); 6525 } 6526 6527 /* 6528 * TODO: make this table extendible at run time 6529 * Return a pointer to the mac type info for 'mac_type' 6530 */ 6531 static ip_m_t * 6532 ip_m_lookup(t_uscalar_t mac_type) 6533 { 6534 ip_m_t *ipm; 6535 6536 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 6537 if (ipm->ip_m_mac_type == mac_type) 6538 return (ipm); 6539 return (NULL); 6540 } 6541 6542 /* 6543 * ip_rt_add is called to add an IPv4 route to the forwarding table. 6544 * ipif_arg is passed in to associate it with the correct interface. 6545 * We may need to restart this operation if the ipif cannot be looked up 6546 * due to an exclusive operation that is currently in progress. The restart 6547 * entry point is specified by 'func' 6548 */ 6549 int 6550 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 6551 ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg, 6552 boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func, 6553 struct rtsa_s *sp, ip_stack_t *ipst) 6554 { 6555 ire_t *ire; 6556 ire_t *gw_ire = NULL; 6557 ipif_t *ipif = NULL; 6558 boolean_t ipif_refheld = B_FALSE; 6559 uint_t type; 6560 int match_flags = MATCH_IRE_TYPE; 6561 int error; 6562 tsol_gc_t *gc = NULL; 6563 tsol_gcgrp_t *gcgrp = NULL; 6564 boolean_t gcgrp_xtraref = B_FALSE; 6565 6566 ip1dbg(("ip_rt_add:")); 6567 6568 if (ire_arg != NULL) 6569 *ire_arg = NULL; 6570 6571 /* 6572 * If this is the case of RTF_HOST being set, then we set the netmask 6573 * to all ones (regardless if one was supplied). 
6574 */ 6575 if (flags & RTF_HOST) 6576 mask = IP_HOST_MASK; 6577 6578 /* 6579 * Prevent routes with a zero gateway from being created (since 6580 * interfaces can currently be plumbed and brought up no assigned 6581 * address). 6582 */ 6583 if (gw_addr == 0) 6584 return (ENETUNREACH); 6585 /* 6586 * Get the ipif, if any, corresponding to the gw_addr 6587 */ 6588 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error, 6589 ipst); 6590 if (ipif != NULL) { 6591 if (IS_VNI(ipif->ipif_ill)) { 6592 ipif_refrele(ipif); 6593 return (EINVAL); 6594 } 6595 ipif_refheld = B_TRUE; 6596 } else if (error == EINPROGRESS) { 6597 ip1dbg(("ip_rt_add: null and EINPROGRESS")); 6598 return (EINPROGRESS); 6599 } else { 6600 error = 0; 6601 } 6602 6603 if (ipif != NULL) { 6604 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); 6605 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 6606 } else { 6607 ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); 6608 } 6609 6610 /* 6611 * GateD will attempt to create routes with a loopback interface 6612 * address as the gateway and with RTF_GATEWAY set. We allow 6613 * these routes to be added, but create them as interface routes 6614 * since the gateway is an interface address. 
6615 */ 6616 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 6617 flags &= ~RTF_GATEWAY; 6618 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 6619 mask == IP_HOST_MASK) { 6620 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 6621 ALL_ZONES, NULL, match_flags, ipst); 6622 if (ire != NULL) { 6623 ire_refrele(ire); 6624 if (ipif_refheld) 6625 ipif_refrele(ipif); 6626 return (EEXIST); 6627 } 6628 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" 6629 "for 0x%x\n", (void *)ipif, 6630 ipif->ipif_ire_type, 6631 ntohl(ipif->ipif_lcl_addr))); 6632 ire = ire_create( 6633 (uchar_t *)&dst_addr, /* dest address */ 6634 (uchar_t *)&mask, /* mask */ 6635 (uchar_t *)&ipif->ipif_src_addr, 6636 NULL, /* no gateway */ 6637 &ipif->ipif_mtu, 6638 NULL, 6639 ipif->ipif_rq, /* recv-from queue */ 6640 NULL, /* no send-to queue */ 6641 ipif->ipif_ire_type, /* LOOPBACK */ 6642 ipif, 6643 0, 6644 0, 6645 0, 6646 (ipif->ipif_flags & IPIF_PRIVATE) ? 6647 RTF_PRIVATE : 0, 6648 &ire_uinfo_null, 6649 NULL, 6650 NULL, 6651 ipst); 6652 6653 if (ire == NULL) { 6654 if (ipif_refheld) 6655 ipif_refrele(ipif); 6656 return (ENOMEM); 6657 } 6658 error = ire_add(&ire, q, mp, func, B_FALSE); 6659 if (error == 0) 6660 goto save_ire; 6661 if (ipif_refheld) 6662 ipif_refrele(ipif); 6663 return (error); 6664 6665 } 6666 } 6667 6668 /* 6669 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 6670 * and the gateway address provided is one of the system's interface 6671 * addresses. By using the routing socket interface and supplying an 6672 * RTA_IFP sockaddr with an interface index, an alternate method of 6673 * specifying an interface route to be created is available which uses 6674 * the interface index that specifies the outgoing interface rather than 6675 * the address of an outgoing interface (which may not be able to 6676 * uniquely identify an interface). 
When coupled with the RTF_GATEWAY 6677 * flag, routes can be specified which not only specify the next-hop to 6678 * be used when routing to a certain prefix, but also which outgoing 6679 * interface should be used. 6680 * 6681 * Previously, interfaces would have unique addresses assigned to them 6682 * and so the address assigned to a particular interface could be used 6683 * to identify a particular interface. One exception to this was the 6684 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 6685 * 6686 * With the advent of IPv6 and its link-local addresses, this 6687 * restriction was relaxed and interfaces could share addresses between 6688 * themselves. In fact, typically all of the link-local interfaces on 6689 * an IPv6 node or router will have the same link-local address. In 6690 * order to differentiate between these interfaces, the use of an 6691 * interface index is necessary and this index can be carried inside a 6692 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 6693 * of using the interface index, however, is that all of the ipif's that 6694 * are part of an ill have the same index and so the RTA_IFP sockaddr 6695 * cannot be used to differentiate between ipif's (or logical 6696 * interfaces) that belong to the same ill (physical interface). 6697 * 6698 * For example, in the following case involving IPv4 interfaces and 6699 * logical interfaces 6700 * 6701 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 6702 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 6703 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 6704 * 6705 * the ipif's corresponding to each of these interface routes can be 6706 * uniquely identified by the "gateway" (actually interface address). 
6707 * 6708 * In this case involving multiple IPv6 default routes to a particular 6709 * link-local gateway, the use of RTA_IFP is necessary to specify which 6710 * default route is of interest: 6711 * 6712 * default fe80::123:4567:89ab:cdef U if0 6713 * default fe80::123:4567:89ab:cdef U if1 6714 */ 6715 6716 /* RTF_GATEWAY not set */ 6717 if (!(flags & RTF_GATEWAY)) { 6718 queue_t *stq; 6719 6720 if (sp != NULL) { 6721 ip2dbg(("ip_rt_add: gateway security attributes " 6722 "cannot be set with interface route\n")); 6723 if (ipif_refheld) 6724 ipif_refrele(ipif); 6725 return (EINVAL); 6726 } 6727 6728 /* 6729 * As the interface index specified with the RTA_IFP sockaddr is 6730 * the same for all ipif's off of an ill, the matching logic 6731 * below uses MATCH_IRE_ILL if such an index was specified. 6732 * This means that routes sharing the same prefix when added 6733 * using a RTA_IFP sockaddr must have distinct interface 6734 * indices (namely, they must be on distinct ill's). 6735 * 6736 * On the other hand, since the gateway address will usually be 6737 * different for each ipif on the system, the matching logic 6738 * uses MATCH_IRE_IPIF in the case of a traditional interface 6739 * route. This means that interface routes for the same prefix 6740 * can be created if they belong to distinct ipif's and if a 6741 * RTA_IFP sockaddr is not present. 6742 */ 6743 if (ipif_arg != NULL) { 6744 if (ipif_refheld) { 6745 ipif_refrele(ipif); 6746 ipif_refheld = B_FALSE; 6747 } 6748 ipif = ipif_arg; 6749 match_flags |= MATCH_IRE_ILL; 6750 } else { 6751 /* 6752 * Check the ipif corresponding to the gw_addr 6753 */ 6754 if (ipif == NULL) 6755 return (ENETUNREACH); 6756 match_flags |= MATCH_IRE_IPIF; 6757 } 6758 ASSERT(ipif != NULL); 6759 6760 /* 6761 * We check for an existing entry at this point. 6762 * 6763 * Since a netmask isn't passed in via the ioctl interface 6764 * (SIOCADDRT), we don't check for a matching netmask in that 6765 * case. 
6766 */ 6767 if (!ioctl_msg) 6768 match_flags |= MATCH_IRE_MASK; 6769 ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif, 6770 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 6771 if (ire != NULL) { 6772 ire_refrele(ire); 6773 if (ipif_refheld) 6774 ipif_refrele(ipif); 6775 return (EEXIST); 6776 } 6777 6778 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 6779 ? ipif->ipif_rq : ipif->ipif_wq; 6780 6781 /* 6782 * Create a copy of the IRE_LOOPBACK, 6783 * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with 6784 * the modified address and netmask. 6785 */ 6786 ire = ire_create( 6787 (uchar_t *)&dst_addr, 6788 (uint8_t *)&mask, 6789 (uint8_t *)&ipif->ipif_src_addr, 6790 NULL, 6791 &ipif->ipif_mtu, 6792 NULL, 6793 NULL, 6794 stq, 6795 ipif->ipif_net_type, 6796 ipif, 6797 0, 6798 0, 6799 0, 6800 flags, 6801 &ire_uinfo_null, 6802 NULL, 6803 NULL, 6804 ipst); 6805 if (ire == NULL) { 6806 if (ipif_refheld) 6807 ipif_refrele(ipif); 6808 return (ENOMEM); 6809 } 6810 6811 /* 6812 * Some software (for example, GateD and Sun Cluster) attempts 6813 * to create (what amount to) IRE_PREFIX routes with the 6814 * loopback address as the gateway. This is primarily done to 6815 * set up prefixes with the RTF_REJECT flag set (for example, 6816 * when generating aggregate routes.) 6817 * 6818 * If the IRE type (as defined by ipif->ipif_net_type) is 6819 * IRE_LOOPBACK, then we map the request into a 6820 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 6821 * these interface routes, by definition, can only be that. 6822 * 6823 * Needless to say, the real IRE_LOOPBACK is NOT created by this 6824 * routine, but rather using ire_create() directly. 
6825 * 6826 */ 6827 if (ipif->ipif_net_type == IRE_LOOPBACK) { 6828 ire->ire_type = IRE_IF_NORESOLVER; 6829 ire->ire_flags |= RTF_BLACKHOLE; 6830 } 6831 6832 error = ire_add(&ire, q, mp, func, B_FALSE); 6833 if (error == 0) 6834 goto save_ire; 6835 6836 /* 6837 * In the result of failure, ire_add() will have already 6838 * deleted the ire in question, so there is no need to 6839 * do that here. 6840 */ 6841 if (ipif_refheld) 6842 ipif_refrele(ipif); 6843 return (error); 6844 } 6845 if (ipif_refheld) { 6846 ipif_refrele(ipif); 6847 ipif_refheld = B_FALSE; 6848 } 6849 6850 /* 6851 * Get an interface IRE for the specified gateway. 6852 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 6853 * gateway, it is currently unreachable and we fail the request 6854 * accordingly. 6855 */ 6856 ipif = ipif_arg; 6857 if (ipif_arg != NULL) 6858 match_flags |= MATCH_IRE_ILL; 6859 again: 6860 gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, 6861 ALL_ZONES, 0, NULL, match_flags, ipst); 6862 if (gw_ire == NULL) { 6863 /* 6864 * With IPMP, we allow host routes to influence in.mpathd's 6865 * target selection. However, if the test addresses are on 6866 * their own network, the above lookup will fail since the 6867 * underlying IRE_INTERFACEs are marked hidden. So allow 6868 * hidden test IREs to be found and try again. 6869 */ 6870 if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { 6871 match_flags |= MATCH_IRE_MARK_TESTHIDDEN; 6872 goto again; 6873 } 6874 return (ENETUNREACH); 6875 } 6876 6877 /* 6878 * We create one of three types of IREs as a result of this request 6879 * based on the netmask. A netmask of all ones (which is automatically 6880 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 6881 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 6882 * created. Otherwise, an IRE_PREFIX route is created for the 6883 * destination prefix. 
6884 */ 6885 if (mask == IP_HOST_MASK) 6886 type = IRE_HOST; 6887 else if (mask == 0) 6888 type = IRE_DEFAULT; 6889 else 6890 type = IRE_PREFIX; 6891 6892 /* check for a duplicate entry */ 6893 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 6894 NULL, ALL_ZONES, 0, NULL, 6895 match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); 6896 if (ire != NULL) { 6897 ire_refrele(gw_ire); 6898 ire_refrele(ire); 6899 return (EEXIST); 6900 } 6901 6902 /* Security attribute exists */ 6903 if (sp != NULL) { 6904 tsol_gcgrp_addr_t ga; 6905 6906 /* find or create the gateway credentials group */ 6907 ga.ga_af = AF_INET; 6908 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 6909 6910 /* we hold reference to it upon success */ 6911 gcgrp = gcgrp_lookup(&ga, B_TRUE); 6912 if (gcgrp == NULL) { 6913 ire_refrele(gw_ire); 6914 return (ENOMEM); 6915 } 6916 6917 /* 6918 * Create and add the security attribute to the group; a 6919 * reference to the group is made upon allocating a new 6920 * entry successfully. If it finds an already-existing 6921 * entry for the security attribute in the group, it simply 6922 * returns it and no new reference is made to the group. 6923 */ 6924 gc = gc_create(sp, gcgrp, &gcgrp_xtraref); 6925 if (gc == NULL) { 6926 /* release reference held by gcgrp_lookup */ 6927 GCGRP_REFRELE(gcgrp); 6928 ire_refrele(gw_ire); 6929 return (ENOMEM); 6930 } 6931 } 6932 6933 /* Create the IRE. */ 6934 ire = ire_create( 6935 (uchar_t *)&dst_addr, /* dest address */ 6936 (uchar_t *)&mask, /* mask */ 6937 /* src address assigned by the caller? */ 6938 (uchar_t *)(((src_addr != INADDR_ANY) && 6939 (flags & RTF_SETSRC)) ? 
&src_addr : NULL), 6940 (uchar_t *)&gw_addr, /* gateway address */ 6941 &gw_ire->ire_max_frag, 6942 NULL, /* no src nce */ 6943 NULL, /* no recv-from queue */ 6944 NULL, /* no send-to queue */ 6945 (ushort_t)type, /* IRE type */ 6946 ipif_arg, 6947 0, 6948 0, 6949 0, 6950 flags, 6951 &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ 6952 gc, /* security attribute */ 6953 NULL, 6954 ipst); 6955 6956 /* 6957 * The ire holds a reference to the 'gc' and the 'gc' holds a 6958 * reference to the 'gcgrp'. We can now release the extra reference 6959 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used. 6960 */ 6961 if (gcgrp_xtraref) 6962 GCGRP_REFRELE(gcgrp); 6963 if (ire == NULL) { 6964 if (gc != NULL) 6965 GC_REFRELE(gc); 6966 ire_refrele(gw_ire); 6967 return (ENOMEM); 6968 } 6969 6970 /* 6971 * POLICY: should we allow an RTF_HOST with address INADDR_ANY? 6972 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? 6973 */ 6974 6975 /* Add the new IRE. */ 6976 error = ire_add(&ire, q, mp, func, B_FALSE); 6977 if (error != 0) { 6978 /* 6979 * In the result of failure, ire_add() will have already 6980 * deleted the ire in question, so there is no need to 6981 * do that here. 6982 */ 6983 ire_refrele(gw_ire); 6984 return (error); 6985 } 6986 6987 if (flags & RTF_MULTIRT) { 6988 /* 6989 * Invoke the CGTP (multirouting) filtering module 6990 * to add the dst address in the filtering database. 6991 * Replicated inbound packets coming from that address 6992 * will be filtered to discard the duplicates. 6993 * It is not necessary to call the CGTP filter hook 6994 * when the dst address is a broadcast or multicast, 6995 * because an IP source address cannot be a broadcast 6996 * or a multicast. 
6997 */ 6998 ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, 6999 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 7000 if (ire_dst != NULL) { 7001 ip_cgtp_bcast_add(ire, ire_dst, ipst); 7002 ire_refrele(ire_dst); 7003 goto save_ire; 7004 } 7005 if (ipst->ips_ip_cgtp_filter_ops != NULL && 7006 !CLASSD(ire->ire_addr)) { 7007 int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4( 7008 ipst->ips_netstack->netstack_stackid, 7009 ire->ire_addr, 7010 ire->ire_gateway_addr, 7011 ire->ire_src_addr, 7012 gw_ire->ire_src_addr); 7013 if (res != 0) { 7014 ire_refrele(gw_ire); 7015 ire_delete(ire); 7016 return (res); 7017 } 7018 } 7019 } 7020 7021 /* 7022 * Now that the prefix IRE entry has been created, delete any 7023 * existing gateway IRE cache entries as well as any IRE caches 7024 * using the gateway, and force them to be created through 7025 * ip_newroute. 7026 */ 7027 if (gc != NULL) { 7028 ASSERT(gcgrp != NULL); 7029 ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst); 7030 } 7031 7032 save_ire: 7033 if (gw_ire != NULL) { 7034 ire_refrele(gw_ire); 7035 } 7036 if (ipif != NULL) { 7037 /* 7038 * Save enough information so that we can recreate the IRE if 7039 * the interface goes down and then up. The metrics associated 7040 * with the route will be saved as well when rts_setmetrics() is 7041 * called after the IRE has been created. In the case where 7042 * memory cannot be allocated, none of this information will be 7043 * saved. 7044 */ 7045 ipif_save_ire(ipif, ire); 7046 } 7047 if (ioctl_msg) 7048 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 7049 if (ire_arg != NULL) { 7050 /* 7051 * Store the ire that was successfully added into where ire_arg 7052 * points to so that callers don't have to look it up 7053 * themselves (but they are responsible for ire_refrele()ing 7054 * the ire when they are finished with it). 
7055 */ 7056 *ire_arg = ire; 7057 } else { 7058 ire_refrele(ire); /* Held in ire_add */ 7059 } 7060 if (ipif_refheld) 7061 ipif_refrele(ipif); 7062 return (0); 7063 } 7064 7065 /* 7066 * ip_rt_delete is called to delete an IPv4 route. 7067 * ipif_arg is passed in to associate it with the correct interface. 7068 * We may need to restart this operation if the ipif cannot be looked up 7069 * due to an exclusive operation that is currently in progress. The restart 7070 * entry point is specified by 'func' 7071 */ 7072 /* ARGSUSED4 */ 7073 int 7074 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 7075 uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg, 7076 queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) 7077 { 7078 ire_t *ire = NULL; 7079 ipif_t *ipif; 7080 boolean_t ipif_refheld = B_FALSE; 7081 uint_t type; 7082 uint_t match_flags = MATCH_IRE_TYPE; 7083 int err = 0; 7084 7085 ip1dbg(("ip_rt_delete:")); 7086 /* 7087 * If this is the case of RTF_HOST being set, then we set the netmask 7088 * to all ones. Otherwise, we use the netmask if one was supplied. 7089 */ 7090 if (flags & RTF_HOST) { 7091 mask = IP_HOST_MASK; 7092 match_flags |= MATCH_IRE_MASK; 7093 } else if (rtm_addrs & RTA_NETMASK) { 7094 match_flags |= MATCH_IRE_MASK; 7095 } 7096 7097 /* 7098 * Note that RTF_GATEWAY is never set on a delete, therefore 7099 * we check if the gateway address is one of our interfaces first, 7100 * and fall back on RTF_GATEWAY routes. 7101 * 7102 * This makes it possible to delete an original 7103 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 7104 * 7105 * As the interface index specified with the RTA_IFP sockaddr is the 7106 * same for all ipif's off of an ill, the matching logic below uses 7107 * MATCH_IRE_ILL if such an index was specified. 
This means a route 7108 * sharing the same prefix and interface index as the the route 7109 * intended to be deleted might be deleted instead if a RTA_IFP sockaddr 7110 * is specified in the request. 7111 * 7112 * On the other hand, since the gateway address will usually be 7113 * different for each ipif on the system, the matching logic 7114 * uses MATCH_IRE_IPIF in the case of a traditional interface 7115 * route. This means that interface routes for the same prefix can be 7116 * uniquely identified if they belong to distinct ipif's and if a 7117 * RTA_IFP sockaddr is not present. 7118 * 7119 * For more detail on specifying routes by gateway address and by 7120 * interface index, see the comments in ip_rt_add(). 7121 */ 7122 ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err, 7123 ipst); 7124 if (ipif != NULL) 7125 ipif_refheld = B_TRUE; 7126 else if (err == EINPROGRESS) 7127 return (err); 7128 else 7129 err = 0; 7130 if (ipif != NULL) { 7131 if (ipif_arg != NULL) { 7132 if (ipif_refheld) { 7133 ipif_refrele(ipif); 7134 ipif_refheld = B_FALSE; 7135 } 7136 ipif = ipif_arg; 7137 match_flags |= MATCH_IRE_ILL; 7138 } else { 7139 match_flags |= MATCH_IRE_IPIF; 7140 } 7141 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 7142 ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, 7143 ALL_ZONES, NULL, match_flags, ipst); 7144 } 7145 if (ire == NULL) { 7146 ire = ire_ftable_lookup(dst_addr, mask, 0, 7147 IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, 7148 match_flags, ipst); 7149 } 7150 } 7151 7152 if (ire == NULL) { 7153 /* 7154 * At this point, the gateway address is not one of our own 7155 * addresses or a matching interface route was not found. We 7156 * set the IRE type to lookup based on whether 7157 * this is a host route, a default route or just a prefix. 7158 * 7159 * If an ipif_arg was passed in, then the lookup is based on an 7160 * interface index so MATCH_IRE_ILL is added to match_flags. 
7161 * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is 7162 * set as the route being looked up is not a traditional 7163 * interface route. 7164 */ 7165 match_flags &= ~MATCH_IRE_IPIF; 7166 match_flags |= MATCH_IRE_GW; 7167 if (ipif_arg != NULL) 7168 match_flags |= MATCH_IRE_ILL; 7169 if (mask == IP_HOST_MASK) 7170 type = IRE_HOST; 7171 else if (mask == 0) 7172 type = IRE_DEFAULT; 7173 else 7174 type = IRE_PREFIX; 7175 ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, 7176 NULL, ALL_ZONES, 0, NULL, match_flags, ipst); 7177 } 7178 7179 if (ipif_refheld) 7180 ipif_refrele(ipif); 7181 7182 /* ipif is not refheld anymore */ 7183 if (ire == NULL) 7184 return (ESRCH); 7185 7186 if (ire->ire_flags & RTF_MULTIRT) { 7187 /* 7188 * Invoke the CGTP (multirouting) filtering module 7189 * to remove the dst address from the filtering database. 7190 * Packets coming from that address will no longer be 7191 * filtered to remove duplicates. 7192 */ 7193 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 7194 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 7195 ipst->ips_netstack->netstack_stackid, 7196 ire->ire_addr, ire->ire_gateway_addr); 7197 } 7198 ip_cgtp_bcast_delete(ire, ipst); 7199 } 7200 7201 ipif = ire->ire_ipif; 7202 if (ipif != NULL) 7203 ipif_remove_ire(ipif, ire); 7204 if (ioctl_msg) 7205 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 7206 ire_delete(ire); 7207 ire_refrele(ire); 7208 return (err); 7209 } 7210 7211 /* 7212 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
7213 */ 7214 /* ARGSUSED */ 7215 int 7216 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7217 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7218 { 7219 ipaddr_t dst_addr; 7220 ipaddr_t gw_addr; 7221 ipaddr_t mask; 7222 int error = 0; 7223 mblk_t *mp1; 7224 struct rtentry *rt; 7225 ipif_t *ipif = NULL; 7226 ip_stack_t *ipst; 7227 7228 ASSERT(q->q_next == NULL); 7229 ipst = CONNQ_TO_IPST(q); 7230 7231 ip1dbg(("ip_siocaddrt:")); 7232 /* Existence of mp1 verified in ip_wput_nondata */ 7233 mp1 = mp->b_cont->b_cont; 7234 rt = (struct rtentry *)mp1->b_rptr; 7235 7236 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7237 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7238 7239 /* 7240 * If the RTF_HOST flag is on, this is a request to assign a gateway 7241 * to a particular host address. In this case, we set the netmask to 7242 * all ones for the particular destination address. Otherwise, 7243 * determine the netmask to be used based on dst_addr and the interfaces 7244 * in use. 7245 */ 7246 if (rt->rt_flags & RTF_HOST) { 7247 mask = IP_HOST_MASK; 7248 } else { 7249 /* 7250 * Note that ip_subnet_mask returns a zero mask in the case of 7251 * default (an all-zeroes address). 7252 */ 7253 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7254 } 7255 7256 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 7257 B_TRUE, q, mp, ip_process_ioctl, NULL, ipst); 7258 if (ipif != NULL) 7259 ipif_refrele(ipif); 7260 return (error); 7261 } 7262 7263 /* 7264 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 
7265 */ 7266 /* ARGSUSED */ 7267 int 7268 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 7269 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 7270 { 7271 ipaddr_t dst_addr; 7272 ipaddr_t gw_addr; 7273 ipaddr_t mask; 7274 int error; 7275 mblk_t *mp1; 7276 struct rtentry *rt; 7277 ipif_t *ipif = NULL; 7278 ip_stack_t *ipst; 7279 7280 ASSERT(q->q_next == NULL); 7281 ipst = CONNQ_TO_IPST(q); 7282 7283 ip1dbg(("ip_siocdelrt:")); 7284 /* Existence of mp1 verified in ip_wput_nondata */ 7285 mp1 = mp->b_cont->b_cont; 7286 rt = (struct rtentry *)mp1->b_rptr; 7287 7288 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 7289 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 7290 7291 /* 7292 * If the RTF_HOST flag is on, this is a request to delete a gateway 7293 * to a particular host address. In this case, we set the netmask to 7294 * all ones for the particular destination address. Otherwise, 7295 * determine the netmask to be used based on dst_addr and the interfaces 7296 * in use. 7297 */ 7298 if (rt->rt_flags & RTF_HOST) { 7299 mask = IP_HOST_MASK; 7300 } else { 7301 /* 7302 * Note that ip_subnet_mask returns a zero mask in the case of 7303 * default (an all-zeroes address). 7304 */ 7305 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 7306 } 7307 7308 error = ip_rt_delete(dst_addr, mask, gw_addr, 7309 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q, 7310 mp, ip_process_ioctl, ipst); 7311 if (ipif != NULL) 7312 ipif_refrele(ipif); 7313 return (error); 7314 } 7315 7316 /* 7317 * Enqueue the mp onto the ipsq, chained by b_next. 7318 * b_prev stores the function to be executed later, and b_queue the queue 7319 * where this mp originated. 
7320 */ 7321 void 7322 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7323 ill_t *pending_ill) 7324 { 7325 conn_t *connp; 7326 ipxop_t *ipx = ipsq->ipsq_xop; 7327 7328 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 7329 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 7330 ASSERT(func != NULL); 7331 7332 mp->b_queue = q; 7333 mp->b_prev = (void *)func; 7334 mp->b_next = NULL; 7335 7336 switch (type) { 7337 case CUR_OP: 7338 if (ipx->ipx_mptail != NULL) { 7339 ASSERT(ipx->ipx_mphead != NULL); 7340 ipx->ipx_mptail->b_next = mp; 7341 } else { 7342 ASSERT(ipx->ipx_mphead == NULL); 7343 ipx->ipx_mphead = mp; 7344 } 7345 ipx->ipx_mptail = mp; 7346 break; 7347 7348 case NEW_OP: 7349 if (ipsq->ipsq_xopq_mptail != NULL) { 7350 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 7351 ipsq->ipsq_xopq_mptail->b_next = mp; 7352 } else { 7353 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 7354 ipsq->ipsq_xopq_mphead = mp; 7355 } 7356 ipsq->ipsq_xopq_mptail = mp; 7357 ipx->ipx_ipsq_queued = B_TRUE; 7358 break; 7359 7360 case SWITCH_OP: 7361 ASSERT(ipsq->ipsq_swxop != NULL); 7362 /* only one switch operation is currently allowed */ 7363 ASSERT(ipsq->ipsq_switch_mp == NULL); 7364 ipsq->ipsq_switch_mp = mp; 7365 ipx->ipx_ipsq_queued = B_TRUE; 7366 break; 7367 default: 7368 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 7369 } 7370 7371 if (CONN_Q(q) && pending_ill != NULL) { 7372 connp = Q_TO_CONN(q); 7373 ASSERT(MUTEX_HELD(&connp->conn_lock)); 7374 connp->conn_oper_pending_ill = pending_ill; 7375 } 7376 } 7377 7378 /* 7379 * Dequeue the next message that requested exclusive access to this IPSQ's 7380 * xop. Specifically: 7381 * 7382 * 1. If we're still processing the current operation on `ipsq', then 7383 * dequeue the next message for the operation (from ipx_mphead), or 7384 * return NULL if there are no queued messages for the operation. 7385 * These messages are queued via CUR_OP to qwriter_ip() and friends. 7386 * 7387 * 2. 
 *     If the current operation on `ipsq' has completed (ipx_current_ipif is
 *     not set) see if the ipsq has requested an xop switch.  If so, switch
 *     `ipsq' to a different xop.  Xop switches only happen when joining or
 *     leaving IPMP groups and require a careful dance -- see the comments
 *     in-line below for details.  If we're leaving a group xop or if we're
 *     joining a group xop and become writer on it, then we proceed to (3).
 *     Otherwise, we return NULL and exit the xop.
 *
 *  3. For each IPSQ in the xop, return any switch operation stored on
 *     ipsq_switch_mp (set via SWITCH_OP); these must be processed before
 *     any other messages queued on the IPSQ.  Otherwise, dequeue the next
 *     exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
 *     Note that if the phyint tied to `ipsq' is not using IPMP there will
 *     only be one IPSQ in the xop.  Otherwise, there will be one IPSQ for
 *     each phyint in the group, including the IPMP meta-interface phyint.
 *
 * A returned message has its b_next set to the IPSQ it was dequeued from.
 * When NULL is returned via the `empty' path, writer status on the xop has
 * been given up and threads waiting on the ills' ill_cv have been woken.
 */
static mblk_t *
ipsq_dq(ipsq_t *ipsq)
{
	ill_t		*illv4, *illv6;
	mblk_t		*mp;
	ipsq_t		*xopipsq;
	ipsq_t		*leftipsq = NULL;	/* IPSQ in a group xop we left */
	ipxop_t		*ipx;
	phyint_t	*phyi = ipsq->ipsq_phyint;
	ip_stack_t	*ipst = ipsq->ipsq_ipst;
	boolean_t	emptied = B_FALSE;	/* set once writer is given up */

	/*
	 * Grab all the locks we need in the defined order (ill_g_lock ->
	 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
	 * It is taken as writer when an xop switch is pending, because the
	 * switch below relinks the ipsq_next list.
	 */
	rw_enter(&ipst->ips_ill_g_lock,
	    ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);

	/*
	 * Dequeue the next message associated with the current exclusive
	 * operation, if any.
	 */
	if ((mp = ipx->ipx_mphead) != NULL) {
		ipx->ipx_mphead = mp->b_next;
		if (ipx->ipx_mphead == NULL)
			ipx->ipx_mptail = NULL;
		/* b_next is reused to tell the caller which IPSQ this is for */
		mp->b_next = (void *)ipsq;
		goto out;
	}

	/*
	 * The current operation is still in progress but has nothing queued
	 * for it: give up writer status without starting anything new.
	 */
	if (ipx->ipx_current_ipif != NULL)
		goto empty;

	if (ipsq->ipsq_swxop != NULL) {
		/*
		 * The exclusive operation that is now being completed has
		 * requested a switch to a different xop.  This happens
		 * when an interface joins or leaves an IPMP group.  Joins
		 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
		 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
		 * (phyint_free()), or interface plumb for an ill type
		 * not in the IPMP group (ip_rput_dlpi_writer()).
		 *
		 * Xop switches are not allowed on the IPMP meta-interface.
		 */
		ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
		ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
		DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);

		if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
			/*
			 * We're switching back to our own xop, so we have two
			 * xop's to drain/exit: our own, and the group xop
			 * that we are leaving.
			 *
			 * First, pull ourselves out of the group ipsq list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);

			/* Find our predecessor in the circular ipsq_next list */
			xopipsq = ipx->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq->ipsq_next;
			ipsq->ipsq_next = ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, prepare to exit the group xop.  The actual
			 * ipsq_exit() is done at the end of this function
			 * since we cannot hold any locks across ipsq_exit().
			 * Note that although we drop the group's ipx_lock, no
			 * threads can proceed since we're still ipx_writer.
			 */
			leftipsq = xopipsq;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our own xop (which was
			 * inactive and therefore can be entered).
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			ASSERT(ipx->ipx_writer == NULL);
			ASSERT(ipx->ipx_current_ipif == NULL);
		} else {
			/*
			 * We're switching from our own xop to a group xop.
			 * The requestor of the switch must ensure that the
			 * group xop cannot go away (e.g. by ensuring the
			 * phyint associated with the xop cannot go away).
			 *
			 * If we can become writer on our new xop, then we'll
			 * do the drain.  Otherwise, the current writer of our
			 * new xop will do the drain when it exits.
			 *
			 * First, splice ourselves into the group IPSQ list.
			 * This is safe since we're writer on ill_g_lock.
			 */
			ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);

			/* Find the last IPSQ in the group's circular list */
			xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
			while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
				xopipsq = xopipsq->ipsq_next;

			xopipsq->ipsq_next = ipsq;
			ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
			ipsq->ipsq_xop = ipsq->ipsq_swxop;
			ipsq->ipsq_swxop = NULL;

			/*
			 * Second, exit our own xop, since it's now unused.
			 * This is safe since we've got the only reference.
			 */
			ASSERT(ipx->ipx_writer == curthread);
			ipx->ipx_writer = NULL;
			VERIFY(--ipx->ipx_reentry_cnt == 0);
			ipx->ipx_ipsq_queued = B_FALSE;
			mutex_exit(&ipx->ipx_lock);

			/*
			 * Third, set ipx to point to our new xop, and check
			 * if we can become writer on it.  If we cannot, then
			 * the current writer will drain the IPSQ group when
			 * it exits.  Our ipsq_xop is guaranteed to be stable
			 * because we're still holding ipsq_lock.
			 */
			ipx = ipsq->ipsq_xop;
			mutex_enter(&ipx->ipx_lock);
			if (ipx->ipx_writer != NULL ||
			    ipx->ipx_current_ipif != NULL) {
				/* mp is NULL here, so NULL is returned */
				goto out;
			}
		}

		/*
		 * Fourth, become writer on our new ipx before we continue
		 * with the drain.  Note that we never dropped ipsq_lock
		 * above, so no other thread could've raced with us to
		 * become writer first.  Also, we're holding ipx_lock, so
		 * no other thread can examine the ipx right now.
		 */
		ASSERT(ipx->ipx_current_ipif == NULL);
		ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
		VERIFY(ipx->ipx_reentry_cnt++ == 0);
		ipx->ipx_writer = curthread;
		ipx->ipx_forced = B_FALSE;
#ifdef DEBUG
		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
	}

	/* Walk every IPSQ in the xop, starting with our own */
	xopipsq = ipsq;
	do {
		/*
		 * So that other operations operate on a consistent and
		 * complete phyint, a switch message on an IPSQ must be
		 * handled prior to any other operations on that IPSQ.
		 */
		if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
			xopipsq->ipsq_switch_mp = NULL;
			ASSERT(mp->b_next == NULL);
			mp->b_next = (void *)xopipsq;
			goto out;
		}

		if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
			xopipsq->ipsq_xopq_mphead = mp->b_next;
			if (xopipsq->ipsq_xopq_mphead == NULL)
				xopipsq->ipsq_xopq_mptail = NULL;
			mp->b_next = (void *)xopipsq;
			goto out;
		}
	} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
empty:
	/*
	 * There are no messages.  Further, we are holding ipx_lock, hence no
	 * new messages can end up on any IPSQ in the xop.
	 */
	ipx->ipx_writer = NULL;
	ipx->ipx_forced = B_FALSE;
	VERIFY(--ipx->ipx_reentry_cnt == 0);
	ipx->ipx_ipsq_queued = B_FALSE;
	emptied = B_TRUE;
#ifdef DEBUG
	ipx->ipx_depth = 0;
#endif
out:
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ipsq->ipsq_lock);

	/*
	 * If we completely emptied the xop, then wake up any threads waiting
	 * to enter any of the IPSQ's associated with it.
	 */
	if (emptied) {
		xopipsq = ipsq;
		do {
			/* `continue' still evaluates the loop condition below */
			if ((phyi = xopipsq->ipsq_phyint) == NULL)
				continue;

			illv4 = phyi->phyint_illv4;
			illv6 = phyi->phyint_illv6;

			GRAB_ILL_LOCKS(illv4, illv6);
			if (illv4 != NULL)
				cv_broadcast(&illv4->ill_cv);
			if (illv6 != NULL)
				cv_broadcast(&illv6->ill_cv);
			RELEASE_ILL_LOCKS(illv4, illv6);
		} while ((xopipsq = xopipsq->ipsq_next) != ipsq);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Now that all locks are dropped, exit the IPSQ we left.
	 */
	if (leftipsq != NULL)
		ipsq_exit(leftipsq);

	return (mp);
}

/*
 * Enter the ipsq corresponding to ill, by waiting synchronously till
 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
 * will have to drain completely before ipsq_enter returns success.
 * ipx_current_ipif will be set if some exclusive op is in progress,
 * and the ipsq_exit logic will start the next enqueued op after
 * completion of the current op. If 'force' is used, we don't wait
 * for the enqueued ops. This is needed when a conn_close wants to
 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
 * of an ill can also use this option. But we don't use it currently.
7642 */ 7643 #define ENTER_SQ_WAIT_TICKS 100 7644 boolean_t 7645 ipsq_enter(ill_t *ill, boolean_t force, int type) 7646 { 7647 ipsq_t *ipsq; 7648 ipxop_t *ipx; 7649 boolean_t waited_enough = B_FALSE; 7650 7651 /* 7652 * Note that the relationship between ill and ipsq is fixed as long as 7653 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the 7654 * relationship between the IPSQ and xop cannot change. However, 7655 * since we cannot hold ipsq_lock across the cv_wait(), it may change 7656 * while we're waiting. We wait on ill_cv and rely on ipsq_exit() 7657 * waking up all ills in the xop when it becomes available. 7658 */ 7659 mutex_enter(&ill->ill_lock); 7660 for (;;) { 7661 if (ill->ill_state_flags & ILL_CONDEMNED) { 7662 mutex_exit(&ill->ill_lock); 7663 return (B_FALSE); 7664 } 7665 7666 ipsq = ill->ill_phyint->phyint_ipsq; 7667 mutex_enter(&ipsq->ipsq_lock); 7668 ipx = ipsq->ipsq_xop; 7669 mutex_enter(&ipx->ipx_lock); 7670 7671 if (ipx->ipx_writer == NULL && (type == CUR_OP || 7672 ipx->ipx_current_ipif == NULL || waited_enough)) 7673 break; 7674 7675 if (!force || ipx->ipx_writer != NULL) { 7676 mutex_exit(&ipx->ipx_lock); 7677 mutex_exit(&ipsq->ipsq_lock); 7678 cv_wait(&ill->ill_cv, &ill->ill_lock); 7679 } else { 7680 mutex_exit(&ipx->ipx_lock); 7681 mutex_exit(&ipsq->ipsq_lock); 7682 (void) cv_timedwait(&ill->ill_cv, 7683 &ill->ill_lock, lbolt + ENTER_SQ_WAIT_TICKS); 7684 waited_enough = B_TRUE; 7685 } 7686 } 7687 7688 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 7689 ASSERT(ipx->ipx_reentry_cnt == 0); 7690 ipx->ipx_writer = curthread; 7691 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 7692 ipx->ipx_reentry_cnt++; 7693 #ifdef DEBUG 7694 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7695 #endif 7696 mutex_exit(&ipx->ipx_lock); 7697 mutex_exit(&ipsq->ipsq_lock); 7698 mutex_exit(&ill->ill_lock); 7699 return (B_TRUE); 7700 } 7701 7702 boolean_t 7703 ill_perim_enter(ill_t *ill) 7704 { 7705 return (ipsq_enter(ill, 
B_FALSE, CUR_OP)); 7706 } 7707 7708 void 7709 ill_perim_exit(ill_t *ill) 7710 { 7711 ipsq_exit(ill->ill_phyint->phyint_ipsq); 7712 } 7713 7714 /* 7715 * The ipsq_t (ipsq) is the synchronization data structure used to serialize 7716 * certain critical operations like plumbing (i.e. most set ioctls), multicast 7717 * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq 7718 * serializes exclusive ioctls issued by applications on a per ipsq basis in 7719 * ipsq_xopq_mphead. It also protects against multiple threads executing in 7720 * the ipsq. Responses from the driver pertain to the current ioctl (say a 7721 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing 7722 * up the interface) and are enqueued in ipx_mphead. 7723 * 7724 * If a thread does not want to reenter the ipsq when it is already writer, 7725 * it must make sure that the specified reentry point to be called later 7726 * when the ipsq is empty, nor any code path starting from the specified reentry 7727 * point must never ever try to enter the ipsq again. Otherwise it can lead 7728 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example. 7729 * When the thread that is currently exclusive finishes, it (ipsq_exit) 7730 * dequeues the requests waiting to become exclusive in ipx_mphead and calls 7731 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit 7732 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next 7733 * ioctl if the current ioctl has completed. If the current ioctl is still 7734 * in progress it simply returns. The current ioctl could be waiting for 7735 * a response from another module (arp or the driver or could be waiting for 7736 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp 7737 * and ipx_pending_ipif are set. 
ipx_current_ipif is set throughout the 7738 * execution of the ioctl and ipsq_exit does not start the next ioctl unless 7739 * ipx_current_ipif is NULL which happens only once the ioctl is complete and 7740 * all associated DLPI operations have completed. 7741 */ 7742 7743 /* 7744 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif' 7745 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ 7746 * on success, or NULL on failure. The caller ensures ipif/ill is valid by 7747 * refholding it as necessary. If the IPSQ cannot be entered and `func' is 7748 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ 7749 * can be entered. If `func' is NULL, then `q' and `mp' are ignored. 7750 */ 7751 ipsq_t * 7752 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 7753 ipsq_func_t func, int type, boolean_t reentry_ok) 7754 { 7755 ipsq_t *ipsq; 7756 ipxop_t *ipx; 7757 7758 /* Only 1 of ipif or ill can be specified */ 7759 ASSERT((ipif != NULL) ^ (ill != NULL)); 7760 if (ipif != NULL) 7761 ill = ipif->ipif_ill; 7762 7763 /* 7764 * lock ordering: conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 7765 * ipx of an ipsq can't change when ipsq_lock is held. 7766 */ 7767 GRAB_CONN_LOCK(q); 7768 mutex_enter(&ill->ill_lock); 7769 ipsq = ill->ill_phyint->phyint_ipsq; 7770 mutex_enter(&ipsq->ipsq_lock); 7771 ipx = ipsq->ipsq_xop; 7772 mutex_enter(&ipx->ipx_lock); 7773 7774 /* 7775 * 1. Enter the ipsq if we are already writer and reentry is ok. 7776 * (Note: If the caller does not specify reentry_ok then neither 7777 * 'func' nor any of its callees must ever attempt to enter the ipsq 7778 * again. Otherwise it can lead to an infinite loop 7779 * 2. Enter the ipsq if there is no current writer and this attempted 7780 * entry is part of the current operation 7781 * 3. 
Enter the ipsq if there is no current writer and this is a new 7782 * operation and the operation queue is empty and there is no 7783 * operation currently in progress 7784 */ 7785 if ((ipx->ipx_writer == curthread && reentry_ok) || 7786 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 7787 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL)))) { 7788 /* Success. */ 7789 ipx->ipx_reentry_cnt++; 7790 ipx->ipx_writer = curthread; 7791 ipx->ipx_forced = B_FALSE; 7792 mutex_exit(&ipx->ipx_lock); 7793 mutex_exit(&ipsq->ipsq_lock); 7794 mutex_exit(&ill->ill_lock); 7795 RELEASE_CONN_LOCK(q); 7796 #ifdef DEBUG 7797 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 7798 #endif 7799 return (ipsq); 7800 } 7801 7802 if (func != NULL) 7803 ipsq_enq(ipsq, q, mp, func, type, ill); 7804 7805 mutex_exit(&ipx->ipx_lock); 7806 mutex_exit(&ipsq->ipsq_lock); 7807 mutex_exit(&ill->ill_lock); 7808 RELEASE_CONN_LOCK(q); 7809 return (NULL); 7810 } 7811 7812 /* 7813 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 7814 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 7815 * cannot be entered, the mp is queued for completion. 7816 */ 7817 void 7818 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 7819 boolean_t reentry_ok) 7820 { 7821 ipsq_t *ipsq; 7822 7823 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 7824 7825 /* 7826 * Drop the caller's refhold on the ill. This is safe since we either 7827 * entered the IPSQ (and thus are exclusive), or failed to enter the 7828 * IPSQ, in which case we return without accessing ill anymore. This 7829 * is needed because func needs to see the correct refcount. 7830 * e.g. removeif can work only then. 7831 */ 7832 ill_refrele(ill); 7833 if (ipsq != NULL) { 7834 (*func)(ipsq, q, mp, NULL); 7835 ipsq_exit(ipsq); 7836 } 7837 } 7838 7839 /* 7840 * Exit the specified IPSQ. 
If this is the final exit on it then drain it 7841 * prior to exiting. Caller must be writer on the specified IPSQ. 7842 */ 7843 void 7844 ipsq_exit(ipsq_t *ipsq) 7845 { 7846 mblk_t *mp; 7847 ipsq_t *mp_ipsq; 7848 queue_t *q; 7849 phyint_t *phyi; 7850 ipsq_func_t func; 7851 7852 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7853 7854 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 7855 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 7856 ipsq->ipsq_xop->ipx_reentry_cnt--; 7857 return; 7858 } 7859 7860 for (;;) { 7861 phyi = ipsq->ipsq_phyint; 7862 mp = ipsq_dq(ipsq); 7863 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 7864 7865 /* 7866 * If we've changed to a new IPSQ, and the phyint associated 7867 * with the old one has gone away, free the old IPSQ. Note 7868 * that this cannot happen while the IPSQ is in a group. 7869 */ 7870 if (mp_ipsq != ipsq && phyi == NULL) { 7871 ASSERT(ipsq->ipsq_next == ipsq); 7872 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 7873 ipsq_delete(ipsq); 7874 } 7875 7876 if (mp == NULL) 7877 break; 7878 7879 q = mp->b_queue; 7880 func = (ipsq_func_t)mp->b_prev; 7881 ipsq = mp_ipsq; 7882 mp->b_next = mp->b_prev = NULL; 7883 mp->b_queue = NULL; 7884 7885 /* 7886 * If 'q' is an conn queue, it is valid, since we did a 7887 * a refhold on the conn at the start of the ioctl. 7888 * If 'q' is an ill queue, it is valid, since close of an 7889 * ill will clean up its IPSQ. 7890 */ 7891 (*func)(ipsq, q, mp, NULL); 7892 } 7893 } 7894 7895 /* 7896 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 7897 * and `ioccmd'. 
7898 */ 7899 void 7900 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 7901 { 7902 ill_t *ill = ipif->ipif_ill; 7903 ipxop_t *ipx = ipsq->ipsq_xop; 7904 7905 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7906 ASSERT(ipx->ipx_current_ipif == NULL); 7907 ASSERT(ipx->ipx_current_ioctl == 0); 7908 7909 ipx->ipx_current_done = B_FALSE; 7910 ipx->ipx_current_ioctl = ioccmd; 7911 mutex_enter(&ipx->ipx_lock); 7912 ipx->ipx_current_ipif = ipif; 7913 mutex_exit(&ipx->ipx_lock); 7914 7915 /* 7916 * Set IPIF_CHANGING on one or more ipifs associated with the 7917 * current exclusive operation. IPIF_CHANGING prevents any new 7918 * references to the ipif (so that the references will eventually 7919 * drop to zero) and also prevents any "get" operations (e.g., 7920 * SIOCGLIFFLAGS) from being able to access the ipif until the 7921 * operation has completed and the ipif is again in a stable state. 7922 * 7923 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 7924 * ioctl. For internal operations (where ioccmd is zero), all ipifs 7925 * on the ill are marked with IPIF_CHANGING since it's unclear which 7926 * ipifs will be affected. 7927 * 7928 * Note that SIOCLIFREMOVEIF is a special case as it sets 7929 * IPIF_CONDEMNED internally after identifying the right ipif to 7930 * operate on. 7931 */ 7932 switch (ioccmd) { 7933 case SIOCLIFREMOVEIF: 7934 break; 7935 case 0: 7936 mutex_enter(&ill->ill_lock); 7937 ipif = ipif->ipif_ill->ill_ipif; 7938 for (; ipif != NULL; ipif = ipif->ipif_next) 7939 ipif->ipif_state_flags |= IPIF_CHANGING; 7940 mutex_exit(&ill->ill_lock); 7941 break; 7942 default: 7943 mutex_enter(&ill->ill_lock); 7944 ipif->ipif_state_flags |= IPIF_CHANGING; 7945 mutex_exit(&ill->ill_lock); 7946 } 7947 } 7948 7949 /* 7950 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 7951 * the next exclusive operation to begin once we ipsq_exit(). 
However, if 7952 * pending DLPI operations remain, then we will wait for the queue to drain 7953 * before allowing the next exclusive operation to begin. This ensures that 7954 * DLPI operations from one exclusive operation are never improperly processed 7955 * as part of a subsequent exclusive operation. 7956 */ 7957 void 7958 ipsq_current_finish(ipsq_t *ipsq) 7959 { 7960 ipxop_t *ipx = ipsq->ipsq_xop; 7961 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 7962 ipif_t *ipif = ipx->ipx_current_ipif; 7963 7964 ASSERT(IAM_WRITER_IPSQ(ipsq)); 7965 7966 /* 7967 * For SIOCLIFREMOVEIF, the ipif has been already been blown away 7968 * (but in that case, IPIF_CHANGING will already be clear and no 7969 * pending DLPI messages can remain). 7970 */ 7971 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 7972 ill_t *ill = ipif->ipif_ill; 7973 7974 mutex_enter(&ill->ill_lock); 7975 dlpi_pending = ill->ill_dlpi_pending; 7976 if (ipx->ipx_current_ioctl == 0) { 7977 ipif = ill->ill_ipif; 7978 for (; ipif != NULL; ipif = ipif->ipif_next) 7979 ipif->ipif_state_flags &= ~IPIF_CHANGING; 7980 } else { 7981 ipif->ipif_state_flags &= ~IPIF_CHANGING; 7982 } 7983 mutex_exit(&ill->ill_lock); 7984 } 7985 7986 ASSERT(!ipx->ipx_current_done); 7987 ipx->ipx_current_done = B_TRUE; 7988 ipx->ipx_current_ioctl = 0; 7989 if (dlpi_pending == DL_PRIM_INVAL) { 7990 mutex_enter(&ipx->ipx_lock); 7991 ipx->ipx_current_ipif = NULL; 7992 mutex_exit(&ipx->ipx_lock); 7993 } 7994 } 7995 7996 /* 7997 * The ill is closing. Flush all messages on the ipsq that originated 7998 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 7999 * for this ill since ipsq_enter could not have entered until then. 8000 * New messages can't be queued since the CONDEMNED flag is set. 
8001 */ 8002 static void 8003 ipsq_flush(ill_t *ill) 8004 { 8005 queue_t *q; 8006 mblk_t *prev; 8007 mblk_t *mp; 8008 mblk_t *mp_next; 8009 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 8010 8011 ASSERT(IAM_WRITER_ILL(ill)); 8012 8013 /* 8014 * Flush any messages sent up by the driver. 8015 */ 8016 mutex_enter(&ipx->ipx_lock); 8017 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 8018 mp_next = mp->b_next; 8019 q = mp->b_queue; 8020 if (q == ill->ill_rq || q == ill->ill_wq) { 8021 /* dequeue mp */ 8022 if (prev == NULL) 8023 ipx->ipx_mphead = mp->b_next; 8024 else 8025 prev->b_next = mp->b_next; 8026 if (ipx->ipx_mptail == mp) { 8027 ASSERT(mp_next == NULL); 8028 ipx->ipx_mptail = prev; 8029 } 8030 inet_freemsg(mp); 8031 } else { 8032 prev = mp; 8033 } 8034 } 8035 mutex_exit(&ipx->ipx_lock); 8036 (void) ipsq_pending_mp_cleanup(ill, NULL); 8037 ipsq_xopq_mp_cleanup(ill, NULL); 8038 ill_pending_mp_cleanup(ill); 8039 } 8040 8041 /* 8042 * Parse an iftun_req structure coming down SIOC[GS]TUNPARAM ioctls, 8043 * refhold and return the associated ipif 8044 */ 8045 /* ARGSUSED */ 8046 int 8047 ip_extract_tunreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8048 cmd_info_t *ci, ipsq_func_t func) 8049 { 8050 boolean_t exists; 8051 struct iftun_req *ta; 8052 ipif_t *ipif; 8053 ill_t *ill; 8054 boolean_t isv6; 8055 mblk_t *mp1; 8056 int error; 8057 conn_t *connp; 8058 ip_stack_t *ipst; 8059 8060 /* Existence verified in ip_wput_nondata */ 8061 mp1 = mp->b_cont->b_cont; 8062 ta = (struct iftun_req *)mp1->b_rptr; 8063 /* 8064 * Null terminate the string to protect against buffer 8065 * overrun. String was generated by user code and may not 8066 * be trusted. 
8067 */ 8068 ta->ifta_lifr_name[LIFNAMSIZ - 1] = '\0'; 8069 8070 connp = Q_TO_CONN(q); 8071 isv6 = connp->conn_af_isv6; 8072 ipst = connp->conn_netstack->netstack_ip; 8073 8074 /* Disallows implicit create */ 8075 ipif = ipif_lookup_on_name(ta->ifta_lifr_name, 8076 mi_strlen(ta->ifta_lifr_name), B_FALSE, &exists, isv6, 8077 connp->conn_zoneid, CONNP_TO_WQ(connp), mp, func, &error, ipst); 8078 if (ipif == NULL) 8079 return (error); 8080 8081 if (ipif->ipif_id != 0) { 8082 /* 8083 * We really don't want to set/get tunnel parameters 8084 * on virtual tunnel interfaces. Only allow the 8085 * base tunnel to do these. 8086 */ 8087 ipif_refrele(ipif); 8088 return (EINVAL); 8089 } 8090 8091 /* 8092 * Send down to tunnel mod for ioctl processing. 8093 * Will finish ioctl in ip_rput_other(). 8094 */ 8095 ill = ipif->ipif_ill; 8096 if (ill->ill_net_type == IRE_LOOPBACK) { 8097 ipif_refrele(ipif); 8098 return (EOPNOTSUPP); 8099 } 8100 8101 if (ill->ill_wq == NULL) { 8102 ipif_refrele(ipif); 8103 return (ENXIO); 8104 } 8105 /* 8106 * Mark the ioctl as coming from an IPv6 interface for 8107 * tun's convenience. 8108 */ 8109 if (ill->ill_isv6) 8110 ta->ifta_flags |= 0x80000000; 8111 ci->ci_ipif = ipif; 8112 return (0); 8113 } 8114 8115 /* 8116 * Parse an ifreq or lifreq struct coming down ioctls and refhold 8117 * and return the associated ipif. 8118 * Return value: 8119 * Non zero: An error has occurred. ci may not be filled out. 8120 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 8121 * a held ipif in ci.ci_ipif. 
8122 */ 8123 int 8124 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 8125 cmd_info_t *ci, ipsq_func_t func) 8126 { 8127 char *name; 8128 struct ifreq *ifr; 8129 struct lifreq *lifr; 8130 ipif_t *ipif = NULL; 8131 ill_t *ill; 8132 conn_t *connp; 8133 boolean_t isv6; 8134 boolean_t exists; 8135 int err; 8136 mblk_t *mp1; 8137 zoneid_t zoneid; 8138 ip_stack_t *ipst; 8139 8140 if (q->q_next != NULL) { 8141 ill = (ill_t *)q->q_ptr; 8142 isv6 = ill->ill_isv6; 8143 connp = NULL; 8144 zoneid = ALL_ZONES; 8145 ipst = ill->ill_ipst; 8146 } else { 8147 ill = NULL; 8148 connp = Q_TO_CONN(q); 8149 isv6 = connp->conn_af_isv6; 8150 zoneid = connp->conn_zoneid; 8151 if (zoneid == GLOBAL_ZONEID) { 8152 /* global zone can access ipifs in all zones */ 8153 zoneid = ALL_ZONES; 8154 } 8155 ipst = connp->conn_netstack->netstack_ip; 8156 } 8157 8158 /* Has been checked in ip_wput_nondata */ 8159 mp1 = mp->b_cont->b_cont; 8160 8161 if (ipip->ipi_cmd_type == IF_CMD) { 8162 /* This a old style SIOC[GS]IF* command */ 8163 ifr = (struct ifreq *)mp1->b_rptr; 8164 /* 8165 * Null terminate the string to protect against buffer 8166 * overrun. String was generated by user code and may not 8167 * be trusted. 8168 */ 8169 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 8170 name = ifr->ifr_name; 8171 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 8172 ci->ci_sin6 = NULL; 8173 ci->ci_lifr = (struct lifreq *)ifr; 8174 } else { 8175 /* This a new style SIOC[GS]LIF* command */ 8176 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 8177 lifr = (struct lifreq *)mp1->b_rptr; 8178 /* 8179 * Null terminate the string to protect against buffer 8180 * overrun. String was generated by user code and may not 8181 * be trusted. 
8182 */ 8183 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 8184 name = lifr->lifr_name; 8185 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 8186 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 8187 ci->ci_lifr = lifr; 8188 } 8189 8190 if (ipip->ipi_cmd == SIOCSLIFNAME) { 8191 /* 8192 * The ioctl will be failed if the ioctl comes down 8193 * an conn stream 8194 */ 8195 if (ill == NULL) { 8196 /* 8197 * Not an ill queue, return EINVAL same as the 8198 * old error code. 8199 */ 8200 return (ENXIO); 8201 } 8202 ipif = ill->ill_ipif; 8203 ipif_refhold(ipif); 8204 } else { 8205 ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, 8206 &exists, isv6, zoneid, 8207 (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err, 8208 ipst); 8209 if (ipif == NULL) { 8210 if (err == EINPROGRESS) 8211 return (err); 8212 err = 0; /* Ensure we don't use it below */ 8213 } 8214 } 8215 8216 /* 8217 * Old style [GS]IFCMD does not admit IPv6 ipif 8218 */ 8219 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 8220 ipif_refrele(ipif); 8221 return (ENXIO); 8222 } 8223 8224 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 8225 name[0] == '\0') { 8226 /* 8227 * Handle a or a SIOC?IF* with a null name 8228 * during plumb (on the ill queue before the I_PLINK). 8229 */ 8230 ipif = ill->ill_ipif; 8231 ipif_refhold(ipif); 8232 } 8233 8234 if (ipif == NULL) 8235 return (ENXIO); 8236 8237 ci->ci_ipif = ipif; 8238 return (0); 8239 } 8240 8241 /* 8242 * Return the total number of ipifs. 
8243 */ 8244 static uint_t 8245 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 8246 { 8247 uint_t numifs = 0; 8248 ill_t *ill; 8249 ill_walk_context_t ctx; 8250 ipif_t *ipif; 8251 8252 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8253 ill = ILL_START_WALK_V4(&ctx, ipst); 8254 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8255 if (IS_UNDER_IPMP(ill)) 8256 continue; 8257 for (ipif = ill->ill_ipif; ipif != NULL; 8258 ipif = ipif->ipif_next) { 8259 if (ipif->ipif_zoneid == zoneid || 8260 ipif->ipif_zoneid == ALL_ZONES) 8261 numifs++; 8262 } 8263 } 8264 rw_exit(&ipst->ips_ill_g_lock); 8265 return (numifs); 8266 } 8267 8268 /* 8269 * Return the total number of ipifs. 8270 */ 8271 static uint_t 8272 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 8273 { 8274 uint_t numifs = 0; 8275 ill_t *ill; 8276 ipif_t *ipif; 8277 ill_walk_context_t ctx; 8278 8279 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 8280 8281 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8282 if (family == AF_INET) 8283 ill = ILL_START_WALK_V4(&ctx, ipst); 8284 else if (family == AF_INET6) 8285 ill = ILL_START_WALK_V6(&ctx, ipst); 8286 else 8287 ill = ILL_START_WALK_ALL(&ctx, ipst); 8288 8289 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8290 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 8291 continue; 8292 8293 for (ipif = ill->ill_ipif; ipif != NULL; 8294 ipif = ipif->ipif_next) { 8295 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8296 !(lifn_flags & LIFC_NOXMIT)) 8297 continue; 8298 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8299 !(lifn_flags & LIFC_TEMPORARY)) 8300 continue; 8301 if (((ipif->ipif_flags & 8302 (IPIF_NOXMIT|IPIF_NOLOCAL| 8303 IPIF_DEPRECATED)) || 8304 IS_LOOPBACK(ill) || 8305 !(ipif->ipif_flags & IPIF_UP)) && 8306 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 8307 continue; 8308 8309 if (zoneid != ipif->ipif_zoneid && 8310 ipif->ipif_zoneid != ALL_ZONES && 8311 (zoneid != GLOBAL_ZONEID || 8312 !(lifn_flags & LIFC_ALLZONES))) 8313 
continue; 8314 8315 numifs++; 8316 } 8317 } 8318 rw_exit(&ipst->ips_ill_g_lock); 8319 return (numifs); 8320 } 8321 8322 uint_t 8323 ip_get_lifsrcofnum(ill_t *ill) 8324 { 8325 uint_t numifs = 0; 8326 ill_t *ill_head = ill; 8327 ip_stack_t *ipst = ill->ill_ipst; 8328 8329 /* 8330 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 8331 * other thread may be trying to relink the ILLs in this usesrc group 8332 * and adjusting the ill_usesrc_grp_next pointers 8333 */ 8334 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8335 if ((ill->ill_usesrc_ifindex == 0) && 8336 (ill->ill_usesrc_grp_next != NULL)) { 8337 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 8338 ill = ill->ill_usesrc_grp_next) 8339 numifs++; 8340 } 8341 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8342 8343 return (numifs); 8344 } 8345 8346 /* Null values are passed in for ipif, sin, and ifreq */ 8347 /* ARGSUSED */ 8348 int 8349 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8350 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8351 { 8352 int *nump; 8353 conn_t *connp = Q_TO_CONN(q); 8354 8355 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8356 8357 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 8358 nump = (int *)mp->b_cont->b_cont->b_rptr; 8359 8360 *nump = ip_get_numifs(connp->conn_zoneid, 8361 connp->conn_netstack->netstack_ip); 8362 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 8363 return (0); 8364 } 8365 8366 /* Null values are passed in for ipif, sin, and ifreq */ 8367 /* ARGSUSED */ 8368 int 8369 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 8370 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8371 { 8372 struct lifnum *lifn; 8373 mblk_t *mp1; 8374 conn_t *connp = Q_TO_CONN(q); 8375 8376 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 8377 8378 /* Existence checked in ip_wput_nondata */ 8379 mp1 = mp->b_cont->b_cont; 8380 8381 lifn = (struct lifnum 
*)mp1->b_rptr; 8382 switch (lifn->lifn_family) { 8383 case AF_UNSPEC: 8384 case AF_INET: 8385 case AF_INET6: 8386 break; 8387 default: 8388 return (EAFNOSUPPORT); 8389 } 8390 8391 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags, 8392 connp->conn_zoneid, connp->conn_netstack->netstack_ip); 8393 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count)); 8394 return (0); 8395 } 8396 8397 /* ARGSUSED */ 8398 int 8399 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8400 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8401 { 8402 STRUCT_HANDLE(ifconf, ifc); 8403 mblk_t *mp1; 8404 struct iocblk *iocp; 8405 struct ifreq *ifr; 8406 ill_walk_context_t ctx; 8407 ill_t *ill; 8408 ipif_t *ipif; 8409 struct sockaddr_in *sin; 8410 int32_t ifclen; 8411 zoneid_t zoneid; 8412 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8413 8414 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */ 8415 8416 ip1dbg(("ip_sioctl_get_ifconf")); 8417 /* Existence verified in ip_wput_nondata */ 8418 mp1 = mp->b_cont->b_cont; 8419 iocp = (struct iocblk *)mp->b_rptr; 8420 zoneid = Q_TO_CONN(q)->conn_zoneid; 8421 8422 /* 8423 * The original SIOCGIFCONF passed in a struct ifconf which specified 8424 * the user buffer address and length into which the list of struct 8425 * ifreqs was to be copied. Since AT&T Streams does not seem to 8426 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS, 8427 * the SIOCGIFCONF operation was redefined to simply provide 8428 * a large output buffer into which we are supposed to jam the ifreq 8429 * array. The same ioctl command code was used, despite the fact that 8430 * both the applications and the kernel code had to change, thus making 8431 * it impossible to support both interfaces. 
8432 * 8433 * For reasons not good enough to try to explain, the following 8434 * algorithm is used for deciding what to do with one of these: 8435 * If the IOCTL comes in as an I_STR, it is assumed to be of the new 8436 * form with the output buffer coming down as the continuation message. 8437 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style, 8438 * and we have to copy in the ifconf structure to find out how big the 8439 * output buffer is and where to copy out to. Sure no problem... 8440 * 8441 */ 8442 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL); 8443 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) { 8444 int numifs = 0; 8445 size_t ifc_bufsize; 8446 8447 /* 8448 * Must be (better be!) continuation of a TRANSPARENT 8449 * IOCTL. We just copied in the ifconf structure. 8450 */ 8451 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, 8452 (struct ifconf *)mp1->b_rptr); 8453 8454 /* 8455 * Allocate a buffer to hold requested information. 8456 * 8457 * If ifc_len is larger than what is needed, we only 8458 * allocate what we will use. 8459 * 8460 * If ifc_len is smaller than what is needed, return 8461 * EINVAL. 8462 * 8463 * XXX: the ill_t structure can hava 2 counters, for 8464 * v4 and v6 (not just ill_ipif_up_count) to store the 8465 * number of interfaces for a device, so we don't need 8466 * to count them here... 
8467 */ 8468 numifs = ip_get_numifs(zoneid, ipst); 8469 8470 ifclen = STRUCT_FGET(ifc, ifc_len); 8471 ifc_bufsize = numifs * sizeof (struct ifreq); 8472 if (ifc_bufsize > ifclen) { 8473 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8474 /* old behaviour */ 8475 return (EINVAL); 8476 } else { 8477 ifc_bufsize = ifclen; 8478 } 8479 } 8480 8481 mp1 = mi_copyout_alloc(q, mp, 8482 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 8483 if (mp1 == NULL) 8484 return (ENOMEM); 8485 8486 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 8487 } 8488 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8489 /* 8490 * the SIOCGIFCONF ioctl only knows about 8491 * IPv4 addresses, so don't try to tell 8492 * it about interfaces with IPv6-only 8493 * addresses. (Last parm 'isv6' is B_FALSE) 8494 */ 8495 8496 ifr = (struct ifreq *)mp1->b_rptr; 8497 8498 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8499 ill = ILL_START_WALK_V4(&ctx, ipst); 8500 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8501 if (IS_UNDER_IPMP(ill)) 8502 continue; 8503 for (ipif = ill->ill_ipif; ipif != NULL; 8504 ipif = ipif->ipif_next) { 8505 if (zoneid != ipif->ipif_zoneid && 8506 ipif->ipif_zoneid != ALL_ZONES) 8507 continue; 8508 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 8509 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 8510 /* old behaviour */ 8511 rw_exit(&ipst->ips_ill_g_lock); 8512 return (EINVAL); 8513 } else { 8514 goto if_copydone; 8515 } 8516 } 8517 ipif_get_name(ipif, ifr->ifr_name, 8518 sizeof (ifr->ifr_name)); 8519 sin = (sin_t *)&ifr->ifr_addr; 8520 *sin = sin_null; 8521 sin->sin_family = AF_INET; 8522 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8523 ifr++; 8524 } 8525 } 8526 if_copydone: 8527 rw_exit(&ipst->ips_ill_g_lock); 8528 mp1->b_wptr = (uchar_t *)ifr; 8529 8530 if (STRUCT_BUF(ifc) != NULL) { 8531 STRUCT_FSET(ifc, ifc_len, 8532 (int)((uchar_t *)ifr - mp1->b_rptr)); 8533 } 8534 return (0); 8535 } 8536 8537 /* 8538 * Get the interfaces using the address hosted on the interface passed in, 8539 * as a source adddress 8540 
*/ 8541 /* ARGSUSED */ 8542 int 8543 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8544 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8545 { 8546 mblk_t *mp1; 8547 ill_t *ill, *ill_head; 8548 ipif_t *ipif, *orig_ipif; 8549 int numlifs = 0; 8550 size_t lifs_bufsize, lifsmaxlen; 8551 struct lifreq *lifr; 8552 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8553 uint_t ifindex; 8554 zoneid_t zoneid; 8555 int err = 0; 8556 boolean_t isv6 = B_FALSE; 8557 struct sockaddr_in *sin; 8558 struct sockaddr_in6 *sin6; 8559 STRUCT_HANDLE(lifsrcof, lifs); 8560 ip_stack_t *ipst; 8561 8562 ipst = CONNQ_TO_IPST(q); 8563 8564 ASSERT(q->q_next == NULL); 8565 8566 zoneid = Q_TO_CONN(q)->conn_zoneid; 8567 8568 /* Existence verified in ip_wput_nondata */ 8569 mp1 = mp->b_cont->b_cont; 8570 8571 /* 8572 * Must be (better be!) continuation of a TRANSPARENT 8573 * IOCTL. We just copied in the lifsrcof structure. 8574 */ 8575 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 8576 (struct lifsrcof *)mp1->b_rptr); 8577 8578 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 8579 return (EINVAL); 8580 8581 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 8582 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 8583 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, 8584 ip_process_ioctl, &err, ipst); 8585 if (ipif == NULL) { 8586 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 8587 ifindex)); 8588 return (err); 8589 } 8590 8591 /* Allocate a buffer to hold requested information */ 8592 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 8593 lifs_bufsize = numlifs * sizeof (struct lifreq); 8594 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 8595 /* The actual size needed is always returned in lifs_len */ 8596 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 8597 8598 /* If the amount we need is more than what is passed in, abort */ 8599 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 8600 ipif_refrele(ipif); 8601 return (0); 8602 } 8603 8604 mp1 = mi_copyout_alloc(q, mp, 8605 STRUCT_FGETP(lifs, 
lifs_buf), lifs_bufsize, B_FALSE); 8606 if (mp1 == NULL) { 8607 ipif_refrele(ipif); 8608 return (ENOMEM); 8609 } 8610 8611 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 8612 bzero(mp1->b_rptr, lifs_bufsize); 8613 8614 lifr = (struct lifreq *)mp1->b_rptr; 8615 8616 ill = ill_head = ipif->ipif_ill; 8617 orig_ipif = ipif; 8618 8619 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 8620 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 8621 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8622 8623 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 8624 for (; (ill != NULL) && (ill != ill_head); 8625 ill = ill->ill_usesrc_grp_next) { 8626 8627 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 8628 break; 8629 8630 ipif = ill->ill_ipif; 8631 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 8632 if (ipif->ipif_isv6) { 8633 sin6 = (sin6_t *)&lifr->lifr_addr; 8634 *sin6 = sin6_null; 8635 sin6->sin6_family = AF_INET6; 8636 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 8637 lifr->lifr_addrlen = ip_mask_to_plen_v6( 8638 &ipif->ipif_v6net_mask); 8639 } else { 8640 sin = (sin_t *)&lifr->lifr_addr; 8641 *sin = sin_null; 8642 sin->sin_family = AF_INET; 8643 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 8644 lifr->lifr_addrlen = ip_mask_to_plen( 8645 ipif->ipif_net_mask); 8646 } 8647 lifr++; 8648 } 8649 rw_exit(&ipst->ips_ill_g_usesrc_lock); 8650 rw_exit(&ipst->ips_ill_g_lock); 8651 ipif_refrele(orig_ipif); 8652 mp1->b_wptr = (uchar_t *)lifr; 8653 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 8654 8655 return (0); 8656 } 8657 8658 /* ARGSUSED */ 8659 int 8660 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 8661 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 8662 { 8663 mblk_t *mp1; 8664 int list; 8665 ill_t *ill; 8666 ipif_t *ipif; 8667 int flags; 8668 int numlifs = 0; 8669 size_t lifc_bufsize; 8670 struct lifreq *lifr; 8671 sa_family_t family; 8672 struct sockaddr_in *sin; 8673 struct sockaddr_in6 *sin6; 8674 ill_walk_context_t ctx; 
8675 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8676 int32_t lifclen; 8677 zoneid_t zoneid; 8678 STRUCT_HANDLE(lifconf, lifc); 8679 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8680 8681 ip1dbg(("ip_sioctl_get_lifconf")); 8682 8683 ASSERT(q->q_next == NULL); 8684 8685 zoneid = Q_TO_CONN(q)->conn_zoneid; 8686 8687 /* Existence verified in ip_wput_nondata */ 8688 mp1 = mp->b_cont->b_cont; 8689 8690 /* 8691 * An extended version of SIOCGIFCONF that takes an 8692 * additional address family and flags field. 8693 * AF_UNSPEC retrieve both IPv4 and IPv6. 8694 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT 8695 * interfaces are omitted. 8696 * Similarly, IPIF_TEMPORARY interfaces are omitted 8697 * unless LIFC_TEMPORARY is specified. 8698 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT, 8699 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and 8700 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE 8701 * has priority over LIFC_NOXMIT. 8702 */ 8703 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL); 8704 8705 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc)) 8706 return (EINVAL); 8707 8708 /* 8709 * Must be (better be!) continuation of a TRANSPARENT 8710 * IOCTL. We just copied in the lifconf structure. 8711 */ 8712 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr); 8713 8714 family = STRUCT_FGET(lifc, lifc_family); 8715 flags = STRUCT_FGET(lifc, lifc_flags); 8716 8717 switch (family) { 8718 case AF_UNSPEC: 8719 /* 8720 * walk all ILL's. 8721 */ 8722 list = MAX_G_HEADS; 8723 break; 8724 case AF_INET: 8725 /* 8726 * walk only IPV4 ILL's. 8727 */ 8728 list = IP_V4_G_HEAD; 8729 break; 8730 case AF_INET6: 8731 /* 8732 * walk only IPV6 ILL's. 8733 */ 8734 list = IP_V6_G_HEAD; 8735 break; 8736 default: 8737 return (EAFNOSUPPORT); 8738 } 8739 8740 /* 8741 * Allocate a buffer to hold requested information. 8742 * 8743 * If lifc_len is larger than what is needed, we only 8744 * allocate what we will use. 
8745 * 8746 * If lifc_len is smaller than what is needed, return 8747 * EINVAL. 8748 */ 8749 numlifs = ip_get_numlifs(family, flags, zoneid, ipst); 8750 lifc_bufsize = numlifs * sizeof (struct lifreq); 8751 lifclen = STRUCT_FGET(lifc, lifc_len); 8752 if (lifc_bufsize > lifclen) { 8753 if (iocp->ioc_cmd == O_SIOCGLIFCONF) 8754 return (EINVAL); 8755 else 8756 lifc_bufsize = lifclen; 8757 } 8758 8759 mp1 = mi_copyout_alloc(q, mp, 8760 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE); 8761 if (mp1 == NULL) 8762 return (ENOMEM); 8763 8764 mp1->b_wptr = mp1->b_rptr + lifc_bufsize; 8765 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 8766 8767 lifr = (struct lifreq *)mp1->b_rptr; 8768 8769 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 8770 ill = ill_first(list, list, &ctx, ipst); 8771 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 8772 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP)) 8773 continue; 8774 8775 for (ipif = ill->ill_ipif; ipif != NULL; 8776 ipif = ipif->ipif_next) { 8777 if ((ipif->ipif_flags & IPIF_NOXMIT) && 8778 !(flags & LIFC_NOXMIT)) 8779 continue; 8780 8781 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 8782 !(flags & LIFC_TEMPORARY)) 8783 continue; 8784 8785 if (((ipif->ipif_flags & 8786 (IPIF_NOXMIT|IPIF_NOLOCAL| 8787 IPIF_DEPRECATED)) || 8788 IS_LOOPBACK(ill) || 8789 !(ipif->ipif_flags & IPIF_UP)) && 8790 (flags & LIFC_EXTERNAL_SOURCE)) 8791 continue; 8792 8793 if (zoneid != ipif->ipif_zoneid && 8794 ipif->ipif_zoneid != ALL_ZONES && 8795 (zoneid != GLOBAL_ZONEID || 8796 !(flags & LIFC_ALLZONES))) 8797 continue; 8798 8799 if ((uchar_t *)&lifr[1] > mp1->b_wptr) { 8800 if (iocp->ioc_cmd == O_SIOCGLIFCONF) { 8801 rw_exit(&ipst->ips_ill_g_lock); 8802 return (EINVAL); 8803 } else { 8804 goto lif_copydone; 8805 } 8806 } 8807 8808 ipif_get_name(ipif, lifr->lifr_name, 8809 sizeof (lifr->lifr_name)); 8810 lifr->lifr_type = ill->ill_type; 8811 if (ipif->ipif_isv6) { 8812 sin6 = (sin6_t *)&lifr->lifr_addr; 8813 *sin6 = sin6_null; 8814 sin6->sin6_family = 
AF_INET6; 8815 sin6->sin6_addr = 8816 ipif->ipif_v6lcl_addr; 8817 lifr->lifr_addrlen = 8818 ip_mask_to_plen_v6( 8819 &ipif->ipif_v6net_mask); 8820 } else { 8821 sin = (sin_t *)&lifr->lifr_addr; 8822 *sin = sin_null; 8823 sin->sin_family = AF_INET; 8824 sin->sin_addr.s_addr = 8825 ipif->ipif_lcl_addr; 8826 lifr->lifr_addrlen = 8827 ip_mask_to_plen( 8828 ipif->ipif_net_mask); 8829 } 8830 lifr++; 8831 } 8832 } 8833 lif_copydone: 8834 rw_exit(&ipst->ips_ill_g_lock); 8835 8836 mp1->b_wptr = (uchar_t *)lifr; 8837 if (STRUCT_BUF(lifc) != NULL) { 8838 STRUCT_FSET(lifc, lifc_len, 8839 (int)((uchar_t *)lifr - mp1->b_rptr)); 8840 } 8841 return (0); 8842 } 8843 8844 static void 8845 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp) 8846 { 8847 ip6_asp_t *table; 8848 size_t table_size; 8849 mblk_t *data_mp; 8850 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8851 ip_stack_t *ipst; 8852 8853 if (q->q_next == NULL) 8854 ipst = CONNQ_TO_IPST(q); 8855 else 8856 ipst = ILLQ_TO_IPST(q); 8857 8858 /* These two ioctls are I_STR only */ 8859 if (iocp->ioc_count == TRANSPARENT) { 8860 miocnak(q, mp, 0, EINVAL); 8861 return; 8862 } 8863 8864 data_mp = mp->b_cont; 8865 if (data_mp == NULL) { 8866 /* The user passed us a NULL argument */ 8867 table = NULL; 8868 table_size = iocp->ioc_count; 8869 } else { 8870 /* 8871 * The user provided a table. The stream head 8872 * may have copied in the user data in chunks, 8873 * so make sure everything is pulled up 8874 * properly. 
8875 */ 8876 if (MBLKL(data_mp) < iocp->ioc_count) { 8877 mblk_t *new_data_mp; 8878 if ((new_data_mp = msgpullup(data_mp, -1)) == 8879 NULL) { 8880 miocnak(q, mp, 0, ENOMEM); 8881 return; 8882 } 8883 freemsg(data_mp); 8884 data_mp = new_data_mp; 8885 mp->b_cont = data_mp; 8886 } 8887 table = (ip6_asp_t *)data_mp->b_rptr; 8888 table_size = iocp->ioc_count; 8889 } 8890 8891 switch (iocp->ioc_cmd) { 8892 case SIOCGIP6ADDRPOLICY: 8893 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst); 8894 if (iocp->ioc_rval == -1) 8895 iocp->ioc_error = EINVAL; 8896 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8897 else if (table != NULL && 8898 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) { 8899 ip6_asp_t *src = table; 8900 ip6_asp32_t *dst = (void *)table; 8901 int count = table_size / sizeof (ip6_asp_t); 8902 int i; 8903 8904 /* 8905 * We need to do an in-place shrink of the array 8906 * to match the alignment attributes of the 8907 * 32-bit ABI looking at it. 8908 */ 8909 /* LINTED: logical expression always true: op "||" */ 8910 ASSERT(sizeof (*src) > sizeof (*dst)); 8911 for (i = 1; i < count; i++) 8912 bcopy(src + i, dst + i, sizeof (*dst)); 8913 } 8914 #endif 8915 break; 8916 8917 case SIOCSIP6ADDRPOLICY: 8918 ASSERT(mp->b_prev == NULL); 8919 mp->b_prev = (void *)q; 8920 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4 8921 /* 8922 * We pass in the datamodel here so that the ip6_asp_replace() 8923 * routine can handle converting from 32-bit to native formats 8924 * where necessary. 8925 * 8926 * A better way to handle this might be to convert the inbound 8927 * data structure here, and hang it off a new 'mp'; thus the 8928 * ip6_asp_replace() logic would always be dealing with native 8929 * format data structures.. 8930 * 8931 * (An even simpler way to handle these ioctls is to just 8932 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure 8933 * and just recompile everything that depends on it.) 
8934 */ 8935 #endif 8936 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst, 8937 iocp->ioc_flag & IOC_MODELS); 8938 return; 8939 } 8940 8941 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK; 8942 qreply(q, mp); 8943 } 8944 8945 static void 8946 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 8947 { 8948 mblk_t *data_mp; 8949 struct dstinforeq *dir; 8950 uint8_t *end, *cur; 8951 in6_addr_t *daddr, *saddr; 8952 ipaddr_t v4daddr; 8953 ire_t *ire; 8954 char *slabel, *dlabel; 8955 boolean_t isipv4; 8956 int match_ire; 8957 ill_t *dst_ill; 8958 ipif_t *src_ipif, *ire_ipif; 8959 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 8960 zoneid_t zoneid; 8961 ip_stack_t *ipst = CONNQ_TO_IPST(q); 8962 8963 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8964 zoneid = Q_TO_CONN(q)->conn_zoneid; 8965 8966 /* 8967 * This ioctl is I_STR only, and must have a 8968 * data mblk following the M_IOCTL mblk. 8969 */ 8970 data_mp = mp->b_cont; 8971 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 8972 miocnak(q, mp, 0, EINVAL); 8973 return; 8974 } 8975 8976 if (MBLKL(data_mp) < iocp->ioc_count) { 8977 mblk_t *new_data_mp; 8978 8979 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 8980 miocnak(q, mp, 0, ENOMEM); 8981 return; 8982 } 8983 freemsg(data_mp); 8984 data_mp = new_data_mp; 8985 mp->b_cont = data_mp; 8986 } 8987 match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; 8988 8989 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 8990 end - cur >= sizeof (struct dstinforeq); 8991 cur += sizeof (struct dstinforeq)) { 8992 dir = (struct dstinforeq *)cur; 8993 daddr = &dir->dir_daddr; 8994 saddr = &dir->dir_saddr; 8995 8996 /* 8997 * ip_addr_scope_v6() and ip6_asp_lookup() handle 8998 * v4 mapped addresses; ire_ftable_lookup[_v6]() 8999 * and ipif_select_source[_v6]() do not. 
9000 */ 9001 dir->dir_dscope = ip_addr_scope_v6(daddr); 9002 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 9003 9004 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 9005 if (isipv4) { 9006 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 9007 ire = ire_ftable_lookup(v4daddr, NULL, NULL, 9008 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9009 } else { 9010 ire = ire_ftable_lookup_v6(daddr, NULL, NULL, 9011 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); 9012 } 9013 if (ire == NULL) { 9014 dir->dir_dreachable = 0; 9015 9016 /* move on to next dst addr */ 9017 continue; 9018 } 9019 dir->dir_dreachable = 1; 9020 9021 ire_ipif = ire->ire_ipif; 9022 if (ire_ipif == NULL) 9023 goto next_dst; 9024 9025 /* 9026 * We expect to get back an interface ire or a 9027 * gateway ire cache entry. For both types, the 9028 * output interface is ire_ipif->ipif_ill. 9029 */ 9030 dst_ill = ire_ipif->ipif_ill; 9031 dir->dir_dmactype = dst_ill->ill_mactype; 9032 9033 if (isipv4) { 9034 src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); 9035 } else { 9036 src_ipif = ipif_select_source_v6(dst_ill, 9037 daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); 9038 } 9039 if (src_ipif == NULL) 9040 goto next_dst; 9041 9042 *saddr = src_ipif->ipif_v6lcl_addr; 9043 dir->dir_sscope = ip_addr_scope_v6(saddr); 9044 slabel = ip6_asp_lookup(saddr, NULL, ipst); 9045 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 9046 dir->dir_sdeprecated = 9047 (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; 9048 ipif_refrele(src_ipif); 9049 next_dst: 9050 ire_refrele(ire); 9051 } 9052 miocack(q, mp, iocp->ioc_count, 0); 9053 } 9054 9055 /* 9056 * Check if this is an address assigned to this machine. 9057 * Skips interfaces that are down by using ire checks. 9058 * Translates mapped addresses to v4 addresses and then 9059 * treats them as such, returning true if the v4 address 9060 * associated with this mapped address is configured. 
9061 * Note: Applications will have to be careful what they do 9062 * with the response; use of mapped addresses limits 9063 * what can be done with the socket, especially with 9064 * respect to socket options and ioctls - neither IPv4 9065 * options nor IPv6 sticky options/ancillary data options 9066 * may be used. 9067 */ 9068 /* ARGSUSED */ 9069 int 9070 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9071 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9072 { 9073 struct sioc_addrreq *sia; 9074 sin_t *sin; 9075 ire_t *ire; 9076 mblk_t *mp1; 9077 zoneid_t zoneid; 9078 ip_stack_t *ipst; 9079 9080 ip1dbg(("ip_sioctl_tmyaddr")); 9081 9082 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9083 zoneid = Q_TO_CONN(q)->conn_zoneid; 9084 ipst = CONNQ_TO_IPST(q); 9085 9086 /* Existence verified in ip_wput_nondata */ 9087 mp1 = mp->b_cont->b_cont; 9088 sia = (struct sioc_addrreq *)mp1->b_rptr; 9089 sin = (sin_t *)&sia->sa_addr; 9090 switch (sin->sin_family) { 9091 case AF_INET6: { 9092 sin6_t *sin6 = (sin6_t *)sin; 9093 9094 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9095 ipaddr_t v4_addr; 9096 9097 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9098 v4_addr); 9099 ire = ire_ctable_lookup(v4_addr, 0, 9100 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9101 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9102 } else { 9103 in6_addr_t v6addr; 9104 9105 v6addr = sin6->sin6_addr; 9106 ire = ire_ctable_lookup_v6(&v6addr, 0, 9107 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9108 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9109 } 9110 break; 9111 } 9112 case AF_INET: { 9113 ipaddr_t v4addr; 9114 9115 v4addr = sin->sin_addr.s_addr; 9116 ire = ire_ctable_lookup(v4addr, 0, 9117 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 9118 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); 9119 break; 9120 } 9121 default: 9122 return (EAFNOSUPPORT); 9123 } 9124 if (ire != NULL) { 9125 sia->sa_res = 1; 9126 ire_refrele(ire); 9127 } else { 9128 sia->sa_res = 0; 9129 } 
9130 return (0); 9131 } 9132 9133 /* 9134 * Check if this is an address assigned on-link i.e. neighbor, 9135 * and makes sure it's reachable from the current zone. 9136 * Returns true for my addresses as well. 9137 * Translates mapped addresses to v4 addresses and then 9138 * treats them as such, returning true if the v4 address 9139 * associated with this mapped address is configured. 9140 * Note: Applications will have to be careful what they do 9141 * with the response; use of mapped addresses limits 9142 * what can be done with the socket, especially with 9143 * respect to socket options and ioctls - neither IPv4 9144 * options nor IPv6 sticky options/ancillary data options 9145 * may be used. 9146 */ 9147 /* ARGSUSED */ 9148 int 9149 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9150 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq) 9151 { 9152 struct sioc_addrreq *sia; 9153 sin_t *sin; 9154 mblk_t *mp1; 9155 ire_t *ire = NULL; 9156 zoneid_t zoneid; 9157 ip_stack_t *ipst; 9158 9159 ip1dbg(("ip_sioctl_tonlink")); 9160 9161 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 9162 zoneid = Q_TO_CONN(q)->conn_zoneid; 9163 ipst = CONNQ_TO_IPST(q); 9164 9165 /* Existence verified in ip_wput_nondata */ 9166 mp1 = mp->b_cont->b_cont; 9167 sia = (struct sioc_addrreq *)mp1->b_rptr; 9168 sin = (sin_t *)&sia->sa_addr; 9169 9170 /* 9171 * Match addresses with a zero gateway field to avoid 9172 * routes going through a router. 9173 * Exclude broadcast and multicast addresses. 
9174 */ 9175 switch (sin->sin_family) { 9176 case AF_INET6: { 9177 sin6_t *sin6 = (sin6_t *)sin; 9178 9179 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 9180 ipaddr_t v4_addr; 9181 9182 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 9183 v4_addr); 9184 if (!CLASSD(v4_addr)) { 9185 ire = ire_route_lookup(v4_addr, 0, 0, 0, 9186 NULL, NULL, zoneid, NULL, 9187 MATCH_IRE_GW, ipst); 9188 } 9189 } else { 9190 in6_addr_t v6addr; 9191 in6_addr_t v6gw; 9192 9193 v6addr = sin6->sin6_addr; 9194 v6gw = ipv6_all_zeros; 9195 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { 9196 ire = ire_route_lookup_v6(&v6addr, 0, 9197 &v6gw, 0, NULL, NULL, zoneid, 9198 NULL, MATCH_IRE_GW, ipst); 9199 } 9200 } 9201 break; 9202 } 9203 case AF_INET: { 9204 ipaddr_t v4addr; 9205 9206 v4addr = sin->sin_addr.s_addr; 9207 if (!CLASSD(v4addr)) { 9208 ire = ire_route_lookup(v4addr, 0, 0, 0, 9209 NULL, NULL, zoneid, NULL, 9210 MATCH_IRE_GW, ipst); 9211 } 9212 break; 9213 } 9214 default: 9215 return (EAFNOSUPPORT); 9216 } 9217 sia->sa_res = 0; 9218 if (ire != NULL) { 9219 if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| 9220 IRE_LOCAL|IRE_LOOPBACK)) { 9221 sia->sa_res = 1; 9222 } 9223 ire_refrele(ire); 9224 } 9225 return (0); 9226 } 9227 9228 /* 9229 * TBD: implement when kernel maintaines a list of site prefixes. 
9230 */ 9231 /* ARGSUSED */ 9232 int 9233 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9234 ip_ioctl_cmd_t *ipip, void *ifreq) 9235 { 9236 return (ENXIO); 9237 } 9238 9239 /* ARGSUSED */ 9240 int 9241 ip_sioctl_tunparam(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 9242 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 9243 { 9244 ill_t *ill; 9245 mblk_t *mp1; 9246 conn_t *connp; 9247 boolean_t success; 9248 9249 ip1dbg(("ip_sioctl_tunparam(%s:%u %p)\n", 9250 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9251 /* ioctl comes down on an conn */ 9252 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9253 connp = Q_TO_CONN(q); 9254 9255 mp->b_datap->db_type = M_IOCTL; 9256 9257 /* 9258 * Send down a copy. (copymsg does not copy b_next/b_prev). 9259 * The original mp contains contaminated b_next values due to 'mi', 9260 * which is needed to do the mi_copy_done. Unfortunately if we 9261 * send down the original mblk itself and if we are popped due to an 9262 * an unplumb before the response comes back from tunnel, 9263 * the streamhead (which does a freemsg) will see this contaminated 9264 * message and the assertion in freemsg about non-null b_next/b_prev 9265 * will panic a DEBUG kernel. 9266 */ 9267 mp1 = copymsg(mp); 9268 if (mp1 == NULL) 9269 return (ENOMEM); 9270 9271 ill = ipif->ipif_ill; 9272 mutex_enter(&connp->conn_lock); 9273 mutex_enter(&ill->ill_lock); 9274 if (ipip->ipi_cmd == SIOCSTUNPARAM || ipip->ipi_cmd == OSIOCSTUNPARAM) { 9275 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), 9276 mp, 0); 9277 } else { 9278 success = ill_pending_mp_add(ill, connp, mp); 9279 } 9280 mutex_exit(&ill->ill_lock); 9281 mutex_exit(&connp->conn_lock); 9282 9283 if (success) { 9284 ip1dbg(("sending down tunparam request ")); 9285 putnext(ill->ill_wq, mp1); 9286 return (EINPROGRESS); 9287 } else { 9288 /* The conn has started closing */ 9289 freemsg(mp1); 9290 return (EINTR); 9291 } 9292 } 9293 9294 /* 9295 * ARP IOCTLs. 
 * How does IP get in the business of fronting ARP configuration/queries?
 * Well it's like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP)
 * are by tradition passed in through a datagram socket.  That lands in IP.
 * As it happens, this is just as well since the interface is quite crude in
 * that it passes in no information about protocol or hardware types, or
 * interface association.  After making the protocol assumption, IP is in
 * the position to look up the name of the ILL, which ARP will need, and
 * format a request that can be handled by ARP.  The request is passed up
 * stream to ARP, and the original IOCTL is completed by IP when ARP passes
 * back a response.  ARP supports its own set of more general IOCTLs, in
 * case anyone is interested.
 *
 * Return values, as produced by the body below:
 *	EINPROGRESS	the request was handed to ARP; the ioctl completes
 *			later in ip_sioctl_iocack().
 *	0		permanent proxy ARP add on an IPMP ill for which no
 *			proxy ill could be chosen; the entry is only tracked.
 *	EINVAL		hardware address length (or name + address length for
 *			SIOCSXARP) does not fit the request.
 *	EPERM		request not permitted by the IPMP rules described in
 *			the function body.
 *	ENOMEM		an mblk or IPMP ARP entry could not be allocated.
 */
/* ARGSUSED */
int
ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	mblk_t		*mp1;
	mblk_t		*mp2;
	mblk_t		*pending_mp;
	ipaddr_t	ipaddr;
	area_t		*area;
	struct iocblk	*iocp;
	conn_t		*connp;
	struct arpreq	*ar;
	struct xarpreq	*xar;
	int		flags, alength;
	uchar_t		*lladdr;
	ire_t		*ire;
	ip_stack_t	*ipst;
	ill_t		*ill = ipif->ipif_ill;
	ill_t		*proxy_ill = NULL;
	ipmp_arpent_t	*entp = NULL;
	boolean_t	if_arp_ioctl = B_FALSE;
	boolean_t	proxyarp = B_FALSE;

	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	ipst = connp->conn_netstack->netstack_ip;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
		xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
		ar = NULL;

		flags = xar->xarp_flags;
		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
		/* A non-empty name in the sockaddr_dl selects a specific ill */
		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
		/*
		 * Validate against user's link layer address length
		 * input and name and addr length limits.
		 */
		alength = ill->ill_phys_addr_length;
		if (ipip->ipi_cmd == SIOCSXARP) {
			if (alength != xar->xarp_ha.sdl_alen ||
			    (alength + xar->xarp_ha.sdl_nlen >
			    sizeof (xar->xarp_ha.sdl_data)))
				return (EINVAL);
		}
	} else {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
		ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
		xar = NULL;

		flags = ar->arp_flags;
		lladdr = (uchar_t *)ar->arp_ha.sa_data;
		/*
		 * Theoretically, the sa_family could tell us what link
		 * layer type this operation is trying to deal with.  By
		 * common usage AF_UNSPEC means ethernet.  We'll assume
		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
		 * for now.  Our new SIOC*XARP ioctls can be used more
		 * generally.
		 *
		 * If the underlying media happens to have a non 6 byte
		 * address, arp module will fail set/get, but the del
		 * operation will succeed.
		 */
		alength = 6;
		if ((ipip->ipi_cmd != SIOCDARP) &&
		    (alength != ill->ill_phys_addr_length)) {
			return (EINVAL);
		}
	}

	ipaddr = sin->sin_addr.s_addr;

	/*
	 * IPMP ARP special handling:
	 *
	 * 1. Since ARP mappings must appear consistent across the group,
	 *    prohibit changing ARP mappings on the underlying interfaces.
	 *
	 * 2. Since ARP mappings for IPMP data addresses are maintained by
	 *    IP itself, prohibit changing them.
	 *
	 * 3. For proxy ARP, use a functioning hardware address in the group,
	 *    provided one exists.  If one doesn't, just add the entry as-is;
	 *    ipmp_illgrp_refresh_arpent() will refresh it if things change.
	 */
	if (IS_UNDER_IPMP(ill)) {
		/* Rule 1: only the "get" commands are allowed here */
		if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
			return (EPERM);
	}
	if (IS_IPMP(ill)) {
		ipmp_illgrp_t *illg = ill->ill_grp;

		switch (ipip->ipi_cmd) {
		case SIOCSARP:
		case SIOCSXARP:
			/*
			 * Rule 3: if the supplied hardware address matches an
			 * ill in the group, this is a proxy ARP entry.
			 */
			proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
			if (proxy_ill != NULL) {
				proxyarp = B_TRUE;
				if (!ipmp_ill_is_active(proxy_ill))
					proxy_ill = ipmp_illgrp_next_ill(illg);
				if (proxy_ill != NULL)
					lladdr = proxy_ill->ill_phys_addr;
			}
			/* FALLTHRU */
		case SIOCDARP:
		case SIOCDXARP:
			/* Rule 2: refuse if the address has an IRE_LOCAL */
			ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL,
			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
			if (ire != NULL) {
				ire_refrele(ire);
				return (EPERM);
			}
		}
	}

	/*
	 * We are going to pass up to ARP a packet chain that looks
	 * like:
	 *
	 * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
	 *
	 * Get a copy of the original IOCTL mblk to head the chain,
	 * to be sent up (in mp1).  Also get another copy to store
	 * in the ill_pending_mp list, for matching the response
	 * when it comes back from ARP.
	 */
	mp1 = copyb(mp);
	pending_mp = copymsg(mp);
	if (mp1 == NULL || pending_mp == NULL) {
		if (mp1 != NULL)
			freeb(mp1);
		if (pending_mp != NULL)
			inet_freemsg(pending_mp);
		return (ENOMEM);
	}

	mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
	    (caddr_t)&ipaddr);
	if (mp2 == NULL) {
		freeb(mp1);
		inet_freemsg(pending_mp);
		return (ENOMEM);
	}
	/* Put together the chain. */
	mp1->b_cont = mp2;
	mp1->b_datap->db_type = M_IOCTL;
	mp2->b_cont = mp;
	mp2->b_datap->db_type = M_DATA;

	iocp = (struct iocblk *)mp1->b_rptr;

	/*
	 * An M_IOCDATA's payload (struct copyresp) is mostly the same as an
	 * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a
	 * cp_private field (or cp_rval on 32-bit systems) in place of the
	 * ioc_count field; set ioc_count to be correct.
	 */
	iocp->ioc_count = MBLKL(mp1->b_cont);

	/*
	 * Set the proper command in the ARP message.
	 * Convert the SIOC{G|S|D}ARP calls into our
	 * AR_ENTRY_xxx calls.
	 */
	area = (area_t *)mp2->b_rptr;
	switch (iocp->ioc_cmd) {
	case SIOCDARP:
	case SIOCDXARP:
		/*
		 * We defer deleting the corresponding IRE until
		 * we return from arp.
		 */
		area->area_cmd = AR_ENTRY_DELETE;
		area->area_proto_mask_offset = 0;
		break;
	case SIOCGARP:
	case SIOCGXARP:
		area->area_cmd = AR_ENTRY_SQUERY;
		area->area_proto_mask_offset = 0;
		break;
	case SIOCSARP:
	case SIOCSXARP:
		/*
		 * Delete the corresponding ire to make sure IP will
		 * pick up any change from arp.  area_cmd is left as
		 * initialized by ill_arp_alloc() from ip_area_template.
		 */
		if (!if_arp_ioctl) {
			(void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst);
		} else {
			/* NOTE: this local shadows the `ipif' parameter */
			ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
			if (ipif != NULL) {
				(void) ip_ire_clookup_and_delete(ipaddr, ipif,
				    ipst);
				ipif_refrele(ipif);
			}
		}
		break;
	}
	/* The message sent to ARP carries the AR_ENTRY_xxx command */
	iocp->ioc_cmd = area->area_cmd;

	/*
	 * Fill in the rest of the ARP operation fields.
	 */
	area->area_hw_addr_length = alength;
	bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength);

	/* Translate the flags. */
	if (flags & ATF_PERM)
		area->area_flags |= ACE_F_PERMANENT;
	if (flags & ATF_PUBL)
		area->area_flags |= ACE_F_PUBLISH;
	if (flags & ATF_AUTHORITY)
		area->area_flags |= ACE_F_AUTHORITY;

	/*
	 * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it
	 * so that IP can update ARP as the active ills in the group change.
	 */
	if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD &&
	    (area->area_flags & ACE_F_PERMANENT)) {
		entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp);

		/*
		 * The second part of the conditional below handles a corner
		 * case: if this is proxy ARP and the IPMP group has no active
		 * interfaces, we can't send the request to ARP now since it
		 * won't be able to build an ACE.  So we return success and
		 * notify ARP about the proxy ARP entry once an interface
		 * becomes active.
		 */
		if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
			/*
			 * Detach the caller's `mp' from the chain before
			 * freeing mp1/mp2 so the original ioctl is not freed.
			 */
			mp2->b_cont = NULL;
			inet_freemsg(mp1);
			inet_freemsg(pending_mp);
			return (entp == NULL ? ENOMEM : 0);
		}
	}

	/*
	 * Before sending 'mp' to ARP, we have to clear the b_next
	 * and b_prev.  Otherwise if STREAMS encounters such a message
	 * in freemsg(), (because ARP can close any time) it can cause
	 * a panic.  But mi code needs the b_next and b_prev values of
	 * mp->b_cont, to complete the ioctl.  So we store it here
	 * in pending_mp->b_cont, and restore it in ip_sioctl_iocack()
	 * when the response comes down from ARP.
	 */
	pending_mp->b_cont->b_next = mp->b_cont->b_next;
	pending_mp->b_cont->b_prev = mp->b_cont->b_prev;
	mp->b_cont->b_next = NULL;
	mp->b_cont->b_prev = NULL;

	mutex_enter(&connp->conn_lock);
	mutex_enter(&ill->ill_lock);
	/* conn has not yet started closing, hence this can't fail */
	if (ipip->ipi_flags & IPI_WR) {
		VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp),
		    pending_mp, 0) != 0);
	} else {
		VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0);
	}
	mutex_exit(&ill->ill_lock);
	mutex_exit(&connp->conn_lock);

	/*
	 * Up to ARP it goes.  The response will come back in ip_wput() as an
	 * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion.
	 */
	putnext(ill->ill_rq, mp1);

	/*
	 * If we created an IPMP ARP entry, mark that we've notified ARP.
	 */
	if (entp != NULL)
		ipmp_illgrp_mark_arpent(ill->ill_grp, entp);

	return (EINPROGRESS);
}

/*
 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
 * the associated sin and refhold and return the associated ipif via `ci'.
9593 */ 9594 int 9595 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 9596 cmd_info_t *ci, ipsq_func_t func) 9597 { 9598 mblk_t *mp1; 9599 int err; 9600 sin_t *sin; 9601 conn_t *connp; 9602 ipif_t *ipif; 9603 ire_t *ire = NULL; 9604 ill_t *ill = NULL; 9605 boolean_t exists; 9606 ip_stack_t *ipst; 9607 struct arpreq *ar; 9608 struct xarpreq *xar; 9609 struct sockaddr_dl *sdl; 9610 9611 /* ioctl comes down on a conn */ 9612 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); 9613 connp = Q_TO_CONN(q); 9614 if (connp->conn_af_isv6) 9615 return (ENXIO); 9616 9617 ipst = connp->conn_netstack->netstack_ip; 9618 9619 /* Verified in ip_wput_nondata */ 9620 mp1 = mp->b_cont->b_cont; 9621 9622 if (ipip->ipi_cmd_type == XARP_CMD) { 9623 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq)); 9624 xar = (struct xarpreq *)mp1->b_rptr; 9625 sin = (sin_t *)&xar->xarp_pa; 9626 sdl = &xar->xarp_ha; 9627 9628 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET) 9629 return (ENXIO); 9630 if (sdl->sdl_nlen >= LIFNAMSIZ) 9631 return (EINVAL); 9632 } else { 9633 ASSERT(ipip->ipi_cmd_type == ARP_CMD); 9634 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq)); 9635 ar = (struct arpreq *)mp1->b_rptr; 9636 sin = (sin_t *)&ar->arp_pa; 9637 } 9638 9639 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { 9640 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, 9641 B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp), 9642 mp, func, &err, ipst); 9643 if (ipif == NULL) 9644 return (err); 9645 if (ipif->ipif_id != 0) { 9646 ipif_refrele(ipif); 9647 return (ENXIO); 9648 } 9649 } else { 9650 /* 9651 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen 9652 * of 0: use the IP address to find the ipif. If the IP 9653 * address is an IPMP test address, ire_ftable_lookup() will 9654 * find the wrong ill, so we first do an ipif_lookup_addr(). 
9655 */ 9656 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, 9657 CONNP_TO_WQ(connp), mp, func, &err, ipst); 9658 if (ipif == NULL) { 9659 ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, 9660 IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, 9661 MATCH_IRE_TYPE, ipst); 9662 if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { 9663 if (ire != NULL) 9664 ire_refrele(ire); 9665 return (ENXIO); 9666 } 9667 ipif = ill->ill_ipif; 9668 ipif_refhold(ipif); 9669 ire_refrele(ire); 9670 } 9671 } 9672 9673 if (ipif->ipif_net_type != IRE_IF_RESOLVER) { 9674 ipif_refrele(ipif); 9675 return (ENXIO); 9676 } 9677 9678 ci->ci_sin = sin; 9679 ci->ci_ipif = ipif; 9680 return (0); 9681 } 9682 9683 /* 9684 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the 9685 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is 9686 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it 9687 * up and thus an ill can join that illgrp. 9688 * 9689 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than 9690 * open()/close() primarily because close() is not allowed to fail or block 9691 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason 9692 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure 9693 * symmetric behavior (e.g., doing an I_PLINK after and I_PUNLINK undoes the 9694 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts 9695 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent 9696 * state if I_UNLINK didn't occur. 9697 * 9698 * Note that for each plumb/unplumb operation, we may end up here more than 9699 * once because of the way ifconfig works. However, it's OK to link the same 9700 * illgrp more than once, or unlink an illgrp that's already unlinked. 
9701 */ 9702 static int 9703 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) 9704 { 9705 int err; 9706 ip_stack_t *ipst = ill->ill_ipst; 9707 9708 ASSERT(IS_IPMP(ill)); 9709 ASSERT(IAM_WRITER_ILL(ill)); 9710 9711 switch (ioccmd) { 9712 case I_LINK: 9713 return (ENOTSUP); 9714 9715 case I_PLINK: 9716 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 9717 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp); 9718 rw_exit(&ipst->ips_ipmp_lock); 9719 break; 9720 9721 case I_PUNLINK: 9722 /* 9723 * Require all UP ipifs be brought down prior to unlinking the 9724 * illgrp so any associated IREs (and other state) is torched. 9725 */ 9726 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 9727 return (EBUSY); 9728 9729 /* 9730 * NOTE: We hold ipmp_lock across the unlink to prevent a race 9731 * with an SIOCSLIFGROUPNAME request from an ill trying to 9732 * join this group. Specifically: ills trying to join grab 9733 * ipmp_lock and bump a "pending join" counter checked by 9734 * ipmp_illgrp_unlink_grp(). During the unlink no new pending 9735 * joins can occur (since we have ipmp_lock). Once we drop 9736 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not 9737 * find the illgrp (since we unlinked it) and will return 9738 * EAFNOSUPPORT. This will then take them back through the 9739 * IPMP meta-interface plumbing logic in ifconfig, and thus 9740 * back through I_PLINK above. 9741 */ 9742 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 9743 err = ipmp_illgrp_unlink_grp(ill->ill_grp); 9744 rw_exit(&ipst->ips_ipmp_lock); 9745 return (err); 9746 default: 9747 break; 9748 } 9749 return (0); 9750 } 9751 9752 /* 9753 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also 9754 * atomically set/clear the muxids. Also complete the ioctl by acking or 9755 * naking it. Note that the code is structured such that the link type, 9756 * whether it's persistent or not, is treated equally. 
 * ifconfig(1M) and
 * its clones use the persistent link, while pppd(1M) and perhaps many
 * other daemons may use non-persistent link.  When combined with some
 * ill_t states, linking and unlinking lower streams may be used as
 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
 *
 * If the operation has to wait (EINPROGRESS from a lookup or from becoming
 * exclusive on the ipsq), this function returns without acking or naking;
 * it is passed as the callback so that it is re-invoked later.
 */
/* ARGSUSED */
void
ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	mblk_t		*mp1, *mp2;
	struct linkblk	*li;
	struct ipmx_s	*ipmxp;
	ill_t		*ill;
	int		ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
	int		err = 0;
	boolean_t	entered_ipsq = B_FALSE;
	boolean_t	islink;
	ip_stack_t	*ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
	    ioccmd == I_LINK || ioccmd == I_UNLINK);

	islink = (ioccmd == I_PLINK || ioccmd == I_LINK);

	mp1 = mp->b_cont;	/* This is the linkblk info */
	li = (struct linkblk *)mp1->b_rptr;

	/*
	 * ARP has added this special mblk, and the utility is asking us
	 * to perform consistency checks, and also atomically set the
	 * muxid.  Ifconfig is an example.  It achieves this by using
	 * /dev/arp as the mux to plink the arp stream, and pushes arp on
	 * to /dev/udp[6] stream for use as the mux when plinking the IP
	 * stream.  SIOCSLIFMUXID is not required.  See ifconfig.c, arp.c
	 * and other comments in this routine for more details.
	 */
	mp2 = mp1->b_cont;	/* This is added by ARP */

	/*
	 * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than
	 * ifconfig which didn't push ARP on top of the dummy mux, we won't
	 * get the special mblk above.  For backward compatibility, we
	 * request ip_sioctl_plink_ipmod() to skip the consistency checks.
	 * The utility will use SIOCSLIFMUXID to store the muxids.  This is
	 * not atomic, and can leave the streams unplumbable if the utility
	 * is interrupted before it does the SIOCSLIFMUXID.
	 */
	if (mp2 == NULL) {
		err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE);
		if (err == EINPROGRESS)
			return;
		goto done;
	}

	/*
	 * This is an I_{P}LINK sent down by ifconfig through the ARP module;
	 * ARP has appended this last mblk to tell us whether the lower stream
	 * is an arp-dev stream or an IP module stream.
	 */
	ipmxp = (struct ipmx_s *)mp2->b_rptr;
	if (ipmxp->ipmx_arpdev_stream) {
		/*
		 * The lower stream is the arp-dev stream.
		 */
		ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE,
		    q, mp, ip_sioctl_plink, &err, NULL, ipst);
		if (ill == NULL) {
			if (err == EINPROGRESS)
				return;
			err = EINVAL;
			goto done;
		}

		if (ipsq == NULL) {
			ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
			    NEW_OP, B_FALSE);
			if (ipsq == NULL) {
				ill_refrele(ill);
				return;
			}
			entered_ipsq = B_TRUE;
		}
		ASSERT(IAM_WRITER_ILL(ill));
		/* `ill' continues to be used below after this release */
		ill_refrele(ill);

		/*
		 * To ensure consistency between IP and ARP, the following
		 * LIFO scheme is used in plink/punlink.  (IP first, ARP last).
		 * This is because the muxid's are stored in the IP stream on
		 * the ill.
		 *
		 * I_{P}LINK: ifconfig plinks the IP stream before plinking
		 * the ARP stream.  On an arp-dev stream, IP checks that it is
		 * not yet plinked, and it also checks that the corresponding
		 * IP stream is already plinked.
		 *
		 * I_{P}UNLINK: ifconfig punlinks the ARP stream before
		 * punlinking the IP stream.  IP does not allow punlink of the
		 * IP stream unless the arp stream has been punlinked.
		 */
		if ((islink &&
		    (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) ||
		    (!islink && ill->ill_arp_muxid != li->l_index)) {
			err = EINVAL;
			goto done;
		}

		if (IS_IPMP(ill) &&
		    (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
			goto done;

		ill->ill_arp_muxid = islink ? li->l_index : 0;
	} else {
		/*
		 * The lower stream is probably an IP module stream.  Do
		 * consistency checking.
		 */
		err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE);
		if (err == EINPROGRESS)
			return;
	}
done:
	if (err == 0)
		miocack(q, mp, 0, 0);
	else
		miocnak(q, mp, 0, err);

	/* Conn was refheld in ip_sioctl_copyin_setup */
	if (CONN_Q(q))
		CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
	/* Only leave the ipsq if this invocation entered it */
	if (entered_ipsq)
		ipsq_exit(ipsq);
}

/*
 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP
 * module stream).  If `doconsist' is set, then do the extended consistency
 * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here.
 * Returns zero on success, EINPROGRESS if the operation is still pending, or
 * an error code on failure.
 */
static int
ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
    struct linkblk *li, boolean_t doconsist)
{
	int		err = 0;
	ill_t		*ill;
	queue_t		*ipwq, *dwq;
	const char	*name;
	struct qinit	*qinfo;
	boolean_t	islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
	boolean_t	entered_ipsq = B_FALSE;

	/*
	 * Walk the lower stream to verify it's the IP module stream.
	 * The IP module is identified by its name, wput function,
	 * and non-NULL q_next.  STREAMS ensures that the lower stream
	 * (li->l_qbot) will not vanish until this ioctl completes.
	 */
	for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
		qinfo = ipwq->q_qinfo;
		name = qinfo->qi_minfo->mi_idname;
		if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
		    qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
			break;
		}
	}

	/*
	 * If this isn't an IP module stream, bail.
	 */
	if (ipwq == NULL)
		return (0);

	ill = ipwq->q_ptr;
	ASSERT(ill != NULL);

	if (ipsq == NULL) {
		/* Callback is ip_sioctl_plink: it restarts the whole ioctl */
		ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
		    NEW_OP, B_FALSE);
		if (ipsq == NULL)
			return (EINPROGRESS);
		entered_ipsq = B_TRUE;
	}
	ASSERT(IAM_WRITER_ILL(ill));

	if (doconsist) {
		/*
		 * Consistency checking requires that I_{P}LINK occurs
		 * prior to setting ill_ip_muxid, and that I_{P}UNLINK
		 * occurs prior to clearing ill_arp_muxid.
		 */
		if ((islink && ill->ill_ip_muxid != 0) ||
		    (!islink && ill->ill_arp_muxid != 0)) {
			err = EINVAL;
			goto done;
		}
	}

	if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
		goto done;

	/*
	 * As part of I_{P}LINKing, stash the number of downstream modules and
	 * the read queue of the module immediately below IP in the ill.
	 * These are used during the capability negotiation below.
	 * On unlink both fields are simply reset.
	 */
	ill->ill_lmod_rq = NULL;
	ill->ill_lmod_cnt = 0;
	if (islink && ((dwq = ipwq->q_next) != NULL)) {
		ill->ill_lmod_rq = RD(dwq);
		for (; dwq != NULL; dwq = dwq->q_next)
			ill->ill_lmod_cnt++;
	}

	if (doconsist)
		ill->ill_ip_muxid = islink ? li->l_index : 0;

	/*
	 * Mark the ipsq busy until the capability operations initiated below
	 * complete.  The PLINK/UNLINK ioctl itself completes when our caller
	 * returns, but the capability operation may complete asynchronously
	 * much later.
	 */
	ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
	/*
	 * If there's at least one up ipif on this ill, then we're bound to
	 * the underlying driver via DLPI.  In that case, renegotiate
	 * capabilities to account for any possible change in modules
	 * interposed between IP and the driver.
	 */
	if (ill->ill_ipif_up_count > 0) {
		if (islink)
			ill_capability_probe(ill);
		else
			ill_capability_reset(ill, B_FALSE);
	}
	ipsq_current_finish(ipsq);
done:
	if (entered_ipsq)
		ipsq_exit(ipsq);

	return (err);
}

/*
 * Search the ioctl command in the ioctl tables and return a pointer
 * to the ioctl command information.  The ioctl command tables are
 * static and fully populated at compile time.  Returns NULL if `ioc_cmd'
 * is IPI_DONTCARE or is not present in either table.
 */
ip_ioctl_cmd_t *
ip_sioctl_lookup(int ioc_cmd)
{
	int index;
	ip_ioctl_cmd_t *ipip;
	ip_ioctl_cmd_t *ipip_end;

	if (ioc_cmd == IPI_DONTCARE)
		return (NULL);

	/*
	 * Do a 2 step search.  First search the indexed table
	 * based on the least significant byte of the ioctl cmd.
	 * If we don't find a match, then search the misc table
	 * serially.
	 */
	index = ioc_cmd & 0xFF;
	if (index < ip_ndx_ioctl_count) {
		ipip = &ip_ndx_ioctl_table[index];
		if (ipip->ipi_cmd == ioc_cmd) {
			/* Found a match in the ndx table */
			return (ipip);
		}
	}

	/* Search the misc table */
	ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
	for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
		if (ipip->ipi_cmd == ioc_cmd)
			/* Found a match in the misc table */
			return (ipip);
	}

	return (NULL);
}

/*
 * Wrapper function for resuming deferred ioctl processing
 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
 * It simply re-runs ip_sioctl_copyin_setup() on the saved message.
 */
/* ARGSUSED */
void
ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
    void *dummy_arg)
{
	ip_sioctl_copyin_setup(q, mp);
}

/*
 * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message
 * that arrives.  Most of the IOCTLs are "socket" IOCTLs which we handle
 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
 * We establish here the size of the block to be copied in.  mi_copyin
 * arranges for this to happen, and processing continues in ip_wput with
 * an M_IOCDATA message.
 */
void
ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
{
	int	copyin_size;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_ioctl_cmd_t *ipip;
	cred_t *cr;
	ip_stack_t	*ipst;

	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
	if (ipip == NULL) {
		/*
		 * The ioctl is not one we understand or own.
		 * Pass it along to be processed down stream,
		 * if this is a module instance of IP, else nak
		 * the ioctl.
		 */
		if (q->q_next == NULL) {
			goto nak;
		} else {
			putnext(q, mp);
			return;
		}
	}

	/*
	 * If this is deferred, then we will do all the checks when we
	 * come back.
	 */
	if ((iocp->ioc_cmd == SIOCGDSTINFO ||
	    iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
		ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
		return;
	}

	/*
	 * Only allow a very small subset of IP ioctls on this stream if
	 * IP is a module and not a driver.  Allowing ioctls to be processed
	 * in this case may cause assert failures or data corruption.
	 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
	 * ioctls allowed on an IP module stream, after which this stream
	 * normally becomes a multiplexor (at which time the stream head
	 * will fail all ioctls).
	 */
	if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
		if (ipip->ipi_flags & IPI_PASS_DOWN) {
			/*
			 * Pass common Streams ioctls which the IP
			 * module does not own or consume along to
			 * be processed down stream.
			 */
			putnext(q, mp);
			return;
		} else {
			goto nak;
		}
	}

	/* Make sure we have ioctl data to process. */
	if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
		goto nak;

	/*
	 * Prefer dblk credential over ioctl credential; some synthesized
	 * ioctls have kcred set because there's no way to crhold()
	 * a credential in some contexts.  (ioc_cr is not crfree() by
	 * the framework; the caller of ioctl needs to hold the reference
	 * for the duration of the call).
	 */
	cr = msg_getcred(mp, NULL);
	if (cr == NULL)
		cr = iocp->ioc_cr;

	/* Make sure normal users don't send down privileged ioctls */
	if ((ipip->ipi_flags & IPI_PRIV) &&
	    (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
		/* We checked the privilege earlier but log it here */
		miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
		return;
	}

	/*
	 * The ioctl command tables can only encode fixed length
	 * ioctl data.  If the length is variable, the table will
	 * encode the length as zero.  Such special cases are handled
	 * below in the switch.
	 */
	if (ipip->ipi_copyin_size != 0) {
		mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
		return;
	}

	switch (iocp->ioc_cmd) {
	case O_SIOCGIFCONF:
	case SIOCGIFCONF:
		/*
		 * This IOCTL is hilarious.  See comments in
		 * ip_sioctl_get_ifconf for the story.
		 */
		if (iocp->ioc_count == TRANSPARENT)
			copyin_size = SIZEOF_STRUCT(ifconf,
			    iocp->ioc_flag);
		else
			copyin_size = iocp->ioc_count;
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case O_SIOCGLIFCONF:
	case SIOCGLIFCONF:
		copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case SIOCGLIFSRCOF:
		copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
		mi_copyin(q, mp, NULL, copyin_size);
		return;
	case SIOCGIP6ADDRPOLICY:
		ip_sioctl_ip6addrpolicy(q, mp);
		/*
		 * Pairs with the successful ip6_asp_can_lookup() above
		 * (presumably that call takes a table reference -- confirm
		 * against ip6_asp.c).
		 */
		ip6_asp_table_refrele(ipst);
		return;

	case SIOCSIP6ADDRPOLICY:
		ip_sioctl_ip6addrpolicy(q, mp);
		return;

	case SIOCGDSTINFO:
		ip_sioctl_dstinfo(q, mp);
		ip6_asp_table_refrele(ipst);
		return;

	case I_PLINK:
	case I_PUNLINK:
	case I_LINK:
	case I_UNLINK:
		/*
		 * We treat non-persistent link similarly as the persistent
		 * link case, in terms of plumbing/unplumbing, as well as
		 * dynamic re-plumbing events indicator.  See comments
		 * in ip_sioctl_plink() for more.
		 *
		 * Request can be enqueued in the 'ipsq' while waiting
		 * to become exclusive.  So bump up the conn ref.
		 */
		if (CONN_Q(q))
			CONN_INC_REF(Q_TO_CONN(q));
		ip_sioctl_plink(NULL, q, mp, NULL);
		return;

	case ND_GET:
	case ND_SET:
		/*
		 * Use of the nd table requires holding the reader lock.
		 * Modifying the nd table thru nd_load/nd_unload requires
		 * the writer lock.
		 */
		rw_enter(&ipst->ips_ip_g_nd_lock, RW_READER);
		if (nd_getset(q, ipst->ips_ip_g_nd, mp)) {
			rw_exit(&ipst->ips_ip_g_nd_lock);

			if (iocp->ioc_error)
				iocp->ioc_count = 0;
			mp->b_datap->db_type = M_IOCACK;
			qreply(q, mp);
			return;
		}
		rw_exit(&ipst->ips_ip_g_nd_lock);
		/*
		 * We don't understand this subioctl of ND_GET / ND_SET.
		 * Maybe intended for some driver / module below us
		 */
		if (q->q_next) {
			putnext(q, mp);
		} else {
			iocp->ioc_error = ENOENT;
			mp->b_datap->db_type = M_IOCNAK;
			iocp->ioc_count = 0;
			qreply(q, mp);
		}
		return;

	case IP_IOCTL:
		ip_wput_ioctl(q, mp);
		return;
	default:
		/*
		 * A table entry with a zero copyin size must have a case
		 * above; reaching here means the table and switch disagree.
		 */
		cmn_err(CE_PANIC, "should not happen ");
	}
nak:
	/* Reply with M_IOCNAK/EINVAL, discarding any attached data */
	if (mp->b_cont != NULL) {
		freemsg(mp->b_cont);
		mp->b_cont = NULL;
	}
	iocp->ioc_error = EINVAL;
	mp->b_datap->db_type = M_IOCNAK;
	iocp->ioc_count = 0;
	qreply(q, mp);
}

/*
 * ip_wput hands off ARP IOCTL responses to us.  `mp' is the response to the
 * chain built in ip_sioctl_arp(); this routine finds the matching pending
 * message, detaches the original ioctl from the ARP message, and completes
 * the original ioctl via ip_ioctl_finish().
 */
/* ARGSUSED3 */
void
ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	struct arpreq *ar;
	struct xarpreq *xar;
	area_t	*area;
	mblk_t	*area_mp;
	struct iocblk *iocp;
	mblk_t	*orig_ioc_mp, *tmp;
	struct iocblk	*orig_iocp;
	ill_t *ill;
	conn_t *connp = NULL;
	mblk_t *pending_mp;
	int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
	int *flagsp;
	char *storage = NULL;
	sin_t *sin;
	ipaddr_t addr;
	int err;
	ip_stack_t *ipst;

	ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
	ill = q->q_ptr;
	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	/*
	 * We should get back from ARP a packet chain that looks like:
	 * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
	 * Anything shorter is silently dropped.
	 */
	if (!(area_mp = mp->b_cont) ||
	    (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) ||
	    !(orig_ioc_mp = area_mp->b_cont) ||
	    !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) {
		freemsg(mp);
		return;
	}

	orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr;

	/* `tmp' is the [X]ARPREQ_MBLK at the tail of the chain */
	tmp = (orig_ioc_mp->b_cont)->b_cont;
	if ((orig_iocp->ioc_cmd == SIOCGXARP) ||
	    (orig_iocp->ioc_cmd == SIOCSXARP) ||
	    (orig_iocp->ioc_cmd == SIOCDXARP)) {
		x_arp_ioctl = B_TRUE;
		xar = (struct xarpreq *)tmp->b_rptr;
		sin = (sin_t *)&xar->xarp_pa;
		flagsp = &xar->xarp_flags;
		storage = xar->xarp_ha.sdl_data;
		/* A non-empty name means the request named an interface */
		if (xar->xarp_ha.sdl_nlen != 0)
			ifx_arp_ioctl = B_TRUE;
	} else {
		ar = (struct arpreq *)tmp->b_rptr;
		sin = (sin_t *)&ar->arp_pa;
		flagsp = &ar->arp_flags;
		storage = ar->arp_ha.sa_data;
	}

	iocp = (struct iocblk *)mp->b_rptr;

	/*
	 * Find the pending message; if we're exclusive, it'll be on our IPSQ.
	 * Otherwise, we can find it from our ioc_id.
	 */
	if (ipsq != NULL)
		pending_mp = ipsq_pending_mp_get(ipsq, &connp);
	else
		pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id);

	if (pending_mp == NULL) {
		ASSERT(connp == NULL);
		inet_freemsg(mp);
		return;
	}
	ASSERT(connp != NULL);
	/* From here on `q' is the conn's write queue, not the ill's queue */
	q = CONNP_TO_WQ(connp);

	/* Uncouple the internally generated IOCTL from the original one */
	area = (area_t *)area_mp->b_rptr;
	area_mp->b_cont = NULL;

	/*
	 * Restore the b_next and b_prev used by mi code.  This is needed
	 * to complete the ioctl using mi* functions.  We stored them in
	 * the pending mp prior to sending the request to ARP.
	 */
	orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
	orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
	inet_freemsg(pending_mp);

	/*
	 * We're done if there was an error or if this is not an SIOCG{X}ARP.
	 * Catch the case where there is an IRE_CACHE but no entry in the
	 * arp table.
	 */
	addr = sin->sin_addr.s_addr;
	if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) {
		ire_t			*ire;
		dl_unitdata_req_t	*dlup;
		mblk_t			*llmp;
		int			addr_len;
		ill_t			*ipsqill = NULL;

		if (ifx_arp_ioctl) {
			/*
			 * There's no need to lookup the ill, since
			 * we've already done that when we started
			 * processing the ioctl and sent the message
			 * to ARP on that ill.  So use the ill that
			 * is stored in q->q_ptr.
			 */
			ipsqill = ill;
			ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
			    ipsqill->ill_ipif, ALL_ZONES,
			    NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
		} else {
			ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
			if (ire != NULL)
				ipsqill = ire_to_ill(ire);
		}

		/* Advance `storage' by the value ill_xarp_info() returns */
		if ((x_arp_ioctl) && (ipsqill != NULL))
			storage += ill_xarp_info(&xar->xarp_ha, ipsqill);

		if (ire != NULL) {
			/*
			 * Since the ire obtained from cachetable is used for
			 * mac addr copying below, treat an incomplete ire as
			 * if we never found it.
			 */
			if (ire->ire_nce != NULL &&
			    ire->ire_nce->nce_state != ND_REACHABLE) {
				ire_refrele(ire);
				ire = NULL;
				ipsqill = NULL;
				goto errack;
			}
			*flagsp = ATF_INUSE;
			llmp = (ire->ire_nce != NULL ?
			    ire->ire_nce->nce_res_mp : NULL);
			if (llmp != NULL && ipsqill != NULL) {
				uchar_t *macaddr;

				addr_len = ipsqill->ill_phys_addr_length;
				/* Name plus address must fit in sdl_data */
				if (x_arp_ioctl && ((addr_len +
				    ipsqill->ill_name_length) >
				    sizeof (xar->xarp_ha.sdl_data))) {
					ire_refrele(ire);
					freemsg(mp);
					ip_ioctl_finish(q, orig_ioc_mp,
					    EINVAL, NO_COPYOUT, ipsq);
					return;
				}
				*flagsp |= ATF_COM;
				dlup = (dl_unitdata_req_t *)llmp->b_rptr;
				/*
				 * With a non-negative ill_sap_length the
				 * address is taken ill_sap_length bytes past
				 * dl_dest_addr_offset; otherwise it starts at
				 * dl_dest_addr_offset.
				 */
				if (ipsqill->ill_sap_length < 0)
					macaddr = llmp->b_rptr +
					    dlup->dl_dest_addr_offset;
				else
					macaddr = llmp->b_rptr +
					    dlup->dl_dest_addr_offset +
					    ipsqill->ill_sap_length;
				/*
				 * For SIOCGARP, MAC address length
				 * validation has already been done
				 * before the ioctl was issued to ARP to
				 * allow it to progress only on 6 byte
				 * addressable (ethernet like) media.  Thus
				 * the mac address copying can not overwrite
				 * the sa_data area below.
				 */
				bcopy(macaddr, storage, addr_len);
			}
			/* Ditch the internal IOCTL. */
			freemsg(mp);
			ire_refrele(ire);
			ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
			return;
		}
	}

	/*
	 * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE
	 * on the IPMP meta-interface, ensure any ARP entries added in
	 * ip_sioctl_arp() are deleted.
	 */
	if (IS_IPMP(ill) &&
	    ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) ||
	    ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) {
		ipmp_illgrp_t *illg = ill->ill_grp;
		ipmp_arpent_t *entp;

		if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);
	}

	/*
	 * Delete the corresponding IRE_CACHE if any.
	 * Reset the error if there was one (in case there was no entry
	 * in arp.)
	 */
	if (iocp->ioc_cmd == AR_ENTRY_DELETE) {
		ipif_t *ipintf = NULL;

		if (ifx_arp_ioctl) {
			/*
			 * There's no need to lookup the ill, since
			 * we've already done that when we started
			 * processing the ioctl and sent the message
			 * to ARP on that ill.  So use the ill that
			 * is stored in q->q_ptr.
			 */
			ipintf = ill->ill_ipif;
		}
		if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) {
			/*
			 * The address in "addr" may be an entry for a
			 * router.  If that's true, then any off-net
			 * IRE_CACHE entries that go through the router
			 * with address "addr" must be clobbered.  Use
			 * ire_walk to achieve this goal.
			 */
			if (ifx_arp_ioctl)
				ire_walk_ill_v4(MATCH_IRE_ILL, 0,
				    ire_delete_cache_gw, (char *)&addr, ill);
			else
				ire_walk_v4(ire_delete_cache_gw, (char *)&addr,
				    ALL_ZONES, ipst);
			iocp->ioc_error = 0;
		}
	}
errack:
	/* Everything except a successful SQUERY finishes without a copyout */
	if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
		err = iocp->ioc_error;
		freemsg(mp);
		ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq);
		return;
	}

	/*
	 * Completion of an SIOCG{X}ARP.  Translate the information from
	 * the area_t into the struct {x}arpreq.
	 */
	if (x_arp_ioctl) {
		storage += ill_xarp_info(&xar->xarp_ha, ill);
		if ((ill->ill_phys_addr_length + ill->ill_name_length) >
		    sizeof (xar->xarp_ha.sdl_data)) {
			freemsg(mp);
			ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT,
			    ipsq);
			return;
		}
	}
	*flagsp = ATF_INUSE;
	if (area->area_flags & ACE_F_PERMANENT)
		*flagsp |= ATF_PERM;
	if (area->area_flags & ACE_F_PUBLISH)
		*flagsp |= ATF_PUBL;
	if (area->area_flags & ACE_F_AUTHORITY)
		*flagsp |= ATF_AUTHORITY;
	if (area->area_hw_addr_length != 0) {
		*flagsp |= ATF_COM;
		/*
		 * For SIOCGARP, MAC address length validation has
		 * already been done before the ioctl was issued to ARP
		 * to allow it to progress only on 6 byte addressable
		 * (ethernet like) media.  Thus the mac address copying
		 * can not overwrite the sa_data area below.
		 */
		bcopy((char *)area + area->area_hw_addr_offset,
		    storage, area->area_hw_addr_length);
	}

	/* Ditch the internal IOCTL. */
	freemsg(mp);
	/* Complete the original. */
	ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
}

/*
 * Create a new logical interface.  If ipif_id is zero (i.e. not a logical
 * interface) create the next available logical interface for this
 * physical interface.
10559 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 10560 * ipif with the specified name. 10561 * 10562 * If the address family is not AF_UNSPEC then set the address as well. 10563 * 10564 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 10565 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 10566 * 10567 * Executed as a writer on the ill. 10568 * So no lock is needed to traverse the ipif chain, or examine the 10569 * phyint flags. 10570 */ 10571 /* ARGSUSED */ 10572 int 10573 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 10574 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10575 { 10576 mblk_t *mp1; 10577 struct lifreq *lifr; 10578 boolean_t isv6; 10579 boolean_t exists; 10580 char *name; 10581 char *endp; 10582 char *cp; 10583 int namelen; 10584 ipif_t *ipif; 10585 long id; 10586 ipsq_t *ipsq; 10587 ill_t *ill; 10588 sin_t *sin; 10589 int err = 0; 10590 boolean_t found_sep = B_FALSE; 10591 conn_t *connp; 10592 zoneid_t zoneid; 10593 ip_stack_t *ipst = CONNQ_TO_IPST(q); 10594 10595 ASSERT(q->q_next == NULL); 10596 ip1dbg(("ip_sioctl_addif\n")); 10597 /* Existence of mp1 has been checked in ip_wput_nondata */ 10598 mp1 = mp->b_cont->b_cont; 10599 /* 10600 * Null terminate the string to protect against buffer 10601 * overrun. String was generated by user code and may not 10602 * be trusted. 10603 */ 10604 lifr = (struct lifreq *)mp1->b_rptr; 10605 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 10606 name = lifr->lifr_name; 10607 ASSERT(CONN_Q(q)); 10608 connp = Q_TO_CONN(q); 10609 isv6 = connp->conn_af_isv6; 10610 zoneid = connp->conn_zoneid; 10611 namelen = mi_strlen(name); 10612 if (namelen == 0) 10613 return (EINVAL); 10614 10615 exists = B_FALSE; 10616 if ((namelen + 1 == sizeof (ipif_loopback_name)) && 10617 (mi_strcmp(name, ipif_loopback_name) == 0)) { 10618 /* 10619 * Allow creating lo0 using SIOCLIFADDIF. 10620 * can't be any other writer thread. 
So can pass null below 10621 * for the last 4 args to ipif_lookup_name. 10622 */ 10623 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, 10624 &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); 10625 /* Prevent any further action */ 10626 if (ipif == NULL) { 10627 return (ENOBUFS); 10628 } else if (!exists) { 10629 /* We created the ipif now and as writer */ 10630 ipif_refrele(ipif); 10631 return (0); 10632 } else { 10633 ill = ipif->ipif_ill; 10634 ill_refhold(ill); 10635 ipif_refrele(ipif); 10636 } 10637 } else { 10638 /* Look for a colon in the name. */ 10639 endp = &name[namelen]; 10640 for (cp = endp; --cp > name; ) { 10641 if (*cp == IPIF_SEPARATOR_CHAR) { 10642 found_sep = B_TRUE; 10643 /* 10644 * Reject any non-decimal aliases for plumbing 10645 * of logical interfaces. Aliases with leading 10646 * zeroes are also rejected as they introduce 10647 * ambiguity in the naming of the interfaces. 10648 * Comparing with "0" takes care of all such 10649 * cases. 10650 */ 10651 if ((strncmp("0", cp+1, 1)) == 0) 10652 return (EINVAL); 10653 10654 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 || 10655 id <= 0 || *endp != '\0') { 10656 return (EINVAL); 10657 } 10658 *cp = '\0'; 10659 break; 10660 } 10661 } 10662 ill = ill_lookup_on_name(name, B_FALSE, isv6, 10663 CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); 10664 if (found_sep) 10665 *cp = IPIF_SEPARATOR_CHAR; 10666 if (ill == NULL) 10667 return (err); 10668 } 10669 10670 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, 10671 B_TRUE); 10672 10673 /* 10674 * Release the refhold due to the lookup, now that we are excl 10675 * or we are just returning 10676 */ 10677 ill_refrele(ill); 10678 10679 if (ipsq == NULL) 10680 return (EINPROGRESS); 10681 10682 /* We are now exclusive on the IPSQ */ 10683 ASSERT(IAM_WRITER_ILL(ill)); 10684 10685 if (found_sep) { 10686 /* Now see if there is an IPIF with this unit number. 
*/ 10687 for (ipif = ill->ill_ipif; ipif != NULL; 10688 ipif = ipif->ipif_next) { 10689 if (ipif->ipif_id == id) { 10690 err = EEXIST; 10691 goto done; 10692 } 10693 } 10694 } 10695 10696 /* 10697 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use 10698 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name() 10699 * instead. 10700 */ 10701 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 10702 B_TRUE, B_TRUE)) == NULL) { 10703 err = ENOBUFS; 10704 goto done; 10705 } 10706 10707 /* Return created name with ioctl */ 10708 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 10709 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 10710 ip1dbg(("created %s\n", lifr->lifr_name)); 10711 10712 /* Set address */ 10713 sin = (sin_t *)&lifr->lifr_addr; 10714 if (sin->sin_family != AF_UNSPEC) { 10715 err = ip_sioctl_addr(ipif, sin, q, mp, 10716 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 10717 } 10718 10719 done: 10720 ipsq_exit(ipsq); 10721 return (err); 10722 } 10723 10724 /* 10725 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 10726 * interface) delete it based on the IP address (on this physical interface). 10727 * Otherwise delete it based on the ipif_id. 10728 * Also, special handling to allow a removeif of lo0. 10729 */ 10730 /* ARGSUSED */ 10731 int 10732 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10733 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10734 { 10735 conn_t *connp; 10736 ill_t *ill = ipif->ipif_ill; 10737 boolean_t success; 10738 ip_stack_t *ipst; 10739 10740 ipst = CONNQ_TO_IPST(q); 10741 10742 ASSERT(q->q_next == NULL); 10743 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 10744 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10745 ASSERT(IAM_WRITER_IPIF(ipif)); 10746 10747 connp = Q_TO_CONN(q); 10748 /* 10749 * Special case for unplumbing lo0 (the loopback physical interface). 10750 * If unplumbing lo0, the incoming address structure has been 10751 * initialized to all zeros. 
When unplumbing lo0, all its logical 10752 * interfaces must be removed too. 10753 * 10754 * Note that this interface may be called to remove a specific 10755 * loopback logical interface (eg, lo0:1). But in that case 10756 * ipif->ipif_id != 0 so that the code path for that case is the 10757 * same as any other interface (meaning it skips the code directly 10758 * below). 10759 */ 10760 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10761 if (sin->sin_family == AF_UNSPEC && 10762 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 10763 /* 10764 * Mark it condemned. No new ref. will be made to ill. 10765 */ 10766 mutex_enter(&ill->ill_lock); 10767 ill->ill_state_flags |= ILL_CONDEMNED; 10768 for (ipif = ill->ill_ipif; ipif != NULL; 10769 ipif = ipif->ipif_next) { 10770 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10771 } 10772 mutex_exit(&ill->ill_lock); 10773 10774 ipif = ill->ill_ipif; 10775 /* unplumb the loopback interface */ 10776 ill_delete(ill); 10777 mutex_enter(&connp->conn_lock); 10778 mutex_enter(&ill->ill_lock); 10779 10780 /* Are any references to this ill active */ 10781 if (ill_is_freeable(ill)) { 10782 mutex_exit(&ill->ill_lock); 10783 mutex_exit(&connp->conn_lock); 10784 ill_delete_tail(ill); 10785 mi_free(ill); 10786 return (0); 10787 } 10788 success = ipsq_pending_mp_add(connp, ipif, 10789 CONNP_TO_WQ(connp), mp, ILL_FREE); 10790 mutex_exit(&connp->conn_lock); 10791 mutex_exit(&ill->ill_lock); 10792 if (success) 10793 return (EINPROGRESS); 10794 else 10795 return (EINTR); 10796 } 10797 } 10798 10799 if (ipif->ipif_id == 0) { 10800 ipsq_t *ipsq; 10801 10802 /* Find based on address */ 10803 if (ipif->ipif_isv6) { 10804 sin6_t *sin6; 10805 10806 if (sin->sin_family != AF_INET6) 10807 return (EAFNOSUPPORT); 10808 10809 sin6 = (sin6_t *)sin; 10810 /* We are a writer, so we should be able to lookup */ 10811 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill, 10812 ipst); 10813 } else { 10814 if (sin->sin_family != AF_INET) 
10815 return (EAFNOSUPPORT); 10816 10817 /* We are a writer, so we should be able to lookup */ 10818 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill, 10819 ipst); 10820 } 10821 if (ipif == NULL) { 10822 return (EADDRNOTAVAIL); 10823 } 10824 10825 /* 10826 * It is possible for a user to send an SIOCLIFREMOVEIF with 10827 * lifr_name of the physical interface but with an ip address 10828 * lifr_addr of a logical interface plumbed over it. 10829 * So update ipx_current_ipif now that ipif points to the 10830 * correct one. 10831 */ 10832 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; 10833 ipsq->ipsq_xop->ipx_current_ipif = ipif; 10834 10835 /* This is a writer */ 10836 ipif_refrele(ipif); 10837 } 10838 10839 /* 10840 * Can not delete instance zero since it is tied to the ill. 10841 */ 10842 if (ipif->ipif_id == 0) 10843 return (EBUSY); 10844 10845 mutex_enter(&ill->ill_lock); 10846 ipif->ipif_state_flags |= IPIF_CONDEMNED; 10847 mutex_exit(&ill->ill_lock); 10848 10849 ipif_free(ipif); 10850 10851 mutex_enter(&connp->conn_lock); 10852 mutex_enter(&ill->ill_lock); 10853 10854 /* Are any references to this ipif active */ 10855 if (ipif_is_freeable(ipif)) { 10856 mutex_exit(&ill->ill_lock); 10857 mutex_exit(&connp->conn_lock); 10858 ipif_non_duplicate(ipif); 10859 ipif_down_tail(ipif); 10860 ipif_free_tail(ipif); /* frees ipif */ 10861 return (0); 10862 } 10863 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, 10864 IPIF_FREE); 10865 mutex_exit(&ill->ill_lock); 10866 mutex_exit(&connp->conn_lock); 10867 if (success) 10868 return (EINPROGRESS); 10869 else 10870 return (EINTR); 10871 } 10872 10873 /* 10874 * Restart the removeif ioctl. The refcnt has gone down to 0. 10875 * The ipif is already condemned. So can't find it thru lookups. 
10876 */ 10877 /* ARGSUSED */ 10878 int 10879 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 10880 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 10881 { 10882 ill_t *ill = ipif->ipif_ill; 10883 10884 ASSERT(IAM_WRITER_IPIF(ipif)); 10885 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 10886 10887 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 10888 ill->ill_name, ipif->ipif_id, (void *)ipif)); 10889 10890 if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { 10891 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 10892 ill_delete_tail(ill); 10893 mi_free(ill); 10894 return (0); 10895 } 10896 10897 ipif_non_duplicate(ipif); 10898 ipif_down_tail(ipif); 10899 ipif_free_tail(ipif); 10900 10901 ILL_UNMARK_CHANGING(ill); 10902 return (0); 10903 } 10904 10905 /* 10906 * Set the local interface address. 10907 * Allow an address of all zero when the interface is down. 10908 */ 10909 /* ARGSUSED */ 10910 int 10911 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10912 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 10913 { 10914 int err = 0; 10915 in6_addr_t v6addr; 10916 boolean_t need_up = B_FALSE; 10917 10918 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 10919 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10920 10921 ASSERT(IAM_WRITER_IPIF(ipif)); 10922 10923 if (ipif->ipif_isv6) { 10924 sin6_t *sin6; 10925 ill_t *ill; 10926 phyint_t *phyi; 10927 10928 if (sin->sin_family != AF_INET6) 10929 return (EAFNOSUPPORT); 10930 10931 sin6 = (sin6_t *)sin; 10932 v6addr = sin6->sin6_addr; 10933 ill = ipif->ipif_ill; 10934 phyi = ill->ill_phyint; 10935 10936 /* 10937 * Enforce that true multicast interfaces have a link-local 10938 * address for logical unit 0. 
10939 */ 10940 if (ipif->ipif_id == 0 && 10941 (ill->ill_flags & ILLF_MULTICAST) && 10942 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) && 10943 !(phyi->phyint_flags & (PHYI_LOOPBACK)) && 10944 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) { 10945 return (EADDRNOTAVAIL); 10946 } 10947 10948 /* 10949 * up interfaces shouldn't have the unspecified address 10950 * unless they also have the IPIF_NOLOCAL flags set and 10951 * have a subnet assigned. 10952 */ 10953 if ((ipif->ipif_flags & IPIF_UP) && 10954 IN6_IS_ADDR_UNSPECIFIED(&v6addr) && 10955 (!(ipif->ipif_flags & IPIF_NOLOCAL) || 10956 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) { 10957 return (EADDRNOTAVAIL); 10958 } 10959 10960 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10961 return (EADDRNOTAVAIL); 10962 } else { 10963 ipaddr_t addr; 10964 10965 if (sin->sin_family != AF_INET) 10966 return (EAFNOSUPPORT); 10967 10968 addr = sin->sin_addr.s_addr; 10969 10970 /* Allow 0 as the local address. */ 10971 if (addr != 0 && !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 10972 return (EADDRNOTAVAIL); 10973 10974 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10975 } 10976 10977 /* 10978 * Even if there is no change we redo things just to rerun 10979 * ipif_set_default. 10980 */ 10981 if (ipif->ipif_flags & IPIF_UP) { 10982 /* 10983 * Setting a new local address, make sure 10984 * we have net and subnet bcast ire's for 10985 * the old address if we need them. 10986 */ 10987 if (!ipif->ipif_isv6) 10988 ipif_check_bcast_ires(ipif); 10989 /* 10990 * If the interface is already marked up, 10991 * we call ipif_down which will take care 10992 * of ditching any IREs that have been set 10993 * up based on the old interface address. 
10994 */ 10995 err = ipif_logical_down(ipif, q, mp); 10996 if (err == EINPROGRESS) 10997 return (err); 10998 ipif_down_tail(ipif); 10999 need_up = 1; 11000 } 11001 11002 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up); 11003 return (err); 11004 } 11005 11006 int 11007 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11008 boolean_t need_up) 11009 { 11010 in6_addr_t v6addr; 11011 in6_addr_t ov6addr; 11012 ipaddr_t addr; 11013 sin6_t *sin6; 11014 int sinlen; 11015 int err = 0; 11016 ill_t *ill = ipif->ipif_ill; 11017 boolean_t need_dl_down; 11018 boolean_t need_arp_down; 11019 struct iocblk *iocp; 11020 11021 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL; 11022 11023 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 11024 ill->ill_name, ipif->ipif_id, (void *)ipif)); 11025 ASSERT(IAM_WRITER_IPIF(ipif)); 11026 11027 /* Must cancel any pending timer before taking the ill_lock */ 11028 if (ipif->ipif_recovery_id != 0) 11029 (void) untimeout(ipif->ipif_recovery_id); 11030 ipif->ipif_recovery_id = 0; 11031 11032 if (ipif->ipif_isv6) { 11033 sin6 = (sin6_t *)sin; 11034 v6addr = sin6->sin6_addr; 11035 sinlen = sizeof (struct sockaddr_in6); 11036 } else { 11037 addr = sin->sin_addr.s_addr; 11038 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11039 sinlen = sizeof (struct sockaddr_in); 11040 } 11041 mutex_enter(&ill->ill_lock); 11042 ov6addr = ipif->ipif_v6lcl_addr; 11043 ipif->ipif_v6lcl_addr = v6addr; 11044 sctp_update_ipif_addr(ipif, ov6addr); 11045 if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { 11046 ipif->ipif_v6src_addr = ipv6_all_zeros; 11047 } else { 11048 ipif->ipif_v6src_addr = v6addr; 11049 } 11050 ipif->ipif_addr_ready = 0; 11051 11052 /* 11053 * If the interface was previously marked as a duplicate, then since 11054 * we've now got a "new" address, it should no longer be considered a 11055 * duplicate -- even if the "new" address is the same as the old one. 
11056 * Note that if all ipifs are down, we may have a pending ARP down 11057 * event to handle. This is because we want to recover from duplicates 11058 * and thus delay tearing down ARP until the duplicates have been 11059 * removed or disabled. 11060 */ 11061 need_dl_down = need_arp_down = B_FALSE; 11062 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11063 need_arp_down = !need_up; 11064 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11065 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11066 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11067 need_dl_down = B_TRUE; 11068 } 11069 } 11070 11071 if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && 11072 !ill->ill_is_6to4tun) { 11073 queue_t *wqp = ill->ill_wq; 11074 11075 /* 11076 * The local address of this interface is a 6to4 address, 11077 * check if this interface is in fact a 6to4 tunnel or just 11078 * an interface configured with a 6to4 address. We are only 11079 * interested in the former. 11080 */ 11081 if (wqp != NULL) { 11082 while ((wqp->q_next != NULL) && 11083 (wqp->q_next->q_qinfo != NULL) && 11084 (wqp->q_next->q_qinfo->qi_minfo != NULL)) { 11085 11086 if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum 11087 == TUN6TO4_MODID) { 11088 /* set for use in IP */ 11089 ill->ill_is_6to4tun = 1; 11090 break; 11091 } 11092 wqp = wqp->q_next; 11093 } 11094 } 11095 } 11096 11097 ipif_set_default(ipif); 11098 11099 /* 11100 * When publishing an interface address change event, we only notify 11101 * the event listeners of the new address. It is assumed that if they 11102 * actively care about the addresses assigned that they will have 11103 * already discovered the previous address assigned (if there was one.) 11104 * 11105 * Don't attach nic event message for SIOCLIFADDIF ioctl. 
11106 */ 11107 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 11108 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 11109 NE_ADDRESS_CHANGE, sin, sinlen); 11110 } 11111 11112 mutex_exit(&ill->ill_lock); 11113 11114 if (need_up) { 11115 /* 11116 * Now bring the interface back up. If this 11117 * is the only IPIF for the ILL, ipif_up 11118 * will have to re-bind to the device, so 11119 * we may get back EINPROGRESS, in which 11120 * case, this IOCTL will get completed in 11121 * ip_rput_dlpi when we see the DL_BIND_ACK. 11122 */ 11123 err = ipif_up(ipif, q, mp); 11124 } 11125 11126 if (need_dl_down) 11127 ill_dl_down(ill); 11128 if (need_arp_down) 11129 ipif_resolver_down(ipif); 11130 11131 return (err); 11132 } 11133 11134 /* 11135 * Restart entry point to restart the address set operation after the 11136 * refcounts have dropped to zero. 11137 */ 11138 /* ARGSUSED */ 11139 int 11140 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11141 ip_ioctl_cmd_t *ipip, void *ifreq) 11142 { 11143 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 11144 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11145 ASSERT(IAM_WRITER_IPIF(ipif)); 11146 ipif_down_tail(ipif); 11147 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 11148 } 11149 11150 /* ARGSUSED */ 11151 int 11152 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11153 ip_ioctl_cmd_t *ipip, void *if_req) 11154 { 11155 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11156 struct lifreq *lifr = (struct lifreq *)if_req; 11157 11158 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 11159 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11160 /* 11161 * The net mask and address can't change since we have a 11162 * reference to the ipif. So no lock is necessary. 
11163 */ 11164 if (ipif->ipif_isv6) { 11165 *sin6 = sin6_null; 11166 sin6->sin6_family = AF_INET6; 11167 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 11168 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11169 lifr->lifr_addrlen = 11170 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11171 } else { 11172 *sin = sin_null; 11173 sin->sin_family = AF_INET; 11174 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 11175 if (ipip->ipi_cmd_type == LIF_CMD) { 11176 lifr->lifr_addrlen = 11177 ip_mask_to_plen(ipif->ipif_net_mask); 11178 } 11179 } 11180 return (0); 11181 } 11182 11183 /* 11184 * Set the destination address for a pt-pt interface. 11185 */ 11186 /* ARGSUSED */ 11187 int 11188 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11189 ip_ioctl_cmd_t *ipip, void *if_req) 11190 { 11191 int err = 0; 11192 in6_addr_t v6addr; 11193 boolean_t need_up = B_FALSE; 11194 11195 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 11196 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11197 ASSERT(IAM_WRITER_IPIF(ipif)); 11198 11199 if (ipif->ipif_isv6) { 11200 sin6_t *sin6; 11201 11202 if (sin->sin_family != AF_INET6) 11203 return (EAFNOSUPPORT); 11204 11205 sin6 = (sin6_t *)sin; 11206 v6addr = sin6->sin6_addr; 11207 11208 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 11209 return (EADDRNOTAVAIL); 11210 } else { 11211 ipaddr_t addr; 11212 11213 if (sin->sin_family != AF_INET) 11214 return (EAFNOSUPPORT); 11215 11216 addr = sin->sin_addr.s_addr; 11217 if (!ip_addr_ok_v4(addr, ipif->ipif_net_mask)) 11218 return (EADDRNOTAVAIL); 11219 11220 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11221 } 11222 11223 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 11224 return (0); /* No change */ 11225 11226 if (ipif->ipif_flags & IPIF_UP) { 11227 /* 11228 * If the interface is already marked up, 11229 * we call ipif_down which will take care 11230 * of ditching any IREs that have been set 11231 * up based on the old pp dst address. 
11232 */ 11233 err = ipif_logical_down(ipif, q, mp); 11234 if (err == EINPROGRESS) 11235 return (err); 11236 ipif_down_tail(ipif); 11237 need_up = B_TRUE; 11238 } 11239 /* 11240 * could return EINPROGRESS. If so ioctl will complete in 11241 * ip_rput_dlpi_writer 11242 */ 11243 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 11244 return (err); 11245 } 11246 11247 static int 11248 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11249 boolean_t need_up) 11250 { 11251 in6_addr_t v6addr; 11252 ill_t *ill = ipif->ipif_ill; 11253 int err = 0; 11254 boolean_t need_dl_down; 11255 boolean_t need_arp_down; 11256 11257 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 11258 ipif->ipif_id, (void *)ipif)); 11259 11260 /* Must cancel any pending timer before taking the ill_lock */ 11261 if (ipif->ipif_recovery_id != 0) 11262 (void) untimeout(ipif->ipif_recovery_id); 11263 ipif->ipif_recovery_id = 0; 11264 11265 if (ipif->ipif_isv6) { 11266 sin6_t *sin6; 11267 11268 sin6 = (sin6_t *)sin; 11269 v6addr = sin6->sin6_addr; 11270 } else { 11271 ipaddr_t addr; 11272 11273 addr = sin->sin_addr.s_addr; 11274 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11275 } 11276 mutex_enter(&ill->ill_lock); 11277 /* Set point to point destination address. */ 11278 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11279 /* 11280 * Allow this as a means of creating logical 11281 * pt-pt interfaces on top of e.g. an Ethernet. 11282 * XXX Undocumented HACK for testing. 11283 * pt-pt interfaces are created with NUD disabled. 11284 */ 11285 ipif->ipif_flags |= IPIF_POINTOPOINT; 11286 ipif->ipif_flags &= ~IPIF_BROADCAST; 11287 if (ipif->ipif_isv6) 11288 ill->ill_flags |= ILLF_NONUD; 11289 } 11290 11291 /* 11292 * If the interface was previously marked as a duplicate, then since 11293 * we've now got a "new" address, it should no longer be considered a 11294 * duplicate -- even if the "new" address is the same as the old one. 
11295 * Note that if all ipifs are down, we may have a pending ARP down 11296 * event to handle. 11297 */ 11298 need_dl_down = need_arp_down = B_FALSE; 11299 if (ipif->ipif_flags & IPIF_DUPLICATE) { 11300 need_arp_down = !need_up; 11301 ipif->ipif_flags &= ~IPIF_DUPLICATE; 11302 if (--ill->ill_ipif_dup_count == 0 && !need_up && 11303 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 11304 need_dl_down = B_TRUE; 11305 } 11306 } 11307 11308 /* Set the new address. */ 11309 ipif->ipif_v6pp_dst_addr = v6addr; 11310 /* Make sure subnet tracks pp_dst */ 11311 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 11312 mutex_exit(&ill->ill_lock); 11313 11314 if (need_up) { 11315 /* 11316 * Now bring the interface back up. If this 11317 * is the only IPIF for the ILL, ipif_up 11318 * will have to re-bind to the device, so 11319 * we may get back EINPROGRESS, in which 11320 * case, this IOCTL will get completed in 11321 * ip_rput_dlpi when we see the DL_BIND_ACK. 11322 */ 11323 err = ipif_up(ipif, q, mp); 11324 } 11325 11326 if (need_dl_down) 11327 ill_dl_down(ill); 11328 if (need_arp_down) 11329 ipif_resolver_down(ipif); 11330 11331 return (err); 11332 } 11333 11334 /* 11335 * Restart entry point to restart the dstaddress set operation after the 11336 * refcounts have dropped to zero. 
11337 */ 11338 /* ARGSUSED */ 11339 int 11340 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11341 ip_ioctl_cmd_t *ipip, void *ifreq) 11342 { 11343 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", 11344 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11345 ipif_down_tail(ipif); 11346 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); 11347 } 11348 11349 /* ARGSUSED */ 11350 int 11351 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11352 ip_ioctl_cmd_t *ipip, void *if_req) 11353 { 11354 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 11355 11356 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n", 11357 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11358 /* 11359 * Get point to point destination address. The addresses can't 11360 * change since we hold a reference to the ipif. 11361 */ 11362 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) 11363 return (EADDRNOTAVAIL); 11364 11365 if (ipif->ipif_isv6) { 11366 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11367 *sin6 = sin6_null; 11368 sin6->sin6_family = AF_INET6; 11369 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr; 11370 } else { 11371 *sin = sin_null; 11372 sin->sin_family = AF_INET; 11373 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr; 11374 } 11375 return (0); 11376 } 11377 11378 /* 11379 * Set interface flags. Many flags require special handling (e.g., 11380 * bringing the interface down); see below for details. 11381 * 11382 * NOTE : We really don't enforce that ipif_id zero should be used 11383 * for setting any flags other than IFF_LOGINT_FLAGS. This 11384 * is because applications generally does SICGLIFFLAGS and 11385 * ORs in the new flags (that affects the logical) and does a 11386 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other 11387 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the 11388 * flags that will be turned on is correct with respect to 11389 * ipif_id 0. For backward compatibility reasons, it is not done. 
11390 */ 11391 /* ARGSUSED */ 11392 int 11393 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11394 ip_ioctl_cmd_t *ipip, void *if_req) 11395 { 11396 uint64_t turn_on; 11397 uint64_t turn_off; 11398 int err = 0; 11399 phyint_t *phyi; 11400 ill_t *ill; 11401 uint64_t intf_flags, cantchange_flags; 11402 boolean_t phyint_flags_modified = B_FALSE; 11403 uint64_t flags; 11404 struct ifreq *ifr; 11405 struct lifreq *lifr; 11406 boolean_t set_linklocal = B_FALSE; 11407 boolean_t zero_source = B_FALSE; 11408 11409 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 11410 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11411 11412 ASSERT(IAM_WRITER_IPIF(ipif)); 11413 11414 ill = ipif->ipif_ill; 11415 phyi = ill->ill_phyint; 11416 11417 if (ipip->ipi_cmd_type == IF_CMD) { 11418 ifr = (struct ifreq *)if_req; 11419 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 11420 } else { 11421 lifr = (struct lifreq *)if_req; 11422 flags = lifr->lifr_flags; 11423 } 11424 11425 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11426 11427 /* 11428 * Have the flags been set correctly until now? 11429 */ 11430 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11431 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11432 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11433 /* 11434 * Compare the new flags to the old, and partition 11435 * into those coming on and those going off. 11436 * For the 16 bit command keep the bits above bit 16 unchanged. 11437 */ 11438 if (ipip->ipi_cmd == SIOCSIFFLAGS) 11439 flags |= intf_flags & ~0xFFFF; 11440 11441 /* 11442 * Explicitly fail attempts to change flags that are always invalid on 11443 * an IPMP meta-interface. 11444 */ 11445 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 11446 return (EINVAL); 11447 11448 /* 11449 * Check which flags will change; silently ignore flags which userland 11450 * is not allowed to control. 
(Because these flags may change between 11451 * SIOCGLIFFLAGS and SIOCSLIFFLAGS, and that's outside of userland's 11452 * control, we need to silently ignore them rather than fail.) 11453 */ 11454 cantchange_flags = IFF_CANTCHANGE; 11455 if (IS_IPMP(ill)) 11456 cantchange_flags |= IFF_IPMP_CANTCHANGE; 11457 11458 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 11459 if (turn_on == 0) 11460 return (0); /* No change */ 11461 11462 turn_off = intf_flags & turn_on; 11463 turn_on ^= turn_off; 11464 11465 /* 11466 * All test addresses must be IFF_DEPRECATED (to ensure source address 11467 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 11468 * allow it to be turned off. 11469 */ 11470 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 11471 (turn_on|intf_flags) & IFF_NOFAILOVER) 11472 return (EINVAL); 11473 11474 if (turn_on & IFF_NOFAILOVER) { 11475 turn_on |= IFF_DEPRECATED; 11476 flags |= IFF_DEPRECATED; 11477 } 11478 11479 /* 11480 * On underlying interfaces, only allow applications to manage test 11481 * addresses -- otherwise, they may get confused when the address 11482 * moves as part of being brought up. Likewise, prevent an 11483 * application-managed test address from being converted to a data 11484 * address. To prevent migration of administratively up addresses in 11485 * the kernel, we don't allow them to be converted either. 11486 */ 11487 if (IS_UNDER_IPMP(ill)) { 11488 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 11489 11490 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 11491 return (EINVAL); 11492 11493 if ((turn_off & IFF_NOFAILOVER) && 11494 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 11495 return (EINVAL); 11496 } 11497 11498 /* 11499 * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on 11500 * IPv6 interfaces. 
11501 */ 11502 if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) 11503 return (EINVAL); 11504 11505 /* 11506 * cannot turn off IFF_NOXMIT on VNI interfaces. 11507 */ 11508 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 11509 return (EINVAL); 11510 11511 /* 11512 * Don't allow the IFF_ROUTER flag to be turned on on loopback 11513 * interfaces. It makes no sense in that context. 11514 */ 11515 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 11516 return (EINVAL); 11517 11518 if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) 11519 zero_source = B_TRUE; 11520 11521 /* 11522 * For IPv6 ipif_id 0, don't allow the interface to be up without 11523 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 11524 * If the link local address isn't set, and can be set, it will get 11525 * set later on in this function. 11526 */ 11527 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 11528 (flags & IFF_UP) && !zero_source && 11529 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 11530 if (ipif_cant_setlinklocal(ipif)) 11531 return (EINVAL); 11532 set_linklocal = B_TRUE; 11533 } 11534 11535 /* 11536 * If we modify physical interface flags, we'll potentially need to 11537 * send up two routing socket messages for the changes (one for the 11538 * IPv4 ill, and another for the IPv6 ill). Note that here. 11539 */ 11540 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11541 phyint_flags_modified = B_TRUE; 11542 11543 /* 11544 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 11545 * (otherwise, we'd immediately use them, defeating standby). Also, 11546 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 11547 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 11548 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We 11549 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 11550 * will not be honored. 
11551 */ 11552 if (turn_on & PHYI_STANDBY) { 11553 /* 11554 * No need to grab ill_g_usesrc_lock here; see the 11555 * synchronization notes in ip.c. 11556 */ 11557 if (ill->ill_usesrc_grp_next != NULL || 11558 intf_flags & PHYI_INACTIVE) 11559 return (EINVAL); 11560 if (!(flags & PHYI_FAILED)) { 11561 flags |= PHYI_INACTIVE; 11562 turn_on |= PHYI_INACTIVE; 11563 } 11564 } 11565 11566 if (turn_off & PHYI_STANDBY) { 11567 flags &= ~PHYI_INACTIVE; 11568 turn_off |= PHYI_INACTIVE; 11569 } 11570 11571 /* 11572 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 11573 * would end up on. 11574 */ 11575 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 11576 (PHYI_FAILED | PHYI_INACTIVE)) 11577 return (EINVAL); 11578 11579 /* 11580 * If ILLF_ROUTER changes, we need to change the ip forwarding 11581 * status of the interface. 11582 */ 11583 if ((turn_on | turn_off) & ILLF_ROUTER) 11584 (void) ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 11585 11586 /* 11587 * If the interface is not UP and we are not going to 11588 * bring it UP, record the flags and return. When the 11589 * interface comes UP later, the right actions will be 11590 * taken. 11591 */ 11592 if (!(ipif->ipif_flags & IPIF_UP) && 11593 !(turn_on & IPIF_UP)) { 11594 /* Record new flags in their respective places. 
*/ 11595 mutex_enter(&ill->ill_lock); 11596 mutex_enter(&ill->ill_phyint->phyint_lock); 11597 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11598 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11599 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11600 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11601 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11602 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11603 mutex_exit(&ill->ill_lock); 11604 mutex_exit(&ill->ill_phyint->phyint_lock); 11605 11606 /* 11607 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the 11608 * same to the kernel: if any of them has been set by 11609 * userland, the interface cannot be used for data traffic. 11610 */ 11611 if ((turn_on|turn_off) & 11612 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 11613 ASSERT(!IS_IPMP(ill)); 11614 /* 11615 * It's possible the ill is part of an "anonymous" 11616 * IPMP group rather than a real group. In that case, 11617 * there are no other interfaces in the group and thus 11618 * no need to call ipmp_phyint_refresh_active(). 11619 */ 11620 if (IS_UNDER_IPMP(ill)) 11621 ipmp_phyint_refresh_active(phyi); 11622 } 11623 11624 if (phyint_flags_modified) { 11625 if (phyi->phyint_illv4 != NULL) { 11626 ip_rts_ifmsg(phyi->phyint_illv4-> 11627 ill_ipif, RTSQ_DEFAULT); 11628 } 11629 if (phyi->phyint_illv6 != NULL) { 11630 ip_rts_ifmsg(phyi->phyint_illv6-> 11631 ill_ipif, RTSQ_DEFAULT); 11632 } 11633 } 11634 return (0); 11635 } else if (set_linklocal || zero_source) { 11636 mutex_enter(&ill->ill_lock); 11637 if (set_linklocal) 11638 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; 11639 if (zero_source) 11640 ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; 11641 mutex_exit(&ill->ill_lock); 11642 } 11643 11644 /* 11645 * Disallow IPv6 interfaces coming up that have the unspecified address, 11646 * or point-to-point interfaces with an unspecified destination. 
We do 11647 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that 11648 * have a subnet assigned, which is how in.ndpd currently manages its 11649 * onlink prefix list when no addresses are configured with those 11650 * prefixes. 11651 */ 11652 if (ipif->ipif_isv6 && 11653 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 11654 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) || 11655 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) || 11656 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11657 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) { 11658 return (EINVAL); 11659 } 11660 11661 /* 11662 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination 11663 * from being brought up. 11664 */ 11665 if (!ipif->ipif_isv6 && 11666 ((ipif->ipif_flags & IPIF_POINTOPOINT) && 11667 ipif->ipif_pp_dst_addr == INADDR_ANY)) { 11668 return (EINVAL); 11669 } 11670 11671 /* 11672 * The only flag changes that we currently take specific action on are 11673 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP, 11674 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and 11675 * IPIF_NOFAILOVER. This is done by bring the ipif down, changing the 11676 * flags and bringing it back up again. For IPIF_NOFAILOVER, the act 11677 * of bringing it back up will trigger the address to be moved. 11678 */ 11679 if ((turn_on|turn_off) & 11680 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 11681 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 11682 IPIF_NOFAILOVER)) { 11683 /* 11684 * Taking this ipif down, make sure we have 11685 * valid net and subnet bcast ire's for other 11686 * logical interfaces, if we need them. 
11687 */ 11688 if (!ipif->ipif_isv6) 11689 ipif_check_bcast_ires(ipif); 11690 11691 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 11692 !(turn_off & IPIF_UP)) { 11693 if (ipif->ipif_flags & IPIF_UP) 11694 ill->ill_logical_down = 1; 11695 turn_on &= ~IPIF_UP; 11696 } 11697 err = ipif_down(ipif, q, mp); 11698 ip1dbg(("ipif_down returns %d err ", err)); 11699 if (err == EINPROGRESS) 11700 return (err); 11701 ipif_down_tail(ipif); 11702 } 11703 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 11704 } 11705 11706 static int 11707 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 11708 { 11709 ill_t *ill; 11710 phyint_t *phyi; 11711 uint64_t turn_on, turn_off; 11712 uint64_t intf_flags, cantchange_flags; 11713 boolean_t phyint_flags_modified = B_FALSE; 11714 int err = 0; 11715 boolean_t set_linklocal = B_FALSE; 11716 boolean_t zero_source = B_FALSE; 11717 11718 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 11719 ipif->ipif_ill->ill_name, ipif->ipif_id)); 11720 11721 ASSERT(IAM_WRITER_IPIF(ipif)); 11722 11723 ill = ipif->ipif_ill; 11724 phyi = ill->ill_phyint; 11725 11726 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 11727 cantchange_flags = IFF_CANTCHANGE | IFF_UP; 11728 if (IS_IPMP(ill)) 11729 cantchange_flags |= IFF_IPMP_CANTCHANGE; 11730 11731 turn_on = (flags ^ intf_flags) & ~cantchange_flags; 11732 turn_off = intf_flags & turn_on; 11733 turn_on ^= turn_off; 11734 11735 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 11736 phyint_flags_modified = B_TRUE; 11737 11738 /* 11739 * Now we change the flags. Track current value of 11740 * other flags in their respective places. 
11741 */ 11742 mutex_enter(&ill->ill_lock); 11743 mutex_enter(&phyi->phyint_lock); 11744 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 11745 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 11746 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 11747 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 11748 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 11749 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 11750 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 11751 set_linklocal = B_TRUE; 11752 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 11753 } 11754 if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { 11755 zero_source = B_TRUE; 11756 ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; 11757 } 11758 mutex_exit(&ill->ill_lock); 11759 mutex_exit(&phyi->phyint_lock); 11760 11761 if (set_linklocal) 11762 (void) ipif_setlinklocal(ipif); 11763 11764 if (zero_source) 11765 ipif->ipif_v6src_addr = ipv6_all_zeros; 11766 else 11767 ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; 11768 11769 /* 11770 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 11771 * the kernel: if any of them has been set by userland, the interface 11772 * cannot be used for data traffic. 11773 */ 11774 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 11775 ASSERT(!IS_IPMP(ill)); 11776 /* 11777 * It's possible the ill is part of an "anonymous" IPMP group 11778 * rather than a real group. In that case, there are no other 11779 * interfaces in the group and thus no need for us to call 11780 * ipmp_phyint_refresh_active(). 11781 */ 11782 if (IS_UNDER_IPMP(ill)) 11783 ipmp_phyint_refresh_active(phyi); 11784 } 11785 11786 if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 11787 /* 11788 * XXX ipif_up really does not know whether a phyint flags 11789 * was modified or not. So, it sends up information on 11790 * only one routing sockets message. 
As we don't bring up 11791 * the interface and also set PHYI_ flags simultaneously 11792 * it should be okay. 11793 */ 11794 err = ipif_up(ipif, q, mp); 11795 } else { 11796 /* 11797 * Make sure routing socket sees all changes to the flags. 11798 * ipif_up_done* handles this when we use ipif_up. 11799 */ 11800 if (phyint_flags_modified) { 11801 if (phyi->phyint_illv4 != NULL) { 11802 ip_rts_ifmsg(phyi->phyint_illv4-> 11803 ill_ipif, RTSQ_DEFAULT); 11804 } 11805 if (phyi->phyint_illv6 != NULL) { 11806 ip_rts_ifmsg(phyi->phyint_illv6-> 11807 ill_ipif, RTSQ_DEFAULT); 11808 } 11809 } else { 11810 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 11811 } 11812 /* 11813 * Update the flags in SCTP's IPIF list, ipif_up() will do 11814 * this in need_up case. 11815 */ 11816 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11817 } 11818 return (err); 11819 } 11820 11821 /* 11822 * Restart the flags operation now that the refcounts have dropped to zero. 11823 */ 11824 /* ARGSUSED */ 11825 int 11826 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11827 ip_ioctl_cmd_t *ipip, void *if_req) 11828 { 11829 uint64_t flags; 11830 struct ifreq *ifr = if_req; 11831 struct lifreq *lifr = if_req; 11832 11833 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", 11834 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11835 11836 ipif_down_tail(ipif); 11837 if (ipip->ipi_cmd_type == IF_CMD) { 11838 /* cast to uint16_t prevents unwanted sign extension */ 11839 flags = (uint16_t)ifr->ifr_flags; 11840 } else { 11841 flags = lifr->lifr_flags; 11842 } 11843 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 11844 } 11845 11846 /* 11847 * Can operate on either a module or a driver queue. 11848 */ 11849 /* ARGSUSED */ 11850 int 11851 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11852 ip_ioctl_cmd_t *ipip, void *if_req) 11853 { 11854 /* 11855 * Has the flags been set correctly till now ? 
11856 */ 11857 ill_t *ill = ipif->ipif_ill; 11858 phyint_t *phyi = ill->ill_phyint; 11859 11860 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n", 11861 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11862 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 11863 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 11864 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 11865 11866 /* 11867 * Need a lock since some flags can be set even when there are 11868 * references to the ipif. 11869 */ 11870 mutex_enter(&ill->ill_lock); 11871 if (ipip->ipi_cmd_type == IF_CMD) { 11872 struct ifreq *ifr = (struct ifreq *)if_req; 11873 11874 /* Get interface flags (low 16 only). */ 11875 ifr->ifr_flags = ((ipif->ipif_flags | 11876 ill->ill_flags | phyi->phyint_flags) & 0xffff); 11877 } else { 11878 struct lifreq *lifr = (struct lifreq *)if_req; 11879 11880 /* Get interface flags. */ 11881 lifr->lifr_flags = ipif->ipif_flags | 11882 ill->ill_flags | phyi->phyint_flags; 11883 } 11884 mutex_exit(&ill->ill_lock); 11885 return (0); 11886 } 11887 11888 /* ARGSUSED */ 11889 int 11890 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11891 ip_ioctl_cmd_t *ipip, void *if_req) 11892 { 11893 int mtu; 11894 int ip_min_mtu; 11895 struct ifreq *ifr; 11896 struct lifreq *lifr; 11897 ire_t *ire; 11898 ip_stack_t *ipst; 11899 11900 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, 11901 ipif->ipif_id, (void *)ipif)); 11902 if (ipip->ipi_cmd_type == IF_CMD) { 11903 ifr = (struct ifreq *)if_req; 11904 mtu = ifr->ifr_metric; 11905 } else { 11906 lifr = (struct lifreq *)if_req; 11907 mtu = lifr->lifr_mtu; 11908 } 11909 11910 if (ipif->ipif_isv6) 11911 ip_min_mtu = IPV6_MIN_MTU; 11912 else 11913 ip_min_mtu = IP_MIN_MTU; 11914 11915 if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) 11916 return (EINVAL); 11917 11918 /* 11919 * Change the MTU size in all relevant ire's. 11920 * Mtu change Vs. new ire creation - protocol below. 
11921 * First change ipif_mtu and the ire_max_frag of the 11922 * interface ire. Then do an ire walk and change the 11923 * ire_max_frag of all affected ires. During ire_add 11924 * under the bucket lock, set the ire_max_frag of the 11925 * new ire being created from the ipif/ire from which 11926 * it is being derived. If an mtu change happens after 11927 * the ire is added, the new ire will be cleaned up. 11928 * Conversely if the mtu change happens before the ire 11929 * is added, ire_add will see the new value of the mtu. 11930 */ 11931 ipif->ipif_mtu = mtu; 11932 ipif->ipif_flags |= IPIF_FIXEDMTU; 11933 11934 if (ipif->ipif_isv6) 11935 ire = ipif_to_ire_v6(ipif); 11936 else 11937 ire = ipif_to_ire(ipif); 11938 if (ire != NULL) { 11939 ire->ire_max_frag = ipif->ipif_mtu; 11940 ire_refrele(ire); 11941 } 11942 ipst = ipif->ipif_ill->ill_ipst; 11943 if (ipif->ipif_flags & IPIF_UP) { 11944 if (ipif->ipif_isv6) 11945 ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES, 11946 ipst); 11947 else 11948 ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES, 11949 ipst); 11950 } 11951 /* Update the MTU in SCTP's list */ 11952 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 11953 return (0); 11954 } 11955 11956 /* Get interface MTU. */ 11957 /* ARGSUSED */ 11958 int 11959 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11960 ip_ioctl_cmd_t *ipip, void *if_req) 11961 { 11962 struct ifreq *ifr; 11963 struct lifreq *lifr; 11964 11965 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 11966 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11967 if (ipip->ipi_cmd_type == IF_CMD) { 11968 ifr = (struct ifreq *)if_req; 11969 ifr->ifr_metric = ipif->ipif_mtu; 11970 } else { 11971 lifr = (struct lifreq *)if_req; 11972 lifr->lifr_mtu = ipif->ipif_mtu; 11973 } 11974 return (0); 11975 } 11976 11977 /* Set interface broadcast address. 
*/ 11978 /* ARGSUSED2 */ 11979 int 11980 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11981 ip_ioctl_cmd_t *ipip, void *if_req) 11982 { 11983 ipaddr_t addr; 11984 ire_t *ire; 11985 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11986 11987 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, 11988 ipif->ipif_id)); 11989 11990 ASSERT(IAM_WRITER_IPIF(ipif)); 11991 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 11992 return (EADDRNOTAVAIL); 11993 11994 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 11995 11996 if (sin->sin_family != AF_INET) 11997 return (EAFNOSUPPORT); 11998 11999 addr = sin->sin_addr.s_addr; 12000 if (ipif->ipif_flags & IPIF_UP) { 12001 /* 12002 * If we are already up, make sure the new 12003 * broadcast address makes sense. If it does, 12004 * there should be an IRE for it already. 12005 * Don't match on ipif, only on the ill 12006 * since we are sharing these now. 12007 */ 12008 ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, 12009 ipif, ALL_ZONES, NULL, 12010 (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst); 12011 if (ire == NULL) { 12012 return (EINVAL); 12013 } else { 12014 ire_refrele(ire); 12015 } 12016 } 12017 /* 12018 * Changing the broadcast addr for this ipif. 12019 * Make sure we have valid net and subnet bcast 12020 * ire's for other logical interfaces, if needed. 12021 */ 12022 if (addr != ipif->ipif_brd_addr) 12023 ipif_check_bcast_ires(ipif); 12024 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 12025 return (0); 12026 } 12027 12028 /* Get interface broadcast address. 
*/ 12029 /* ARGSUSED */ 12030 int 12031 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12032 ip_ioctl_cmd_t *ipip, void *if_req) 12033 { 12034 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 12035 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12036 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 12037 return (EADDRNOTAVAIL); 12038 12039 /* IPIF_BROADCAST not possible with IPv6 */ 12040 ASSERT(!ipif->ipif_isv6); 12041 *sin = sin_null; 12042 sin->sin_family = AF_INET; 12043 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 12044 return (0); 12045 } 12046 12047 /* 12048 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 12049 */ 12050 /* ARGSUSED */ 12051 int 12052 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12053 ip_ioctl_cmd_t *ipip, void *if_req) 12054 { 12055 int err = 0; 12056 in6_addr_t v6mask; 12057 12058 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 12059 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12060 12061 ASSERT(IAM_WRITER_IPIF(ipif)); 12062 12063 if (ipif->ipif_isv6) { 12064 sin6_t *sin6; 12065 12066 if (sin->sin_family != AF_INET6) 12067 return (EAFNOSUPPORT); 12068 12069 sin6 = (sin6_t *)sin; 12070 v6mask = sin6->sin6_addr; 12071 } else { 12072 ipaddr_t mask; 12073 12074 if (sin->sin_family != AF_INET) 12075 return (EAFNOSUPPORT); 12076 12077 mask = sin->sin_addr.s_addr; 12078 V4MASK_TO_V6(mask, v6mask); 12079 } 12080 12081 /* 12082 * No big deal if the interface isn't already up, or the mask 12083 * isn't really changing, or this is pt-pt. 
12084 */ 12085 if (!(ipif->ipif_flags & IPIF_UP) || 12086 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 12087 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 12088 ipif->ipif_v6net_mask = v6mask; 12089 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12090 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 12091 ipif->ipif_v6net_mask, 12092 ipif->ipif_v6subnet); 12093 } 12094 return (0); 12095 } 12096 /* 12097 * Make sure we have valid net and subnet broadcast ire's 12098 * for the old netmask, if needed by other logical interfaces. 12099 */ 12100 if (!ipif->ipif_isv6) 12101 ipif_check_bcast_ires(ipif); 12102 12103 err = ipif_logical_down(ipif, q, mp); 12104 if (err == EINPROGRESS) 12105 return (err); 12106 ipif_down_tail(ipif); 12107 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 12108 return (err); 12109 } 12110 12111 static int 12112 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 12113 { 12114 in6_addr_t v6mask; 12115 int err = 0; 12116 12117 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 12118 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12119 12120 if (ipif->ipif_isv6) { 12121 sin6_t *sin6; 12122 12123 sin6 = (sin6_t *)sin; 12124 v6mask = sin6->sin6_addr; 12125 } else { 12126 ipaddr_t mask; 12127 12128 mask = sin->sin_addr.s_addr; 12129 V4MASK_TO_V6(mask, v6mask); 12130 } 12131 12132 ipif->ipif_v6net_mask = v6mask; 12133 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12134 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 12135 ipif->ipif_v6subnet); 12136 } 12137 err = ipif_up(ipif, q, mp); 12138 12139 if (err == 0 || err == EINPROGRESS) { 12140 /* 12141 * The interface must be DL_BOUND if this packet has to 12142 * go out on the wire. Since we only go through a logical 12143 * down and are bound with the driver during an internal 12144 * down/up that is satisfied. 12145 */ 12146 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 12147 /* Potentially broadcast an address mask reply. 
*/ 12148 ipif_mask_reply(ipif); 12149 } 12150 } 12151 return (err); 12152 } 12153 12154 /* ARGSUSED */ 12155 int 12156 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12157 ip_ioctl_cmd_t *ipip, void *if_req) 12158 { 12159 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 12160 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12161 ipif_down_tail(ipif); 12162 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 12163 } 12164 12165 /* Get interface net mask. */ 12166 /* ARGSUSED */ 12167 int 12168 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12169 ip_ioctl_cmd_t *ipip, void *if_req) 12170 { 12171 struct lifreq *lifr = (struct lifreq *)if_req; 12172 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 12173 12174 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 12175 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12176 12177 /* 12178 * net mask can't change since we have a reference to the ipif. 12179 */ 12180 if (ipif->ipif_isv6) { 12181 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12182 *sin6 = sin6_null; 12183 sin6->sin6_family = AF_INET6; 12184 sin6->sin6_addr = ipif->ipif_v6net_mask; 12185 lifr->lifr_addrlen = 12186 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12187 } else { 12188 *sin = sin_null; 12189 sin->sin_family = AF_INET; 12190 sin->sin_addr.s_addr = ipif->ipif_net_mask; 12191 if (ipip->ipi_cmd_type == LIF_CMD) { 12192 lifr->lifr_addrlen = 12193 ip_mask_to_plen(ipif->ipif_net_mask); 12194 } 12195 } 12196 return (0); 12197 } 12198 12199 /* ARGSUSED */ 12200 int 12201 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12202 ip_ioctl_cmd_t *ipip, void *if_req) 12203 { 12204 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 12205 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12206 12207 /* 12208 * Since no applications should ever be setting metrics on underlying 12209 * interfaces, we explicitly fail to smoke 'em out. 
12210 */ 12211 if (IS_UNDER_IPMP(ipif->ipif_ill)) 12212 return (EINVAL); 12213 12214 /* 12215 * Set interface metric. We don't use this for 12216 * anything but we keep track of it in case it is 12217 * important to routing applications or such. 12218 */ 12219 if (ipip->ipi_cmd_type == IF_CMD) { 12220 struct ifreq *ifr; 12221 12222 ifr = (struct ifreq *)if_req; 12223 ipif->ipif_metric = ifr->ifr_metric; 12224 } else { 12225 struct lifreq *lifr; 12226 12227 lifr = (struct lifreq *)if_req; 12228 ipif->ipif_metric = lifr->lifr_metric; 12229 } 12230 return (0); 12231 } 12232 12233 /* ARGSUSED */ 12234 int 12235 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12236 ip_ioctl_cmd_t *ipip, void *if_req) 12237 { 12238 /* Get interface metric. */ 12239 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 12240 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12241 12242 if (ipip->ipi_cmd_type == IF_CMD) { 12243 struct ifreq *ifr; 12244 12245 ifr = (struct ifreq *)if_req; 12246 ifr->ifr_metric = ipif->ipif_metric; 12247 } else { 12248 struct lifreq *lifr; 12249 12250 lifr = (struct lifreq *)if_req; 12251 lifr->lifr_metric = ipif->ipif_metric; 12252 } 12253 12254 return (0); 12255 } 12256 12257 /* ARGSUSED */ 12258 int 12259 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12260 ip_ioctl_cmd_t *ipip, void *if_req) 12261 { 12262 12263 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 12264 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12265 /* 12266 * Set the muxid returned from I_PLINK. 
12267 */ 12268 if (ipip->ipi_cmd_type == IF_CMD) { 12269 struct ifreq *ifr = (struct ifreq *)if_req; 12270 12271 ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; 12272 ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; 12273 } else { 12274 struct lifreq *lifr = (struct lifreq *)if_req; 12275 12276 ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; 12277 ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; 12278 } 12279 return (0); 12280 } 12281 12282 /* ARGSUSED */ 12283 int 12284 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12285 ip_ioctl_cmd_t *ipip, void *if_req) 12286 { 12287 12288 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", 12289 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12290 /* 12291 * Get the muxid saved in ill for I_PUNLINK. 12292 */ 12293 if (ipip->ipi_cmd_type == IF_CMD) { 12294 struct ifreq *ifr = (struct ifreq *)if_req; 12295 12296 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12297 ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12298 } else { 12299 struct lifreq *lifr = (struct lifreq *)if_req; 12300 12301 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; 12302 lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; 12303 } 12304 return (0); 12305 } 12306 12307 /* 12308 * Set the subnet prefix. Does not modify the broadcast address. 
12309 */ 12310 /* ARGSUSED */ 12311 int 12312 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12313 ip_ioctl_cmd_t *ipip, void *if_req) 12314 { 12315 int err = 0; 12316 in6_addr_t v6addr; 12317 in6_addr_t v6mask; 12318 boolean_t need_up = B_FALSE; 12319 int addrlen; 12320 12321 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n", 12322 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12323 12324 ASSERT(IAM_WRITER_IPIF(ipif)); 12325 addrlen = ((struct lifreq *)if_req)->lifr_addrlen; 12326 12327 if (ipif->ipif_isv6) { 12328 sin6_t *sin6; 12329 12330 if (sin->sin_family != AF_INET6) 12331 return (EAFNOSUPPORT); 12332 12333 sin6 = (sin6_t *)sin; 12334 v6addr = sin6->sin6_addr; 12335 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones)) 12336 return (EADDRNOTAVAIL); 12337 } else { 12338 ipaddr_t addr; 12339 12340 if (sin->sin_family != AF_INET) 12341 return (EAFNOSUPPORT); 12342 12343 addr = sin->sin_addr.s_addr; 12344 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF)) 12345 return (EADDRNOTAVAIL); 12346 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12347 /* Add 96 bits */ 12348 addrlen += IPV6_ABITS - IP_ABITS; 12349 } 12350 12351 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL) 12352 return (EINVAL); 12353 12354 /* Check if bits in the address is set past the mask */ 12355 if (!V6_MASK_EQ(v6addr, v6mask, v6addr)) 12356 return (EINVAL); 12357 12358 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) && 12359 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask)) 12360 return (0); /* No change */ 12361 12362 if (ipif->ipif_flags & IPIF_UP) { 12363 /* 12364 * If the interface is already marked up, 12365 * we call ipif_down which will take care 12366 * of ditching any IREs that have been set 12367 * up based on the old interface address. 
12368 */ 12369 err = ipif_logical_down(ipif, q, mp); 12370 if (err == EINPROGRESS) 12371 return (err); 12372 ipif_down_tail(ipif); 12373 need_up = B_TRUE; 12374 } 12375 12376 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 12377 return (err); 12378 } 12379 12380 static int 12381 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 12382 queue_t *q, mblk_t *mp, boolean_t need_up) 12383 { 12384 ill_t *ill = ipif->ipif_ill; 12385 int err = 0; 12386 12387 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 12388 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12389 12390 /* Set the new address. */ 12391 mutex_enter(&ill->ill_lock); 12392 ipif->ipif_v6net_mask = v6mask; 12393 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 12394 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 12395 ipif->ipif_v6subnet); 12396 } 12397 mutex_exit(&ill->ill_lock); 12398 12399 if (need_up) { 12400 /* 12401 * Now bring the interface back up. If this 12402 * is the only IPIF for the ILL, ipif_up 12403 * will have to re-bind to the device, so 12404 * we may get back EINPROGRESS, in which 12405 * case, this IOCTL will get completed in 12406 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12407 */ 12408 err = ipif_up(ipif, q, mp); 12409 if (err == EINPROGRESS) 12410 return (err); 12411 } 12412 return (err); 12413 } 12414 12415 /* ARGSUSED */ 12416 int 12417 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12418 ip_ioctl_cmd_t *ipip, void *if_req) 12419 { 12420 int addrlen; 12421 in6_addr_t v6addr; 12422 in6_addr_t v6mask; 12423 struct lifreq *lifr = (struct lifreq *)if_req; 12424 12425 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 12426 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12427 ipif_down_tail(ipif); 12428 12429 addrlen = lifr->lifr_addrlen; 12430 if (ipif->ipif_isv6) { 12431 sin6_t *sin6; 12432 12433 sin6 = (sin6_t *)sin; 12434 v6addr = sin6->sin6_addr; 12435 } else { 12436 ipaddr_t addr; 12437 12438 addr = sin->sin_addr.s_addr; 12439 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 12440 addrlen += IPV6_ABITS - IP_ABITS; 12441 } 12442 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 12443 12444 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 12445 } 12446 12447 /* ARGSUSED */ 12448 int 12449 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12450 ip_ioctl_cmd_t *ipip, void *if_req) 12451 { 12452 struct lifreq *lifr = (struct lifreq *)if_req; 12453 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 12454 12455 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 12456 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12457 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 12458 12459 if (ipif->ipif_isv6) { 12460 *sin6 = sin6_null; 12461 sin6->sin6_family = AF_INET6; 12462 sin6->sin6_addr = ipif->ipif_v6subnet; 12463 lifr->lifr_addrlen = 12464 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 12465 } else { 12466 *sin = sin_null; 12467 sin->sin_family = AF_INET; 12468 sin->sin_addr.s_addr = ipif->ipif_subnet; 12469 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 12470 } 12471 return (0); 12472 } 12473 12474 /* 12475 * Set the IPv6 address token. 
12476 */ 12477 /* ARGSUSED */ 12478 int 12479 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12480 ip_ioctl_cmd_t *ipi, void *if_req) 12481 { 12482 ill_t *ill = ipif->ipif_ill; 12483 int err; 12484 in6_addr_t v6addr; 12485 in6_addr_t v6mask; 12486 boolean_t need_up = B_FALSE; 12487 int i; 12488 sin6_t *sin6 = (sin6_t *)sin; 12489 struct lifreq *lifr = (struct lifreq *)if_req; 12490 int addrlen; 12491 12492 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 12493 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12494 ASSERT(IAM_WRITER_IPIF(ipif)); 12495 12496 addrlen = lifr->lifr_addrlen; 12497 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12498 if (ipif->ipif_id != 0) 12499 return (EINVAL); 12500 12501 if (!ipif->ipif_isv6) 12502 return (EINVAL); 12503 12504 if (addrlen > IPV6_ABITS) 12505 return (EINVAL); 12506 12507 v6addr = sin6->sin6_addr; 12508 12509 /* 12510 * The length of the token is the length from the end. To get 12511 * the proper mask for this, compute the mask of the bits not 12512 * in the token; ie. the prefix, and then xor to get the mask. 
12513 */ 12514 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 12515 return (EINVAL); 12516 for (i = 0; i < 4; i++) { 12517 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12518 } 12519 12520 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 12521 ill->ill_token_length == addrlen) 12522 return (0); /* No change */ 12523 12524 if (ipif->ipif_flags & IPIF_UP) { 12525 err = ipif_logical_down(ipif, q, mp); 12526 if (err == EINPROGRESS) 12527 return (err); 12528 ipif_down_tail(ipif); 12529 need_up = B_TRUE; 12530 } 12531 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 12532 return (err); 12533 } 12534 12535 static int 12536 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 12537 mblk_t *mp, boolean_t need_up) 12538 { 12539 in6_addr_t v6addr; 12540 in6_addr_t v6mask; 12541 ill_t *ill = ipif->ipif_ill; 12542 int i; 12543 int err = 0; 12544 12545 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 12546 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12547 v6addr = sin6->sin6_addr; 12548 /* 12549 * The length of the token is the length from the end. To get 12550 * the proper mask for this, compute the mask of the bits not 12551 * in the token; ie. the prefix, and then xor to get the mask. 12552 */ 12553 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 12554 for (i = 0; i < 4; i++) 12555 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 12556 12557 mutex_enter(&ill->ill_lock); 12558 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 12559 ill->ill_token_length = addrlen; 12560 mutex_exit(&ill->ill_lock); 12561 12562 if (need_up) { 12563 /* 12564 * Now bring the interface back up. If this 12565 * is the only IPIF for the ILL, ipif_up 12566 * will have to re-bind to the device, so 12567 * we may get back EINPROGRESS, in which 12568 * case, this IOCTL will get completed in 12569 * ip_rput_dlpi when we see the DL_BIND_ACK. 
12570 */ 12571 err = ipif_up(ipif, q, mp); 12572 if (err == EINPROGRESS) 12573 return (err); 12574 } 12575 return (err); 12576 } 12577 12578 /* ARGSUSED */ 12579 int 12580 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12581 ip_ioctl_cmd_t *ipi, void *if_req) 12582 { 12583 ill_t *ill; 12584 sin6_t *sin6 = (sin6_t *)sin; 12585 struct lifreq *lifr = (struct lifreq *)if_req; 12586 12587 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 12588 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12589 if (ipif->ipif_id != 0) 12590 return (EINVAL); 12591 12592 ill = ipif->ipif_ill; 12593 if (!ill->ill_isv6) 12594 return (ENXIO); 12595 12596 *sin6 = sin6_null; 12597 sin6->sin6_family = AF_INET6; 12598 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 12599 sin6->sin6_addr = ill->ill_token; 12600 lifr->lifr_addrlen = ill->ill_token_length; 12601 return (0); 12602 } 12603 12604 /* 12605 * Set (hardware) link specific information that might override 12606 * what was acquired through the DL_INFO_ACK. 12607 * The logic is as follows. 12608 * 12609 * become exclusive 12610 * set CHANGING flag 12611 * change mtu on affected IREs 12612 * clear CHANGING flag 12613 * 12614 * An ire add that occurs before the CHANGING flag is set will have its mtu 12615 * changed by the ip_sioctl_lnkinfo. 12616 * 12617 * During the time the CHANGING flag is set, no new ires will be added to the 12618 * bucket, and ire add will fail (due the CHANGING flag). 12619 * 12620 * An ire add that occurs after the CHANGING flag is set will have the right mtu 12621 * before it is added to the bucket. 12622 * 12623 * Obviously only 1 thread can set the CHANGING flag and we need to become 12624 * exclusive to set the flag. 
12625 */ 12626 /* ARGSUSED */ 12627 int 12628 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12629 ip_ioctl_cmd_t *ipi, void *if_req) 12630 { 12631 ill_t *ill = ipif->ipif_ill; 12632 ipif_t *nipif; 12633 int ip_min_mtu; 12634 boolean_t mtu_walk = B_FALSE; 12635 struct lifreq *lifr = (struct lifreq *)if_req; 12636 lif_ifinfo_req_t *lir; 12637 ire_t *ire; 12638 12639 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 12640 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12641 lir = &lifr->lifr_ifinfo; 12642 ASSERT(IAM_WRITER_IPIF(ipif)); 12643 12644 /* Only allow for logical unit zero i.e. not on "le0:17" */ 12645 if (ipif->ipif_id != 0) 12646 return (EINVAL); 12647 12648 /* Set interface MTU. */ 12649 if (ipif->ipif_isv6) 12650 ip_min_mtu = IPV6_MIN_MTU; 12651 else 12652 ip_min_mtu = IP_MIN_MTU; 12653 12654 /* 12655 * Verify values before we set anything. Allow zero to 12656 * mean unspecified. 12657 */ 12658 if (lir->lir_maxmtu != 0 && 12659 (lir->lir_maxmtu > ill->ill_max_frag || 12660 lir->lir_maxmtu < ip_min_mtu)) 12661 return (EINVAL); 12662 if (lir->lir_reachtime != 0 && 12663 lir->lir_reachtime > ND_MAX_REACHTIME) 12664 return (EINVAL); 12665 if (lir->lir_reachretrans != 0 && 12666 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 12667 return (EINVAL); 12668 12669 mutex_enter(&ill->ill_lock); 12670 ill->ill_state_flags |= ILL_CHANGING; 12671 for (nipif = ill->ill_ipif; nipif != NULL; 12672 nipif = nipif->ipif_next) { 12673 nipif->ipif_state_flags |= IPIF_CHANGING; 12674 } 12675 12676 if (lir->lir_maxmtu != 0) { 12677 ill->ill_max_mtu = lir->lir_maxmtu; 12678 ill->ill_user_mtu = lir->lir_maxmtu; 12679 mtu_walk = B_TRUE; 12680 } 12681 mutex_exit(&ill->ill_lock); 12682 12683 if (lir->lir_reachtime != 0) 12684 ill->ill_reachable_time = lir->lir_reachtime; 12685 12686 if (lir->lir_reachretrans != 0) 12687 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 12688 12689 ill->ill_max_hops = lir->lir_maxhops; 12690 12691 ill->ill_max_buf = 
ND_MAX_Q; 12692 12693 if (mtu_walk) { 12694 /* 12695 * Set the MTU on all ipifs associated with this ill except 12696 * for those whose MTU was fixed via SIOCSLIFMTU. 12697 */ 12698 for (nipif = ill->ill_ipif; nipif != NULL; 12699 nipif = nipif->ipif_next) { 12700 if (nipif->ipif_flags & IPIF_FIXEDMTU) 12701 continue; 12702 12703 nipif->ipif_mtu = ill->ill_max_mtu; 12704 12705 if (!(nipif->ipif_flags & IPIF_UP)) 12706 continue; 12707 12708 if (nipif->ipif_isv6) 12709 ire = ipif_to_ire_v6(nipif); 12710 else 12711 ire = ipif_to_ire(nipif); 12712 if (ire != NULL) { 12713 ire->ire_max_frag = ipif->ipif_mtu; 12714 ire_refrele(ire); 12715 } 12716 12717 ire_walk_ill(MATCH_IRE_ILL, 0, ipif_mtu_change, 12718 nipif, ill); 12719 } 12720 } 12721 12722 mutex_enter(&ill->ill_lock); 12723 for (nipif = ill->ill_ipif; nipif != NULL; 12724 nipif = nipif->ipif_next) { 12725 nipif->ipif_state_flags &= ~IPIF_CHANGING; 12726 } 12727 ILL_UNMARK_CHANGING(ill); 12728 mutex_exit(&ill->ill_lock); 12729 12730 /* 12731 * Refresh IPMP meta-interface MTU if necessary. 12732 */ 12733 if (IS_UNDER_IPMP(ill)) 12734 ipmp_illgrp_refresh_mtu(ill->ill_grp); 12735 12736 return (0); 12737 } 12738 12739 /* ARGSUSED */ 12740 int 12741 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12742 ip_ioctl_cmd_t *ipi, void *if_req) 12743 { 12744 struct lif_ifinfo_req *lir; 12745 ill_t *ill = ipif->ipif_ill; 12746 12747 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 12748 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 12749 if (ipif->ipif_id != 0) 12750 return (EINVAL); 12751 12752 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 12753 lir->lir_maxhops = ill->ill_max_hops; 12754 lir->lir_reachtime = ill->ill_reachable_time; 12755 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 12756 lir->lir_maxmtu = ill->ill_max_mtu; 12757 12758 return (0); 12759 } 12760 12761 /* 12762 * Return best guess as to the subnet mask for the specified address. 
 * Based on the subnet masks for all the configured interfaces.
 *
 * We end up returning a zero mask in the case of default, multicast or
 * experimental.
 *
 * On return *ipifp is NULL or points to the ipif whose mask was used; in the
 * latter case a reference has been taken on it with ipif_refhold_locked().
 * NOTE(review): presumably the caller is expected to ipif_refrele() it --
 * confirm against the callers.
 */
static ipaddr_t
ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
{
	ipaddr_t net_mask;
	ill_t *ill;
	ipif_t *ipif;
	ill_walk_context_t ctx;
	ipif_t *fallback_ipif = NULL;	/* first matching pt-pt ipif, held */

	net_mask = ip_net_mask(addr);
	if (net_mask == 0) {
		*ipifp = NULL;
		return (0);
	}

	/* Let's check to see if this is maybe a local subnet route. */
	/* this function only applies to IPv4 interfaces */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* Skip ipifs that cannot be looked up or are down. */
			if (!IPIF_CAN_LOOKUP(ipif))
				continue;
			if (!(ipif->ipif_flags & IPIF_UP))
				continue;
			if ((ipif->ipif_subnet & net_mask) ==
			    (addr & net_mask)) {
				/*
				 * Don't trust pt-pt interfaces if there are
				 * other interfaces.
				 */
				if (ipif->ipif_flags & IPIF_POINTOPOINT) {
					/* Remember only the first one. */
					if (fallback_ipif == NULL) {
						ipif_refhold_locked(ipif);
						fallback_ipif = ipif;
					}
					continue;
				}

				/*
				 * Fine. Just assume the same net mask as the
				 * directly attached subnet interface is using.
				 */
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				rw_exit(&ipst->ips_ill_g_lock);
				/* The pt-pt fallback is no longer needed. */
				if (fallback_ipif != NULL)
					ipif_refrele(fallback_ipif);
				*ipifp = ipif;
				return (ipif->ipif_net_mask);
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * No non-pt-pt match: use the pt-pt fallback's mask if there was
	 * one, else the mask computed by ip_net_mask().
	 */
	*ipifp = fallback_ipif;
	return ((fallback_ipif != NULL) ?
	    fallback_ipif->ipif_net_mask : net_mask);
}

/*
 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
 *
 * The command in the first word of the payload (mp->b_cont) is looked up in
 * ip_ioctl_ftbl and the matching handler, if any, is invoked.  Unless the
 * table entry is flagged IPFT_F_SELF_REPLY or IPFT_F_NO_REPLY, the ioctl
 * message is sent back with qreply() carrying the handler's return value in
 * ioc_error.
 */
static void
ip_wput_ioctl(queue_t *q, mblk_t *mp)
{
	IOCP iocp;
	ipft_t *ipft;
	ipllc_t *ipllc;
	mblk_t *mp1;
	cred_t *cr;
	int error = 0;
	conn_t *connp;

	ip1dbg(("ip_wput_ioctl"));
	iocp = (IOCP)mp->b_rptr;
	mp1 = mp->b_cont;
	/* An ioctl without a payload is NAKed with EINVAL. */
	if (mp1 == NULL) {
		iocp->ioc_error = EINVAL;
		mp->b_datap->db_type = M_IOCNAK;
		iocp->ioc_count = 0;
		qreply(q, mp);
		return;
	}

	/*
	 * These IOCTLs provide various control capabilities to
	 * upstream agents such as ULPs and processes. There
	 * are currently two such IOCTLs implemented. They
	 * are used by TCP to provide update information for
	 * existing IREs and to forcibly delete an IRE for a
	 * host that is not responding, thereby forcing an
	 * attempt at a new route.
	 */
	/* Preset the error reported if the command word can't be pulled up. */
	iocp->ioc_error = EINVAL;
	if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
		goto done;

	/*
	 * Find the table entry for the command.  If none matches, ipft is
	 * left on the terminating entry, whose ipft_pfi is NULL.
	 */
	ipllc = (ipllc_t *)mp1->b_rptr;
	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
		if (ipllc->ipllc_cmd == ipft->ipft_cmd)
			break;
	}
	/*
	 * prefer credential from mblk over ioctl;
	 * see ip_sioctl_copyin_setup
	 */
	cr = msg_getcred(mp, NULL);
	if (cr == NULL)
		cr = iocp->ioc_cr;

	/*
	 * Refhold the conn in case the request gets queued up in some lookup
	 */
	ASSERT(CONN_Q(q));
	connp = Q_TO_CONN(q);
	CONN_INC_REF(connp);
	/*
	 * Call the handler only if the payload holds (or can be pulled up
	 * to) at least ipft_min_size bytes.  Self-replying handlers get the
	 * whole ioctl message, the others just the payload.
	 * NOTE(review): when no handler runs, `error' stays 0 and the
	 * message is acknowledged without an error below -- confirm that
	 * this is intended.
	 */
	if (ipft->ipft_pfi &&
	    ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
	    pullupmsg(mp1, ipft->ipft_min_size))) {
		error = (*ipft->ipft_pfi)(q,
		    (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
	}
	if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
		/*
		 * CONN_OPER_PENDING_DONE happens in the function called
		 * through ipft_pfi above.
		 */
		return;
	}

	CONN_OPER_PENDING_DONE(connp);
	/* No reply wanted: just drop the ioctl message. */
	if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
		freemsg(mp);
		return;
	}
	iocp->ioc_error = error;

done:
	/* Reply with M_IOCACK; a non-zero ioc_error carries no data. */
	mp->b_datap->db_type = M_IOCACK;
	if (iocp->ioc_error)
		iocp->ioc_count = 0;
	qreply(q, mp);
}

/*
 * Lookup an ipif using the sequence id (ipif_seqid)
 *
 * The caller must hold ill_lock.  Returns the first ipif on `ill' with the
 * given seqid that also satisfies IPIF_CAN_LOOKUP(), or NULL if there is
 * none.  No reference is taken on the returned ipif here.
 */
ipif_t *
ipif_lookup_seqid(ill_t *ill, uint_t seqid)
{
	ipif_t *ipif;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif))
			return (ipif);
	}
	return (NULL);
}

/*
 * Assign a unique id for the ipif. This is used later when we send
 * IRES to ARP for resolution where we initialize ire_ipif_seqid
 * to the value pointed by ire_ipif->ipif_seqid. Later when the
 * IRE is added, we verify that ipif has not disappeared.
 *
 * The id is obtained by atomically incrementing the per-stack counter
 * ips_ipif_g_seqid.
 */

static void
ipif_assign_seqid(ipif_t *ipif)
{
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1);
}

/*
 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
 * administratively down (i.e., no DAD), of the same type, and locked. Note
 * that the clone is complete -- including the seqid -- and the expectation is
 * that the caller will either free or overwrite `sipif' before it's unlocked.
12953 */ 12954 static void 12955 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 12956 { 12957 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 12958 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 12959 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 12960 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 12961 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 12962 ASSERT(sipif->ipif_arp_del_mp == NULL); 12963 ASSERT(dipif->ipif_arp_del_mp == NULL); 12964 ASSERT(sipif->ipif_igmp_rpt == NULL); 12965 ASSERT(dipif->ipif_igmp_rpt == NULL); 12966 ASSERT(sipif->ipif_multicast_up == 0); 12967 ASSERT(dipif->ipif_multicast_up == 0); 12968 ASSERT(sipif->ipif_joined_allhosts == 0); 12969 ASSERT(dipif->ipif_joined_allhosts == 0); 12970 12971 dipif->ipif_mtu = sipif->ipif_mtu; 12972 dipif->ipif_flags = sipif->ipif_flags; 12973 dipif->ipif_metric = sipif->ipif_metric; 12974 dipif->ipif_zoneid = sipif->ipif_zoneid; 12975 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 12976 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 12977 dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; 12978 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 12979 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 12980 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 12981 12982 /* 12983 * While dipif is down right now, it might've been up before. Since 12984 * it's changing identity, its packet counters need to be reset. 12985 */ 12986 dipif->ipif_ib_pkt_count = 0; 12987 dipif->ipif_ob_pkt_count = 0; 12988 dipif->ipif_fo_pkt_count = 0; 12989 12990 /* 12991 * As per the comment atop the function, we assume that these sipif 12992 * fields will be changed before sipif is unlocked. 
12993 */ 12994 dipif->ipif_seqid = sipif->ipif_seqid; 12995 dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; 12996 dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; 12997 dipif->ipif_state_flags = sipif->ipif_state_flags; 12998 } 12999 13000 /* 13001 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 13002 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 13003 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 13004 * transfer the xop to `dipif'. Requires that all ipifs are administratively 13005 * down (i.e., no DAD), of the same type, and unlocked. 13006 */ 13007 static void 13008 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 13009 { 13010 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 13011 ipxop_t *ipx = ipsq->ipsq_xop; 13012 13013 ASSERT(sipif != dipif); 13014 ASSERT(sipif != virgipif); 13015 13016 /* 13017 * Grab all of the locks that protect the ipif in a defined order. 13018 */ 13019 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 13020 if (sipif > dipif) { 13021 mutex_enter(&sipif->ipif_saved_ire_lock); 13022 mutex_enter(&dipif->ipif_saved_ire_lock); 13023 } else { 13024 mutex_enter(&dipif->ipif_saved_ire_lock); 13025 mutex_enter(&sipif->ipif_saved_ire_lock); 13026 } 13027 13028 ipif_clone(sipif, dipif); 13029 if (virgipif != NULL) { 13030 ipif_clone(virgipif, sipif); 13031 mi_free(virgipif); 13032 } 13033 13034 mutex_exit(&sipif->ipif_saved_ire_lock); 13035 mutex_exit(&dipif->ipif_saved_ire_lock); 13036 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 13037 13038 /* 13039 * Transfer ownership of the current xop, if necessary. 
13040 */ 13041 if (ipx->ipx_current_ipif == sipif) { 13042 ASSERT(ipx->ipx_pending_ipif == NULL); 13043 mutex_enter(&ipx->ipx_lock); 13044 ipx->ipx_current_ipif = dipif; 13045 mutex_exit(&ipx->ipx_lock); 13046 } 13047 13048 if (virgipif == NULL) 13049 mi_free(sipif); 13050 } 13051 13052 /* 13053 * Insert the ipif, so that the list of ipifs on the ill will be sorted 13054 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will 13055 * be inserted into the first space available in the list. The value of 13056 * ipif_id will then be set to the appropriate value for its position. 13057 */ 13058 static int 13059 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock) 13060 { 13061 ill_t *ill; 13062 ipif_t *tipif; 13063 ipif_t **tipifp; 13064 int id; 13065 ip_stack_t *ipst; 13066 13067 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK || 13068 IAM_WRITER_IPIF(ipif)); 13069 13070 ill = ipif->ipif_ill; 13071 ASSERT(ill != NULL); 13072 ipst = ill->ill_ipst; 13073 13074 /* 13075 * In the case of lo0:0 we already hold the ill_g_lock. 13076 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate -> 13077 * ipif_insert. 
13078 */ 13079 if (acquire_g_lock) 13080 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13081 mutex_enter(&ill->ill_lock); 13082 id = ipif->ipif_id; 13083 tipifp = &(ill->ill_ipif); 13084 if (id == -1) { /* need to find a real id */ 13085 id = 0; 13086 while ((tipif = *tipifp) != NULL) { 13087 ASSERT(tipif->ipif_id >= id); 13088 if (tipif->ipif_id != id) 13089 break; /* non-consecutive id */ 13090 id++; 13091 tipifp = &(tipif->ipif_next); 13092 } 13093 /* limit number of logical interfaces */ 13094 if (id >= ipst->ips_ip_addrs_per_if) { 13095 mutex_exit(&ill->ill_lock); 13096 if (acquire_g_lock) 13097 rw_exit(&ipst->ips_ill_g_lock); 13098 return (-1); 13099 } 13100 ipif->ipif_id = id; /* assign new id */ 13101 } else if (id < ipst->ips_ip_addrs_per_if) { 13102 /* we have a real id; insert ipif in the right place */ 13103 while ((tipif = *tipifp) != NULL) { 13104 ASSERT(tipif->ipif_id != id); 13105 if (tipif->ipif_id > id) 13106 break; /* found correct location */ 13107 tipifp = &(tipif->ipif_next); 13108 } 13109 } else { 13110 mutex_exit(&ill->ill_lock); 13111 if (acquire_g_lock) 13112 rw_exit(&ipst->ips_ill_g_lock); 13113 return (-1); 13114 } 13115 13116 ASSERT(tipifp != &(ill->ill_ipif) || id == 0); 13117 13118 ipif->ipif_next = tipif; 13119 *tipifp = ipif; 13120 mutex_exit(&ill->ill_lock); 13121 if (acquire_g_lock) 13122 rw_exit(&ipst->ips_ill_g_lock); 13123 13124 return (0); 13125 } 13126 13127 static void 13128 ipif_remove(ipif_t *ipif) 13129 { 13130 ipif_t **ipifp; 13131 ill_t *ill = ipif->ipif_ill; 13132 13133 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock)); 13134 13135 mutex_enter(&ill->ill_lock); 13136 ipifp = &ill->ill_ipif; 13137 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) { 13138 if (*ipifp == ipif) { 13139 *ipifp = ipif->ipif_next; 13140 break; 13141 } 13142 } 13143 mutex_exit(&ill->ill_lock); 13144 } 13145 13146 /* 13147 * Allocate and initialize a new interface control structure. (Always 13148 * called as writer.) 
13149 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill 13150 * is not part of the global linked list of ills. ipif_seqid is unique 13151 * in the system and to preserve the uniqueness, it is assigned only 13152 * when ill becomes part of the global list. At that point ill will 13153 * have a name. If it doesn't get assigned here, it will get assigned 13154 * in ipif_set_values() as part of SIOCSLIFNAME processing. 13155 * Aditionally, if we come here from ip_ll_subnet_defaults, we don't set 13156 * the interface flags or any other information from the DL_INFO_ACK for 13157 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at 13158 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the 13159 * second DL_INFO_ACK comes in from the driver. 13160 */ 13161 static ipif_t * 13162 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, 13163 boolean_t insert) 13164 { 13165 ipif_t *ipif; 13166 ip_stack_t *ipst = ill->ill_ipst; 13167 13168 ip1dbg(("ipif_allocate(%s:%d ill %p)\n", 13169 ill->ill_name, id, (void *)ill)); 13170 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill)); 13171 13172 if ((ipif = (ipif_t *)mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) 13173 return (NULL); 13174 *ipif = ipif_zero; /* start clean */ 13175 13176 ipif->ipif_ill = ill; 13177 ipif->ipif_id = id; /* could be -1 */ 13178 /* 13179 * Inherit the zoneid from the ill; for the shared stack instance 13180 * this is always the global zone 13181 */ 13182 ipif->ipif_zoneid = ill->ill_zoneid; 13183 13184 mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 13185 13186 ipif->ipif_refcnt = 0; 13187 ipif->ipif_saved_ire_cnt = 0; 13188 13189 if (insert) { 13190 if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) { 13191 mi_free(ipif); 13192 return (NULL); 13193 } 13194 /* -1 id should have been replaced by real id */ 13195 id = ipif->ipif_id; 13196 ASSERT(id >= 0); 13197 } 13198 13199 if (ill->ill_name[0] != '\0') 
13200 ipif_assign_seqid(ipif); 13201 13202 /* 13203 * If this is the zeroth ipif on the IPMP ill, create the illgrp 13204 * (which must not exist yet because the zeroth ipif is created once 13205 * per ill). However, do not not link it to the ipmp_grp_t until 13206 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details. 13207 */ 13208 if (id == 0 && IS_IPMP(ill)) { 13209 if (ipmp_illgrp_create(ill) == NULL) { 13210 if (insert) { 13211 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 13212 ipif_remove(ipif); 13213 rw_exit(&ipst->ips_ill_g_lock); 13214 } 13215 mi_free(ipif); 13216 return (NULL); 13217 } 13218 } 13219 13220 /* 13221 * We grab ill_lock to protect the flag changes. The ipif is still 13222 * not up and can't be looked up until the ioctl completes and the 13223 * IPIF_CHANGING flag is cleared. 13224 */ 13225 mutex_enter(&ill->ill_lock); 13226 13227 ipif->ipif_ire_type = ire_type; 13228 13229 if (ipif->ipif_isv6) { 13230 ill->ill_flags |= ILLF_IPV6; 13231 } else { 13232 ipaddr_t inaddr_any = INADDR_ANY; 13233 13234 ill->ill_flags |= ILLF_IPV4; 13235 13236 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */ 13237 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13238 &ipif->ipif_v6lcl_addr); 13239 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13240 &ipif->ipif_v6src_addr); 13241 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13242 &ipif->ipif_v6subnet); 13243 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13244 &ipif->ipif_v6net_mask); 13245 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13246 &ipif->ipif_v6brd_addr); 13247 IN6_IPADDR_TO_V4MAPPED(inaddr_any, 13248 &ipif->ipif_v6pp_dst_addr); 13249 } 13250 13251 /* 13252 * Don't set the interface flags etc. now, will do it in 13253 * ip_ll_subnet_defaults. 
13254 */ 13255 if (!initialize) 13256 goto out; 13257 13258 ipif->ipif_mtu = ill->ill_max_mtu; 13259 13260 /* 13261 * NOTE: The IPMP meta-interface is special-cased because it starts 13262 * with no underlying interfaces (and thus an unknown broadcast 13263 * address length), but all interfaces that can be placed into an IPMP 13264 * group are required to be broadcast-capable. 13265 */ 13266 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) { 13267 /* 13268 * Later detect lack of DLPI driver multicast capability by 13269 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi(). 13270 */ 13271 ill->ill_flags |= ILLF_MULTICAST; 13272 if (!ipif->ipif_isv6) 13273 ipif->ipif_flags |= IPIF_BROADCAST; 13274 } else { 13275 if (ill->ill_net_type != IRE_LOOPBACK) { 13276 if (ipif->ipif_isv6) 13277 /* 13278 * Note: xresolv interfaces will eventually need 13279 * NOARP set here as well, but that will require 13280 * those external resolvers to have some 13281 * knowledge of that flag and act appropriately. 13282 * Not to be changed at present. 13283 */ 13284 ill->ill_flags |= ILLF_NONUD; 13285 else 13286 ill->ill_flags |= ILLF_NOARP; 13287 } 13288 if (ill->ill_phys_addr_length == 0) { 13289 if (IS_VNI(ill)) { 13290 ipif->ipif_flags |= IPIF_NOXMIT; 13291 } else { 13292 /* pt-pt supports multicast. */ 13293 ill->ill_flags |= ILLF_MULTICAST; 13294 if (ill->ill_net_type != IRE_LOOPBACK) 13295 ipif->ipif_flags |= IPIF_POINTOPOINT; 13296 } 13297 } 13298 } 13299 out: 13300 mutex_exit(&ill->ill_lock); 13301 return (ipif); 13302 } 13303 13304 /* 13305 * If appropriate, send a message up to the resolver delete the entry 13306 * for the address of this interface which is going out of business. 13307 * (Always called as writer). 13308 * 13309 * NOTE : We need to check for NULL mps as some of the fields are 13310 * initialized only for some interface types. See ipif_resolver_up() 13311 * for details. 
13312 */ 13313 void 13314 ipif_resolver_down(ipif_t *ipif) 13315 { 13316 mblk_t *mp; 13317 ill_t *ill = ipif->ipif_ill; 13318 13319 ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 13320 ASSERT(IAM_WRITER_IPIF(ipif)); 13321 13322 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 13323 return; 13324 13325 /* Delete the mapping for the local address */ 13326 mp = ipif->ipif_arp_del_mp; 13327 if (mp != NULL) { 13328 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", 13329 *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); 13330 putnext(ill->ill_rq, mp); 13331 ipif->ipif_arp_del_mp = NULL; 13332 } 13333 13334 /* 13335 * Make IPMP aware of the deleted data address. 13336 */ 13337 if (IS_IPMP(ill)) 13338 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 13339 13340 /* 13341 * If this is the last ipif that is going down and there are no 13342 * duplicate addresses we may yet attempt to re-probe, then we need to 13343 * clean up ARP completely. 13344 */ 13345 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { 13346 /* 13347 * If this was the last ipif on an IPMP interface, purge any 13348 * IPMP ARP entries associated with it. 13349 */ 13350 if (IS_IPMP(ill)) 13351 ipmp_illgrp_refresh_arpent(ill->ill_grp); 13352 13353 /* Send up AR_INTERFACE_DOWN message */ 13354 mp = ill->ill_arp_down_mp; 13355 if (mp != NULL) { 13356 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", 13357 *(unsigned *)mp->b_rptr, ill->ill_name, 13358 ipif->ipif_id)); 13359 putnext(ill->ill_rq, mp); 13360 ill->ill_arp_down_mp = NULL; 13361 } 13362 13363 /* Tell ARP to delete the multicast mappings */ 13364 mp = ill->ill_arp_del_mapping_mp; 13365 if (mp != NULL) { 13366 ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", 13367 *(unsigned *)mp->b_rptr, ill->ill_name, 13368 ipif->ipif_id)); 13369 putnext(ill->ill_rq, mp); 13370 ill->ill_arp_del_mapping_mp = NULL; 13371 } 13372 } 13373 } 13374 13375 /* 13376 * Set up the multicast mappings for `ipif' in ARP. 
 * If `arp_add_mapping_mp' is non-NULL, then upon success it will contain an
 * mblk that can be passed to ARP to create the mapping.  Otherwise, if it's
 * NULL, upon success ARP will have already been notified to create the
 * mapping.  Returns zero on success, -1 upon failure.
 *
 * Point-to-point ipifs and IPMP ills return success without doing anything.
 * Success is also returned, with *arp_add_mapping_mp left NULL, when the
 * interface turns out to be neither MULTICAST nor MULTI_BCAST.  On success
 * with a mapping, the matching delete request is saved in
 * ill_arp_del_mapping_mp for later use.
 */
int
ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
{
	mblk_t *del_mp = NULL;
	mblk_t *add_mp = NULL;
	mblk_t *mp;
	ill_t *ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;
	ipaddr_t addr, mask, extract_mask = 0;
	arma_t *arma;
	uint8_t *maddr, *bphys_addr;
	uint32_t hw_start;
	dl_unitdata_req_t *dlur;

	ASSERT(IAM_WRITER_IPIF(ipif));
	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		return (0);

	/*
	 * IPMP meta-interfaces don't have any inherent multicast mappings,
	 * and instead use the ones on the underlying interfaces.
	 */
	if (IS_IPMP(ill))
		return (0);

	/*
	 * Delete the existing mapping from ARP. Normally, ipif_down() ->
	 * ipif_resolver_down() will send this up to ARP, but it may be that
	 * we are enabling PHYI_MULTI_BCAST via ip_rput_dlpi_writer().
	 */
	mp = ill->ill_arp_del_mapping_mp;
	if (mp != NULL) {
		ip1dbg(("ipif_arp_setup_multicast: arp cmd %x for %s:%u\n",
		    *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, mp);
		ill->ill_arp_del_mapping_mp = NULL;
	}

	/* Make sure the caller never sees a stale pointer on early return. */
	if (arp_add_mapping_mp != NULL)
		*arp_add_mapping_mp = NULL;

	/*
	 * Check that the address is not too long for the constant
	 * length reserved in the template arma_t.
	 */
	if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
		return (-1);

	/* Add mapping mblk */
	addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
	mask = (ipaddr_t)htonl(IN_CLASSD_NET);
	add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
	    (caddr_t)&addr);
	if (add_mp == NULL)
		return (-1);
	arma = (arma_t *)add_mp->b_rptr;
	/* maddr points at the hardware address slot inside the request. */
	maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
	bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
	arma->arma_hw_addr_length = ill->ill_phys_addr_length;

	/*
	 * Determine the broadcast address.
	 */
	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
	/*
	 * With a non-negative ill_sap_length the physical address is taken
	 * to start ill_sap_length bytes into the destination address.
	 */
	if (ill->ill_sap_length < 0)
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
	else
		bphys_addr = (uchar_t *)dlur +
		    dlur->dl_dest_addr_offset + ill->ill_sap_length;
	/*
	 * Check PHYI_MULTI_BCAST and length of physical
	 * address to determine if we use the mapping or the
	 * broadcast address.
	 */
	/* If MEDIA_V4MINFO() fails, fall back to PHYI_MULTI_BCAST. */
	if (!(phyi->phyint_flags & PHYI_MULTI_BCAST))
		if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length,
		    bphys_addr, maddr, &hw_start, &extract_mask))
			phyi->phyint_flags |= PHYI_MULTI_BCAST;

	if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
	    (ill->ill_flags & ILLF_MULTICAST)) {
		/* Make sure this will not match the "exact" entry. */
		addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP);
		del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
		    (caddr_t)&addr);
		if (del_mp == NULL) {
			freemsg(add_mp);
			return (-1);
		}
		bcopy(&extract_mask, (char *)arma +
		    arma->arma_proto_extract_mask_offset, IP_ADDR_LEN);
		if (phyi->phyint_flags & PHYI_MULTI_BCAST) {
			/* Use link-layer broadcast address for MULTI_BCAST */
			bcopy(bphys_addr, maddr, ill->ill_phys_addr_length);
			ip2dbg(("ipif_arp_setup_multicast: adding"
			    " MULTI_BCAST ARP setup for %s\n", ill->ill_name));
		} else {
			/*
			 * PHYI_MULTI_BCAST is clear here, so MEDIA_V4MINFO()
			 * ran above and succeeded; hw_start is the value it
			 * was handed to fill in.
			 */
			arma->arma_hw_mapping_start = hw_start;
			ip2dbg(("ipif_arp_setup_multicast: adding multicast"
			    " ARP setup for %s\n", ill->ill_name));
		}
	} else {
		freemsg(add_mp);
		ASSERT(del_mp == NULL);
		/* It is neither MULTICAST nor MULTI_BCAST */
		return (0);
	}
	ASSERT(add_mp != NULL && del_mp != NULL);
	ASSERT(ill->ill_arp_del_mapping_mp == NULL);
	/* Keep the delete request for when the mapping has to go away. */
	ill->ill_arp_del_mapping_mp = del_mp;
	if (arp_add_mapping_mp != NULL) {
		/* The caller just wants the mblks allocated */
		*arp_add_mapping_mp = add_mp;
	} else {
		/* The caller wants us to send it to arp */
		putnext(ill->ill_rq, add_mp);
	}
	return (0);
}

/*
 * Get the resolver set up for a new IP address.  (Always called as writer.)
 * Called both for IPv4 and IPv6 interfaces, though it only sets up the
 * resolver for v6 if it's an ILLF_XRESOLV interface.  Honors ILLF_NOARP.
 *
 * The enumerated value res_act tunes the behavior:
 *	* Res_act_initial: set up all the resolver structures for a new
 *	  IP address.
 *	* Res_act_defend: tell ARP that it needs to send a single gratuitous
 *	  ARP message in defense of the address.
 *	* Res_act_rebind: tell ARP to change the hardware address for an IP
 *	  address (and issue gratuitous ARPs).
 *	  Used by ipmp_ill_bind_ipif().
 *
 * Returns zero on success, or an errno upon failure.  Note that the success
 * path itself returns EINPROGRESS rather than zero when no ipif on the ill is
 * currently up or marked duplicate (and this ipif was not a duplicate on
 * entry); see the final return statement.
 *
 * Message ownership: every mblk allocated here is either handed to
 * putnext(), stashed on the ipif/ill for later use, or freed on the `failed'
 * path.
 */
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
	mblk_t	*arp_up_mp = NULL;
	mblk_t	*arp_down_mp = NULL;
	mblk_t	*arp_add_mp = NULL;
	mblk_t	*arp_del_mp = NULL;
	mblk_t	*arp_add_mapping_mp = NULL;
	/*
	 * NOTE(review): arp_del_mapping_mp is never assigned after this
	 * initialization within this function, so the block that stores it
	 * near the end never fires as written.
	 */
	mblk_t	*arp_del_mapping_mp = NULL;
	ill_t	*ill = ipif->ipif_ill;
	/* Default error for the allocation-failure `goto failed' paths. */
	int	err = ENOMEM;
	/* Set once we have called ipmp_illgrp_add_ipif(); undone on failure. */
	boolean_t added_ipif = B_FALSE;
	boolean_t publish;
	boolean_t was_dup;

	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
	    ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
	ASSERT(IAM_WRITER_IPIF(ipif));

	was_dup = B_FALSE;
	if (res_act == Res_act_initial) {
		ipif->ipif_addr_ready = 0;
		/*
		 * We're bringing an interface up here.  There's no way that we
		 * should need to shut down ARP now.
		 */
		mutex_enter(&ill->ill_lock);
		if (ipif->ipif_flags & IPIF_DUPLICATE) {
			/* Clear the duplicate state and remember we did so. */
			ipif->ipif_flags &= ~IPIF_DUPLICATE;
			ill->ill_ipif_dup_count--;
			was_dup = B_TRUE;
		}
		mutex_exit(&ill->ill_lock);
	}
	/* Cancel any outstanding recovery timeout for this ipif. */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;
	/* Nothing to resolve on non-resolver links; the address is ready. */
	if (ill->ill_net_type != IRE_IF_RESOLVER) {
		ipif->ipif_addr_ready = 1;
		return (0);
	}
	/* NDP will set the ipif_addr_ready flag when it's ready */
	if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
		return (0);

	if (ill->ill_isv6) {
		/*
		 * External resolver for IPv6
		 */
		ASSERT(res_act == Res_act_initial);
		publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr);
	} else {
		/*
		 * IPv4 arp case. If the ARP stream has already started
		 * closing, fail this request for ARP bringup. Else
		 * record the fact that an ARP bringup is pending.
		 */
		mutex_enter(&ill->ill_lock);
		if (ill->ill_arp_closing) {
			mutex_exit(&ill->ill_lock);
			err = EINVAL;
			goto failed;
		} else {
			/* Only the first ipif on the ill marks the bringup. */
			if (ill->ill_ipif_up_count == 0 &&
			    ill->ill_ipif_dup_count == 0 && !was_dup)
				ill->ill_arp_bringup_pending = 1;
			mutex_exit(&ill->ill_lock);
		}
		publish = (ipif->ipif_lcl_addr != INADDR_ANY);
	}

	if (IS_IPMP(ill) && publish) {
		/*
		 * If we're here via ipif_up(), then the ipif won't be bound
		 * yet -- add it to the group, which will bind it if possible.
		 * (We would add it in ipif_up(), but deleting on failure
		 * there is gruesome.)  If we're here via ipmp_ill_bind_ipif(),
		 * then the ipif has already been added to the group and we
		 * just need to use the binding.
		 */
		if (ipmp_ipif_bound_ill(ipif) == NULL) {
			if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) {
				/*
				 * We couldn't bind the ipif to an ill yet,
				 * so we have nothing to publish.
				 */
				publish = B_FALSE;
			}
			added_ipif = B_TRUE;
		}
	}

	/*
	 * Add an entry for the local address in ARP only if it
	 * is not UNNUMBERED and it is suitable for publishing.
	 */
	if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) {
		if (res_act == Res_act_defend) {
			arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND);
			if (arp_add_mp == NULL)
				goto failed;
			/*
			 * If we're just defending our address now, then
			 * there's no need to set up ARP multicast mappings.
			 * The publish command is enough.
			 */
			goto done;
		}

		/*
		 * Allocate an ARP add message and an ARP delete message (the
		 * latter is saved for use when the address goes down).
		 */
		if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL)
			goto failed;

		if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL)
			goto failed;

		/* Non-initial actions skip the ARP up/down allocation. */
		if (res_act != Res_act_initial)
			goto arp_setup_multicast;
	} else {
		if (res_act != Res_act_initial)
			goto done;
	}
	/*
	 * Need to bring up ARP or setup multicast mapping only
	 * when the first interface is coming UP.
	 */
	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup)
		goto done;

	/*
	 * Allocate an ARP down message (to be saved) and an ARP up message.
	 */
	arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
	if (arp_down_mp == NULL)
		goto failed;

	arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0);
	if (arp_up_mp == NULL)
		goto failed;

	/* Point-to-point ipifs skip the multicast mapping setup. */
	if (ipif->ipif_flags & IPIF_POINTOPOINT)
		goto done;

arp_setup_multicast:
	/*
	 * Setup the multicast mappings. This function initializes
	 * ill_arp_del_mapping_mp also. This does not need to be done for
	 * IPv6, or for the IPMP interface (since it has no link-layer).
	 */
	if (!ill->ill_isv6 && !IS_IPMP(ill)) {
		err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
		if (err != 0)
			goto failed;
		ASSERT(ill->ill_arp_del_mapping_mp != NULL);
		ASSERT(arp_add_mapping_mp != NULL);
	}
done:
	/*
	 * Send the prepared messages upstream in order: ARP up, then the
	 * address add, then the multicast mapping add.  Each pointer is
	 * cleared once passed to putnext() so that the `failed' path below
	 * cannot free a message that has already been sent.
	 */
	if (arp_up_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_up_mp);
		arp_up_mp = NULL;
	}
	if (arp_add_mp != NULL) {
		ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		/*
		 * If it's an extended ARP implementation, then we'll wait to
		 * hear that DAD has finished before using the interface.
		 */
		if (!ill->ill_arp_extend)
			ipif->ipif_addr_ready = 1;
		putnext(ill->ill_rq, arp_add_mp);
		arp_add_mp = NULL;
	} else {
		/* Nothing is being published, so the address is ready now. */
		ipif->ipif_addr_ready = 1;
	}
	if (arp_add_mapping_mp != NULL) {
		ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
		    ill->ill_name, ipif->ipif_id));
		putnext(ill->ill_rq, arp_add_mapping_mp);
		arp_add_mapping_mp = NULL;
	}

	if (res_act == Res_act_initial) {
		/* Honor ILLF_NOARP on initial bringup. */
		if (ill->ill_flags & ILLF_NOARP)
			err = ill_arp_off(ill);
		else
			err = ill_arp_on(ill);
		if (err != 0) {
			ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n",
			    err));
			goto failed;
		}
	}

	/* Stash the teardown messages for use when the address goes down. */
	if (arp_del_mp != NULL) {
		ASSERT(ipif->ipif_arp_del_mp == NULL);
		ipif->ipif_arp_del_mp = arp_del_mp;
	}
	if (arp_down_mp != NULL) {
		ASSERT(ill->ill_arp_down_mp == NULL);
		ill->ill_arp_down_mp = arp_down_mp;
	}
	if (arp_del_mapping_mp != NULL) {
		ASSERT(ill->ill_arp_del_mapping_mp == NULL);
		ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
	}

	/*
	 * Return EINPROGRESS when no ipif on this ill is up or duplicate and
	 * this ipif was not a duplicate on entry; otherwise return 0.
	 */
	return ((ill->ill_ipif_up_count != 0 || was_dup ||
	    ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS);
failed:
	ip1dbg(("ipif_resolver_up: FAILED\n"));
	/* Undo the IPMP group addition made above, if any. */
	if (added_ipif)
		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
	/* Pointers for messages already sent were reset to NULL above. */
	freemsg(arp_add_mp);
	freemsg(arp_del_mp);
	freemsg(arp_add_mapping_mp);
	freemsg(arp_up_mp);
	freemsg(arp_down_mp);
	ill->ill_arp_bringup_pending = 0;
	return (err);
}

/*
 * This routine restarts IPv4 duplicate address detection (DAD) when a link has
 * just gone back up.
13747 */ 13748 static void 13749 ipif_arp_start_dad(ipif_t *ipif) 13750 { 13751 ill_t *ill = ipif->ipif_ill; 13752 mblk_t *arp_add_mp; 13753 13754 /* ACE_F_UNVERIFIED restarts DAD */ 13755 if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || 13756 (ipif->ipif_flags & IPIF_UNNUMBERED) || 13757 ipif->ipif_lcl_addr == INADDR_ANY || 13758 (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { 13759 /* 13760 * If we can't contact ARP for some reason, that's not really a 13761 * problem. Just send out the routing socket notification that 13762 * DAD completion would have done, and continue. 13763 */ 13764 ipif_mask_reply(ipif); 13765 ipif_up_notify(ipif); 13766 ipif->ipif_addr_ready = 1; 13767 return; 13768 } 13769 13770 putnext(ill->ill_rq, arp_add_mp); 13771 } 13772 13773 static void 13774 ipif_ndp_start_dad(ipif_t *ipif) 13775 { 13776 nce_t *nce; 13777 13778 nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, 13779 B_FALSE); 13780 if (nce == NULL) 13781 return; 13782 13783 if (!ndp_restart_dad(nce)) { 13784 /* 13785 * If we can't restart DAD for some reason, that's not really a 13786 * problem. Just send out the routing socket notification that 13787 * DAD completion would have done, and continue. 13788 */ 13789 ipif_up_notify(ipif); 13790 ipif->ipif_addr_ready = 1; 13791 } 13792 NCE_REFRELE(nce); 13793 } 13794 13795 /* 13796 * Restart duplicate address detection on all interfaces on the given ill. 13797 * 13798 * This is called when an interface transitions from down to up 13799 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 13800 * 13801 * Note that since the underlying physical link has transitioned, we must cause 13802 * at least one routing socket message to be sent here, either via DAD 13803 * completion or just by default on the first ipif. (If we don't do this, then 13804 * in.mpathd will see long delays when doing link-based failure recovery.) 
13805 */ 13806 void 13807 ill_restart_dad(ill_t *ill, boolean_t went_up) 13808 { 13809 ipif_t *ipif; 13810 13811 if (ill == NULL) 13812 return; 13813 13814 /* 13815 * If layer two doesn't support duplicate address detection, then just 13816 * send the routing socket message now and be done with it. 13817 */ 13818 if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || 13819 (!ill->ill_isv6 && !ill->ill_arp_extend)) { 13820 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 13821 return; 13822 } 13823 13824 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13825 if (went_up) { 13826 if (ipif->ipif_flags & IPIF_UP) { 13827 if (ill->ill_isv6) 13828 ipif_ndp_start_dad(ipif); 13829 else 13830 ipif_arp_start_dad(ipif); 13831 } else if (ill->ill_isv6 && 13832 (ipif->ipif_flags & IPIF_DUPLICATE)) { 13833 /* 13834 * For IPv4, the ARP module itself will 13835 * automatically start the DAD process when it 13836 * sees DL_NOTE_LINK_UP. We respond to the 13837 * AR_CN_READY at the completion of that task. 13838 * For IPv6, we must kick off the bring-up 13839 * process now. 13840 */ 13841 ndp_do_recovery(ipif); 13842 } else { 13843 /* 13844 * Unfortunately, the first ipif is "special" 13845 * and represents the underlying ill in the 13846 * routing socket messages. Thus, when this 13847 * one ipif is down, we must still notify so 13848 * that the user knows the IFF_RUNNING status 13849 * change. (If the first ipif is up, then 13850 * we'll handle eventual routing socket 13851 * notification via DAD completion.) 13852 */ 13853 if (ipif == ill->ill_ipif) { 13854 ip_rts_ifmsg(ill->ill_ipif, 13855 RTSQ_DEFAULT); 13856 } 13857 } 13858 } else { 13859 /* 13860 * After link down, we'll need to send a new routing 13861 * message when the link comes back, so clear 13862 * ipif_addr_ready. 13863 */ 13864 ipif->ipif_addr_ready = 0; 13865 } 13866 } 13867 13868 /* 13869 * If we've torn down links, then notify the user right away. 
13870 */ 13871 if (!went_up) 13872 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 13873 } 13874 13875 static void 13876 ipsq_delete(ipsq_t *ipsq) 13877 { 13878 ipxop_t *ipx = ipsq->ipsq_xop; 13879 13880 ipsq->ipsq_ipst = NULL; 13881 ASSERT(ipsq->ipsq_phyint == NULL); 13882 ASSERT(ipsq->ipsq_xop != NULL); 13883 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); 13884 ASSERT(ipx->ipx_pending_mp == NULL); 13885 kmem_free(ipsq, sizeof (ipsq_t)); 13886 } 13887 13888 static int 13889 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) 13890 { 13891 int err; 13892 ipif_t *ipif; 13893 13894 if (ill == NULL) 13895 return (0); 13896 13897 ASSERT(IAM_WRITER_ILL(ill)); 13898 ill->ill_up_ipifs = B_TRUE; 13899 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13900 if (ipif->ipif_was_up) { 13901 if (!(ipif->ipif_flags & IPIF_UP)) 13902 err = ipif_up(ipif, q, mp); 13903 ipif->ipif_was_up = B_FALSE; 13904 if (err != 0) { 13905 ASSERT(err == EINPROGRESS); 13906 return (err); 13907 } 13908 } 13909 } 13910 mutex_enter(&ill->ill_lock); 13911 ill->ill_state_flags &= ~ILL_CHANGING; 13912 mutex_exit(&ill->ill_lock); 13913 ill->ill_up_ipifs = B_FALSE; 13914 return (0); 13915 } 13916 13917 /* 13918 * This function is called to bring up all the ipifs that were up before 13919 * bringing the ill down via ill_down_ipifs(). 13920 */ 13921 int 13922 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 13923 { 13924 int err; 13925 13926 ASSERT(IAM_WRITER_ILL(ill)); 13927 13928 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); 13929 if (err != 0) 13930 return (err); 13931 13932 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); 13933 } 13934 13935 /* 13936 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring 13937 * down the ipifs without sending DL_UNBIND_REQ to the driver. 
13938 */ 13939 static void 13940 ill_down_ipifs(ill_t *ill, boolean_t logical) 13941 { 13942 ipif_t *ipif; 13943 13944 ASSERT(IAM_WRITER_ILL(ill)); 13945 13946 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13947 /* 13948 * We go through the ipif_down logic even if the ipif 13949 * is already down, since routes can be added based 13950 * on down ipifs. Going through ipif_down once again 13951 * will delete any IREs created based on these routes. 13952 */ 13953 if (ipif->ipif_flags & IPIF_UP) 13954 ipif->ipif_was_up = B_TRUE; 13955 13956 /* 13957 * Need to re-create net/subnet bcast ires if 13958 * they are dependent on ipif. 13959 */ 13960 if (!ipif->ipif_isv6) 13961 ipif_check_bcast_ires(ipif); 13962 if (logical) { 13963 (void) ipif_logical_down(ipif, NULL, NULL); 13964 ipif_non_duplicate(ipif); 13965 ipif_down_tail(ipif); 13966 } else { 13967 (void) ipif_down(ipif, NULL, NULL); 13968 } 13969 } 13970 } 13971 13972 /* 13973 * Redo source address selection. This is called when a 13974 * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. 13975 */ 13976 void 13977 ill_update_source_selection(ill_t *ill) 13978 { 13979 ipif_t *ipif; 13980 13981 ASSERT(IAM_WRITER_ILL(ill)); 13982 13983 /* 13984 * Underlying interfaces are only used for test traffic and thus 13985 * should always send with their (deprecated) source addresses. 13986 */ 13987 if (IS_UNDER_IPMP(ill)) 13988 return; 13989 13990 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13991 if (ill->ill_isv6) 13992 ipif_recreate_interface_routes_v6(NULL, ipif); 13993 else 13994 ipif_recreate_interface_routes(NULL, ipif); 13995 } 13996 } 13997 13998 /* 13999 * Finish the group join started in ip_sioctl_groupname(). 
 *
 * Runs as an IPSQ writer callback (see the qwriter_ip() call in
 * ip_sioctl_groupname()).  `q' is the read queue of the joining ill and `mp'
 * is the scratch mblk allocated by ip_sioctl_groupname(), which is freed
 * here.  For each of the phyint's v4/v6 ills that exists, the matching
 * gr_pend* count bumped by ip_sioctl_groupname() is dropped under ipmp_lock
 * and the ill is joined to the group's corresponding illgrp.
 */
/* ARGSUSED */
static void
ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
	ill_t		*ill = q->q_ptr;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
	ASSERT(!IS_IPMP(ill) && grp != NULL);
	ASSERT(IAM_WRITER_IPSQ(ipsq));

	if (phyi->phyint_illv4 != NULL) {
		/* Release the pending count taken in ip_sioctl_groupname(). */
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv4-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
	}
	if (phyi->phyint_illv6 != NULL) {
		/* Release the pending count taken in ip_sioctl_groupname(). */
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv6-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
	}
	freemsg(mp);
}

/*
 * Process an SIOCSLIFGROUPNAME request.
 *
 * Depending on the ill and the requested name this renames an IPMP
 * meta-interface's group, adds the interface to an existing group, or
 * removes it from its group.
 *
 * Returns 0 on success (including the no-change case), EINVAL for an ipif
 * other than ipif 0, an ill in a usesrc group or a virtual phyint, EALREADY
 * if the interface is already in a group, ENOENT if the named group does not
 * exist, ENOMEM if the scratch mblk cannot be allocated, or whatever
 * ipmp_grp_rename() / ipmp_grp_vet_phyint() return.
 */
/* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq	*lifr = ifreq;
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	mblk_t		*ipsq_mp;
	int		err = 0;

	/*
	 * Note that phyint_grp can only change here, where we're exclusive.
	 */
	ASSERT(IAM_WRITER_ILL(ill));

	if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
	    (phyi->phyint_flags & PHYI_VIRTUAL))
		return (EINVAL);

	/* Force termination so the string functions below are bounded. */
	lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	/*
	 * If the name hasn't changed, there's nothing to do.
	 */
	if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
		goto unlock;

	/*
	 * Handle requests to rename an IPMP meta-interface.
	 *
	 * Note that creation of the IPMP meta-interface is handled in
	 * userland through the standard plumbing sequence.  As part of the
	 * plumbing the IPMP meta-interface, its initial groupname is set to
	 * the name of the interface (see ipif_set_values_tail()).
	 */
	if (IS_IPMP(ill)) {
		err = ipmp_grp_rename(grp, lifr->lifr_groupname);
		goto unlock;
	}

	/*
	 * Handle requests to add or remove an IP interface from a group.
	 */
	if (lifr->lifr_groupname[0] != '\0') {			/* add */
		/*
		 * Moves are handled by first removing the interface from
		 * its existing group, and then adding it to another group.
		 * So, fail if it's already in a group.
		 */
		if (IS_UNDER_IPMP(ill)) {
			err = EALREADY;
			goto unlock;
		}

		grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
		if (grp == NULL) {
			err = ENOENT;
			goto unlock;
		}

		/*
		 * Check if the phyint and its ills are suitable for
		 * inclusion into the group.
		 */
		if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
			goto unlock;

		/*
		 * Checks pass; join the group, and enqueue the remaining
		 * illgrp joins for when we've become part of the group xop
		 * and are exclusive across its IPSQs.  Since qwriter_ip()
		 * requires an mblk_t to scribble on, and since `mp' will be
		 * freed as part of completing the ioctl, allocate another.
		 */
		if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
			err = ENOMEM;
			goto unlock;
		}

		/*
		 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
		 * IPMP meta-interface ills needed by `phyi' cannot go away
		 * before ip_join_illgrps() is called back.  See the comments
		 * in ip_sioctl_plink_ipmp() for more.
		 */
		if (phyi->phyint_illv4 != NULL)
			grp->gr_pendv4++;
		if (phyi->phyint_illv6 != NULL)
			grp->gr_pendv6++;

		rw_exit(&ipst->ips_ipmp_lock);

		ipmp_phyint_join_grp(phyi, grp);
		/*
		 * Hold the ill across the qwriter_ip() call.
		 * NOTE(review): presumably qwriter_ip() releases this hold,
		 * as the comment in ill_dlpi_send() states for its own call
		 * -- confirm.
		 */
		ill_refhold(ill);
		qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
		    SWITCH_OP, B_FALSE);
		return (0);
	} else {
		/*
		 * Request to remove the interface from a group.  If the
		 * interface is not in a group, this trivially succeeds.
		 */
		rw_exit(&ipst->ips_ipmp_lock);
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_leave_grp(phyi);
		return (0);
	}
unlock:
	/* Common exit for every path that still holds ipmp_lock. */
	rw_exit(&ipst->ips_ipmp_lock);
	return (err);
}

/*
 * Process an SIOCGLIFBINDING request.
 *
 * Only valid on an IPMP ill (EINVAL otherwise).  Copies the name of the ill
 * that the ipif is bound to into lifr_binding, or stores an empty string if
 * the ipif is not bound.
 */
/* ARGSUSED */
int
ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t		*ill;
	struct lifreq	*lifr = ifreq;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;

	if (!IS_IPMP(ipif->ipif_ill))
		return (EINVAL);

	/* ipif_bound_ill is read under ipmp_lock held as reader. */
	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((ill = ipif->ipif_bound_ill) == NULL)
		lifr->lifr_binding[0] = '\0';
	else
		(void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
	rw_exit(&ipst->ips_ipmp_lock);
	return (0);
}

/*
 * Process an SIOCGLIFGROUPNAME request.
 *
 * Copies the name of the IPMP group that the ipif's phyint belongs to into
 * lifr_groupname, or stores an empty string if the phyint has no group.
 * Always returns 0.
 */
/* ARGSUSED */
int
ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ipmp_grp_t	*grp;
	struct lifreq	*lifr = ifreq;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;

	/* phyint_grp is read under ipmp_lock held as reader. */
	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
		lifr->lifr_groupname[0] = '\0';
	else
		(void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
	rw_exit(&ipst->ips_ipmp_lock);
	return (0);
}

/*
 * Process an SIOCGLIFGROUPINFO request.
 *
 * Looks up the group named by gi_grname in the request and fills in the
 * lifgroupinfo_t via ipmp_grp_info().  Returns ENOENT if no such group
 * exists, otherwise 0.
 */
/* ARGSUSED */
int
ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy)
{
	ipmp_grp_t	*grp;
	lifgroupinfo_t	*lifgr;
	ip_stack_t	*ipst = CONNQ_TO_IPST(q);

	/* ip_wput_nondata() verified mp->b_cont->b_cont */
	lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
	/* Force termination of the caller-supplied group name. */
	lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (ENOENT);
	}
	ipmp_grp_info(grp, lifgr);
	rw_exit(&ipst->ips_ipmp_lock);
	return (0);
}

/*
 * Take the data-link side of `ill' down: send the saved unbind message (if
 * there is one), leave all multicast groups, clear ill_dl_up and dispatch an
 * NE_DOWN NIC event.
 */
static void
ill_dl_down(ill_t *ill)
{
	/*
	 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilities
	 * with the data link service provider (IDS_OK) then reset them.
	 * The interval between unbinding and rebinding is potentially
	 * unbounded hence we cannot assume things will be the same.
	 * The DLPI capabilities will be probed again when the data link
	 * is brought up.
	 */
	mblk_t	*mp = ill->ill_unbind_mp;

	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

	/* Take ownership of the unbind message; it is sent below if set. */
	ill->ill_unbind_mp = NULL;
	if (mp != NULL) {
		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
		    dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
		    ill->ill_name));
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		/*
		 * ip_rput does not pass up normal (M_PROTO) DLPI messages
		 * after ILL_CONDEMNED is set. So in the unplumb case, we call
		 * ill_capability_dld_disable right away. If this is not
		 * an unplumb operation then the disable happens on receipt of
		 * the capab ack via ip_rput_dlpi_writer ->
		 * ill_capability_ack_thr. In both cases the order of
		 * the operations seen by DLD is capability disable followed
		 * by DL_UNBIND. Also the DLD capability disable needs a
		 * cv_wait'able context.
		 */
		if (ill->ill_state_flags & ILL_CONDEMNED)
			ill_capability_dld_disable(ill);
		ill_capability_reset(ill, B_FALSE);
		ill_dlpi_send(ill, mp);
	}

	/*
	 * Toss all of our multicast memberships.  We could keep them, but
	 * then we'd have to do bookkeeping of any joins and leaves performed
	 * by the application while the interface is down (we can't just
	 * issue them because arp cannot currently process AR_ENTRY_SQUERY's
	 * on a downed interface).
	 */
	ill_leave_multicast(ill);

	mutex_enter(&ill->ill_lock);
	ill->ill_dl_up = 0;
	ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
	mutex_exit(&ill->ill_lock);
}

/*
 * Hand the DLPI message `mp' to the driver below `ill' and, unless the ill
 * is condemned (DL_UNBIND_REQ excepted), record its primitive in
 * ill_dlpi_pending.  Some primitives also update per-ill state before being
 * sent; see the switch below.
 */
static void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
	union DL_primitives *dlp;
	t_uscalar_t prim;
	boolean_t waitack = B_FALSE;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	dlp = (union DL_primitives *)mp->b_rptr;
	prim = dlp->dl_primitive;

	ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
	    dl_primstr(prim), prim, ill->ill_name));

	switch (prim) {
	case DL_PHYS_ADDR_REQ:
	{
		/* Record which address type was requested. */
		dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
		ill->ill_phys_addr_pend = dlpap->dl_addr_type;
		break;
	}
	case DL_BIND_REQ:
		/* A new bind clears the unbind-in-progress state. */
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		break;
	}

	/*
	 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
	 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
	 * we only wait for the ACK of the DL_UNBIND_REQ.
	 */
	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
	    (prim == DL_UNBIND_REQ)) {
		ill->ill_dlpi_pending = prim;
		waitack = B_TRUE;
	}

	/* The lock is dropped before the message is passed downstream. */
	mutex_exit(&ill->ill_lock);
	putnext(ill->ill_wq, mp);

	/*
	 * There is no ack for DL_NOTIFY_CONF messages
	 */
	if (waitack && prim == DL_NOTIFY_CONF)
		ill_dlpi_done(ill, prim);
}

/*
 * Helper function for ill_dlpi_send().
14329 */ 14330 /* ARGSUSED */ 14331 static void 14332 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 14333 { 14334 ill_dlpi_send(q->q_ptr, mp); 14335 } 14336 14337 /* 14338 * Send a DLPI control message to the driver but make sure there 14339 * is only one outstanding message. Uses ill_dlpi_pending to tell 14340 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 14341 * when an ACK or a NAK is received to process the next queued message. 14342 */ 14343 void 14344 ill_dlpi_send(ill_t *ill, mblk_t *mp) 14345 { 14346 mblk_t **mpp; 14347 14348 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 14349 14350 /* 14351 * To ensure that any DLPI requests for current exclusive operation 14352 * are always completely sent before any DLPI messages for other 14353 * operations, require writer access before enqueuing. 14354 */ 14355 if (!IAM_WRITER_ILL(ill)) { 14356 ill_refhold(ill); 14357 /* qwriter_ip() does the ill_refrele() */ 14358 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 14359 NEW_OP, B_TRUE); 14360 return; 14361 } 14362 14363 mutex_enter(&ill->ill_lock); 14364 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 14365 /* Must queue message. 
Tail insertion */ 14366 mpp = &ill->ill_dlpi_deferred; 14367 while (*mpp != NULL) 14368 mpp = &((*mpp)->b_next); 14369 14370 ip1dbg(("ill_dlpi_send: deferring request for %s\n", 14371 ill->ill_name)); 14372 14373 *mpp = mp; 14374 mutex_exit(&ill->ill_lock); 14375 return; 14376 } 14377 mutex_exit(&ill->ill_lock); 14378 ill_dlpi_dispatch(ill, mp); 14379 } 14380 14381 static void 14382 ill_capability_send(ill_t *ill, mblk_t *mp) 14383 { 14384 ill->ill_capab_pending_cnt++; 14385 ill_dlpi_send(ill, mp); 14386 } 14387 14388 void 14389 ill_capability_done(ill_t *ill) 14390 { 14391 ASSERT(ill->ill_capab_pending_cnt != 0); 14392 14393 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 14394 14395 ill->ill_capab_pending_cnt--; 14396 if (ill->ill_capab_pending_cnt == 0 && 14397 ill->ill_dlpi_capab_state == IDCS_OK) 14398 ill_capability_reset_alloc(ill); 14399 } 14400 14401 /* 14402 * Send all deferred DLPI messages without waiting for their ACKs. 14403 */ 14404 void 14405 ill_dlpi_send_deferred(ill_t *ill) 14406 { 14407 mblk_t *mp, *nextmp; 14408 14409 /* 14410 * Clear ill_dlpi_pending so that the message is not queued in 14411 * ill_dlpi_send(). 14412 */ 14413 mutex_enter(&ill->ill_lock); 14414 ill->ill_dlpi_pending = DL_PRIM_INVAL; 14415 mp = ill->ill_dlpi_deferred; 14416 ill->ill_dlpi_deferred = NULL; 14417 mutex_exit(&ill->ill_lock); 14418 14419 for (; mp != NULL; mp = nextmp) { 14420 nextmp = mp->b_next; 14421 mp->b_next = NULL; 14422 ill_dlpi_send(ill, mp); 14423 } 14424 } 14425 14426 /* 14427 * Check if the DLPI primitive `prim' is pending; print a warning if not. 14428 */ 14429 boolean_t 14430 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 14431 { 14432 t_uscalar_t pending; 14433 14434 mutex_enter(&ill->ill_lock); 14435 if (ill->ill_dlpi_pending == prim) { 14436 mutex_exit(&ill->ill_lock); 14437 return (B_TRUE); 14438 } 14439 14440 /* 14441 * During teardown, ill_dlpi_dispatch() will send DLPI requests 14442 * without waiting, so don't print any warnings in that case. 
14443 */ 14444 if (ill->ill_state_flags & ILL_CONDEMNED) { 14445 mutex_exit(&ill->ill_lock); 14446 return (B_FALSE); 14447 } 14448 pending = ill->ill_dlpi_pending; 14449 mutex_exit(&ill->ill_lock); 14450 14451 if (pending == DL_PRIM_INVAL) { 14452 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14453 "received unsolicited ack for %s on %s\n", 14454 dl_primstr(prim), ill->ill_name); 14455 } else { 14456 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 14457 "received unexpected ack for %s on %s (expecting %s)\n", 14458 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 14459 } 14460 return (B_FALSE); 14461 } 14462 14463 /* 14464 * Complete the current DLPI operation associated with `prim' on `ill' and 14465 * start the next queued DLPI operation (if any). If there are no queued DLPI 14466 * operations and the ill's current exclusive IPSQ operation has finished 14467 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to 14468 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See 14469 * the comments above ipsq_current_finish() for details. 
14470 */ 14471 void 14472 ill_dlpi_done(ill_t *ill, t_uscalar_t prim) 14473 { 14474 mblk_t *mp; 14475 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 14476 ipxop_t *ipx = ipsq->ipsq_xop; 14477 14478 ASSERT(IAM_WRITER_IPSQ(ipsq)); 14479 mutex_enter(&ill->ill_lock); 14480 14481 ASSERT(prim != DL_PRIM_INVAL); 14482 ASSERT(ill->ill_dlpi_pending == prim); 14483 14484 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name, 14485 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending)); 14486 14487 if ((mp = ill->ill_dlpi_deferred) == NULL) { 14488 ill->ill_dlpi_pending = DL_PRIM_INVAL; 14489 if (ipx->ipx_current_done) { 14490 mutex_enter(&ipx->ipx_lock); 14491 ipx->ipx_current_ipif = NULL; 14492 mutex_exit(&ipx->ipx_lock); 14493 } 14494 cv_signal(&ill->ill_cv); 14495 mutex_exit(&ill->ill_lock); 14496 return; 14497 } 14498 14499 ill->ill_dlpi_deferred = mp->b_next; 14500 mp->b_next = NULL; 14501 mutex_exit(&ill->ill_lock); 14502 14503 ill_dlpi_dispatch(ill, mp); 14504 } 14505 14506 void 14507 conn_delete_ire(conn_t *connp, caddr_t arg) 14508 { 14509 ipif_t *ipif = (ipif_t *)arg; 14510 ire_t *ire; 14511 14512 /* 14513 * Look at the cached ires on conns which has pointers to ipifs. 14514 * We just call ire_refrele which clears up the reference 14515 * to ire. Called when a conn closes. Also called from ipif_free 14516 * to cleanup indirect references to the stale ipif via the cached ire. 14517 */ 14518 mutex_enter(&connp->conn_lock); 14519 ire = connp->conn_ire_cache; 14520 if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { 14521 connp->conn_ire_cache = NULL; 14522 mutex_exit(&connp->conn_lock); 14523 IRE_REFRELE_NOTR(ire); 14524 return; 14525 } 14526 mutex_exit(&connp->conn_lock); 14527 14528 } 14529 14530 /* 14531 * Some operations (e.g., ipif_down()) conditionally delete a number 14532 * of IREs. Those IREs may have been previously cached in the conn structure. 
14533 * This ipcl_walk() walker function releases all references to such IREs based 14534 * on the condemned flag. 14535 */ 14536 /* ARGSUSED */ 14537 void 14538 conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) 14539 { 14540 ire_t *ire; 14541 14542 mutex_enter(&connp->conn_lock); 14543 ire = connp->conn_ire_cache; 14544 if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { 14545 connp->conn_ire_cache = NULL; 14546 mutex_exit(&connp->conn_lock); 14547 IRE_REFRELE_NOTR(ire); 14548 return; 14549 } 14550 mutex_exit(&connp->conn_lock); 14551 } 14552 14553 /* 14554 * Take down a specific interface, but don't lose any information about it. 14555 * (Always called as writer.) 14556 * This function goes through the down sequence even if the interface is 14557 * already down. There are 2 reasons. 14558 * a. Currently we permit interface routes that depend on down interfaces 14559 * to be added. This behaviour itself is questionable. However it appears 14560 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 14561 * time. We go thru the cleanup in order to remove these routes. 14562 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 14563 * DL_ERROR_ACK in response to the the DL_BIND request. The interface is 14564 * down, but we need to cleanup i.e. do ill_dl_down and 14565 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 14566 * 14567 * IP-MT notes: 14568 * 14569 * Model of reference to interfaces. 14570 * 14571 * The following members in ipif_t track references to the ipif. 14572 * int ipif_refcnt; Active reference count 14573 * uint_t ipif_ire_cnt; Number of ire's referencing this ipif 14574 * uint_t ipif_ilm_cnt; Number of ilms's references this ipif. 14575 * 14576 * The following members in ill_t track references to the ill. 
14577 * int ill_refcnt; active refcnt 14578 * uint_t ill_ire_cnt; Number of ires referencing ill 14579 * uint_t ill_nce_cnt; Number of nces referencing ill 14580 * uint_t ill_ilm_cnt; Number of ilms referencing ill 14581 * 14582 * Reference to an ipif or ill can be obtained in any of the following ways. 14583 * 14584 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 14585 * Pointers to ipif / ill from other data structures viz ire and conn. 14586 * Implicit reference to the ipif / ill by holding a reference to the ire. 14587 * 14588 * The ipif/ill lookup functions return a reference held ipif / ill. 14589 * ipif_refcnt and ill_refcnt track the reference counts respectively. 14590 * This is a purely dynamic reference count associated with threads holding 14591 * references to the ipif / ill. Pointers from other structures do not 14592 * count towards this reference count. 14593 * 14594 * ipif_ire_cnt/ill_ire_cnt is the number of ire's 14595 * associated with the ipif/ill. This is incremented whenever a new 14596 * ire is created referencing the ipif/ill. This is done atomically inside 14597 * ire_add_v[46] where the ire is actually added to the ire hash table. 14598 * The count is decremented in ire_inactive where the ire is destroyed. 14599 * 14600 * nce's reference ill's thru nce_ill and the count of nce's associated with 14601 * an ill is recorded in ill_nce_cnt. This is incremented atomically in 14602 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the 14603 * table. Similarly it is decremented in ndp_inactive() where the nce 14604 * is destroyed. 14605 * 14606 * ilm's reference to the ipif (for IPv4 ilm's) or the ill (for IPv6 ilm's) 14607 * is incremented in ilm_add_v6() and decremented before the ilm is freed 14608 * in ilm_walker_cleanup() or ilm_delete(). 
14609 * 14610 * Flow of ioctls involving interface down/up 14611 * 14612 * The following is the sequence of an attempt to set some critical flags on an 14613 * up interface. 14614 * ip_sioctl_flags 14615 * ipif_down 14616 * wait for ipif to be quiescent 14617 * ipif_down_tail 14618 * ip_sioctl_flags_tail 14619 * 14620 * All set ioctls that involve down/up sequence would have a skeleton similar 14621 * to the above. All the *tail functions are called after the refcounts have 14622 * dropped to the appropriate values. 14623 * 14624 * The mechanism to quiesce an ipif is as follows. 14625 * 14626 * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed 14627 * on the ipif. Callers either pass a flag requesting wait or the lookup 14628 * functions will return NULL. 14629 * 14630 * Delete all ires referencing this ipif 14631 * 14632 * Any thread attempting to do an ipif_refhold on an ipif that has been 14633 * obtained thru a cached pointer will first make sure that 14634 * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then 14635 * increment the refcount. 14636 * 14637 * The above guarantees that the ipif refcount will eventually come down to 14638 * zero and the ipif will quiesce, once all threads that currently hold a 14639 * reference to the ipif refrelease the ipif. The ipif is quiescent after the 14640 * ipif_refcount has dropped to zero and all ire's associated with this ipif 14641 * have also been ire_inactive'd. i.e. when ipif_{ire, ill}_cnt and 14642 * ipif_refcnt both drop to zero. See also: comments above IPIF_DOWN_OK() 14643 * in ip.h 14644 * 14645 * Lookups during the IPIF_CHANGING/ILL_CHANGING interval. 14646 * 14647 * Threads trying to lookup an ipif or ill can pass a flag requesting 14648 * wait and restart if the ipif / ill cannot be looked up currently. 14649 * For eg. bind, and route operations (Eg. 
route add / delete) cannot return 14650 * failure if the ipif is currently undergoing an exclusive operation, and 14651 * hence pass the flag. The mblk is then enqueued in the ipsq and the operation 14652 * is restarted by ipsq_exit() when the current exclusive operation completes. 14653 * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The 14654 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't 14655 * change while the ill_lock is held. Before dropping the ill_lock we acquire 14656 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish 14657 * until we release the ipsq_lock, even though the ill/ipif state flags 14658 * can change after we drop the ill_lock. 14659 * 14660 * An attempt to send out a packet using an ipif that is currently 14661 * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this 14662 * operation and restart it later when the exclusive condition on the ipif ends. 14663 * This is an example of not passing the wait flag to the lookup functions. For 14664 * example an attempt to refhold and use conn->conn_multicast_ipif and send 14665 * out a multicast packet on that ipif will fail while the ipif is 14666 * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is 14667 * currently IPIF_CHANGING will also fail.
14668 */ 14669 int 14670 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 14671 { 14672 ill_t *ill = ipif->ipif_ill; 14673 conn_t *connp; 14674 boolean_t success; 14675 boolean_t ipif_was_up = B_FALSE; 14676 ip_stack_t *ipst = ill->ill_ipst; 14677 14678 ASSERT(IAM_WRITER_IPIF(ipif)); 14679 14680 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 14681 14682 if (ipif->ipif_flags & IPIF_UP) { 14683 mutex_enter(&ill->ill_lock); 14684 ipif->ipif_flags &= ~IPIF_UP; 14685 ASSERT(ill->ill_ipif_up_count > 0); 14686 --ill->ill_ipif_up_count; 14687 mutex_exit(&ill->ill_lock); 14688 ipif_was_up = B_TRUE; 14689 /* Update status in SCTP's list */ 14690 sctp_update_ipif(ipif, SCTP_IPIF_DOWN); 14691 ill_nic_event_dispatch(ipif->ipif_ill, 14692 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0); 14693 } 14694 14695 /* 14696 * Blow away memberships we established in ipif_multicast_up(). 14697 */ 14698 ipif_multicast_down(ipif); 14699 14700 /* 14701 * Remove from the mapping for __sin6_src_id. We insert only 14702 * when the address is not INADDR_ANY. As IPv4 addresses are 14703 * stored as mapped addresses, we need to check for mapped 14704 * INADDR_ANY also. 14705 */ 14706 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && 14707 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) && 14708 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14709 int err; 14710 14711 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr, 14712 ipif->ipif_zoneid, ipst); 14713 if (err != 0) { 14714 ip0dbg(("ipif_down: srcid_remove %d\n", err)); 14715 } 14716 } 14717 14718 /* 14719 * Delete all IRE's pointing at this ipif or its source address. 
14720 */ 14721 if (ipif->ipif_isv6) { 14722 ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 14723 ipst); 14724 } else { 14725 ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, 14726 ipst); 14727 } 14728 14729 if (ipif_was_up && ill->ill_ipif_up_count == 0) { 14730 /* 14731 * Since the interface is now down, it may have just become 14732 * inactive. Note that this needs to be done even for a 14733 * lll_logical_down(), or ARP entries will not get correctly 14734 * restored when the interface comes back up. 14735 */ 14736 if (IS_UNDER_IPMP(ill)) 14737 ipmp_ill_refresh_active(ill); 14738 } 14739 14740 /* 14741 * Cleaning up the conn_ire_cache or conns must be done only after the 14742 * ires have been deleted above. Otherwise a thread could end up 14743 * caching an ire in a conn after we have finished the cleanup of the 14744 * conn. The caching is done after making sure that the ire is not yet 14745 * condemned. Also documented in the block comment above ip_output 14746 */ 14747 ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); 14748 /* Also, delete the ires cached in SCTP */ 14749 sctp_ire_cache_flush(ipif); 14750 14751 /* 14752 * Update any other ipifs which have used "our" local address as 14753 * a source address. This entails removing and recreating IRE_INTERFACE 14754 * entries for such ipifs. 14755 */ 14756 if (ipif->ipif_isv6) 14757 ipif_update_other_ipifs_v6(ipif); 14758 else 14759 ipif_update_other_ipifs(ipif); 14760 14761 /* 14762 * neighbor-discovery or arp entries for this interface. 14763 */ 14764 ipif_ndp_down(ipif); 14765 14766 /* 14767 * If mp is NULL the caller will wait for the appropriate refcnt. 14768 * Eg. 
ip_sioctl_removeif -> ipif_free -> ipif_down 14769 * and ill_delete -> ipif_free -> ipif_down 14770 */ 14771 if (mp == NULL) { 14772 ASSERT(q == NULL); 14773 return (0); 14774 } 14775 14776 if (CONN_Q(q)) { 14777 connp = Q_TO_CONN(q); 14778 mutex_enter(&connp->conn_lock); 14779 } else { 14780 connp = NULL; 14781 } 14782 mutex_enter(&ill->ill_lock); 14783 /* 14784 * Are there any ire's pointing to this ipif that are still active ? 14785 * If this is the last ipif going down, are there any ire's pointing 14786 * to this ill that are still active ? 14787 */ 14788 if (ipif_is_quiescent(ipif)) { 14789 mutex_exit(&ill->ill_lock); 14790 if (connp != NULL) 14791 mutex_exit(&connp->conn_lock); 14792 return (0); 14793 } 14794 14795 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p", 14796 ill->ill_name, (void *)ill)); 14797 /* 14798 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount 14799 * drops down, the operation will be restarted by ipif_ill_refrele_tail 14800 * which in turn is called by the last refrele on the ipif/ill/ire. 14801 */ 14802 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN); 14803 if (!success) { 14804 /* The conn is closing. So just return */ 14805 ASSERT(connp != NULL); 14806 mutex_exit(&ill->ill_lock); 14807 mutex_exit(&connp->conn_lock); 14808 return (EINTR); 14809 } 14810 14811 mutex_exit(&ill->ill_lock); 14812 if (connp != NULL) 14813 mutex_exit(&connp->conn_lock); 14814 return (EINPROGRESS); 14815 } 14816 14817 void 14818 ipif_down_tail(ipif_t *ipif) 14819 { 14820 ill_t *ill = ipif->ipif_ill; 14821 14822 /* 14823 * Skip any loopback interface (null wq). 14824 * If this is the last logical interface on the ill 14825 * have ill_dl_down tell the driver we are gone (unbind) 14826 * Note that lun 0 can ipif_down even though 14827 * there are other logical units that are up. 14828 * This occurs e.g. when we change a "significant" IFF_ flag. 
14829 */ 14830 if (ill->ill_wq != NULL && !ill->ill_logical_down && 14831 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 14832 ill->ill_dl_up) { 14833 ill_dl_down(ill); 14834 } 14835 ill->ill_logical_down = 0; 14836 14837 /* 14838 * Has to be after removing the routes in ipif_down_delete_ire. 14839 */ 14840 ipif_resolver_down(ipif); 14841 14842 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 14843 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); 14844 } 14845 14846 /* 14847 * Bring interface logically down without bringing the physical interface 14848 * down e.g. when the netmask is changed. This avoids long lasting link 14849 * negotiations between an ethernet interface and a certain switches. 14850 */ 14851 static int 14852 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) 14853 { 14854 /* 14855 * The ill_logical_down flag is a transient flag. It is set here 14856 * and is cleared once the down has completed in ipif_down_tail. 14857 * This flag does not indicate whether the ill stream is in the 14858 * DL_BOUND state with the driver. Instead this flag is used by 14859 * ipif_down_tail to determine whether to DL_UNBIND the stream with 14860 * the driver. The state of the ill stream i.e. whether it is 14861 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag. 14862 */ 14863 ipif->ipif_ill->ill_logical_down = 1; 14864 return (ipif_down(ipif, q, mp)); 14865 } 14866 14867 /* 14868 * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. 14869 * If the usesrc client ILL is already part of a usesrc group or not, 14870 * in either case a ire_stq with the matching usesrc client ILL will 14871 * locate the IRE's that need to be deleted. We want IREs to be created 14872 * with the new source address. 
14873 */ 14874 static void 14875 ipif_delete_cache_ire(ire_t *ire, char *ill_arg) 14876 { 14877 ill_t *ucill = (ill_t *)ill_arg; 14878 14879 ASSERT(IAM_WRITER_ILL(ucill)); 14880 14881 if (ire->ire_stq == NULL) 14882 return; 14883 14884 if ((ire->ire_type == IRE_CACHE) && 14885 ((ill_t *)ire->ire_stq->q_ptr == ucill)) 14886 ire_delete(ire); 14887 } 14888 14889 /* 14890 * ire_walk routine to delete every IRE dependent on the interface 14891 * address that is going down. (Always called as writer.) 14892 * Works for both v4 and v6. 14893 * In addition for checking for ire_ipif matches it also checks for 14894 * IRE_CACHE entries which have the same source address as the 14895 * disappearing ipif since ipif_select_source might have picked 14896 * that source. Note that ipif_down/ipif_update_other_ipifs takes 14897 * care of any IRE_INTERFACE with the disappearing source address. 14898 */ 14899 static void 14900 ipif_down_delete_ire(ire_t *ire, char *ipif_arg) 14901 { 14902 ipif_t *ipif = (ipif_t *)ipif_arg; 14903 14904 ASSERT(IAM_WRITER_IPIF(ipif)); 14905 if (ire->ire_ipif == NULL) 14906 return; 14907 14908 if (ire->ire_ipif != ipif) { 14909 /* 14910 * Look for a matching source address. 14911 */ 14912 if (ire->ire_type != IRE_CACHE) 14913 return; 14914 if (ipif->ipif_flags & IPIF_NOLOCAL) 14915 return; 14916 14917 if (ire->ire_ipversion == IPV4_VERSION) { 14918 if (ire->ire_src_addr != ipif->ipif_src_addr) 14919 return; 14920 } else { 14921 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, 14922 &ipif->ipif_v6lcl_addr)) 14923 return; 14924 } 14925 ire_delete(ire); 14926 return; 14927 } 14928 /* 14929 * ire_delete() will do an ire_flush_cache which will delete 14930 * all ire_ipif matches 14931 */ 14932 ire_delete(ire); 14933 } 14934 14935 /* 14936 * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when 14937 * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or 14938 * 2) when an interface is brought up or down (on that ill). 
14939 * This ensures that the IRE_CACHE entries don't retain stale source 14940 * address selection results. 14941 */ 14942 void 14943 ill_ipif_cache_delete(ire_t *ire, char *ill_arg) 14944 { 14945 ill_t *ill = (ill_t *)ill_arg; 14946 14947 ASSERT(IAM_WRITER_ILL(ill)); 14948 ASSERT(ire->ire_type == IRE_CACHE); 14949 14950 /* 14951 * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches 14952 * ill, but we only want to delete the IRE if ire_ipif matches. 14953 */ 14954 ASSERT(ire->ire_ipif != NULL); 14955 if (ill == ire->ire_ipif->ipif_ill) 14956 ire_delete(ire); 14957 } 14958 14959 /* 14960 * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this 14961 * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references 14962 * the IPMP ill. 14963 */ 14964 void 14965 ill_stq_cache_delete(ire_t *ire, char *ill_arg) 14966 { 14967 ill_t *ill = (ill_t *)ill_arg; 14968 14969 ASSERT(IAM_WRITER_ILL(ill)); 14970 ASSERT(ire->ire_type == IRE_CACHE); 14971 14972 /* 14973 * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches 14974 * ill, but we only want to delete the IRE if ire_stq matches. 14975 */ 14976 if (ire->ire_stq->q_ptr == ill_arg) 14977 ire_delete(ire); 14978 } 14979 14980 /* 14981 * Delete all the IREs whose ire_stq's reference any ill in the same IPMP 14982 * group as `ill_arg'. Used by ipmp_ill_deactivate() to flush all IRE_CACHE 14983 * entries for the illgrp. 14984 */ 14985 void 14986 ill_grp_cache_delete(ire_t *ire, char *ill_arg) 14987 { 14988 ill_t *ill = (ill_t *)ill_arg; 14989 14990 ASSERT(IAM_WRITER_ILL(ill)); 14991 14992 if (ire->ire_type == IRE_CACHE && 14993 IS_IN_SAME_ILLGRP((ill_t *)ire->ire_stq->q_ptr, ill)) { 14994 ire_delete(ire); 14995 } 14996 } 14997 14998 /* 14999 * Delete all broadcast IREs with a source address on `ill_arg'. 
15000 */ 15001 static void 15002 ill_broadcast_delete(ire_t *ire, char *ill_arg) 15003 { 15004 ill_t *ill = (ill_t *)ill_arg; 15005 15006 ASSERT(IAM_WRITER_ILL(ill)); 15007 ASSERT(ire->ire_type == IRE_BROADCAST); 15008 15009 if (ire->ire_ipif->ipif_ill == ill) 15010 ire_delete(ire); 15011 } 15012 15013 /* 15014 * Initiate deallocate of an IPIF. Always called as writer. Called by 15015 * ill_delete or ip_sioctl_removeif. 15016 */ 15017 static void 15018 ipif_free(ipif_t *ipif) 15019 { 15020 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15021 15022 ASSERT(IAM_WRITER_IPIF(ipif)); 15023 15024 if (ipif->ipif_recovery_id != 0) 15025 (void) untimeout(ipif->ipif_recovery_id); 15026 ipif->ipif_recovery_id = 0; 15027 15028 /* Remove conn references */ 15029 reset_conn_ipif(ipif); 15030 15031 /* 15032 * Make sure we have valid net and subnet broadcast ire's for the 15033 * other ipif's which share them with this ipif. 15034 */ 15035 if (!ipif->ipif_isv6) 15036 ipif_check_bcast_ires(ipif); 15037 15038 /* 15039 * Take down the interface. We can be called either from ill_delete 15040 * or from ip_sioctl_removeif. 15041 */ 15042 (void) ipif_down(ipif, NULL, NULL); 15043 15044 /* 15045 * Now that the interface is down, there's no chance it can still 15046 * become a duplicate. Cancel any timer that may have been set while 15047 * tearing down. 15048 */ 15049 if (ipif->ipif_recovery_id != 0) 15050 (void) untimeout(ipif->ipif_recovery_id); 15051 ipif->ipif_recovery_id = 0; 15052 15053 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15054 /* Remove pointers to this ill in the multicast routing tables */ 15055 reset_mrt_vif_ipif(ipif); 15056 /* If necessary, clear the cached source ipif rotor. 
*/ 15057 if (ipif->ipif_ill->ill_src_ipif == ipif) 15058 ipif->ipif_ill->ill_src_ipif = NULL; 15059 rw_exit(&ipst->ips_ill_g_lock); 15060 } 15061 15062 static void 15063 ipif_free_tail(ipif_t *ipif) 15064 { 15065 mblk_t *mp; 15066 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15067 15068 /* 15069 * Free state for addition IRE_IF_[NO]RESOLVER ire's. 15070 */ 15071 mutex_enter(&ipif->ipif_saved_ire_lock); 15072 mp = ipif->ipif_saved_ire_mp; 15073 ipif->ipif_saved_ire_mp = NULL; 15074 mutex_exit(&ipif->ipif_saved_ire_lock); 15075 freemsg(mp); 15076 15077 /* 15078 * Need to hold both ill_g_lock and ill_lock while 15079 * inserting or removing an ipif from the linked list 15080 * of ipifs hanging off the ill. 15081 */ 15082 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15083 15084 ASSERT(ilm_walk_ipif(ipif) == 0); 15085 15086 #ifdef DEBUG 15087 ipif_trace_cleanup(ipif); 15088 #endif 15089 15090 /* Ask SCTP to take it out of it list */ 15091 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE); 15092 15093 /* Get it out of the ILL interface list. */ 15094 ipif_remove(ipif); 15095 rw_exit(&ipst->ips_ill_g_lock); 15096 15097 mutex_destroy(&ipif->ipif_saved_ire_lock); 15098 15099 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); 15100 ASSERT(ipif->ipif_recovery_id == 0); 15101 15102 /* Free the memory. */ 15103 mi_free(ipif); 15104 } 15105 15106 /* 15107 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id" 15108 * is zero. 
15109 */ 15110 void 15111 ipif_get_name(const ipif_t *ipif, char *buf, int len) 15112 { 15113 char lbuf[LIFNAMSIZ]; 15114 char *name; 15115 size_t name_len; 15116 15117 buf[0] = '\0'; 15118 name = ipif->ipif_ill->ill_name; 15119 name_len = ipif->ipif_ill->ill_name_length; 15120 if (ipif->ipif_id != 0) { 15121 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR, 15122 ipif->ipif_id); 15123 name = lbuf; 15124 name_len = mi_strlen(name) + 1; 15125 } 15126 len -= 1; 15127 buf[len] = '\0'; 15128 len = MIN(len, name_len); 15129 bcopy(name, buf, len); 15130 } 15131 15132 /* 15133 * Find an IPIF based on the name passed in. Names can be of the 15134 * form <phys> (e.g., le0), <phys>:<#> (e.g., le0:1), 15135 * The <phys> string can have forms like <dev><#> (e.g., le0), 15136 * <dev><#>.<module> (e.g. le0.foo), or <dev>.<module><#> (e.g. ip.tun3). 15137 * When there is no colon, the implied unit id is zero. <phys> must 15138 * correspond to the name of an ILL. (May be called as writer.) 15139 */ 15140 static ipif_t * 15141 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, 15142 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, 15143 mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) 15144 { 15145 char *cp; 15146 char *endp; 15147 long id; 15148 ill_t *ill; 15149 ipif_t *ipif; 15150 uint_t ire_type; 15151 boolean_t did_alloc = B_FALSE; 15152 ipsq_t *ipsq; 15153 15154 if (error != NULL) 15155 *error = 0; 15156 15157 /* 15158 * If the caller wants to us to create the ipif, make sure we have a 15159 * valid zoneid 15160 */ 15161 ASSERT(!do_alloc || zoneid != ALL_ZONES); 15162 15163 if (namelen == 0) { 15164 if (error != NULL) 15165 *error = ENXIO; 15166 return (NULL); 15167 } 15168 15169 *exists = B_FALSE; 15170 /* Look for a colon in the name. 
*/ 15171 endp = &name[namelen]; 15172 for (cp = endp; --cp > name; ) { 15173 if (*cp == IPIF_SEPARATOR_CHAR) 15174 break; 15175 } 15176 15177 if (*cp == IPIF_SEPARATOR_CHAR) { 15178 /* 15179 * Reject any non-decimal aliases for logical 15180 * interfaces. Aliases with leading zeroes 15181 * are also rejected as they introduce ambiguity 15182 * in the naming of the interfaces. 15183 * In order to confirm with existing semantics, 15184 * and to not break any programs/script relying 15185 * on that behaviour, if<0>:0 is considered to be 15186 * a valid interface. 15187 * 15188 * If alias has two or more digits and the first 15189 * is zero, fail. 15190 */ 15191 if (&cp[2] < endp && cp[1] == '0') { 15192 if (error != NULL) 15193 *error = EINVAL; 15194 return (NULL); 15195 } 15196 } 15197 15198 if (cp <= name) { 15199 cp = endp; 15200 } else { 15201 *cp = '\0'; 15202 } 15203 15204 /* 15205 * Look up the ILL, based on the portion of the name 15206 * before the slash. ill_lookup_on_name returns a held ill. 15207 * Temporary to check whether ill exists already. If so 15208 * ill_lookup_on_name will clear it. 15209 */ 15210 ill = ill_lookup_on_name(name, do_alloc, isv6, 15211 q, mp, func, error, &did_alloc, ipst); 15212 if (cp != endp) 15213 *cp = IPIF_SEPARATOR_CHAR; 15214 if (ill == NULL) 15215 return (NULL); 15216 15217 /* Establish the unit number in the name. */ 15218 id = 0; 15219 if (cp < endp && *endp == '\0') { 15220 /* If there was a colon, the unit number follows. */ 15221 cp++; 15222 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 15223 ill_refrele(ill); 15224 if (error != NULL) 15225 *error = ENXIO; 15226 return (NULL); 15227 } 15228 } 15229 15230 GRAB_CONN_LOCK(q); 15231 mutex_enter(&ill->ill_lock); 15232 /* Now see if there is an IPIF with this unit number. 
*/ 15233 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15234 if (ipif->ipif_id == id) { 15235 if (zoneid != ALL_ZONES && 15236 zoneid != ipif->ipif_zoneid && 15237 ipif->ipif_zoneid != ALL_ZONES) { 15238 mutex_exit(&ill->ill_lock); 15239 RELEASE_CONN_LOCK(q); 15240 ill_refrele(ill); 15241 if (error != NULL) 15242 *error = ENXIO; 15243 return (NULL); 15244 } 15245 /* 15246 * The block comment at the start of ipif_down 15247 * explains the use of the macros used below 15248 */ 15249 if (IPIF_CAN_LOOKUP(ipif)) { 15250 ipif_refhold_locked(ipif); 15251 mutex_exit(&ill->ill_lock); 15252 if (!did_alloc) 15253 *exists = B_TRUE; 15254 /* 15255 * Drop locks before calling ill_refrele 15256 * since it can potentially call into 15257 * ipif_ill_refrele_tail which can end up 15258 * in trying to acquire any lock. 15259 */ 15260 RELEASE_CONN_LOCK(q); 15261 ill_refrele(ill); 15262 return (ipif); 15263 } else if (IPIF_CAN_WAIT(ipif, q)) { 15264 ipsq = ill->ill_phyint->phyint_ipsq; 15265 mutex_enter(&ipsq->ipsq_lock); 15266 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 15267 mutex_exit(&ill->ill_lock); 15268 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 15269 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 15270 mutex_exit(&ipsq->ipsq_lock); 15271 RELEASE_CONN_LOCK(q); 15272 ill_refrele(ill); 15273 if (error != NULL) 15274 *error = EINPROGRESS; 15275 return (NULL); 15276 } 15277 } 15278 } 15279 RELEASE_CONN_LOCK(q); 15280 15281 if (!do_alloc) { 15282 mutex_exit(&ill->ill_lock); 15283 ill_refrele(ill); 15284 if (error != NULL) 15285 *error = ENXIO; 15286 return (NULL); 15287 } 15288 15289 /* 15290 * If none found, atomically allocate and return a new one. 15291 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL 15292 * to support "receive only" use of lo0:1 etc. as is still done 15293 * below as an initial guess. 
15294 * However, this is now likely to be overriden later in ipif_up_done() 15295 * when we know for sure what address has been configured on the 15296 * interface, since we might have more than one loopback interface 15297 * with a loopback address, e.g. in the case of zones, and all the 15298 * interfaces with loopback addresses need to be marked IRE_LOOPBACK. 15299 */ 15300 if (ill->ill_net_type == IRE_LOOPBACK && id == 0) 15301 ire_type = IRE_LOOPBACK; 15302 else 15303 ire_type = IRE_LOCAL; 15304 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); 15305 if (ipif != NULL) 15306 ipif_refhold_locked(ipif); 15307 else if (error != NULL) 15308 *error = ENOMEM; 15309 mutex_exit(&ill->ill_lock); 15310 ill_refrele(ill); 15311 return (ipif); 15312 } 15313 15314 /* 15315 * This routine is called whenever a new address comes up on an ipif. If 15316 * we are configured to respond to address mask requests, then we are supposed 15317 * to broadcast an address mask reply at this time. This routine is also 15318 * called if we are already up, but a netmask change is made. This is legal 15319 * but might not make the system manager very popular. (May be called 15320 * as writer.) 
15321 */ 15322 void 15323 ipif_mask_reply(ipif_t *ipif) 15324 { 15325 icmph_t *icmph; 15326 ipha_t *ipha; 15327 mblk_t *mp; 15328 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15329 15330 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 15331 15332 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 15333 return; 15334 15335 /* ICMP mask reply is IPv4 only */ 15336 ASSERT(!ipif->ipif_isv6); 15337 /* ICMP mask reply is not for a loopback interface */ 15338 ASSERT(ipif->ipif_ill->ill_wq != NULL); 15339 15340 mp = allocb(REPLY_LEN, BPRI_HI); 15341 if (mp == NULL) 15342 return; 15343 mp->b_wptr = mp->b_rptr + REPLY_LEN; 15344 15345 ipha = (ipha_t *)mp->b_rptr; 15346 bzero(ipha, REPLY_LEN); 15347 *ipha = icmp_ipha; 15348 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 15349 ipha->ipha_src = ipif->ipif_src_addr; 15350 ipha->ipha_dst = ipif->ipif_brd_addr; 15351 ipha->ipha_length = htons(REPLY_LEN); 15352 ipha->ipha_ident = 0; 15353 15354 icmph = (icmph_t *)&ipha[1]; 15355 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 15356 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 15357 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 15358 15359 put(ipif->ipif_wq, mp); 15360 15361 #undef REPLY_LEN 15362 } 15363 15364 /* 15365 * When the mtu in the ipif changes, we call this routine through ire_walk 15366 * to update all the relevant IREs. 15367 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 
15368 */ 15369 static void 15370 ipif_mtu_change(ire_t *ire, char *ipif_arg) 15371 { 15372 ipif_t *ipif = (ipif_t *)ipif_arg; 15373 15374 if (ire->ire_stq == NULL || ire->ire_ipif != ipif) 15375 return; 15376 15377 mutex_enter(&ire->ire_lock); 15378 if (ire->ire_marks & IRE_MARK_PMTU) { 15379 /* Avoid increasing the PMTU */ 15380 ire->ire_max_frag = MIN(ipif->ipif_mtu, ire->ire_max_frag); 15381 if (ire->ire_max_frag == ipif->ipif_mtu) 15382 ire->ire_marks &= ~IRE_MARK_PMTU; 15383 } else { 15384 ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); 15385 } 15386 mutex_exit(&ire->ire_lock); 15387 } 15388 15389 /* 15390 * When the mtu in the ill changes, we call this routine through ire_walk 15391 * to update all the relevant IREs. 15392 * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 15393 */ 15394 void 15395 ill_mtu_change(ire_t *ire, char *ill_arg) 15396 { 15397 ill_t *ill = (ill_t *)ill_arg; 15398 15399 if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) 15400 return; 15401 15402 mutex_enter(&ire->ire_lock); 15403 if (ire->ire_marks & IRE_MARK_PMTU) { 15404 /* Avoid increasing the PMTU */ 15405 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, 15406 ire->ire_max_frag); 15407 if (ire->ire_max_frag == ire->ire_ipif->ipif_mtu) { 15408 ire->ire_marks &= ~IRE_MARK_PMTU; 15409 } 15410 } else { 15411 ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, IP_MAXPACKET); 15412 } 15413 mutex_exit(&ire->ire_lock); 15414 } 15415 15416 /* 15417 * Join the ipif specific multicast groups. 15418 * Must be called after a mapping has been set up in the resolver. (Always 15419 * called as writer.) 
15420 */ 15421 void 15422 ipif_multicast_up(ipif_t *ipif) 15423 { 15424 int err; 15425 ill_t *ill; 15426 15427 ASSERT(IAM_WRITER_IPIF(ipif)); 15428 15429 ill = ipif->ipif_ill; 15430 15431 ip1dbg(("ipif_multicast_up\n")); 15432 if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) 15433 return; 15434 15435 if (ipif->ipif_isv6) { 15436 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 15437 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 15438 15439 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 15440 15441 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 15442 return; 15443 15444 ip1dbg(("ipif_multicast_up - addmulti\n")); 15445 15446 /* 15447 * Join the all hosts multicast address. We skip this for 15448 * underlying IPMP interfaces since they should be invisible. 15449 */ 15450 if (!IS_UNDER_IPMP(ill)) { 15451 err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, 15452 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15453 if (err != 0) { 15454 ip0dbg(("ipif_multicast_up: " 15455 "all_hosts_mcast failed %d\n", err)); 15456 return; 15457 } 15458 ipif->ipif_joined_allhosts = 1; 15459 } 15460 15461 /* 15462 * Enable multicast for the solicited node multicast address 15463 */ 15464 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 15465 err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, 15466 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15467 if (err != 0) { 15468 ip0dbg(("ipif_multicast_up: solicited MC" 15469 " failed %d\n", err)); 15470 if (ipif->ipif_joined_allhosts) { 15471 (void) ip_delmulti_v6(&v6allmc, ill, 15472 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15473 ipif->ipif_joined_allhosts = 0; 15474 } 15475 return; 15476 } 15477 } 15478 } else { 15479 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 15480 return; 15481 15482 /* Join the all hosts multicast address */ 15483 ip1dbg(("ipif_multicast_up - addmulti\n")); 15484 err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, 15485 ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); 15486 if (err) { 15487 
ip0dbg(("ipif_multicast_up: failed %d\n", err)); 15488 return; 15489 } 15490 } 15491 ipif->ipif_multicast_up = 1; 15492 } 15493 15494 /* 15495 * Blow away any multicast groups that we joined in ipif_multicast_up(). 15496 * (Explicit memberships are blown away in ill_leave_multicast() when the 15497 * ill is brought down.) 15498 */ 15499 void 15500 ipif_multicast_down(ipif_t *ipif) 15501 { 15502 int err; 15503 15504 ASSERT(IAM_WRITER_IPIF(ipif)); 15505 15506 ip1dbg(("ipif_multicast_down\n")); 15507 if (!ipif->ipif_multicast_up) 15508 return; 15509 15510 ip1dbg(("ipif_multicast_down - delmulti\n")); 15511 15512 if (!ipif->ipif_isv6) { 15513 err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE, 15514 B_TRUE); 15515 if (err != 0) 15516 ip0dbg(("ipif_multicast_down: failed %d\n", err)); 15517 15518 ipif->ipif_multicast_up = 0; 15519 return; 15520 } 15521 15522 /* 15523 * Leave the all-hosts multicast address. 15524 */ 15525 if (ipif->ipif_joined_allhosts) { 15526 err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, 15527 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15528 if (err != 0) { 15529 ip0dbg(("ipif_multicast_down: all_hosts_mcast " 15530 "failed %d\n", err)); 15531 } 15532 ipif->ipif_joined_allhosts = 0; 15533 } 15534 15535 /* 15536 * Disable multicast for the solicited node multicast address 15537 */ 15538 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 15539 in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; 15540 15541 ipv6_multi.s6_addr32[3] |= 15542 ipif->ipif_v6lcl_addr.s6_addr32[3]; 15543 15544 err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, 15545 ipif->ipif_zoneid, B_TRUE, B_TRUE); 15546 if (err != 0) { 15547 ip0dbg(("ipif_multicast_down: sol MC failed %d\n", 15548 err)); 15549 } 15550 } 15551 15552 ipif->ipif_multicast_up = 0; 15553 } 15554 15555 /* 15556 * Used when an interface comes up to recreate any extra routes on this 15557 * interface. 
15558 */ 15559 static ire_t ** 15560 ipif_recover_ire(ipif_t *ipif) 15561 { 15562 mblk_t *mp; 15563 ire_t **ipif_saved_irep; 15564 ire_t **irep; 15565 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15566 15567 ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, 15568 ipif->ipif_id)); 15569 15570 mutex_enter(&ipif->ipif_saved_ire_lock); 15571 ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * 15572 ipif->ipif_saved_ire_cnt, KM_NOSLEEP); 15573 if (ipif_saved_irep == NULL) { 15574 mutex_exit(&ipif->ipif_saved_ire_lock); 15575 return (NULL); 15576 } 15577 15578 irep = ipif_saved_irep; 15579 for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 15580 ire_t *ire; 15581 queue_t *rfq; 15582 queue_t *stq; 15583 ifrt_t *ifrt; 15584 uchar_t *src_addr; 15585 uchar_t *gateway_addr; 15586 ushort_t type; 15587 15588 /* 15589 * When the ire was initially created and then added in 15590 * ip_rt_add(), it was created either using ipif->ipif_net_type 15591 * in the case of a traditional interface route, or as one of 15592 * the IRE_OFFSUBNET types (with the exception of 15593 * IRE_HOST types ire which is created by icmp_redirect() and 15594 * which we don't need to save or recover). In the case where 15595 * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update 15596 * the ire_type to IRE_IF_NORESOLVER before calling ire_add() 15597 * to satisfy software like GateD and Sun Cluster which creates 15598 * routes using the the loopback interface's address as a 15599 * gateway. 15600 * 15601 * As ifrt->ifrt_type reflects the already updated ire_type, 15602 * ire_create() will be called in the same way here as 15603 * in ip_rt_add(), namely using ipif->ipif_net_type when 15604 * the route looks like a traditional interface route (where 15605 * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using 15606 * the saved ifrt->ifrt_type. 
This means that in the case where 15607 * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by 15608 * ire_create() will be an IRE_LOOPBACK, it will then be turned 15609 * into an IRE_IF_NORESOLVER and then added by ire_add(). 15610 */ 15611 ifrt = (ifrt_t *)mp->b_rptr; 15612 ASSERT(ifrt->ifrt_type != IRE_CACHE); 15613 if (ifrt->ifrt_type & IRE_INTERFACE) { 15614 rfq = NULL; 15615 stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) 15616 ? ipif->ipif_rq : ipif->ipif_wq; 15617 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15618 ? (uint8_t *)&ifrt->ifrt_src_addr 15619 : (uint8_t *)&ipif->ipif_src_addr; 15620 gateway_addr = NULL; 15621 type = ipif->ipif_net_type; 15622 } else if (ifrt->ifrt_type & IRE_BROADCAST) { 15623 /* Recover multiroute broadcast IRE. */ 15624 rfq = ipif->ipif_rq; 15625 stq = ipif->ipif_wq; 15626 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15627 ? (uint8_t *)&ifrt->ifrt_src_addr 15628 : (uint8_t *)&ipif->ipif_src_addr; 15629 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 15630 type = ifrt->ifrt_type; 15631 } else { 15632 rfq = NULL; 15633 stq = NULL; 15634 src_addr = (ifrt->ifrt_flags & RTF_SETSRC) 15635 ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; 15636 gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; 15637 type = ifrt->ifrt_type; 15638 } 15639 15640 /* 15641 * Create a copy of the IRE with the saved address and netmask. 
15642 */ 15643 ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " 15644 "0x%x/0x%x\n", 15645 ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, 15646 ntohl(ifrt->ifrt_addr), 15647 ntohl(ifrt->ifrt_mask))); 15648 ire = ire_create( 15649 (uint8_t *)&ifrt->ifrt_addr, 15650 (uint8_t *)&ifrt->ifrt_mask, 15651 src_addr, 15652 gateway_addr, 15653 &ifrt->ifrt_max_frag, 15654 NULL, 15655 rfq, 15656 stq, 15657 type, 15658 ipif, 15659 0, 15660 0, 15661 0, 15662 ifrt->ifrt_flags, 15663 &ifrt->ifrt_iulp_info, 15664 NULL, 15665 NULL, 15666 ipst); 15667 15668 if (ire == NULL) { 15669 mutex_exit(&ipif->ipif_saved_ire_lock); 15670 kmem_free(ipif_saved_irep, 15671 ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); 15672 return (NULL); 15673 } 15674 15675 /* 15676 * Some software (for example, GateD and Sun Cluster) attempts 15677 * to create (what amount to) IRE_PREFIX routes with the 15678 * loopback address as the gateway. This is primarily done to 15679 * set up prefixes with the RTF_REJECT flag set (for example, 15680 * when generating aggregate routes.) 15681 * 15682 * If the IRE type (as defined by ipif->ipif_net_type) is 15683 * IRE_LOOPBACK, then we map the request into a 15684 * IRE_IF_NORESOLVER. 15685 */ 15686 if (ipif->ipif_net_type == IRE_LOOPBACK) 15687 ire->ire_type = IRE_IF_NORESOLVER; 15688 /* 15689 * ire held by ire_add, will be refreled' towards the 15690 * the end of ipif_up_done 15691 */ 15692 (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); 15693 *irep = ire; 15694 irep++; 15695 ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); 15696 } 15697 mutex_exit(&ipif->ipif_saved_ire_lock); 15698 return (ipif_saved_irep); 15699 } 15700 15701 /* 15702 * Used to set the netmask and broadcast address to default values when the 15703 * interface is brought up. (Always called as writer.) 
15704 */ 15705 static void 15706 ipif_set_default(ipif_t *ipif) 15707 { 15708 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 15709 15710 if (!ipif->ipif_isv6) { 15711 /* 15712 * Interface holds an IPv4 address. Default 15713 * mask is the natural netmask. 15714 */ 15715 if (!ipif->ipif_net_mask) { 15716 ipaddr_t v4mask; 15717 15718 v4mask = ip_net_mask(ipif->ipif_lcl_addr); 15719 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask); 15720 } 15721 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 15722 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 15723 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 15724 } else { 15725 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 15726 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 15727 } 15728 /* 15729 * NOTE: SunOS 4.X does this even if the broadcast address 15730 * has been already set thus we do the same here. 15731 */ 15732 if (ipif->ipif_flags & IPIF_BROADCAST) { 15733 ipaddr_t v4addr; 15734 15735 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask; 15736 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr); 15737 } 15738 } else { 15739 /* 15740 * Interface holds an IPv6-only address. Default 15741 * mask is all-ones. 15742 */ 15743 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 15744 ipif->ipif_v6net_mask = ipv6_all_ones; 15745 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 15746 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 15747 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 15748 } else { 15749 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 15750 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 15751 } 15752 } 15753 } 15754 15755 /* 15756 * Return 0 if this address can be used as local address without causing 15757 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 15758 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 15759 * Note that the same IPv6 link-local address is allowed as long as the ills 15760 * are not on the same link. 
15761 */ 15762 int 15763 ip_addr_availability_check(ipif_t *new_ipif) 15764 { 15765 in6_addr_t our_v6addr; 15766 ill_t *ill; 15767 ipif_t *ipif; 15768 ill_walk_context_t ctx; 15769 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 15770 15771 ASSERT(IAM_WRITER_IPIF(new_ipif)); 15772 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 15773 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 15774 15775 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 15776 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 15777 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 15778 return (0); 15779 15780 our_v6addr = new_ipif->ipif_v6lcl_addr; 15781 15782 if (new_ipif->ipif_isv6) 15783 ill = ILL_START_WALK_V6(&ctx, ipst); 15784 else 15785 ill = ILL_START_WALK_V4(&ctx, ipst); 15786 15787 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 15788 for (ipif = ill->ill_ipif; ipif != NULL; 15789 ipif = ipif->ipif_next) { 15790 if ((ipif == new_ipif) || 15791 !(ipif->ipif_flags & IPIF_UP) || 15792 (ipif->ipif_flags & IPIF_UNNUMBERED) || 15793 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 15794 &our_v6addr)) 15795 continue; 15796 15797 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 15798 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 15799 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 15800 ipif->ipif_flags |= IPIF_UNNUMBERED; 15801 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 15802 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 15803 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 15804 continue; 15805 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 15806 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 15807 continue; 15808 else if (new_ipif->ipif_ill == ill) 15809 return (EADDRINUSE); 15810 else 15811 return (EADDRNOTAVAIL); 15812 } 15813 } 15814 15815 return (0); 15816 } 15817 15818 /* 15819 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 15820 * IREs for the ipif. 
15821 * When the routine returns EINPROGRESS then mp has been consumed and 15822 * the ioctl will be acked from ip_rput_dlpi. 15823 */ 15824 int 15825 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 15826 { 15827 ill_t *ill = ipif->ipif_ill; 15828 boolean_t isv6 = ipif->ipif_isv6; 15829 int err = 0; 15830 boolean_t success; 15831 uint_t ipif_orig_id; 15832 ip_stack_t *ipst = ill->ill_ipst; 15833 15834 ASSERT(IAM_WRITER_IPIF(ipif)); 15835 15836 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 15837 15838 /* Shouldn't get here if it is already up. */ 15839 if (ipif->ipif_flags & IPIF_UP) 15840 return (EALREADY); 15841 15842 /* 15843 * If this is a request to bring up a data address on an interface 15844 * under IPMP, then move the address to its IPMP meta-interface and 15845 * try to bring it up. One complication is that the zeroth ipif for 15846 * an ill is special, in that every ill always has one, and that code 15847 * throughout IP deferences ill->ill_ipif without holding any locks. 15848 */ 15849 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) && 15850 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) { 15851 ipif_t *stubipif = NULL, *moveipif = NULL; 15852 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 15853 15854 /* 15855 * The ipif being brought up should be quiesced. If it's not, 15856 * something has gone amiss and we need to bail out. (If it's 15857 * quiesced, we know it will remain so via IPIF_CHANGING.) 15858 */ 15859 mutex_enter(&ill->ill_lock); 15860 if (!ipif_is_quiescent(ipif)) { 15861 mutex_exit(&ill->ill_lock); 15862 return (EINVAL); 15863 } 15864 mutex_exit(&ill->ill_lock); 15865 15866 /* 15867 * If we're going to need to allocate ipifs, do it prior 15868 * to starting the move (and grabbing locks). 
15869 */ 15870 if (ipif->ipif_id == 0) { 15871 moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 15872 B_FALSE); 15873 stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE, 15874 B_FALSE); 15875 if (moveipif == NULL || stubipif == NULL) { 15876 mi_free(moveipif); 15877 mi_free(stubipif); 15878 return (ENOMEM); 15879 } 15880 } 15881 15882 /* 15883 * Grab or transfer the ipif to move. During the move, keep 15884 * ill_g_lock held to prevent any ill walker threads from 15885 * seeing things in an inconsistent state. 15886 */ 15887 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15888 if (ipif->ipif_id != 0) { 15889 ipif_remove(ipif); 15890 } else { 15891 ipif_transfer(ipif, moveipif, stubipif); 15892 ipif = moveipif; 15893 } 15894 15895 /* 15896 * Place the ipif on the IPMP ill. If the zeroth ipif on 15897 * the IPMP ill is a stub (0.0.0.0 down address) then we 15898 * replace that one. Otherwise, pick the next available slot. 15899 */ 15900 ipif->ipif_ill = ipmp_ill; 15901 ipif_orig_id = ipif->ipif_id; 15902 15903 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) { 15904 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL); 15905 ipif = ipmp_ill->ill_ipif; 15906 } else { 15907 ipif->ipif_id = -1; 15908 if (ipif_insert(ipif, B_FALSE) != 0) { 15909 /* 15910 * No more available ipif_id's -- put it back 15911 * on the original ill and fail the operation. 15912 * Since we're writer on the ill, we can be 15913 * sure our old slot is still available. 15914 */ 15915 ipif->ipif_id = ipif_orig_id; 15916 ipif->ipif_ill = ill; 15917 if (ipif_orig_id == 0) { 15918 ipif_transfer(ipif, ill->ill_ipif, 15919 NULL); 15920 } else { 15921 VERIFY(ipif_insert(ipif, B_FALSE) == 0); 15922 } 15923 rw_exit(&ipst->ips_ill_g_lock); 15924 return (ENOMEM); 15925 } 15926 } 15927 rw_exit(&ipst->ips_ill_g_lock); 15928 15929 /* 15930 * Tell SCTP that the ipif has moved. Note that even if we 15931 * had to allocate a new ipif, the original sequence id was 15932 * preserved and therefore SCTP won't know. 
15933 */ 15934 sctp_move_ipif(ipif, ill, ipmp_ill); 15935 15936 /* 15937 * If the ipif being brought up was on slot zero, then we 15938 * first need to bring up the placeholder we stuck there. In 15939 * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call 15940 * to ipif_up() itself, if we successfully bring up the 15941 * placeholder, we'll check ill_move_ipif and bring it up too. 15942 */ 15943 if (ipif_orig_id == 0) { 15944 ASSERT(ill->ill_move_ipif == NULL); 15945 ill->ill_move_ipif = ipif; 15946 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0) 15947 ASSERT(ill->ill_move_ipif == NULL); 15948 if (err != EINPROGRESS) 15949 ill->ill_move_ipif = NULL; 15950 return (err); 15951 } 15952 15953 /* 15954 * Bring it up on the IPMP ill. 15955 */ 15956 return (ipif_up(ipif, q, mp)); 15957 } 15958 15959 /* Skip arp/ndp for any loopback interface. */ 15960 if (ill->ill_wq != NULL) { 15961 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 15962 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 15963 15964 if (!ill->ill_dl_up) { 15965 /* 15966 * ill_dl_up is not yet set. i.e. we are yet to 15967 * DL_BIND with the driver and this is the first 15968 * logical interface on the ill to become "up". 15969 * Tell the driver to get going (via DL_BIND_REQ). 15970 * Note that changing "significant" IFF_ flags 15971 * address/netmask etc cause a down/up dance, but 15972 * does not cause an unbind (DL_UNBIND) with the driver 15973 */ 15974 return (ill_dl_up(ill, ipif, mp, q)); 15975 } 15976 15977 /* 15978 * ipif_resolver_up may end up sending an 15979 * AR_INTERFACE_UP message to ARP, which would, in 15980 * turn send a DLPI message to the driver. ioctls are 15981 * serialized and so we cannot send more than one 15982 * interface up message at a time. If ipif_resolver_up 15983 * does send an interface up message to ARP, we get 15984 * EINPROGRESS and we will complete in ip_arp_done. 
15985 */ 15986 15987 ASSERT(connp != NULL || !CONN_Q(q)); 15988 if (connp != NULL) 15989 mutex_enter(&connp->conn_lock); 15990 mutex_enter(&ill->ill_lock); 15991 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 15992 mutex_exit(&ill->ill_lock); 15993 if (connp != NULL) 15994 mutex_exit(&connp->conn_lock); 15995 if (!success) 15996 return (EINTR); 15997 15998 /* 15999 * Crank up the resolver. For IPv6, this cranks up the 16000 * external resolver if one is configured, but even if an 16001 * external resolver isn't configured, it must be called to 16002 * reset DAD state. For IPv6, if an external resolver is not 16003 * being used, ipif_resolver_up() will never return 16004 * EINPROGRESS, so we can always call ipif_ndp_up() here. 16005 * Note that if an external resolver is being used, there's no 16006 * need to call ipif_ndp_up() since it will do nothing. 16007 */ 16008 err = ipif_resolver_up(ipif, Res_act_initial); 16009 if (err == EINPROGRESS) { 16010 /* We will complete it in ip_arp_done() */ 16011 return (err); 16012 } 16013 16014 if (isv6 && err == 0) 16015 err = ipif_ndp_up(ipif, B_TRUE); 16016 16017 ASSERT(err != EINPROGRESS); 16018 mp = ipsq_pending_mp_get(ipsq, &connp); 16019 ASSERT(mp != NULL); 16020 if (err != 0) 16021 return (err); 16022 } else { 16023 /* 16024 * Interfaces without underlying hardware don't do duplicate 16025 * address detection. 16026 */ 16027 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 16028 ipif->ipif_addr_ready = 1; 16029 } 16030 16031 err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif); 16032 if (err == 0 && ill->ill_move_ipif != NULL) { 16033 ipif = ill->ill_move_ipif; 16034 ill->ill_move_ipif = NULL; 16035 return (ipif_up(ipif, q, mp)); 16036 } 16037 return (err); 16038 } 16039 16040 /* 16041 * Perform a bind for the physical device. 16042 * When the routine returns EINPROGRESS then mp has been consumed and 16043 * the ioctl will be acked from ip_rput_dlpi. 16044 * Allocate an unbind message and save it until ipif_down. 
16045 */ 16046 static int 16047 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 16048 { 16049 areq_t *areq; 16050 mblk_t *areq_mp = NULL; 16051 mblk_t *bind_mp = NULL; 16052 mblk_t *unbind_mp = NULL; 16053 conn_t *connp; 16054 boolean_t success; 16055 uint16_t sap_addr; 16056 16057 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 16058 ASSERT(IAM_WRITER_ILL(ill)); 16059 ASSERT(mp != NULL); 16060 16061 /* Create a resolver cookie for ARP */ 16062 if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { 16063 areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); 16064 if (areq_mp == NULL) 16065 return (ENOMEM); 16066 16067 freemsg(ill->ill_resolver_mp); 16068 ill->ill_resolver_mp = areq_mp; 16069 areq = (areq_t *)areq_mp->b_rptr; 16070 sap_addr = ill->ill_sap; 16071 bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); 16072 } 16073 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 16074 DL_BIND_REQ); 16075 if (bind_mp == NULL) 16076 goto bad; 16077 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 16078 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 16079 16080 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); 16081 if (unbind_mp == NULL) 16082 goto bad; 16083 16084 /* 16085 * Record state needed to complete this operation when the 16086 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 16087 */ 16088 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; 16089 ASSERT(connp != NULL || !CONN_Q(q)); 16090 GRAB_CONN_LOCK(q); 16091 mutex_enter(&ipif->ipif_ill->ill_lock); 16092 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 16093 mutex_exit(&ipif->ipif_ill->ill_lock); 16094 RELEASE_CONN_LOCK(q); 16095 if (!success) 16096 goto bad; 16097 16098 /* 16099 * Save the unbind message for ill_dl_down(); it will be consumed when 16100 * the interface goes down. 
16101 */ 16102 ASSERT(ill->ill_unbind_mp == NULL); 16103 ill->ill_unbind_mp = unbind_mp; 16104 16105 ill_dlpi_send(ill, bind_mp); 16106 /* Send down link-layer capabilities probe if not already done. */ 16107 ill_capability_probe(ill); 16108 16109 /* 16110 * Sysid used to rely on the fact that netboots set domainname 16111 * and the like. Now that miniroot boots aren't strictly netboots 16112 * and miniroot network configuration is driven from userland 16113 * these things still need to be set. This situation can be detected 16114 * by comparing the interface being configured here to the one 16115 * dhcifname was set to reference by the boot loader. Once sysid is 16116 * converted to use dhcp_ipc_getinfo() this call can go away. 16117 */ 16118 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 16119 (strcmp(ill->ill_name, dhcifname) == 0) && 16120 (strlen(srpc_domain) == 0)) { 16121 if (dhcpinit() != 0) 16122 cmn_err(CE_WARN, "no cached dhcp response"); 16123 } 16124 16125 /* 16126 * This operation will complete in ip_rput_dlpi with either 16127 * a DL_BIND_ACK or DL_ERROR_ACK. 16128 */ 16129 return (EINPROGRESS); 16130 bad: 16131 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 16132 16133 freemsg(bind_mp); 16134 freemsg(unbind_mp); 16135 return (ENOMEM); 16136 } 16137 16138 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 16139 16140 /* 16141 * DLPI and ARP is up. 16142 * Create all the IREs associated with an interface bring up multicast. 16143 * Set the interface flag and finish other initialization 16144 * that potentially had to be differed to after DL_BIND_ACK. 
16145 */ 16146 int 16147 ipif_up_done(ipif_t *ipif) 16148 { 16149 ire_t *ire_array[20]; 16150 ire_t **irep = ire_array; 16151 ire_t **irep1; 16152 ipaddr_t net_mask = 0; 16153 ipaddr_t subnet_mask, route_mask; 16154 ill_t *ill = ipif->ipif_ill; 16155 queue_t *stq; 16156 ipif_t *src_ipif; 16157 ipif_t *tmp_ipif; 16158 boolean_t flush_ire_cache = B_TRUE; 16159 int err = 0; 16160 ire_t **ipif_saved_irep = NULL; 16161 int ipif_saved_ire_cnt; 16162 int cnt; 16163 boolean_t src_ipif_held = B_FALSE; 16164 boolean_t loopback = B_FALSE; 16165 ip_stack_t *ipst = ill->ill_ipst; 16166 16167 ip1dbg(("ipif_up_done(%s:%u)\n", 16168 ipif->ipif_ill->ill_name, ipif->ipif_id)); 16169 /* Check if this is a loopback interface */ 16170 if (ipif->ipif_ill->ill_wq == NULL) 16171 loopback = B_TRUE; 16172 16173 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 16174 /* 16175 * If all other interfaces for this ill are down or DEPRECATED, 16176 * or otherwise unsuitable for source address selection, remove 16177 * any IRE_CACHE entries for this ill to make sure source 16178 * address selection gets to take this new ipif into account. 16179 * No need to hold ill_lock while traversing the ipif list since 16180 * we are writer 16181 */ 16182 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 16183 tmp_ipif = tmp_ipif->ipif_next) { 16184 if (((tmp_ipif->ipif_flags & 16185 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 16186 !(tmp_ipif->ipif_flags & IPIF_UP)) || 16187 (tmp_ipif == ipif)) 16188 continue; 16189 /* first useable pre-existing interface */ 16190 flush_ire_cache = B_FALSE; 16191 break; 16192 } 16193 if (flush_ire_cache) 16194 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, 16195 IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); 16196 16197 /* 16198 * Figure out which way the send-to queue should go. Only 16199 * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK 16200 * should show up here. 
16201 */ 16202 switch (ill->ill_net_type) { 16203 case IRE_IF_RESOLVER: 16204 stq = ill->ill_rq; 16205 break; 16206 case IRE_IF_NORESOLVER: 16207 case IRE_LOOPBACK: 16208 stq = ill->ill_wq; 16209 break; 16210 default: 16211 return (EINVAL); 16212 } 16213 16214 if (IS_LOOPBACK(ill)) { 16215 /* 16216 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 16217 * ipif_lookup_on_name(), but in the case of zones we can have 16218 * several loopback addresses on lo0. So all the interfaces with 16219 * loopback addresses need to be marked IRE_LOOPBACK. 16220 */ 16221 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 16222 htonl(INADDR_LOOPBACK)) 16223 ipif->ipif_ire_type = IRE_LOOPBACK; 16224 else 16225 ipif->ipif_ire_type = IRE_LOCAL; 16226 } 16227 16228 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || 16229 ((ipif->ipif_flags & IPIF_DEPRECATED) && 16230 !(ipif->ipif_flags & IPIF_NOFAILOVER))) { 16231 /* 16232 * Can't use our source address. Select a different 16233 * source address for the IRE_INTERFACE and IRE_LOCAL 16234 */ 16235 src_ipif = ipif_select_source(ipif->ipif_ill, 16236 ipif->ipif_subnet, ipif->ipif_zoneid); 16237 if (src_ipif == NULL) 16238 src_ipif = ipif; /* Last resort */ 16239 else 16240 src_ipif_held = B_TRUE; 16241 } else { 16242 src_ipif = ipif; 16243 } 16244 16245 /* Create all the IREs associated with this interface */ 16246 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 16247 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 16248 16249 /* 16250 * If we're on a labeled system then make sure that zone- 16251 * private addresses have proper remote host database entries. 
16252 */ 16253 if (is_system_labeled() && 16254 ipif->ipif_ire_type != IRE_LOOPBACK && 16255 !tsol_check_interface_address(ipif)) 16256 return (EINVAL); 16257 16258 /* Register the source address for __sin6_src_id */ 16259 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 16260 ipif->ipif_zoneid, ipst); 16261 if (err != 0) { 16262 ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); 16263 return (err); 16264 } 16265 16266 /* If the interface address is set, create the local IRE. */ 16267 ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", 16268 (void *)ipif, 16269 ipif->ipif_ire_type, 16270 ntohl(ipif->ipif_lcl_addr))); 16271 *irep++ = ire_create( 16272 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 16273 (uchar_t *)&ip_g_all_ones, /* mask */ 16274 (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ 16275 NULL, /* no gateway */ 16276 &ip_loopback_mtuplus, /* max frag size */ 16277 NULL, 16278 ipif->ipif_rq, /* recv-from queue */ 16279 NULL, /* no send-to queue */ 16280 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 16281 ipif, 16282 0, 16283 0, 16284 0, 16285 (ipif->ipif_flags & IPIF_PRIVATE) ? 16286 RTF_PRIVATE : 0, 16287 &ire_uinfo_null, 16288 NULL, 16289 NULL, 16290 ipst); 16291 } else { 16292 ip1dbg(( 16293 "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", 16294 ipif->ipif_ire_type, 16295 ntohl(ipif->ipif_lcl_addr), 16296 (uint_t)ipif->ipif_flags)); 16297 } 16298 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 16299 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 16300 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 16301 } else { 16302 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 16303 } 16304 16305 subnet_mask = ipif->ipif_net_mask; 16306 16307 /* 16308 * If mask was not specified, use natural netmask of 16309 * interface address. Also, store this mask back into the 16310 * ipif struct. 
16311 */ 16312 if (subnet_mask == 0) { 16313 subnet_mask = net_mask; 16314 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 16315 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 16316 ipif->ipif_v6subnet); 16317 } 16318 16319 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 16320 if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && 16321 ipif->ipif_subnet != INADDR_ANY) { 16322 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 16323 16324 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 16325 route_mask = IP_HOST_MASK; 16326 } else { 16327 route_mask = subnet_mask; 16328 } 16329 16330 ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " 16331 "creating if IRE ill_net_type 0x%x for 0x%x\n", 16332 (void *)ipif, (void *)ill, 16333 ill->ill_net_type, 16334 ntohl(ipif->ipif_subnet))); 16335 *irep++ = ire_create( 16336 (uchar_t *)&ipif->ipif_subnet, /* dest address */ 16337 (uchar_t *)&route_mask, /* mask */ 16338 (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ 16339 NULL, /* no gateway */ 16340 &ipif->ipif_mtu, /* max frag */ 16341 NULL, 16342 NULL, /* no recv queue */ 16343 stq, /* send-to queue */ 16344 ill->ill_net_type, /* IF_[NO]RESOLVER */ 16345 ipif, 16346 0, 16347 0, 16348 0, 16349 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, 16350 &ire_uinfo_null, 16351 NULL, 16352 NULL, 16353 ipst); 16354 } 16355 16356 /* 16357 * Create any necessary broadcast IREs. 16358 */ 16359 if (ipif->ipif_flags & IPIF_BROADCAST) 16360 irep = ipif_create_bcast_ires(ipif, irep); 16361 16362 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 16363 16364 /* If an earlier ire_create failed, get out now */ 16365 for (irep1 = irep; irep1 > ire_array; ) { 16366 irep1--; 16367 if (*irep1 == NULL) { 16368 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 16369 err = ENOMEM; 16370 goto bad; 16371 } 16372 } 16373 16374 /* 16375 * Need to atomically check for IP address availability under 16376 * ip_addr_avail_lock. 
ill_g_lock is held as reader to ensure no new 16377 * ills or new ipifs can be added while we are checking availability. 16378 */ 16379 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16380 mutex_enter(&ipst->ips_ip_addr_avail_lock); 16381 /* Mark it up, and increment counters. */ 16382 ipif->ipif_flags |= IPIF_UP; 16383 ill->ill_ipif_up_count++; 16384 err = ip_addr_availability_check(ipif); 16385 mutex_exit(&ipst->ips_ip_addr_avail_lock); 16386 rw_exit(&ipst->ips_ill_g_lock); 16387 16388 if (err != 0) { 16389 /* 16390 * Our address may already be up on the same ill. In this case, 16391 * the ARP entry for our ipif replaced the one for the other 16392 * ipif. So we don't want to delete it (otherwise the other ipif 16393 * would be unable to send packets). 16394 * ip_addr_availability_check() identifies this case for us and 16395 * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL 16396 * which is the expected error code. 16397 */ 16398 if (err == EADDRINUSE) { 16399 freemsg(ipif->ipif_arp_del_mp); 16400 ipif->ipif_arp_del_mp = NULL; 16401 err = EADDRNOTAVAIL; 16402 } 16403 ill->ill_ipif_up_count--; 16404 ipif->ipif_flags &= ~IPIF_UP; 16405 goto bad; 16406 } 16407 16408 /* 16409 * Add in all newly created IREs. ire_create_bcast() has 16410 * already checked for duplicates of the IRE_BROADCAST type. 16411 */ 16412 for (irep1 = irep; irep1 > ire_array; ) { 16413 irep1--; 16414 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); 16415 /* 16416 * refheld by ire_add. refele towards the end of the func 16417 */ 16418 (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); 16419 } 16420 16421 /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ 16422 ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; 16423 ipif_saved_irep = ipif_recover_ire(ipif); 16424 16425 if (!loopback) { 16426 /* 16427 * If the broadcast address has been set, make sure it makes 16428 * sense based on the interface address. 
16429 * Only match on ill since we are sharing broadcast addresses. 16430 */ 16431 if ((ipif->ipif_brd_addr != INADDR_ANY) && 16432 (ipif->ipif_flags & IPIF_BROADCAST)) { 16433 ire_t *ire; 16434 16435 ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, 16436 IRE_BROADCAST, ipif, ALL_ZONES, 16437 NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); 16438 16439 if (ire == NULL) { 16440 /* 16441 * If there isn't a matching broadcast IRE, 16442 * revert to the default for this netmask. 16443 */ 16444 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16445 mutex_enter(&ipif->ipif_ill->ill_lock); 16446 ipif_set_default(ipif); 16447 mutex_exit(&ipif->ipif_ill->ill_lock); 16448 } else { 16449 ire_refrele(ire); 16450 } 16451 } 16452 16453 } 16454 16455 if (ill->ill_need_recover_multicast) { 16456 /* 16457 * Need to recover all multicast memberships in the driver. 16458 * This had to be deferred until we had attached. The same 16459 * code exists in ipif_up_done_v6() to recover IPv6 16460 * memberships. 16461 * 16462 * Note that it would be preferable to unconditionally do the 16463 * ill_recover_multicast() in ill_dl_up(), but we cannot do 16464 * that since ill_join_allmulti() depends on ill_dl_up being 16465 * set, and it is not set until we receive a DL_BIND_ACK after 16466 * having called ill_dl_up(). 16467 */ 16468 ill_recover_multicast(ill); 16469 } 16470 16471 if (ill->ill_ipif_up_count == 1) { 16472 /* 16473 * Since the interface is now up, it may now be active. 16474 */ 16475 if (IS_UNDER_IPMP(ill)) 16476 ipmp_ill_refresh_active(ill); 16477 16478 /* 16479 * If this is an IPMP interface, we may now be able to 16480 * establish ARP entries. 16481 */ 16482 if (IS_IPMP(ill)) 16483 ipmp_illgrp_refresh_arpent(ill->ill_grp); 16484 } 16485 16486 /* Join the allhosts multicast address */ 16487 ipif_multicast_up(ipif); 16488 16489 /* 16490 * See if anybody else would benefit from our new ipif. 
16491 */ 16492 if (!loopback && 16493 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 16494 ill_update_source_selection(ill); 16495 } 16496 16497 for (irep1 = irep; irep1 > ire_array; ) { 16498 irep1--; 16499 if (*irep1 != NULL) { 16500 /* was held in ire_add */ 16501 ire_refrele(*irep1); 16502 } 16503 } 16504 16505 cnt = ipif_saved_ire_cnt; 16506 for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { 16507 if (*irep1 != NULL) { 16508 /* was held in ire_add */ 16509 ire_refrele(*irep1); 16510 } 16511 } 16512 16513 if (!loopback && ipif->ipif_addr_ready) { 16514 /* Broadcast an address mask reply. */ 16515 ipif_mask_reply(ipif); 16516 } 16517 if (ipif_saved_irep != NULL) { 16518 kmem_free(ipif_saved_irep, 16519 ipif_saved_ire_cnt * sizeof (ire_t *)); 16520 } 16521 if (src_ipif_held) 16522 ipif_refrele(src_ipif); 16523 16524 /* 16525 * This had to be deferred until we had bound. Tell routing sockets and 16526 * others that this interface is up if it looks like the address has 16527 * been validated. Otherwise, if it isn't ready yet, wait for 16528 * duplicate address detection to do its thing. 16529 */ 16530 if (ipif->ipif_addr_ready) 16531 ipif_up_notify(ipif); 16532 return (0); 16533 16534 bad: 16535 ip1dbg(("ipif_up_done: FAILED \n")); 16536 16537 while (irep > ire_array) { 16538 irep--; 16539 if (*irep != NULL) 16540 ire_delete(*irep); 16541 } 16542 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); 16543 16544 if (ipif_saved_irep != NULL) { 16545 kmem_free(ipif_saved_irep, 16546 ipif_saved_ire_cnt * sizeof (ire_t *)); 16547 } 16548 if (src_ipif_held) 16549 ipif_refrele(src_ipif); 16550 16551 ipif_resolver_down(ipif); 16552 return (err); 16553 } 16554 16555 /* 16556 * Turn off the ARP with the ILLF_NOARP flag. 
16557 */ 16558 static int 16559 ill_arp_off(ill_t *ill) 16560 { 16561 mblk_t *arp_off_mp = NULL; 16562 mblk_t *arp_on_mp = NULL; 16563 16564 ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); 16565 16566 ASSERT(IAM_WRITER_ILL(ill)); 16567 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 16568 16569 /* 16570 * If the on message is still around we've already done 16571 * an arp_off without doing an arp_on thus there is no 16572 * work needed. 16573 */ 16574 if (ill->ill_arp_on_mp != NULL) 16575 return (0); 16576 16577 /* 16578 * Allocate an ARP on message (to be saved) and an ARP off message 16579 */ 16580 arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); 16581 if (!arp_off_mp) 16582 return (ENOMEM); 16583 16584 arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); 16585 if (!arp_on_mp) 16586 goto failed; 16587 16588 ASSERT(ill->ill_arp_on_mp == NULL); 16589 ill->ill_arp_on_mp = arp_on_mp; 16590 16591 /* Send an AR_INTERFACE_OFF request */ 16592 putnext(ill->ill_rq, arp_off_mp); 16593 return (0); 16594 failed: 16595 16596 if (arp_off_mp) 16597 freemsg(arp_off_mp); 16598 return (ENOMEM); 16599 } 16600 16601 /* 16602 * Turn on ARP by turning off the ILLF_NOARP flag. 16603 */ 16604 static int 16605 ill_arp_on(ill_t *ill) 16606 { 16607 mblk_t *mp; 16608 16609 ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); 16610 16611 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 16612 16613 ASSERT(IAM_WRITER_ILL(ill)); 16614 /* 16615 * Send an AR_INTERFACE_ON request if we have already done 16616 * an arp_off (which allocated the message). 16617 */ 16618 if (ill->ill_arp_on_mp != NULL) { 16619 mp = ill->ill_arp_on_mp; 16620 ill->ill_arp_on_mp = NULL; 16621 putnext(ill->ill_rq, mp); 16622 } 16623 return (0); 16624 } 16625 16626 /* 16627 * Checks for availbility of a usable source address (if there is one) when the 16628 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note 16629 * this selection is done regardless of the destination. 
16630 */ 16631 boolean_t 16632 ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) 16633 { 16634 uint_t ifindex; 16635 ipif_t *ipif = NULL; 16636 ill_t *uill; 16637 boolean_t isv6; 16638 ip_stack_t *ipst = ill->ill_ipst; 16639 16640 ASSERT(ill != NULL); 16641 16642 isv6 = ill->ill_isv6; 16643 ifindex = ill->ill_usesrc_ifindex; 16644 if (ifindex != 0) { 16645 uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, 16646 NULL, ipst); 16647 if (uill == NULL) 16648 return (NULL); 16649 mutex_enter(&uill->ill_lock); 16650 for (ipif = uill->ill_ipif; ipif != NULL; 16651 ipif = ipif->ipif_next) { 16652 if (!IPIF_CAN_LOOKUP(ipif)) 16653 continue; 16654 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 16655 continue; 16656 if (!(ipif->ipif_flags & IPIF_UP)) 16657 continue; 16658 if (ipif->ipif_zoneid != zoneid) 16659 continue; 16660 if ((isv6 && 16661 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || 16662 (ipif->ipif_lcl_addr == INADDR_ANY)) 16663 continue; 16664 mutex_exit(&uill->ill_lock); 16665 ill_refrele(uill); 16666 return (B_TRUE); 16667 } 16668 mutex_exit(&uill->ill_lock); 16669 ill_refrele(uill); 16670 } 16671 return (B_FALSE); 16672 } 16673 16674 /* 16675 * IP source address type, sorted from worst to best. For a given type, 16676 * always prefer IP addresses on the same subnet. All-zones addresses are 16677 * suboptimal because they pose problems with unlabeled destinations. 16678 */ 16679 typedef enum { 16680 IPIF_NONE, 16681 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 16682 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 16683 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 16684 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 16685 IPIF_DIFFNET, /* normal and different subnet */ 16686 IPIF_SAMENET /* normal and same subnet */ 16687 } ipif_type_t; 16688 16689 /* 16690 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 16691 * `zoneid'. 
We rate usable ipifs from low -> high as per the ipif_type_t 16692 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 16693 * the first one, unless IPMP is used in which case we round-robin among them; 16694 * see below for more. 16695 * 16696 * Returns NULL if there is no suitable source address for the ill. 16697 * This only occurs when there is no valid source address for the ill. 16698 */ 16699 ipif_t * 16700 ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) 16701 { 16702 ill_t *usill = NULL; 16703 ill_t *ipmp_ill = NULL; 16704 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 16705 ipif_type_t type, best_type; 16706 tsol_tpc_t *src_rhtp, *dst_rhtp; 16707 ip_stack_t *ipst = ill->ill_ipst; 16708 boolean_t samenet; 16709 16710 if (ill->ill_usesrc_ifindex != 0) { 16711 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 16712 B_FALSE, NULL, NULL, NULL, NULL, ipst); 16713 if (usill != NULL) 16714 ill = usill; /* Select source from usesrc ILL */ 16715 else 16716 return (NULL); 16717 } 16718 16719 /* 16720 * Test addresses should never be used for source address selection, 16721 * so if we were passed one, switch to the IPMP meta-interface. 16722 */ 16723 if (IS_UNDER_IPMP(ill)) { 16724 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 16725 ill = ipmp_ill; /* Select source from IPMP ill */ 16726 else 16727 return (NULL); 16728 } 16729 16730 /* 16731 * If we're dealing with an unlabeled destination on a labeled system, 16732 * make sure that we ignore source addresses that are incompatible with 16733 * the destination's default label. That destination's default label 16734 * must dominate the minimum label on the source address. 
16735 */ 16736 dst_rhtp = NULL; 16737 if (is_system_labeled()) { 16738 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 16739 if (dst_rhtp == NULL) 16740 return (NULL); 16741 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 16742 TPC_RELE(dst_rhtp); 16743 dst_rhtp = NULL; 16744 } 16745 } 16746 16747 /* 16748 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 16749 * can be deleted. But an ipif/ill can get CONDEMNED any time. 16750 * After selecting the right ipif, under ill_lock make sure ipif is 16751 * not condemned, and increment refcnt. If ipif is CONDEMNED, 16752 * we retry. Inside the loop we still need to check for CONDEMNED, 16753 * but not under a lock. 16754 */ 16755 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 16756 retry: 16757 /* 16758 * For source address selection, we treat the ipif list as circular 16759 * and continue until we get back to where we started. This allows 16760 * IPMP to vary source address selection (which improves inbound load 16761 * spreading) by caching its last ending point and starting from 16762 * there. NOTE: we don't have to worry about ill_src_ipif changing 16763 * ills since that can't happen on the IPMP ill. 16764 */ 16765 start_ipif = ill->ill_ipif; 16766 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 16767 start_ipif = ill->ill_src_ipif; 16768 16769 ipif = start_ipif; 16770 best_ipif = NULL; 16771 best_type = IPIF_NONE; 16772 do { 16773 if ((next_ipif = ipif->ipif_next) == NULL) 16774 next_ipif = ill->ill_ipif; 16775 16776 if (!IPIF_CAN_LOOKUP(ipif)) 16777 continue; 16778 /* Always skip NOLOCAL and ANYCAST interfaces */ 16779 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 16780 continue; 16781 if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) 16782 continue; 16783 if (ipif->ipif_zoneid != zoneid && 16784 ipif->ipif_zoneid != ALL_ZONES) 16785 continue; 16786 16787 /* 16788 * Interfaces with 0.0.0.0 address are allowed to be UP, but 16789 * are not valid as source addresses. 
16790 */ 16791 if (ipif->ipif_lcl_addr == INADDR_ANY) 16792 continue; 16793 16794 /* 16795 * Check compatibility of local address for destination's 16796 * default label if we're on a labeled system. Incompatible 16797 * addresses can't be used at all. 16798 */ 16799 if (dst_rhtp != NULL) { 16800 boolean_t incompat; 16801 16802 src_rhtp = find_tpc(&ipif->ipif_lcl_addr, 16803 IPV4_VERSION, B_FALSE); 16804 if (src_rhtp == NULL) 16805 continue; 16806 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO || 16807 src_rhtp->tpc_tp.tp_doi != 16808 dst_rhtp->tpc_tp.tp_doi || 16809 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label, 16810 &src_rhtp->tpc_tp.tp_sl_range_cipso) && 16811 !blinlset(&dst_rhtp->tpc_tp.tp_def_label, 16812 src_rhtp->tpc_tp.tp_sl_set_cipso)); 16813 TPC_RELE(src_rhtp); 16814 if (incompat) 16815 continue; 16816 } 16817 16818 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); 16819 16820 if (ipif->ipif_flags & IPIF_DEPRECATED) { 16821 type = samenet ? IPIF_SAMENET_DEPRECATED : 16822 IPIF_DIFFNET_DEPRECATED; 16823 } else if (ipif->ipif_zoneid == ALL_ZONES) { 16824 type = samenet ? IPIF_SAMENET_ALLZONES : 16825 IPIF_DIFFNET_ALLZONES; 16826 } else { 16827 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET; 16828 } 16829 16830 if (type > best_type) { 16831 best_type = type; 16832 best_ipif = ipif; 16833 if (best_type == IPIF_SAMENET) 16834 break; /* can't get better */ 16835 } 16836 } while ((ipif = next_ipif) != start_ipif); 16837 16838 if ((ipif = best_ipif) != NULL) { 16839 mutex_enter(&ipif->ipif_ill->ill_lock); 16840 if (!IPIF_CAN_LOOKUP(ipif)) { 16841 mutex_exit(&ipif->ipif_ill->ill_lock); 16842 goto retry; 16843 } 16844 ipif_refhold_locked(ipif); 16845 16846 /* 16847 * For IPMP, update the source ipif rotor to the next ipif, 16848 * provided we can look it up. (We must not use it if it's 16849 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after 16850 * ipif_free() checked ill_src_ipif.) 
16851 */ 16852 if (IS_IPMP(ill) && ipif != NULL) { 16853 next_ipif = ipif->ipif_next; 16854 if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) 16855 ill->ill_src_ipif = next_ipif; 16856 else 16857 ill->ill_src_ipif = NULL; 16858 } 16859 mutex_exit(&ipif->ipif_ill->ill_lock); 16860 } 16861 16862 rw_exit(&ipst->ips_ill_g_lock); 16863 if (usill != NULL) 16864 ill_refrele(usill); 16865 if (ipmp_ill != NULL) 16866 ill_refrele(ipmp_ill); 16867 if (dst_rhtp != NULL) 16868 TPC_RELE(dst_rhtp); 16869 16870 #ifdef DEBUG 16871 if (ipif == NULL) { 16872 char buf1[INET6_ADDRSTRLEN]; 16873 16874 ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", 16875 ill->ill_name, 16876 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); 16877 } else { 16878 char buf1[INET6_ADDRSTRLEN]; 16879 char buf2[INET6_ADDRSTRLEN]; 16880 16881 ip1dbg(("ipif_select_source(%s, %s) -> %s\n", 16882 ipif->ipif_ill->ill_name, 16883 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), 16884 inet_ntop(AF_INET, &ipif->ipif_lcl_addr, 16885 buf2, sizeof (buf2)))); 16886 } 16887 #endif /* DEBUG */ 16888 return (ipif); 16889 } 16890 16891 /* 16892 * If old_ipif is not NULL, see if ipif was derived from old 16893 * ipif and if so, recreate the interface route by re-doing 16894 * source address selection. This happens when ipif_down -> 16895 * ipif_update_other_ipifs calls us. 16896 * 16897 * If old_ipif is NULL, just redo the source address selection 16898 * if needed. This happens when ipif_up_done calls us. 
16899 */ 16900 static void 16901 ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) 16902 { 16903 ire_t *ire; 16904 ire_t *ipif_ire; 16905 queue_t *stq; 16906 ipif_t *nipif; 16907 ill_t *ill; 16908 boolean_t need_rele = B_FALSE; 16909 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16910 16911 ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); 16912 ASSERT(IAM_WRITER_IPIF(ipif)); 16913 16914 ill = ipif->ipif_ill; 16915 if (!(ipif->ipif_flags & 16916 (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { 16917 /* 16918 * Can't possibly have borrowed the source 16919 * from old_ipif. 16920 */ 16921 return; 16922 } 16923 16924 /* 16925 * Is there any work to be done? No work if the address 16926 * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( 16927 * ipif_select_source() does not borrow addresses from 16928 * NOLOCAL and ANYCAST interfaces). 16929 */ 16930 if ((old_ipif != NULL) && 16931 ((old_ipif->ipif_lcl_addr == INADDR_ANY) || 16932 (old_ipif->ipif_ill->ill_wq == NULL) || 16933 (old_ipif->ipif_flags & 16934 (IPIF_NOLOCAL|IPIF_ANYCAST)))) { 16935 return; 16936 } 16937 16938 /* 16939 * Perform the same checks as when creating the 16940 * IRE_INTERFACE in ipif_up_done. 16941 */ 16942 if (!(ipif->ipif_flags & IPIF_UP)) 16943 return; 16944 16945 if ((ipif->ipif_flags & IPIF_NOXMIT) || 16946 (ipif->ipif_subnet == INADDR_ANY)) 16947 return; 16948 16949 ipif_ire = ipif_to_ire(ipif); 16950 if (ipif_ire == NULL) 16951 return; 16952 16953 /* 16954 * We know that ipif uses some other source for its 16955 * IRE_INTERFACE. Is it using the source of this 16956 * old_ipif? 
16957 */ 16958 if (old_ipif != NULL && 16959 old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { 16960 ire_refrele(ipif_ire); 16961 return; 16962 } 16963 if (ip_debug > 2) { 16964 /* ip1dbg */ 16965 pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" 16966 " src %s\n", AF_INET, &ipif_ire->ire_src_addr); 16967 } 16968 16969 stq = ipif_ire->ire_stq; 16970 16971 /* 16972 * Can't use our source address. Select a different 16973 * source address for the IRE_INTERFACE. 16974 */ 16975 nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); 16976 if (nipif == NULL) { 16977 /* Last resort - all ipif's have IPIF_NOLOCAL */ 16978 nipif = ipif; 16979 } else { 16980 need_rele = B_TRUE; 16981 } 16982 16983 ire = ire_create( 16984 (uchar_t *)&ipif->ipif_subnet, /* dest pref */ 16985 (uchar_t *)&ipif->ipif_net_mask, /* mask */ 16986 (uchar_t *)&nipif->ipif_src_addr, /* src addr */ 16987 NULL, /* no gateway */ 16988 &ipif->ipif_mtu, /* max frag */ 16989 NULL, /* no src nce */ 16990 NULL, /* no recv from queue */ 16991 stq, /* send-to queue */ 16992 ill->ill_net_type, /* IF_[NO]RESOLVER */ 16993 ipif, 16994 0, 16995 0, 16996 0, 16997 0, 16998 &ire_uinfo_null, 16999 NULL, 17000 NULL, 17001 ipst); 17002 17003 if (ire != NULL) { 17004 ire_t *ret_ire; 17005 int error; 17006 17007 /* 17008 * We don't need ipif_ire anymore. We need to delete 17009 * before we add so that ire_add does not detect 17010 * duplicates. 17011 */ 17012 ire_delete(ipif_ire); 17013 ret_ire = ire; 17014 error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); 17015 ASSERT(error == 0); 17016 ASSERT(ire == ret_ire); 17017 /* Held in ire_add */ 17018 ire_refrele(ret_ire); 17019 } 17020 /* 17021 * Either we are falling through from above or could not 17022 * allocate a replacement. 17023 */ 17024 ire_refrele(ipif_ire); 17025 if (need_rele) 17026 ipif_refrele(nipif); 17027 } 17028 17029 /* 17030 * This old_ipif is going away. 
17031 * 17032 * Determine if any other ipif's are using our address as 17033 * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or 17034 * IPIF_DEPRECATED). 17035 * Find the IRE_INTERFACE for such ipifs and recreate them 17036 * to use an different source address following the rules in 17037 * ipif_up_done. 17038 */ 17039 static void 17040 ipif_update_other_ipifs(ipif_t *old_ipif) 17041 { 17042 ipif_t *ipif; 17043 ill_t *ill; 17044 char buf[INET6_ADDRSTRLEN]; 17045 17046 ASSERT(IAM_WRITER_IPIF(old_ipif)); 17047 17048 ill = old_ipif->ipif_ill; 17049 17050 ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, 17051 inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); 17052 17053 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17054 if (ipif == old_ipif) 17055 continue; 17056 ipif_recreate_interface_routes(old_ipif, ipif); 17057 } 17058 } 17059 17060 /* ARGSUSED */ 17061 int 17062 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 17063 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 17064 { 17065 /* 17066 * ill_phyint_reinit merged the v4 and v6 into a single 17067 * ipsq. We might not have been able to complete the 17068 * operation in ipif_set_values, if we could not become 17069 * exclusive. If so restart it here. 17070 */ 17071 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 17072 } 17073 17074 /* 17075 * Can operate on either a module or a driver queue. 17076 * Returns an error if not a module queue. 
17077 */ 17078 /* ARGSUSED */ 17079 int 17080 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 17081 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 17082 { 17083 queue_t *q1 = q; 17084 char *cp; 17085 char interf_name[LIFNAMSIZ]; 17086 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr; 17087 17088 if (q->q_next == NULL) { 17089 ip1dbg(( 17090 "if_unitsel: IF_UNITSEL: no q_next\n")); 17091 return (EINVAL); 17092 } 17093 17094 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0') 17095 return (EALREADY); 17096 17097 do { 17098 q1 = q1->q_next; 17099 } while (q1->q_next); 17100 cp = q1->q_qinfo->qi_minfo->mi_idname; 17101 (void) sprintf(interf_name, "%s%d", cp, ppa); 17102 17103 /* 17104 * Here we are not going to delay the ioack until after 17105 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the 17106 * original ioctl message before sending the requests. 17107 */ 17108 return (ipif_set_values(q, mp, interf_name, &ppa)); 17109 } 17110 17111 /* ARGSUSED */ 17112 int 17113 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 17114 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 17115 { 17116 return (ENXIO); 17117 } 17118 17119 /* 17120 * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the 17121 * minimum (but complete) set exist. This is necessary when adding or 17122 * removing an interface to/from an IPMP group, since interfaces in an 17123 * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever 17124 * its test address subnets overlap with IPMP data addresses). It's also 17125 * used to refresh the IRE_BROADCAST entries associated with the IPMP 17126 * interface when the nominated broadcast interface changes. 
17127 */ 17128 void 17129 ill_refresh_bcast(ill_t *ill) 17130 { 17131 ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ 17132 ire_t **irep; 17133 ipif_t *ipif; 17134 17135 ASSERT(!ill->ill_isv6); 17136 ASSERT(IAM_WRITER_ILL(ill)); 17137 17138 /* 17139 * Remove any old broadcast IREs. 17140 */ 17141 ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, 17142 ill_broadcast_delete, ill, ill); 17143 17144 /* 17145 * Create new ones for any ipifs that are up and broadcast-capable. 17146 */ 17147 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17148 if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != 17149 (IPIF_UP|IPIF_BROADCAST)) 17150 continue; 17151 17152 irep = ipif_create_bcast_ires(ipif, ire_array); 17153 while (irep-- > ire_array) { 17154 (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); 17155 if (*irep != NULL) 17156 ire_refrele(*irep); 17157 } 17158 } 17159 } 17160 17161 /* 17162 * Create any IRE_BROADCAST entries for `ipif', and store those entries in 17163 * `irep'. Returns a pointer to the next free `irep' entry (just like 17164 * ire_check_and_create_bcast()). 
17165 */ 17166 static ire_t ** 17167 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) 17168 { 17169 ipaddr_t addr; 17170 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); 17171 ipaddr_t subnetmask = ipif->ipif_net_mask; 17172 int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; 17173 17174 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); 17175 17176 ASSERT(ipif->ipif_flags & IPIF_BROADCAST); 17177 17178 if (ipif->ipif_lcl_addr == INADDR_ANY || 17179 (ipif->ipif_flags & IPIF_NOLOCAL)) 17180 netmask = htonl(IN_CLASSA_NET); /* fallback */ 17181 17182 irep = ire_check_and_create_bcast(ipif, 0, irep, flags); 17183 irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags); 17184 17185 /* 17186 * For backward compatibility, we create net broadcast IREs based on 17187 * the old "IP address class system", since some old machines only 17188 * respond to these class derived net broadcast. However, we must not 17189 * create these net broadcast IREs if the subnetmask is shorter than 17190 * the IP address class based derived netmask. Otherwise, we may 17191 * create a net broadcast address which is the same as an IP address 17192 * on the subnet -- and then TCP will refuse to talk to that address. 17193 */ 17194 if (netmask < subnetmask) { 17195 addr = netmask & ipif->ipif_subnet; 17196 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 17197 irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep, 17198 flags); 17199 } 17200 17201 /* 17202 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask 17203 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already 17204 * created. Creating these broadcast IREs will only create confusion 17205 * as `addr' will be the same as the IP address. 
17206 */ 17207 if (subnetmask != 0xFFFFFFFF) { 17208 addr = ipif->ipif_subnet; 17209 irep = ire_check_and_create_bcast(ipif, addr, irep, flags); 17210 irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr, 17211 irep, flags); 17212 } 17213 17214 return (irep); 17215 } 17216 17217 /* 17218 * Broadcast IRE info structure used in the functions below. Since we 17219 * allocate BCAST_COUNT of them on the stack, keep the bit layout compact. 17220 */ 17221 typedef struct bcast_ireinfo { 17222 uchar_t bi_type; /* BCAST_* value from below */ 17223 uchar_t bi_willdie:1, /* will this IRE be going away? */ 17224 bi_needrep:1, /* do we need to replace it? */ 17225 bi_haverep:1, /* have we replaced it? */ 17226 bi_pad:5; 17227 ipaddr_t bi_addr; /* IRE address */ 17228 ipif_t *bi_backup; /* last-ditch ipif to replace it on */ 17229 } bcast_ireinfo_t; 17230 17231 enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT }; 17232 17233 /* 17234 * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and 17235 * return B_TRUE if it should immediately be used to recreate the IRE. 
17236 */ 17237 static boolean_t 17238 ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop) 17239 { 17240 ipaddr_t addr; 17241 17242 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie); 17243 17244 switch (bireinfop->bi_type) { 17245 case BCAST_NET: 17246 addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet); 17247 if (addr != bireinfop->bi_addr) 17248 return (B_FALSE); 17249 break; 17250 case BCAST_SUBNET: 17251 if (ipif->ipif_subnet != bireinfop->bi_addr) 17252 return (B_FALSE); 17253 break; 17254 } 17255 17256 bireinfop->bi_needrep = 1; 17257 if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) { 17258 if (bireinfop->bi_backup == NULL) 17259 bireinfop->bi_backup = ipif; 17260 return (B_FALSE); 17261 } 17262 return (B_TRUE); 17263 } 17264 17265 /* 17266 * Create the broadcast IREs described by `bireinfop' on `ipif', and return 17267 * them ala ire_check_and_create_bcast(). 17268 */ 17269 static ire_t ** 17270 ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep) 17271 { 17272 ipaddr_t mask, addr; 17273 17274 ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep); 17275 17276 addr = bireinfop->bi_addr; 17277 irep = ire_create_bcast(ipif, addr, irep); 17278 17279 switch (bireinfop->bi_type) { 17280 case BCAST_NET: 17281 mask = ip_net_mask(ipif->ipif_subnet); 17282 irep = ire_create_bcast(ipif, addr | ~mask, irep); 17283 break; 17284 case BCAST_SUBNET: 17285 mask = ipif->ipif_net_mask; 17286 irep = ire_create_bcast(ipif, addr | ~mask, irep); 17287 break; 17288 } 17289 17290 bireinfop->bi_haverep = 1; 17291 return (irep); 17292 } 17293 17294 /* 17295 * Walk through all of the ipifs on `ill' that will be affected by `test_ipif' 17296 * going away, and determine if any of the broadcast IREs (named by `bireinfop') 17297 * that are going away are still needed. If so, have ipif_create_bcast() 17298 * recreate them (except for the deprecated case, as explained below). 
17299 */ 17300 static ire_t ** 17301 ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo, 17302 ire_t **irep) 17303 { 17304 int i; 17305 ipif_t *ipif; 17306 17307 ASSERT(!ill->ill_isv6); 17308 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17309 /* 17310 * Skip this ipif if it's (a) the one being taken down, (b) 17311 * not in the same zone, or (c) has no valid local address. 17312 */ 17313 if (ipif == test_ipif || 17314 ipif->ipif_zoneid != test_ipif->ipif_zoneid || 17315 ipif->ipif_subnet == 0 || 17316 (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) != 17317 (IPIF_UP|IPIF_BROADCAST)) 17318 continue; 17319 17320 /* 17321 * For each dying IRE that hasn't yet been replaced, see if 17322 * `ipif' needs it and whether the IRE should be recreated on 17323 * `ipif'. If `ipif' is deprecated, ipif_consider_bcast() 17324 * will return B_FALSE even if `ipif' needs the IRE on the 17325 * hopes that we'll later find a needy non-deprecated ipif. 17326 * However, the ipif is recorded in bi_backup for possible 17327 * subsequent use by ipif_check_bcast_ires(). 17328 */ 17329 for (i = 0; i < BCAST_COUNT; i++) { 17330 if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep) 17331 continue; 17332 if (!ipif_consider_bcast(ipif, &bireinfo[i])) 17333 continue; 17334 irep = ipif_create_bcast(ipif, &bireinfo[i], irep); 17335 } 17336 17337 /* 17338 * If we've replaced all of the broadcast IREs that are going 17339 * to be taken down, we know we're done. 17340 */ 17341 for (i = 0; i < BCAST_COUNT; i++) { 17342 if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep) 17343 break; 17344 } 17345 if (i == BCAST_COUNT) 17346 break; 17347 } 17348 return (irep); 17349 } 17350 17351 /* 17352 * Check if `test_ipif' (which is going away) is associated with any existing 17353 * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were 17354 * using those broadcast IREs. 
If so, recreate the broadcast IREs on one or 17355 * more of those other ipifs. (The old IREs will be deleted in ipif_down().) 17356 * 17357 * This is necessary because broadcast IREs are shared. In particular, a 17358 * given ill has one set of all-zeroes and all-ones broadcast IREs (for every 17359 * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones, 17360 * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP 17361 * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the 17362 * same zone, they will share the same set of broadcast IREs. 17363 * 17364 * Note: the upper bound of 12 IREs comes from the worst case of replacing all 17365 * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes, 17366 * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones). 17367 */ 17368 static void 17369 ipif_check_bcast_ires(ipif_t *test_ipif) 17370 { 17371 ill_t *ill = test_ipif->ipif_ill; 17372 ire_t *ire, *ire_array[12]; /* see note above */ 17373 ire_t **irep1, **irep = &ire_array[0]; 17374 uint_t i, willdie; 17375 ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet); 17376 bcast_ireinfo_t bireinfo[BCAST_COUNT]; 17377 17378 ASSERT(!test_ipif->ipif_isv6); 17379 ASSERT(IAM_WRITER_IPIF(test_ipif)); 17380 17381 /* 17382 * No broadcast IREs for the LOOPBACK interface 17383 * or others such as point to point and IPIF_NOXMIT. 
17384 */ 17385 if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || 17386 (test_ipif->ipif_flags & IPIF_NOXMIT)) 17387 return; 17388 17389 bzero(bireinfo, sizeof (bireinfo)); 17390 bireinfo[0].bi_type = BCAST_ALLZEROES; 17391 bireinfo[0].bi_addr = 0; 17392 17393 bireinfo[1].bi_type = BCAST_ALLONES; 17394 bireinfo[1].bi_addr = INADDR_BROADCAST; 17395 17396 bireinfo[2].bi_type = BCAST_NET; 17397 bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask; 17398 17399 if (test_ipif->ipif_net_mask != 0) 17400 mask = test_ipif->ipif_net_mask; 17401 bireinfo[3].bi_type = BCAST_SUBNET; 17402 bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask; 17403 17404 /* 17405 * Figure out what (if any) broadcast IREs will die as a result of 17406 * `test_ipif' going away. If none will die, we're done. 17407 */ 17408 for (i = 0, willdie = 0; i < BCAST_COUNT; i++) { 17409 ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST, 17410 test_ipif, ALL_ZONES, NULL, 17411 (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst); 17412 if (ire != NULL) { 17413 willdie++; 17414 bireinfo[i].bi_willdie = 1; 17415 ire_refrele(ire); 17416 } 17417 } 17418 17419 if (willdie == 0) 17420 return; 17421 17422 /* 17423 * Walk through all the ipifs that will be affected by the dying IREs, 17424 * and recreate the IREs as necessary. Note that all interfaces in an 17425 * IPMP illgrp share the same broadcast IREs, and thus the entire 17426 * illgrp must be walked, starting with the IPMP meta-interface (so 17427 * that broadcast IREs end up on it whenever possible). 
17428 */ 17429 if (IS_UNDER_IPMP(ill)) 17430 ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); 17431 17432 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 17433 17434 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 17435 ipmp_illgrp_t *illg = ill->ill_grp; 17436 17437 ill = list_head(&illg->ig_if); 17438 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { 17439 for (i = 0; i < BCAST_COUNT; i++) { 17440 if (bireinfo[i].bi_willdie && 17441 !bireinfo[i].bi_haverep) 17442 break; 17443 } 17444 if (i == BCAST_COUNT) 17445 break; 17446 17447 irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); 17448 } 17449 } 17450 17451 /* 17452 * Scan through the set of broadcast IREs and see if there are any 17453 * that we need to replace that have not yet been replaced. If so, 17454 * replace them using the appropriate backup ipif. 17455 */ 17456 for (i = 0; i < BCAST_COUNT; i++) { 17457 if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep) 17458 irep = ipif_create_bcast(bireinfo[i].bi_backup, 17459 &bireinfo[i], irep); 17460 } 17461 17462 /* 17463 * If we can't create all of them, don't add any of them. (Code in 17464 * ip_wput_ire() and ire_to_ill() assumes that we always have a 17465 * non-loopback copy and loopback copy for a given address.) 17466 */ 17467 for (irep1 = irep; irep1 > ire_array; ) { 17468 irep1--; 17469 if (*irep1 == NULL) { 17470 ip0dbg(("ipif_check_bcast_ires: can't create " 17471 "IRE_BROADCAST, memory allocation failure\n")); 17472 while (irep > ire_array) { 17473 irep--; 17474 if (*irep != NULL) 17475 ire_delete(*irep); 17476 } 17477 return; 17478 } 17479 } 17480 17481 for (irep1 = irep; irep1 > ire_array; ) { 17482 irep1--; 17483 if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0) 17484 ire_refrele(*irep1); /* Held in ire_add */ 17485 } 17486 } 17487 17488 /* 17489 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV* 17490 * from lifr_flags and the name from lifr_name. 
17491 * Set IFF_IPV* and ill_isv6 prior to doing the lookup 17492 * since ipif_lookup_on_name uses the _isv6 flags when matching. 17493 * Returns EINPROGRESS when mp has been consumed by queueing it on 17494 * ill_pending_mp and the ioctl will complete in ip_rput. 17495 * 17496 * Can operate on either a module or a driver queue. 17497 * Returns an error if not a module queue. 17498 */ 17499 /* ARGSUSED */ 17500 int 17501 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17502 ip_ioctl_cmd_t *ipip, void *if_req) 17503 { 17504 ill_t *ill = q->q_ptr; 17505 phyint_t *phyi; 17506 ip_stack_t *ipst; 17507 struct lifreq *lifr = if_req; 17508 uint64_t new_flags; 17509 17510 ASSERT(ipif != NULL); 17511 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name)); 17512 17513 if (q->q_next == NULL) { 17514 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n")); 17515 return (EINVAL); 17516 } 17517 17518 /* 17519 * If we are not writer on 'q' then this interface exists already 17520 * and previous lookups (ip_extract_lifreq()) found this ipif -- 17521 * so return EALREADY. 17522 */ 17523 if (ill != ipif->ipif_ill) 17524 return (EALREADY); 17525 17526 if (ill->ill_name[0] != '\0') 17527 return (EALREADY); 17528 17529 /* 17530 * If there's another ill already with the requested name, ensure 17531 * that it's of the same type. Otherwise, ill_phyint_reinit() will 17532 * fuse together two unrelated ills, which will cause chaos. 
17533 */ 17534 ipst = ill->ill_ipst; 17535 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 17536 lifr->lifr_name, NULL); 17537 if (phyi != NULL) { 17538 ill_t *ill_mate = phyi->phyint_illv4; 17539 17540 if (ill_mate == NULL) 17541 ill_mate = phyi->phyint_illv6; 17542 ASSERT(ill_mate != NULL); 17543 17544 if (ill_mate->ill_media->ip_m_mac_type != 17545 ill->ill_media->ip_m_mac_type) { 17546 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to " 17547 "use the same ill name on differing media\n")); 17548 return (EINVAL); 17549 } 17550 } 17551 17552 /* 17553 * We start off as IFF_IPV4 in ipif_allocate and become 17554 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value. 17555 * The only flags that we read from user space are IFF_IPV4, 17556 * IFF_IPV6, IFF_XRESOLV and IFF_BROADCAST. 17557 * 17558 * This ill has not been inserted into the global list. 17559 * So we are still single threaded and don't need any lock 17560 * 17561 * Saniy check the flags. 17562 */ 17563 17564 if ((lifr->lifr_flags & IFF_BROADCAST) && 17565 ((lifr->lifr_flags & IFF_IPV6) || 17566 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) { 17567 ip1dbg(("ip_sioctl_slifname: link not broadcast capable " 17568 "or IPv6 i.e., no broadcast \n")); 17569 return (EINVAL); 17570 } 17571 17572 new_flags = 17573 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_XRESOLV|IFF_BROADCAST); 17574 17575 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { 17576 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " 17577 "IFF_IPV4 or IFF_IPV6\n")); 17578 return (EINVAL); 17579 } 17580 /* 17581 * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. 17582 */ 17583 if ((new_flags & IFF_XRESOLV) && !(new_flags & IFF_IPV6) && 17584 !(ipif->ipif_isv6)) { 17585 ip1dbg(("ip_sioctl_slifname: XRESOLV only allowed on " 17586 "IPv6 interface\n")); 17587 return (EINVAL); 17588 } 17589 17590 /* 17591 * We always start off as IPv4, so only need to check for IPv6. 
17592 */ 17593 if ((new_flags & IFF_IPV6) != 0) { 17594 ill->ill_flags |= ILLF_IPV6; 17595 ill->ill_flags &= ~ILLF_IPV4; 17596 } 17597 17598 if ((new_flags & IFF_BROADCAST) != 0) 17599 ipif->ipif_flags |= IPIF_BROADCAST; 17600 else 17601 ipif->ipif_flags &= ~IPIF_BROADCAST; 17602 17603 if ((new_flags & IFF_XRESOLV) != 0) 17604 ill->ill_flags |= ILLF_XRESOLV; 17605 else 17606 ill->ill_flags &= ~ILLF_XRESOLV; 17607 17608 /* We started off as V4. */ 17609 if (ill->ill_flags & ILLF_IPV6) { 17610 ill->ill_phyint->phyint_illv6 = ill; 17611 ill->ill_phyint->phyint_illv4 = NULL; 17612 } 17613 17614 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 17615 } 17616 17617 /* ARGSUSED */ 17618 int 17619 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17620 ip_ioctl_cmd_t *ipip, void *if_req) 17621 { 17622 /* 17623 * ill_phyint_reinit merged the v4 and v6 into a single 17624 * ipsq. We might not have been able to complete the 17625 * slifname in ipif_set_values, if we could not become 17626 * exclusive. If so restart it here 17627 */ 17628 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 17629 } 17630 17631 /* 17632 * Return a pointer to the ipif which matches the index, IP version type and 17633 * zoneid. 
17634 */ 17635 ipif_t * 17636 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 17637 queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) 17638 { 17639 ill_t *ill; 17640 ipif_t *ipif = NULL; 17641 17642 ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || 17643 (q != NULL && mp != NULL && func != NULL && err != NULL)); 17644 17645 if (err != NULL) 17646 *err = 0; 17647 17648 ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); 17649 if (ill != NULL) { 17650 mutex_enter(&ill->ill_lock); 17651 for (ipif = ill->ill_ipif; ipif != NULL; 17652 ipif = ipif->ipif_next) { 17653 if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES || 17654 zoneid == ipif->ipif_zoneid || 17655 ipif->ipif_zoneid == ALL_ZONES)) { 17656 ipif_refhold_locked(ipif); 17657 break; 17658 } 17659 } 17660 mutex_exit(&ill->ill_lock); 17661 ill_refrele(ill); 17662 if (ipif == NULL && err != NULL) 17663 *err = ENXIO; 17664 } 17665 return (ipif); 17666 } 17667 17668 /* 17669 * Change an existing physical interface's index. If the new index 17670 * is acceptable we update the index and the phyint_list_avl_by_index tree. 17671 * Finally, we update other systems which may have a dependence on the 17672 * index value. 17673 */ 17674 /* ARGSUSED */ 17675 int 17676 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17677 ip_ioctl_cmd_t *ipip, void *ifreq) 17678 { 17679 ill_t *ill; 17680 phyint_t *phyi; 17681 struct ifreq *ifr = (struct ifreq *)ifreq; 17682 struct lifreq *lifr = (struct lifreq *)ifreq; 17683 uint_t old_index, index; 17684 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 17685 avl_index_t where; 17686 17687 if (ipip->ipi_cmd_type == IF_CMD) 17688 index = ifr->ifr_index; 17689 else 17690 index = lifr->lifr_index; 17691 17692 /* 17693 * Only allow on physical interface. Also, index zero is illegal. 
17694 */ 17695 ill = ipif->ipif_ill; 17696 phyi = ill->ill_phyint; 17697 if (ipif->ipif_id != 0 || index == 0) { 17698 return (EINVAL); 17699 } 17700 17701 /* If the index is not changing, no work to do */ 17702 if (phyi->phyint_ifindex == index) 17703 return (0); 17704 17705 /* 17706 * Use phyint_exists() to determine if the new interface index 17707 * is already in use. If the index is unused then we need to 17708 * change the phyint's position in the phyint_list_avl_by_index 17709 * tree. If we do not do this, subsequent lookups (using the new 17710 * index value) will not find the phyint. 17711 */ 17712 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 17713 if (phyint_exists(index, ipst)) { 17714 rw_exit(&ipst->ips_ill_g_lock); 17715 return (EEXIST); 17716 } 17717 17718 /* The new index is unused. Set it in the phyint. */ 17719 old_index = phyi->phyint_ifindex; 17720 phyi->phyint_ifindex = index; 17721 17722 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 17723 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17724 &index, &where); 17725 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17726 phyi, where); 17727 rw_exit(&ipst->ips_ill_g_lock); 17728 17729 /* Update SCTP's ILL list */ 17730 sctp_ill_reindex(ill, old_index); 17731 17732 /* Send the routing sockets message */ 17733 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 17734 if (ILL_OTHER(ill)) 17735 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 17736 17737 return (0); 17738 } 17739 17740 /* ARGSUSED */ 17741 int 17742 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17743 ip_ioctl_cmd_t *ipip, void *ifreq) 17744 { 17745 struct ifreq *ifr = (struct ifreq *)ifreq; 17746 struct lifreq *lifr = (struct lifreq *)ifreq; 17747 17748 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 17749 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17750 /* Get the interface index */ 17751 if (ipip->ipi_cmd_type == IF_CMD) { 17752 ifr->ifr_index = 
ipif->ipif_ill->ill_phyint->phyint_ifindex; 17753 } else { 17754 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 17755 } 17756 return (0); 17757 } 17758 17759 /* ARGSUSED */ 17760 int 17761 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17762 ip_ioctl_cmd_t *ipip, void *ifreq) 17763 { 17764 struct lifreq *lifr = (struct lifreq *)ifreq; 17765 17766 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 17767 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17768 /* Get the interface zone */ 17769 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17770 lifr->lifr_zoneid = ipif->ipif_zoneid; 17771 return (0); 17772 } 17773 17774 /* 17775 * Set the zoneid of an interface. 17776 */ 17777 /* ARGSUSED */ 17778 int 17779 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17780 ip_ioctl_cmd_t *ipip, void *ifreq) 17781 { 17782 struct lifreq *lifr = (struct lifreq *)ifreq; 17783 int err = 0; 17784 boolean_t need_up = B_FALSE; 17785 zone_t *zptr; 17786 zone_status_t status; 17787 zoneid_t zoneid; 17788 17789 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17790 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 17791 if (!is_system_labeled()) 17792 return (ENOTSUP); 17793 zoneid = GLOBAL_ZONEID; 17794 } 17795 17796 /* cannot assign instance zero to a non-global zone */ 17797 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 17798 return (ENOTSUP); 17799 17800 /* 17801 * Cannot assign to a zone that doesn't exist or is shutting down. In 17802 * the event of a race with the zone shutdown processing, since IP 17803 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 17804 * interface will be cleaned up even if the zone is shut down 17805 * immediately after the status check. If the interface can't be brought 17806 * down right away, and the zone is shut down before the restart 17807 * function is called, we resolve the possible races by rechecking the 17808 * zone status in the restart function. 
17809 */ 17810 if ((zptr = zone_find_by_id(zoneid)) == NULL) 17811 return (EINVAL); 17812 status = zone_status_get(zptr); 17813 zone_rele(zptr); 17814 17815 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 17816 return (EINVAL); 17817 17818 if (ipif->ipif_flags & IPIF_UP) { 17819 /* 17820 * If the interface is already marked up, 17821 * we call ipif_down which will take care 17822 * of ditching any IREs that have been set 17823 * up based on the old interface address. 17824 */ 17825 err = ipif_logical_down(ipif, q, mp); 17826 if (err == EINPROGRESS) 17827 return (err); 17828 ipif_down_tail(ipif); 17829 need_up = B_TRUE; 17830 } 17831 17832 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 17833 return (err); 17834 } 17835 17836 static int 17837 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 17838 queue_t *q, mblk_t *mp, boolean_t need_up) 17839 { 17840 int err = 0; 17841 ip_stack_t *ipst; 17842 17843 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 17844 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17845 17846 if (CONN_Q(q)) 17847 ipst = CONNQ_TO_IPST(q); 17848 else 17849 ipst = ILLQ_TO_IPST(q); 17850 17851 /* 17852 * For exclusive stacks we don't allow a different zoneid than 17853 * global. 17854 */ 17855 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 17856 zoneid != GLOBAL_ZONEID) 17857 return (EINVAL); 17858 17859 /* Set the new zone id. */ 17860 ipif->ipif_zoneid = zoneid; 17861 17862 /* Update sctp list */ 17863 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 17864 17865 if (need_up) { 17866 /* 17867 * Now bring the interface back up. If this 17868 * is the only IPIF for the ILL, ipif_up 17869 * will have to re-bind to the device, so 17870 * we may get back EINPROGRESS, in which 17871 * case, this IOCTL will get completed in 17872 * ip_rput_dlpi when we see the DL_BIND_ACK. 
17873 */ 17874 err = ipif_up(ipif, q, mp); 17875 } 17876 return (err); 17877 } 17878 17879 /* ARGSUSED */ 17880 int 17881 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17882 ip_ioctl_cmd_t *ipip, void *if_req) 17883 { 17884 struct lifreq *lifr = (struct lifreq *)if_req; 17885 zoneid_t zoneid; 17886 zone_t *zptr; 17887 zone_status_t status; 17888 17889 ASSERT(ipif->ipif_id != 0); 17890 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 17891 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 17892 zoneid = GLOBAL_ZONEID; 17893 17894 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 17895 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17896 17897 /* 17898 * We recheck the zone status to resolve the following race condition: 17899 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 17900 * 2) hme0:1 is up and can't be brought down right away; 17901 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 17902 * 3) zone "myzone" is halted; the zone status switches to 17903 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 17904 * the interfaces to remove - hme0:1 is not returned because it's not 17905 * yet in "myzone", so it won't be removed; 17906 * 4) the restart function for SIOCSLIFZONE is called; without the 17907 * status check here, we would have hme0:1 in "myzone" after it's been 17908 * destroyed. 17909 * Note that if the status check fails, we need to bring the interface 17910 * back to its state prior to ip_sioctl_slifzone(), hence the call to 17911 * ipif_up_done[_v6](). 
17912 */ 17913 status = ZONE_IS_UNINITIALIZED; 17914 if ((zptr = zone_find_by_id(zoneid)) != NULL) { 17915 status = zone_status_get(zptr); 17916 zone_rele(zptr); 17917 } 17918 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) { 17919 if (ipif->ipif_isv6) { 17920 (void) ipif_up_done_v6(ipif); 17921 } else { 17922 (void) ipif_up_done(ipif); 17923 } 17924 return (EINVAL); 17925 } 17926 17927 ipif_down_tail(ipif); 17928 17929 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, 17930 B_TRUE)); 17931 } 17932 17933 /* 17934 * Return the number of addresses on `ill' with one or more of the values 17935 * in `set' set and all of the values in `clear' clear. 17936 */ 17937 static uint_t 17938 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear) 17939 { 17940 ipif_t *ipif; 17941 uint_t cnt = 0; 17942 17943 ASSERT(IAM_WRITER_ILL(ill)); 17944 17945 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 17946 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear)) 17947 cnt++; 17948 17949 return (cnt); 17950 } 17951 17952 /* 17953 * Return the number of migratable addresses on `ill' that are under 17954 * application control. 17955 */ 17956 uint_t 17957 ill_appaddr_cnt(const ill_t *ill) 17958 { 17959 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF, 17960 IPIF_NOFAILOVER)); 17961 } 17962 17963 /* 17964 * Return the number of point-to-point addresses on `ill'. 
17965 */ 17966 uint_t 17967 ill_ptpaddr_cnt(const ill_t *ill) 17968 { 17969 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0)); 17970 } 17971 17972 /* ARGSUSED */ 17973 int 17974 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 17975 ip_ioctl_cmd_t *ipip, void *ifreq) 17976 { 17977 struct lifreq *lifr = ifreq; 17978 17979 ASSERT(q->q_next == NULL); 17980 ASSERT(CONN_Q(q)); 17981 17982 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n", 17983 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 17984 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex; 17985 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index)); 17986 17987 return (0); 17988 } 17989 17990 /* Find the previous ILL in this usesrc group */ 17991 static ill_t * 17992 ill_prev_usesrc(ill_t *uill) 17993 { 17994 ill_t *ill; 17995 17996 for (ill = uill->ill_usesrc_grp_next; 17997 ASSERT(ill), ill->ill_usesrc_grp_next != uill; 17998 ill = ill->ill_usesrc_grp_next) 17999 /* do nothing */; 18000 return (ill); 18001 } 18002 18003 /* 18004 * Release all members of the usesrc group. This routine is called 18005 * from ill_delete when the interface being unplumbed is the 18006 * group head. 
18007 */ 18008 static void 18009 ill_disband_usesrc_group(ill_t *uill) 18010 { 18011 ill_t *next_ill, *tmp_ill; 18012 ip_stack_t *ipst = uill->ill_ipst; 18013 18014 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 18015 next_ill = uill->ill_usesrc_grp_next; 18016 18017 do { 18018 ASSERT(next_ill != NULL); 18019 tmp_ill = next_ill->ill_usesrc_grp_next; 18020 ASSERT(tmp_ill != NULL); 18021 next_ill->ill_usesrc_grp_next = NULL; 18022 next_ill->ill_usesrc_ifindex = 0; 18023 next_ill = tmp_ill; 18024 } while (next_ill->ill_usesrc_ifindex != 0); 18025 uill->ill_usesrc_grp_next = NULL; 18026 } 18027 18028 /* 18029 * Remove the client usesrc ILL from the list and relink to a new list 18030 */ 18031 int 18032 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex) 18033 { 18034 ill_t *ill, *tmp_ill; 18035 ip_stack_t *ipst = ucill->ill_ipst; 18036 18037 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) && 18038 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock)); 18039 18040 /* 18041 * Check if the usesrc client ILL passed in is not already 18042 * in use as a usesrc ILL i.e one whose source address is 18043 * in use OR a usesrc ILL is not already in use as a usesrc 18044 * client ILL 18045 */ 18046 if ((ucill->ill_usesrc_ifindex == 0) || 18047 (uill->ill_usesrc_ifindex != 0)) { 18048 return (-1); 18049 } 18050 18051 ill = ill_prev_usesrc(ucill); 18052 ASSERT(ill->ill_usesrc_grp_next != NULL); 18053 18054 /* Remove from the current list */ 18055 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) { 18056 /* Only two elements in the list */ 18057 ASSERT(ill->ill_usesrc_ifindex == 0); 18058 ill->ill_usesrc_grp_next = NULL; 18059 } else { 18060 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next; 18061 } 18062 18063 if (ifindex == 0) { 18064 ucill->ill_usesrc_ifindex = 0; 18065 ucill->ill_usesrc_grp_next = NULL; 18066 return (0); 18067 } 18068 18069 ucill->ill_usesrc_ifindex = ifindex; 18070 tmp_ill = uill->ill_usesrc_grp_next; 
18071 uill->ill_usesrc_grp_next = ucill; 18072 ucill->ill_usesrc_grp_next = 18073 (tmp_ill != NULL) ? tmp_ill : uill; 18074 return (0); 18075 } 18076 18077 /* 18078 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in 18079 * ip.c for locking details. 18080 */ 18081 /* ARGSUSED */ 18082 int 18083 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 18084 ip_ioctl_cmd_t *ipip, void *ifreq) 18085 { 18086 struct lifreq *lifr = (struct lifreq *)ifreq; 18087 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, 18088 ill_flag_changed = B_FALSE; 18089 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 18090 int err = 0, ret; 18091 uint_t ifindex; 18092 ipsq_t *ipsq = NULL; 18093 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 18094 18095 ASSERT(IAM_WRITER_IPIF(ipif)); 18096 ASSERT(q->q_next == NULL); 18097 ASSERT(CONN_Q(q)); 18098 18099 isv6 = (Q_TO_CONN(q))->conn_af_isv6; 18100 18101 ifindex = lifr->lifr_index; 18102 if (ifindex == 0) { 18103 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 18104 /* non usesrc group interface, nothing to reset */ 18105 return (0); 18106 } 18107 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 18108 /* valid reset request */ 18109 reset_flg = B_TRUE; 18110 } 18111 18112 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, 18113 ip_process_ioctl, &err, ipst); 18114 if (usesrc_ill == NULL) { 18115 return (err); 18116 } 18117 18118 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 18119 NEW_OP, B_TRUE); 18120 if (ipsq == NULL) { 18121 err = EINPROGRESS; 18122 /* Operation enqueued on the ipsq of the usesrc ILL */ 18123 goto done; 18124 } 18125 18126 /* USESRC isn't currently supported with IPMP */ 18127 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 18128 err = ENOTSUP; 18129 goto done; 18130 } 18131 18132 /* 18133 * USESRC isn't compatible with the STANDBY flag. 
(STANDBY is only 18134 * used by IPMP underlying interfaces, but someone might think it's 18135 * more general and try to use it independently with VNI.) 18136 */ 18137 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 18138 err = ENOTSUP; 18139 goto done; 18140 } 18141 18142 /* 18143 * If the client is already in use as a usesrc_ill or a usesrc_ill is 18144 * already a client then return EINVAL 18145 */ 18146 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 18147 err = EINVAL; 18148 goto done; 18149 } 18150 18151 /* 18152 * If the ill_usesrc_ifindex field is already set to what it needs to 18153 * be then this is a duplicate operation. 18154 */ 18155 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 18156 err = 0; 18157 goto done; 18158 } 18159 18160 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 18161 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 18162 usesrc_ill->ill_isv6)); 18163 18164 /* 18165 * The next step ensures that no new ires will be created referencing 18166 * the client ill, until the ILL_CHANGING flag is cleared. Then 18167 * we go through an ire walk deleting all ire caches that reference 18168 * the client ill. New ires referencing the client ill that are added 18169 * to the ire table before the ILL_CHANGING flag is set, will be 18170 * cleaned up by the ire walk below. Attempt to add new ires referencing 18171 * the client ill while the ILL_CHANGING flag is set will be failed 18172 * during the ire_add in ire_atomic_start. ire_atomic_start atomically 18173 * checks (under the ill_g_usesrc_lock) that the ire being added 18174 * is not stale, i.e the ire_stq and ire_ipif are consistent and 18175 * belong to the same usesrc group. 
18176 */ 18177 mutex_enter(&usesrc_cli_ill->ill_lock); 18178 usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; 18179 mutex_exit(&usesrc_cli_ill->ill_lock); 18180 ill_flag_changed = B_TRUE; 18181 18182 if (ipif->ipif_isv6) 18183 ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 18184 ALL_ZONES, ipst); 18185 else 18186 ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, 18187 ALL_ZONES, ipst); 18188 18189 /* 18190 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 18191 * and the ill_usesrc_ifindex fields 18192 */ 18193 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 18194 18195 if (reset_flg) { 18196 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 18197 if (ret != 0) { 18198 err = EINVAL; 18199 } 18200 rw_exit(&ipst->ips_ill_g_usesrc_lock); 18201 goto done; 18202 } 18203 18204 /* 18205 * Four possibilities to consider: 18206 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 18207 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 18208 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 18209 * 4. 
Both are part of their respective usesrc groups 18210 */ 18211 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 18212 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 18213 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 18214 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 18215 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 18216 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 18217 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 18218 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 18219 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 18220 /* Insert at head of list */ 18221 usesrc_cli_ill->ill_usesrc_grp_next = 18222 usesrc_ill->ill_usesrc_grp_next; 18223 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 18224 } else { 18225 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 18226 ifindex); 18227 if (ret != 0) 18228 err = EINVAL; 18229 } 18230 rw_exit(&ipst->ips_ill_g_usesrc_lock); 18231 18232 done: 18233 if (ill_flag_changed) { 18234 mutex_enter(&usesrc_cli_ill->ill_lock); 18235 usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; 18236 mutex_exit(&usesrc_cli_ill->ill_lock); 18237 } 18238 if (ipsq != NULL) 18239 ipsq_exit(ipsq); 18240 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 18241 ill_refrele(usesrc_ill); 18242 return (err); 18243 } 18244 18245 /* 18246 * comparison function used by avl. 18247 */ 18248 static int 18249 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 18250 { 18251 18252 uint_t index; 18253 18254 ASSERT(phyip != NULL && index_ptr != NULL); 18255 18256 index = *((uint_t *)index_ptr); 18257 /* 18258 * let the phyint with the lowest index be on top. 18259 */ 18260 if (((phyint_t *)phyip)->phyint_ifindex < index) 18261 return (1); 18262 if (((phyint_t *)phyip)->phyint_ifindex > index) 18263 return (-1); 18264 return (0); 18265 } 18266 18267 /* 18268 * comparison function used by avl. 
18269 */ 18270 static int 18271 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 18272 { 18273 ill_t *ill; 18274 int res = 0; 18275 18276 ASSERT(phyip != NULL && name_ptr != NULL); 18277 18278 if (((phyint_t *)phyip)->phyint_illv4) 18279 ill = ((phyint_t *)phyip)->phyint_illv4; 18280 else 18281 ill = ((phyint_t *)phyip)->phyint_illv6; 18282 ASSERT(ill != NULL); 18283 18284 res = strcmp(ill->ill_name, (char *)name_ptr); 18285 if (res > 0) 18286 return (1); 18287 else if (res < 0) 18288 return (-1); 18289 return (0); 18290 } 18291 18292 /* 18293 * This function is called on the unplumb path via ill_glist_delete() when 18294 * there are no ills left on the phyint and thus the phyint can be freed. 18295 */ 18296 static void 18297 phyint_free(phyint_t *phyi) 18298 { 18299 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 18300 18301 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 18302 18303 /* 18304 * If this phyint was an IPMP meta-interface, blow away the group. 18305 * This is safe to do because all of the illgrps have already been 18306 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 18307 * If we're cleaning up as a result of failed initialization, 18308 * phyint_grp may be NULL. 18309 */ 18310 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 18311 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 18312 ipmp_grp_destroy(phyi->phyint_grp); 18313 phyi->phyint_grp = NULL; 18314 rw_exit(&ipst->ips_ipmp_lock); 18315 } 18316 18317 /* 18318 * If this interface was under IPMP, take it out of the group. 18319 */ 18320 if (phyi->phyint_grp != NULL) 18321 ipmp_phyint_leave_grp(phyi); 18322 18323 /* 18324 * Delete the phyint and disassociate its ipsq. The ipsq itself 18325 * will be freed in ipsq_exit(). 
18326 */ 18327 phyi->phyint_ipsq->ipsq_phyint = NULL; 18328 phyi->phyint_name[0] = '\0'; 18329 18330 mi_free(phyi); 18331 } 18332 18333 /* 18334 * Attach the ill to the phyint structure which can be shared by both 18335 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 18336 * function is called from ipif_set_values and ill_lookup_on_name (for 18337 * loopback) where we know the name of the ill. We lookup the ill and if 18338 * there is one present already with the name use that phyint. Otherwise 18339 * reuse the one allocated by ill_init. 18340 */ 18341 static void 18342 ill_phyint_reinit(ill_t *ill) 18343 { 18344 boolean_t isv6 = ill->ill_isv6; 18345 phyint_t *phyi_old; 18346 phyint_t *phyi; 18347 avl_index_t where = 0; 18348 ill_t *ill_other = NULL; 18349 ip_stack_t *ipst = ill->ill_ipst; 18350 18351 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 18352 18353 phyi_old = ill->ill_phyint; 18354 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 18355 phyi_old->phyint_illv6 == NULL)); 18356 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 18357 phyi_old->phyint_illv4 == NULL)); 18358 ASSERT(phyi_old->phyint_ifindex == 0); 18359 18360 /* 18361 * Now that our ill has a name, set it in the phyint. 18362 */ 18363 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 18364 18365 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18366 ill->ill_name, &where); 18367 18368 /* 18369 * 1. We grabbed the ill_g_lock before inserting this ill into 18370 * the global list of ills. So no other thread could have located 18371 * this ill and hence the ipsq of this ill is guaranteed to be empty. 18372 * 2. Now locate the other protocol instance of this ill. 18373 * 3. Now grab both ill locks in the right order, and the phyint lock of 18374 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 18375 * of neither ill can change. 18376 * 4. 
Merge the phyint and thus the ipsq as well of this ill onto the 18377 * other ill. 18378 * 5. Release all locks. 18379 */ 18380 18381 /* 18382 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 18383 * we are initializing IPv4. 18384 */ 18385 if (phyi != NULL) { 18386 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 18387 ASSERT(ill_other->ill_phyint != NULL); 18388 ASSERT((isv6 && !ill_other->ill_isv6) || 18389 (!isv6 && ill_other->ill_isv6)); 18390 GRAB_ILL_LOCKS(ill, ill_other); 18391 /* 18392 * We are potentially throwing away phyint_flags which 18393 * could be different from the one that we obtain from 18394 * ill_other->ill_phyint. But it is okay as we are assuming 18395 * that the state maintained within IP is correct. 18396 */ 18397 mutex_enter(&phyi->phyint_lock); 18398 if (isv6) { 18399 ASSERT(phyi->phyint_illv6 == NULL); 18400 phyi->phyint_illv6 = ill; 18401 } else { 18402 ASSERT(phyi->phyint_illv4 == NULL); 18403 phyi->phyint_illv4 = ill; 18404 } 18405 18406 /* 18407 * Delete the old phyint and make its ipsq eligible 18408 * to be freed in ipsq_exit(). 18409 */ 18410 phyi_old->phyint_illv4 = NULL; 18411 phyi_old->phyint_illv6 = NULL; 18412 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 18413 phyi_old->phyint_name[0] = '\0'; 18414 mi_free(phyi_old); 18415 } else { 18416 mutex_enter(&ill->ill_lock); 18417 /* 18418 * We don't need to acquire any lock, since 18419 * the ill is not yet visible globally and we 18420 * have not yet released the ill_g_lock. 18421 */ 18422 phyi = phyi_old; 18423 mutex_enter(&phyi->phyint_lock); 18424 /* XXX We need a recovery strategy here. 
*/ 18425 if (!phyint_assign_ifindex(phyi, ipst)) 18426 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 18427 18428 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18429 (void *)phyi, where); 18430 18431 (void) avl_find(&ipst->ips_phyint_g_list-> 18432 phyint_list_avl_by_index, 18433 &phyi->phyint_ifindex, &where); 18434 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 18435 (void *)phyi, where); 18436 } 18437 18438 /* 18439 * Reassigning ill_phyint automatically reassigns the ipsq also. 18440 * pending mp is not affected because that is per ill basis. 18441 */ 18442 ill->ill_phyint = phyi; 18443 18444 /* 18445 * Now that the phyint's ifindex has been assigned, complete the 18446 * remaining 18447 */ 18448 18449 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 18450 if (ill->ill_isv6) { 18451 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 18452 ill->ill_phyint->phyint_ifindex; 18453 ill->ill_mcast_type = ipst->ips_mld_max_version; 18454 } else { 18455 ill->ill_mcast_type = ipst->ips_igmp_max_version; 18456 } 18457 18458 /* 18459 * Generate an event within the hooks framework to indicate that 18460 * a new interface has just been added to IP. For this event to 18461 * be generated, the network interface must, at least, have an 18462 * ifindex assigned to it. (We don't generate the event for 18463 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 18464 * 18465 * This needs to be run inside the ill_g_lock perimeter to ensure 18466 * that the ordering of delivered events to listeners matches the 18467 * order of them in the kernel. 18468 */ 18469 if (!IS_LOOPBACK(ill)) { 18470 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 18471 ill->ill_name_length); 18472 } 18473 RELEASE_ILL_LOCKS(ill, ill_other); 18474 mutex_exit(&phyi->phyint_lock); 18475 } 18476 18477 /* 18478 * Notify any downstream modules of the name of this interface. 
18479 * An M_IOCTL is used even though we don't expect a successful reply. 18480 * Any reply message from the driver (presumably an M_IOCNAK) will 18481 * eventually get discarded somewhere upstream. The message format is 18482 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 18483 * to IP. 18484 */ 18485 static void 18486 ip_ifname_notify(ill_t *ill, queue_t *q) 18487 { 18488 mblk_t *mp1, *mp2; 18489 struct iocblk *iocp; 18490 struct lifreq *lifr; 18491 18492 mp1 = mkiocb(SIOCSLIFNAME); 18493 if (mp1 == NULL) 18494 return; 18495 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 18496 if (mp2 == NULL) { 18497 freeb(mp1); 18498 return; 18499 } 18500 18501 mp1->b_cont = mp2; 18502 iocp = (struct iocblk *)mp1->b_rptr; 18503 iocp->ioc_count = sizeof (struct lifreq); 18504 18505 lifr = (struct lifreq *)mp2->b_rptr; 18506 mp2->b_wptr += sizeof (struct lifreq); 18507 bzero(lifr, sizeof (struct lifreq)); 18508 18509 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 18510 lifr->lifr_ppa = ill->ill_ppa; 18511 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 18512 18513 putnext(q, mp1); 18514 } 18515 18516 static int 18517 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 18518 { 18519 int err; 18520 ip_stack_t *ipst = ill->ill_ipst; 18521 phyint_t *phyi = ill->ill_phyint; 18522 18523 /* Set the obsolete NDD per-interface forwarding name. */ 18524 err = ill_set_ndd_name(ill); 18525 if (err != 0) { 18526 cmn_err(CE_WARN, "ipif_set_values: ill_set_ndd_name (%d)\n", 18527 err); 18528 } 18529 18530 /* 18531 * Now that ill_name is set, the configuration for the IPMP 18532 * meta-interface can be performed. 18533 */ 18534 if (IS_IPMP(ill)) { 18535 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 18536 /* 18537 * If phyi->phyint_grp is NULL, then this is the first IPMP 18538 * meta-interface and we need to create the IPMP group. 
18539 */ 18540 if (phyi->phyint_grp == NULL) { 18541 /* 18542 * If someone has renamed another IPMP group to have 18543 * the same name as our interface, bail. 18544 */ 18545 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 18546 rw_exit(&ipst->ips_ipmp_lock); 18547 return (EEXIST); 18548 } 18549 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 18550 if (phyi->phyint_grp == NULL) { 18551 rw_exit(&ipst->ips_ipmp_lock); 18552 return (ENOMEM); 18553 } 18554 } 18555 rw_exit(&ipst->ips_ipmp_lock); 18556 } 18557 18558 /* Tell downstream modules where they are. */ 18559 ip_ifname_notify(ill, q); 18560 18561 /* 18562 * ill_dl_phys returns EINPROGRESS in the usual case. 18563 * Error cases are ENOMEM ... 18564 */ 18565 err = ill_dl_phys(ill, ipif, mp, q); 18566 18567 /* 18568 * If there is no IRE expiration timer running, get one started. 18569 * igmp and mld timers will be triggered by the first multicast 18570 */ 18571 if (ipst->ips_ip_ire_expire_id == 0) { 18572 /* 18573 * acquire the lock and check again. 
18574 */ 18575 mutex_enter(&ipst->ips_ip_trash_timer_lock); 18576 if (ipst->ips_ip_ire_expire_id == 0) { 18577 ipst->ips_ip_ire_expire_id = timeout( 18578 ip_trash_timer_expire, ipst, 18579 MSEC_TO_TICK(ipst->ips_ip_timer_interval)); 18580 } 18581 mutex_exit(&ipst->ips_ip_trash_timer_lock); 18582 } 18583 18584 if (ill->ill_isv6) { 18585 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 18586 if (ipst->ips_mld_slowtimeout_id == 0) { 18587 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 18588 (void *)ipst, 18589 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 18590 } 18591 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 18592 } else { 18593 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 18594 if (ipst->ips_igmp_slowtimeout_id == 0) { 18595 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 18596 (void *)ipst, 18597 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 18598 } 18599 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 18600 } 18601 18602 return (err); 18603 } 18604 18605 /* 18606 * Common routine for ppa and ifname setting. Should be called exclusive. 18607 * 18608 * Returns EINPROGRESS when mp has been consumed by queueing it on 18609 * ill_pending_mp and the ioctl will complete in ip_rput. 18610 * 18611 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 18612 * the new name and new ppa in lifr_name and lifr_ppa respectively. 18613 * For SLIFNAME, we pass these values back to the userland. 
18614 */ 18615 static int 18616 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 18617 { 18618 ill_t *ill; 18619 ipif_t *ipif; 18620 ipsq_t *ipsq; 18621 char *ppa_ptr; 18622 char *old_ptr; 18623 char old_char; 18624 int error; 18625 ip_stack_t *ipst; 18626 18627 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 18628 ASSERT(q->q_next != NULL); 18629 ASSERT(interf_name != NULL); 18630 18631 ill = (ill_t *)q->q_ptr; 18632 ipst = ill->ill_ipst; 18633 18634 ASSERT(ill->ill_ipst != NULL); 18635 ASSERT(ill->ill_name[0] == '\0'); 18636 ASSERT(IAM_WRITER_ILL(ill)); 18637 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 18638 ASSERT(ill->ill_ppa == UINT_MAX); 18639 18640 /* The ppa is sent down by ifconfig or is chosen */ 18641 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 18642 return (EINVAL); 18643 } 18644 18645 /* 18646 * make sure ppa passed in is same as ppa in the name. 18647 * This check is not made when ppa == UINT_MAX in that case ppa 18648 * in the name could be anything. System will choose a ppa and 18649 * update new_ppa_ptr and inter_name to contain the choosen ppa. 18650 */ 18651 if (*new_ppa_ptr != UINT_MAX) { 18652 /* stoi changes the pointer */ 18653 old_ptr = ppa_ptr; 18654 /* 18655 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 18656 * (they don't have an externally visible ppa). We assign one 18657 * here so that we can manage the interface. Note that in 18658 * the past this value was always 0 for DLPI 1 drivers. 18659 */ 18660 if (*new_ppa_ptr == 0) 18661 *new_ppa_ptr = stoi(&old_ptr); 18662 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 18663 return (EINVAL); 18664 } 18665 /* 18666 * terminate string before ppa 18667 * save char at that location. 
18668 */ 18669 old_char = ppa_ptr[0]; 18670 ppa_ptr[0] = '\0'; 18671 18672 ill->ill_ppa = *new_ppa_ptr; 18673 /* 18674 * Finish as much work now as possible before calling ill_glist_insert 18675 * which makes the ill globally visible and also merges it with the 18676 * other protocol instance of this phyint. The remaining work is 18677 * done after entering the ipsq which may happen sometime later. 18678 * ill_set_ndd_name occurs after the ill has been made globally visible. 18679 */ 18680 ipif = ill->ill_ipif; 18681 18682 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 18683 ipif_assign_seqid(ipif); 18684 18685 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 18686 ill->ill_flags |= ILLF_IPV4; 18687 18688 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 18689 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 18690 18691 if (ill->ill_flags & ILLF_IPV6) { 18692 18693 ill->ill_isv6 = B_TRUE; 18694 if (ill->ill_rq != NULL) { 18695 ill->ill_rq->q_qinfo = &iprinitv6; 18696 ill->ill_wq->q_qinfo = &ipwinitv6; 18697 } 18698 18699 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 18700 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 18701 ipif->ipif_v6src_addr = ipv6_all_zeros; 18702 ipif->ipif_v6subnet = ipv6_all_zeros; 18703 ipif->ipif_v6net_mask = ipv6_all_zeros; 18704 ipif->ipif_v6brd_addr = ipv6_all_zeros; 18705 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 18706 /* 18707 * point-to-point or Non-mulicast capable 18708 * interfaces won't do NUD unless explicitly 18709 * configured to do so. 18710 */ 18711 if (ipif->ipif_flags & IPIF_POINTOPOINT || 18712 !(ill->ill_flags & ILLF_MULTICAST)) { 18713 ill->ill_flags |= ILLF_NONUD; 18714 } 18715 /* Make sure IPv4 specific flag is not set on IPv6 if */ 18716 if (ill->ill_flags & ILLF_NOARP) { 18717 /* 18718 * Note: xresolv interfaces will eventually need 18719 * NOARP set here as well, but that will require 18720 * those external resolvers to have some 18721 * knowledge of that flag and act appropriately. 
18722 * Not to be changed at present. 18723 */ 18724 ill->ill_flags &= ~ILLF_NOARP; 18725 } 18726 /* 18727 * Set the ILLF_ROUTER flag according to the global 18728 * IPv6 forwarding policy. 18729 */ 18730 if (ipst->ips_ipv6_forward != 0) 18731 ill->ill_flags |= ILLF_ROUTER; 18732 } else if (ill->ill_flags & ILLF_IPV4) { 18733 ill->ill_isv6 = B_FALSE; 18734 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 18735 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); 18736 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 18737 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 18738 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 18739 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 18740 /* 18741 * Set the ILLF_ROUTER flag according to the global 18742 * IPv4 forwarding policy. 18743 */ 18744 if (ipst->ips_ip_g_forward != 0) 18745 ill->ill_flags |= ILLF_ROUTER; 18746 } 18747 18748 ASSERT(ill->ill_phyint != NULL); 18749 18750 /* 18751 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 18752 * be completed in ill_glist_insert -> ill_phyint_reinit 18753 */ 18754 if (!ill_allocate_mibs(ill)) 18755 return (ENOMEM); 18756 18757 /* 18758 * Pick a default sap until we get the DL_INFO_ACK back from 18759 * the driver. 18760 */ 18761 if (ill->ill_sap == 0) { 18762 if (ill->ill_isv6) 18763 ill->ill_sap = IP6_DL_SAP; 18764 else 18765 ill->ill_sap = IP_DL_SAP; 18766 } 18767 18768 ill->ill_ifname_pending = 1; 18769 ill->ill_ifname_pending_err = 0; 18770 18771 /* 18772 * When the first ipif comes up in ipif_up_done(), multicast groups 18773 * that were joined while this ill was not bound to the DLPI link need 18774 * to be recovered by ill_recover_multicast(). 
18775 */ 18776 ill->ill_need_recover_multicast = 1; 18777 18778 ill_refhold(ill); 18779 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 18780 if ((error = ill_glist_insert(ill, interf_name, 18781 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 18782 ill->ill_ppa = UINT_MAX; 18783 ill->ill_name[0] = '\0'; 18784 /* 18785 * undo null termination done above. 18786 */ 18787 ppa_ptr[0] = old_char; 18788 rw_exit(&ipst->ips_ill_g_lock); 18789 ill_refrele(ill); 18790 return (error); 18791 } 18792 18793 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 18794 18795 /* 18796 * When we return the buffer pointed to by interf_name should contain 18797 * the same name as in ill_name. 18798 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 18799 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 18800 * so copy full name and update the ppa ptr. 18801 * When ppa passed in != UINT_MAX all values are correct just undo 18802 * null termination, this saves a bcopy. 18803 */ 18804 if (*new_ppa_ptr == UINT_MAX) { 18805 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 18806 *new_ppa_ptr = ill->ill_ppa; 18807 } else { 18808 /* 18809 * undo null termination done above. 18810 */ 18811 ppa_ptr[0] = old_char; 18812 } 18813 18814 /* Let SCTP know about this ILL */ 18815 sctp_update_ill(ill, SCTP_ILL_INSERT); 18816 18817 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_reprocess_ioctl, NEW_OP, 18818 B_TRUE); 18819 18820 rw_exit(&ipst->ips_ill_g_lock); 18821 ill_refrele(ill); 18822 if (ipsq == NULL) 18823 return (EINPROGRESS); 18824 18825 /* 18826 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 
18827 */ 18828 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 18829 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 18830 else 18831 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 18832 18833 error = ipif_set_values_tail(ill, ipif, mp, q); 18834 ipsq_exit(ipsq); 18835 if (error != 0 && error != EINPROGRESS) { 18836 /* 18837 * restore previous values 18838 */ 18839 ill->ill_isv6 = B_FALSE; 18840 } 18841 return (error); 18842 } 18843 18844 void 18845 ipif_init(ip_stack_t *ipst) 18846 { 18847 int i; 18848 18849 for (i = 0; i < MAX_G_HEADS; i++) { 18850 ipst->ips_ill_g_heads[i].ill_g_list_head = 18851 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 18852 ipst->ips_ill_g_heads[i].ill_g_list_tail = 18853 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 18854 } 18855 18856 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 18857 ill_phyint_compare_index, 18858 sizeof (phyint_t), 18859 offsetof(struct phyint, phyint_avl_by_index)); 18860 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 18861 ill_phyint_compare_name, 18862 sizeof (phyint_t), 18863 offsetof(struct phyint, phyint_avl_by_name)); 18864 } 18865 18866 /* 18867 * Lookup the ipif corresponding to the onlink destination address. For 18868 * point-to-point interfaces, it matches with remote endpoint destination 18869 * address. For point-to-multipoint interfaces it only tries to match the 18870 * destination with the interface's subnet address. The longest, most specific 18871 * match is found to take care of such rare network configurations like - 18872 * le0: 129.146.1.1/16 18873 * le1: 129.146.2.2/24 18874 * 18875 * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are 18876 * supported on underlying interfaces in an IPMP group, underlying interfaces 18877 * are ignored when looking up a match. (If we didn't ignore them, we'd 18878 * risk using a test address as a source for outgoing traffic.) 
18879 */ 18880 ipif_t * 18881 ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) 18882 { 18883 ipif_t *ipif, *best_ipif; 18884 ill_t *ill; 18885 ill_walk_context_t ctx; 18886 18887 ASSERT(zoneid != ALL_ZONES); 18888 best_ipif = NULL; 18889 18890 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 18891 ill = ILL_START_WALK_V4(&ctx, ipst); 18892 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 18893 if (IS_UNDER_IPMP(ill)) 18894 continue; 18895 mutex_enter(&ill->ill_lock); 18896 for (ipif = ill->ill_ipif; ipif != NULL; 18897 ipif = ipif->ipif_next) { 18898 if (!IPIF_CAN_LOOKUP(ipif)) 18899 continue; 18900 if (ipif->ipif_zoneid != zoneid && 18901 ipif->ipif_zoneid != ALL_ZONES) 18902 continue; 18903 /* 18904 * Point-to-point case. Look for exact match with 18905 * destination address. 18906 */ 18907 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 18908 if (ipif->ipif_pp_dst_addr == addr) { 18909 ipif_refhold_locked(ipif); 18910 mutex_exit(&ill->ill_lock); 18911 rw_exit(&ipst->ips_ill_g_lock); 18912 if (best_ipif != NULL) 18913 ipif_refrele(best_ipif); 18914 return (ipif); 18915 } 18916 } else if (ipif->ipif_subnet == (addr & 18917 ipif->ipif_net_mask)) { 18918 /* 18919 * Point-to-multipoint case. Looping through to 18920 * find the most specific match. If there are 18921 * multiple best match ipif's then prefer ipif's 18922 * that are UP. If there is only one best match 18923 * ipif and it is DOWN we must still return it. 
18924 */ 18925 if ((best_ipif == NULL) || 18926 (ipif->ipif_net_mask > 18927 best_ipif->ipif_net_mask) || 18928 ((ipif->ipif_net_mask == 18929 best_ipif->ipif_net_mask) && 18930 ((ipif->ipif_flags & IPIF_UP) && 18931 (!(best_ipif->ipif_flags & IPIF_UP))))) { 18932 ipif_refhold_locked(ipif); 18933 mutex_exit(&ill->ill_lock); 18934 rw_exit(&ipst->ips_ill_g_lock); 18935 if (best_ipif != NULL) 18936 ipif_refrele(best_ipif); 18937 best_ipif = ipif; 18938 rw_enter(&ipst->ips_ill_g_lock, 18939 RW_READER); 18940 mutex_enter(&ill->ill_lock); 18941 } 18942 } 18943 } 18944 mutex_exit(&ill->ill_lock); 18945 } 18946 rw_exit(&ipst->ips_ill_g_lock); 18947 return (best_ipif); 18948 } 18949 18950 /* 18951 * Save enough information so that we can recreate the IRE if 18952 * the interface goes down and then up. 18953 */ 18954 static void 18955 ipif_save_ire(ipif_t *ipif, ire_t *ire) 18956 { 18957 mblk_t *save_mp; 18958 18959 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 18960 if (save_mp != NULL) { 18961 ifrt_t *ifrt; 18962 18963 save_mp->b_wptr += sizeof (ifrt_t); 18964 ifrt = (ifrt_t *)save_mp->b_rptr; 18965 bzero(ifrt, sizeof (ifrt_t)); 18966 ifrt->ifrt_type = ire->ire_type; 18967 ifrt->ifrt_addr = ire->ire_addr; 18968 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 18969 ifrt->ifrt_src_addr = ire->ire_src_addr; 18970 ifrt->ifrt_mask = ire->ire_mask; 18971 ifrt->ifrt_flags = ire->ire_flags; 18972 ifrt->ifrt_max_frag = ire->ire_max_frag; 18973 mutex_enter(&ipif->ipif_saved_ire_lock); 18974 save_mp->b_cont = ipif->ipif_saved_ire_mp; 18975 ipif->ipif_saved_ire_mp = save_mp; 18976 ipif->ipif_saved_ire_cnt++; 18977 mutex_exit(&ipif->ipif_saved_ire_lock); 18978 } 18979 } 18980 18981 static void 18982 ipif_remove_ire(ipif_t *ipif, ire_t *ire) 18983 { 18984 mblk_t **mpp; 18985 mblk_t *mp; 18986 ifrt_t *ifrt; 18987 18988 /* Remove from ipif_saved_ire_mp list if it is there */ 18989 mutex_enter(&ipif->ipif_saved_ire_lock); 18990 for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; 18991 mpp = 
&(*mpp)->b_cont) { 18992 /* 18993 * On a given ipif, the triple of address, gateway and 18994 * mask is unique for each saved IRE (in the case of 18995 * ordinary interface routes, the gateway address is 18996 * all-zeroes). 18997 */ 18998 mp = *mpp; 18999 ifrt = (ifrt_t *)mp->b_rptr; 19000 if (ifrt->ifrt_addr == ire->ire_addr && 19001 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 19002 ifrt->ifrt_mask == ire->ire_mask) { 19003 *mpp = mp->b_cont; 19004 ipif->ipif_saved_ire_cnt--; 19005 freeb(mp); 19006 break; 19007 } 19008 } 19009 mutex_exit(&ipif->ipif_saved_ire_lock); 19010 } 19011 19012 /* 19013 * IP multirouting broadcast routes handling 19014 * Append CGTP broadcast IREs to regular ones created 19015 * at ifconfig time. 19016 */ 19017 static void 19018 ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) 19019 { 19020 ire_t *ire_prim; 19021 19022 ASSERT(ire != NULL); 19023 ASSERT(ire_dst != NULL); 19024 19025 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 19026 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19027 if (ire_prim != NULL) { 19028 /* 19029 * We are in the special case of broadcasts for 19030 * CGTP. We add an IRE_BROADCAST that holds 19031 * the RTF_MULTIRT flag, the destination 19032 * address of ire_dst and the low level 19033 * info of ire_prim. In other words, CGTP 19034 * broadcast is added to the redundant ipif. 
19035 */ 19036 ipif_t *ipif_prim; 19037 ire_t *bcast_ire; 19038 19039 ipif_prim = ire_prim->ire_ipif; 19040 19041 ip2dbg(("ip_cgtp_filter_bcast_add: " 19042 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 19043 (void *)ire_dst, (void *)ire_prim, 19044 (void *)ipif_prim)); 19045 19046 bcast_ire = ire_create( 19047 (uchar_t *)&ire->ire_addr, 19048 (uchar_t *)&ip_g_all_ones, 19049 (uchar_t *)&ire_dst->ire_src_addr, 19050 (uchar_t *)&ire->ire_gateway_addr, 19051 &ipif_prim->ipif_mtu, 19052 NULL, 19053 ipif_prim->ipif_rq, 19054 ipif_prim->ipif_wq, 19055 IRE_BROADCAST, 19056 ipif_prim, 19057 0, 19058 0, 19059 0, 19060 ire->ire_flags, 19061 &ire_uinfo_null, 19062 NULL, 19063 NULL, 19064 ipst); 19065 19066 if (bcast_ire != NULL) { 19067 19068 if (ire_add(&bcast_ire, NULL, NULL, NULL, 19069 B_FALSE) == 0) { 19070 ip2dbg(("ip_cgtp_filter_bcast_add: " 19071 "added bcast_ire %p\n", 19072 (void *)bcast_ire)); 19073 19074 ipif_save_ire(bcast_ire->ire_ipif, 19075 bcast_ire); 19076 ire_refrele(bcast_ire); 19077 } 19078 } 19079 ire_refrele(ire_prim); 19080 } 19081 } 19082 19083 /* 19084 * IP multirouting broadcast routes handling 19085 * Remove the broadcast ire 19086 */ 19087 static void 19088 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 19089 { 19090 ire_t *ire_dst; 19091 19092 ASSERT(ire != NULL); 19093 ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, 19094 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19095 if (ire_dst != NULL) { 19096 ire_t *ire_prim; 19097 19098 ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, 19099 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); 19100 if (ire_prim != NULL) { 19101 ipif_t *ipif_prim; 19102 ire_t *bcast_ire; 19103 19104 ipif_prim = ire_prim->ire_ipif; 19105 19106 ip2dbg(("ip_cgtp_filter_bcast_delete: " 19107 "ire_dst %p, ire_prim %p, ipif_prim %p\n", 19108 (void *)ire_dst, (void *)ire_prim, 19109 (void *)ipif_prim)); 19110 19111 bcast_ire = ire_ctable_lookup(ire->ire_addr, 19112 ire->ire_gateway_addr, 19113 
IRE_BROADCAST, 19114 ipif_prim, ALL_ZONES, 19115 NULL, 19116 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | 19117 MATCH_IRE_MASK, ipst); 19118 19119 if (bcast_ire != NULL) { 19120 ip2dbg(("ip_cgtp_filter_bcast_delete: " 19121 "looked up bcast_ire %p\n", 19122 (void *)bcast_ire)); 19123 ipif_remove_ire(bcast_ire->ire_ipif, 19124 bcast_ire); 19125 ire_delete(bcast_ire); 19126 ire_refrele(bcast_ire); 19127 } 19128 ire_refrele(ire_prim); 19129 } 19130 ire_refrele(ire_dst); 19131 } 19132 } 19133 19134 /* 19135 * IPsec hardware acceleration capabilities related functions. 19136 */ 19137 19138 /* 19139 * Free a per-ill IPsec capabilities structure. 19140 */ 19141 static void 19142 ill_ipsec_capab_free(ill_ipsec_capab_t *capab) 19143 { 19144 if (capab->auth_hw_algs != NULL) 19145 kmem_free(capab->auth_hw_algs, capab->algs_size); 19146 if (capab->encr_hw_algs != NULL) 19147 kmem_free(capab->encr_hw_algs, capab->algs_size); 19148 if (capab->encr_algparm != NULL) 19149 kmem_free(capab->encr_algparm, capab->encr_algparm_size); 19150 kmem_free(capab, sizeof (ill_ipsec_capab_t)); 19151 } 19152 19153 /* 19154 * Allocate a new per-ill IPsec capabilities structure. This structure 19155 * is specific to an IPsec protocol (AH or ESP). It is implemented as 19156 * an array which specifies, for each algorithm, whether this algorithm 19157 * is supported by the ill or not. 
19158 */ 19159 static ill_ipsec_capab_t * 19160 ill_ipsec_capab_alloc(void) 19161 { 19162 ill_ipsec_capab_t *capab; 19163 uint_t nelems; 19164 19165 capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); 19166 if (capab == NULL) 19167 return (NULL); 19168 19169 /* we need one bit per algorithm */ 19170 nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); 19171 capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); 19172 19173 /* allocate memory to store algorithm flags */ 19174 capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 19175 if (capab->encr_hw_algs == NULL) 19176 goto nomem; 19177 capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); 19178 if (capab->auth_hw_algs == NULL) 19179 goto nomem; 19180 /* 19181 * Leave encr_algparm NULL for now since we won't need it half 19182 * the time 19183 */ 19184 return (capab); 19185 19186 nomem: 19187 ill_ipsec_capab_free(capab); 19188 return (NULL); 19189 } 19190 19191 /* 19192 * Resize capability array. Since we're exclusive, this is OK. 19193 */ 19194 static boolean_t 19195 ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) 19196 { 19197 ipsec_capab_algparm_t *nalp, *oalp; 19198 uint32_t olen, nlen; 19199 19200 oalp = capab->encr_algparm; 19201 olen = capab->encr_algparm_size; 19202 19203 if (oalp != NULL) { 19204 if (algid < capab->encr_algparm_end) 19205 return (B_TRUE); 19206 } 19207 19208 nlen = (algid + 1) * sizeof (*nalp); 19209 nalp = kmem_zalloc(nlen, KM_NOSLEEP); 19210 if (nalp == NULL) 19211 return (B_FALSE); 19212 19213 if (oalp != NULL) { 19214 bcopy(oalp, nalp, olen); 19215 kmem_free(oalp, olen); 19216 } 19217 capab->encr_algparm = nalp; 19218 capab->encr_algparm_size = nlen; 19219 capab->encr_algparm_end = algid + 1; 19220 19221 return (B_TRUE); 19222 } 19223 19224 /* 19225 * Compare the capabilities of the specified ill with the protocol 19226 * and algorithms specified by the SA passed as argument. 
19227 * If they match, returns B_TRUE, B_FALSE if they do not match. 19228 * 19229 * The ill can be passed as a pointer to it, or by specifying its index 19230 * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). 19231 * 19232 * Called by ipsec_out_is_accelerated() do decide whether an outbound 19233 * packet is eligible for hardware acceleration, and by 19234 * ill_ipsec_capab_send_all() to decide whether a SA must be sent down 19235 * to a particular ill. 19236 */ 19237 boolean_t 19238 ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, 19239 ipsa_t *sa, netstack_t *ns) 19240 { 19241 boolean_t sa_isv6; 19242 uint_t algid; 19243 struct ill_ipsec_capab_s *cpp; 19244 boolean_t need_refrele = B_FALSE; 19245 ip_stack_t *ipst = ns->netstack_ip; 19246 19247 if (ill == NULL) { 19248 ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, 19249 NULL, NULL, NULL, ipst); 19250 if (ill == NULL) { 19251 ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); 19252 return (B_FALSE); 19253 } 19254 need_refrele = B_TRUE; 19255 } 19256 19257 /* 19258 * Use the address length specified by the SA to determine 19259 * if it corresponds to a IPv6 address, and fail the matching 19260 * if the isv6 flag passed as argument does not match. 19261 * Note: this check is used for SADB capability checking before 19262 * sending SA information to an ill. 19263 */ 19264 sa_isv6 = (sa->ipsa_addrfam == AF_INET6); 19265 if (sa_isv6 != ill_isv6) 19266 /* protocol mismatch */ 19267 goto done; 19268 19269 /* 19270 * Check if the ill supports the protocol, algorithm(s) and 19271 * key size(s) specified by the SA, and get the pointers to 19272 * the algorithms supported by the ill. 
19273 */ 19274 switch (sa->ipsa_type) { 19275 19276 case SADB_SATYPE_ESP: 19277 if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) 19278 /* ill does not support ESP acceleration */ 19279 goto done; 19280 cpp = ill->ill_ipsec_capab_esp; 19281 algid = sa->ipsa_auth_alg; 19282 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) 19283 goto done; 19284 algid = sa->ipsa_encr_alg; 19285 if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) 19286 goto done; 19287 if (algid < cpp->encr_algparm_end) { 19288 ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; 19289 if (sa->ipsa_encrkeybits < alp->minkeylen) 19290 goto done; 19291 if (sa->ipsa_encrkeybits > alp->maxkeylen) 19292 goto done; 19293 } 19294 break; 19295 19296 case SADB_SATYPE_AH: 19297 if (!(ill->ill_capabilities & ILL_CAPAB_AH)) 19298 /* ill does not support AH acceleration */ 19299 goto done; 19300 if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, 19301 ill->ill_ipsec_capab_ah->auth_hw_algs)) 19302 goto done; 19303 break; 19304 } 19305 19306 if (need_refrele) 19307 ill_refrele(ill); 19308 return (B_TRUE); 19309 done: 19310 if (need_refrele) 19311 ill_refrele(ill); 19312 return (B_FALSE); 19313 } 19314 19315 /* 19316 * Add a new ill to the list of IPsec capable ills. 19317 * Called from ill_capability_ipsec_ack() when an ACK was received 19318 * indicating that IPsec hardware processing was enabled for an ill. 19319 * 19320 * ill must point to the ill for which acceleration was enabled. 19321 * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
19322 */ 19323 static void 19324 ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) 19325 { 19326 ipsec_capab_ill_t **ills, *cur_ill, *new_ill; 19327 uint_t sa_type; 19328 uint_t ipproto; 19329 ip_stack_t *ipst = ill->ill_ipst; 19330 19331 ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || 19332 (dl_cap == DL_CAPAB_IPSEC_ESP)); 19333 19334 switch (dl_cap) { 19335 case DL_CAPAB_IPSEC_AH: 19336 sa_type = SADB_SATYPE_AH; 19337 ills = &ipst->ips_ipsec_capab_ills_ah; 19338 ipproto = IPPROTO_AH; 19339 break; 19340 case DL_CAPAB_IPSEC_ESP: 19341 sa_type = SADB_SATYPE_ESP; 19342 ills = &ipst->ips_ipsec_capab_ills_esp; 19343 ipproto = IPPROTO_ESP; 19344 break; 19345 } 19346 19347 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 19348 19349 /* 19350 * Add ill index to list of hardware accelerators. If 19351 * already in list, do nothing. 19352 */ 19353 for (cur_ill = *ills; cur_ill != NULL && 19354 (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || 19355 cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) 19356 ; 19357 19358 if (cur_ill == NULL) { 19359 /* if this is a new entry for this ill */ 19360 new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); 19361 if (new_ill == NULL) { 19362 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19363 return; 19364 } 19365 19366 new_ill->ill_index = ill->ill_phyint->phyint_ifindex; 19367 new_ill->ill_isv6 = ill->ill_isv6; 19368 new_ill->next = *ills; 19369 *ills = new_ill; 19370 } else if (!sadb_resync) { 19371 /* not resync'ing SADB and an entry exists for this ill */ 19372 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19373 return; 19374 } 19375 19376 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19377 19378 if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL) 19379 /* 19380 * IPsec module for protocol loaded, initiate dump 19381 * of the SADB to this ill. 19382 */ 19383 sadb_ill_download(ill, sa_type); 19384 } 19385 19386 /* 19387 * Remove an ill from the list of IPsec capable ills. 
19388 */ 19389 static void 19390 ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) 19391 { 19392 ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; 19393 ip_stack_t *ipst = ill->ill_ipst; 19394 19395 ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || 19396 dl_cap == DL_CAPAB_IPSEC_ESP); 19397 19398 ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah : 19399 &ipst->ips_ipsec_capab_ills_esp; 19400 19401 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); 19402 19403 prev_ill = NULL; 19404 for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != 19405 ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != 19406 ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) 19407 ; 19408 if (cur_ill == NULL) { 19409 /* entry not found */ 19410 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19411 return; 19412 } 19413 if (prev_ill == NULL) { 19414 /* entry at front of list */ 19415 *ills = NULL; 19416 } else { 19417 prev_ill->next = cur_ill->next; 19418 } 19419 kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); 19420 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19421 } 19422 19423 /* 19424 * Called by SADB to send a DL_CONTROL_REQ message to every ill 19425 * supporting the specified IPsec protocol acceleration. 19426 * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. 19427 * We free the mblk and, if sa is non-null, release the held referece. 19428 */ 19429 void 19430 ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, 19431 netstack_t *ns) 19432 { 19433 ipsec_capab_ill_t *ici, *cur_ici; 19434 ill_t *ill; 19435 mblk_t *nmp, *mp_ship_list = NULL, *next_mp; 19436 ip_stack_t *ipst = ns->netstack_ip; 19437 19438 ici = (sa_type == SADB_SATYPE_AH) ? 
ipst->ips_ipsec_capab_ills_ah : 19439 ipst->ips_ipsec_capab_ills_esp; 19440 19441 rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER); 19442 19443 for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { 19444 ill = ill_lookup_on_ifindex(cur_ici->ill_index, 19445 cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst); 19446 19447 /* 19448 * Handle the case where the ill goes away while the SADB is 19449 * attempting to send messages. If it's going away, it's 19450 * nuking its shadow SADB, so we don't care.. 19451 */ 19452 19453 if (ill == NULL) 19454 continue; 19455 19456 if (sa != NULL) { 19457 /* 19458 * Make sure capabilities match before 19459 * sending SA to ill. 19460 */ 19461 if (!ipsec_capab_match(ill, cur_ici->ill_index, 19462 cur_ici->ill_isv6, sa, ipst->ips_netstack)) { 19463 ill_refrele(ill); 19464 continue; 19465 } 19466 19467 mutex_enter(&sa->ipsa_lock); 19468 sa->ipsa_flags |= IPSA_F_HW; 19469 mutex_exit(&sa->ipsa_lock); 19470 } 19471 19472 /* 19473 * Copy template message, and add it to the front 19474 * of the mblk ship list. We want to avoid holding 19475 * the ipsec_capab_ills_lock while sending the 19476 * message to the ills. 19477 * 19478 * The b_next and b_prev are temporarily used 19479 * to build a list of mblks to be sent down, and to 19480 * save the ill to which they must be sent. 
19481 */ 19482 nmp = copymsg(mp); 19483 if (nmp == NULL) { 19484 ill_refrele(ill); 19485 continue; 19486 } 19487 ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); 19488 nmp->b_next = mp_ship_list; 19489 mp_ship_list = nmp; 19490 nmp->b_prev = (mblk_t *)ill; 19491 } 19492 19493 rw_exit(&ipst->ips_ipsec_capab_ills_lock); 19494 19495 for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) { 19496 /* restore the mblk to a sane state */ 19497 next_mp = nmp->b_next; 19498 nmp->b_next = NULL; 19499 ill = (ill_t *)nmp->b_prev; 19500 nmp->b_prev = NULL; 19501 19502 ill_dlpi_send(ill, nmp); 19503 ill_refrele(ill); 19504 } 19505 19506 if (sa != NULL) 19507 IPSA_REFRELE(sa); 19508 freemsg(mp); 19509 } 19510 19511 /* 19512 * Derive an interface id from the link layer address. 19513 * Knows about IEEE 802 and IEEE EUI-64 mappings. 19514 */ 19515 static boolean_t 19516 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19517 { 19518 char *addr; 19519 19520 if (ill->ill_phys_addr_length != ETHERADDRL) 19521 return (B_FALSE); 19522 19523 /* Form EUI-64 like address */ 19524 addr = (char *)&v6addr->s6_addr32[2]; 19525 bcopy(ill->ill_phys_addr, addr, 3); 19526 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 19527 addr[3] = (char)0xff; 19528 addr[4] = (char)0xfe; 19529 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 19530 return (B_TRUE); 19531 } 19532 19533 /* ARGSUSED */ 19534 static boolean_t 19535 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19536 { 19537 return (B_FALSE); 19538 } 19539 19540 typedef struct ipmp_ifcookie { 19541 uint32_t ic_hostid; 19542 char ic_ifname[LIFNAMSIZ]; 19543 char ic_zonename[ZONENAME_MAX]; 19544 } ipmp_ifcookie_t; 19545 19546 /* 19547 * Construct a pseudo-random interface ID for the IPMP interface that's both 19548 * predictable and (almost) guaranteed to be unique. 
19549 */ 19550 static boolean_t 19551 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19552 { 19553 zone_t *zp; 19554 uint8_t *addr; 19555 uchar_t hash[16]; 19556 ulong_t hostid; 19557 MD5_CTX ctx; 19558 ipmp_ifcookie_t ic = { 0 }; 19559 19560 ASSERT(IS_IPMP(ill)); 19561 19562 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 19563 ic.ic_hostid = htonl((uint32_t)hostid); 19564 19565 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 19566 19567 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 19568 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 19569 zone_rele(zp); 19570 } 19571 19572 MD5Init(&ctx); 19573 MD5Update(&ctx, &ic, sizeof (ic)); 19574 MD5Final(hash, &ctx); 19575 19576 /* 19577 * Map the hash to an interface ID per the basic approach in RFC3041. 19578 */ 19579 addr = &v6addr->s6_addr8[8]; 19580 bcopy(hash + 8, addr, sizeof (uint64_t)); 19581 addr[0] &= ~0x2; /* set local bit */ 19582 19583 return (B_TRUE); 19584 } 19585 19586 /* ARGSUSED */ 19587 static boolean_t 19588 ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19589 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19590 { 19591 /* 19592 * Multicast address mappings used over Ethernet/802.X. 19593 * This address is used as a base for mappings. 19594 */ 19595 static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, 19596 0x00, 0x00, 0x00}; 19597 19598 /* 19599 * Extract low order 32 bits from IPv6 multicast address. 19600 * Or that into the link layer address, starting from the 19601 * second byte. 19602 */ 19603 *hw_start = 2; 19604 v6_extract_mask->s6_addr32[0] = 0; 19605 v6_extract_mask->s6_addr32[1] = 0; 19606 v6_extract_mask->s6_addr32[2] = 0; 19607 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 19608 bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); 19609 return (B_TRUE); 19610 } 19611 19612 /* 19613 * Indicate by return value whether multicast is supported. If not, 19614 * this code should not touch/change any parameters. 
19615 */ 19616 /* ARGSUSED */ 19617 static boolean_t 19618 ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19619 uint32_t *hw_start, ipaddr_t *extract_mask) 19620 { 19621 /* 19622 * Multicast address mappings used over Ethernet/802.X. 19623 * This address is used as a base for mappings. 19624 */ 19625 static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, 19626 0x00, 0x00, 0x00 }; 19627 19628 if (phys_length != ETHERADDRL) 19629 return (B_FALSE); 19630 19631 *extract_mask = htonl(0x007fffff); 19632 *hw_start = 2; 19633 bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); 19634 return (B_TRUE); 19635 } 19636 19637 /* 19638 * Derive IPoIB interface id from the link layer address. 19639 */ 19640 static boolean_t 19641 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 19642 { 19643 char *addr; 19644 19645 if (ill->ill_phys_addr_length != 20) 19646 return (B_FALSE); 19647 addr = (char *)&v6addr->s6_addr32[2]; 19648 bcopy(ill->ill_phys_addr + 12, addr, 8); 19649 /* 19650 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 19651 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 19652 * rules. In these cases, the IBA considers these GUIDs to be in 19653 * "Modified EUI-64" format, and thus toggling the u/l bit is not 19654 * required; vendors are required not to assign global EUI-64's 19655 * that differ only in u/l bit values, thus guaranteeing uniqueness 19656 * of the interface identifier. Whether the GUID is in modified 19657 * or proper EUI-64 format, the ipv6 identifier must have the u/l 19658 * bit set to 1. 19659 */ 19660 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 19661 return (B_TRUE); 19662 } 19663 19664 /* 19665 * Note on mapping from multicast IP addresses to IPoIB multicast link 19666 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 19667 * The format of an IPoIB multicast address is: 19668 * 19669 * 4 byte QPN Scope Sign. 
Pkey 19670 * +--------------------------------------------+ 19671 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 19672 * +--------------------------------------------+ 19673 * 19674 * The Scope and Pkey components are properties of the IBA port and 19675 * network interface. They can be ascertained from the broadcast address. 19676 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 19677 */ 19678 19679 static boolean_t 19680 ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, 19681 uint32_t *hw_start, in6_addr_t *v6_extract_mask) 19682 { 19683 /* 19684 * Base IPoIB IPv6 multicast address used for mappings. 19685 * Does not contain the IBA scope/Pkey values. 19686 */ 19687 static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 19688 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 19689 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 19690 19691 /* 19692 * Extract low order 80 bits from IPv6 multicast address. 19693 * Or that into the link layer address, starting from the 19694 * sixth byte. 19695 */ 19696 *hw_start = 6; 19697 bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); 19698 19699 /* 19700 * Now fill in the IBA scope/Pkey values from the broadcast address. 19701 */ 19702 *(maddr + 5) = *(bphys_addr + 5); 19703 *(maddr + 8) = *(bphys_addr + 8); 19704 *(maddr + 9) = *(bphys_addr + 9); 19705 19706 v6_extract_mask->s6_addr32[0] = 0; 19707 v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); 19708 v6_extract_mask->s6_addr32[2] = 0xffffffffU; 19709 v6_extract_mask->s6_addr32[3] = 0xffffffffU; 19710 return (B_TRUE); 19711 } 19712 19713 static boolean_t 19714 ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, 19715 uint32_t *hw_start, ipaddr_t *extract_mask) 19716 { 19717 /* 19718 * Base IPoIB IPv4 multicast address used for mappings. 19719 * Does not contain the IBA scope/Pkey values. 
19720 */ 19721 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 19722 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 19723 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 19724 19725 if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) 19726 return (B_FALSE); 19727 19728 /* 19729 * Extract low order 28 bits from IPv4 multicast address. 19730 * Or that into the link layer address, starting from the 19731 * sixteenth byte. 19732 */ 19733 *extract_mask = htonl(0x0fffffff); 19734 *hw_start = 16; 19735 bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); 19736 19737 /* 19738 * Now fill in the IBA scope/Pkey values from the broadcast address. 19739 */ 19740 *(maddr + 5) = *(bphys_addr + 5); 19741 *(maddr + 8) = *(bphys_addr + 8); 19742 *(maddr + 9) = *(bphys_addr + 9); 19743 return (B_TRUE); 19744 } 19745 19746 /* 19747 * Returns B_TRUE if an ipif is present in the given zone, matching some flags 19748 * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. 19749 * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with 19750 * the link-local address is preferred. 
19751 */ 19752 boolean_t 19753 ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) 19754 { 19755 ipif_t *ipif; 19756 ipif_t *maybe_ipif = NULL; 19757 19758 mutex_enter(&ill->ill_lock); 19759 if (ill->ill_state_flags & ILL_CONDEMNED) { 19760 mutex_exit(&ill->ill_lock); 19761 if (ipifp != NULL) 19762 *ipifp = NULL; 19763 return (B_FALSE); 19764 } 19765 19766 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19767 if (!IPIF_CAN_LOOKUP(ipif)) 19768 continue; 19769 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 19770 ipif->ipif_zoneid != ALL_ZONES) 19771 continue; 19772 if ((ipif->ipif_flags & flags) != flags) 19773 continue; 19774 19775 if (ipifp == NULL) { 19776 mutex_exit(&ill->ill_lock); 19777 ASSERT(maybe_ipif == NULL); 19778 return (B_TRUE); 19779 } 19780 if (!ill->ill_isv6 || 19781 IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { 19782 ipif_refhold_locked(ipif); 19783 mutex_exit(&ill->ill_lock); 19784 *ipifp = ipif; 19785 return (B_TRUE); 19786 } 19787 if (maybe_ipif == NULL) 19788 maybe_ipif = ipif; 19789 } 19790 if (ipifp != NULL) { 19791 if (maybe_ipif != NULL) 19792 ipif_refhold_locked(maybe_ipif); 19793 *ipifp = maybe_ipif; 19794 } 19795 mutex_exit(&ill->ill_lock); 19796 return (maybe_ipif != NULL); 19797 } 19798 19799 /* 19800 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 19801 * If a pointer to an ipif_t is returned then the caller will need to do 19802 * an ill_refrele(). 
19803 */ 19804 ipif_t * 19805 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, 19806 ip_stack_t *ipst) 19807 { 19808 ipif_t *ipif; 19809 ill_t *ill; 19810 19811 ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, 19812 ipst); 19813 if (ill == NULL) 19814 return (NULL); 19815 19816 mutex_enter(&ill->ill_lock); 19817 if (ill->ill_state_flags & ILL_CONDEMNED) { 19818 mutex_exit(&ill->ill_lock); 19819 ill_refrele(ill); 19820 return (NULL); 19821 } 19822 19823 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 19824 if (!IPIF_CAN_LOOKUP(ipif)) 19825 continue; 19826 if (lifidx == ipif->ipif_id) { 19827 ipif_refhold_locked(ipif); 19828 break; 19829 } 19830 } 19831 19832 mutex_exit(&ill->ill_lock); 19833 ill_refrele(ill); 19834 return (ipif); 19835 } 19836 19837 /* 19838 * Flush the fastpath by deleting any nce's that are waiting for the fastpath, 19839 * There is one exceptions IRE_BROADCAST are difficult to recreate, 19840 * so instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush() 19841 * for details. 19842 */ 19843 void 19844 ill_fastpath_flush(ill_t *ill) 19845 { 19846 ip_stack_t *ipst = ill->ill_ipst; 19847 19848 nce_fastpath_list_dispatch(ill, NULL, NULL); 19849 ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4), 19850 ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE); 19851 } 19852 19853 /* 19854 * Set the physical address information for `ill' to the contents of the 19855 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 19856 * asynchronous if `ill' cannot immediately be quiesced -- in which case 19857 * EINPROGRESS will be returned. 
19858 */ 19859 int 19860 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 19861 { 19862 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19863 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 19864 19865 ASSERT(IAM_WRITER_IPSQ(ipsq)); 19866 19867 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 19868 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 19869 /* Changing DL_IPV6_TOKEN is not yet supported */ 19870 return (0); 19871 } 19872 19873 /* 19874 * We need to store up to two copies of `mp' in `ill'. Due to the 19875 * design of ipsq_pending_mp_add(), we can't pass them as separate 19876 * arguments to ill_set_phys_addr_tail(). Instead, chain them 19877 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 19878 */ 19879 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 19880 freemsg(mp); 19881 return (ENOMEM); 19882 } 19883 19884 ipsq_current_start(ipsq, ill->ill_ipif, 0); 19885 19886 /* 19887 * If we can quiesce the ill, then set the address. If not, then 19888 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 19889 */ 19890 ill_down_ipifs(ill, B_TRUE); 19891 mutex_enter(&ill->ill_lock); 19892 if (!ill_is_quiescent(ill)) { 19893 /* call cannot fail since `conn_t *' argument is NULL */ 19894 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 19895 mp, ILL_DOWN); 19896 mutex_exit(&ill->ill_lock); 19897 return (EINPROGRESS); 19898 } 19899 mutex_exit(&ill->ill_lock); 19900 19901 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 19902 return (0); 19903 } 19904 19905 /* 19906 * Once the ill associated with `q' has quiesced, set its physical address 19907 * information to the values in `addrmp'. Note that two copies of `addrmp' 19908 * are passed (linked by b_cont), since we sometimes need to save two distinct 19909 * copies in the ill_t, and our context doesn't permit sleeping or allocation 19910 * failure (we'll free the other copy if it's not needed). 
Since the ill_t 19911 * is quiesced, we know any stale IREs with the old address information have 19912 * already been removed, so we don't need to call ill_fastpath_flush(). 19913 */ 19914 /* ARGSUSED */ 19915 static void 19916 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 19917 { 19918 ill_t *ill = q->q_ptr; 19919 mblk_t *addrmp2 = unlinkb(addrmp); 19920 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 19921 uint_t addrlen, addroff; 19922 19923 ASSERT(IAM_WRITER_IPSQ(ipsq)); 19924 19925 addroff = dlindp->dl_addr_offset; 19926 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 19927 19928 switch (dlindp->dl_data) { 19929 case DL_IPV6_LINK_LAYER_ADDR: 19930 ill_set_ndmp(ill, addrmp, addroff, addrlen); 19931 freemsg(addrmp2); 19932 break; 19933 19934 case DL_CURR_PHYS_ADDR: 19935 freemsg(ill->ill_phys_addr_mp); 19936 ill->ill_phys_addr = addrmp->b_rptr + addroff; 19937 ill->ill_phys_addr_mp = addrmp; 19938 ill->ill_phys_addr_length = addrlen; 19939 19940 if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) 19941 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 19942 else 19943 freemsg(addrmp2); 19944 break; 19945 default: 19946 ASSERT(0); 19947 } 19948 19949 /* 19950 * If there are ipifs to bring up, ill_up_ipifs() will return 19951 * EINPROGRESS, and ipsq_current_finish() will be called by 19952 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is 19953 * brought up. 19954 */ 19955 if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS) 19956 ipsq_current_finish(ipsq); 19957 } 19958 19959 /* 19960 * Helper routine for setting the ill_nd_lla fields. 19961 */ 19962 void 19963 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 19964 { 19965 freemsg(ill->ill_nd_lla_mp); 19966 ill->ill_nd_lla = ndmp->b_rptr + addroff; 19967 ill->ill_nd_lla_mp = ndmp; 19968 ill->ill_nd_lla_len = addrlen; 19969 } 19970 19971 /* 19972 * Replumb the ill. 
19973 */ 19974 int 19975 ill_replumb(ill_t *ill, mblk_t *mp) 19976 { 19977 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 19978 19979 ASSERT(IAM_WRITER_IPSQ(ipsq)); 19980 19981 ipsq_current_start(ipsq, ill->ill_ipif, 0); 19982 19983 /* 19984 * If we can quiesce the ill, then continue. If not, then 19985 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 19986 */ 19987 ill_down_ipifs(ill, B_FALSE); 19988 19989 mutex_enter(&ill->ill_lock); 19990 if (!ill_is_quiescent(ill)) { 19991 /* call cannot fail since `conn_t *' argument is NULL */ 19992 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 19993 mp, ILL_DOWN); 19994 mutex_exit(&ill->ill_lock); 19995 return (EINPROGRESS); 19996 } 19997 mutex_exit(&ill->ill_lock); 19998 19999 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 20000 return (0); 20001 } 20002 20003 /* ARGSUSED */ 20004 static void 20005 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 20006 { 20007 ill_t *ill = q->q_ptr; 20008 20009 ASSERT(IAM_WRITER_IPSQ(ipsq)); 20010 20011 ill_down_ipifs_tail(ill); 20012 20013 freemsg(ill->ill_replumb_mp); 20014 ill->ill_replumb_mp = copyb(mp); 20015 20016 /* 20017 * Successfully quiesced and brought down the interface, now we send 20018 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 20019 * DL_NOTE_REPLUMB message. 20020 */ 20021 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 20022 DL_NOTIFY_CONF); 20023 ASSERT(mp != NULL); 20024 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 20025 DL_NOTE_REPLUMB_DONE; 20026 ill_dlpi_send(ill, mp); 20027 20028 /* 20029 * If there are ipifs to bring up, ill_up_ipifs() will return 20030 * EINPROGRESS, and ipsq_current_finish() will be called by 20031 * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is 20032 * brought up. 
20033 */ 20034 if (ill->ill_replumb_mp == NULL || 20035 ill_up_ipifs(ill, q, ill->ill_replumb_mp) != EINPROGRESS) { 20036 ipsq_current_finish(ipsq); 20037 } 20038 } 20039 20040 major_t IP_MAJ; 20041 #define IP "ip" 20042 20043 #define UDP6DEV "/devices/pseudo/udp6@0:udp6" 20044 #define UDPDEV "/devices/pseudo/udp@0:udp" 20045 20046 /* 20047 * Issue REMOVEIF ioctls to have the loopback interfaces 20048 * go away. Other interfaces are either I_LINKed or I_PLINKed; 20049 * the former going away when the user-level processes in the zone 20050 * are killed * and the latter are cleaned up by the stream head 20051 * str_stack_shutdown callback that undoes all I_PLINKs. 20052 */ 20053 void 20054 ip_loopback_cleanup(ip_stack_t *ipst) 20055 { 20056 int error; 20057 ldi_handle_t lh = NULL; 20058 ldi_ident_t li = NULL; 20059 int rval; 20060 cred_t *cr; 20061 struct strioctl iocb; 20062 struct lifreq lifreq; 20063 20064 IP_MAJ = ddi_name_to_major(IP); 20065 20066 #ifdef NS_DEBUG 20067 (void) printf("ip_loopback_cleanup() stackid %d\n", 20068 ipst->ips_netstack->netstack_stackid); 20069 #endif 20070 20071 bzero(&lifreq, sizeof (lifreq)); 20072 (void) strcpy(lifreq.lifr_name, ipif_loopback_name); 20073 20074 error = ldi_ident_from_major(IP_MAJ, &li); 20075 if (error) { 20076 #ifdef DEBUG 20077 printf("ip_loopback_cleanup: lyr ident get failed error %d\n", 20078 error); 20079 #endif 20080 return; 20081 } 20082 20083 cr = zone_get_kcred(netstackid_to_zoneid( 20084 ipst->ips_netstack->netstack_stackid)); 20085 ASSERT(cr != NULL); 20086 error = ldi_open_by_name(UDP6DEV, FREAD|FWRITE, cr, &lh, li); 20087 if (error) { 20088 #ifdef DEBUG 20089 printf("ip_loopback_cleanup: open of UDP6DEV failed error %d\n", 20090 error); 20091 #endif 20092 goto out; 20093 } 20094 iocb.ic_cmd = SIOCLIFREMOVEIF; 20095 iocb.ic_timout = 15; 20096 iocb.ic_len = sizeof (lifreq); 20097 iocb.ic_dp = (char *)&lifreq; 20098 20099 error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval); 20100 /* LINTED - 
statement has no consequent */ 20101 if (error) { 20102 #ifdef NS_DEBUG 20103 printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on " 20104 "UDP6 error %d\n", error); 20105 #endif 20106 } 20107 (void) ldi_close(lh, FREAD|FWRITE, cr); 20108 lh = NULL; 20109 20110 error = ldi_open_by_name(UDPDEV, FREAD|FWRITE, cr, &lh, li); 20111 if (error) { 20112 #ifdef NS_DEBUG 20113 printf("ip_loopback_cleanup: open of UDPDEV failed error %d\n", 20114 error); 20115 #endif 20116 goto out; 20117 } 20118 20119 iocb.ic_cmd = SIOCLIFREMOVEIF; 20120 iocb.ic_timout = 15; 20121 iocb.ic_len = sizeof (lifreq); 20122 iocb.ic_dp = (char *)&lifreq; 20123 20124 error = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval); 20125 /* LINTED - statement has no consequent */ 20126 if (error) { 20127 #ifdef NS_DEBUG 20128 printf("ip_loopback_cleanup: ioctl SIOCLIFREMOVEIF failed on " 20129 "UDP error %d\n", error); 20130 #endif 20131 } 20132 (void) ldi_close(lh, FREAD|FWRITE, cr); 20133 lh = NULL; 20134 20135 out: 20136 /* Close layered handles */ 20137 if (lh) 20138 (void) ldi_close(lh, FREAD|FWRITE, cr); 20139 if (li) 20140 ldi_ident_release(li); 20141 20142 crfree(cr); 20143 } 20144 20145 /* 20146 * This needs to be in-sync with nic_event_t definition 20147 */ 20148 static const char * 20149 ill_hook_event2str(nic_event_t event) 20150 { 20151 switch (event) { 20152 case NE_PLUMB: 20153 return ("PLUMB"); 20154 case NE_UNPLUMB: 20155 return ("UNPLUMB"); 20156 case NE_UP: 20157 return ("UP"); 20158 case NE_DOWN: 20159 return ("DOWN"); 20160 case NE_ADDRESS_CHANGE: 20161 return ("ADDRESS_CHANGE"); 20162 case NE_LIF_UP: 20163 return ("LIF_UP"); 20164 case NE_LIF_DOWN: 20165 return ("LIF_DOWN"); 20166 default: 20167 return ("UNKNOWN"); 20168 } 20169 } 20170 20171 void 20172 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, 20173 nic_event_data_t data, size_t datalen) 20174 { 20175 ip_stack_t *ipst = ill->ill_ipst; 20176 hook_nic_event_int_t *info; 20177 const char *str = 
NULL; 20178 20179 /* create a new nic event info */ 20180 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) 20181 goto fail; 20182 20183 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; 20184 info->hnei_event.hne_lif = lif; 20185 info->hnei_event.hne_event = event; 20186 info->hnei_event.hne_protocol = ill->ill_isv6 ? 20187 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 20188 info->hnei_event.hne_data = NULL; 20189 info->hnei_event.hne_datalen = 0; 20190 info->hnei_stackid = ipst->ips_netstack->netstack_stackid; 20191 20192 if (data != NULL && datalen != 0) { 20193 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP); 20194 if (info->hnei_event.hne_data == NULL) 20195 goto fail; 20196 bcopy(data, info->hnei_event.hne_data, datalen); 20197 info->hnei_event.hne_datalen = datalen; 20198 } 20199 20200 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info, 20201 DDI_NOSLEEP) == DDI_SUCCESS) 20202 return; 20203 20204 fail: 20205 if (info != NULL) { 20206 if (info->hnei_event.hne_data != NULL) { 20207 kmem_free(info->hnei_event.hne_data, 20208 info->hnei_event.hne_datalen); 20209 } 20210 kmem_free(info, sizeof (hook_nic_event_t)); 20211 } 20212 str = ill_hook_event2str(event); 20213 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event " 20214 "information for %s (ENOMEM)\n", str, ill->ill_name)); 20215 } 20216 20217 void 20218 ipif_up_notify(ipif_t *ipif) 20219 { 20220 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 20221 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT); 20222 sctp_update_ipif(ipif, SCTP_IPIF_UP); 20223 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id), 20224 NE_LIF_UP, NULL, 0); 20225 } 20226