1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* Copyright (c) 1990 Mentat Inc. */ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/strsun.h> 30 #include <sys/zone.h> 31 #include <sys/ddi.h> 32 #include <sys/sunddi.h> 33 #include <sys/cmn_err.h> 34 #include <sys/debug.h> 35 #include <sys/atomic.h> 36 37 #include <sys/systm.h> 38 #include <sys/param.h> 39 #include <sys/kmem.h> 40 #include <sys/sdt.h> 41 #include <sys/socket.h> 42 #include <sys/mac.h> 43 #include <net/if.h> 44 #include <net/if_arp.h> 45 #include <net/route.h> 46 #include <sys/sockio.h> 47 #include <netinet/in.h> 48 #include <net/if_dl.h> 49 50 #include <inet/common.h> 51 #include <inet/mi.h> 52 #include <inet/mib2.h> 53 #include <inet/nd.h> 54 #include <inet/arp.h> 55 #include <inet/snmpcom.h> 56 #include <inet/kstatcom.h> 57 58 #include <netinet/igmp_var.h> 59 #include <netinet/ip6.h> 60 #include <netinet/icmp6.h> 61 #include <netinet/sctp.h> 62 63 #include <inet/ip.h> 64 #include <inet/ip_impl.h> 65 #include <inet/ip6.h> 66 #include <inet/ip6_asp.h> 67 #include <inet/tcp.h> 68 #include <inet/ip_multi.h> 69 #include <inet/ip_if.h> 70 #include <inet/ip_ire.h> 71 #include <inet/ip_ftable.h> 72 #include <inet/ip_rts.h> 73 #include <inet/optcom.h> 74 #include <inet/ip_ndp.h> 75 #include <inet/ip_listutils.h> 76 #include <netinet/igmp.h> 77 #include <netinet/ip_mroute.h> 78 #include <inet/ipp_common.h> 79 80 #include <net/pfkeyv2.h> 81 #include <inet/sadb.h> 82 #include <inet/ipsec_impl.h> 83 #include <inet/ipdrop.h> 84 #include <inet/ip_netinfo.h> 85 #include <sys/squeue_impl.h> 86 #include <sys/squeue.h> 87 88 #include <inet/ipclassifier.h> 89 #include <inet/sctp_ip.h> 90 #include <inet/sctp/sctp_impl.h> 91 #include <inet/udp_impl.h> 92 #include <sys/sunddi.h> 93 94 #include <sys/tsol/label.h> 95 #include <sys/tsol/tnet.h> 96 97 /* 98 * Release a reference on ip_xmit_attr. 99 * The reference is acquired by conn_get_ixa() 100 */ 101 #define IXA_REFRELE(ixa) \ 102 { \ 103 if (atomic_dec_32_nv(&(ixa)->ixa_refcnt) == 0) \ 104 ixa_inactive(ixa); \ 105 } 106 107 #define IXA_REFHOLD(ixa) \ 108 { \ 109 ASSERT((ixa)->ixa_refcnt != 0); \ 110 atomic_inc_32(&(ixa)->ixa_refcnt); \ 111 } 112 113 /* 114 * When we need to handle a transmit side asynchronous operation, then we need 115 * to save sufficient information so that we can call the fragment and postfrag 116 * functions. That information is captured in an mblk containing this structure. 117 * 118 * Since this is currently only used for IPsec, we include information for 119 * the kernel crypto framework. 
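 *
 * The conversion is symmetric: ip_xmit_attr_to_mblk() saves the relevant
 * ip_xmit_attr_t fields (taking refholds on any pointers it saves),
 * ip_xmit_attr_from_mblk() rebuilds an ip_xmit_attr_t from such an mblk,
 * and ip_xmit_attr_free_mblk() releases the saved references when the
 * attributes are never reconstituted.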
120 */ 121 typedef struct ixamblk_s { 122 boolean_t ixm_inbound; /* B_FALSE */ 123 iaflags_t ixm_flags; /* ixa_flags */ 124 netstackid_t ixm_stackid; /* Verify it didn't go away */ 125 uint_t ixm_ifindex; /* Used to find the nce */ 126 in6_addr_t ixm_nceaddr_v6; /* Used to find nce */ 127 #define ixm_nceaddr_v4 V4_PART_OF_V6(ixm_nceaddr_v6) 128 uint32_t ixm_fragsize; 129 uint_t ixm_pktlen; 130 uint16_t ixm_ip_hdr_length; /* Points to ULP header */ 131 uint8_t ixm_protocol; /* Protocol number for ULP cksum */ 132 pfirepostfrag_t ixm_postfragfn; 133 134 zoneid_t ixm_zoneid; /* Needed for ipobs */ 135 zoneid_t ixm_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */ 136 137 uint_t ixm_scopeid; /* For IPv6 link-locals */ 138 139 uint32_t ixm_ident; /* For IPv6 fragment header */ 140 uint32_t ixm_xmit_hint; 141 142 uint64_t ixm_conn_id; /* Used by DTrace */ 143 cred_t *ixm_cred; /* For getpeerucred - refhold if set */ 144 pid_t ixm_cpid; /* For getpeerucred */ 145 146 ts_label_t *ixm_tsl; /* Refhold if set. */ 147 148 /* 149 * When the pointers below are set they have a refhold on the struct. 150 */ 151 ipsec_latch_t *ixm_ipsec_latch; 152 struct ipsa_s *ixm_ipsec_ah_sa; /* SA for AH */ 153 struct ipsa_s *ixm_ipsec_esp_sa; /* SA for ESP */ 154 struct ipsec_policy_s *ixm_ipsec_policy; /* why are we here? */ 155 struct ipsec_action_s *ixm_ipsec_action; /* For reflected packets */ 156 157 ipsa_ref_t ixm_ipsec_ref[2]; /* Soft reference to SA */ 158 159 /* Need these while waiting for SA */ 160 uint16_t ixm_ipsec_src_port; /* Source port number of d-gram. */ 161 uint16_t ixm_ipsec_dst_port; /* Destination port number of d-gram. */ 162 uint8_t ixm_ipsec_icmp_type; /* ICMP type of d-gram */ 163 uint8_t ixm_ipsec_icmp_code; /* ICMP code of d-gram */ 164 165 sa_family_t ixm_ipsec_inaf; /* Inner address family */ 166 uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */ 167 uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */ 168 uint8_t ixm_ipsec_insrcpfx; /* Inner source prefix */ 169 uint8_t ixm_ipsec_indstpfx; /* Inner destination prefix */ 170 171 uint8_t ixm_ipsec_proto; /* IP protocol number for d-gram. */ 172 } ixamblk_t; 173 174 175 /* 176 * When we need to handle a receive side asynchronous operation, then we need 177 * to save sufficient information so that we can call ip_fanout. 178 * That information is captured in an mblk containing this structure. 179 * 180 * Since this is currently only used for IPsec, we include information for 181 * the kernel crypto framework. 182 */ 183 typedef struct iramblk_s { 184 boolean_t irm_inbound; /* B_TRUE */ 185 iaflags_t irm_flags; /* ira_flags */ 186 netstackid_t irm_stackid; /* Verify it didn't go away */ 187 uint_t irm_ifindex; /* To find ira_ill */ 188 189 uint_t irm_rifindex; /* ira_rifindex */ 190 uint_t irm_ruifindex; /* ira_ruifindex */ 191 uint_t irm_pktlen; 192 uint16_t irm_ip_hdr_length; /* Points to ULP header */ 193 uint8_t irm_protocol; /* Protocol number for ULP cksum */ 194 zoneid_t irm_zoneid; /* ALL_ZONES unless local delivery */ 195 196 squeue_t *irm_sqp; 197 ill_rx_ring_t *irm_ring; 198 199 ipaddr_t irm_mroute_tunnel; /* IRAF_MROUTE_TUNNEL_SET */ 200 zoneid_t irm_no_loop_zoneid; /* IRAF_NO_LOOP_ZONEID_SET */ 201 uint32_t irm_esp_udp_ports; /* IRAF_ESP_UDP_PORTS */ 202 203 char irm_l2src[IRA_L2SRC_SIZE]; /* If IRAF_L2SRC_SET */ 204 205 cred_t *irm_cred; /* For getpeerucred - refhold if set */ 206 pid_t irm_cpid; /* For getpeerucred */ 207 208 ts_label_t *irm_tsl; /* Refhold if set. 
*/ 209 210 /* 211 * When set these correspond to a refhold on the object. 212 */ 213 struct ipsa_s *irm_ipsec_ah_sa; /* SA for AH */ 214 struct ipsa_s *irm_ipsec_esp_sa; /* SA for ESP */ 215 struct ipsec_action_s *irm_ipsec_action; /* For reflected packets */ 216 } iramblk_t; 217 218 219 /* 220 * Take the information in ip_xmit_attr_t and stick it in an mblk 221 * that can later be passed to ip_xmit_attr_from_mblk to recreate the 222 * ip_xmit_attr_t. 223 * 224 * Returns NULL on memory allocation failure. 225 */ 226 mblk_t * 227 ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa) 228 { 229 mblk_t *ixamp; 230 ixamblk_t *ixm; 231 nce_t *nce = ixa->ixa_nce; 232 233 ASSERT(nce != NULL); 234 ixamp = allocb(sizeof (*ixm), BPRI_MED); 235 if (ixamp == NULL) 236 return (NULL); 237 238 ixamp->b_datap->db_type = M_BREAK; 239 ixamp->b_wptr += sizeof (*ixm); 240 ixm = (ixamblk_t *)ixamp->b_rptr; 241 242 bzero(ixm, sizeof (*ixm)); 243 ixm->ixm_inbound = B_FALSE; 244 ixm->ixm_flags = ixa->ixa_flags; 245 ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid; 246 ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex; 247 ixm->ixm_nceaddr_v6 = nce->nce_addr; 248 ixm->ixm_fragsize = ixa->ixa_fragsize; 249 ixm->ixm_pktlen = ixa->ixa_pktlen; 250 ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length; 251 ixm->ixm_protocol = ixa->ixa_protocol; 252 ixm->ixm_postfragfn = ixa->ixa_postfragfn; 253 ixm->ixm_zoneid = ixa->ixa_zoneid; 254 ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid; 255 ixm->ixm_scopeid = ixa->ixa_scopeid; 256 ixm->ixm_ident = ixa->ixa_ident; 257 ixm->ixm_xmit_hint = ixa->ixa_xmit_hint; 258 259 if (ixa->ixa_tsl != NULL) { 260 ixm->ixm_tsl = ixa->ixa_tsl; 261 label_hold(ixm->ixm_tsl); 262 } 263 if (ixa->ixa_cred != NULL) { 264 ixm->ixm_cred = ixa->ixa_cred; 265 crhold(ixa->ixa_cred); 266 } 267 ixm->ixm_cpid = ixa->ixa_cpid; 268 ixm->ixm_conn_id = ixa->ixa_conn_id; 269 270 if (ixa->ixa_flags & IXAF_IPSEC_SECURE) { 271 if (ixa->ixa_ipsec_ah_sa != NULL) { 272 ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa; 273 IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa); 274 } 275 if (ixa->ixa_ipsec_esp_sa != NULL) { 276 ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa; 277 IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa); 278 } 279 if (ixa->ixa_ipsec_policy != NULL) { 280 ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy; 281 IPPOL_REFHOLD(ixa->ixa_ipsec_policy); 282 } 283 if (ixa->ixa_ipsec_action != NULL) { 284 ixm->ixm_ipsec_action = ixa->ixa_ipsec_action; 285 IPACT_REFHOLD(ixa->ixa_ipsec_action); 286 } 287 if (ixa->ixa_ipsec_latch != NULL) { 288 ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch; 289 IPLATCH_REFHOLD(ixa->ixa_ipsec_latch); 290 } 291 ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0]; 292 ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1]; 293 ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port; 294 ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port; 295 ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type; 296 ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code; 297 ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf; 298 ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0]; 299 ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1]; 300 ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2]; 301 ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3]; 302 ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0]; 303 ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1]; 304 ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2]; 305 ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3]; 306 ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx; 307 ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx; 
		ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
	}
	return (ixamp);
}

/*
 * Extract the ip_xmit_attr_t from the mblk, checking that the
 * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
 * not the case.
 *
 * Otherwise ixa is updated.
 * Caller needs to release references on the ixa by calling ixa_refrele()
 * which will immediately call ixa_inactive to release the references.
 */
boolean_t
ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
{
	ixamblk_t	*ixm;
	netstack_t	*ns;
	ip_stack_t	*ipst;
	ill_t		*ill;
	nce_t		*nce;

	/* We assume the caller hasn't initialized ixa */
	bzero(ixa, sizeof (*ixa));

	ASSERT(DB_TYPE(ixamp) == M_BREAK);
	ASSERT(ixamp->b_cont == NULL);

	ixm = (ixamblk_t *)ixamp->b_rptr;
	ASSERT(!ixm->ixm_inbound);

	/* Verify the netstack is still around */
	ns = netstack_find_by_stackid(ixm->ixm_stackid);
	if (ns == NULL) {
		/* Disappeared on us */
		(void) ip_xmit_attr_free_mblk(ixamp);
		return (B_FALSE);
	}
	ipst = ns->netstack_ip;

	/* Verify the ill is still around */
	ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
	    !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);

	/* We have the ill, hence the netstack can't go away */
	netstack_rele(ns);
	if (ill == NULL) {
		/* Disappeared on us */
		(void) ip_xmit_attr_free_mblk(ixamp);
		return (B_FALSE);
	}
	/*
	 * Find the nce. We don't load-spread (only lookup nce's on the ill)
	 * because we want to find the same nce as the one we had when
	 * ip_xmit_attr_to_mblk was called.
	 */
	if (ixm->ixm_flags & IXAF_IS_IPV4) {
		nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
	} else {
		nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
	}

	/* We have the nce, hence the ill can't go away */
	ill_refrele(ill);
	if (nce == NULL) {
		/*
		 * Since this is unusual and we don't know what type of
		 * nce it was, we drop the packet.
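		 * ip_xmit_attr_free_mblk() below releases the references
		 * that were saved in the mblk and hands back any chained
		 * packet so the caller can free it.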
377 */ 378 (void) ip_xmit_attr_free_mblk(ixamp); 379 return (B_FALSE); 380 } 381 382 ixa->ixa_flags = ixm->ixm_flags; 383 ixa->ixa_refcnt = 1; 384 ixa->ixa_ipst = ipst; 385 ixa->ixa_fragsize = ixm->ixm_fragsize; 386 ixa->ixa_pktlen = ixm->ixm_pktlen; 387 ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length; 388 ixa->ixa_protocol = ixm->ixm_protocol; 389 ixa->ixa_nce = nce; 390 ixa->ixa_postfragfn = ixm->ixm_postfragfn; 391 ixa->ixa_zoneid = ixm->ixm_zoneid; 392 ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid; 393 ixa->ixa_scopeid = ixm->ixm_scopeid; 394 ixa->ixa_ident = ixm->ixm_ident; 395 ixa->ixa_xmit_hint = ixm->ixm_xmit_hint; 396 397 if (ixm->ixm_tsl != NULL) { 398 ixa->ixa_tsl = ixm->ixm_tsl; 399 ixa->ixa_free_flags |= IXA_FREE_TSL; 400 ixm->ixm_tsl = NULL; 401 } 402 if (ixm->ixm_cred != NULL) { 403 ixa->ixa_cred = ixm->ixm_cred; 404 ixa->ixa_free_flags |= IXA_FREE_CRED; 405 ixm->ixm_cred = NULL; 406 } 407 ixa->ixa_cpid = ixm->ixm_cpid; 408 ixa->ixa_conn_id = ixm->ixm_conn_id; 409 410 ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa; 411 ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa; 412 ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy; 413 ixa->ixa_ipsec_action = ixm->ixm_ipsec_action; 414 ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch; 415 416 ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0]; 417 ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1]; 418 ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port; 419 ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port; 420 ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type; 421 ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code; 422 ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf; 423 ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0]; 424 ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1]; 425 ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2]; 426 ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3]; 427 ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0]; 428 ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1]; 429 ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2]; 430 ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3]; 431 ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx; 432 ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx; 433 ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto; 434 435 freeb(ixamp); 436 return (B_TRUE); 437 } 438 439 /* 440 * Free the ixm mblk and any references it holds 441 * Returns b_cont. 
442 */ 443 mblk_t * 444 ip_xmit_attr_free_mblk(mblk_t *ixamp) 445 { 446 ixamblk_t *ixm; 447 mblk_t *mp; 448 449 /* Consume mp */ 450 ASSERT(DB_TYPE(ixamp) == M_BREAK); 451 mp = ixamp->b_cont; 452 453 ixm = (ixamblk_t *)ixamp->b_rptr; 454 ASSERT(!ixm->ixm_inbound); 455 456 if (ixm->ixm_ipsec_ah_sa != NULL) { 457 IPSA_REFRELE(ixm->ixm_ipsec_ah_sa); 458 ixm->ixm_ipsec_ah_sa = NULL; 459 } 460 if (ixm->ixm_ipsec_esp_sa != NULL) { 461 IPSA_REFRELE(ixm->ixm_ipsec_esp_sa); 462 ixm->ixm_ipsec_esp_sa = NULL; 463 } 464 if (ixm->ixm_ipsec_policy != NULL) { 465 IPPOL_REFRELE(ixm->ixm_ipsec_policy); 466 ixm->ixm_ipsec_policy = NULL; 467 } 468 if (ixm->ixm_ipsec_action != NULL) { 469 IPACT_REFRELE(ixm->ixm_ipsec_action); 470 ixm->ixm_ipsec_action = NULL; 471 } 472 if (ixm->ixm_ipsec_latch) { 473 IPLATCH_REFRELE(ixm->ixm_ipsec_latch); 474 ixm->ixm_ipsec_latch = NULL; 475 } 476 477 if (ixm->ixm_tsl != NULL) { 478 label_rele(ixm->ixm_tsl); 479 ixm->ixm_tsl = NULL; 480 } 481 if (ixm->ixm_cred != NULL) { 482 crfree(ixm->ixm_cred); 483 ixm->ixm_cred = NULL; 484 } 485 freeb(ixamp); 486 return (mp); 487 } 488 489 /* 490 * Take the information in ip_recv_attr_t and stick it in an mblk 491 * that can later be passed to ip_recv_attr_from_mblk to recreate the 492 * ip_recv_attr_t. 493 * 494 * Returns NULL on memory allocation failure. 495 */ 496 mblk_t * 497 ip_recv_attr_to_mblk(ip_recv_attr_t *ira) 498 { 499 mblk_t *iramp; 500 iramblk_t *irm; 501 ill_t *ill = ira->ira_ill; 502 503 ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0); 504 505 iramp = allocb(sizeof (*irm), BPRI_MED); 506 if (iramp == NULL) 507 return (NULL); 508 509 iramp->b_datap->db_type = M_BREAK; 510 iramp->b_wptr += sizeof (*irm); 511 irm = (iramblk_t *)iramp->b_rptr; 512 513 bzero(irm, sizeof (*irm)); 514 irm->irm_inbound = B_TRUE; 515 irm->irm_flags = ira->ira_flags; 516 if (ill != NULL) { 517 /* Internal to IP - preserve ip_stack_t, ill and rill */ 518 irm->irm_stackid = 519 ill->ill_ipst->ips_netstack->netstack_stackid; 520 irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex; 521 ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex == 522 ira->ira_rifindex); 523 } else { 524 /* Let ip_recv_attr_from_stackid know there isn't one */ 525 irm->irm_stackid = -1; 526 } 527 irm->irm_rifindex = ira->ira_rifindex; 528 irm->irm_ruifindex = ira->ira_ruifindex; 529 irm->irm_pktlen = ira->ira_pktlen; 530 irm->irm_ip_hdr_length = ira->ira_ip_hdr_length; 531 irm->irm_protocol = ira->ira_protocol; 532 533 irm->irm_sqp = ira->ira_sqp; 534 irm->irm_ring = ira->ira_ring; 535 536 irm->irm_zoneid = ira->ira_zoneid; 537 irm->irm_mroute_tunnel = ira->ira_mroute_tunnel; 538 irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid; 539 irm->irm_esp_udp_ports = ira->ira_esp_udp_ports; 540 541 if (ira->ira_tsl != NULL) { 542 irm->irm_tsl = ira->ira_tsl; 543 label_hold(irm->irm_tsl); 544 } 545 if (ira->ira_cred != NULL) { 546 irm->irm_cred = ira->ira_cred; 547 crhold(ira->ira_cred); 548 } 549 irm->irm_cpid = ira->ira_cpid; 550 551 if (ira->ira_flags & IRAF_L2SRC_SET) 552 bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE); 553 554 if (ira->ira_flags & IRAF_IPSEC_SECURE) { 555 if (ira->ira_ipsec_ah_sa != NULL) { 556 irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa; 557 IPSA_REFHOLD(ira->ira_ipsec_ah_sa); 558 } 559 if (ira->ira_ipsec_esp_sa != NULL) { 560 irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa; 561 IPSA_REFHOLD(ira->ira_ipsec_esp_sa); 562 } 563 if (ira->ira_ipsec_action != NULL) { 564 irm->irm_ipsec_action = ira->ira_ipsec_action; 565 
			IPACT_REFHOLD(ira->ira_ipsec_action);
		}
	}
	return (iramp);
}

/*
 * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
 * then irm_stackid is not -1, in which case we check that the
 * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
 * not the case.
 * If irm_stackid is -1 then we are used by an ULP (e.g., squeue_enter)
 * and we just proceed with ira_ill and ira_rill as NULL.
 *
 * The caller needs to release any references on the pointers inside the ira
 * by calling ira_cleanup.
 */
boolean_t
ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
{
	iramblk_t	*irm;
	netstack_t	*ns;
	ip_stack_t	*ipst = NULL;
	ill_t		*ill = NULL, *rill = NULL;

	/* We assume the caller hasn't initialized ira */
	bzero(ira, sizeof (*ira));

	ASSERT(DB_TYPE(iramp) == M_BREAK);
	ASSERT(iramp->b_cont == NULL);

	irm = (iramblk_t *)iramp->b_rptr;
	ASSERT(irm->irm_inbound);

	if (irm->irm_stackid != -1) {
		/* Verify the netstack is still around */
		ns = netstack_find_by_stackid(irm->irm_stackid);
		if (ns == NULL) {
			/* Disappeared on us */
			(void) ip_recv_attr_free_mblk(iramp);
			return (B_FALSE);
		}
		ipst = ns->netstack_ip;

		/* Verify the ill is still around */
		ill = ill_lookup_on_ifindex(irm->irm_ifindex,
		    !(irm->irm_flags & IRAF_IS_IPV4), ipst);

		if (irm->irm_ifindex == irm->irm_rifindex) {
			rill = ill;
		} else {
			rill = ill_lookup_on_ifindex(irm->irm_rifindex,
			    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
		}

		/* We have the ill, hence the netstack can't go away */
		netstack_rele(ns);
		if (ill == NULL || rill == NULL) {
			/* Disappeared on us */
			if (ill != NULL)
				ill_refrele(ill);
			if (rill != NULL && rill != ill)
				ill_refrele(rill);
			(void) ip_recv_attr_free_mblk(iramp);
			return (B_FALSE);
		}
	}

	ira->ira_flags = irm->irm_flags;
	/* Caller must ill_refrele(ira_ill) by using ira_cleanup() */
	ira->ira_ill = ill;
	ira->ira_rill = rill;

	ira->ira_rifindex = irm->irm_rifindex;
	ira->ira_ruifindex = irm->irm_ruifindex;
	ira->ira_pktlen = irm->irm_pktlen;
	ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
	ira->ira_protocol = irm->irm_protocol;

	ira->ira_sqp = irm->irm_sqp;
	/* The rest of IP assumes that the rings never go away. */
	ira->ira_ring = irm->irm_ring;

	ira->ira_zoneid = irm->irm_zoneid;
	ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
	ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
	ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;

	if (irm->irm_tsl != NULL) {
		ira->ira_tsl = irm->irm_tsl;
		ira->ira_free_flags |= IRA_FREE_TSL;
		irm->irm_tsl = NULL;
	}
	if (irm->irm_cred != NULL) {
		ira->ira_cred = irm->irm_cred;
		ira->ira_free_flags |= IRA_FREE_CRED;
		irm->irm_cred = NULL;
	}
	ira->ira_cpid = irm->irm_cpid;

	if (ira->ira_flags & IRAF_L2SRC_SET)
		bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);

	ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
	ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
	ira->ira_ipsec_action = irm->irm_ipsec_action;

	freeb(iramp);
	return (B_TRUE);
}

/*
 * Free the irm mblk and any references it holds
 * Returns b_cont.
679 */ 680 mblk_t * 681 ip_recv_attr_free_mblk(mblk_t *iramp) 682 { 683 iramblk_t *irm; 684 mblk_t *mp; 685 686 /* Consume mp */ 687 ASSERT(DB_TYPE(iramp) == M_BREAK); 688 mp = iramp->b_cont; 689 690 irm = (iramblk_t *)iramp->b_rptr; 691 ASSERT(irm->irm_inbound); 692 693 if (irm->irm_ipsec_ah_sa != NULL) { 694 IPSA_REFRELE(irm->irm_ipsec_ah_sa); 695 irm->irm_ipsec_ah_sa = NULL; 696 } 697 if (irm->irm_ipsec_esp_sa != NULL) { 698 IPSA_REFRELE(irm->irm_ipsec_esp_sa); 699 irm->irm_ipsec_esp_sa = NULL; 700 } 701 if (irm->irm_ipsec_action != NULL) { 702 IPACT_REFRELE(irm->irm_ipsec_action); 703 irm->irm_ipsec_action = NULL; 704 } 705 if (irm->irm_tsl != NULL) { 706 label_rele(irm->irm_tsl); 707 irm->irm_tsl = NULL; 708 } 709 if (irm->irm_cred != NULL) { 710 crfree(irm->irm_cred); 711 irm->irm_cred = NULL; 712 } 713 714 freeb(iramp); 715 return (mp); 716 } 717 718 /* 719 * Returns true if the mblk contains an ip_recv_attr_t 720 * For now we just check db_type. 721 */ 722 boolean_t 723 ip_recv_attr_is_mblk(mblk_t *mp) 724 { 725 /* 726 * Need to handle the various forms of tcp_timermp which are tagged 727 * with b_wptr and might have a NULL b_datap. 728 */ 729 if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1) 730 return (B_FALSE); 731 732 #ifdef DEBUG 733 iramblk_t *irm; 734 735 if (DB_TYPE(mp) != M_BREAK) 736 return (B_FALSE); 737 738 irm = (iramblk_t *)mp->b_rptr; 739 ASSERT(irm->irm_inbound); 740 return (B_TRUE); 741 #else 742 return (DB_TYPE(mp) == M_BREAK); 743 #endif 744 } 745 746 static ip_xmit_attr_t * 747 conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag) 748 { 749 ip_xmit_attr_t *ixa; 750 ip_xmit_attr_t *oldixa; 751 752 mutex_enter(&connp->conn_lock); 753 ixa = connp->conn_ixa; 754 755 /* At least one references for the conn_t */ 756 ASSERT(ixa->ixa_refcnt >= 1); 757 if (atomic_inc_32_nv(&ixa->ixa_refcnt) == 2) { 758 /* No other thread using conn_ixa */ 759 mutex_exit(&connp->conn_lock); 760 return (ixa); 761 } 762 ixa = kmem_alloc(sizeof (*ixa), kmflag); 763 if (ixa == NULL) { 764 mutex_exit(&connp->conn_lock); 765 ixa_refrele(connp->conn_ixa); 766 return (NULL); 767 } 768 ixa_safe_copy(connp->conn_ixa, ixa); 769 770 /* Make sure we drop conn_lock before any refrele */ 771 if (replace) { 772 ixa->ixa_refcnt++; /* No atomic needed - not visible */ 773 oldixa = connp->conn_ixa; 774 connp->conn_ixa = ixa; 775 mutex_exit(&connp->conn_lock); 776 IXA_REFRELE(oldixa); /* Undo refcnt from conn_t */ 777 } else { 778 oldixa = connp->conn_ixa; 779 mutex_exit(&connp->conn_lock); 780 } 781 IXA_REFRELE(oldixa); /* Undo above atomic_add_32_nv */ 782 783 return (ixa); 784 } 785 786 /* 787 * Return an ip_xmit_attr_t to use with a conn_t that ensures that only 788 * the caller can access the ip_xmit_attr_t. 789 * 790 * If nobody else is using conn_ixa we return it. 791 * Otherwise we make a "safe" copy of conn_ixa 792 * and return it. The "safe" copy has the pointers set to NULL 793 * (since the pointers might be changed by another thread using 794 * conn_ixa). The caller needs to check for NULL pointers to see 795 * if ip_set_destination needs to be called to re-establish the pointers. 796 * 797 * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t. 798 * That is used when we connect() the ULP. 
799 */ 800 ip_xmit_attr_t * 801 conn_get_ixa(conn_t *connp, boolean_t replace) 802 { 803 return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP)); 804 } 805 806 /* 807 * Used only when the option is to have the kernel hang due to not 808 * cleaning up ixa references on ills etc. 809 */ 810 ip_xmit_attr_t * 811 conn_get_ixa_tryhard(conn_t *connp, boolean_t replace) 812 { 813 return (conn_get_ixa_impl(connp, replace, KM_SLEEP)); 814 } 815 816 /* 817 * Replace conn_ixa with the ixa argument. 818 * 819 * The caller must hold conn_lock. 820 * 821 * We return the old ixa; the caller must ixa_refrele that after conn_lock 822 * has been dropped. 823 */ 824 ip_xmit_attr_t * 825 conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa) 826 { 827 ip_xmit_attr_t *oldixa; 828 829 ASSERT(MUTEX_HELD(&connp->conn_lock)); 830 831 oldixa = connp->conn_ixa; 832 IXA_REFHOLD(ixa); 833 ixa->ixa_conn_id = oldixa->ixa_conn_id; 834 connp->conn_ixa = ixa; 835 return (oldixa); 836 } 837 838 /* 839 * Return a ip_xmit_attr_t to use with a conn_t that is based on but 840 * separate from conn_ixa. 841 * 842 * This "safe" copy has the pointers set to NULL 843 * (since the pointers might be changed by another thread using 844 * conn_ixa). The caller needs to check for NULL pointers to see 845 * if ip_set_destination needs to be called to re-establish the pointers. 846 */ 847 ip_xmit_attr_t * 848 conn_get_ixa_exclusive(conn_t *connp) 849 { 850 ip_xmit_attr_t *ixa; 851 852 mutex_enter(&connp->conn_lock); 853 ixa = connp->conn_ixa; 854 855 /* At least one references for the conn_t */ 856 ASSERT(ixa->ixa_refcnt >= 1); 857 858 /* Make sure conn_ixa doesn't disappear while we copy it */ 859 atomic_inc_32(&ixa->ixa_refcnt); 860 861 ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP); 862 if (ixa == NULL) { 863 mutex_exit(&connp->conn_lock); 864 ixa_refrele(connp->conn_ixa); 865 return (NULL); 866 } 867 ixa_safe_copy(connp->conn_ixa, ixa); 868 mutex_exit(&connp->conn_lock); 869 IXA_REFRELE(connp->conn_ixa); 870 return (ixa); 871 } 872 873 void 874 ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) 875 { 876 bcopy(src, ixa, sizeof (*ixa)); 877 ixa->ixa_refcnt = 1; 878 /* 879 * Clear any pointers that have references and might be changed 880 * by ip_set_destination or the ULP 881 */ 882 ixa->ixa_ire = NULL; 883 ixa->ixa_nce = NULL; 884 ixa->ixa_dce = NULL; 885 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 886 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 887 #ifdef DEBUG 888 ixa->ixa_curthread = NULL; 889 #endif 890 /* Clear all the IPsec pointers and the flag as well. */ 891 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; 892 893 ixa->ixa_ipsec_latch = NULL; 894 ixa->ixa_ipsec_ah_sa = NULL; 895 ixa->ixa_ipsec_esp_sa = NULL; 896 ixa->ixa_ipsec_policy = NULL; 897 ixa->ixa_ipsec_action = NULL; 898 899 /* 900 * We leave ixa_tsl unchanged, but if it has a refhold we need 901 * to get an extra refhold. 902 */ 903 if (ixa->ixa_free_flags & IXA_FREE_TSL) 904 label_hold(ixa->ixa_tsl); 905 906 /* 907 * We leave ixa_cred unchanged, but if it has a refhold we need 908 * to get an extra refhold. 909 */ 910 if (ixa->ixa_free_flags & IXA_FREE_CRED) 911 crhold(ixa->ixa_cred); 912 } 913 914 /* 915 * Duplicate an ip_xmit_attr_t. 916 * Assumes that the caller controls the ixa, hence we do not need to use 917 * a safe copy. We just have to increase the refcnt on any pointers. 
918 */ 919 ip_xmit_attr_t * 920 ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa) 921 { 922 ip_xmit_attr_t *ixa; 923 924 ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP); 925 if (ixa == NULL) 926 return (NULL); 927 bcopy(src_ixa, ixa, sizeof (*ixa)); 928 ixa->ixa_refcnt = 1; 929 930 if (ixa->ixa_ire != NULL) 931 ire_refhold_notr(ixa->ixa_ire); 932 if (ixa->ixa_nce != NULL) 933 nce_refhold(ixa->ixa_nce); 934 if (ixa->ixa_dce != NULL) 935 dce_refhold_notr(ixa->ixa_dce); 936 937 #ifdef DEBUG 938 ixa->ixa_curthread = NULL; 939 #endif 940 941 if (ixa->ixa_ipsec_latch != NULL) 942 IPLATCH_REFHOLD(ixa->ixa_ipsec_latch); 943 if (ixa->ixa_ipsec_ah_sa != NULL) 944 IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa); 945 if (ixa->ixa_ipsec_esp_sa != NULL) 946 IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa); 947 if (ixa->ixa_ipsec_policy != NULL) 948 IPPOL_REFHOLD(ixa->ixa_ipsec_policy); 949 if (ixa->ixa_ipsec_action != NULL) 950 IPACT_REFHOLD(ixa->ixa_ipsec_action); 951 952 if (ixa->ixa_tsl != NULL) { 953 label_hold(ixa->ixa_tsl); 954 ixa->ixa_free_flags |= IXA_FREE_TSL; 955 } 956 if (ixa->ixa_cred != NULL) { 957 crhold(ixa->ixa_cred); 958 ixa->ixa_free_flags |= IXA_FREE_CRED; 959 } 960 return (ixa); 961 } 962 963 /* 964 * Used to replace the ixa_label field. 965 * The caller should have a reference on the label, which we transfer to 966 * the attributes so that when the attribute is freed/cleaned up 967 * we will release that reference. 968 */ 969 void 970 ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl) 971 { 972 ASSERT(tsl != NULL); 973 974 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 975 ASSERT(ixa->ixa_tsl != NULL); 976 label_rele(ixa->ixa_tsl); 977 } else { 978 ixa->ixa_free_flags |= IXA_FREE_TSL; 979 } 980 ixa->ixa_tsl = tsl; 981 } 982 983 /* 984 * Replace the ip_recv_attr_t's label. 985 * Due to kernel RPC's use of db_credp we also need to replace ira_cred; 986 * TCP/UDP uses ira_cred to set db_credp for non-socket users. 987 * This can fail (and return B_FALSE) due to lack of memory. 988 */ 989 boolean_t 990 ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl) 991 { 992 cred_t *newcr; 993 994 if (ira->ira_free_flags & IRA_FREE_TSL) { 995 ASSERT(ira->ira_tsl != NULL); 996 label_rele(ira->ira_tsl); 997 } 998 label_hold(tsl); 999 ira->ira_tsl = tsl; 1000 ira->ira_free_flags |= IRA_FREE_TSL; 1001 1002 /* 1003 * Reset zoneid if we have a shared address. That allows 1004 * ip_fanout_tx_v4/v6 to determine the zoneid again. 1005 */ 1006 if (ira->ira_flags & IRAF_TX_SHARED_ADDR) 1007 ira->ira_zoneid = ALL_ZONES; 1008 1009 /* We update ira_cred for RPC */ 1010 newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP); 1011 if (newcr == NULL) 1012 return (B_FALSE); 1013 if (ira->ira_free_flags & IRA_FREE_CRED) 1014 crfree(ira->ira_cred); 1015 ira->ira_cred = newcr; 1016 ira->ira_free_flags |= IRA_FREE_CRED; 1017 return (B_TRUE); 1018 } 1019 1020 /* 1021 * This needs to be called after ip_set_destination/tsol_check_dest might 1022 * have changed ixa_tsl to be specific for a destination, and we now want to 1023 * send to a different destination. 1024 * We have to restart with crgetlabel() since ip_set_destination/ 1025 * tsol_check_dest will start with ixa_tsl. 
1026 */ 1027 void 1028 ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr) 1029 { 1030 if (!is_system_labeled()) 1031 return; 1032 1033 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 1034 ASSERT(ixa->ixa_tsl != NULL); 1035 label_rele(ixa->ixa_tsl); 1036 ixa->ixa_free_flags &= ~IXA_FREE_TSL; 1037 } 1038 ixa->ixa_tsl = crgetlabel(cr); 1039 } 1040 1041 void 1042 ixa_refrele(ip_xmit_attr_t *ixa) 1043 { 1044 IXA_REFRELE(ixa); 1045 } 1046 1047 void 1048 ixa_inactive(ip_xmit_attr_t *ixa) 1049 { 1050 ASSERT(ixa->ixa_refcnt == 0); 1051 1052 ixa_cleanup(ixa); 1053 kmem_free(ixa, sizeof (*ixa)); 1054 } 1055 1056 /* 1057 * Release any references contained in the ixa. 1058 * Also clear any fields that are not controlled by ixa_flags. 1059 */ 1060 void 1061 ixa_cleanup(ip_xmit_attr_t *ixa) 1062 { 1063 if (ixa->ixa_ire != NULL) { 1064 ire_refrele_notr(ixa->ixa_ire); 1065 ixa->ixa_ire = NULL; 1066 } 1067 if (ixa->ixa_dce != NULL) { 1068 dce_refrele_notr(ixa->ixa_dce); 1069 ixa->ixa_dce = NULL; 1070 } 1071 if (ixa->ixa_nce != NULL) { 1072 nce_refrele(ixa->ixa_nce); 1073 ixa->ixa_nce = NULL; 1074 } 1075 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 1076 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 1077 if (ixa->ixa_flags & IXAF_IPSEC_SECURE) { 1078 ipsec_out_release_refs(ixa); 1079 } 1080 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 1081 ASSERT(ixa->ixa_tsl != NULL); 1082 label_rele(ixa->ixa_tsl); 1083 ixa->ixa_free_flags &= ~IXA_FREE_TSL; 1084 } 1085 ixa->ixa_tsl = NULL; 1086 if (ixa->ixa_free_flags & IXA_FREE_CRED) { 1087 ASSERT(ixa->ixa_cred != NULL); 1088 crfree(ixa->ixa_cred); 1089 ixa->ixa_free_flags &= ~IXA_FREE_CRED; 1090 } 1091 ixa->ixa_cred = NULL; 1092 ixa->ixa_src_preferences = 0; 1093 ixa->ixa_ifindex = 0; 1094 ixa->ixa_multicast_ifindex = 0; 1095 ixa->ixa_multicast_ifaddr = INADDR_ANY; 1096 } 1097 1098 /* 1099 * Release any references contained in the ira. 1100 * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second 1101 * argument. 1102 */ 1103 void 1104 ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill) 1105 { 1106 if (ira->ira_ill != NULL) { 1107 if (ira->ira_rill != ira->ira_ill) { 1108 /* Caused by async processing */ 1109 ill_refrele(ira->ira_rill); 1110 } 1111 if (refrele_ill) 1112 ill_refrele(ira->ira_ill); 1113 } 1114 if (ira->ira_flags & IRAF_IPSEC_SECURE) { 1115 ipsec_in_release_refs(ira); 1116 } 1117 if (ira->ira_free_flags & IRA_FREE_TSL) { 1118 ASSERT(ira->ira_tsl != NULL); 1119 label_rele(ira->ira_tsl); 1120 ira->ira_free_flags &= ~IRA_FREE_TSL; 1121 } 1122 ira->ira_tsl = NULL; 1123 if (ira->ira_free_flags & IRA_FREE_CRED) { 1124 ASSERT(ira->ira_cred != NULL); 1125 crfree(ira->ira_cred); 1126 ira->ira_free_flags &= ~IRA_FREE_CRED; 1127 } 1128 ira->ira_cred = NULL; 1129 } 1130 1131 /* 1132 * Function to help release any IRE, NCE, or DCEs that 1133 * have been deleted and are marked as condemned. 1134 * The caller is responsible for any serialization which is different 1135 * for TCP, SCTP, and others. 
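 *
 * For TCP that serialization is the squeue (see tcp_ixa_cleanup() below),
 * for SCTP it is RUN_SCTP()/WAKE_SCTP(), and other conns work on a private
 * copy obtained via conn_get_ixa() in conn_ixa_cleanup().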
1136 */ 1137 static void 1138 ixa_cleanup_stale(ip_xmit_attr_t *ixa) 1139 { 1140 ire_t *ire; 1141 nce_t *nce; 1142 dce_t *dce; 1143 1144 ire = ixa->ixa_ire; 1145 nce = ixa->ixa_nce; 1146 dce = ixa->ixa_dce; 1147 1148 if (ire != NULL && IRE_IS_CONDEMNED(ire)) { 1149 ire_refrele_notr(ire); 1150 ire = ire_blackhole(ixa->ixa_ipst, 1151 !(ixa->ixa_flags & IXAF_IS_IPV4)); 1152 ASSERT(ire != NULL); 1153 #ifdef DEBUG 1154 ire_refhold_notr(ire); 1155 ire_refrele(ire); 1156 #endif 1157 ixa->ixa_ire = ire; 1158 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 1159 } 1160 if (nce != NULL && nce->nce_is_condemned) { 1161 /* Can make it NULL as long as we set IRE_GENERATION_VERIFY */ 1162 nce_refrele(nce); 1163 ixa->ixa_nce = NULL; 1164 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; 1165 } 1166 if (dce != NULL && DCE_IS_CONDEMNED(dce)) { 1167 dce_refrele_notr(dce); 1168 dce = dce_get_default(ixa->ixa_ipst); 1169 ASSERT(dce != NULL); 1170 #ifdef DEBUG 1171 dce_refhold_notr(dce); 1172 dce_refrele(dce); 1173 #endif 1174 ixa->ixa_dce = dce; 1175 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; 1176 } 1177 } 1178 1179 static mblk_t * 1180 tcp_ixa_cleanup_getmblk(conn_t *connp) 1181 { 1182 tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; 1183 int need_retry; 1184 mblk_t *mp; 1185 1186 mutex_enter(&tcps->tcps_ixa_cleanup_lock); 1187 1188 /* 1189 * It's possible that someone else came in and started cleaning up 1190 * another connection between the time we verified this one is not being 1191 * cleaned up and the time we actually get the shared mblk. If that's 1192 * the case, we've dropped the lock, and some other thread may have 1193 * cleaned up this connection again, and is still waiting for 1194 * notification of that cleanup's completion. Therefore we need to 1195 * recheck. 1196 */ 1197 do { 1198 need_retry = 0; 1199 while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) { 1200 cv_wait(&tcps->tcps_ixa_cleanup_done_cv, 1201 &tcps->tcps_ixa_cleanup_lock); 1202 } 1203 1204 while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { 1205 /* 1206 * Multiple concurrent cleanups; need to have the last 1207 * one run since it could be an unplumb. 1208 */ 1209 need_retry = 1; 1210 cv_wait(&tcps->tcps_ixa_cleanup_ready_cv, 1211 &tcps->tcps_ixa_cleanup_lock); 1212 } 1213 } while (need_retry); 1214 1215 /* 1216 * We now have the lock and the mblk; now make sure that no one else can 1217 * try to clean up this connection or enqueue it for cleanup, clear the 1218 * mblk pointer for this stack, drop the lock, and return the mblk. 1219 */ 1220 ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock)); 1221 ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE); 1222 ASSERT(tcps->tcps_ixa_cleanup_mp == mp); 1223 ASSERT(mp != NULL); 1224 1225 connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS; 1226 tcps->tcps_ixa_cleanup_mp = NULL; 1227 mutex_exit(&tcps->tcps_ixa_cleanup_lock); 1228 1229 return (mp); 1230 } 1231 1232 /* 1233 * Used to run ixa_cleanup_stale inside the tcp squeue. 1234 * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp 1235 * and waking up the caller. 
1236 */ 1237 /* ARGSUSED2 */ 1238 static void 1239 tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2, 1240 ip_recv_attr_t *dummy) 1241 { 1242 conn_t *connp = (conn_t *)arg; 1243 tcp_stack_t *tcps; 1244 1245 tcps = connp->conn_netstack->netstack_tcp; 1246 1247 ixa_cleanup_stale(connp->conn_ixa); 1248 1249 mutex_enter(&tcps->tcps_ixa_cleanup_lock); 1250 ASSERT(tcps->tcps_ixa_cleanup_mp == NULL); 1251 connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE; 1252 tcps->tcps_ixa_cleanup_mp = mp; 1253 cv_signal(&tcps->tcps_ixa_cleanup_ready_cv); 1254 /* 1255 * It is possible for any number of threads to be waiting for cleanup of 1256 * different connections. Absent a per-connection (or per-IXA) CV, we 1257 * need to wake them all up even though only one can be waiting on this 1258 * particular cleanup. 1259 */ 1260 cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); 1261 mutex_exit(&tcps->tcps_ixa_cleanup_lock); 1262 } 1263 1264 static void 1265 tcp_ixa_cleanup_wait_and_finish(conn_t *connp) 1266 { 1267 tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; 1268 1269 mutex_enter(&tcps->tcps_ixa_cleanup_lock); 1270 1271 ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE); 1272 1273 while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) { 1274 cv_wait(&tcps->tcps_ixa_cleanup_done_cv, 1275 &tcps->tcps_ixa_cleanup_lock); 1276 } 1277 1278 ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE); 1279 connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE; 1280 cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); 1281 1282 mutex_exit(&tcps->tcps_ixa_cleanup_lock); 1283 } 1284 1285 /* 1286 * ipcl_walk() function to help release any IRE, NCE, or DCEs that 1287 * have been deleted and are marked as condemned. 1288 * Note that we can't cleanup the pointers since there can be threads 1289 * in conn_ip_output() sending while we are called. 1290 */ 1291 void 1292 conn_ixa_cleanup(conn_t *connp, void *arg) 1293 { 1294 boolean_t tryhard = (boolean_t)arg; 1295 1296 if (IPCL_IS_TCP(connp)) { 1297 mblk_t *mp; 1298 1299 mp = tcp_ixa_cleanup_getmblk(connp); 1300 1301 if (connp->conn_sqp->sq_run == curthread) { 1302 /* Already on squeue */ 1303 tcp_ixa_cleanup(connp, mp, NULL, NULL); 1304 } else { 1305 CONN_INC_REF(connp); 1306 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, 1307 connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); 1308 } 1309 tcp_ixa_cleanup_wait_and_finish(connp); 1310 } else if (IPCL_IS_SCTP(connp)) { 1311 sctp_t *sctp; 1312 sctp_faddr_t *fp; 1313 1314 sctp = CONN2SCTP(connp); 1315 RUN_SCTP(sctp); 1316 ixa_cleanup_stale(connp->conn_ixa); 1317 for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next) 1318 ixa_cleanup_stale(fp->sf_ixa); 1319 WAKE_SCTP(sctp); 1320 } else { 1321 ip_xmit_attr_t *ixa; 1322 1323 /* 1324 * If there is a different thread using conn_ixa then we get a 1325 * new copy and cut the old one loose from conn_ixa. Otherwise 1326 * we use conn_ixa and prevent any other thread from 1327 * using/changing it. Anybody using conn_ixa (e.g., a thread in 1328 * conn_ip_output) will do an ixa_refrele which will remove any 1329 * references on the ire etc. 1330 * 1331 * Once we are done other threads can use conn_ixa since the 1332 * refcnt will be back at one. 1333 * 1334 * We are called either because an ill is going away, or 1335 * due to memory reclaim. In the former case we wait for 1336 * memory since we must remove the refcnts on the ill. 
1337 */ 1338 if (tryhard) { 1339 ixa = conn_get_ixa_tryhard(connp, B_TRUE); 1340 ASSERT(ixa != NULL); 1341 } else { 1342 ixa = conn_get_ixa(connp, B_TRUE); 1343 if (ixa == NULL) { 1344 /* 1345 * Somebody else was using it and kmem_alloc 1346 * failed! Next memory reclaim will try to 1347 * clean up. 1348 */ 1349 DTRACE_PROBE1(conn__ixa__cleanup__bail, 1350 conn_t *, connp); 1351 return; 1352 } 1353 } 1354 ixa_cleanup_stale(ixa); 1355 ixa_refrele(ixa); 1356 } 1357 } 1358 1359 /* 1360 * ixa needs to be an exclusive copy so that no one changes the cookie 1361 * or the ixa_nce. 1362 */ 1363 boolean_t 1364 ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa) 1365 { 1366 uintptr_t cookie = ixa->ixa_cookie; 1367 ill_dld_direct_t *idd; 1368 idl_tx_list_t *idl_txl; 1369 ill_t *ill = ixa->ixa_nce->nce_ill; 1370 boolean_t inserted = B_FALSE; 1371 1372 idd = &(ill)->ill_dld_capab->idc_direct; 1373 idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; 1374 mutex_enter(&idl_txl->txl_lock); 1375 1376 /* 1377 * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow 1378 * control is asserted on an ill that does not support direct calls. 1379 * Jump to insert. 1380 */ 1381 if (cookie == 0) 1382 goto tryinsert; 1383 1384 ASSERT(ILL_DIRECT_CAPABLE(ill)); 1385 1386 if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) { 1387 DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie); 1388 } else if (idl_txl->txl_cookie != NULL && 1389 idl_txl->txl_cookie != ixa->ixa_cookie) { 1390 DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie, 1391 uintptr_t, idl_txl->txl_cookie); 1392 /* TODO: bump kstat for cookie collision */ 1393 } else { 1394 /* 1395 * Check/set conn_blocked under conn_lock. Note that txl_lock 1396 * will not suffice since two separate UDP threads may be 1397 * racing to send to different destinations that are 1398 * associated with different cookies and thus may not be 1399 * holding the same txl_lock. Further, since a given conn_t 1400 * can only be on a single drain list, the conn_t will be 1401 * enqueued on whichever thread wins this race. 1402 */ 1403 tryinsert: mutex_enter(&connp->conn_lock); 1404 if (connp->conn_blocked) { 1405 DTRACE_PROBE1(ill__tx__conn__already__blocked, 1406 conn_t *, connp); 1407 mutex_exit(&connp->conn_lock); 1408 } else { 1409 connp->conn_blocked = B_TRUE; 1410 mutex_exit(&connp->conn_lock); 1411 idl_txl->txl_cookie = cookie; 1412 conn_drain_insert(connp, idl_txl); 1413 if (!IPCL_IS_NONSTR(connp)) 1414 noenable(connp->conn_wq); 1415 inserted = B_TRUE; 1416 } 1417 } 1418 mutex_exit(&idl_txl->txl_lock); 1419 return (inserted); 1420 } 1421