xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_attr.c (revision 9164a50bf932130cbb5097a16f6986873ce0e6e5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * Copyright 2019 Joyent, Inc.
29  * Copyright 2024 Oxide Computer Company
30  */
31 
32 #include <sys/types.h>
33 #include <sys/stream.h>
34 #include <sys/strsun.h>
35 #include <sys/zone.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/atomic.h>
41 
42 #include <sys/systm.h>
43 #include <sys/param.h>
44 #include <sys/kmem.h>
45 #include <sys/sdt.h>
46 #include <sys/socket.h>
47 #include <sys/mac.h>
48 #include <net/if.h>
49 #include <net/if_arp.h>
50 #include <net/route.h>
51 #include <sys/sockio.h>
52 #include <netinet/in.h>
53 #include <net/if_dl.h>
54 
55 #include <inet/common.h>
56 #include <inet/mi.h>
57 #include <inet/mib2.h>
58 #include <inet/nd.h>
59 #include <inet/arp.h>
60 #include <inet/snmpcom.h>
61 #include <inet/kstatcom.h>
62 
63 #include <netinet/igmp_var.h>
64 #include <netinet/ip6.h>
65 #include <netinet/icmp6.h>
66 #include <netinet/sctp.h>
67 
68 #include <inet/ip.h>
69 #include <inet/ip_impl.h>
70 #include <inet/ip6.h>
71 #include <inet/ip6_asp.h>
72 #include <inet/tcp.h>
73 #include <inet/ip_multi.h>
74 #include <inet/ip_if.h>
75 #include <inet/ip_ire.h>
76 #include <inet/ip_ftable.h>
77 #include <inet/ip_rts.h>
78 #include <inet/optcom.h>
79 #include <inet/ip_ndp.h>
80 #include <inet/ip_listutils.h>
81 #include <netinet/igmp.h>
82 #include <netinet/ip_mroute.h>
83 #include <inet/ipp_common.h>
84 
85 #include <net/pfkeyv2.h>
86 #include <inet/sadb.h>
87 #include <inet/ipsec_impl.h>
88 #include <inet/ipdrop.h>
89 #include <inet/ip_netinfo.h>
90 #include <sys/squeue_impl.h>
91 #include <sys/squeue.h>
92 
93 #include <inet/ipclassifier.h>
94 #include <inet/sctp_ip.h>
95 #include <inet/sctp/sctp_impl.h>
96 #include <inet/udp_impl.h>
97 #include <sys/sunddi.h>
98 
99 #include <sys/tsol/label.h>
100 #include <sys/tsol/tnet.h>
101 
102 /*
103  * Release a reference on an ip_xmit_attr_t.
104  * The reference is acquired by conn_get_ixa().
105  *
106  * This macro has a lowercase function-call version for callers outside
107  * this file.
108  */
109 #define	IXA_REFRELE(ixa)					\
110 {								\
111 	if (atomic_dec_32_nv(&(ixa)->ixa_refcnt) == 0)	\
112 		ixa_inactive(ixa);				\
113 }
114 
115 #define	IXA_REFHOLD(ixa)					\
116 {								\
117 	ASSERT3U((ixa)->ixa_refcnt, !=, 0);			\
118 	atomic_inc_32(&(ixa)->ixa_refcnt);			\
119 }
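
/*
 * Illustrative sketch (not part of the build): code in this file pairs
 * IXA_REFHOLD()/IXA_REFRELE() directly, while callers elsewhere use the
 * lowercase ixa_refrele() defined later in this file, e.g.:
 *
 *	ip_xmit_attr_t *ixa = conn_get_ixa(connp, B_FALSE);
 *	if (ixa != NULL) {
 *		... use the transmit attributes ...
 *		ixa_refrele(ixa);	last release calls ixa_inactive()
 *	}
 */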
120 
121 /*
122  * When we need to handle a transmit-side asynchronous operation, we save
123  * sufficient information to later call the fragment and postfrag functions.
124  * That information is captured in an mblk containing this structure.
125  *
126  * Since this is currently only used for IPsec, we include information for
127  * the kernel crypto framework.
128  */
129 typedef struct ixamblk_s {
130 	boolean_t	ixm_inbound;	/* B_FALSE */
131 	iaflags_t	ixm_flags;	/* ixa_flags */
132 	netstackid_t	ixm_stackid;	/* Verify it didn't go away */
133 	uint_t		ixm_ifindex;	/* Used to find the nce */
134 	in6_addr_t	ixm_nceaddr_v6;	/* Used to find nce */
135 #define	ixm_nceaddr_v4	V4_PART_OF_V6(ixm_nceaddr_v6)
136 	uint32_t	ixm_fragsize;
137 	uint_t		ixm_pktlen;
138 	uint16_t	ixm_ip_hdr_length; /* Points to ULP header */
139 	uint8_t		ixm_protocol;	/* Protocol number for ULP cksum */
140 	pfirepostfrag_t	ixm_postfragfn;
141 
142 	zoneid_t	ixm_zoneid;		/* Needed for ipobs */
143 	zoneid_t	ixm_no_loop_zoneid;	/* IXAF_NO_LOOP_ZONEID_SET */
144 
145 	uint_t		ixm_scopeid;		/* For IPv6 link-locals */
146 
147 	uint32_t	ixm_ident;		/* For IPv6 fragment header */
148 	uint32_t	ixm_xmit_hint;
149 
150 	uint64_t	ixm_conn_id;		/* Used by DTrace */
151 	cred_t		*ixm_cred;	/* For getpeerucred - refhold if set */
152 	pid_t		ixm_cpid;	/* For getpeerucred */
153 
154 	ts_label_t	*ixm_tsl;	/* Refhold if set. */
155 
156 	/*
157 	 * When the pointers below are set they have a refhold on the struct.
158 	 */
159 	ipsec_latch_t		*ixm_ipsec_latch;
160 	struct ipsa_s		*ixm_ipsec_ah_sa;	/* SA for AH */
161 	struct ipsa_s		*ixm_ipsec_esp_sa;	/* SA for ESP */
162 	struct ipsec_policy_s	*ixm_ipsec_policy;	/* why are we here? */
163 	struct ipsec_action_s	*ixm_ipsec_action; /* For reflected packets */
164 
165 	ipsa_ref_t		ixm_ipsec_ref[2]; /* Soft reference to SA */
166 
167 	/* Need these while waiting for SA */
168 	uint16_t ixm_ipsec_src_port;	/* Source port number of d-gram. */
169 	uint16_t ixm_ipsec_dst_port;	/* Destination port number of d-gram. */
170 	uint8_t  ixm_ipsec_icmp_type;	/* ICMP type of d-gram */
171 	uint8_t  ixm_ipsec_icmp_code;	/* ICMP code of d-gram */
172 
173 	sa_family_t ixm_ipsec_inaf;	/* Inner address family */
174 	uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];	/* Inner src address */
175 	uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];	/* Inner dest address */
176 	uint8_t  ixm_ipsec_insrcpfx;	/* Inner source prefix */
177 	uint8_t  ixm_ipsec_indstpfx;	/* Inner destination prefix */
178 
179 	uint8_t ixm_ipsec_proto;	/* IP protocol number for d-gram. */
180 } ixamblk_t;
181 
182 
183 /*
184  * When we need to handle a receive-side asynchronous operation, we save
185  * sufficient information so that we can later call ip_fanout.
186  * That information is captured in an mblk containing this structure.
187  *
188  * Since this is currently only used for IPsec, we include information for
189  * the kernel crypto framework.
190  */
191 typedef struct iramblk_s {
192 	boolean_t	irm_inbound;	/* B_TRUE */
193 	iaflags_t	irm_flags;	/* ira_flags */
194 	netstackid_t	irm_stackid;	/* Verify it didn't go away */
195 	uint_t		irm_ifindex;	/* To find ira_ill */
196 
197 	uint_t		irm_rifindex;	/* ira_rifindex */
198 	uint_t		irm_ruifindex;	/* ira_ruifindex */
199 	uint_t		irm_pktlen;
200 	uint16_t	irm_ip_hdr_length; /* Points to ULP header */
201 	uint8_t		irm_protocol;	/* Protocol number for ULP cksum */
202 	uint8_t		irm_ttl;	/* IP TTL, IPv6 hop limit */
203 	zoneid_t	irm_zoneid;	/* ALL_ZONES unless local delivery */
204 
205 	squeue_t	*irm_sqp;
206 	ill_rx_ring_t	*irm_ring;
207 
208 	ipaddr_t	irm_mroute_tunnel;	/* IRAF_MROUTE_TUNNEL_SET */
209 	zoneid_t	irm_no_loop_zoneid;	/* IRAF_NO_LOOP_ZONEID_SET */
210 	uint32_t	irm_esp_udp_ports;	/* IRAF_ESP_UDP_PORTS */
211 
212 	char		irm_l2src[IRA_L2SRC_SIZE];	/* If IRAF_L2SRC_SET */
213 
214 	cred_t		*irm_cred;	/* For getpeerucred - refhold if set */
215 	pid_t		irm_cpid;	/* For getpeerucred */
216 
217 	ts_label_t	*irm_tsl;	/* Refhold if set. */
218 
219 	/*
220 	 * When set these correspond to a refhold on the object.
221 	 */
222 	struct ipsa_s		*irm_ipsec_ah_sa;	/* SA for AH */
223 	struct ipsa_s		*irm_ipsec_esp_sa;	/* SA for ESP */
224 	struct ipsec_action_s	*irm_ipsec_action; /* For reflected packets */
225 } iramblk_t;
226 
227 
228 /*
229  * Take the information in ip_xmit_attr_t and stick it in an mblk
230  * that can later be passed to ip_xmit_attr_from_mblk to recreate the
231  * ip_xmit_attr_t.
232  *
233  * Returns NULL on memory allocation failure.
234  */
235 mblk_t *
236 ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
237 {
238 	mblk_t		*ixamp;
239 	ixamblk_t	*ixm;
240 	nce_t		*nce = ixa->ixa_nce;
241 
242 	ASSERT(nce != NULL);
243 	ixamp = allocb(sizeof (*ixm), BPRI_MED);
244 	if (ixamp == NULL)
245 		return (NULL);
246 
247 	ixamp->b_datap->db_type = M_BREAK;
248 	ixamp->b_wptr += sizeof (*ixm);
249 	ixm = (ixamblk_t *)ixamp->b_rptr;
250 
251 	bzero(ixm, sizeof (*ixm));
252 	ixm->ixm_inbound = B_FALSE;
253 	ixm->ixm_flags = ixa->ixa_flags;
254 	ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
255 	ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
256 	ixm->ixm_nceaddr_v6 = nce->nce_addr;
257 	ixm->ixm_fragsize = ixa->ixa_fragsize;
258 	ixm->ixm_pktlen = ixa->ixa_pktlen;
259 	ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
260 	ixm->ixm_protocol = ixa->ixa_protocol;
261 	ixm->ixm_postfragfn = ixa->ixa_postfragfn;
262 	ixm->ixm_zoneid = ixa->ixa_zoneid;
263 	ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
264 	ixm->ixm_scopeid = ixa->ixa_scopeid;
265 	ixm->ixm_ident = ixa->ixa_ident;
266 	ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;
267 
268 	if (ixa->ixa_tsl != NULL) {
269 		ixm->ixm_tsl = ixa->ixa_tsl;
270 		label_hold(ixm->ixm_tsl);
271 	}
272 	if (ixa->ixa_cred != NULL) {
273 		ixm->ixm_cred = ixa->ixa_cred;
274 		crhold(ixa->ixa_cred);
275 	}
276 	ixm->ixm_cpid = ixa->ixa_cpid;
277 	ixm->ixm_conn_id = ixa->ixa_conn_id;
278 
279 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
280 		if (ixa->ixa_ipsec_ah_sa != NULL) {
281 			ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
282 			IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
283 		}
284 		if (ixa->ixa_ipsec_esp_sa != NULL) {
285 			ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
286 			IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
287 		}
288 		if (ixa->ixa_ipsec_policy != NULL) {
289 			ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
290 			IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
291 		}
292 		if (ixa->ixa_ipsec_action != NULL) {
293 			ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
294 			IPACT_REFHOLD(ixa->ixa_ipsec_action);
295 		}
296 		if (ixa->ixa_ipsec_latch != NULL) {
297 			ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
298 			IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
299 		}
300 		ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
301 		ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
302 		ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
303 		ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
304 		ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
305 		ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
306 		ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
307 		ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
308 		ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
309 		ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
310 		ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
311 		ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
312 		ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
313 		ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
314 		ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
315 		ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
316 		ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
317 		ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
318 	}
319 	return (ixamp);
320 }
321 
322 /*
323  * Extract the ip_xmit_attr_t from the mblk, checking that the
324  * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
325  * not the case.
326  *
327  * Otherwise ixa is updated.
328  * The caller is responsible for releasing the references held in the ixa,
329  * e.g., with ixa_cleanup() (ixa_refrele() would also free the ixa itself).
330  */
331 boolean_t
332 ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
333 {
334 	ixamblk_t	*ixm;
335 	netstack_t	*ns;
336 	ip_stack_t	*ipst;
337 	ill_t		*ill;
338 	nce_t		*nce;
339 
340 	/* We assume the caller hasn't initialized ixa */
341 	bzero(ixa, sizeof (*ixa));
342 
343 	ASSERT(DB_TYPE(ixamp) == M_BREAK);
344 	ASSERT(ixamp->b_cont == NULL);
345 
346 	ixm = (ixamblk_t *)ixamp->b_rptr;
347 	ASSERT(!ixm->ixm_inbound);
348 
349 	/* Verify the netstack is still around */
350 	ns = netstack_find_by_stackid(ixm->ixm_stackid);
351 	if (ns == NULL) {
352 		/* Disappeared on us */
353 		(void) ip_xmit_attr_free_mblk(ixamp);
354 		return (B_FALSE);
355 	}
356 	ipst = ns->netstack_ip;
357 
358 	/* Verify the ill is still around */
359 	ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
360 	    !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);
361 
362 	/* We have the ill, hence the netstack can't go away */
363 	netstack_rele(ns);
364 	if (ill == NULL) {
365 		/* Disappeared on us */
366 		(void) ip_xmit_attr_free_mblk(ixamp);
367 		return (B_FALSE);
368 	}
369 	/*
370 	 * Find the nce. We don't load-spread (only lookup nce's on the ill)
371 	 * because we want to find the same nce as the one we had when
372 	 * ip_xmit_attr_to_mblk was called.
373 	 */
374 	if (ixm->ixm_flags & IXAF_IS_IPV4) {
375 		nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
376 	} else {
377 		nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
378 	}
379 
380 	/* We have the nce, hence the ill can't go away */
381 	ill_refrele(ill);
382 	if (nce == NULL) {
383 		/*
384 		 * Since this is unusual and we don't know what type of
385 		 * nce it was, we drop the packet.
386 		 */
387 		(void) ip_xmit_attr_free_mblk(ixamp);
388 		return (B_FALSE);
389 	}
390 
391 	ixa->ixa_flags = ixm->ixm_flags;
392 	ixa->ixa_refcnt = 1;
393 	ixa->ixa_ipst = ipst;
394 	ixa->ixa_fragsize = ixm->ixm_fragsize;
395 	ixa->ixa_pktlen =  ixm->ixm_pktlen;
396 	ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
397 	ixa->ixa_protocol = ixm->ixm_protocol;
398 	ixa->ixa_nce = nce;
399 	ixa->ixa_postfragfn = ixm->ixm_postfragfn;
400 	ixa->ixa_zoneid = ixm->ixm_zoneid;
401 	ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
402 	ixa->ixa_scopeid = ixm->ixm_scopeid;
403 	ixa->ixa_ident = ixm->ixm_ident;
404 	ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;
405 
406 	if (ixm->ixm_tsl != NULL) {
407 		ixa->ixa_tsl = ixm->ixm_tsl;
408 		ixa->ixa_free_flags |= IXA_FREE_TSL;
409 		ixm->ixm_tsl = NULL;
410 	}
411 	if (ixm->ixm_cred != NULL) {
412 		ixa->ixa_cred = ixm->ixm_cred;
413 		ixa->ixa_free_flags |= IXA_FREE_CRED;
414 		ixm->ixm_cred = NULL;
415 	}
416 	ixa->ixa_cpid = ixm->ixm_cpid;
417 	ixa->ixa_conn_id = ixm->ixm_conn_id;
418 
419 	ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
420 	ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
421 	ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
422 	ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
423 	ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;
424 
425 	ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
426 	ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
427 	ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
428 	ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
429 	ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
430 	ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
431 	ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
432 	ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
433 	ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
434 	ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
435 	ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
436 	ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
437 	ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
438 	ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
439 	ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
440 	ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
441 	ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
442 	ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;
443 
444 	freeb(ixamp);
445 	return (B_TRUE);
446 }
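
/*
 * Illustrative sketch (not part of the build; variable names are
 * illustrative) of the round trip above, as used when a transmit must be
 * completed asynchronously (currently the IPsec path): capture the
 * attributes up front and restore them on resumption.
 *
 *	mblk_t *attrmp = ip_xmit_attr_to_mblk(ixa);
 *	if (attrmp == NULL)
 *		... allocation failed; handle synchronously or drop ...
 *	attrmp->b_cont = datamp;
 *	... hand attrmp off for asynchronous completion ...
 *
 * and later, when the deferred work runs:
 *
 *	ip_xmit_attr_t	ixas;
 *	datamp = attrmp->b_cont;
 *	attrmp->b_cont = NULL;
 *	if (!ip_xmit_attr_from_mblk(attrmp, &ixas))
 *		... netstack/ill/nce disappeared; attrmp was freed ...
 *	... transmit datamp using &ixas ...
 *	ixa_cleanup(&ixas);
 */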
447 
448 /*
449  * Free the ixm mblk and any references it holds
450  * Returns b_cont.
451  */
452 mblk_t *
453 ip_xmit_attr_free_mblk(mblk_t *ixamp)
454 {
455 	ixamblk_t	*ixm;
456 	mblk_t		*mp;
457 
458 	/* Consume mp */
459 	ASSERT(DB_TYPE(ixamp) == M_BREAK);
460 	mp = ixamp->b_cont;
461 
462 	ixm = (ixamblk_t *)ixamp->b_rptr;
463 	ASSERT(!ixm->ixm_inbound);
464 
465 	if (ixm->ixm_ipsec_ah_sa != NULL) {
466 		IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
467 		ixm->ixm_ipsec_ah_sa = NULL;
468 	}
469 	if (ixm->ixm_ipsec_esp_sa != NULL) {
470 		IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
471 		ixm->ixm_ipsec_esp_sa = NULL;
472 	}
473 	if (ixm->ixm_ipsec_policy != NULL) {
474 		IPPOL_REFRELE(ixm->ixm_ipsec_policy);
475 		ixm->ixm_ipsec_policy = NULL;
476 	}
477 	if (ixm->ixm_ipsec_action != NULL) {
478 		IPACT_REFRELE(ixm->ixm_ipsec_action);
479 		ixm->ixm_ipsec_action = NULL;
480 	}
481 	if (ixm->ixm_ipsec_latch) {
482 		IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
483 		ixm->ixm_ipsec_latch = NULL;
484 	}
485 
486 	if (ixm->ixm_tsl != NULL) {
487 		label_rele(ixm->ixm_tsl);
488 		ixm->ixm_tsl = NULL;
489 	}
490 	if (ixm->ixm_cred != NULL) {
491 		crfree(ixm->ixm_cred);
492 		ixm->ixm_cred = NULL;
493 	}
494 	freeb(ixamp);
495 	return (mp);
496 }
497 
498 /*
499  * Take the information in ip_recv_attr_t and stick it in an mblk
500  * that can later be passed to ip_recv_attr_from_mblk to recreate the
501  * ip_recv_attr_t.
502  *
503  * Returns NULL on memory allocation failure.
504  */
505 mblk_t *
506 ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
507 {
508 	mblk_t		*iramp;
509 	iramblk_t	*irm;
510 	ill_t		*ill = ira->ira_ill;
511 
512 	ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);
513 
514 	iramp = allocb(sizeof (*irm), BPRI_MED);
515 	if (iramp == NULL)
516 		return (NULL);
517 
518 	iramp->b_datap->db_type = M_BREAK;
519 	iramp->b_wptr += sizeof (*irm);
520 	irm = (iramblk_t *)iramp->b_rptr;
521 
522 	bzero(irm, sizeof (*irm));
523 	irm->irm_inbound = B_TRUE;
524 	irm->irm_flags = ira->ira_flags;
525 	if (ill != NULL) {
526 		/* Internal to IP - preserve ip_stack_t, ill and rill */
527 		irm->irm_stackid =
528 		    ill->ill_ipst->ips_netstack->netstack_stackid;
529 		irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
530 		ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
531 		    ira->ira_rifindex);
532 	} else {
533 		/* Let ip_recv_attr_from_mblk know there isn't one */
534 		irm->irm_stackid = -1;
535 	}
536 	irm->irm_rifindex = ira->ira_rifindex;
537 	irm->irm_ruifindex = ira->ira_ruifindex;
538 	irm->irm_pktlen = ira->ira_pktlen;
539 	irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
540 	irm->irm_protocol = ira->ira_protocol;
541 	irm->irm_ttl = ira->ira_ttl;
542 
543 	irm->irm_sqp = ira->ira_sqp;
544 	irm->irm_ring = ira->ira_ring;
545 
546 	irm->irm_zoneid = ira->ira_zoneid;
547 	irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
548 	irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
549 	irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;
550 
551 	if (ira->ira_tsl != NULL) {
552 		irm->irm_tsl = ira->ira_tsl;
553 		label_hold(irm->irm_tsl);
554 	}
555 	if (ira->ira_cred != NULL) {
556 		irm->irm_cred = ira->ira_cred;
557 		crhold(ira->ira_cred);
558 	}
559 	irm->irm_cpid = ira->ira_cpid;
560 
561 	if (ira->ira_flags & IRAF_L2SRC_SET)
562 		bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);
563 
564 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
565 		if (ira->ira_ipsec_ah_sa != NULL) {
566 			irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
567 			IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
568 		}
569 		if (ira->ira_ipsec_esp_sa != NULL) {
570 			irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
571 			IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
572 		}
573 		if (ira->ira_ipsec_action != NULL) {
574 			irm->irm_ipsec_action = ira->ira_ipsec_action;
575 			IPACT_REFHOLD(ira->ira_ipsec_action);
576 		}
577 	}
578 	return (iramp);
579 }
580 
581 /*
582  * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
583  * then irm_stackid is not -1, in which case we check that the
584  * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
585  * not the case.
586  * If irm_stackid is -1 then we are used by an ULP (e.g., squeue_enter)
587  * and we just proceed with ira_ill and ira_rill as NULL.
588  *
589  * The caller needs to release any references on the pointers inside the ira
590  * by calling ira_cleanup.
591  */
592 boolean_t
593 ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
594 {
595 	iramblk_t	*irm;
596 	netstack_t	*ns;
597 	ip_stack_t	*ipst = NULL;
598 	ill_t		*ill = NULL, *rill = NULL;
599 
600 	/* We assume the caller hasn't initialized ira */
601 	bzero(ira, sizeof (*ira));
602 
603 	ASSERT(DB_TYPE(iramp) == M_BREAK);
604 	ASSERT(iramp->b_cont == NULL);
605 
606 	irm = (iramblk_t *)iramp->b_rptr;
607 	ASSERT(irm->irm_inbound);
608 
609 	if (irm->irm_stackid != -1) {
610 		/* Verify the netstack is still around */
611 		ns = netstack_find_by_stackid(irm->irm_stackid);
612 		if (ns == NULL) {
613 			/* Disappeared on us */
614 			(void) ip_recv_attr_free_mblk(iramp);
615 			return (B_FALSE);
616 		}
617 		ipst = ns->netstack_ip;
618 
619 		/* Verify the ill is still around */
620 		ill = ill_lookup_on_ifindex(irm->irm_ifindex,
621 		    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
622 
623 		if (irm->irm_ifindex == irm->irm_rifindex) {
624 			rill = ill;
625 		} else {
626 			rill = ill_lookup_on_ifindex(irm->irm_rifindex,
627 			    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
628 		}
629 
630 		/* We have the ill, hence the netstack can't go away */
631 		netstack_rele(ns);
632 		if (ill == NULL || rill == NULL) {
633 			/* Disappeared on us */
634 			if (ill != NULL)
635 				ill_refrele(ill);
636 			if (rill != NULL && rill != ill)
637 				ill_refrele(rill);
638 			(void) ip_recv_attr_free_mblk(iramp);
639 			return (B_FALSE);
640 		}
641 	}
642 
643 	ira->ira_flags = irm->irm_flags;
644 	/* Caller must ill_refrele(ira_ill) by using ira_cleanup() */
645 	ira->ira_ill = ill;
646 	ira->ira_rill = rill;
647 
648 	ira->ira_rifindex = irm->irm_rifindex;
649 	ira->ira_ruifindex = irm->irm_ruifindex;
650 	ira->ira_pktlen = irm->irm_pktlen;
651 	ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
652 	ira->ira_protocol = irm->irm_protocol;
653 	ira->ira_ttl = irm->irm_ttl;
654 
655 	ira->ira_sqp = irm->irm_sqp;
656 	/* The rest of IP assumes that the rings never go away. */
657 	ira->ira_ring = irm->irm_ring;
658 
659 	ira->ira_zoneid = irm->irm_zoneid;
660 	ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
661 	ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
662 	ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;
663 
664 	if (irm->irm_tsl != NULL) {
665 		ira->ira_tsl = irm->irm_tsl;
666 		ira->ira_free_flags |= IRA_FREE_TSL;
667 		irm->irm_tsl = NULL;
668 	}
669 	if (irm->irm_cred != NULL) {
670 		ira->ira_cred = irm->irm_cred;
671 		ira->ira_free_flags |= IRA_FREE_CRED;
672 		irm->irm_cred = NULL;
673 	}
674 	ira->ira_cpid = irm->irm_cpid;
675 
676 	if (ira->ira_flags & IRAF_L2SRC_SET)
677 		bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);
678 
679 	ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
680 	ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
681 	ira->ira_ipsec_action = irm->irm_ipsec_action;
682 
683 	freeb(iramp);
684 	return (B_TRUE);
685 }
686 
687 /*
688  * Free the irm mblk and any references it holds
689  * Returns b_cont.
690  */
691 mblk_t *
692 ip_recv_attr_free_mblk(mblk_t *iramp)
693 {
694 	iramblk_t	*irm;
695 	mblk_t		*mp;
696 
697 	/* Consume mp */
698 	ASSERT(DB_TYPE(iramp) == M_BREAK);
699 	mp = iramp->b_cont;
700 
701 	irm = (iramblk_t *)iramp->b_rptr;
702 	ASSERT(irm->irm_inbound);
703 
704 	if (irm->irm_ipsec_ah_sa != NULL) {
705 		IPSA_REFRELE(irm->irm_ipsec_ah_sa);
706 		irm->irm_ipsec_ah_sa = NULL;
707 	}
708 	if (irm->irm_ipsec_esp_sa != NULL) {
709 		IPSA_REFRELE(irm->irm_ipsec_esp_sa);
710 		irm->irm_ipsec_esp_sa = NULL;
711 	}
712 	if (irm->irm_ipsec_action != NULL) {
713 		IPACT_REFRELE(irm->irm_ipsec_action);
714 		irm->irm_ipsec_action = NULL;
715 	}
716 	if (irm->irm_tsl != NULL) {
717 		label_rele(irm->irm_tsl);
718 		irm->irm_tsl = NULL;
719 	}
720 	if (irm->irm_cred != NULL) {
721 		crfree(irm->irm_cred);
722 		irm->irm_cred = NULL;
723 	}
724 
725 	freeb(iramp);
726 	return (mp);
727 }
728 
729 /*
730  * Returns true if the mblk contains an ip_recv_attr_t
731  * For now we just check db_type.
732  */
733 boolean_t
734 ip_recv_attr_is_mblk(mblk_t *mp)
735 {
736 	/*
737 	 * Need to handle the various forms of tcp_timermp which are tagged
738 	 * with b_wptr and might have a NULL b_datap.
739 	 */
740 	if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
741 		return (B_FALSE);
742 
743 #ifdef	DEBUG
744 	iramblk_t	*irm;
745 
746 	if (DB_TYPE(mp) != M_BREAK)
747 		return (B_FALSE);
748 
749 	irm = (iramblk_t *)mp->b_rptr;
750 	ASSERT(irm->irm_inbound);
751 	return (B_TRUE);
752 #else
753 	return (DB_TYPE(mp) == M_BREAK);
754 #endif
755 }
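
/*
 * Illustrative sketch (not part of the build; variable names are
 * illustrative) of the receive-side round trip: ip_recv_attr_to_mblk()
 * captures the attributes before going asynchronous, and the resumed code
 * restores and then cleans them up.
 *
 *	mblk_t *attrmp = ip_recv_attr_to_mblk(ira);
 *	if (attrmp == NULL)
 *		... allocation failed; drop the packet ...
 *	attrmp->b_cont = datamp;
 *	... queue attrmp for asynchronous processing ...
 *
 * and later, when the deferred work runs:
 *
 *	ip_recv_attr_t	iras;
 *	datamp = attrmp->b_cont;
 *	attrmp->b_cont = NULL;
 *	if (!ip_recv_attr_from_mblk(attrmp, &iras))
 *		... ip_stack_t or ill_t disappeared; attrmp was freed ...
 *	... deliver datamp using &iras ...
 *	ira_cleanup(&iras, B_TRUE);
 */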
756 
757 static ip_xmit_attr_t *
758 conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
759 {
760 	ip_xmit_attr_t	*oldixa;	/* Already attached to conn_t */
761 	ip_xmit_attr_t	*ixa;		/* New one, which we return. */
762 
763 	/*
764 	 * NOTE: If the case marked below as common turns out not to be,
765 	 * move the kmem_alloc() up here and add a kmem_free() in what was
766 	 * marked as the (not really) common case instead.
767 	 */
768 
769 	mutex_enter(&connp->conn_lock);
770 	oldixa = connp->conn_ixa;
771 
772 	/* At least one reference for the conn_t */
773 	ASSERT3U(oldixa->ixa_refcnt, >=, 1);
774 	if (atomic_inc_32_nv(&oldixa->ixa_refcnt) == 2) {
775 		/* No other thread using conn_ixa (common case) */
776 		mutex_exit(&connp->conn_lock);
777 		return (oldixa);
778 	}
779 	/* Do allocation inside-the-conn_lock because it's less common. */
780 	ixa = kmem_alloc(sizeof (*ixa), kmflag);
781 	if (ixa == NULL) {
782 		mutex_exit(&connp->conn_lock);
783 		IXA_REFRELE(oldixa);
784 		return (NULL);
785 	}
786 	ixa_safe_copy(oldixa, ixa);
787 
788 	/* Make sure we drop conn_lock before any refrele */
789 	if (replace) {
790 		ixa->ixa_refcnt++;	/* No atomic needed - not visible */
791 		connp->conn_ixa = ixa;
792 		mutex_exit(&connp->conn_lock);
793 		IXA_REFRELE(oldixa);	/* Undo refcnt from conn_t */
794 	} else {
795 		mutex_exit(&connp->conn_lock);
796 	}
797 	IXA_REFRELE(oldixa);	/* Undo above atomic_inc_32_nv */
798 
799 	return (ixa);
800 }
801 
802 /*
803  * Return an ip_xmit_attr_t to use with a conn_t, ensuring that only
804  * the caller can access the returned ip_xmit_attr_t.
805  *
806  * If nobody else is using conn_ixa we return it.
807  * Otherwise we make a "safe" copy of conn_ixa
808  * and return it. The "safe" copy has the pointers set to NULL
809  * (since the pointers might be changed by another thread using
810  * conn_ixa). The caller needs to check for NULL pointers to see
811  * if ip_set_destination needs to be called to re-establish the pointers.
812  *
813  * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
814  * That is used when we connect() the ULP.
815  */
816 ip_xmit_attr_t *
817 conn_get_ixa(conn_t *connp, boolean_t replace)
818 {
819 	return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
820 }
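
/*
 * Illustrative sketch (not part of the build; names are illustrative): a
 * transmit path typically obtains a private ixa, re-establishes the
 * destination if the cached pointers were cleared by the safe copy, and
 * releases the ixa when done.
 *
 *	ip_xmit_attr_t *ixa = conn_get_ixa(connp, B_FALSE);
 *	if (ixa == NULL)
 *		... allocation failed ...
 *	if (ixa->ixa_ire == NULL)
 *		... call ip_set_destination to re-establish ire/nce/dce ...
 *	... send using ixa, e.g. via conn_ip_output() ...
 *	ixa_refrele(ixa);
 */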
821 
822 /*
823  * Used only when the alternative is to have the kernel hang because ixa
824  * references on ills etc. would never be cleaned up, so we block (KM_SLEEP).
825  */
826 ip_xmit_attr_t *
827 conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
828 {
829 	return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
830 }
831 
832 /*
833  * Replace conn_ixa with the ixa argument.
834  *
835  * The caller must hold conn_lock.
836  *
837  * We return the old ixa; the caller must ixa_refrele that after conn_lock
838  * has been dropped.
839  */
840 ip_xmit_attr_t *
841 conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
842 {
843 	ip_xmit_attr_t	*oldixa;
844 
845 	ASSERT(MUTEX_HELD(&connp->conn_lock));
846 
847 	oldixa = connp->conn_ixa;
848 	IXA_REFHOLD(ixa);
849 	ixa->ixa_conn_id = oldixa->ixa_conn_id;
850 	connp->conn_ixa = ixa;
851 	return (oldixa);
852 }
853 
854 /*
855  * Return an ip_xmit_attr_t to use with a conn_t that is based on but
856  * separate from conn_ixa.
857  *
858  * This "safe" copy has the pointers set to NULL
859  * (since the pointers might be changed by another thread using
860  * conn_ixa). The caller needs to check for NULL pointers to see
861  * if ip_set_destination needs to be called to re-establish the pointers.
862  */
863 ip_xmit_attr_t *
864 conn_get_ixa_exclusive(conn_t *connp)
865 {
866 	ip_xmit_attr_t *oldixa;
867 	ip_xmit_attr_t *ixa;
868 
869 	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP_LAZY);
870 	if (ixa == NULL)
871 		return (NULL);
872 
873 	mutex_enter(&connp->conn_lock);
874 
875 	oldixa = connp->conn_ixa;
876 	IXA_REFHOLD(oldixa);
877 
878 	ixa_safe_copy(oldixa, ixa);
879 	mutex_exit(&connp->conn_lock);
880 	IXA_REFRELE(oldixa);
881 	return (ixa);
882 }
883 
884 void
885 ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
886 {
887 	bcopy(src, ixa, sizeof (*ixa));
888 	ixa->ixa_refcnt = 1;
889 	/*
890 	 * Clear any pointers that have references and might be changed
891 	 * by ip_set_destination or the ULP
892 	 */
893 	ixa->ixa_ire = NULL;
894 	ixa->ixa_nce = NULL;
895 	ixa->ixa_dce = NULL;
896 	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
897 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
898 #ifdef DEBUG
899 	ixa->ixa_curthread = NULL;
900 #endif
901 	/* Clear all the IPsec pointers and the flag as well. */
902 	ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
903 
904 	ixa->ixa_ipsec_latch = NULL;
905 	ixa->ixa_ipsec_ah_sa = NULL;
906 	ixa->ixa_ipsec_esp_sa = NULL;
907 	ixa->ixa_ipsec_policy = NULL;
908 	ixa->ixa_ipsec_action = NULL;
909 
910 	/*
911 	 * We leave ixa_tsl unchanged, but if it has a refhold we need
912 	 * to get an extra refhold.
913 	 */
914 	if (ixa->ixa_free_flags & IXA_FREE_TSL)
915 		label_hold(ixa->ixa_tsl);
916 
917 	/*
918 	 * We leave ixa_cred unchanged, but if it has a refhold we need
919 	 * to get an extra refhold.
920 	 */
921 	if (ixa->ixa_free_flags & IXA_FREE_CRED)
922 		crhold(ixa->ixa_cred);
923 
924 	/*
925 	 * There is no cleanup in progress on this new copy.
926 	 */
927 	ixa->ixa_tcpcleanup = IXATC_IDLE;
928 }
929 
930 /*
931  * Duplicate an ip_xmit_attr_t.
932  * Assumes that the caller controls the ixa, hence we do not need to use
933  * a safe copy. We just have to increase the refcnt on any pointers.
934  */
935 ip_xmit_attr_t *
936 ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
937 {
938 	ip_xmit_attr_t *ixa;
939 
940 	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
941 	if (ixa == NULL)
942 		return (NULL);
943 	bcopy(src_ixa, ixa, sizeof (*ixa));
944 	ixa->ixa_refcnt = 1;
945 
946 	if (ixa->ixa_ire != NULL)
947 		ire_refhold_notr(ixa->ixa_ire);
948 	if (ixa->ixa_nce != NULL)
949 		nce_refhold(ixa->ixa_nce);
950 	if (ixa->ixa_dce != NULL)
951 		dce_refhold_notr(ixa->ixa_dce);
952 
953 #ifdef DEBUG
954 	ixa->ixa_curthread = NULL;
955 #endif
956 
957 	if (ixa->ixa_ipsec_latch != NULL)
958 		IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
959 	if (ixa->ixa_ipsec_ah_sa != NULL)
960 		IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
961 	if (ixa->ixa_ipsec_esp_sa != NULL)
962 		IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
963 	if (ixa->ixa_ipsec_policy != NULL)
964 		IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
965 	if (ixa->ixa_ipsec_action != NULL)
966 		IPACT_REFHOLD(ixa->ixa_ipsec_action);
967 
968 	if (ixa->ixa_tsl != NULL) {
969 		label_hold(ixa->ixa_tsl);
970 		ixa->ixa_free_flags |= IXA_FREE_TSL;
971 	}
972 	if (ixa->ixa_cred != NULL) {
973 		crhold(ixa->ixa_cred);
974 		ixa->ixa_free_flags |= IXA_FREE_CRED;
975 	}
976 	return (ixa);
977 }
978 
979 /*
980  * Used to replace the ixa_tsl field.
981  * The caller should have a reference on the label, which we transfer to
982  * the attributes so that when the attribute is freed/cleaned up
983  * we will release that reference.
984  */
985 void
986 ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
987 {
988 	ASSERT(tsl != NULL);
989 
990 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
991 		ASSERT(ixa->ixa_tsl != NULL);
992 		label_rele(ixa->ixa_tsl);
993 	} else {
994 		ixa->ixa_free_flags |= IXA_FREE_TSL;
995 	}
996 	ixa->ixa_tsl = tsl;
997 }
998 
999 /*
1000  * Replace the ip_recv_attr_t's label.
1001  * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
1002  * TCP/UDP uses ira_cred to set db_credp for non-socket users.
1003  * This can fail (and return B_FALSE) due to lack of memory.
1004  */
1005 boolean_t
1006 ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
1007 {
1008 	cred_t	*newcr;
1009 
1010 	if (ira->ira_free_flags & IRA_FREE_TSL) {
1011 		ASSERT(ira->ira_tsl != NULL);
1012 		label_rele(ira->ira_tsl);
1013 	}
1014 	label_hold(tsl);
1015 	ira->ira_tsl = tsl;
1016 	ira->ira_free_flags |= IRA_FREE_TSL;
1017 
1018 	/*
1019 	 * Reset zoneid if we have a shared address. That allows
1020 	 * ip_fanout_tx_v4/v6 to determine the zoneid again.
1021 	 */
1022 	if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
1023 		ira->ira_zoneid = ALL_ZONES;
1024 
1025 	/* We update ira_cred for RPC */
1026 	newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
1027 	if (newcr == NULL)
1028 		return (B_FALSE);
1029 	if (ira->ira_free_flags & IRA_FREE_CRED)
1030 		crfree(ira->ira_cred);
1031 	ira->ira_cred = newcr;
1032 	ira->ira_free_flags |= IRA_FREE_CRED;
1033 	return (B_TRUE);
1034 }
1035 
1036 /*
1037  * This needs to be called after ip_set_destination/tsol_check_dest might
1038  * have changed ixa_tsl to be specific for a destination, and we now want to
1039  * send to a different destination.
1040  * We have to restart with crgetlabel() since ip_set_destination/
1041  * tsol_check_dest will start with ixa_tsl.
1042  */
1043 void
1044 ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
1045 {
1046 	if (!is_system_labeled())
1047 		return;
1048 
1049 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1050 		ASSERT(ixa->ixa_tsl != NULL);
1051 		label_rele(ixa->ixa_tsl);
1052 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1053 	}
1054 	ixa->ixa_tsl = crgetlabel(cr);
1055 }
1056 
1057 void
1058 ixa_refrele(ip_xmit_attr_t *ixa)
1059 {
1060 	IXA_REFRELE(ixa);
1061 }
1062 
1063 void
1064 ixa_inactive(ip_xmit_attr_t *ixa)
1065 {
1066 	ASSERT(ixa->ixa_refcnt == 0);
1067 
1068 	ixa_cleanup(ixa);
1069 	kmem_free(ixa, sizeof (*ixa));
1070 }
1071 
1072 /*
1073  * Release any references contained in the ixa.
1074  * Also clear any fields that are not controlled by ixa_flags.
1075  */
1076 void
1077 ixa_cleanup(ip_xmit_attr_t *ixa)
1078 {
1079 	if (ixa->ixa_ire != NULL) {
1080 		ire_refrele_notr(ixa->ixa_ire);
1081 		ixa->ixa_ire = NULL;
1082 	}
1083 	if (ixa->ixa_dce != NULL) {
1084 		dce_refrele_notr(ixa->ixa_dce);
1085 		ixa->ixa_dce = NULL;
1086 	}
1087 	if (ixa->ixa_nce != NULL) {
1088 		nce_refrele(ixa->ixa_nce);
1089 		ixa->ixa_nce = NULL;
1090 	}
1091 	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1092 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1093 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
1094 		ipsec_out_release_refs(ixa);
1095 	}
1096 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1097 		ASSERT(ixa->ixa_tsl != NULL);
1098 		label_rele(ixa->ixa_tsl);
1099 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1100 	}
1101 	ixa->ixa_tsl = NULL;
1102 	if (ixa->ixa_free_flags & IXA_FREE_CRED) {
1103 		ASSERT(ixa->ixa_cred != NULL);
1104 		crfree(ixa->ixa_cred);
1105 		ixa->ixa_free_flags &= ~IXA_FREE_CRED;
1106 	}
1107 	ixa->ixa_cred = NULL;
1108 	ixa->ixa_src_preferences = 0;
1109 	ixa->ixa_ifindex = 0;
1110 	ixa->ixa_multicast_ifindex = 0;
1111 	ixa->ixa_multicast_ifaddr = INADDR_ANY;
1112 }
1113 
1114 /*
1115  * Release any references contained in the ira.
1116  * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
1117  * argument.
1118  */
1119 void
1120 ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
1121 {
1122 	if (ira->ira_ill != NULL) {
1123 		if (ira->ira_rill != ira->ira_ill) {
1124 			/* Caused by async processing */
1125 			ill_refrele(ira->ira_rill);
1126 		}
1127 		if (refrele_ill)
1128 			ill_refrele(ira->ira_ill);
1129 	}
1130 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1131 		ipsec_in_release_refs(ira);
1132 	}
1133 	if (ira->ira_free_flags & IRA_FREE_TSL) {
1134 		ASSERT(ira->ira_tsl != NULL);
1135 		label_rele(ira->ira_tsl);
1136 		ira->ira_free_flags &= ~IRA_FREE_TSL;
1137 	}
1138 	ira->ira_tsl = NULL;
1139 	if (ira->ira_free_flags & IRA_FREE_CRED) {
1140 		ASSERT(ira->ira_cred != NULL);
1141 		crfree(ira->ira_cred);
1142 		ira->ira_free_flags &= ~IRA_FREE_CRED;
1143 	}
1144 	ira->ira_cred = NULL;
1145 }
1146 
1147 /*
1148  * Function to help release any IRE, NCE, or DCEs that
1149  * have been deleted and are marked as condemned.
1150  * The caller is responsible for any serialization which is different
1151  * for TCP, SCTP, and others.
1152  */
1153 static void
1154 ixa_cleanup_stale(ip_xmit_attr_t *ixa)
1155 {
1156 	ire_t		*ire;
1157 	nce_t		*nce;
1158 	dce_t		*dce;
1159 
1160 	ire = ixa->ixa_ire;
1161 	nce = ixa->ixa_nce;
1162 	dce = ixa->ixa_dce;
1163 
1164 	if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
1165 		ire_refrele_notr(ire);
1166 		ire = ire_blackhole(ixa->ixa_ipst,
1167 		    !(ixa->ixa_flags & IXAF_IS_IPV4));
1168 		ASSERT(ire != NULL);
1169 #ifdef DEBUG
1170 		ire_refhold_notr(ire);
1171 		ire_refrele(ire);
1172 #endif
1173 		ixa->ixa_ire = ire;
1174 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1175 	}
1176 	if (nce != NULL && nce->nce_is_condemned) {
1177 		/* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
1178 		nce_refrele(nce);
1179 		ixa->ixa_nce = NULL;
1180 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1181 	}
1182 	if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
1183 		dce_refrele_notr(dce);
1184 		dce = dce_get_default(ixa->ixa_ipst);
1185 		ASSERT(dce != NULL);
1186 #ifdef DEBUG
1187 		dce_refhold_notr(dce);
1188 		dce_refrele(dce);
1189 #endif
1190 		ixa->ixa_dce = dce;
1191 		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1192 	}
1193 }
1194 
1195 static mblk_t *
1196 tcp_ixa_cleanup_getmblk(conn_t *connp)
1197 {
1198 	tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1199 	int need_retry;
1200 	mblk_t *mp;
1201 
1202 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1203 
1204 	/*
1205 	 * It's possible that someone else came in and started cleaning up
1206 	 * another connection between the time we verified this one is not being
1207 	 * cleaned up and the time we actually get the shared mblk.  If that's
1208 	 * the case, we've dropped the lock, and some other thread may have
1209 	 * cleaned up this connection again, and is still waiting for
1210 	 * notification of that cleanup's completion.  Therefore we need to
1211 	 * recheck.
1212 	 */
1213 	do {
1214 		need_retry = 0;
1215 		while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
1216 			cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1217 			    &tcps->tcps_ixa_cleanup_lock);
1218 		}
1219 
1220 		while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
1221 			/*
1222 			 * Multiple concurrent cleanups; need to have the last
1223 			 * one run since it could be an unplumb.
1224 			 */
1225 			need_retry = 1;
1226 			cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
1227 			    &tcps->tcps_ixa_cleanup_lock);
1228 		}
1229 	} while (need_retry);
1230 
1231 	/*
1232 	 * We now have the lock and the mblk; now make sure that no one else can
1233 	 * try to clean up this connection or enqueue it for cleanup, clear the
1234 	 * mblk pointer for this stack, drop the lock, and return the mblk.
1235 	 */
1236 	ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
1237 	ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
1238 	ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
1239 	ASSERT(mp != NULL);
1240 
1241 	connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
1242 	tcps->tcps_ixa_cleanup_mp = NULL;
1243 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1244 
1245 	return (mp);
1246 }
1247 
1248 /*
1249  * Used to run ixa_cleanup_stale inside the tcp squeue.
1250  * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
1251  * and waking up the caller.
1252  */
1253 /* ARGSUSED2 */
1254 static void
1255 tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
1256     ip_recv_attr_t *dummy)
1257 {
1258 	conn_t	*connp = (conn_t *)arg;
1259 	tcp_stack_t	*tcps;
1260 
1261 	tcps = connp->conn_netstack->netstack_tcp;
1262 
1263 	ixa_cleanup_stale(connp->conn_ixa);
1264 
1265 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1266 	ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
1267 	connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
1268 	tcps->tcps_ixa_cleanup_mp = mp;
1269 	cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
1270 	/*
1271 	 * It is possible for any number of threads to be waiting for cleanup of
1272 	 * different connections.  Absent a per-connection (or per-IXA) CV, we
1273 	 * need to wake them all up even though only one can be waiting on this
1274 	 * particular cleanup.
1275 	 */
1276 	cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1277 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1278 }
1279 
1280 static void
1281 tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
1282 {
1283 	tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1284 
1285 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1286 
1287 	ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);
1288 
1289 	while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
1290 		cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1291 		    &tcps->tcps_ixa_cleanup_lock);
1292 	}
1293 
1294 	ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
1295 	connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
1296 	cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1297 
1298 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1299 }
1300 
1301 /*
1302  * ipcl_walk() function to help release any IRE, NCE, or DCEs that
1303  * have been deleted and are marked as condemned.
1304  * Note that we can't cleanup the pointers since there can be threads
1305  * in conn_ip_output() sending while we are called.
1306  */
1307 void
1308 conn_ixa_cleanup(conn_t *connp, void *arg)
1309 {
1310 	boolean_t tryhard = (boolean_t)arg;
1311 
1312 	if (IPCL_IS_TCP(connp)) {
1313 		mblk_t		*mp;
1314 
1315 		mp = tcp_ixa_cleanup_getmblk(connp);
1316 
1317 		if (connp->conn_sqp->sq_run == curthread) {
1318 			/* Already on squeue */
1319 			tcp_ixa_cleanup(connp, mp, NULL, NULL);
1320 		} else {
1321 			CONN_INC_REF(connp);
1322 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
1323 			    connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
1324 		}
1325 		tcp_ixa_cleanup_wait_and_finish(connp);
1326 	} else if (IPCL_IS_SCTP(connp)) {
1327 		sctp_t	*sctp;
1328 		sctp_faddr_t *fp;
1329 
1330 		sctp = CONN2SCTP(connp);
1331 		RUN_SCTP(sctp);
1332 		ixa_cleanup_stale(connp->conn_ixa);
1333 		for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next)
1334 			ixa_cleanup_stale(fp->sf_ixa);
1335 		WAKE_SCTP(sctp);
1336 	} else {
1337 		ip_xmit_attr_t	*ixa;
1338 
1339 		/*
1340 		 * If there is a different thread using conn_ixa then we get a
1341 		 * new copy and cut the old one loose from conn_ixa. Otherwise
1342 		 * we use conn_ixa and prevent any other thread from
1343 		 * using/changing it. Anybody using conn_ixa (e.g., a thread in
1344 		 * conn_ip_output) will do an ixa_refrele which will remove any
1345 		 * references on the ire etc.
1346 		 *
1347 		 * Once we are done other threads can use conn_ixa since the
1348 		 * refcnt will be back at one.
1349 		 *
1350 		 * We are called either because an ill is going away, or
1351 		 * due to memory reclaim. In the former case we wait for
1352 		 * memory since we must remove the refcnts on the ill.
1353 		 */
1354 		if (tryhard) {
1355 			ixa = conn_get_ixa_tryhard(connp, B_TRUE);
1356 			ASSERT(ixa != NULL);
1357 		} else {
1358 			ixa = conn_get_ixa(connp, B_TRUE);
1359 			if (ixa == NULL) {
1360 				/*
1361 				 * Somebody else was using it and kmem_alloc
1362 				 * failed! Next memory reclaim will try to
1363 				 * clean up.
1364 				 */
1365 				DTRACE_PROBE1(conn__ixa__cleanup__bail,
1366 				    conn_t *, connp);
1367 				return;
1368 			}
1369 		}
1370 		ixa_cleanup_stale(ixa);
1371 		IXA_REFRELE(ixa);
1372 	}
1373 }
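
/*
 * Illustrative sketch (not part of the build): conn_ixa_cleanup() is meant
 * to be driven over every conn_t via the classifier walker, with the
 * boolean "tryhard" argument packed into the walker's void * argument,
 * along the lines of:
 *
 *	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
 */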
1374 
1375 /*
1376  * ixa needs to be an exclusive copy so that no one changes the cookie
1377  * or the ixa_nce.
1378  */
1379 boolean_t
1380 ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
1381 {
1382 	uintptr_t cookie = ixa->ixa_cookie;
1383 	ill_dld_direct_t *idd;
1384 	idl_tx_list_t *idl_txl;
1385 	ill_t *ill = ixa->ixa_nce->nce_ill;
1386 	boolean_t inserted = B_FALSE;
1387 
1388 	idd = &(ill)->ill_dld_capab->idc_direct;
1389 	idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
1390 	mutex_enter(&idl_txl->txl_lock);
1391 
1392 	/*
1393 	 * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
1394 	 * control is asserted on an ill that does not support direct calls.
1395 	 * Jump to insert.
1396 	 */
1397 	if (cookie == 0)
1398 		goto tryinsert;
1399 
1400 	ASSERT(ILL_DIRECT_CAPABLE(ill));
1401 
1402 	if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
1403 		DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
1404 	} else if (idl_txl->txl_cookie != (uintptr_t)NULL &&
1405 	    idl_txl->txl_cookie != ixa->ixa_cookie) {
1406 		DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
1407 		    uintptr_t, idl_txl->txl_cookie);
1408 		/* TODO: bump kstat for cookie collision */
1409 	} else {
1410 		/*
1411 		 * Check/set conn_blocked under conn_lock.  Note that txl_lock
1412 		 * will not suffice since two separate UDP threads may be
1413 		 * racing to send to different destinations that are
1414 		 * associated with different cookies and thus may not be
1415 		 * holding the same txl_lock.  Further, since a given conn_t
1416 		 * can only be on a single drain list, the conn_t will be
1417 		 * enqueued on whichever thread wins this race.
1418 		 */
1419 tryinsert:	mutex_enter(&connp->conn_lock);
1420 		if (connp->conn_blocked) {
1421 			DTRACE_PROBE1(ill__tx__conn__already__blocked,
1422 			    conn_t *, connp);
1423 			mutex_exit(&connp->conn_lock);
1424 		} else {
1425 			connp->conn_blocked = B_TRUE;
1426 			mutex_exit(&connp->conn_lock);
1427 			idl_txl->txl_cookie = cookie;
1428 			conn_drain_insert(connp, idl_txl);
1429 			if (!IPCL_IS_NONSTR(connp))
1430 				noenable(connp->conn_wq);
1431 			inserted = B_TRUE;
1432 		}
1433 	}
1434 	mutex_exit(&idl_txl->txl_lock);
1435 	return (inserted);
1436 }
1437