xref: /titanic_50/usr/src/uts/common/inet/ip/ip_attr.c (revision 84ba300aaa958c8e8427c2ec66a932d86bee71c4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/zone.h>
31 #include <sys/ddi.h>
32 #include <sys/sunddi.h>
33 #include <sys/cmn_err.h>
34 #include <sys/debug.h>
35 #include <sys/atomic.h>
36 
37 #include <sys/systm.h>
38 #include <sys/param.h>
39 #include <sys/kmem.h>
40 #include <sys/sdt.h>
41 #include <sys/socket.h>
42 #include <sys/mac.h>
43 #include <net/if.h>
44 #include <net/if_arp.h>
45 #include <net/route.h>
46 #include <sys/sockio.h>
47 #include <netinet/in.h>
48 #include <net/if_dl.h>
49 
50 #include <inet/common.h>
51 #include <inet/mi.h>
52 #include <inet/mib2.h>
53 #include <inet/nd.h>
54 #include <inet/arp.h>
55 #include <inet/snmpcom.h>
56 #include <inet/kstatcom.h>
57 
58 #include <netinet/igmp_var.h>
59 #include <netinet/ip6.h>
60 #include <netinet/icmp6.h>
61 #include <netinet/sctp.h>
62 
63 #include <inet/ip.h>
64 #include <inet/ip_impl.h>
65 #include <inet/ip6.h>
66 #include <inet/ip6_asp.h>
67 #include <inet/tcp.h>
68 #include <inet/ip_multi.h>
69 #include <inet/ip_if.h>
70 #include <inet/ip_ire.h>
71 #include <inet/ip_ftable.h>
72 #include <inet/ip_rts.h>
73 #include <inet/optcom.h>
74 #include <inet/ip_ndp.h>
75 #include <inet/ip_listutils.h>
76 #include <netinet/igmp.h>
77 #include <netinet/ip_mroute.h>
78 #include <inet/ipp_common.h>
79 
80 #include <net/pfkeyv2.h>
81 #include <inet/sadb.h>
82 #include <inet/ipsec_impl.h>
83 #include <inet/ipdrop.h>
84 #include <inet/ip_netinfo.h>
85 #include <sys/squeue_impl.h>
86 #include <sys/squeue.h>
87 
88 #include <inet/ipclassifier.h>
89 #include <inet/sctp_ip.h>
90 #include <inet/sctp/sctp_impl.h>
91 #include <inet/udp_impl.h>
92 #include <sys/sunddi.h>
93 
94 #include <sys/tsol/label.h>
95 #include <sys/tsol/tnet.h>
96 
/*
 * Release a reference on ip_xmit_attr.
 * The reference is acquired by conn_get_ixa()
 *
 * When the count reaches zero the ixa is passed to ixa_inactive().
 *
 * The argument is evaluated exactly once, so it is safe to pass an
 * expression whose value may change between reads (for instance a field
 * that another thread can update). The do/while wrapper makes the macro
 * expand to a single statement, so it can be used in an unbraced if/else.
 */
#define	IXA_REFRELE(ixa)						\
do {									\
	ip_xmit_attr_t *ixa_refrele_arg_ = (ixa);			\
									\
	if (atomic_dec_32_nv(&ixa_refrele_arg_->ixa_refcnt) == 0)	\
		ixa_inactive(ixa_refrele_arg_);				\
} while (0)
106 
/*
 * Take an additional reference on an ip_xmit_attr_t. The ASSERT checks
 * that the count is already non-zero, i.e. that somebody holds the ixa.
 *
 * The argument is evaluated exactly once, and the do/while wrapper makes
 * the macro expand to a single statement usable in an unbraced if/else.
 */
#define	IXA_REFHOLD(ixa)						\
do {									\
	ip_xmit_attr_t *ixa_refhold_arg_ = (ixa);			\
									\
	ASSERT(ixa_refhold_arg_->ixa_refcnt != 0);			\
	atomic_inc_32(&ixa_refhold_arg_->ixa_refcnt);			\
} while (0)
112 
113 /*
114  * When we need to handle a transmit side asynchronous operation, then we need
115  * to save sufficient information so that we can call the fragment and postfrag
116  * functions. That information is captured in an mblk containing this structure.
117  *
118  * Since this is currently only used for IPsec, we include information for
119  * the kernel crypto framework.
120  */
121 typedef struct ixamblk_s {
122 	boolean_t	ixm_inbound;	/* B_FALSE */
123 	iaflags_t	ixm_flags;	/* ixa_flags */
124 	netstackid_t	ixm_stackid;	/* Verify it didn't go away */
125 	uint_t		ixm_ifindex;	/* Used to find the nce */
126 	in6_addr_t	ixm_nceaddr_v6;	/* Used to find nce */
127 #define	ixm_nceaddr_v4	V4_PART_OF_V6(ixm_nceaddr_v6)
128 	uint32_t	ixm_fragsize;
129 	uint_t		ixm_pktlen;
130 	uint16_t	ixm_ip_hdr_length; /* Points to ULP header */
131 	uint8_t		ixm_protocol;	/* Protocol number for ULP cksum */
132 	pfirepostfrag_t	ixm_postfragfn;
133 
134 	zoneid_t	ixm_zoneid;		/* Needed for ipobs */
135 	zoneid_t	ixm_no_loop_zoneid;	/* IXAF_NO_LOOP_ZONEID_SET */
136 
137 	uint_t		ixm_scopeid;		/* For IPv6 link-locals */
138 
139 	uint32_t	ixm_ident;		/* For IPv6 fragment header */
140 	uint32_t	ixm_xmit_hint;
141 
142 	uint64_t	ixm_conn_id;		/* Used by DTrace */
143 	cred_t		*ixm_cred;	/* For getpeerucred - refhold if set */
144 	pid_t		ixm_cpid;	/* For getpeerucred */
145 
146 	ts_label_t	*ixm_tsl;	/* Refhold if set. */
147 
148 	/*
149 	 * When the pointers below are set they have a refhold on the struct.
150 	 */
151 	ipsec_latch_t		*ixm_ipsec_latch;
152 	struct ipsa_s		*ixm_ipsec_ah_sa;	/* SA for AH */
153 	struct ipsa_s		*ixm_ipsec_esp_sa;	/* SA for ESP */
154 	struct ipsec_policy_s 	*ixm_ipsec_policy;	/* why are we here? */
155 	struct ipsec_action_s	*ixm_ipsec_action; /* For reflected packets */
156 
157 	ipsa_ref_t		ixm_ipsec_ref[2]; /* Soft reference to SA */
158 
159 	/* Need these while waiting for SA */
160 	uint16_t ixm_ipsec_src_port;	/* Source port number of d-gram. */
161 	uint16_t ixm_ipsec_dst_port;	/* Destination port number of d-gram. */
162 	uint8_t  ixm_ipsec_icmp_type;	/* ICMP type of d-gram */
163 	uint8_t  ixm_ipsec_icmp_code;	/* ICMP code of d-gram */
164 
165 	sa_family_t ixm_ipsec_inaf;	/* Inner address family */
166 	uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];	/* Inner src address */
167 	uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];	/* Inner dest address */
168 	uint8_t  ixm_ipsec_insrcpfx;	/* Inner source prefix */
169 	uint8_t  ixm_ipsec_indstpfx;	/* Inner destination prefix */
170 
171 	uint8_t ixm_ipsec_proto;	/* IP protocol number for d-gram. */
172 } ixamblk_t;
173 
174 
175 /*
176  * When we need to handle a receive side asynchronous operation, then we need
177  * to save sufficient information so that we can call ip_fanout.
178  * That information is captured in an mblk containing this structure.
179  *
180  * Since this is currently only used for IPsec, we include information for
181  * the kernel crypto framework.
182  */
183 typedef struct iramblk_s {
184 	boolean_t	irm_inbound;	/* B_TRUE */
185 	iaflags_t	irm_flags;	/* ira_flags */
186 	netstackid_t	irm_stackid;	/* Verify it didn't go away */
187 	uint_t		irm_ifindex;	/* To find ira_ill */
188 
189 	uint_t		irm_rifindex;	/* ira_rifindex */
190 	uint_t		irm_ruifindex;	/* ira_ruifindex */
191 	uint_t		irm_pktlen;
192 	uint16_t	irm_ip_hdr_length; /* Points to ULP header */
193 	uint8_t		irm_protocol;	/* Protocol number for ULP cksum */
194 	zoneid_t	irm_zoneid;	/* ALL_ZONES unless local delivery */
195 
196 	squeue_t	*irm_sqp;
197 	ill_rx_ring_t	*irm_ring;
198 
199 	ipaddr_t	irm_mroute_tunnel;	/* IRAF_MROUTE_TUNNEL_SET */
200 	zoneid_t	irm_no_loop_zoneid;	/* IRAF_NO_LOOP_ZONEID_SET */
201 	uint32_t	irm_esp_udp_ports;	/* IRAF_ESP_UDP_PORTS */
202 
203 	char		irm_l2src[IRA_L2SRC_SIZE];	/* If IRAF_L2SRC_SET */
204 
205 	cred_t		*irm_cred;	/* For getpeerucred - refhold if set */
206 	pid_t		irm_cpid;	/* For getpeerucred */
207 
208 	ts_label_t	*irm_tsl;	/* Refhold if set. */
209 
210 	/*
211 	 * When set these correspond to a refhold on the object.
212 	 */
213 	struct ipsa_s		*irm_ipsec_ah_sa;	/* SA for AH */
214 	struct ipsa_s		*irm_ipsec_esp_sa;	/* SA for ESP */
215 	struct ipsec_action_s	*irm_ipsec_action; /* For reflected packets */
216 } iramblk_t;
217 
218 
219 /*
220  * Take the information in ip_xmit_attr_t and stick it in an mblk
221  * that can later be passed to ip_xmit_attr_from_mblk to recreate the
222  * ip_xmit_attr_t.
223  *
224  * Returns NULL on memory allocation failure.
225  */
226 mblk_t *
227 ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
228 {
229 	mblk_t		*ixamp;
230 	ixamblk_t	*ixm;
231 	nce_t		*nce = ixa->ixa_nce;
232 
233 	ASSERT(nce != NULL);
234 	ixamp = allocb(sizeof (*ixm), BPRI_MED);
235 	if (ixamp == NULL)
236 		return (NULL);
237 
238 	ixamp->b_datap->db_type = M_BREAK;
239 	ixamp->b_wptr += sizeof (*ixm);
240 	ixm = (ixamblk_t *)ixamp->b_rptr;
241 
242 	bzero(ixm, sizeof (*ixm));
243 	ixm->ixm_inbound = B_FALSE;
244 	ixm->ixm_flags = ixa->ixa_flags;
245 	ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
246 	ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
247 	ixm->ixm_nceaddr_v6 = nce->nce_addr;
248 	ixm->ixm_fragsize = ixa->ixa_fragsize;
249 	ixm->ixm_pktlen = ixa->ixa_pktlen;
250 	ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
251 	ixm->ixm_protocol = ixa->ixa_protocol;
252 	ixm->ixm_postfragfn = ixa->ixa_postfragfn;
253 	ixm->ixm_zoneid = ixa->ixa_zoneid;
254 	ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
255 	ixm->ixm_scopeid = ixa->ixa_scopeid;
256 	ixm->ixm_ident = ixa->ixa_ident;
257 	ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;
258 
259 	if (ixa->ixa_tsl != NULL) {
260 		ixm->ixm_tsl = ixa->ixa_tsl;
261 		label_hold(ixm->ixm_tsl);
262 	}
263 	if (ixa->ixa_cred != NULL) {
264 		ixm->ixm_cred = ixa->ixa_cred;
265 		crhold(ixa->ixa_cred);
266 	}
267 	ixm->ixm_cpid = ixa->ixa_cpid;
268 	ixm->ixm_conn_id = ixa->ixa_conn_id;
269 
270 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
271 		if (ixa->ixa_ipsec_ah_sa != NULL) {
272 			ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
273 			IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
274 		}
275 		if (ixa->ixa_ipsec_esp_sa != NULL) {
276 			ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
277 			IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
278 		}
279 		if (ixa->ixa_ipsec_policy != NULL) {
280 			ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
281 			IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
282 		}
283 		if (ixa->ixa_ipsec_action != NULL) {
284 			ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
285 			IPACT_REFHOLD(ixa->ixa_ipsec_action);
286 		}
287 		if (ixa->ixa_ipsec_latch != NULL) {
288 			ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
289 			IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
290 		}
291 		ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
292 		ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
293 		ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
294 		ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
295 		ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
296 		ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
297 		ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
298 		ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
299 		ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
300 		ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
301 		ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
302 		ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
303 		ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
304 		ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
305 		ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
306 		ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
307 		ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
308 		ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
309 	}
310 	return (ixamp);
311 }
312 
313 /*
314  * Extract the ip_xmit_attr_t from the mblk, checking that the
315  * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
316  * not the case.
317  *
318  * Otherwise ixa is updated.
319  * Caller needs to release references on the ixa by calling ixa_refrele()
320  * which will imediately call ixa_inactive to release the references.
321  */
322 boolean_t
323 ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
324 {
325 	ixamblk_t	*ixm;
326 	netstack_t	*ns;
327 	ip_stack_t	*ipst;
328 	ill_t		*ill;
329 	nce_t		*nce;
330 
331 	/* We assume the caller hasn't initialized ixa */
332 	bzero(ixa, sizeof (*ixa));
333 
334 	ASSERT(DB_TYPE(ixamp) == M_BREAK);
335 	ASSERT(ixamp->b_cont == NULL);
336 
337 	ixm = (ixamblk_t *)ixamp->b_rptr;
338 	ASSERT(!ixm->ixm_inbound);
339 
340 	/* Verify the netstack is still around */
341 	ns = netstack_find_by_stackid(ixm->ixm_stackid);
342 	if (ns == NULL) {
343 		/* Disappeared on us */
344 		(void) ip_xmit_attr_free_mblk(ixamp);
345 		return (B_FALSE);
346 	}
347 	ipst = ns->netstack_ip;
348 
349 	/* Verify the ill is still around */
350 	ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
351 	    !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);
352 
353 	/* We have the ill, hence the netstack can't go away */
354 	netstack_rele(ns);
355 	if (ill == NULL) {
356 		/* Disappeared on us */
357 		(void) ip_xmit_attr_free_mblk(ixamp);
358 		return (B_FALSE);
359 	}
360 	/*
361 	 * Find the nce. We don't load-spread (only lookup nce's on the ill)
362 	 * because we want to find the same nce as the one we had when
363 	 * ip_xmit_attr_to_mblk was called.
364 	 */
365 	if (ixm->ixm_flags & IXAF_IS_IPV4) {
366 		nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
367 	} else {
368 		nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
369 	}
370 
371 	/* We have the nce, hence the ill can't go away */
372 	ill_refrele(ill);
373 	if (nce == NULL) {
374 		/*
375 		 * Since this is unusual and we don't know what type of
376 		 * nce it was, we drop the packet.
377 		 */
378 		(void) ip_xmit_attr_free_mblk(ixamp);
379 		return (B_FALSE);
380 	}
381 
382 	ixa->ixa_flags = ixm->ixm_flags;
383 	ixa->ixa_refcnt = 1;
384 	ixa->ixa_ipst = ipst;
385 	ixa->ixa_fragsize = ixm->ixm_fragsize;
386 	ixa->ixa_pktlen =  ixm->ixm_pktlen;
387 	ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
388 	ixa->ixa_protocol = ixm->ixm_protocol;
389 	ixa->ixa_nce = nce;
390 	ixa->ixa_postfragfn = ixm->ixm_postfragfn;
391 	ixa->ixa_zoneid = ixm->ixm_zoneid;
392 	ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
393 	ixa->ixa_scopeid = ixm->ixm_scopeid;
394 	ixa->ixa_ident = ixm->ixm_ident;
395 	ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;
396 
397 	if (ixm->ixm_tsl != NULL) {
398 		ixa->ixa_tsl = ixm->ixm_tsl;
399 		ixa->ixa_free_flags |= IXA_FREE_TSL;
400 		ixm->ixm_tsl = NULL;
401 	}
402 	if (ixm->ixm_cred != NULL) {
403 		ixa->ixa_cred = ixm->ixm_cred;
404 		ixa->ixa_free_flags |= IXA_FREE_CRED;
405 		ixm->ixm_cred = NULL;
406 	}
407 	ixa->ixa_cpid = ixm->ixm_cpid;
408 	ixa->ixa_conn_id = ixm->ixm_conn_id;
409 
410 	ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
411 	ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
412 	ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
413 	ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
414 	ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;
415 
416 	ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
417 	ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
418 	ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
419 	ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
420 	ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
421 	ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
422 	ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
423 	ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
424 	ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
425 	ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
426 	ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
427 	ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
428 	ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
429 	ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
430 	ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
431 	ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
432 	ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
433 	ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;
434 
435 	freeb(ixamp);
436 	return (B_TRUE);
437 }
438 
439 /*
440  * Free the ixm mblk and any references it holds
441  * Returns b_cont.
442  */
443 mblk_t *
444 ip_xmit_attr_free_mblk(mblk_t *ixamp)
445 {
446 	ixamblk_t	*ixm;
447 	mblk_t		*mp;
448 
449 	/* Consume mp */
450 	ASSERT(DB_TYPE(ixamp) == M_BREAK);
451 	mp = ixamp->b_cont;
452 
453 	ixm = (ixamblk_t *)ixamp->b_rptr;
454 	ASSERT(!ixm->ixm_inbound);
455 
456 	if (ixm->ixm_ipsec_ah_sa != NULL) {
457 		IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
458 		ixm->ixm_ipsec_ah_sa = NULL;
459 	}
460 	if (ixm->ixm_ipsec_esp_sa != NULL) {
461 		IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
462 		ixm->ixm_ipsec_esp_sa = NULL;
463 	}
464 	if (ixm->ixm_ipsec_policy != NULL) {
465 		IPPOL_REFRELE(ixm->ixm_ipsec_policy);
466 		ixm->ixm_ipsec_policy = NULL;
467 	}
468 	if (ixm->ixm_ipsec_action != NULL) {
469 		IPACT_REFRELE(ixm->ixm_ipsec_action);
470 		ixm->ixm_ipsec_action = NULL;
471 	}
472 	if (ixm->ixm_ipsec_latch) {
473 		IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
474 		ixm->ixm_ipsec_latch = NULL;
475 	}
476 
477 	if (ixm->ixm_tsl != NULL) {
478 		label_rele(ixm->ixm_tsl);
479 		ixm->ixm_tsl = NULL;
480 	}
481 	if (ixm->ixm_cred != NULL) {
482 		crfree(ixm->ixm_cred);
483 		ixm->ixm_cred = NULL;
484 	}
485 	freeb(ixamp);
486 	return (mp);
487 }
488 
489 /*
490  * Take the information in ip_recv_attr_t and stick it in an mblk
491  * that can later be passed to ip_recv_attr_from_mblk to recreate the
492  * ip_recv_attr_t.
493  *
494  * Returns NULL on memory allocation failure.
495  */
496 mblk_t *
497 ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
498 {
499 	mblk_t		*iramp;
500 	iramblk_t	*irm;
501 	ill_t		*ill = ira->ira_ill;
502 
503 	ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);
504 
505 	iramp = allocb(sizeof (*irm), BPRI_MED);
506 	if (iramp == NULL)
507 		return (NULL);
508 
509 	iramp->b_datap->db_type = M_BREAK;
510 	iramp->b_wptr += sizeof (*irm);
511 	irm = (iramblk_t *)iramp->b_rptr;
512 
513 	bzero(irm, sizeof (*irm));
514 	irm->irm_inbound = B_TRUE;
515 	irm->irm_flags = ira->ira_flags;
516 	if (ill != NULL) {
517 		/* Internal to IP - preserve ip_stack_t, ill and rill */
518 		irm->irm_stackid =
519 		    ill->ill_ipst->ips_netstack->netstack_stackid;
520 		irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
521 		ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
522 		    ira->ira_rifindex);
523 	} else {
524 		/* Let ip_recv_attr_from_stackid know there isn't one */
525 		irm->irm_stackid = -1;
526 	}
527 	irm->irm_rifindex = ira->ira_rifindex;
528 	irm->irm_ruifindex = ira->ira_ruifindex;
529 	irm->irm_pktlen = ira->ira_pktlen;
530 	irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
531 	irm->irm_protocol = ira->ira_protocol;
532 
533 	irm->irm_sqp = ira->ira_sqp;
534 	irm->irm_ring = ira->ira_ring;
535 
536 	irm->irm_zoneid = ira->ira_zoneid;
537 	irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
538 	irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
539 	irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;
540 
541 	if (ira->ira_tsl != NULL) {
542 		irm->irm_tsl = ira->ira_tsl;
543 		label_hold(irm->irm_tsl);
544 	}
545 	if (ira->ira_cred != NULL) {
546 		irm->irm_cred = ira->ira_cred;
547 		crhold(ira->ira_cred);
548 	}
549 	irm->irm_cpid = ira->ira_cpid;
550 
551 	if (ira->ira_flags & IRAF_L2SRC_SET)
552 		bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);
553 
554 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
555 		if (ira->ira_ipsec_ah_sa != NULL) {
556 			irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
557 			IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
558 		}
559 		if (ira->ira_ipsec_esp_sa != NULL) {
560 			irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
561 			IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
562 		}
563 		if (ira->ira_ipsec_action != NULL) {
564 			irm->irm_ipsec_action = ira->ira_ipsec_action;
565 			IPACT_REFHOLD(ira->ira_ipsec_action);
566 		}
567 	}
568 	return (iramp);
569 }
570 
571 /*
572  * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
573  * then irm_stackid is not -1, in which case we check that the
574  * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
575  * not the case.
576  * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter)
577  * and we just proceed with ira_ill and ira_rill as NULL.
578  *
579  * The caller needs to release any references on the pointers inside the ire
580  * by calling ira_cleanup.
581  */
582 boolean_t
583 ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
584 {
585 	iramblk_t	*irm;
586 	netstack_t	*ns;
587 	ip_stack_t	*ipst = NULL;
588 	ill_t		*ill = NULL, *rill = NULL;
589 
590 	/* We assume the caller hasn't initialized ira */
591 	bzero(ira, sizeof (*ira));
592 
593 	ASSERT(DB_TYPE(iramp) == M_BREAK);
594 	ASSERT(iramp->b_cont == NULL);
595 
596 	irm = (iramblk_t *)iramp->b_rptr;
597 	ASSERT(irm->irm_inbound);
598 
599 	if (irm->irm_stackid != -1) {
600 		/* Verify the netstack is still around */
601 		ns = netstack_find_by_stackid(irm->irm_stackid);
602 		if (ns == NULL) {
603 			/* Disappeared on us */
604 			(void) ip_recv_attr_free_mblk(iramp);
605 			return (B_FALSE);
606 		}
607 		ipst = ns->netstack_ip;
608 
609 		/* Verify the ill is still around */
610 		ill = ill_lookup_on_ifindex(irm->irm_ifindex,
611 		    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
612 
613 		if (irm->irm_ifindex == irm->irm_rifindex) {
614 			rill = ill;
615 		} else {
616 			rill = ill_lookup_on_ifindex(irm->irm_rifindex,
617 			    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
618 		}
619 
620 		/* We have the ill, hence the netstack can't go away */
621 		netstack_rele(ns);
622 		if (ill == NULL || rill == NULL) {
623 			/* Disappeared on us */
624 			if (ill != NULL)
625 				ill_refrele(ill);
626 			if (rill != NULL && rill != ill)
627 				ill_refrele(rill);
628 			(void) ip_recv_attr_free_mblk(iramp);
629 			return (B_FALSE);
630 		}
631 	}
632 
633 	ira->ira_flags = irm->irm_flags;
634 	/* Caller must ill_refele(ira_ill) by using ira_cleanup() */
635 	ira->ira_ill = ill;
636 	ira->ira_rill = rill;
637 
638 	ira->ira_rifindex = irm->irm_rifindex;
639 	ira->ira_ruifindex = irm->irm_ruifindex;
640 	ira->ira_pktlen = irm->irm_pktlen;
641 	ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
642 	ira->ira_protocol = irm->irm_protocol;
643 
644 	ira->ira_sqp = irm->irm_sqp;
645 	/* The rest of IP assumes that the rings never go away. */
646 	ira->ira_ring = irm->irm_ring;
647 
648 	ira->ira_zoneid = irm->irm_zoneid;
649 	ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
650 	ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
651 	ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;
652 
653 	if (irm->irm_tsl != NULL) {
654 		ira->ira_tsl = irm->irm_tsl;
655 		ira->ira_free_flags |= IRA_FREE_TSL;
656 		irm->irm_tsl = NULL;
657 	}
658 	if (irm->irm_cred != NULL) {
659 		ira->ira_cred = irm->irm_cred;
660 		ira->ira_free_flags |= IRA_FREE_CRED;
661 		irm->irm_cred = NULL;
662 	}
663 	ira->ira_cpid = irm->irm_cpid;
664 
665 	if (ira->ira_flags & IRAF_L2SRC_SET)
666 		bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);
667 
668 	ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
669 	ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
670 	ira->ira_ipsec_action = irm->irm_ipsec_action;
671 
672 	freeb(iramp);
673 	return (B_TRUE);
674 }
675 
676 /*
677  * Free the irm mblk and any references it holds
678  * Returns b_cont.
679  */
680 mblk_t *
681 ip_recv_attr_free_mblk(mblk_t *iramp)
682 {
683 	iramblk_t	*irm;
684 	mblk_t		*mp;
685 
686 	/* Consume mp */
687 	ASSERT(DB_TYPE(iramp) == M_BREAK);
688 	mp = iramp->b_cont;
689 
690 	irm = (iramblk_t *)iramp->b_rptr;
691 	ASSERT(irm->irm_inbound);
692 
693 	if (irm->irm_ipsec_ah_sa != NULL) {
694 		IPSA_REFRELE(irm->irm_ipsec_ah_sa);
695 		irm->irm_ipsec_ah_sa = NULL;
696 	}
697 	if (irm->irm_ipsec_esp_sa != NULL) {
698 		IPSA_REFRELE(irm->irm_ipsec_esp_sa);
699 		irm->irm_ipsec_esp_sa = NULL;
700 	}
701 	if (irm->irm_ipsec_action != NULL) {
702 		IPACT_REFRELE(irm->irm_ipsec_action);
703 		irm->irm_ipsec_action = NULL;
704 	}
705 	if (irm->irm_tsl != NULL) {
706 		label_rele(irm->irm_tsl);
707 		irm->irm_tsl = NULL;
708 	}
709 	if (irm->irm_cred != NULL) {
710 		crfree(irm->irm_cred);
711 		irm->irm_cred = NULL;
712 	}
713 
714 	freeb(iramp);
715 	return (mp);
716 }
717 
718 /*
719  * Returns true if the mblk contains an ip_recv_attr_t
720  * For now we just check db_type.
721  */
722 boolean_t
723 ip_recv_attr_is_mblk(mblk_t *mp)
724 {
725 	/*
726 	 * Need to handle the various forms of tcp_timermp which are tagged
727 	 * with b_wptr and might have a NULL b_datap.
728 	 */
729 	if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
730 		return (B_FALSE);
731 
732 #ifdef	DEBUG
733 	iramblk_t	*irm;
734 
735 	if (DB_TYPE(mp) != M_BREAK)
736 		return (B_FALSE);
737 
738 	irm = (iramblk_t *)mp->b_rptr;
739 	ASSERT(irm->irm_inbound);
740 	return (B_TRUE);
741 #else
742 	return (DB_TYPE(mp) == M_BREAK);
743 #endif
744 }
745 
746 static ip_xmit_attr_t *
747 conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
748 {
749 	ip_xmit_attr_t	*ixa;
750 	ip_xmit_attr_t	*oldixa;
751 
752 	mutex_enter(&connp->conn_lock);
753 	ixa = connp->conn_ixa;
754 
755 	/* At least one references for the conn_t */
756 	ASSERT(ixa->ixa_refcnt >= 1);
757 	if (atomic_inc_32_nv(&ixa->ixa_refcnt) == 2) {
758 		/* No other thread using conn_ixa */
759 		mutex_exit(&connp->conn_lock);
760 		return (ixa);
761 	}
762 	ixa = kmem_alloc(sizeof (*ixa), kmflag);
763 	if (ixa == NULL) {
764 		mutex_exit(&connp->conn_lock);
765 		ixa_refrele(connp->conn_ixa);
766 		return (NULL);
767 	}
768 	ixa_safe_copy(connp->conn_ixa, ixa);
769 
770 	/* Make sure we drop conn_lock before any refrele */
771 	if (replace) {
772 		ixa->ixa_refcnt++;	/* No atomic needed - not visible */
773 		oldixa = connp->conn_ixa;
774 		connp->conn_ixa = ixa;
775 		mutex_exit(&connp->conn_lock);
776 		IXA_REFRELE(oldixa);	/* Undo refcnt from conn_t */
777 	} else {
778 		oldixa = connp->conn_ixa;
779 		mutex_exit(&connp->conn_lock);
780 	}
781 	IXA_REFRELE(oldixa);	/* Undo above atomic_add_32_nv */
782 
783 	return (ixa);
784 }
785 
786 /*
787  * Return an ip_xmit_attr_t to use with a conn_t that ensures that only
788  * the caller can access the ip_xmit_attr_t.
789  *
790  * If nobody else is using conn_ixa we return it.
791  * Otherwise we make a "safe" copy of conn_ixa
792  * and return it. The "safe" copy has the pointers set to NULL
793  * (since the pointers might be changed by another thread using
794  * conn_ixa). The caller needs to check for NULL pointers to see
795  * if ip_set_destination needs to be called to re-establish the pointers.
796  *
797  * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
798  * That is used when we connect() the ULP.
799  */
800 ip_xmit_attr_t *
801 conn_get_ixa(conn_t *connp, boolean_t replace)
802 {
803 	return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
804 }
805 
806 /*
807  * Used only when the option is to have the kernel hang due to not
808  * cleaning up ixa references on ills etc.
809  */
810 ip_xmit_attr_t *
811 conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
812 {
813 	return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
814 }
815 
816 /*
817  * Replace conn_ixa with the ixa argument.
818  *
819  * The caller must hold conn_lock.
820  *
821  * We return the old ixa; the caller must ixa_refrele that after conn_lock
822  * has been dropped.
823  */
824 ip_xmit_attr_t *
825 conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
826 {
827 	ip_xmit_attr_t	*oldixa;
828 
829 	ASSERT(MUTEX_HELD(&connp->conn_lock));
830 
831 	oldixa = connp->conn_ixa;
832 	IXA_REFHOLD(ixa);
833 	ixa->ixa_conn_id = oldixa->ixa_conn_id;
834 	connp->conn_ixa = ixa;
835 	return (oldixa);
836 }
837 
838 /*
839  * Return a ip_xmit_attr_t to use with a conn_t that is based on but
840  * separate from conn_ixa.
841  *
842  * This "safe" copy has the pointers set to NULL
843  * (since the pointers might be changed by another thread using
844  * conn_ixa). The caller needs to check for NULL pointers to see
845  * if ip_set_destination needs to be called to re-establish the pointers.
846  */
847 ip_xmit_attr_t *
848 conn_get_ixa_exclusive(conn_t *connp)
849 {
850 	ip_xmit_attr_t *ixa;
851 
852 	mutex_enter(&connp->conn_lock);
853 	ixa = connp->conn_ixa;
854 
855 	/* At least one references for the conn_t */
856 	ASSERT(ixa->ixa_refcnt >= 1);
857 
858 	/* Make sure conn_ixa doesn't disappear while we copy it */
859 	atomic_inc_32(&ixa->ixa_refcnt);
860 
861 	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
862 	if (ixa == NULL) {
863 		mutex_exit(&connp->conn_lock);
864 		ixa_refrele(connp->conn_ixa);
865 		return (NULL);
866 	}
867 	ixa_safe_copy(connp->conn_ixa, ixa);
868 	mutex_exit(&connp->conn_lock);
869 	IXA_REFRELE(connp->conn_ixa);
870 	return (ixa);
871 }
872 
873 void
874 ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
875 {
876 	bcopy(src, ixa, sizeof (*ixa));
877 	ixa->ixa_refcnt = 1;
878 	/*
879 	 * Clear any pointers that have references and might be changed
880 	 * by ip_set_destination or the ULP
881 	 */
882 	ixa->ixa_ire = NULL;
883 	ixa->ixa_nce = NULL;
884 	ixa->ixa_dce = NULL;
885 	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
886 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
887 #ifdef DEBUG
888 	ixa->ixa_curthread = NULL;
889 #endif
890 	/* Clear all the IPsec pointers and the flag as well. */
891 	ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
892 
893 	ixa->ixa_ipsec_latch = NULL;
894 	ixa->ixa_ipsec_ah_sa = NULL;
895 	ixa->ixa_ipsec_esp_sa = NULL;
896 	ixa->ixa_ipsec_policy = NULL;
897 	ixa->ixa_ipsec_action = NULL;
898 
899 	/*
900 	 * We leave ixa_tsl unchanged, but if it has a refhold we need
901 	 * to get an extra refhold.
902 	 */
903 	if (ixa->ixa_free_flags & IXA_FREE_TSL)
904 		label_hold(ixa->ixa_tsl);
905 
906 	/*
907 	 * We leave ixa_cred unchanged, but if it has a refhold we need
908 	 * to get an extra refhold.
909 	 */
910 	if (ixa->ixa_free_flags & IXA_FREE_CRED)
911 		crhold(ixa->ixa_cred);
912 }
913 
914 /*
915  * Duplicate an ip_xmit_attr_t.
916  * Assumes that the caller controls the ixa, hence we do not need to use
917  * a safe copy. We just have to increase the refcnt on any pointers.
918  */
919 ip_xmit_attr_t *
920 ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
921 {
922 	ip_xmit_attr_t *ixa;
923 
924 	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
925 	if (ixa == NULL)
926 		return (NULL);
927 	bcopy(src_ixa, ixa, sizeof (*ixa));
928 	ixa->ixa_refcnt = 1;
929 
930 	if (ixa->ixa_ire != NULL)
931 		ire_refhold_notr(ixa->ixa_ire);
932 	if (ixa->ixa_nce != NULL)
933 		nce_refhold(ixa->ixa_nce);
934 	if (ixa->ixa_dce != NULL)
935 		dce_refhold_notr(ixa->ixa_dce);
936 
937 #ifdef DEBUG
938 	ixa->ixa_curthread = NULL;
939 #endif
940 
941 	if (ixa->ixa_ipsec_latch != NULL)
942 		IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
943 	if (ixa->ixa_ipsec_ah_sa != NULL)
944 		IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
945 	if (ixa->ixa_ipsec_esp_sa != NULL)
946 		IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
947 	if (ixa->ixa_ipsec_policy != NULL)
948 		IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
949 	if (ixa->ixa_ipsec_action != NULL)
950 		IPACT_REFHOLD(ixa->ixa_ipsec_action);
951 
952 	if (ixa->ixa_tsl != NULL) {
953 		label_hold(ixa->ixa_tsl);
954 		ixa->ixa_free_flags |= IXA_FREE_TSL;
955 	}
956 	if (ixa->ixa_cred != NULL) {
957 		crhold(ixa->ixa_cred);
958 		ixa->ixa_free_flags |= IXA_FREE_CRED;
959 	}
960 	return (ixa);
961 }
962 
963 /*
964  * Used to replace the ixa_label field.
965  * The caller should have a reference on the label, which we transfer to
966  * the attributes so that when the attribute is freed/cleaned up
967  * we will release that reference.
968  */
969 void
970 ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
971 {
972 	ASSERT(tsl != NULL);
973 
974 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
975 		ASSERT(ixa->ixa_tsl != NULL);
976 		label_rele(ixa->ixa_tsl);
977 	} else {
978 		ixa->ixa_free_flags |= IXA_FREE_TSL;
979 	}
980 	ixa->ixa_tsl = tsl;
981 }
982 
983 /*
984  * Replace the ip_recv_attr_t's label.
985  * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
986  * TCP/UDP uses ira_cred to set db_credp for non-socket users.
987  * This can fail (and return B_FALSE) due to lack of memory.
988  */
989 boolean_t
990 ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
991 {
992 	cred_t	*newcr;
993 
994 	if (ira->ira_free_flags & IRA_FREE_TSL) {
995 		ASSERT(ira->ira_tsl != NULL);
996 		label_rele(ira->ira_tsl);
997 	}
998 	label_hold(tsl);
999 	ira->ira_tsl = tsl;
1000 	ira->ira_free_flags |= IRA_FREE_TSL;
1001 
1002 	/*
1003 	 * Reset zoneid if we have a shared address. That allows
1004 	 * ip_fanout_tx_v4/v6 to determine the zoneid again.
1005 	 */
1006 	if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
1007 		ira->ira_zoneid = ALL_ZONES;
1008 
1009 	/* We update ira_cred for RPC */
1010 	newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
1011 	if (newcr == NULL)
1012 		return (B_FALSE);
1013 	if (ira->ira_free_flags & IRA_FREE_CRED)
1014 		crfree(ira->ira_cred);
1015 	ira->ira_cred = newcr;
1016 	ira->ira_free_flags |= IRA_FREE_CRED;
1017 	return (B_TRUE);
1018 }
1019 
1020 /*
1021  * This needs to be called after ip_set_destination/tsol_check_dest might
1022  * have changed ixa_tsl to be specific for a destination, and we now want to
1023  * send to a different destination.
1024  * We have to restart with crgetlabel() since ip_set_destination/
1025  * tsol_check_dest will start with ixa_tsl.
1026  */
1027 void
1028 ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
1029 {
1030 	if (!is_system_labeled())
1031 		return;
1032 
1033 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1034 		ASSERT(ixa->ixa_tsl != NULL);
1035 		label_rele(ixa->ixa_tsl);
1036 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1037 	}
1038 	ixa->ixa_tsl = crgetlabel(cr);
1039 }
1040 
1041 void
1042 ixa_refrele(ip_xmit_attr_t *ixa)
1043 {
1044 	IXA_REFRELE(ixa);
1045 }
1046 
1047 void
1048 ixa_inactive(ip_xmit_attr_t *ixa)
1049 {
1050 	ASSERT(ixa->ixa_refcnt == 0);
1051 
1052 	ixa_cleanup(ixa);
1053 	kmem_free(ixa, sizeof (*ixa));
1054 }
1055 
1056 /*
1057  * Release any references contained in the ixa.
1058  * Also clear any fields that are not controlled by ixa_flags.
1059  */
1060 void
1061 ixa_cleanup(ip_xmit_attr_t *ixa)
1062 {
1063 	if (ixa->ixa_ire != NULL) {
1064 		ire_refrele_notr(ixa->ixa_ire);
1065 		ixa->ixa_ire = NULL;
1066 	}
1067 	if (ixa->ixa_dce != NULL) {
1068 		dce_refrele_notr(ixa->ixa_dce);
1069 		ixa->ixa_dce = NULL;
1070 	}
1071 	if (ixa->ixa_nce != NULL) {
1072 		nce_refrele(ixa->ixa_nce);
1073 		ixa->ixa_nce = NULL;
1074 	}
1075 	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1076 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1077 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
1078 		ipsec_out_release_refs(ixa);
1079 	}
1080 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1081 		ASSERT(ixa->ixa_tsl != NULL);
1082 		label_rele(ixa->ixa_tsl);
1083 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1084 	}
1085 	ixa->ixa_tsl = NULL;
1086 	if (ixa->ixa_free_flags & IXA_FREE_CRED) {
1087 		ASSERT(ixa->ixa_cred != NULL);
1088 		crfree(ixa->ixa_cred);
1089 		ixa->ixa_free_flags &= ~IXA_FREE_CRED;
1090 	}
1091 	ixa->ixa_cred = NULL;
1092 	ixa->ixa_src_preferences = 0;
1093 	ixa->ixa_ifindex = 0;
1094 	ixa->ixa_multicast_ifindex = 0;
1095 	ixa->ixa_multicast_ifaddr = INADDR_ANY;
1096 }
1097 
1098 /*
1099  * Release any references contained in the ira.
1100  * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
1101  * argument.
1102  */
1103 void
1104 ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
1105 {
1106 	if (ira->ira_ill != NULL) {
1107 		if (ira->ira_rill != ira->ira_ill) {
1108 			/* Caused by async processing */
1109 			ill_refrele(ira->ira_rill);
1110 		}
1111 		if (refrele_ill)
1112 			ill_refrele(ira->ira_ill);
1113 	}
1114 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1115 		ipsec_in_release_refs(ira);
1116 	}
1117 	if (ira->ira_free_flags & IRA_FREE_TSL) {
1118 		ASSERT(ira->ira_tsl != NULL);
1119 		label_rele(ira->ira_tsl);
1120 		ira->ira_free_flags &= ~IRA_FREE_TSL;
1121 	}
1122 	ira->ira_tsl = NULL;
1123 	if (ira->ira_free_flags & IRA_FREE_CRED) {
1124 		ASSERT(ira->ira_cred != NULL);
1125 		crfree(ira->ira_cred);
1126 		ira->ira_free_flags &= ~IRA_FREE_CRED;
1127 	}
1128 	ira->ira_cred = NULL;
1129 }
1130 
1131 /*
1132  * Function to help release any IRE, NCE, or DCEs that
1133  * have been deleted and are marked as condemned.
1134  * The caller is responsible for any serialization which is different
1135  * for TCP, SCTP, and others.
1136  */
1137 static void
1138 ixa_cleanup_stale(ip_xmit_attr_t *ixa)
1139 {
1140 	ire_t		*ire;
1141 	nce_t		*nce;
1142 	dce_t		*dce;
1143 
1144 	ire = ixa->ixa_ire;
1145 	nce = ixa->ixa_nce;
1146 	dce = ixa->ixa_dce;
1147 
1148 	if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
1149 		ire_refrele_notr(ire);
1150 		ire = ire_blackhole(ixa->ixa_ipst,
1151 		    !(ixa->ixa_flags & IXAF_IS_IPV4));
1152 		ASSERT(ire != NULL);
1153 #ifdef DEBUG
1154 		ire_refhold_notr(ire);
1155 		ire_refrele(ire);
1156 #endif
1157 		ixa->ixa_ire = ire;
1158 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1159 	}
1160 	if (nce != NULL && nce->nce_is_condemned) {
1161 		/* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
1162 		nce_refrele(nce);
1163 		ixa->ixa_nce = NULL;
1164 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1165 	}
1166 	if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
1167 		dce_refrele_notr(dce);
1168 		dce = dce_get_default(ixa->ixa_ipst);
1169 		ASSERT(dce != NULL);
1170 #ifdef DEBUG
1171 		dce_refhold_notr(dce);
1172 		dce_refrele(dce);
1173 #endif
1174 		ixa->ixa_dce = dce;
1175 		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1176 	}
1177 }
1178 
1179 static mblk_t *
1180 tcp_ixa_cleanup_getmblk(conn_t *connp)
1181 {
1182 	tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1183 	int need_retry;
1184 	mblk_t *mp;
1185 
1186 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1187 
1188 	/*
1189 	 * It's possible that someone else came in and started cleaning up
1190 	 * another connection between the time we verified this one is not being
1191 	 * cleaned up and the time we actually get the shared mblk.  If that's
1192 	 * the case, we've dropped the lock, and some other thread may have
1193 	 * cleaned up this connection again, and is still waiting for
1194 	 * notification of that cleanup's completion.  Therefore we need to
1195 	 * recheck.
1196 	 */
1197 	do {
1198 		need_retry = 0;
1199 		while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
1200 			cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1201 			    &tcps->tcps_ixa_cleanup_lock);
1202 		}
1203 
1204 		while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
1205 			/*
1206 			 * Multiple concurrent cleanups; need to have the last
1207 			 * one run since it could be an unplumb.
1208 			 */
1209 			need_retry = 1;
1210 			cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
1211 			    &tcps->tcps_ixa_cleanup_lock);
1212 		}
1213 	} while (need_retry);
1214 
1215 	/*
1216 	 * We now have the lock and the mblk; now make sure that no one else can
1217 	 * try to clean up this connection or enqueue it for cleanup, clear the
1218 	 * mblk pointer for this stack, drop the lock, and return the mblk.
1219 	 */
1220 	ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
1221 	ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
1222 	ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
1223 	ASSERT(mp != NULL);
1224 
1225 	connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
1226 	tcps->tcps_ixa_cleanup_mp = NULL;
1227 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1228 
1229 	return (mp);
1230 }
1231 
1232 /*
1233  * Used to run ixa_cleanup_stale inside the tcp squeue.
1234  * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
1235  * and waking up the caller.
1236  */
1237 /* ARGSUSED2 */
1238 static void
1239 tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
1240     ip_recv_attr_t *dummy)
1241 {
1242 	conn_t	*connp = (conn_t *)arg;
1243 	tcp_stack_t	*tcps;
1244 
1245 	tcps = connp->conn_netstack->netstack_tcp;
1246 
1247 	ixa_cleanup_stale(connp->conn_ixa);
1248 
1249 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1250 	ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
1251 	connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
1252 	tcps->tcps_ixa_cleanup_mp = mp;
1253 	cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
1254 	/*
1255 	 * It is possible for any number of threads to be waiting for cleanup of
1256 	 * different connections.  Absent a per-connection (or per-IXA) CV, we
1257 	 * need to wake them all up even though only one can be waiting on this
1258 	 * particular cleanup.
1259 	 */
1260 	cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1261 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1262 }
1263 
1264 static void
1265 tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
1266 {
1267 	tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1268 
1269 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1270 
1271 	ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);
1272 
1273 	while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
1274 		cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1275 		    &tcps->tcps_ixa_cleanup_lock);
1276 	}
1277 
1278 	ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
1279 	connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
1280 	cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1281 
1282 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1283 }
1284 
1285 /*
1286  * ipcl_walk() function to help release any IRE, NCE, or DCEs that
1287  * have been deleted and are marked as condemned.
1288  * Note that we can't cleanup the pointers since there can be threads
1289  * in conn_ip_output() sending while we are called.
1290  */
1291 void
1292 conn_ixa_cleanup(conn_t *connp, void *arg)
1293 {
1294 	boolean_t tryhard = (boolean_t)arg;
1295 
1296 	if (IPCL_IS_TCP(connp)) {
1297 		mblk_t		*mp;
1298 
1299 		mp = tcp_ixa_cleanup_getmblk(connp);
1300 
1301 		if (connp->conn_sqp->sq_run == curthread) {
1302 			/* Already on squeue */
1303 			tcp_ixa_cleanup(connp, mp, NULL, NULL);
1304 		} else {
1305 			CONN_INC_REF(connp);
1306 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
1307 			    connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
1308 		}
1309 		tcp_ixa_cleanup_wait_and_finish(connp);
1310 	} else if (IPCL_IS_SCTP(connp)) {
1311 		sctp_t	*sctp;
1312 		sctp_faddr_t *fp;
1313 
1314 		sctp = CONN2SCTP(connp);
1315 		RUN_SCTP(sctp);
1316 		ixa_cleanup_stale(connp->conn_ixa);
1317 		for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next)
1318 			ixa_cleanup_stale(fp->sf_ixa);
1319 		WAKE_SCTP(sctp);
1320 	} else {
1321 		ip_xmit_attr_t	*ixa;
1322 
1323 		/*
1324 		 * If there is a different thread using conn_ixa then we get a
1325 		 * new copy and cut the old one loose from conn_ixa. Otherwise
1326 		 * we use conn_ixa and prevent any other thread from
1327 		 * using/changing it. Anybody using conn_ixa (e.g., a thread in
1328 		 * conn_ip_output) will do an ixa_refrele which will remove any
1329 		 * references on the ire etc.
1330 		 *
1331 		 * Once we are done other threads can use conn_ixa since the
1332 		 * refcnt will be back at one.
1333 		 *
1334 		 * We are called either because an ill is going away, or
1335 		 * due to memory reclaim. In the former case we wait for
1336 		 * memory since we must remove the refcnts on the ill.
1337 		 */
1338 		if (tryhard) {
1339 			ixa = conn_get_ixa_tryhard(connp, B_TRUE);
1340 			ASSERT(ixa != NULL);
1341 		} else {
1342 			ixa = conn_get_ixa(connp, B_TRUE);
1343 			if (ixa == NULL) {
1344 				/*
1345 				 * Somebody else was using it and kmem_alloc
1346 				 * failed! Next memory reclaim will try to
1347 				 * clean up.
1348 				 */
1349 				DTRACE_PROBE1(conn__ixa__cleanup__bail,
1350 				    conn_t *, connp);
1351 				return;
1352 			}
1353 		}
1354 		ixa_cleanup_stale(ixa);
1355 		ixa_refrele(ixa);
1356 	}
1357 }
1358 
1359 /*
1360  * ixa needs to be an exclusive copy so that no one changes the cookie
1361  * or the ixa_nce.
1362  */
1363 boolean_t
1364 ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
1365 {
1366 	uintptr_t cookie = ixa->ixa_cookie;
1367 	ill_dld_direct_t *idd;
1368 	idl_tx_list_t *idl_txl;
1369 	ill_t *ill = ixa->ixa_nce->nce_ill;
1370 	boolean_t inserted = B_FALSE;
1371 
1372 	idd = &(ill)->ill_dld_capab->idc_direct;
1373 	idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
1374 	mutex_enter(&idl_txl->txl_lock);
1375 
1376 	/*
1377 	 * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
1378 	 * control is asserted on an ill that does not support direct calls.
1379 	 * Jump to insert.
1380 	 */
1381 	if (cookie == 0)
1382 		goto tryinsert;
1383 
1384 	ASSERT(ILL_DIRECT_CAPABLE(ill));
1385 
1386 	if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
1387 		DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
1388 	} else if (idl_txl->txl_cookie != NULL &&
1389 	    idl_txl->txl_cookie != ixa->ixa_cookie) {
1390 		DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
1391 		    uintptr_t, idl_txl->txl_cookie);
1392 		/* TODO: bump kstat for cookie collision */
1393 	} else {
1394 		/*
1395 		 * Check/set conn_blocked under conn_lock.  Note that txl_lock
1396 		 * will not suffice since two separate UDP threads may be
1397 		 * racing to send to different destinations that are
1398 		 * associated with different cookies and thus may not be
1399 		 * holding the same txl_lock.  Further, since a given conn_t
1400 		 * can only be on a single drain list, the conn_t will be
1401 		 * enqueued on whichever thread wins this race.
1402 		 */
1403 tryinsert:	mutex_enter(&connp->conn_lock);
1404 		if (connp->conn_blocked) {
1405 			DTRACE_PROBE1(ill__tx__conn__already__blocked,
1406 			    conn_t *, connp);
1407 			mutex_exit(&connp->conn_lock);
1408 		} else {
1409 			connp->conn_blocked = B_TRUE;
1410 			mutex_exit(&connp->conn_lock);
1411 			idl_txl->txl_cookie = cookie;
1412 			conn_drain_insert(connp, idl_txl);
1413 			if (!IPCL_IS_NONSTR(connp))
1414 				noenable(connp->conn_wq);
1415 			inserted = B_TRUE;
1416 		}
1417 	}
1418 	mutex_exit(&idl_txl->txl_lock);
1419 	return (inserted);
1420 }
1421