xref: /titanic_50/usr/src/uts/common/inet/ip/ip_attr.c (revision 77dabb95057c6ac2d639808648bf928ca53585f4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * Copyright 2019 Joyent, Inc.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/stream.h>
33 #include <sys/strsun.h>
34 #include <sys/zone.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/cmn_err.h>
38 #include <sys/debug.h>
39 #include <sys/atomic.h>
40 
41 #include <sys/systm.h>
42 #include <sys/param.h>
43 #include <sys/kmem.h>
44 #include <sys/sdt.h>
45 #include <sys/socket.h>
46 #include <sys/mac.h>
47 #include <net/if.h>
48 #include <net/if_arp.h>
49 #include <net/route.h>
50 #include <sys/sockio.h>
51 #include <netinet/in.h>
52 #include <net/if_dl.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/arp.h>
59 #include <inet/snmpcom.h>
60 #include <inet/kstatcom.h>
61 
62 #include <netinet/igmp_var.h>
63 #include <netinet/ip6.h>
64 #include <netinet/icmp6.h>
65 #include <netinet/sctp.h>
66 
67 #include <inet/ip.h>
68 #include <inet/ip_impl.h>
69 #include <inet/ip6.h>
70 #include <inet/ip6_asp.h>
71 #include <inet/tcp.h>
72 #include <inet/ip_multi.h>
73 #include <inet/ip_if.h>
74 #include <inet/ip_ire.h>
75 #include <inet/ip_ftable.h>
76 #include <inet/ip_rts.h>
77 #include <inet/optcom.h>
78 #include <inet/ip_ndp.h>
79 #include <inet/ip_listutils.h>
80 #include <netinet/igmp.h>
81 #include <netinet/ip_mroute.h>
82 #include <inet/ipp_common.h>
83 
84 #include <net/pfkeyv2.h>
85 #include <inet/sadb.h>
86 #include <inet/ipsec_impl.h>
87 #include <inet/ipdrop.h>
88 #include <inet/ip_netinfo.h>
89 #include <sys/squeue_impl.h>
90 #include <sys/squeue.h>
91 
92 #include <inet/ipclassifier.h>
93 #include <inet/sctp_ip.h>
94 #include <inet/sctp/sctp_impl.h>
95 #include <inet/udp_impl.h>
96 #include <sys/sunddi.h>
97 
98 #include <sys/tsol/label.h>
99 #include <sys/tsol/tnet.h>
100 
/*
 * Release a reference on ip_xmit_attr.
 * The reference is acquired by conn_get_ixa()
 *
 * When the count drops to zero the ixa is handed to ixa_inactive().
 *
 * The macro is wrapped in do { } while (0) so that "IXA_REFRELE(x);" is
 * a single statement (safe in an unbraced if/else), and the argument is
 * evaluated exactly once.
 *
 * This macro has a lowercase function-call version for callers outside
 * this file.
 */
#define	IXA_REFRELE(ixa)						\
do {									\
	ip_xmit_attr_t *ixa_refrele_tmp_ = (ixa);			\
									\
	if (atomic_dec_32_nv(&ixa_refrele_tmp_->ixa_refcnt) == 0)	\
		ixa_inactive(ixa_refrele_tmp_);				\
} while (0)
113 
/*
 * Take an additional reference on an ip_xmit_attr.
 * The caller must already hold a reference, hence the count is asserted
 * to be non-zero before it is incremented.
 *
 * Wrapped in do { } while (0) so that "IXA_REFHOLD(x);" is a single
 * statement; the argument is evaluated exactly once.
 */
#define	IXA_REFHOLD(ixa)						\
do {									\
	ip_xmit_attr_t *ixa_refhold_tmp_ = (ixa);			\
									\
	ASSERT3U(ixa_refhold_tmp_->ixa_refcnt, !=, 0);			\
	atomic_inc_32(&ixa_refhold_tmp_->ixa_refcnt);			\
} while (0)
119 
120 /*
121  * When we need to handle a transmit side asynchronous operation, then we need
122  * to save sufficient information so that we can call the fragment and postfrag
123  * functions. That information is captured in an mblk containing this structure.
124  *
125  * Since this is currently only used for IPsec, we include information for
126  * the kernel crypto framework.
127  */
128 typedef struct ixamblk_s {
129 	boolean_t	ixm_inbound;	/* B_FALSE */
130 	iaflags_t	ixm_flags;	/* ixa_flags */
131 	netstackid_t	ixm_stackid;	/* Verify it didn't go away */
132 	uint_t		ixm_ifindex;	/* Used to find the nce */
133 	in6_addr_t	ixm_nceaddr_v6;	/* Used to find nce */
134 #define	ixm_nceaddr_v4	V4_PART_OF_V6(ixm_nceaddr_v6)
135 	uint32_t	ixm_fragsize;
136 	uint_t		ixm_pktlen;
137 	uint16_t	ixm_ip_hdr_length; /* Points to ULP header */
138 	uint8_t		ixm_protocol;	/* Protocol number for ULP cksum */
139 	pfirepostfrag_t	ixm_postfragfn;
140 
141 	zoneid_t	ixm_zoneid;		/* Needed for ipobs */
142 	zoneid_t	ixm_no_loop_zoneid;	/* IXAF_NO_LOOP_ZONEID_SET */
143 
144 	uint_t		ixm_scopeid;		/* For IPv6 link-locals */
145 
146 	uint32_t	ixm_ident;		/* For IPv6 fragment header */
147 	uint32_t	ixm_xmit_hint;
148 
149 	uint64_t	ixm_conn_id;		/* Used by DTrace */
150 	cred_t		*ixm_cred;	/* For getpeerucred - refhold if set */
151 	pid_t		ixm_cpid;	/* For getpeerucred */
152 
153 	ts_label_t	*ixm_tsl;	/* Refhold if set. */
154 
155 	/*
156 	 * When the pointers below are set they have a refhold on the struct.
157 	 */
158 	ipsec_latch_t		*ixm_ipsec_latch;
159 	struct ipsa_s		*ixm_ipsec_ah_sa;	/* SA for AH */
160 	struct ipsa_s		*ixm_ipsec_esp_sa;	/* SA for ESP */
161 	struct ipsec_policy_s	*ixm_ipsec_policy;	/* why are we here? */
162 	struct ipsec_action_s	*ixm_ipsec_action; /* For reflected packets */
163 
164 	ipsa_ref_t		ixm_ipsec_ref[2]; /* Soft reference to SA */
165 
166 	/* Need these while waiting for SA */
167 	uint16_t ixm_ipsec_src_port;	/* Source port number of d-gram. */
168 	uint16_t ixm_ipsec_dst_port;	/* Destination port number of d-gram. */
169 	uint8_t  ixm_ipsec_icmp_type;	/* ICMP type of d-gram */
170 	uint8_t  ixm_ipsec_icmp_code;	/* ICMP code of d-gram */
171 
172 	sa_family_t ixm_ipsec_inaf;	/* Inner address family */
173 	uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];	/* Inner src address */
174 	uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];	/* Inner dest address */
175 	uint8_t  ixm_ipsec_insrcpfx;	/* Inner source prefix */
176 	uint8_t  ixm_ipsec_indstpfx;	/* Inner destination prefix */
177 
178 	uint8_t ixm_ipsec_proto;	/* IP protocol number for d-gram. */
179 } ixamblk_t;
180 
181 
182 /*
183  * When we need to handle a receive side asynchronous operation, then we need
184  * to save sufficient information so that we can call ip_fanout.
185  * That information is captured in an mblk containing this structure.
186  *
187  * Since this is currently only used for IPsec, we include information for
188  * the kernel crypto framework.
189  */
190 typedef struct iramblk_s {
191 	boolean_t	irm_inbound;	/* B_TRUE */
192 	iaflags_t	irm_flags;	/* ira_flags */
193 	netstackid_t	irm_stackid;	/* Verify it didn't go away */
194 	uint_t		irm_ifindex;	/* To find ira_ill */
195 
196 	uint_t		irm_rifindex;	/* ira_rifindex */
197 	uint_t		irm_ruifindex;	/* ira_ruifindex */
198 	uint_t		irm_pktlen;
199 	uint16_t	irm_ip_hdr_length; /* Points to ULP header */
200 	uint8_t		irm_protocol;	/* Protocol number for ULP cksum */
201 	zoneid_t	irm_zoneid;	/* ALL_ZONES unless local delivery */
202 
203 	squeue_t	*irm_sqp;
204 	ill_rx_ring_t	*irm_ring;
205 
206 	ipaddr_t	irm_mroute_tunnel;	/* IRAF_MROUTE_TUNNEL_SET */
207 	zoneid_t	irm_no_loop_zoneid;	/* IRAF_NO_LOOP_ZONEID_SET */
208 	uint32_t	irm_esp_udp_ports;	/* IRAF_ESP_UDP_PORTS */
209 
210 	char		irm_l2src[IRA_L2SRC_SIZE];	/* If IRAF_L2SRC_SET */
211 
212 	cred_t		*irm_cred;	/* For getpeerucred - refhold if set */
213 	pid_t		irm_cpid;	/* For getpeerucred */
214 
215 	ts_label_t	*irm_tsl;	/* Refhold if set. */
216 
217 	/*
218 	 * When set these correspond to a refhold on the object.
219 	 */
220 	struct ipsa_s		*irm_ipsec_ah_sa;	/* SA for AH */
221 	struct ipsa_s		*irm_ipsec_esp_sa;	/* SA for ESP */
222 	struct ipsec_action_s	*irm_ipsec_action; /* For reflected packets */
223 } iramblk_t;
224 
225 
226 /*
227  * Take the information in ip_xmit_attr_t and stick it in an mblk
228  * that can later be passed to ip_xmit_attr_from_mblk to recreate the
229  * ip_xmit_attr_t.
230  *
231  * Returns NULL on memory allocation failure.
232  */
233 mblk_t *
ip_xmit_attr_to_mblk(ip_xmit_attr_t * ixa)234 ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
235 {
236 	mblk_t		*ixamp;
237 	ixamblk_t	*ixm;
238 	nce_t		*nce = ixa->ixa_nce;
239 
240 	ASSERT(nce != NULL);
241 	ixamp = allocb(sizeof (*ixm), BPRI_MED);
242 	if (ixamp == NULL)
243 		return (NULL);
244 
245 	ixamp->b_datap->db_type = M_BREAK;
246 	ixamp->b_wptr += sizeof (*ixm);
247 	ixm = (ixamblk_t *)ixamp->b_rptr;
248 
249 	bzero(ixm, sizeof (*ixm));
250 	ixm->ixm_inbound = B_FALSE;
251 	ixm->ixm_flags = ixa->ixa_flags;
252 	ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
253 	ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
254 	ixm->ixm_nceaddr_v6 = nce->nce_addr;
255 	ixm->ixm_fragsize = ixa->ixa_fragsize;
256 	ixm->ixm_pktlen = ixa->ixa_pktlen;
257 	ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
258 	ixm->ixm_protocol = ixa->ixa_protocol;
259 	ixm->ixm_postfragfn = ixa->ixa_postfragfn;
260 	ixm->ixm_zoneid = ixa->ixa_zoneid;
261 	ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
262 	ixm->ixm_scopeid = ixa->ixa_scopeid;
263 	ixm->ixm_ident = ixa->ixa_ident;
264 	ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;
265 
266 	if (ixa->ixa_tsl != NULL) {
267 		ixm->ixm_tsl = ixa->ixa_tsl;
268 		label_hold(ixm->ixm_tsl);
269 	}
270 	if (ixa->ixa_cred != NULL) {
271 		ixm->ixm_cred = ixa->ixa_cred;
272 		crhold(ixa->ixa_cred);
273 	}
274 	ixm->ixm_cpid = ixa->ixa_cpid;
275 	ixm->ixm_conn_id = ixa->ixa_conn_id;
276 
277 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
278 		if (ixa->ixa_ipsec_ah_sa != NULL) {
279 			ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
280 			IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
281 		}
282 		if (ixa->ixa_ipsec_esp_sa != NULL) {
283 			ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
284 			IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
285 		}
286 		if (ixa->ixa_ipsec_policy != NULL) {
287 			ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
288 			IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
289 		}
290 		if (ixa->ixa_ipsec_action != NULL) {
291 			ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
292 			IPACT_REFHOLD(ixa->ixa_ipsec_action);
293 		}
294 		if (ixa->ixa_ipsec_latch != NULL) {
295 			ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
296 			IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
297 		}
298 		ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
299 		ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
300 		ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
301 		ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
302 		ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
303 		ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
304 		ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
305 		ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
306 		ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
307 		ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
308 		ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
309 		ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
310 		ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
311 		ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
312 		ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
313 		ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
314 		ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
315 		ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
316 	}
317 	return (ixamp);
318 }
319 
320 /*
321  * Extract the ip_xmit_attr_t from the mblk, checking that the
322  * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
323  * not the case.
324  *
325  * Otherwise ixa is updated.
326  * Caller needs to release references on the ixa by calling ixa_refrele()
327  * which will imediately call ixa_inactive to release the references.
328  */
329 boolean_t
ip_xmit_attr_from_mblk(mblk_t * ixamp,ip_xmit_attr_t * ixa)330 ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
331 {
332 	ixamblk_t	*ixm;
333 	netstack_t	*ns;
334 	ip_stack_t	*ipst;
335 	ill_t		*ill;
336 	nce_t		*nce;
337 
338 	/* We assume the caller hasn't initialized ixa */
339 	bzero(ixa, sizeof (*ixa));
340 
341 	ASSERT(DB_TYPE(ixamp) == M_BREAK);
342 	ASSERT(ixamp->b_cont == NULL);
343 
344 	ixm = (ixamblk_t *)ixamp->b_rptr;
345 	ASSERT(!ixm->ixm_inbound);
346 
347 	/* Verify the netstack is still around */
348 	ns = netstack_find_by_stackid(ixm->ixm_stackid);
349 	if (ns == NULL) {
350 		/* Disappeared on us */
351 		(void) ip_xmit_attr_free_mblk(ixamp);
352 		return (B_FALSE);
353 	}
354 	ipst = ns->netstack_ip;
355 
356 	/* Verify the ill is still around */
357 	ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
358 	    !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);
359 
360 	/* We have the ill, hence the netstack can't go away */
361 	netstack_rele(ns);
362 	if (ill == NULL) {
363 		/* Disappeared on us */
364 		(void) ip_xmit_attr_free_mblk(ixamp);
365 		return (B_FALSE);
366 	}
367 	/*
368 	 * Find the nce. We don't load-spread (only lookup nce's on the ill)
369 	 * because we want to find the same nce as the one we had when
370 	 * ip_xmit_attr_to_mblk was called.
371 	 */
372 	if (ixm->ixm_flags & IXAF_IS_IPV4) {
373 		nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
374 	} else {
375 		nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
376 	}
377 
378 	/* We have the nce, hence the ill can't go away */
379 	ill_refrele(ill);
380 	if (nce == NULL) {
381 		/*
382 		 * Since this is unusual and we don't know what type of
383 		 * nce it was, we drop the packet.
384 		 */
385 		(void) ip_xmit_attr_free_mblk(ixamp);
386 		return (B_FALSE);
387 	}
388 
389 	ixa->ixa_flags = ixm->ixm_flags;
390 	ixa->ixa_refcnt = 1;
391 	ixa->ixa_ipst = ipst;
392 	ixa->ixa_fragsize = ixm->ixm_fragsize;
393 	ixa->ixa_pktlen =  ixm->ixm_pktlen;
394 	ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
395 	ixa->ixa_protocol = ixm->ixm_protocol;
396 	ixa->ixa_nce = nce;
397 	ixa->ixa_postfragfn = ixm->ixm_postfragfn;
398 	ixa->ixa_zoneid = ixm->ixm_zoneid;
399 	ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
400 	ixa->ixa_scopeid = ixm->ixm_scopeid;
401 	ixa->ixa_ident = ixm->ixm_ident;
402 	ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;
403 
404 	if (ixm->ixm_tsl != NULL) {
405 		ixa->ixa_tsl = ixm->ixm_tsl;
406 		ixa->ixa_free_flags |= IXA_FREE_TSL;
407 		ixm->ixm_tsl = NULL;
408 	}
409 	if (ixm->ixm_cred != NULL) {
410 		ixa->ixa_cred = ixm->ixm_cred;
411 		ixa->ixa_free_flags |= IXA_FREE_CRED;
412 		ixm->ixm_cred = NULL;
413 	}
414 	ixa->ixa_cpid = ixm->ixm_cpid;
415 	ixa->ixa_conn_id = ixm->ixm_conn_id;
416 
417 	ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
418 	ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
419 	ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
420 	ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
421 	ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;
422 
423 	ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
424 	ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
425 	ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
426 	ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
427 	ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
428 	ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
429 	ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
430 	ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
431 	ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
432 	ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
433 	ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
434 	ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
435 	ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
436 	ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
437 	ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
438 	ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
439 	ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
440 	ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;
441 
442 	freeb(ixamp);
443 	return (B_TRUE);
444 }
445 
446 /*
447  * Free the ixm mblk and any references it holds
448  * Returns b_cont.
449  */
450 mblk_t *
ip_xmit_attr_free_mblk(mblk_t * ixamp)451 ip_xmit_attr_free_mblk(mblk_t *ixamp)
452 {
453 	ixamblk_t	*ixm;
454 	mblk_t		*mp;
455 
456 	/* Consume mp */
457 	ASSERT(DB_TYPE(ixamp) == M_BREAK);
458 	mp = ixamp->b_cont;
459 
460 	ixm = (ixamblk_t *)ixamp->b_rptr;
461 	ASSERT(!ixm->ixm_inbound);
462 
463 	if (ixm->ixm_ipsec_ah_sa != NULL) {
464 		IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
465 		ixm->ixm_ipsec_ah_sa = NULL;
466 	}
467 	if (ixm->ixm_ipsec_esp_sa != NULL) {
468 		IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
469 		ixm->ixm_ipsec_esp_sa = NULL;
470 	}
471 	if (ixm->ixm_ipsec_policy != NULL) {
472 		IPPOL_REFRELE(ixm->ixm_ipsec_policy);
473 		ixm->ixm_ipsec_policy = NULL;
474 	}
475 	if (ixm->ixm_ipsec_action != NULL) {
476 		IPACT_REFRELE(ixm->ixm_ipsec_action);
477 		ixm->ixm_ipsec_action = NULL;
478 	}
479 	if (ixm->ixm_ipsec_latch) {
480 		IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
481 		ixm->ixm_ipsec_latch = NULL;
482 	}
483 
484 	if (ixm->ixm_tsl != NULL) {
485 		label_rele(ixm->ixm_tsl);
486 		ixm->ixm_tsl = NULL;
487 	}
488 	if (ixm->ixm_cred != NULL) {
489 		crfree(ixm->ixm_cred);
490 		ixm->ixm_cred = NULL;
491 	}
492 	freeb(ixamp);
493 	return (mp);
494 }
495 
496 /*
497  * Take the information in ip_recv_attr_t and stick it in an mblk
498  * that can later be passed to ip_recv_attr_from_mblk to recreate the
499  * ip_recv_attr_t.
500  *
501  * Returns NULL on memory allocation failure.
502  */
503 mblk_t *
ip_recv_attr_to_mblk(ip_recv_attr_t * ira)504 ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
505 {
506 	mblk_t		*iramp;
507 	iramblk_t	*irm;
508 	ill_t		*ill = ira->ira_ill;
509 
510 	ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);
511 
512 	iramp = allocb(sizeof (*irm), BPRI_MED);
513 	if (iramp == NULL)
514 		return (NULL);
515 
516 	iramp->b_datap->db_type = M_BREAK;
517 	iramp->b_wptr += sizeof (*irm);
518 	irm = (iramblk_t *)iramp->b_rptr;
519 
520 	bzero(irm, sizeof (*irm));
521 	irm->irm_inbound = B_TRUE;
522 	irm->irm_flags = ira->ira_flags;
523 	if (ill != NULL) {
524 		/* Internal to IP - preserve ip_stack_t, ill and rill */
525 		irm->irm_stackid =
526 		    ill->ill_ipst->ips_netstack->netstack_stackid;
527 		irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
528 		ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
529 		    ira->ira_rifindex);
530 	} else {
531 		/* Let ip_recv_attr_from_stackid know there isn't one */
532 		irm->irm_stackid = -1;
533 	}
534 	irm->irm_rifindex = ira->ira_rifindex;
535 	irm->irm_ruifindex = ira->ira_ruifindex;
536 	irm->irm_pktlen = ira->ira_pktlen;
537 	irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
538 	irm->irm_protocol = ira->ira_protocol;
539 
540 	irm->irm_sqp = ira->ira_sqp;
541 	irm->irm_ring = ira->ira_ring;
542 
543 	irm->irm_zoneid = ira->ira_zoneid;
544 	irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
545 	irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
546 	irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;
547 
548 	if (ira->ira_tsl != NULL) {
549 		irm->irm_tsl = ira->ira_tsl;
550 		label_hold(irm->irm_tsl);
551 	}
552 	if (ira->ira_cred != NULL) {
553 		irm->irm_cred = ira->ira_cred;
554 		crhold(ira->ira_cred);
555 	}
556 	irm->irm_cpid = ira->ira_cpid;
557 
558 	if (ira->ira_flags & IRAF_L2SRC_SET)
559 		bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);
560 
561 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
562 		if (ira->ira_ipsec_ah_sa != NULL) {
563 			irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
564 			IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
565 		}
566 		if (ira->ira_ipsec_esp_sa != NULL) {
567 			irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
568 			IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
569 		}
570 		if (ira->ira_ipsec_action != NULL) {
571 			irm->irm_ipsec_action = ira->ira_ipsec_action;
572 			IPACT_REFHOLD(ira->ira_ipsec_action);
573 		}
574 	}
575 	return (iramp);
576 }
577 
578 /*
579  * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
580  * then irm_stackid is not -1, in which case we check that the
581  * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
582  * not the case.
583  * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter)
584  * and we just proceed with ira_ill and ira_rill as NULL.
585  *
586  * The caller needs to release any references on the pointers inside the ire
587  * by calling ira_cleanup.
588  */
589 boolean_t
ip_recv_attr_from_mblk(mblk_t * iramp,ip_recv_attr_t * ira)590 ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
591 {
592 	iramblk_t	*irm;
593 	netstack_t	*ns;
594 	ip_stack_t	*ipst = NULL;
595 	ill_t		*ill = NULL, *rill = NULL;
596 
597 	/* We assume the caller hasn't initialized ira */
598 	bzero(ira, sizeof (*ira));
599 
600 	ASSERT(DB_TYPE(iramp) == M_BREAK);
601 	ASSERT(iramp->b_cont == NULL);
602 
603 	irm = (iramblk_t *)iramp->b_rptr;
604 	ASSERT(irm->irm_inbound);
605 
606 	if (irm->irm_stackid != -1) {
607 		/* Verify the netstack is still around */
608 		ns = netstack_find_by_stackid(irm->irm_stackid);
609 		if (ns == NULL) {
610 			/* Disappeared on us */
611 			(void) ip_recv_attr_free_mblk(iramp);
612 			return (B_FALSE);
613 		}
614 		ipst = ns->netstack_ip;
615 
616 		/* Verify the ill is still around */
617 		ill = ill_lookup_on_ifindex(irm->irm_ifindex,
618 		    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
619 
620 		if (irm->irm_ifindex == irm->irm_rifindex) {
621 			rill = ill;
622 		} else {
623 			rill = ill_lookup_on_ifindex(irm->irm_rifindex,
624 			    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
625 		}
626 
627 		/* We have the ill, hence the netstack can't go away */
628 		netstack_rele(ns);
629 		if (ill == NULL || rill == NULL) {
630 			/* Disappeared on us */
631 			if (ill != NULL)
632 				ill_refrele(ill);
633 			if (rill != NULL && rill != ill)
634 				ill_refrele(rill);
635 			(void) ip_recv_attr_free_mblk(iramp);
636 			return (B_FALSE);
637 		}
638 	}
639 
640 	ira->ira_flags = irm->irm_flags;
641 	/* Caller must ill_refele(ira_ill) by using ira_cleanup() */
642 	ira->ira_ill = ill;
643 	ira->ira_rill = rill;
644 
645 	ira->ira_rifindex = irm->irm_rifindex;
646 	ira->ira_ruifindex = irm->irm_ruifindex;
647 	ira->ira_pktlen = irm->irm_pktlen;
648 	ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
649 	ira->ira_protocol = irm->irm_protocol;
650 
651 	ira->ira_sqp = irm->irm_sqp;
652 	/* The rest of IP assumes that the rings never go away. */
653 	ira->ira_ring = irm->irm_ring;
654 
655 	ira->ira_zoneid = irm->irm_zoneid;
656 	ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
657 	ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
658 	ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;
659 
660 	if (irm->irm_tsl != NULL) {
661 		ira->ira_tsl = irm->irm_tsl;
662 		ira->ira_free_flags |= IRA_FREE_TSL;
663 		irm->irm_tsl = NULL;
664 	}
665 	if (irm->irm_cred != NULL) {
666 		ira->ira_cred = irm->irm_cred;
667 		ira->ira_free_flags |= IRA_FREE_CRED;
668 		irm->irm_cred = NULL;
669 	}
670 	ira->ira_cpid = irm->irm_cpid;
671 
672 	if (ira->ira_flags & IRAF_L2SRC_SET)
673 		bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);
674 
675 	ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
676 	ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
677 	ira->ira_ipsec_action = irm->irm_ipsec_action;
678 
679 	freeb(iramp);
680 	return (B_TRUE);
681 }
682 
683 /*
684  * Free the irm mblk and any references it holds
685  * Returns b_cont.
686  */
687 mblk_t *
ip_recv_attr_free_mblk(mblk_t * iramp)688 ip_recv_attr_free_mblk(mblk_t *iramp)
689 {
690 	iramblk_t	*irm;
691 	mblk_t		*mp;
692 
693 	/* Consume mp */
694 	ASSERT(DB_TYPE(iramp) == M_BREAK);
695 	mp = iramp->b_cont;
696 
697 	irm = (iramblk_t *)iramp->b_rptr;
698 	ASSERT(irm->irm_inbound);
699 
700 	if (irm->irm_ipsec_ah_sa != NULL) {
701 		IPSA_REFRELE(irm->irm_ipsec_ah_sa);
702 		irm->irm_ipsec_ah_sa = NULL;
703 	}
704 	if (irm->irm_ipsec_esp_sa != NULL) {
705 		IPSA_REFRELE(irm->irm_ipsec_esp_sa);
706 		irm->irm_ipsec_esp_sa = NULL;
707 	}
708 	if (irm->irm_ipsec_action != NULL) {
709 		IPACT_REFRELE(irm->irm_ipsec_action);
710 		irm->irm_ipsec_action = NULL;
711 	}
712 	if (irm->irm_tsl != NULL) {
713 		label_rele(irm->irm_tsl);
714 		irm->irm_tsl = NULL;
715 	}
716 	if (irm->irm_cred != NULL) {
717 		crfree(irm->irm_cred);
718 		irm->irm_cred = NULL;
719 	}
720 
721 	freeb(iramp);
722 	return (mp);
723 }
724 
725 /*
726  * Returns true if the mblk contains an ip_recv_attr_t
727  * For now we just check db_type.
728  */
729 boolean_t
ip_recv_attr_is_mblk(mblk_t * mp)730 ip_recv_attr_is_mblk(mblk_t *mp)
731 {
732 	/*
733 	 * Need to handle the various forms of tcp_timermp which are tagged
734 	 * with b_wptr and might have a NULL b_datap.
735 	 */
736 	if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
737 		return (B_FALSE);
738 
739 #ifdef	DEBUG
740 	iramblk_t	*irm;
741 
742 	if (DB_TYPE(mp) != M_BREAK)
743 		return (B_FALSE);
744 
745 	irm = (iramblk_t *)mp->b_rptr;
746 	ASSERT(irm->irm_inbound);
747 	return (B_TRUE);
748 #else
749 	return (DB_TYPE(mp) == M_BREAK);
750 #endif
751 }
752 
753 static ip_xmit_attr_t *
conn_get_ixa_impl(conn_t * connp,boolean_t replace,int kmflag)754 conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
755 {
756 	ip_xmit_attr_t	*oldixa;	/* Already attached to conn_t */
757 	ip_xmit_attr_t	*ixa;		/* New one, which we return. */
758 
759 	/*
760 	 * NOTE: If the marked-below common case isn't, move the
761 	 * kmem_alloc() up here and put a free in what was marked as the
762 	 * (not really) common case instead.
763 	 */
764 
765 	mutex_enter(&connp->conn_lock);
766 	oldixa = connp->conn_ixa;
767 
768 	/* At least one reference for the conn_t */
769 	ASSERT3U(oldixa->ixa_refcnt, >=, 1);
770 	if (atomic_inc_32_nv(&oldixa->ixa_refcnt) == 2) {
771 		/* No other thread using conn_ixa (common case) */
772 		mutex_exit(&connp->conn_lock);
773 		return (oldixa);
774 	}
775 	/* Do allocation inside-the-conn_lock because it's less common. */
776 	ixa = kmem_alloc(sizeof (*ixa), kmflag);
777 	if (ixa == NULL) {
778 		mutex_exit(&connp->conn_lock);
779 		IXA_REFRELE(oldixa);
780 		return (NULL);
781 	}
782 	ixa_safe_copy(oldixa, ixa);
783 
784 	/* Make sure we drop conn_lock before any refrele */
785 	if (replace) {
786 		ixa->ixa_refcnt++;	/* No atomic needed - not visible */
787 		connp->conn_ixa = ixa;
788 		mutex_exit(&connp->conn_lock);
789 		IXA_REFRELE(oldixa);	/* Undo refcnt from conn_t */
790 	} else {
791 		mutex_exit(&connp->conn_lock);
792 	}
793 	IXA_REFRELE(oldixa);	/* Undo above atomic_add_32_nv */
794 
795 	return (ixa);
796 }
797 
798 /*
799  * Return an ip_xmit_attr_t to use with a conn_t that ensures that only
800  * the caller can access the ip_xmit_attr_t.
801  *
802  * If nobody else is using conn_ixa we return it.
803  * Otherwise we make a "safe" copy of conn_ixa
804  * and return it. The "safe" copy has the pointers set to NULL
805  * (since the pointers might be changed by another thread using
806  * conn_ixa). The caller needs to check for NULL pointers to see
807  * if ip_set_destination needs to be called to re-establish the pointers.
808  *
809  * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
810  * That is used when we connect() the ULP.
811  */
812 ip_xmit_attr_t *
conn_get_ixa(conn_t * connp,boolean_t replace)813 conn_get_ixa(conn_t *connp, boolean_t replace)
814 {
815 	return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
816 }
817 
818 /*
819  * Used only when the option is to have the kernel hang due to not
820  * cleaning up ixa references on ills etc.
821  */
822 ip_xmit_attr_t *
conn_get_ixa_tryhard(conn_t * connp,boolean_t replace)823 conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
824 {
825 	return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
826 }
827 
828 /*
829  * Replace conn_ixa with the ixa argument.
830  *
831  * The caller must hold conn_lock.
832  *
833  * We return the old ixa; the caller must ixa_refrele that after conn_lock
834  * has been dropped.
835  */
836 ip_xmit_attr_t *
conn_replace_ixa(conn_t * connp,ip_xmit_attr_t * ixa)837 conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
838 {
839 	ip_xmit_attr_t	*oldixa;
840 
841 	ASSERT(MUTEX_HELD(&connp->conn_lock));
842 
843 	oldixa = connp->conn_ixa;
844 	IXA_REFHOLD(ixa);
845 	ixa->ixa_conn_id = oldixa->ixa_conn_id;
846 	connp->conn_ixa = ixa;
847 	return (oldixa);
848 }
849 
850 /*
851  * Return a ip_xmit_attr_t to use with a conn_t that is based on but
852  * separate from conn_ixa.
853  *
854  * This "safe" copy has the pointers set to NULL
855  * (since the pointers might be changed by another thread using
856  * conn_ixa). The caller needs to check for NULL pointers to see
857  * if ip_set_destination needs to be called to re-establish the pointers.
858  */
859 ip_xmit_attr_t *
conn_get_ixa_exclusive(conn_t * connp)860 conn_get_ixa_exclusive(conn_t *connp)
861 {
862 	ip_xmit_attr_t *oldixa;
863 	ip_xmit_attr_t *ixa;
864 
865 	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP | KM_NORMALPRI);
866 	if (ixa == NULL)
867 		return (NULL);
868 
869 	mutex_enter(&connp->conn_lock);
870 
871 	oldixa = connp->conn_ixa;
872 	IXA_REFHOLD(oldixa);
873 
874 	ixa_safe_copy(oldixa, ixa);
875 	mutex_exit(&connp->conn_lock);
876 	IXA_REFRELE(oldixa);
877 	return (ixa);
878 }
879 
880 void
ixa_safe_copy(ip_xmit_attr_t * src,ip_xmit_attr_t * ixa)881 ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
882 {
883 	bcopy(src, ixa, sizeof (*ixa));
884 	ixa->ixa_refcnt = 1;
885 	/*
886 	 * Clear any pointers that have references and might be changed
887 	 * by ip_set_destination or the ULP
888 	 */
889 	ixa->ixa_ire = NULL;
890 	ixa->ixa_nce = NULL;
891 	ixa->ixa_dce = NULL;
892 	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
893 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
894 #ifdef DEBUG
895 	ixa->ixa_curthread = NULL;
896 #endif
897 	/* Clear all the IPsec pointers and the flag as well. */
898 	ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
899 
900 	ixa->ixa_ipsec_latch = NULL;
901 	ixa->ixa_ipsec_ah_sa = NULL;
902 	ixa->ixa_ipsec_esp_sa = NULL;
903 	ixa->ixa_ipsec_policy = NULL;
904 	ixa->ixa_ipsec_action = NULL;
905 
906 	/*
907 	 * We leave ixa_tsl unchanged, but if it has a refhold we need
908 	 * to get an extra refhold.
909 	 */
910 	if (ixa->ixa_free_flags & IXA_FREE_TSL)
911 		label_hold(ixa->ixa_tsl);
912 
913 	/*
914 	 * We leave ixa_cred unchanged, but if it has a refhold we need
915 	 * to get an extra refhold.
916 	 */
917 	if (ixa->ixa_free_flags & IXA_FREE_CRED)
918 		crhold(ixa->ixa_cred);
919 
920 	/*
921 	 * There is no cleanup in progress on this new copy.
922 	 */
923 	ixa->ixa_tcpcleanup = IXATC_IDLE;
924 }
925 
926 /*
927  * Duplicate an ip_xmit_attr_t.
928  * Assumes that the caller controls the ixa, hence we do not need to use
929  * a safe copy. We just have to increase the refcnt on any pointers.
930  */
931 ip_xmit_attr_t *
ip_xmit_attr_duplicate(ip_xmit_attr_t * src_ixa)932 ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
933 {
934 	ip_xmit_attr_t *ixa;
935 
936 	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
937 	if (ixa == NULL)
938 		return (NULL);
939 	bcopy(src_ixa, ixa, sizeof (*ixa));
940 	ixa->ixa_refcnt = 1;
941 
942 	if (ixa->ixa_ire != NULL)
943 		ire_refhold_notr(ixa->ixa_ire);
944 	if (ixa->ixa_nce != NULL)
945 		nce_refhold(ixa->ixa_nce);
946 	if (ixa->ixa_dce != NULL)
947 		dce_refhold_notr(ixa->ixa_dce);
948 
949 #ifdef DEBUG
950 	ixa->ixa_curthread = NULL;
951 #endif
952 
953 	if (ixa->ixa_ipsec_latch != NULL)
954 		IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
955 	if (ixa->ixa_ipsec_ah_sa != NULL)
956 		IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
957 	if (ixa->ixa_ipsec_esp_sa != NULL)
958 		IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
959 	if (ixa->ixa_ipsec_policy != NULL)
960 		IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
961 	if (ixa->ixa_ipsec_action != NULL)
962 		IPACT_REFHOLD(ixa->ixa_ipsec_action);
963 
964 	if (ixa->ixa_tsl != NULL) {
965 		label_hold(ixa->ixa_tsl);
966 		ixa->ixa_free_flags |= IXA_FREE_TSL;
967 	}
968 	if (ixa->ixa_cred != NULL) {
969 		crhold(ixa->ixa_cred);
970 		ixa->ixa_free_flags |= IXA_FREE_CRED;
971 	}
972 	return (ixa);
973 }
974 
975 /*
976  * Used to replace the ixa_label field.
977  * The caller should have a reference on the label, which we transfer to
978  * the attributes so that when the attribute is freed/cleaned up
979  * we will release that reference.
980  */
981 void
ip_xmit_attr_replace_tsl(ip_xmit_attr_t * ixa,ts_label_t * tsl)982 ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
983 {
984 	ASSERT(tsl != NULL);
985 
986 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
987 		ASSERT(ixa->ixa_tsl != NULL);
988 		label_rele(ixa->ixa_tsl);
989 	} else {
990 		ixa->ixa_free_flags |= IXA_FREE_TSL;
991 	}
992 	ixa->ixa_tsl = tsl;
993 }
994 
995 /*
996  * Replace the ip_recv_attr_t's label.
997  * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
998  * TCP/UDP uses ira_cred to set db_credp for non-socket users.
999  * This can fail (and return B_FALSE) due to lack of memory.
1000  */
1001 boolean_t
ip_recv_attr_replace_label(ip_recv_attr_t * ira,ts_label_t * tsl)1002 ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
1003 {
1004 	cred_t	*newcr;
1005 
1006 	if (ira->ira_free_flags & IRA_FREE_TSL) {
1007 		ASSERT(ira->ira_tsl != NULL);
1008 		label_rele(ira->ira_tsl);
1009 	}
1010 	label_hold(tsl);
1011 	ira->ira_tsl = tsl;
1012 	ira->ira_free_flags |= IRA_FREE_TSL;
1013 
1014 	/*
1015 	 * Reset zoneid if we have a shared address. That allows
1016 	 * ip_fanout_tx_v4/v6 to determine the zoneid again.
1017 	 */
1018 	if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
1019 		ira->ira_zoneid = ALL_ZONES;
1020 
1021 	/* We update ira_cred for RPC */
1022 	newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
1023 	if (newcr == NULL)
1024 		return (B_FALSE);
1025 	if (ira->ira_free_flags & IRA_FREE_CRED)
1026 		crfree(ira->ira_cred);
1027 	ira->ira_cred = newcr;
1028 	ira->ira_free_flags |= IRA_FREE_CRED;
1029 	return (B_TRUE);
1030 }
1031 
1032 /*
1033  * This needs to be called after ip_set_destination/tsol_check_dest might
1034  * have changed ixa_tsl to be specific for a destination, and we now want to
1035  * send to a different destination.
1036  * We have to restart with crgetlabel() since ip_set_destination/
1037  * tsol_check_dest will start with ixa_tsl.
1038  */
1039 void
ip_xmit_attr_restore_tsl(ip_xmit_attr_t * ixa,cred_t * cr)1040 ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
1041 {
1042 	if (!is_system_labeled())
1043 		return;
1044 
1045 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1046 		ASSERT(ixa->ixa_tsl != NULL);
1047 		label_rele(ixa->ixa_tsl);
1048 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1049 	}
1050 	ixa->ixa_tsl = crgetlabel(cr);
1051 }
1052 
1053 void
ixa_refrele(ip_xmit_attr_t * ixa)1054 ixa_refrele(ip_xmit_attr_t *ixa)
1055 {
1056 	IXA_REFRELE(ixa);
1057 }
1058 
1059 void
ixa_inactive(ip_xmit_attr_t * ixa)1060 ixa_inactive(ip_xmit_attr_t *ixa)
1061 {
1062 	ASSERT(ixa->ixa_refcnt == 0);
1063 
1064 	ixa_cleanup(ixa);
1065 	kmem_free(ixa, sizeof (*ixa));
1066 }
1067 
1068 /*
1069  * Release any references contained in the ixa.
1070  * Also clear any fields that are not controlled by ixa_flags.
1071  */
1072 void
ixa_cleanup(ip_xmit_attr_t * ixa)1073 ixa_cleanup(ip_xmit_attr_t *ixa)
1074 {
1075 	if (ixa->ixa_ire != NULL) {
1076 		ire_refrele_notr(ixa->ixa_ire);
1077 		ixa->ixa_ire = NULL;
1078 	}
1079 	if (ixa->ixa_dce != NULL) {
1080 		dce_refrele_notr(ixa->ixa_dce);
1081 		ixa->ixa_dce = NULL;
1082 	}
1083 	if (ixa->ixa_nce != NULL) {
1084 		nce_refrele(ixa->ixa_nce);
1085 		ixa->ixa_nce = NULL;
1086 	}
1087 	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1088 	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1089 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
1090 		ipsec_out_release_refs(ixa);
1091 	}
1092 	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1093 		ASSERT(ixa->ixa_tsl != NULL);
1094 		label_rele(ixa->ixa_tsl);
1095 		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1096 	}
1097 	ixa->ixa_tsl = NULL;
1098 	if (ixa->ixa_free_flags & IXA_FREE_CRED) {
1099 		ASSERT(ixa->ixa_cred != NULL);
1100 		crfree(ixa->ixa_cred);
1101 		ixa->ixa_free_flags &= ~IXA_FREE_CRED;
1102 	}
1103 	ixa->ixa_cred = NULL;
1104 	ixa->ixa_src_preferences = 0;
1105 	ixa->ixa_ifindex = 0;
1106 	ixa->ixa_multicast_ifindex = 0;
1107 	ixa->ixa_multicast_ifaddr = INADDR_ANY;
1108 }
1109 
1110 /*
1111  * Release any references contained in the ira.
1112  * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
1113  * argument.
1114  */
1115 void
ira_cleanup(ip_recv_attr_t * ira,boolean_t refrele_ill)1116 ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
1117 {
1118 	if (ira->ira_ill != NULL) {
1119 		if (ira->ira_rill != ira->ira_ill) {
1120 			/* Caused by async processing */
1121 			ill_refrele(ira->ira_rill);
1122 		}
1123 		if (refrele_ill)
1124 			ill_refrele(ira->ira_ill);
1125 	}
1126 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1127 		ipsec_in_release_refs(ira);
1128 	}
1129 	if (ira->ira_free_flags & IRA_FREE_TSL) {
1130 		ASSERT(ira->ira_tsl != NULL);
1131 		label_rele(ira->ira_tsl);
1132 		ira->ira_free_flags &= ~IRA_FREE_TSL;
1133 	}
1134 	ira->ira_tsl = NULL;
1135 	if (ira->ira_free_flags & IRA_FREE_CRED) {
1136 		ASSERT(ira->ira_cred != NULL);
1137 		crfree(ira->ira_cred);
1138 		ira->ira_free_flags &= ~IRA_FREE_CRED;
1139 	}
1140 	ira->ira_cred = NULL;
1141 }
1142 
1143 /*
1144  * Function to help release any IRE, NCE, or DCEs that
1145  * have been deleted and are marked as condemned.
1146  * The caller is responsible for any serialization which is different
1147  * for TCP, SCTP, and others.
1148  */
1149 static void
ixa_cleanup_stale(ip_xmit_attr_t * ixa)1150 ixa_cleanup_stale(ip_xmit_attr_t *ixa)
1151 {
1152 	ire_t		*ire;
1153 	nce_t		*nce;
1154 	dce_t		*dce;
1155 
1156 	ire = ixa->ixa_ire;
1157 	nce = ixa->ixa_nce;
1158 	dce = ixa->ixa_dce;
1159 
1160 	if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
1161 		ire_refrele_notr(ire);
1162 		ire = ire_blackhole(ixa->ixa_ipst,
1163 		    !(ixa->ixa_flags & IXAF_IS_IPV4));
1164 		ASSERT(ire != NULL);
1165 #ifdef DEBUG
1166 		ire_refhold_notr(ire);
1167 		ire_refrele(ire);
1168 #endif
1169 		ixa->ixa_ire = ire;
1170 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1171 	}
1172 	if (nce != NULL && nce->nce_is_condemned) {
1173 		/* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
1174 		nce_refrele(nce);
1175 		ixa->ixa_nce = NULL;
1176 		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1177 	}
1178 	if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
1179 		dce_refrele_notr(dce);
1180 		dce = dce_get_default(ixa->ixa_ipst);
1181 		ASSERT(dce != NULL);
1182 #ifdef DEBUG
1183 		dce_refhold_notr(dce);
1184 		dce_refrele(dce);
1185 #endif
1186 		ixa->ixa_dce = dce;
1187 		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1188 	}
1189 }
1190 
1191 static mblk_t *
tcp_ixa_cleanup_getmblk(conn_t * connp)1192 tcp_ixa_cleanup_getmblk(conn_t *connp)
1193 {
1194 	tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1195 	int need_retry;
1196 	mblk_t *mp;
1197 
1198 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1199 
1200 	/*
1201 	 * It's possible that someone else came in and started cleaning up
1202 	 * another connection between the time we verified this one is not being
1203 	 * cleaned up and the time we actually get the shared mblk.  If that's
1204 	 * the case, we've dropped the lock, and some other thread may have
1205 	 * cleaned up this connection again, and is still waiting for
1206 	 * notification of that cleanup's completion.  Therefore we need to
1207 	 * recheck.
1208 	 */
1209 	do {
1210 		need_retry = 0;
1211 		while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
1212 			cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1213 			    &tcps->tcps_ixa_cleanup_lock);
1214 		}
1215 
1216 		while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
1217 			/*
1218 			 * Multiple concurrent cleanups; need to have the last
1219 			 * one run since it could be an unplumb.
1220 			 */
1221 			need_retry = 1;
1222 			cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
1223 			    &tcps->tcps_ixa_cleanup_lock);
1224 		}
1225 	} while (need_retry);
1226 
1227 	/*
1228 	 * We now have the lock and the mblk; now make sure that no one else can
1229 	 * try to clean up this connection or enqueue it for cleanup, clear the
1230 	 * mblk pointer for this stack, drop the lock, and return the mblk.
1231 	 */
1232 	ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
1233 	ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
1234 	ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
1235 	ASSERT(mp != NULL);
1236 
1237 	connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
1238 	tcps->tcps_ixa_cleanup_mp = NULL;
1239 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1240 
1241 	return (mp);
1242 }
1243 
1244 /*
1245  * Used to run ixa_cleanup_stale inside the tcp squeue.
1246  * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
1247  * and waking up the caller.
1248  */
1249 /* ARGSUSED2 */
1250 static void
tcp_ixa_cleanup(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)1251 tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
1252     ip_recv_attr_t *dummy)
1253 {
1254 	conn_t	*connp = (conn_t *)arg;
1255 	tcp_stack_t	*tcps;
1256 
1257 	tcps = connp->conn_netstack->netstack_tcp;
1258 
1259 	ixa_cleanup_stale(connp->conn_ixa);
1260 
1261 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1262 	ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
1263 	connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
1264 	tcps->tcps_ixa_cleanup_mp = mp;
1265 	cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
1266 	/*
1267 	 * It is possible for any number of threads to be waiting for cleanup of
1268 	 * different connections.  Absent a per-connection (or per-IXA) CV, we
1269 	 * need to wake them all up even though only one can be waiting on this
1270 	 * particular cleanup.
1271 	 */
1272 	cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1273 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1274 }
1275 
1276 static void
tcp_ixa_cleanup_wait_and_finish(conn_t * connp)1277 tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
1278 {
1279 	tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1280 
1281 	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1282 
1283 	ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);
1284 
1285 	while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
1286 		cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1287 		    &tcps->tcps_ixa_cleanup_lock);
1288 	}
1289 
1290 	ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
1291 	connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
1292 	cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1293 
1294 	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1295 }
1296 
1297 /*
1298  * ipcl_walk() function to help release any IRE, NCE, or DCEs that
1299  * have been deleted and are marked as condemned.
1300  * Note that we can't cleanup the pointers since there can be threads
1301  * in conn_ip_output() sending while we are called.
1302  */
1303 void
conn_ixa_cleanup(conn_t * connp,void * arg)1304 conn_ixa_cleanup(conn_t *connp, void *arg)
1305 {
1306 	boolean_t tryhard = (boolean_t)arg;
1307 
1308 	if (IPCL_IS_TCP(connp)) {
1309 		mblk_t		*mp;
1310 
1311 		mp = tcp_ixa_cleanup_getmblk(connp);
1312 
1313 		if (connp->conn_sqp->sq_run == curthread) {
1314 			/* Already on squeue */
1315 			tcp_ixa_cleanup(connp, mp, NULL, NULL);
1316 		} else {
1317 			CONN_INC_REF(connp);
1318 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
1319 			    connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
1320 		}
1321 		tcp_ixa_cleanup_wait_and_finish(connp);
1322 	} else if (IPCL_IS_SCTP(connp)) {
1323 		sctp_t	*sctp;
1324 		sctp_faddr_t *fp;
1325 
1326 		sctp = CONN2SCTP(connp);
1327 		RUN_SCTP(sctp);
1328 		ixa_cleanup_stale(connp->conn_ixa);
1329 		for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next)
1330 			ixa_cleanup_stale(fp->sf_ixa);
1331 		WAKE_SCTP(sctp);
1332 	} else {
1333 		ip_xmit_attr_t	*ixa;
1334 
1335 		/*
1336 		 * If there is a different thread using conn_ixa then we get a
1337 		 * new copy and cut the old one loose from conn_ixa. Otherwise
1338 		 * we use conn_ixa and prevent any other thread from
1339 		 * using/changing it. Anybody using conn_ixa (e.g., a thread in
1340 		 * conn_ip_output) will do an ixa_refrele which will remove any
1341 		 * references on the ire etc.
1342 		 *
1343 		 * Once we are done other threads can use conn_ixa since the
1344 		 * refcnt will be back at one.
1345 		 *
1346 		 * We are called either because an ill is going away, or
1347 		 * due to memory reclaim. In the former case we wait for
1348 		 * memory since we must remove the refcnts on the ill.
1349 		 */
1350 		if (tryhard) {
1351 			ixa = conn_get_ixa_tryhard(connp, B_TRUE);
1352 			ASSERT(ixa != NULL);
1353 		} else {
1354 			ixa = conn_get_ixa(connp, B_TRUE);
1355 			if (ixa == NULL) {
1356 				/*
1357 				 * Somebody else was using it and kmem_alloc
1358 				 * failed! Next memory reclaim will try to
1359 				 * clean up.
1360 				 */
1361 				DTRACE_PROBE1(conn__ixa__cleanup__bail,
1362 				    conn_t *, connp);
1363 				return;
1364 			}
1365 		}
1366 		ixa_cleanup_stale(ixa);
1367 		IXA_REFRELE(ixa);
1368 	}
1369 }
1370 
1371 /*
1372  * ixa needs to be an exclusive copy so that no one changes the cookie
1373  * or the ixa_nce.
1374  */
1375 boolean_t
ixa_check_drain_insert(conn_t * connp,ip_xmit_attr_t * ixa)1376 ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
1377 {
1378 	uintptr_t cookie = ixa->ixa_cookie;
1379 	ill_dld_direct_t *idd;
1380 	idl_tx_list_t *idl_txl;
1381 	ill_t *ill = ixa->ixa_nce->nce_ill;
1382 	boolean_t inserted = B_FALSE;
1383 
1384 	idd = &(ill)->ill_dld_capab->idc_direct;
1385 	idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
1386 	mutex_enter(&idl_txl->txl_lock);
1387 
1388 	/*
1389 	 * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
1390 	 * control is asserted on an ill that does not support direct calls.
1391 	 * Jump to insert.
1392 	 */
1393 	if (cookie == 0)
1394 		goto tryinsert;
1395 
1396 	ASSERT(ILL_DIRECT_CAPABLE(ill));
1397 
1398 	if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
1399 		DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
1400 	} else if (idl_txl->txl_cookie != NULL &&
1401 	    idl_txl->txl_cookie != ixa->ixa_cookie) {
1402 		DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
1403 		    uintptr_t, idl_txl->txl_cookie);
1404 		/* TODO: bump kstat for cookie collision */
1405 	} else {
1406 		/*
1407 		 * Check/set conn_blocked under conn_lock.  Note that txl_lock
1408 		 * will not suffice since two separate UDP threads may be
1409 		 * racing to send to different destinations that are
1410 		 * associated with different cookies and thus may not be
1411 		 * holding the same txl_lock.  Further, since a given conn_t
1412 		 * can only be on a single drain list, the conn_t will be
1413 		 * enqueued on whichever thread wins this race.
1414 		 */
1415 tryinsert:	mutex_enter(&connp->conn_lock);
1416 		if (connp->conn_blocked) {
1417 			DTRACE_PROBE1(ill__tx__conn__already__blocked,
1418 			    conn_t *, connp);
1419 			mutex_exit(&connp->conn_lock);
1420 		} else {
1421 			connp->conn_blocked = B_TRUE;
1422 			mutex_exit(&connp->conn_lock);
1423 			idl_txl->txl_cookie = cookie;
1424 			conn_drain_insert(connp, idl_txl);
1425 			if (!IPCL_IS_NONSTR(connp))
1426 				noenable(connp->conn_wq);
1427 			inserted = B_TRUE;
1428 		}
1429 	}
1430 	mutex_exit(&idl_txl->txl_lock);
1431 	return (inserted);
1432 }
1433