1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25 /* Copyright (c) 1990 Mentat Inc. */
26
27 /*
28 * Copyright 2019 Joyent, Inc.
29 * Copyright 2024 Oxide Computer Company
30 */
31
32 #include <sys/types.h>
33 #include <sys/stream.h>
34 #include <sys/strsun.h>
35 #include <sys/zone.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/atomic.h>
41
42 #include <sys/systm.h>
43 #include <sys/param.h>
44 #include <sys/kmem.h>
45 #include <sys/sdt.h>
46 #include <sys/socket.h>
47 #include <sys/mac.h>
48 #include <net/if.h>
49 #include <net/if_arp.h>
50 #include <net/route.h>
51 #include <sys/sockio.h>
52 #include <netinet/in.h>
53 #include <net/if_dl.h>
54
55 #include <inet/common.h>
56 #include <inet/mi.h>
57 #include <inet/mib2.h>
58 #include <inet/nd.h>
59 #include <inet/arp.h>
60 #include <inet/snmpcom.h>
61 #include <inet/kstatcom.h>
62
63 #include <netinet/igmp_var.h>
64 #include <netinet/ip6.h>
65 #include <netinet/icmp6.h>
66 #include <netinet/sctp.h>
67
68 #include <inet/ip.h>
69 #include <inet/ip_impl.h>
70 #include <inet/ip6.h>
71 #include <inet/ip6_asp.h>
72 #include <inet/tcp.h>
73 #include <inet/ip_multi.h>
74 #include <inet/ip_if.h>
75 #include <inet/ip_ire.h>
76 #include <inet/ip_ftable.h>
77 #include <inet/ip_rts.h>
78 #include <inet/optcom.h>
79 #include <inet/ip_ndp.h>
80 #include <inet/ip_listutils.h>
81 #include <netinet/igmp.h>
82 #include <netinet/ip_mroute.h>
83 #include <inet/ipp_common.h>
84
85 #include <net/pfkeyv2.h>
86 #include <inet/sadb.h>
87 #include <inet/ipsec_impl.h>
88 #include <inet/ipdrop.h>
89 #include <inet/ip_netinfo.h>
90 #include <sys/squeue_impl.h>
91 #include <sys/squeue.h>
92
93 #include <inet/ipclassifier.h>
94 #include <inet/sctp_ip.h>
95 #include <inet/sctp/sctp_impl.h>
96 #include <inet/udp_impl.h>
97 #include <sys/sunddi.h>
98
99 #include <sys/tsol/label.h>
100 #include <sys/tsol/tnet.h>
101
/*
 * Release a reference on ip_xmit_attr.
 * The reference is acquired by conn_get_ixa()
 * When the count drops to zero, ixa_inactive() tears down and frees the
 * attribute structure.
 *
 * This macro has a lowercase function-call version for callers outside
 * this file.
 */
#define	IXA_REFRELE(ixa)					\
{								\
	if (atomic_dec_32_nv(&(ixa)->ixa_refcnt) == 0)		\
		ixa_inactive(ixa);				\
}

/*
 * Take an additional reference on an ip_xmit_attr. The caller must already
 * hold one: a zero refcnt here would mean the ixa is being (or has been)
 * destroyed, hence the assertion.
 */
#define	IXA_REFHOLD(ixa)					\
{								\
	ASSERT3U((ixa)->ixa_refcnt, !=, 0);			\
	atomic_inc_32(&(ixa)->ixa_refcnt);			\
}
120
/*
 * When we need to handle a transmit side asynchronous operation, then we need
 * to save sufficient information so that we can call the fragment and postfrag
 * functions. That information is captured in an mblk containing this structure.
 *
 * Since this is currently only used for IPsec, we include information for
 * the kernel crypto framework.
 *
 * The fields mirror their ixa_* counterparts in ip_xmit_attr_t; see
 * ip_xmit_attr_to_mblk() and ip_xmit_attr_from_mblk() for the conversions.
 */
typedef struct ixamblk_s {
	boolean_t	ixm_inbound;	/* B_FALSE */
	iaflags_t	ixm_flags;	/* ixa_flags */
	netstackid_t	ixm_stackid;	/* Verify it didn't go away */
	uint_t		ixm_ifindex;	/* Used to find the nce */
	in6_addr_t	ixm_nceaddr_v6;	/* Used to find nce */
#define	ixm_nceaddr_v4	V4_PART_OF_V6(ixm_nceaddr_v6)
	uint32_t	ixm_fragsize;
	uint_t		ixm_pktlen;
	uint16_t	ixm_ip_hdr_length; /* Points to ULP header */
	uint8_t		ixm_protocol;	/* Protocol number for ULP cksum */
	pfirepostfrag_t	ixm_postfragfn;

	zoneid_t	ixm_zoneid;		/* Needed for ipobs */
	zoneid_t	ixm_no_loop_zoneid;	/* IXAF_NO_LOOP_ZONEID_SET */

	uint_t		ixm_scopeid;		/* For IPv6 link-locals */

	uint32_t	ixm_ident;		/* For IPv6 fragment header */
	uint32_t	ixm_xmit_hint;

	uint64_t	ixm_conn_id;		/* Used by DTrace */
	cred_t		*ixm_cred;	/* For getpeerucred - refhold if set */
	pid_t		ixm_cpid;	/* For getpeerucred */

	ts_label_t	*ixm_tsl;	/* Refhold if set. */

	/*
	 * When the pointers below are set they have a refhold on the struct.
	 */
	ipsec_latch_t		*ixm_ipsec_latch;
	struct ipsa_s		*ixm_ipsec_ah_sa;	/* SA for AH */
	struct ipsa_s		*ixm_ipsec_esp_sa;	/* SA for ESP */
	struct ipsec_policy_s	*ixm_ipsec_policy;	/* why are we here? */
	struct ipsec_action_s	*ixm_ipsec_action; /* For reflected packets */

	ipsa_ref_t	ixm_ipsec_ref[2];	/* Soft reference to SA */

	/* Need these while waiting for SA */
	uint16_t ixm_ipsec_src_port;	/* Source port number of d-gram. */
	uint16_t ixm_ipsec_dst_port;	/* Destination port number of d-gram. */
	uint8_t  ixm_ipsec_icmp_type;	/* ICMP type of d-gram */
	uint8_t  ixm_ipsec_icmp_code;	/* ICMP code of d-gram */

	sa_family_t ixm_ipsec_inaf;	/* Inner address family */
	uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];	/* Inner src address */
	uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];	/* Inner dest address */
	uint8_t  ixm_ipsec_insrcpfx;	/* Inner source prefix */
	uint8_t  ixm_ipsec_indstpfx;	/* Inner destination prefix */

	uint8_t ixm_ipsec_proto;	/* IP protocol number for d-gram. */
} ixamblk_t;
181
182
/*
 * When we need to handle a receive side asynchronous operation, then we need
 * to save sufficient information so that we can call ip_fanout.
 * That information is captured in an mblk containing this structure.
 *
 * Since this is currently only used for IPsec, we include information for
 * the kernel crypto framework.
 *
 * The fields mirror their ira_* counterparts in ip_recv_attr_t; see
 * ip_recv_attr_to_mblk() and ip_recv_attr_from_mblk() for the conversions.
 */
typedef struct iramblk_s {
	boolean_t	irm_inbound;	/* B_TRUE */
	iaflags_t	irm_flags;	/* ira_flags */
	netstackid_t	irm_stackid;	/* Verify it didn't go away; -1 if none */
	uint_t		irm_ifindex;	/* To find ira_ill */

	uint_t		irm_rifindex;	/* ira_rifindex */
	uint_t		irm_ruifindex;	/* ira_ruifindex */
	uint_t		irm_pktlen;
	uint16_t	irm_ip_hdr_length; /* Points to ULP header */
	uint8_t		irm_protocol;	/* Protocol number for ULP cksum */
	uint8_t		irm_ttl;	/* IP TTL, IPv6 hop limit */
	zoneid_t	irm_zoneid;	/* ALL_ZONES unless local delivery */

	squeue_t	*irm_sqp;
	ill_rx_ring_t	*irm_ring;

	ipaddr_t	irm_mroute_tunnel;	/* IRAF_MROUTE_TUNNEL_SET */
	zoneid_t	irm_no_loop_zoneid;	/* IRAF_NO_LOOP_ZONEID_SET */
	uint32_t	irm_esp_udp_ports;	/* IRAF_ESP_UDP_PORTS */

	char		irm_l2src[IRA_L2SRC_SIZE];	/* If IRAF_L2SRC_SET */

	cred_t		*irm_cred;	/* For getpeerucred - refhold if set */
	pid_t		irm_cpid;	/* For getpeerucred */

	ts_label_t	*irm_tsl;	/* Refhold if set. */

	/*
	 * When set these correspond to a refhold on the object.
	 */
	struct ipsa_s		*irm_ipsec_ah_sa;	/* SA for AH */
	struct ipsa_s		*irm_ipsec_esp_sa;	/* SA for ESP */
	struct ipsec_action_s	*irm_ipsec_action; /* For reflected packets */
} iramblk_t;
226
227
/*
 * Take the information in ip_xmit_attr_t and stick it in an mblk
 * that can later be passed to ip_xmit_attr_from_mblk to recreate the
 * ip_xmit_attr_t.
 *
 * The nce itself is not held; instead we record the ifindex and address so
 * that ip_xmit_attr_from_mblk can look it up again (and detect if it has
 * disappeared in the meantime). Refholds are taken on the cred, label and
 * IPsec objects; they are released either by ip_xmit_attr_free_mblk() or
 * transferred back to an ixa by ip_xmit_attr_from_mblk().
 *
 * Returns NULL on memory allocation failure.
 */
mblk_t *
ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
{
	mblk_t		*ixamp;
	ixamblk_t	*ixm;
	nce_t		*nce = ixa->ixa_nce;

	ASSERT(nce != NULL);
	ixamp = allocb(sizeof (*ixm), BPRI_MED);
	if (ixamp == NULL)
		return (NULL);

	/* M_BREAK marks this mblk as carrying attributes, not packet data */
	ixamp->b_datap->db_type = M_BREAK;
	ixamp->b_wptr += sizeof (*ixm);
	ixm = (ixamblk_t *)ixamp->b_rptr;

	bzero(ixm, sizeof (*ixm));
	ixm->ixm_inbound = B_FALSE;
	ixm->ixm_flags = ixa->ixa_flags;
	ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
	ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
	ixm->ixm_nceaddr_v6 = nce->nce_addr;
	ixm->ixm_fragsize = ixa->ixa_fragsize;
	ixm->ixm_pktlen = ixa->ixa_pktlen;
	ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
	ixm->ixm_protocol = ixa->ixa_protocol;
	ixm->ixm_postfragfn = ixa->ixa_postfragfn;
	ixm->ixm_zoneid = ixa->ixa_zoneid;
	ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
	ixm->ixm_scopeid = ixa->ixa_scopeid;
	ixm->ixm_ident = ixa->ixa_ident;
	ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;

	if (ixa->ixa_tsl != NULL) {
		ixm->ixm_tsl = ixa->ixa_tsl;
		label_hold(ixm->ixm_tsl);
	}
	if (ixa->ixa_cred != NULL) {
		ixm->ixm_cred = ixa->ixa_cred;
		crhold(ixa->ixa_cred);
	}
	ixm->ixm_cpid = ixa->ixa_cpid;
	ixm->ixm_conn_id = ixa->ixa_conn_id;

	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
		/* Each IPsec object copied below gets its own refhold */
		if (ixa->ixa_ipsec_ah_sa != NULL) {
			ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
			IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
		}
		if (ixa->ixa_ipsec_esp_sa != NULL) {
			ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
			IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
		}
		if (ixa->ixa_ipsec_policy != NULL) {
			ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
			IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
		}
		if (ixa->ixa_ipsec_action != NULL) {
			ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
			IPACT_REFHOLD(ixa->ixa_ipsec_action);
		}
		if (ixa->ixa_ipsec_latch != NULL) {
			ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
			IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
		}
		ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
		ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
		ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
		ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
		ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
		ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
		ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
		ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
		ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
		ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
		ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
		ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
		ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
		ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
		ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
		ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
		ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
		ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
	}
	return (ixamp);
}
321
/*
 * Extract the ip_xmit_attr_t from the mblk, checking that the
 * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
 * not the case (in which case the mblk and all references it held have
 * been freed).
 *
 * Otherwise ixa is updated.
 * Caller needs to release references on the ixa by calling ixa_refrele()
 * which will immediately call ixa_inactive to release the references.
 */
boolean_t
ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
{
	ixamblk_t	*ixm;
	netstack_t	*ns;
	ip_stack_t	*ipst;
	ill_t		*ill;
	nce_t		*nce;

	/* We assume the caller hasn't initialized ixa */
	bzero(ixa, sizeof (*ixa));

	ASSERT(DB_TYPE(ixamp) == M_BREAK);
	ASSERT(ixamp->b_cont == NULL);

	ixm = (ixamblk_t *)ixamp->b_rptr;
	ASSERT(!ixm->ixm_inbound);

	/* Verify the netstack is still around */
	ns = netstack_find_by_stackid(ixm->ixm_stackid);
	if (ns == NULL) {
		/* Disappeared on us */
		(void) ip_xmit_attr_free_mblk(ixamp);
		return (B_FALSE);
	}
	ipst = ns->netstack_ip;

	/* Verify the ill is still around */
	ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
	    !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);

	/* We have the ill, hence the netstack can't go away */
	netstack_rele(ns);
	if (ill == NULL) {
		/* Disappeared on us */
		(void) ip_xmit_attr_free_mblk(ixamp);
		return (B_FALSE);
	}
	/*
	 * Find the nce. We don't load-spread (only lookup nce's on the ill)
	 * because we want to find the same nce as the one we had when
	 * ip_xmit_attr_to_mblk was called.
	 */
	if (ixm->ixm_flags & IXAF_IS_IPV4) {
		nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
	} else {
		nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
	}

	/* We have the nce, hence the ill can't go away */
	ill_refrele(ill);
	if (nce == NULL) {
		/*
		 * Since this is unusual and we don't know what type of
		 * nce it was, we drop the packet.
		 */
		(void) ip_xmit_attr_free_mblk(ixamp);
		return (B_FALSE);
	}

	ixa->ixa_flags = ixm->ixm_flags;
	ixa->ixa_refcnt = 1;		/* The caller's reference */
	ixa->ixa_ipst = ipst;
	ixa->ixa_fragsize = ixm->ixm_fragsize;
	ixa->ixa_pktlen =  ixm->ixm_pktlen;
	ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
	ixa->ixa_protocol = ixm->ixm_protocol;
	ixa->ixa_nce = nce;		/* Held by the lookup above */
	ixa->ixa_postfragfn = ixm->ixm_postfragfn;
	ixa->ixa_zoneid = ixm->ixm_zoneid;
	ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
	ixa->ixa_scopeid = ixm->ixm_scopeid;
	ixa->ixa_ident = ixm->ixm_ident;
	ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;

	/*
	 * Transfer (not copy) the label and cred references: the ixa's
	 * free flags now record that it owns them, and the ixm pointers are
	 * cleared so nothing is double-released.
	 */
	if (ixm->ixm_tsl != NULL) {
		ixa->ixa_tsl = ixm->ixm_tsl;
		ixa->ixa_free_flags |= IXA_FREE_TSL;
		ixm->ixm_tsl = NULL;
	}
	if (ixm->ixm_cred != NULL) {
		ixa->ixa_cred = ixm->ixm_cred;
		ixa->ixa_free_flags |= IXA_FREE_CRED;
		ixm->ixm_cred = NULL;
	}
	ixa->ixa_cpid = ixm->ixm_cpid;
	ixa->ixa_conn_id = ixm->ixm_conn_id;

	/* The IPsec refholds taken in ip_xmit_attr_to_mblk move to the ixa */
	ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
	ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
	ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
	ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
	ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;

	ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
	ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
	ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
	ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
	ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
	ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
	ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
	ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
	ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
	ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
	ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
	ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
	ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
	ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
	ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
	ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
	ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
	ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;

	/*
	 * All references were transferred above, so free the bare mblk
	 * rather than calling ip_xmit_attr_free_mblk().
	 */
	freeb(ixamp);
	return (B_TRUE);
}
447
448 /*
449 * Free the ixm mblk and any references it holds
450 * Returns b_cont.
451 */
452 mblk_t *
ip_xmit_attr_free_mblk(mblk_t * ixamp)453 ip_xmit_attr_free_mblk(mblk_t *ixamp)
454 {
455 ixamblk_t *ixm;
456 mblk_t *mp;
457
458 /* Consume mp */
459 ASSERT(DB_TYPE(ixamp) == M_BREAK);
460 mp = ixamp->b_cont;
461
462 ixm = (ixamblk_t *)ixamp->b_rptr;
463 ASSERT(!ixm->ixm_inbound);
464
465 if (ixm->ixm_ipsec_ah_sa != NULL) {
466 IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
467 ixm->ixm_ipsec_ah_sa = NULL;
468 }
469 if (ixm->ixm_ipsec_esp_sa != NULL) {
470 IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
471 ixm->ixm_ipsec_esp_sa = NULL;
472 }
473 if (ixm->ixm_ipsec_policy != NULL) {
474 IPPOL_REFRELE(ixm->ixm_ipsec_policy);
475 ixm->ixm_ipsec_policy = NULL;
476 }
477 if (ixm->ixm_ipsec_action != NULL) {
478 IPACT_REFRELE(ixm->ixm_ipsec_action);
479 ixm->ixm_ipsec_action = NULL;
480 }
481 if (ixm->ixm_ipsec_latch) {
482 IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
483 ixm->ixm_ipsec_latch = NULL;
484 }
485
486 if (ixm->ixm_tsl != NULL) {
487 label_rele(ixm->ixm_tsl);
488 ixm->ixm_tsl = NULL;
489 }
490 if (ixm->ixm_cred != NULL) {
491 crfree(ixm->ixm_cred);
492 ixm->ixm_cred = NULL;
493 }
494 freeb(ixamp);
495 return (mp);
496 }
497
/*
 * Take the information in ip_recv_attr_t and stick it in an mblk
 * that can later be passed to ip_recv_attr_from_mblk to recreate the
 * ip_recv_attr_t.
 *
 * If ira_ill is NULL (caller is an ULP rather than IP itself) the stackid
 * is recorded as -1 so that ip_recv_attr_from_mblk skips the
 * netstack/ill revalidation.
 *
 * Returns NULL on memory allocation failure.
 */
mblk_t *
ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
{
	mblk_t		*iramp;
	iramblk_t	*irm;
	ill_t		*ill = ira->ira_ill;

	ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);

	iramp = allocb(sizeof (*irm), BPRI_MED);
	if (iramp == NULL)
		return (NULL);

	/* M_BREAK marks this mblk as carrying attributes, not packet data */
	iramp->b_datap->db_type = M_BREAK;
	iramp->b_wptr += sizeof (*irm);
	irm = (iramblk_t *)iramp->b_rptr;

	bzero(irm, sizeof (*irm));
	irm->irm_inbound = B_TRUE;
	irm->irm_flags = ira->ira_flags;
	if (ill != NULL) {
		/* Internal to IP - preserve ip_stack_t, ill and rill */
		irm->irm_stackid =
		    ill->ill_ipst->ips_netstack->netstack_stackid;
		irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
		ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
		    ira->ira_rifindex);
	} else {
		/* Let ip_recv_attr_from_stackid know there isn't one */
		irm->irm_stackid = -1;
	}
	irm->irm_rifindex = ira->ira_rifindex;
	irm->irm_ruifindex = ira->ira_ruifindex;
	irm->irm_pktlen = ira->ira_pktlen;
	irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
	irm->irm_protocol = ira->ira_protocol;
	irm->irm_ttl = ira->ira_ttl;

	irm->irm_sqp = ira->ira_sqp;
	irm->irm_ring = ira->ira_ring;

	irm->irm_zoneid = ira->ira_zoneid;
	irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
	irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
	irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;

	if (ira->ira_tsl != NULL) {
		irm->irm_tsl = ira->ira_tsl;
		label_hold(irm->irm_tsl);
	}
	if (ira->ira_cred != NULL) {
		irm->irm_cred = ira->ira_cred;
		crhold(ira->ira_cred);
	}
	irm->irm_cpid = ira->ira_cpid;

	if (ira->ira_flags & IRAF_L2SRC_SET)
		bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);

	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
		/* Each IPsec object copied below gets its own refhold */
		if (ira->ira_ipsec_ah_sa != NULL) {
			irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
			IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
		}
		if (ira->ira_ipsec_esp_sa != NULL) {
			irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
			IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
		}
		if (ira->ira_ipsec_action != NULL) {
			irm->irm_ipsec_action = ira->ira_ipsec_action;
			IPACT_REFHOLD(ira->ira_ipsec_action);
		}
	}
	return (iramp);
}
580
/*
 * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
 * then irm_stackid is not -1, in which case we check that the
 * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
 * not the case (in which case the mblk and all references it held have
 * been freed).
 * If irm_stackid is -1 then we are used by an ULP (e.g., squeue_enter)
 * and we just proceed with ira_ill and ira_rill as NULL.
 *
 * The caller needs to release any references on the pointers inside the ira
 * by calling ira_cleanup.
 */
boolean_t
ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
{
	iramblk_t	*irm;
	netstack_t	*ns;
	ip_stack_t	*ipst = NULL;
	ill_t		*ill = NULL, *rill = NULL;

	/* We assume the caller hasn't initialized ira */
	bzero(ira, sizeof (*ira));

	ASSERT(DB_TYPE(iramp) == M_BREAK);
	ASSERT(iramp->b_cont == NULL);

	irm = (iramblk_t *)iramp->b_rptr;
	ASSERT(irm->irm_inbound);

	if (irm->irm_stackid != -1) {
		/* Verify the netstack is still around */
		ns = netstack_find_by_stackid(irm->irm_stackid);
		if (ns == NULL) {
			/* Disappeared on us */
			(void) ip_recv_attr_free_mblk(iramp);
			return (B_FALSE);
		}
		ipst = ns->netstack_ip;

		/* Verify the ill is still around */
		ill = ill_lookup_on_ifindex(irm->irm_ifindex,
		    !(irm->irm_flags & IRAF_IS_IPV4), ipst);

		/* The receiving ill may differ from the delivery ill */
		if (irm->irm_ifindex == irm->irm_rifindex) {
			rill = ill;
		} else {
			rill = ill_lookup_on_ifindex(irm->irm_rifindex,
			    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
		}

		/* We have the ill, hence the netstack can't go away */
		netstack_rele(ns);
		if (ill == NULL || rill == NULL) {
			/* Disappeared on us */
			if (ill != NULL)
				ill_refrele(ill);
			if (rill != NULL && rill != ill)
				ill_refrele(rill);
			(void) ip_recv_attr_free_mblk(iramp);
			return (B_FALSE);
		}
	}

	ira->ira_flags = irm->irm_flags;
	/* Caller must ill_refrele(ira_ill) by using ira_cleanup() */
	ira->ira_ill = ill;
	ira->ira_rill = rill;

	ira->ira_rifindex = irm->irm_rifindex;
	ira->ira_ruifindex = irm->irm_ruifindex;
	ira->ira_pktlen = irm->irm_pktlen;
	ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
	ira->ira_protocol = irm->irm_protocol;
	ira->ira_ttl = irm->irm_ttl;

	ira->ira_sqp = irm->irm_sqp;
	/* The rest of IP assumes that the rings never go away. */
	ira->ira_ring = irm->irm_ring;

	ira->ira_zoneid = irm->irm_zoneid;
	ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
	ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
	ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;

	/*
	 * Transfer (not copy) the label and cred references: the ira's
	 * free flags now record that it owns them, and the irm pointers are
	 * cleared so nothing is double-released.
	 */
	if (irm->irm_tsl != NULL) {
		ira->ira_tsl = irm->irm_tsl;
		ira->ira_free_flags |= IRA_FREE_TSL;
		irm->irm_tsl = NULL;
	}
	if (irm->irm_cred != NULL) {
		ira->ira_cred = irm->irm_cred;
		ira->ira_free_flags |= IRA_FREE_CRED;
		irm->irm_cred = NULL;
	}
	ira->ira_cpid = irm->irm_cpid;

	if (ira->ira_flags & IRAF_L2SRC_SET)
		bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);

	/* The IPsec refholds taken in ip_recv_attr_to_mblk move to the ira */
	ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
	ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
	ira->ira_ipsec_action = irm->irm_ipsec_action;

	/*
	 * All references were transferred above, so free the bare mblk
	 * rather than calling ip_recv_attr_free_mblk().
	 */
	freeb(iramp);
	return (B_TRUE);
}
686
/*
 * Free the irm mblk and any references it holds: the refholds taken by
 * ip_recv_attr_to_mblk() on the IPsec objects, label and cred are all
 * dropped here.
 *
 * Returns b_cont (the packet that was attached to the attribute mblk,
 * or NULL).
 */
mblk_t *
ip_recv_attr_free_mblk(mblk_t *iramp)
{
	iramblk_t	*irm;
	mblk_t		*mp;

	/* Consume mp */
	ASSERT(DB_TYPE(iramp) == M_BREAK);
	mp = iramp->b_cont;

	irm = (iramblk_t *)iramp->b_rptr;
	ASSERT(irm->irm_inbound);

	if (irm->irm_ipsec_ah_sa != NULL) {
		IPSA_REFRELE(irm->irm_ipsec_ah_sa);
		irm->irm_ipsec_ah_sa = NULL;
	}
	if (irm->irm_ipsec_esp_sa != NULL) {
		IPSA_REFRELE(irm->irm_ipsec_esp_sa);
		irm->irm_ipsec_esp_sa = NULL;
	}
	if (irm->irm_ipsec_action != NULL) {
		IPACT_REFRELE(irm->irm_ipsec_action);
		irm->irm_ipsec_action = NULL;
	}
	if (irm->irm_tsl != NULL) {
		label_rele(irm->irm_tsl);
		irm->irm_tsl = NULL;
	}
	if (irm->irm_cred != NULL) {
		crfree(irm->irm_cred);
		irm->irm_cred = NULL;
	}

	freeb(iramp);
	return (mp);
}
728
729 /*
730 * Returns true if the mblk contains an ip_recv_attr_t
731 * For now we just check db_type.
732 */
733 boolean_t
ip_recv_attr_is_mblk(mblk_t * mp)734 ip_recv_attr_is_mblk(mblk_t *mp)
735 {
736 /*
737 * Need to handle the various forms of tcp_timermp which are tagged
738 * with b_wptr and might have a NULL b_datap.
739 */
740 if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
741 return (B_FALSE);
742
743 #ifdef DEBUG
744 iramblk_t *irm;
745
746 if (DB_TYPE(mp) != M_BREAK)
747 return (B_FALSE);
748
749 irm = (iramblk_t *)mp->b_rptr;
750 ASSERT(irm->irm_inbound);
751 return (B_TRUE);
752 #else
753 return (DB_TYPE(mp) == M_BREAK);
754 #endif
755 }
756
/*
 * Common implementation for conn_get_ixa() and conn_get_ixa_tryhard().
 *
 * If the conn_t holds the only reference on conn_ixa we return conn_ixa
 * itself (with an extra reference for the caller). Otherwise we hand back
 * a "safe" copy (see ixa_safe_copy()), optionally installing it as the new
 * conn_ixa when 'replace' is set.
 *
 * kmflag is KM_NOSLEEP or KM_SLEEP for the potential allocation of the
 * copy. Returns NULL if that allocation fails.
 */
static ip_xmit_attr_t *
conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
{
	ip_xmit_attr_t	*oldixa;	/* Already attached to conn_t */
	ip_xmit_attr_t	*ixa;		/* New one, which we return. */

	/*
	 * NOTE: If the marked-below common case isn't, move the
	 * kmem_alloc() up here and put a free in what was marked as the
	 * (not really) common case instead.
	 */

	mutex_enter(&connp->conn_lock);
	oldixa = connp->conn_ixa;

	/* At least one reference for the conn_t */
	ASSERT3U(oldixa->ixa_refcnt, >=, 1);
	if (atomic_inc_32_nv(&oldixa->ixa_refcnt) == 2) {
		/* No other thread using conn_ixa (common case) */
		mutex_exit(&connp->conn_lock);
		return (oldixa);
	}
	/* Do allocation inside-the-conn_lock because it's less common. */
	ixa = kmem_alloc(sizeof (*ixa), kmflag);
	if (ixa == NULL) {
		mutex_exit(&connp->conn_lock);
		IXA_REFRELE(oldixa);	/* Undo above atomic_inc_32_nv */
		return (NULL);
	}
	ixa_safe_copy(oldixa, ixa);	/* Copy arrives with refcnt 1 */

	/* Make sure we drop conn_lock before any refrele */
	if (replace) {
		ixa->ixa_refcnt++; /* No atomic needed - not visible */
		connp->conn_ixa = ixa;
		mutex_exit(&connp->conn_lock);
		IXA_REFRELE(oldixa);	/* Undo refcnt from conn_t */
	} else {
		mutex_exit(&connp->conn_lock);
	}
	IXA_REFRELE(oldixa);	/* Undo above atomic_inc_32_nv */

	return (ixa);
}
801
/*
 * Return an ip_xmit_attr_t to use with a conn_t that ensures that only
 * the caller can access the ip_xmit_attr_t.
 *
 * If nobody else is using conn_ixa we return it.
 * Otherwise we make a "safe" copy of conn_ixa
 * and return it. The "safe" copy has the pointers set to NULL
 * (since the pointers might be changed by another thread using
 * conn_ixa). The caller needs to check for NULL pointers to see
 * if ip_set_destination needs to be called to re-establish the pointers.
 *
 * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
 * That is used when we connect() the ULP.
 *
 * Uses KM_NOSLEEP allocation, hence may return NULL.
 */
ip_xmit_attr_t *
conn_get_ixa(conn_t *connp, boolean_t replace)
{
	return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
}
821
/*
 * Same contract as conn_get_ixa(), but uses a KM_SLEEP allocation.
 * Used only when the option is to have the kernel hang due to not
 * cleaning up ixa references on ills etc.
 */
ip_xmit_attr_t *
conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
{
	return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
}
831
832 /*
833 * Replace conn_ixa with the ixa argument.
834 *
835 * The caller must hold conn_lock.
836 *
837 * We return the old ixa; the caller must ixa_refrele that after conn_lock
838 * has been dropped.
839 */
840 ip_xmit_attr_t *
conn_replace_ixa(conn_t * connp,ip_xmit_attr_t * ixa)841 conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
842 {
843 ip_xmit_attr_t *oldixa;
844
845 ASSERT(MUTEX_HELD(&connp->conn_lock));
846
847 oldixa = connp->conn_ixa;
848 IXA_REFHOLD(ixa);
849 ixa->ixa_conn_id = oldixa->ixa_conn_id;
850 connp->conn_ixa = ixa;
851 return (oldixa);
852 }
853
/*
 * Return a ip_xmit_attr_t to use with a conn_t that is based on but
 * separate from conn_ixa.
 *
 * This "safe" copy has the pointers set to NULL
 * (since the pointers might be changed by another thread using
 * conn_ixa). The caller needs to check for NULL pointers to see
 * if ip_set_destination needs to be called to re-establish the pointers.
 *
 * Unlike conn_get_ixa() this never returns conn_ixa itself, and the
 * allocation is done before taking conn_lock. Returns NULL on allocation
 * failure.
 */
ip_xmit_attr_t *
conn_get_ixa_exclusive(conn_t *connp)
{
	ip_xmit_attr_t *oldixa;
	ip_xmit_attr_t *ixa;

	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP_LAZY);
	if (ixa == NULL)
		return (NULL);

	mutex_enter(&connp->conn_lock);

	/* Hold conn_ixa so it stays valid while we copy from it */
	oldixa = connp->conn_ixa;
	IXA_REFHOLD(oldixa);

	ixa_safe_copy(oldixa, ixa);
	mutex_exit(&connp->conn_lock);
	IXA_REFRELE(oldixa);
	return (ixa);
}
883
/*
 * Copy src into ixa yielding a private copy that is safe to use while
 * other threads may be modifying src: the cached ire/nce/dce pointers and
 * all IPsec state are cleared rather than copied (the generations are set
 * so the copy will re-verify), and extra refholds are taken on the shared
 * label and cred. The copy starts with a refcnt of 1 for the caller.
 */
void
ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
{
	bcopy(src, ixa, sizeof (*ixa));
	ixa->ixa_refcnt = 1;
	/*
	 * Clear any pointers that have references and might be changed
	 * by ip_set_destination or the ULP
	 */
	ixa->ixa_ire = NULL;
	ixa->ixa_nce = NULL;
	ixa->ixa_dce = NULL;
	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
#ifdef DEBUG
	ixa->ixa_curthread = NULL;
#endif
	/* Clear all the IPsec pointers and the flag as well. */
	ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;

	ixa->ixa_ipsec_latch = NULL;
	ixa->ixa_ipsec_ah_sa = NULL;
	ixa->ixa_ipsec_esp_sa = NULL;
	ixa->ixa_ipsec_policy = NULL;
	ixa->ixa_ipsec_action = NULL;

	/*
	 * We leave ixa_tsl unchanged, but if it has a refhold we need
	 * to get an extra refhold.
	 */
	if (ixa->ixa_free_flags & IXA_FREE_TSL)
		label_hold(ixa->ixa_tsl);

	/*
	 * We leave ixa_cred unchanged, but if it has a refhold we need
	 * to get an extra refhold.
	 */
	if (ixa->ixa_free_flags & IXA_FREE_CRED)
		crhold(ixa->ixa_cred);

	/*
	 * There is no cleanup in progress on this new copy.
	 */
	ixa->ixa_tcpcleanup = IXATC_IDLE;
}
929
/*
 * Duplicate an ip_xmit_attr_t.
 * Assumes that the caller controls the ixa, hence we do not need to use
 * a safe copy. We just have to increase the refcnt on any pointers.
 *
 * Returns NULL on (KM_NOSLEEP) allocation failure. The duplicate starts
 * with a refcnt of 1 for the caller.
 */
ip_xmit_attr_t *
ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
{
	ip_xmit_attr_t *ixa;

	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
	if (ixa == NULL)
		return (NULL);
	bcopy(src_ixa, ixa, sizeof (*ixa));
	ixa->ixa_refcnt = 1;

	/* Unlike ixa_safe_copy() the cached pointers are kept, with holds */
	if (ixa->ixa_ire != NULL)
		ire_refhold_notr(ixa->ixa_ire);
	if (ixa->ixa_nce != NULL)
		nce_refhold(ixa->ixa_nce);
	if (ixa->ixa_dce != NULL)
		dce_refhold_notr(ixa->ixa_dce);

#ifdef DEBUG
	ixa->ixa_curthread = NULL;
#endif

	if (ixa->ixa_ipsec_latch != NULL)
		IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
	if (ixa->ixa_ipsec_ah_sa != NULL)
		IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
	if (ixa->ixa_ipsec_esp_sa != NULL)
		IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
	if (ixa->ixa_ipsec_policy != NULL)
		IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
	if (ixa->ixa_ipsec_action != NULL)
		IPACT_REFHOLD(ixa->ixa_ipsec_action);

	/* The duplicate owns its own label/cred references */
	if (ixa->ixa_tsl != NULL) {
		label_hold(ixa->ixa_tsl);
		ixa->ixa_free_flags |= IXA_FREE_TSL;
	}
	if (ixa->ixa_cred != NULL) {
		crhold(ixa->ixa_cred);
		ixa->ixa_free_flags |= IXA_FREE_CRED;
	}
	return (ixa);
}
978
979 /*
980 * Used to replace the ixa_label field.
981 * The caller should have a reference on the label, which we transfer to
982 * the attributes so that when the attribute is freed/cleaned up
983 * we will release that reference.
984 */
985 void
ip_xmit_attr_replace_tsl(ip_xmit_attr_t * ixa,ts_label_t * tsl)986 ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
987 {
988 ASSERT(tsl != NULL);
989
990 if (ixa->ixa_free_flags & IXA_FREE_TSL) {
991 ASSERT(ixa->ixa_tsl != NULL);
992 label_rele(ixa->ixa_tsl);
993 } else {
994 ixa->ixa_free_flags |= IXA_FREE_TSL;
995 }
996 ixa->ixa_tsl = tsl;
997 }
998
/*
 * Replace the ip_recv_attr_t's label.
 * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
 * TCP/UDP uses ira_cred to set db_credp for non-socket users.
 * This can fail (and return B_FALSE) due to lack of memory.
 *
 * Note that on failure the new label has already been installed in
 * ira_tsl; only the ira_cred update is skipped.
 */
boolean_t
ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
{
	cred_t	*newcr;

	if (ira->ira_free_flags & IRA_FREE_TSL) {
		ASSERT(ira->ira_tsl != NULL);
		label_rele(ira->ira_tsl);
	}
	label_hold(tsl);
	ira->ira_tsl = tsl;
	ira->ira_free_flags |= IRA_FREE_TSL;

	/*
	 * Reset zoneid if we have a shared address. That allows
	 * ip_fanout_tx_v4/v6 to determine the zoneid again.
	 */
	if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
		ira->ira_zoneid = ALL_ZONES;

	/* We update ira_cred for RPC */
	newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
	if (newcr == NULL)
		return (B_FALSE);
	if (ira->ira_free_flags & IRA_FREE_CRED)
		crfree(ira->ira_cred);
	ira->ira_cred = newcr;
	ira->ira_free_flags |= IRA_FREE_CRED;
	return (B_TRUE);
}
1035
1036 /*
1037 * This needs to be called after ip_set_destination/tsol_check_dest might
1038 * have changed ixa_tsl to be specific for a destination, and we now want to
1039 * send to a different destination.
1040 * We have to restart with crgetlabel() since ip_set_destination/
1041 * tsol_check_dest will start with ixa_tsl.
1042 */
1043 void
ip_xmit_attr_restore_tsl(ip_xmit_attr_t * ixa,cred_t * cr)1044 ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
1045 {
1046 if (!is_system_labeled())
1047 return;
1048
1049 if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1050 ASSERT(ixa->ixa_tsl != NULL);
1051 label_rele(ixa->ixa_tsl);
1052 ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1053 }
1054 ixa->ixa_tsl = crgetlabel(cr);
1055 }
1056
1057 void
ixa_refrele(ip_xmit_attr_t * ixa)1058 ixa_refrele(ip_xmit_attr_t *ixa)
1059 {
1060 IXA_REFRELE(ixa);
1061 }
1062
1063 void
ixa_inactive(ip_xmit_attr_t * ixa)1064 ixa_inactive(ip_xmit_attr_t *ixa)
1065 {
1066 ASSERT(ixa->ixa_refcnt == 0);
1067
1068 ixa_cleanup(ixa);
1069 kmem_free(ixa, sizeof (*ixa));
1070 }
1071
1072 /*
1073 * Release any references contained in the ixa.
1074 * Also clear any fields that are not controlled by ixa_flags.
1075 */
1076 void
ixa_cleanup(ip_xmit_attr_t * ixa)1077 ixa_cleanup(ip_xmit_attr_t *ixa)
1078 {
1079 if (ixa->ixa_ire != NULL) {
1080 ire_refrele_notr(ixa->ixa_ire);
1081 ixa->ixa_ire = NULL;
1082 }
1083 if (ixa->ixa_dce != NULL) {
1084 dce_refrele_notr(ixa->ixa_dce);
1085 ixa->ixa_dce = NULL;
1086 }
1087 if (ixa->ixa_nce != NULL) {
1088 nce_refrele(ixa->ixa_nce);
1089 ixa->ixa_nce = NULL;
1090 }
1091 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1092 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1093 if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
1094 ipsec_out_release_refs(ixa);
1095 }
1096 if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1097 ASSERT(ixa->ixa_tsl != NULL);
1098 label_rele(ixa->ixa_tsl);
1099 ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1100 }
1101 ixa->ixa_tsl = NULL;
1102 if (ixa->ixa_free_flags & IXA_FREE_CRED) {
1103 ASSERT(ixa->ixa_cred != NULL);
1104 crfree(ixa->ixa_cred);
1105 ixa->ixa_free_flags &= ~IXA_FREE_CRED;
1106 }
1107 ixa->ixa_cred = NULL;
1108 ixa->ixa_src_preferences = 0;
1109 ixa->ixa_ifindex = 0;
1110 ixa->ixa_multicast_ifindex = 0;
1111 ixa->ixa_multicast_ifaddr = INADDR_ANY;
1112 }
1113
1114 /*
1115 * Release any references contained in the ira.
1116 * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
1117 * argument.
1118 */
1119 void
ira_cleanup(ip_recv_attr_t * ira,boolean_t refrele_ill)1120 ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
1121 {
1122 if (ira->ira_ill != NULL) {
1123 if (ira->ira_rill != ira->ira_ill) {
1124 /* Caused by async processing */
1125 ill_refrele(ira->ira_rill);
1126 }
1127 if (refrele_ill)
1128 ill_refrele(ira->ira_ill);
1129 }
1130 if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1131 ipsec_in_release_refs(ira);
1132 }
1133 if (ira->ira_free_flags & IRA_FREE_TSL) {
1134 ASSERT(ira->ira_tsl != NULL);
1135 label_rele(ira->ira_tsl);
1136 ira->ira_free_flags &= ~IRA_FREE_TSL;
1137 }
1138 ira->ira_tsl = NULL;
1139 if (ira->ira_free_flags & IRA_FREE_CRED) {
1140 ASSERT(ira->ira_cred != NULL);
1141 crfree(ira->ira_cred);
1142 ira->ira_free_flags &= ~IRA_FREE_CRED;
1143 }
1144 ira->ira_cred = NULL;
1145 }
1146
1147 /*
1148 * Function to help release any IRE, NCE, or DCEs that
1149 * have been deleted and are marked as condemned.
1150 * The caller is responsible for any serialization which is different
1151 * for TCP, SCTP, and others.
1152 */
1153 static void
ixa_cleanup_stale(ip_xmit_attr_t * ixa)1154 ixa_cleanup_stale(ip_xmit_attr_t *ixa)
1155 {
1156 ire_t *ire;
1157 nce_t *nce;
1158 dce_t *dce;
1159
1160 ire = ixa->ixa_ire;
1161 nce = ixa->ixa_nce;
1162 dce = ixa->ixa_dce;
1163
1164 if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
1165 ire_refrele_notr(ire);
1166 ire = ire_blackhole(ixa->ixa_ipst,
1167 !(ixa->ixa_flags & IXAF_IS_IPV4));
1168 ASSERT(ire != NULL);
1169 #ifdef DEBUG
1170 ire_refhold_notr(ire);
1171 ire_refrele(ire);
1172 #endif
1173 ixa->ixa_ire = ire;
1174 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1175 }
1176 if (nce != NULL && nce->nce_is_condemned) {
1177 /* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
1178 nce_refrele(nce);
1179 ixa->ixa_nce = NULL;
1180 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1181 }
1182 if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
1183 dce_refrele_notr(dce);
1184 dce = dce_get_default(ixa->ixa_ipst);
1185 ASSERT(dce != NULL);
1186 #ifdef DEBUG
1187 dce_refhold_notr(dce);
1188 dce_refrele(dce);
1189 #endif
1190 ixa->ixa_dce = dce;
1191 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1192 }
1193 }
1194
1195 static mblk_t *
tcp_ixa_cleanup_getmblk(conn_t * connp)1196 tcp_ixa_cleanup_getmblk(conn_t *connp)
1197 {
1198 tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1199 int need_retry;
1200 mblk_t *mp;
1201
1202 mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1203
1204 /*
1205 * It's possible that someone else came in and started cleaning up
1206 * another connection between the time we verified this one is not being
1207 * cleaned up and the time we actually get the shared mblk. If that's
1208 * the case, we've dropped the lock, and some other thread may have
1209 * cleaned up this connection again, and is still waiting for
1210 * notification of that cleanup's completion. Therefore we need to
1211 * recheck.
1212 */
1213 do {
1214 need_retry = 0;
1215 while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
1216 cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1217 &tcps->tcps_ixa_cleanup_lock);
1218 }
1219
1220 while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
1221 /*
1222 * Multiple concurrent cleanups; need to have the last
1223 * one run since it could be an unplumb.
1224 */
1225 need_retry = 1;
1226 cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
1227 &tcps->tcps_ixa_cleanup_lock);
1228 }
1229 } while (need_retry);
1230
1231 /*
1232 * We now have the lock and the mblk; now make sure that no one else can
1233 * try to clean up this connection or enqueue it for cleanup, clear the
1234 * mblk pointer for this stack, drop the lock, and return the mblk.
1235 */
1236 ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
1237 ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
1238 ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
1239 ASSERT(mp != NULL);
1240
1241 connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
1242 tcps->tcps_ixa_cleanup_mp = NULL;
1243 mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1244
1245 return (mp);
1246 }
1247
1248 /*
1249 * Used to run ixa_cleanup_stale inside the tcp squeue.
1250 * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
1251 * and waking up the caller.
1252 */
1253 /* ARGSUSED2 */
1254 static void
tcp_ixa_cleanup(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * dummy)1255 tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
1256 ip_recv_attr_t *dummy)
1257 {
1258 conn_t *connp = (conn_t *)arg;
1259 tcp_stack_t *tcps;
1260
1261 tcps = connp->conn_netstack->netstack_tcp;
1262
1263 ixa_cleanup_stale(connp->conn_ixa);
1264
1265 mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1266 ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
1267 connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
1268 tcps->tcps_ixa_cleanup_mp = mp;
1269 cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
1270 /*
1271 * It is possible for any number of threads to be waiting for cleanup of
1272 * different connections. Absent a per-connection (or per-IXA) CV, we
1273 * need to wake them all up even though only one can be waiting on this
1274 * particular cleanup.
1275 */
1276 cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1277 mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1278 }
1279
1280 static void
tcp_ixa_cleanup_wait_and_finish(conn_t * connp)1281 tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
1282 {
1283 tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1284
1285 mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1286
1287 ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);
1288
1289 while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
1290 cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1291 &tcps->tcps_ixa_cleanup_lock);
1292 }
1293
1294 ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
1295 connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
1296 cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1297
1298 mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1299 }
1300
1301 /*
1302 * ipcl_walk() function to help release any IRE, NCE, or DCEs that
1303 * have been deleted and are marked as condemned.
1304 * Note that we can't cleanup the pointers since there can be threads
1305 * in conn_ip_output() sending while we are called.
1306 */
1307 void
conn_ixa_cleanup(conn_t * connp,void * arg)1308 conn_ixa_cleanup(conn_t *connp, void *arg)
1309 {
1310 boolean_t tryhard = (boolean_t)arg;
1311
1312 if (IPCL_IS_TCP(connp)) {
1313 mblk_t *mp;
1314
1315 mp = tcp_ixa_cleanup_getmblk(connp);
1316
1317 if (connp->conn_sqp->sq_run == curthread) {
1318 /* Already on squeue */
1319 tcp_ixa_cleanup(connp, mp, NULL, NULL);
1320 } else {
1321 CONN_INC_REF(connp);
1322 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
1323 connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
1324 }
1325 tcp_ixa_cleanup_wait_and_finish(connp);
1326 } else if (IPCL_IS_SCTP(connp)) {
1327 sctp_t *sctp;
1328 sctp_faddr_t *fp;
1329
1330 sctp = CONN2SCTP(connp);
1331 RUN_SCTP(sctp);
1332 ixa_cleanup_stale(connp->conn_ixa);
1333 for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next)
1334 ixa_cleanup_stale(fp->sf_ixa);
1335 WAKE_SCTP(sctp);
1336 } else {
1337 ip_xmit_attr_t *ixa;
1338
1339 /*
1340 * If there is a different thread using conn_ixa then we get a
1341 * new copy and cut the old one loose from conn_ixa. Otherwise
1342 * we use conn_ixa and prevent any other thread from
1343 * using/changing it. Anybody using conn_ixa (e.g., a thread in
1344 * conn_ip_output) will do an ixa_refrele which will remove any
1345 * references on the ire etc.
1346 *
1347 * Once we are done other threads can use conn_ixa since the
1348 * refcnt will be back at one.
1349 *
1350 * We are called either because an ill is going away, or
1351 * due to memory reclaim. In the former case we wait for
1352 * memory since we must remove the refcnts on the ill.
1353 */
1354 if (tryhard) {
1355 ixa = conn_get_ixa_tryhard(connp, B_TRUE);
1356 ASSERT(ixa != NULL);
1357 } else {
1358 ixa = conn_get_ixa(connp, B_TRUE);
1359 if (ixa == NULL) {
1360 /*
1361 * Somebody else was using it and kmem_alloc
1362 * failed! Next memory reclaim will try to
1363 * clean up.
1364 */
1365 DTRACE_PROBE1(conn__ixa__cleanup__bail,
1366 conn_t *, connp);
1367 return;
1368 }
1369 }
1370 ixa_cleanup_stale(ixa);
1371 IXA_REFRELE(ixa);
1372 }
1373 }
1374
1375 /*
1376 * ixa needs to be an exclusive copy so that no one changes the cookie
1377 * or the ixa_nce.
1378 */
1379 boolean_t
ixa_check_drain_insert(conn_t * connp,ip_xmit_attr_t * ixa)1380 ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
1381 {
1382 uintptr_t cookie = ixa->ixa_cookie;
1383 ill_dld_direct_t *idd;
1384 idl_tx_list_t *idl_txl;
1385 ill_t *ill = ixa->ixa_nce->nce_ill;
1386 boolean_t inserted = B_FALSE;
1387
1388 idd = &(ill)->ill_dld_capab->idc_direct;
1389 idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
1390 mutex_enter(&idl_txl->txl_lock);
1391
1392 /*
1393 * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
1394 * control is asserted on an ill that does not support direct calls.
1395 * Jump to insert.
1396 */
1397 if (cookie == 0)
1398 goto tryinsert;
1399
1400 ASSERT(ILL_DIRECT_CAPABLE(ill));
1401
1402 if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
1403 DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
1404 } else if (idl_txl->txl_cookie != (uintptr_t)NULL &&
1405 idl_txl->txl_cookie != ixa->ixa_cookie) {
1406 DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
1407 uintptr_t, idl_txl->txl_cookie);
1408 /* TODO: bump kstat for cookie collision */
1409 } else {
1410 /*
1411 * Check/set conn_blocked under conn_lock. Note that txl_lock
1412 * will not suffice since two separate UDP threads may be
1413 * racing to send to different destinations that are
1414 * associated with different cookies and thus may not be
1415 * holding the same txl_lock. Further, since a given conn_t
1416 * can only be on a single drain list, the conn_t will be
1417 * enqueued on whichever thread wins this race.
1418 */
1419 tryinsert: mutex_enter(&connp->conn_lock);
1420 if (connp->conn_blocked) {
1421 DTRACE_PROBE1(ill__tx__conn__already__blocked,
1422 conn_t *, connp);
1423 mutex_exit(&connp->conn_lock);
1424 } else {
1425 connp->conn_blocked = B_TRUE;
1426 mutex_exit(&connp->conn_lock);
1427 idl_txl->txl_cookie = cookie;
1428 conn_drain_insert(connp, idl_txl);
1429 if (!IPCL_IS_NONSTR(connp))
1430 noenable(connp->conn_wq);
1431 inserted = B_TRUE;
1432 }
1433 }
1434 mutex_exit(&idl_txl->txl_lock);
1435 return (inserted);
1436 }
1437