/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains consumer routines of the IPv4 forwarding engine
 */
28c793af95Ssangeeta
29c793af95Ssangeeta #include <sys/types.h>
30c793af95Ssangeeta #include <sys/stream.h>
31c793af95Ssangeeta #include <sys/stropts.h>
32c793af95Ssangeeta #include <sys/strlog.h>
33c793af95Ssangeeta #include <sys/dlpi.h>
34c793af95Ssangeeta #include <sys/ddi.h>
35c793af95Ssangeeta #include <sys/cmn_err.h>
36c793af95Ssangeeta #include <sys/policy.h>
37c793af95Ssangeeta
38c793af95Ssangeeta #include <sys/systm.h>
39c793af95Ssangeeta #include <sys/strsun.h>
40c793af95Ssangeeta #include <sys/kmem.h>
41c793af95Ssangeeta #include <sys/param.h>
42c793af95Ssangeeta #include <sys/socket.h>
43edd26dc5Sdr146992 #include <sys/strsubr.h>
44c793af95Ssangeeta #include <net/if.h>
45c793af95Ssangeeta #include <net/route.h>
46c793af95Ssangeeta #include <netinet/in.h>
47c793af95Ssangeeta #include <net/if_dl.h>
48c793af95Ssangeeta #include <netinet/ip6.h>
49c793af95Ssangeeta #include <netinet/icmp6.h>
50c793af95Ssangeeta
51bd670b35SErik Nordmark #include <inet/ipsec_impl.h>
52c793af95Ssangeeta #include <inet/common.h>
53c793af95Ssangeeta #include <inet/mi.h>
54c793af95Ssangeeta #include <inet/mib2.h>
55c793af95Ssangeeta #include <inet/ip.h>
56edd26dc5Sdr146992 #include <inet/ip_impl.h>
57c793af95Ssangeeta #include <inet/ip6.h>
58c793af95Ssangeeta #include <inet/ip_ndp.h>
59c793af95Ssangeeta #include <inet/arp.h>
60c793af95Ssangeeta #include <inet/ip_if.h>
61c793af95Ssangeeta #include <inet/ip_ire.h>
62c793af95Ssangeeta #include <inet/ip_ftable.h>
63c793af95Ssangeeta #include <inet/ip_rts.h>
64c793af95Ssangeeta #include <inet/nd.h>
65c793af95Ssangeeta
66c793af95Ssangeeta #include <net/pfkeyv2.h>
67c793af95Ssangeeta #include <inet/sadb.h>
68c793af95Ssangeeta #include <inet/tcp.h>
69c793af95Ssangeeta #include <inet/ipclassifier.h>
70c793af95Ssangeeta #include <sys/zone.h>
71c793af95Ssangeeta #include <net/radix.h>
72c793af95Ssangeeta #include <sys/tsol/label.h>
73c793af95Ssangeeta #include <sys/tsol/tnet.h>
74c793af95Ssangeeta
/*
 * True when ire is an IRE_DEFAULT entry, or an IRE_INTERFACE entry whose
 * destination address is 0.
 * Note: the argument is evaluated more than once.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	(((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Pick the strict source multihoming setting out of the IP stack instance:
 * the IPv6 field when isv6 is non-zero, the IPv4 field otherwise.
 * Both arguments are parenthesized so that any expression (for example
 * "&stack") can be passed safely.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)	\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming : \
	(ipst)->ips_ip_strict_src_multihoming)
8244b099c4SSowmini Varadhan
83f4b3ec61Sdh155122 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
84c793af95Ssangeeta static void ire_del_host_redir(ire_t *, char *);
85c793af95Ssangeeta static boolean_t ire_find_best_route(struct radix_node *, void *);
86c793af95Ssangeeta
87c793af95Ssangeeta /*
88c793af95Ssangeeta * Lookup a route in forwarding table. A specific lookup is indicated by
89c793af95Ssangeeta * passing the required parameters and indicating the match required in the
90c793af95Ssangeeta * flag field.
91c793af95Ssangeeta *
92c793af95Ssangeeta * Supports IP_BOUND_IF by following the ipif/ill when recursing.
93c793af95Ssangeeta */
94c793af95Ssangeeta ire_t *
ire_ftable_lookup_v4(ipaddr_t addr,ipaddr_t mask,ipaddr_t gateway,int type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,int flags,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)95bd670b35SErik Nordmark ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
96bd670b35SErik Nordmark int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
97bd670b35SErik Nordmark int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
98c793af95Ssangeeta {
99bd670b35SErik Nordmark ire_t *ire;
100c793af95Ssangeeta struct rt_sockaddr rdst, rmask;
101c793af95Ssangeeta struct rt_entry *rt;
102c793af95Ssangeeta ire_ftable_args_t margs;
103c793af95Ssangeeta
104bd670b35SErik Nordmark ASSERT(ill == NULL || !ill->ill_isv6);
105c793af95Ssangeeta
106c793af95Ssangeeta /*
107bd670b35SErik Nordmark * ire_match_args() will dereference ill if MATCH_IRE_ILL
108bd670b35SErik Nordmark * is set.
109c793af95Ssangeeta */
11044b099c4SSowmini Varadhan if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
111c793af95Ssangeeta return (NULL);
112c793af95Ssangeeta
113188e1664SErik Nordmark bzero(&rdst, sizeof (rdst));
114c793af95Ssangeeta rdst.rt_sin_len = sizeof (rdst);
115c793af95Ssangeeta rdst.rt_sin_family = AF_INET;
116c793af95Ssangeeta rdst.rt_sin_addr.s_addr = addr;
117c793af95Ssangeeta
118188e1664SErik Nordmark bzero(&rmask, sizeof (rmask));
119c793af95Ssangeeta rmask.rt_sin_len = sizeof (rmask);
120c793af95Ssangeeta rmask.rt_sin_family = AF_INET;
121c793af95Ssangeeta rmask.rt_sin_addr.s_addr = mask;
122c793af95Ssangeeta
123188e1664SErik Nordmark bzero(&margs, sizeof (margs));
124c793af95Ssangeeta margs.ift_addr = addr;
125c793af95Ssangeeta margs.ift_mask = mask;
126c793af95Ssangeeta margs.ift_gateway = gateway;
127c793af95Ssangeeta margs.ift_type = type;
128bd670b35SErik Nordmark margs.ift_ill = ill;
129c793af95Ssangeeta margs.ift_zoneid = zoneid;
130c793af95Ssangeeta margs.ift_tsl = tsl;
131c793af95Ssangeeta margs.ift_flags = flags;
132c793af95Ssangeeta
133c793af95Ssangeeta /*
134c793af95Ssangeeta * The flags argument passed to ire_ftable_lookup may cause the
135c793af95Ssangeeta * search to return, not the longest matching prefix, but the
136c793af95Ssangeeta * "best matching prefix", i.e., the longest prefix that also
137c793af95Ssangeeta * satisfies constraints imposed via the permutation of flags
138c793af95Ssangeeta * passed in. To achieve this, we invoke ire_match_args() on
139c793af95Ssangeeta * each matching leaf in the radix tree. ire_match_args is
140c793af95Ssangeeta * invoked by the callback function ire_find_best_route()
141c793af95Ssangeeta * We hold the global tree lock in read mode when calling
142c793af95Ssangeeta * rn_match_args. Before dropping the global tree lock, ensure
143c793af95Ssangeeta * that the radix node can't be deleted by incrementing ire_refcnt.
144c793af95Ssangeeta */
145f4b3ec61Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
146f4b3ec61Sdh155122 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
147f4b3ec61Sdh155122 ipst->ips_ip_ftable, ire_find_best_route, &margs);
148c793af95Ssangeeta ire = margs.ift_best_ire;
149c793af95Ssangeeta if (rt == NULL) {
150bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
151c793af95Ssangeeta return (NULL);
152c793af95Ssangeeta }
153bd670b35SErik Nordmark ASSERT(ire != NULL);
154c793af95Ssangeeta
155c793af95Ssangeeta DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
156c793af95Ssangeeta
157c793af95Ssangeeta /*
158c793af95Ssangeeta * round-robin only if we have more than one route in the bucket.
159bd670b35SErik Nordmark * ips_ip_ecmp_behavior controls when we do ECMP
160bd670b35SErik Nordmark * 2: always
161bd670b35SErik Nordmark * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
162bd670b35SErik Nordmark * 0: never
163c793af95Ssangeeta */
164bd670b35SErik Nordmark if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
165bd670b35SErik Nordmark if (ipst->ips_ip_ecmp_behavior == 2 ||
166bd670b35SErik Nordmark (ipst->ips_ip_ecmp_behavior == 1 &&
167bd670b35SErik Nordmark IS_DEFAULT_ROUTE(ire))) {
168c793af95Ssangeeta ire_t *next_ire;
169c793af95Ssangeeta
170bd670b35SErik Nordmark margs.ift_best_ire = NULL;
171bd670b35SErik Nordmark next_ire = ire_round_robin(ire->ire_bucket, &margs,
172bd670b35SErik Nordmark xmit_hint, ire, ipst);
173bd670b35SErik Nordmark if (next_ire == NULL) {
174bd670b35SErik Nordmark /* keep ire if next_ire is null */
175bd670b35SErik Nordmark goto done;
176bd670b35SErik Nordmark }
177bd670b35SErik Nordmark ire_refrele(ire);
178c793af95Ssangeeta ire = next_ire;
179c793af95Ssangeeta }
180c793af95Ssangeeta }
181c793af95Ssangeeta
182bd670b35SErik Nordmark done:
183bd670b35SErik Nordmark /* Return generation before dropping lock */
184bd670b35SErik Nordmark if (generationp != NULL)
185bd670b35SErik Nordmark *generationp = ire->ire_generation;
186c793af95Ssangeeta
187bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
188e11c3f44Smeem
189c793af95Ssangeeta /*
190bd670b35SErik Nordmark * For shared-IP zones we need additional checks to what was
191bd670b35SErik Nordmark * done in ire_match_args to make sure IRE_LOCALs are handled.
192bd670b35SErik Nordmark *
193bd670b35SErik Nordmark * When ip_restrict_interzone_loopback is set, then
194bd670b35SErik Nordmark * we ensure that IRE_LOCAL are only used for loopback
195bd670b35SErik Nordmark * between zones when the logical "Ethernet" would
196bd670b35SErik Nordmark * have looped them back. That is, if in the absense of
197bd670b35SErik Nordmark * the IRE_LOCAL we would have sent to packet out the
198bd670b35SErik Nordmark * same ill.
199c793af95Ssangeeta */
200bd670b35SErik Nordmark if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
201bd670b35SErik Nordmark ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
202bd670b35SErik Nordmark ipst->ips_ip_restrict_interzone_loopback) {
203bd670b35SErik Nordmark ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
204bd670b35SErik Nordmark ASSERT(ire != NULL);
205c793af95Ssangeeta }
206c793af95Ssangeeta return (ire);
207c793af95Ssangeeta }
208c793af95Ssangeeta
209da14cebeSEric Cheng /*
210da14cebeSEric Cheng * This function is called by
211bd670b35SErik Nordmark * ip_input/ire_route_recursive when doing a route lookup on only the
212bd670b35SErik Nordmark * destination address.
213bd670b35SErik Nordmark *
214da14cebeSEric Cheng * The optimizations of this function over ire_ftable_lookup are:
215da14cebeSEric Cheng * o removing unnecessary flag matching
216da14cebeSEric Cheng * o doing longest prefix match instead of overloading it further
217da14cebeSEric Cheng * with the unnecessary "best_prefix_match"
218bd670b35SErik Nordmark *
219bd670b35SErik Nordmark * If no route is found we return IRE_NOROUTE.
220da14cebeSEric Cheng */
221bd670b35SErik Nordmark ire_t *
ire_ftable_lookup_simple_v4(ipaddr_t addr,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)222bd670b35SErik Nordmark ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
223bd670b35SErik Nordmark uint_t *generationp)
224da14cebeSEric Cheng {
225bd670b35SErik Nordmark ire_t *ire;
226da14cebeSEric Cheng struct rt_sockaddr rdst;
227da14cebeSEric Cheng struct rt_entry *rt;
228bd670b35SErik Nordmark irb_t *irb;
229da14cebeSEric Cheng
230da14cebeSEric Cheng rdst.rt_sin_len = sizeof (rdst);
231da14cebeSEric Cheng rdst.rt_sin_family = AF_INET;
232da14cebeSEric Cheng rdst.rt_sin_addr.s_addr = addr;
233da14cebeSEric Cheng
234da14cebeSEric Cheng /*
235da14cebeSEric Cheng * This is basically inlining a simpler version of ire_match_args
236da14cebeSEric Cheng */
237da14cebeSEric Cheng RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
238da14cebeSEric Cheng
239da14cebeSEric Cheng rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
240da14cebeSEric Cheng ipst->ips_ip_ftable, NULL, NULL);
241da14cebeSEric Cheng
242bd670b35SErik Nordmark if (rt == NULL)
243bd670b35SErik Nordmark goto bad;
244bd670b35SErik Nordmark
245bd670b35SErik Nordmark irb = &rt->rt_irb;
246bd670b35SErik Nordmark if (irb->irb_ire_cnt == 0)
247bd670b35SErik Nordmark goto bad;
248bd670b35SErik Nordmark
249bd670b35SErik Nordmark rw_enter(&irb->irb_lock, RW_READER);
250bd670b35SErik Nordmark ire = irb->irb_ire;
251bd670b35SErik Nordmark if (ire == NULL) {
252bd670b35SErik Nordmark rw_exit(&irb->irb_lock);
253bd670b35SErik Nordmark goto bad;
254da14cebeSEric Cheng }
255bd670b35SErik Nordmark while (IRE_IS_CONDEMNED(ire)) {
256bd670b35SErik Nordmark ire = ire->ire_next;
257bd670b35SErik Nordmark if (ire == NULL) {
258bd670b35SErik Nordmark rw_exit(&irb->irb_lock);
259bd670b35SErik Nordmark goto bad;
260bd670b35SErik Nordmark }
261da14cebeSEric Cheng }
262da14cebeSEric Cheng
263da14cebeSEric Cheng /* we have a ire that matches */
264bd670b35SErik Nordmark ire_refhold(ire);
265bd670b35SErik Nordmark rw_exit(&irb->irb_lock);
266bd670b35SErik Nordmark
267bd670b35SErik Nordmark /*
268bd670b35SErik Nordmark * round-robin only if we have more than one route in the bucket.
269bd670b35SErik Nordmark * ips_ip_ecmp_behavior controls when we do ECMP
270bd670b35SErik Nordmark * 2: always
271bd670b35SErik Nordmark * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
272bd670b35SErik Nordmark * 0: never
273bd670b35SErik Nordmark *
274bd670b35SErik Nordmark * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
275bd670b35SErik Nordmark * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
276bd670b35SErik Nordmark * and the IRE_INTERFACESs are likely to be shorter matches.
277bd670b35SErik Nordmark */
278bd670b35SErik Nordmark if (ire->ire_bucket->irb_ire_cnt > 1) {
279bd670b35SErik Nordmark if (ipst->ips_ip_ecmp_behavior == 2 ||
280bd670b35SErik Nordmark (ipst->ips_ip_ecmp_behavior == 1 &&
281bd670b35SErik Nordmark IS_DEFAULT_ROUTE(ire))) {
282bd670b35SErik Nordmark ire_t *next_ire;
283bd670b35SErik Nordmark ire_ftable_args_t margs;
284bd670b35SErik Nordmark
285188e1664SErik Nordmark bzero(&margs, sizeof (margs));
286bd670b35SErik Nordmark margs.ift_addr = addr;
287bd670b35SErik Nordmark margs.ift_zoneid = ALL_ZONES;
288bd670b35SErik Nordmark
289bd670b35SErik Nordmark next_ire = ire_round_robin(ire->ire_bucket, &margs,
290bd670b35SErik Nordmark xmit_hint, ire, ipst);
291bd670b35SErik Nordmark if (next_ire == NULL) {
292bd670b35SErik Nordmark /* keep ire if next_ire is null */
293bd670b35SErik Nordmark if (generationp != NULL)
294bd670b35SErik Nordmark *generationp = ire->ire_generation;
295bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
296bd670b35SErik Nordmark return (ire);
297bd670b35SErik Nordmark }
298bd670b35SErik Nordmark ire_refrele(ire);
299bd670b35SErik Nordmark ire = next_ire;
300bd670b35SErik Nordmark }
301bd670b35SErik Nordmark }
302bd670b35SErik Nordmark /* Return generation before dropping lock */
303bd670b35SErik Nordmark if (generationp != NULL)
304bd670b35SErik Nordmark *generationp = ire->ire_generation;
305bd670b35SErik Nordmark
306da14cebeSEric Cheng RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
307da14cebeSEric Cheng
308bd670b35SErik Nordmark /*
309bd670b35SErik Nordmark * Since we only did ALL_ZONES matches there is no special handling
310bd670b35SErik Nordmark * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
311bd670b35SErik Nordmark */
312da14cebeSEric Cheng return (ire);
313da14cebeSEric Cheng
314bd670b35SErik Nordmark bad:
315bd670b35SErik Nordmark if (generationp != NULL)
316bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY;
317da14cebeSEric Cheng
318bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
319bd670b35SErik Nordmark return (ire_reject(ipst, B_FALSE));
320da14cebeSEric Cheng }
321c793af95Ssangeeta
322c793af95Ssangeeta /*
323bd670b35SErik Nordmark * Find the ill matching a multicast group.
324c793af95Ssangeeta * Allows different routes for multicast addresses
325c793af95Ssangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
326c793af95Ssangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF
327c793af95Ssangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
328c793af95Ssangeeta * specify the interface to join on.
329c793af95Ssangeeta *
330bd670b35SErik Nordmark * Supports link-local addresses by using ire_route_recursive which follows
331bd670b35SErik Nordmark * the ill when recursing.
332bd670b35SErik Nordmark *
333bd670b35SErik Nordmark * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
334bd670b35SErik Nordmark * and the MULTIRT property can be different for different groups, we
335bd670b35SErik Nordmark * extract RTF_MULTIRT from the special unicast route added for a group
336bd670b35SErik Nordmark * with CGTP and pass that back in the multirtp argument.
337bd670b35SErik Nordmark * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
338bd670b35SErik Nordmark * We have a setsrcp argument for the same reason.
339c793af95Ssangeeta */
340bd670b35SErik Nordmark ill_t *
ire_lookup_multi_ill_v4(ipaddr_t group,zoneid_t zoneid,ip_stack_t * ipst,boolean_t * multirtp,ipaddr_t * setsrcp)341bd670b35SErik Nordmark ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
342bd670b35SErik Nordmark boolean_t *multirtp, ipaddr_t *setsrcp)
343c793af95Ssangeeta {
344c793af95Ssangeeta ire_t *ire;
345bd670b35SErik Nordmark ill_t *ill;
346c793af95Ssangeeta
347bd670b35SErik Nordmark ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
3489e3469d3SErik Nordmark MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
349bd670b35SErik Nordmark ASSERT(ire != NULL);
350bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
351c793af95Ssangeeta ire_refrele(ire);
352c793af95Ssangeeta return (NULL);
353c793af95Ssangeeta }
354bd670b35SErik Nordmark
355bd670b35SErik Nordmark if (multirtp != NULL)
356bd670b35SErik Nordmark *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
357bd670b35SErik Nordmark
358bd670b35SErik Nordmark ill = ire_nexthop_ill(ire);
359bd670b35SErik Nordmark ire_refrele(ire);
360bd670b35SErik Nordmark return (ill);
361c793af95Ssangeeta }
362c793af95Ssangeeta
363c793af95Ssangeeta /*
364c793af95Ssangeeta * Delete the passed in ire if the gateway addr matches
365c793af95Ssangeeta */
366c793af95Ssangeeta void
ire_del_host_redir(ire_t * ire,char * gateway)367c793af95Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway)
368c793af95Ssangeeta {
3696bdb8e66Sdd193516 if ((ire->ire_flags & RTF_DYNAMIC) &&
370c793af95Ssangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
371c793af95Ssangeeta ire_delete(ire);
372c793af95Ssangeeta }
373c793af95Ssangeeta
374c793af95Ssangeeta /*
375bd670b35SErik Nordmark * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
376c793af95Ssangeeta * pointing at the specified gateway and
377c793af95Ssangeeta * delete them. This routine is called only
378c793af95Ssangeeta * when a default gateway is going away.
379c793af95Ssangeeta */
380c793af95Ssangeeta void
ire_delete_host_redirects(ipaddr_t gateway,ip_stack_t * ipst)381f4b3ec61Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
382c793af95Ssangeeta {
383c793af95Ssangeeta struct rtfuncarg rtfarg;
384c793af95Ssangeeta
385188e1664SErik Nordmark bzero(&rtfarg, sizeof (rtfarg));
386c793af95Ssangeeta rtfarg.rt_func = ire_del_host_redir;
387c793af95Ssangeeta rtfarg.rt_arg = (void *)&gateway;
388188e1664SErik Nordmark rtfarg.rt_zoneid = ALL_ZONES;
389188e1664SErik Nordmark rtfarg.rt_ipst = ipst;
390f4b3ec61Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
391f4b3ec61Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
392c793af95Ssangeeta }
393c793af95Ssangeeta
394c793af95Ssangeeta /*
395f4b3ec61Sdh155122 * Obtain the rt_entry and rt_irb for the route to be added to
396f4b3ec61Sdh155122 * the ips_ip_ftable.
397c793af95Ssangeeta * First attempt to add a node to the radix tree via rn_addroute. If the
398c793af95Ssangeeta * route already exists, return the bucket for the existing route.
399c793af95Ssangeeta *
400c793af95Ssangeeta * Locking notes: Need to hold the global radix tree lock in write mode to
401c793af95Ssangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket()
402c793af95Ssangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
403c793af95Ssangeeta * while holding the irb_lock, but not the radix tree lock.
404c793af95Ssangeeta */
405c793af95Ssangeeta irb_t *
ire_get_bucket(ire_t * ire)406c793af95Ssangeeta ire_get_bucket(ire_t *ire)
407c793af95Ssangeeta {
408c793af95Ssangeeta struct radix_node *rn;
409c793af95Ssangeeta struct rt_entry *rt;
410c793af95Ssangeeta struct rt_sockaddr rmask, rdst;
411c793af95Ssangeeta irb_t *irb = NULL;
412f4b3ec61Sdh155122 ip_stack_t *ipst = ire->ire_ipst;
413c793af95Ssangeeta
414f4b3ec61Sdh155122 ASSERT(ipst->ips_ip_ftable != NULL);
415c793af95Ssangeeta
416c793af95Ssangeeta /* first try to see if route exists (based on rtalloc1) */
417188e1664SErik Nordmark bzero(&rdst, sizeof (rdst));
418c793af95Ssangeeta rdst.rt_sin_len = sizeof (rdst);
419c793af95Ssangeeta rdst.rt_sin_family = AF_INET;
420c793af95Ssangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr;
421c793af95Ssangeeta
422188e1664SErik Nordmark bzero(&rmask, sizeof (rmask));
423c793af95Ssangeeta rmask.rt_sin_len = sizeof (rmask);
424c793af95Ssangeeta rmask.rt_sin_family = AF_INET;
425c793af95Ssangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask;
426c793af95Ssangeeta
427c793af95Ssangeeta /*
428c793af95Ssangeeta * add the route. based on BSD's rtrequest1(RTM_ADD)
429c793af95Ssangeeta */
430c793af95Ssangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt));
43129bc4795Ssangeeta /* kmem_alloc failed */
43229bc4795Ssangeeta if (rt == NULL)
43329bc4795Ssangeeta return (NULL);
43429bc4795Ssangeeta
435188e1664SErik Nordmark bzero(rt, sizeof (*rt));
436c793af95Ssangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
437c793af95Ssangeeta rt->rt_dst = rdst;
438c793af95Ssangeeta irb = &rt->rt_irb;
439bd670b35SErik Nordmark irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
440f4b3ec61Sdh155122 irb->irb_ipst = ipst;
441c793af95Ssangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
442f4b3ec61Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
443f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
444f4b3ec61Sdh155122 ipst->ips_ip_ftable, (struct radix_node *)rt);
445c793af95Ssangeeta if (rn == NULL) {
446f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
447c793af95Ssangeeta Free(rt, rt_entry_cache);
448c793af95Ssangeeta rt = NULL;
449c793af95Ssangeeta irb = NULL;
450f4b3ec61Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
451f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
452f4b3ec61Sdh155122 ipst->ips_ip_ftable);
453f4b3ec61Sdh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
454c793af95Ssangeeta /* found a non-root match */
455c793af95Ssangeeta rt = (struct rt_entry *)rn;
456c793af95Ssangeeta }
457c793af95Ssangeeta }
458c793af95Ssangeeta if (rt != NULL) {
459c793af95Ssangeeta irb = &rt->rt_irb;
460bd670b35SErik Nordmark irb_refhold(irb);
461c793af95Ssangeeta }
462f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
463c793af95Ssangeeta return (irb);
464c793af95Ssangeeta }
465c793af95Ssangeeta
466c793af95Ssangeeta /*
467c793af95Ssangeeta * This function is used when the caller wants to know the outbound
468c793af95Ssangeeta * interface for a packet given only the address.
469c793af95Ssangeeta * If this is a offlink IP address and there are multiple
470c793af95Ssangeeta * routes to this destination, this routine will utilise the
471c793af95Ssangeeta * first route it finds to IP address
472c793af95Ssangeeta * Return values:
473c793af95Ssangeeta * 0 - FAILURE
474c793af95Ssangeeta * nonzero - ifindex
475c793af95Ssangeeta */
476c793af95Ssangeeta uint_t
ifindex_lookup(const struct sockaddr * ipaddr,zoneid_t zoneid)477c793af95Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
478c793af95Ssangeeta {
479c793af95Ssangeeta uint_t ifindex = 0;
480c793af95Ssangeeta ire_t *ire;
481c793af95Ssangeeta ill_t *ill;
482f4b3ec61Sdh155122 netstack_t *ns;
483f4b3ec61Sdh155122 ip_stack_t *ipst;
484c793af95Ssangeeta
485f4b3ec61Sdh155122 if (zoneid == ALL_ZONES)
486f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
487f4b3ec61Sdh155122 else
488f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid);
489f4b3ec61Sdh155122 ASSERT(ns != NULL);
490f4b3ec61Sdh155122
491f4b3ec61Sdh155122 /*
492f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero
493f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks.
494f4b3ec61Sdh155122 */
495f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
496f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID;
497f4b3ec61Sdh155122 ipst = ns->netstack_ip;
498c793af95Ssangeeta
499c793af95Ssangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
500c793af95Ssangeeta
501f4b3ec61Sdh155122 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
502bd670b35SErik Nordmark ill = ire_nexthop_ill(ire);
503bd670b35SErik Nordmark if (ill != NULL) {
504c793af95Ssangeeta ifindex = ill->ill_phyint->phyint_ifindex;
505bd670b35SErik Nordmark ill_refrele(ill);
506bd670b35SErik Nordmark }
507c793af95Ssangeeta ire_refrele(ire);
508c793af95Ssangeeta }
509f4b3ec61Sdh155122 netstack_rele(ns);
510c793af95Ssangeeta return (ifindex);
511c793af95Ssangeeta }
512c793af95Ssangeeta
513c793af95Ssangeeta /*
514c793af95Ssangeeta * Routine to find the route to a destination. If a ifindex is supplied
515bd670b35SErik Nordmark * it tries to match the route to the corresponding ipif for the ifindex
516c793af95Ssangeeta */
517c793af95Ssangeeta static ire_t *
route_to_dst(const struct sockaddr * dst_addr,zoneid_t zoneid,ip_stack_t * ipst)518f4b3ec61Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
519c793af95Ssangeeta {
520c793af95Ssangeeta ire_t *ire = NULL;
521c793af95Ssangeeta int match_flags;
522c793af95Ssangeeta
523bd670b35SErik Nordmark match_flags = MATCH_IRE_DSTONLY;
524c793af95Ssangeeta
525c793af95Ssangeeta /* XXX pass NULL tsl for now */
526c793af95Ssangeeta
527c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) {
528bd670b35SErik Nordmark ire = ire_route_recursive_v4(
529bd670b35SErik Nordmark ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
5309e3469d3SErik Nordmark zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
5319e3469d3SErik Nordmark NULL, NULL);
532c793af95Ssangeeta } else {
533bd670b35SErik Nordmark ire = ire_route_recursive_v6(
534bd670b35SErik Nordmark &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
5359e3469d3SErik Nordmark zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
5369e3469d3SErik Nordmark NULL, NULL);
537bd670b35SErik Nordmark }
538bd670b35SErik Nordmark ASSERT(ire != NULL);
539bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
540bd670b35SErik Nordmark ire_refrele(ire);
541bd670b35SErik Nordmark return (NULL);
542c793af95Ssangeeta }
543c793af95Ssangeeta return (ire);
544c793af95Ssangeeta }
545c793af95Ssangeeta
546c793af95Ssangeeta /*
547c793af95Ssangeeta * This routine is called by IP Filter to send a packet out on the wire
548bd670b35SErik Nordmark * to a specified dstination (which may be onlink or offlink). The ifindex may
549bd670b35SErik Nordmark * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
550c793af95Ssangeeta * an outgoing interface and requires the nexthop to be on that interface.
551c793af95Ssangeeta * IP WILL NOT DO the following to the data packet before sending it out:
552c793af95Ssangeeta * a. manipulate ttl
553edd26dc5Sdr146992 * b. ipsec work
554edd26dc5Sdr146992 * c. fragmentation
555edd26dc5Sdr146992 *
556edd26dc5Sdr146992 * If the packet has been prepared for hardware checksum then it will be
557edd26dc5Sdr146992 * passed off to ip_send_align_cksum() to check that the flags set on the
558edd26dc5Sdr146992 * packet are in alignment with the capabilities of the new outgoing NIC.
559c793af95Ssangeeta *
560c793af95Ssangeeta * Return values:
561c793af95Ssangeeta * 0: IP was able to send of the data pkt
562c793af95Ssangeeta * ECOMM: Could not send packet
563c793af95Ssangeeta * ENONET No route to dst. It is up to the caller
564c793af95Ssangeeta * to send icmp unreachable error message,
565c793af95Ssangeeta * EINPROGRESS The macaddr of the onlink dst or that
566c793af95Ssangeeta * of the offlink dst's nexthop needs to get
567c793af95Ssangeeta * resolved before packet can be sent to dst.
568c793af95Ssangeeta * Thus transmission is not guaranteed.
569bd670b35SErik Nordmark * Note: No longer have visibility to the ARP queue
570bd670b35SErik Nordmark * hence no EINPROGRESS.
571c793af95Ssangeeta */
572c793af95Ssangeeta int
ipfil_sendpkt(const struct sockaddr * dst_addr,mblk_t * mp,uint_t ifindex,zoneid_t zoneid)573c793af95Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
574c793af95Ssangeeta zoneid_t zoneid)
575c793af95Ssangeeta {
576bd670b35SErik Nordmark ipaddr_t nexthop;
577f4b3ec61Sdh155122 netstack_t *ns;
578f4b3ec61Sdh155122 ip_stack_t *ipst;
579bd670b35SErik Nordmark ip_xmit_attr_t ixas;
580bd670b35SErik Nordmark int error;
581c793af95Ssangeeta
582c793af95Ssangeeta ASSERT(mp != NULL);
583c793af95Ssangeeta
584f4b3ec61Sdh155122 if (zoneid == ALL_ZONES)
585f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
586f4b3ec61Sdh155122 else
587f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid);
588f4b3ec61Sdh155122 ASSERT(ns != NULL);
589f4b3ec61Sdh155122
590f4b3ec61Sdh155122 /*
591f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero
592f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks.
593f4b3ec61Sdh155122 */
594f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
595f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID;
596f4b3ec61Sdh155122 ipst = ns->netstack_ip;
597f4b3ec61Sdh155122
598c793af95Ssangeeta ASSERT(dst_addr->sa_family == AF_INET ||
599c793af95Ssangeeta dst_addr->sa_family == AF_INET6);
600c793af95Ssangeeta
601bd670b35SErik Nordmark bzero(&ixas, sizeof (ixas));
602bd670b35SErik Nordmark /*
603bd670b35SErik Nordmark * No IPsec, no fragmentation, and don't let any hooks see
604bd670b35SErik Nordmark * the packet.
605bd670b35SErik Nordmark */
606bd670b35SErik Nordmark ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
607bd670b35SErik Nordmark ixas.ixa_cred = kcred;
608bd670b35SErik Nordmark ixas.ixa_cpid = NOPID;
609bd670b35SErik Nordmark ixas.ixa_tsl = NULL;
610bd670b35SErik Nordmark ixas.ixa_ipst = ipst;
611bd670b35SErik Nordmark ixas.ixa_ifindex = ifindex;
612bd670b35SErik Nordmark
613c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) {
614bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr;
615bd670b35SErik Nordmark
616bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_IS_IPV4;
617bd670b35SErik Nordmark nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
618bd670b35SErik Nordmark if (nexthop != ipha->ipha_dst) {
619bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET;
620bd670b35SErik Nordmark ixas.ixa_nexthop_v4 = nexthop;
621bd670b35SErik Nordmark }
622bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ipha->ipha_ttl;
623c793af95Ssangeeta } else {
624bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr;
625bd670b35SErik Nordmark in6_addr_t *nexthop6;
626bd670b35SErik Nordmark
627bd670b35SErik Nordmark nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
628bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
629bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET;
630bd670b35SErik Nordmark ixas.ixa_nexthop_v6 = *nexthop6;
631c793af95Ssangeeta }
632bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ip6h->ip6_hops;
633c793af95Ssangeeta }
634bd670b35SErik Nordmark error = ip_output_simple(mp, &ixas);
635bd670b35SErik Nordmark ixa_cleanup(&ixas);
636c793af95Ssangeeta
637f4b3ec61Sdh155122 netstack_rele(ns);
638bd670b35SErik Nordmark switch (error) {
639bd670b35SErik Nordmark case 0:
640bd670b35SErik Nordmark break;
641bd670b35SErik Nordmark
642bd670b35SErik Nordmark case EHOSTUNREACH:
643bd670b35SErik Nordmark case ENETUNREACH:
644bd670b35SErik Nordmark error = ENONET;
645bd670b35SErik Nordmark break;
646bd670b35SErik Nordmark
647bd670b35SErik Nordmark default:
648bd670b35SErik Nordmark error = ECOMM;
649bd670b35SErik Nordmark break;
650c793af95Ssangeeta }
651bd670b35SErik Nordmark return (error);
652edd26dc5Sdr146992 }
653edd26dc5Sdr146992
654c793af95Ssangeeta /*
655c793af95Ssangeeta * callback function provided by ire_ftable_lookup when calling
656c793af95Ssangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in
657c793af95Ssangeeta * the radix tree.
658c793af95Ssangeeta */
659c793af95Ssangeeta boolean_t
ire_find_best_route(struct radix_node * rn,void * arg)660c793af95Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg)
661c793af95Ssangeeta {
662c793af95Ssangeeta struct rt_entry *rt = (struct rt_entry *)rn;
663c793af95Ssangeeta irb_t *irb_ptr;
664c793af95Ssangeeta ire_t *ire;
665c793af95Ssangeeta ire_ftable_args_t *margs = arg;
666c793af95Ssangeeta ipaddr_t match_mask;
667c793af95Ssangeeta
668c793af95Ssangeeta ASSERT(rt != NULL);
669c793af95Ssangeeta
670c793af95Ssangeeta irb_ptr = &rt->rt_irb;
671c793af95Ssangeeta
672c793af95Ssangeeta if (irb_ptr->irb_ire_cnt == 0)
673c793af95Ssangeeta return (B_FALSE);
674c793af95Ssangeeta
675c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_READER);
676c793af95Ssangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
677bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire))
678c793af95Ssangeeta continue;
67944b099c4SSowmini Varadhan ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
68044b099c4SSowmini Varadhan if (margs->ift_flags & MATCH_IRE_MASK)
681c793af95Ssangeeta match_mask = margs->ift_mask;
682c793af95Ssangeeta else
683c793af95Ssangeeta match_mask = ire->ire_mask;
684c793af95Ssangeeta
685c793af95Ssangeeta if (ire_match_args(ire, margs->ift_addr, match_mask,
686bd670b35SErik Nordmark margs->ift_gateway, margs->ift_type, margs->ift_ill,
687bd670b35SErik Nordmark margs->ift_zoneid, margs->ift_tsl,
688bd670b35SErik Nordmark margs->ift_flags)) {
689bd670b35SErik Nordmark ire_refhold(ire);
690c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock);
691c793af95Ssangeeta margs->ift_best_ire = ire;
692c793af95Ssangeeta return (B_TRUE);
693c793af95Ssangeeta }
694c793af95Ssangeeta }
695c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock);
696c793af95Ssangeeta return (B_FALSE);
697c793af95Ssangeeta }
698c793af95Ssangeeta
699c793af95Ssangeeta /*
700c793af95Ssangeeta * ftable irb_t structures are dynamically allocated, and we need to
701c793af95Ssangeeta * check if the irb_t (and associated ftable tree attachment) needs to
702c793af95Ssangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need
703c793af95Ssangeeta * be verified are:
704c793af95Ssangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
705c793af95Ssangeeta * - no other threads holding references to ire's in the bucket,
706c793af95Ssangeeta * i.e., irb_nire == 0
707c793af95Ssangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
708c793af95Ssangeeta * - need to hold the global tree lock and irb_lock in write mode.
709c793af95Ssangeeta */
710c793af95Ssangeeta void
irb_refrele_ftable(irb_t * irb)711c793af95Ssangeeta irb_refrele_ftable(irb_t *irb)
712c793af95Ssangeeta {
713c793af95Ssangeeta for (;;) {
714c793af95Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER);
715c793af95Ssangeeta ASSERT(irb->irb_refcnt != 0);
716c793af95Ssangeeta if (irb->irb_refcnt != 1) {
717c793af95Ssangeeta /*
718c793af95Ssangeeta * Someone has a reference to this radix node
719c793af95Ssangeeta * or there is some bucket walker.
720c793af95Ssangeeta */
721c793af95Ssangeeta irb->irb_refcnt--;
722c793af95Ssangeeta rw_exit(&irb->irb_lock);
723c793af95Ssangeeta return;
724c793af95Ssangeeta } else {
725c793af95Ssangeeta /*
726c793af95Ssangeeta * There is no other walker, nor is there any
727c793af95Ssangeeta * other thread that holds a direct ref to this
728c793af95Ssangeeta * radix node. Do the clean up if needed. Call
729c793af95Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
730c793af95Ssangeeta */
731c793af95Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) {
732c793af95Ssangeeta ire_t *ire_list;
733c793af95Ssangeeta
734c793af95Ssangeeta ire_list = ire_unlink(irb);
735c793af95Ssangeeta rw_exit(&irb->irb_lock);
736c793af95Ssangeeta
737c793af95Ssangeeta if (ire_list != NULL)
738c793af95Ssangeeta ire_cleanup(ire_list);
739c793af95Ssangeeta /*
740c793af95Ssangeeta * more CONDEMNED entries could have
741c793af95Ssangeeta * been added while we dropped the lock,
742c793af95Ssangeeta * so we have to re-check.
743c793af95Ssangeeta */
744c793af95Ssangeeta continue;
745c793af95Ssangeeta }
746c793af95Ssangeeta
747c793af95Ssangeeta /*
748c793af95Ssangeeta * Now check if there are still any ires
749c793af95Ssangeeta * associated with this radix node.
750c793af95Ssangeeta */
751c793af95Ssangeeta if (irb->irb_nire != 0) {
752c793af95Ssangeeta /*
753c793af95Ssangeeta * someone is still holding on
754c793af95Ssangeeta * to ires in this bucket
755c793af95Ssangeeta */
756c793af95Ssangeeta irb->irb_refcnt--;
757c793af95Ssangeeta rw_exit(&irb->irb_lock);
758c793af95Ssangeeta return;
759c793af95Ssangeeta } else {
760c793af95Ssangeeta /*
761c793af95Ssangeeta * Everything is clear. Zero walkers,
762c793af95Ssangeeta * Zero threads with a ref to this
763c793af95Ssangeeta * radix node, Zero ires associated with
764c793af95Ssangeeta * this radix node. Due to lock order,
765c793af95Ssangeeta * check the above conditions again
766c793af95Ssangeeta * after grabbing all locks in the right order
767c793af95Ssangeeta */
768c793af95Ssangeeta rw_exit(&irb->irb_lock);
769c793af95Ssangeeta if (irb_inactive(irb))
770c793af95Ssangeeta return;
771c793af95Ssangeeta /*
772c793af95Ssangeeta * irb_inactive could not free the irb.
773c793af95Ssangeeta * See if there are any walkers, if not
774c793af95Ssangeeta * try to clean up again.
775c793af95Ssangeeta */
776c793af95Ssangeeta }
777c793af95Ssangeeta }
778c793af95Ssangeeta }
779c793af95Ssangeeta }
780c793af95Ssangeeta
781c793af95Ssangeeta /*
782bd670b35SErik Nordmark * IRE iterator used by ire_ftable_lookup to process multiple equal
783bd670b35SErik Nordmark * routes. Given a starting point in the hash list (hash), walk the IREs
784bd670b35SErik Nordmark * in the bucket skipping deleted entries. We treat the bucket as a circular
785bd670b35SErik Nordmark * list for the purposes of walking it.
786bd670b35SErik Nordmark * Returns the IRE (held) that corresponds to the hash value. If that IRE is
787bd670b35SErik Nordmark * not applicable (ire_match_args failed) then it returns a subsequent one.
788bd670b35SErik Nordmark * If we fail to find an IRE we return NULL.
789c793af95Ssangeeta *
790bd670b35SErik Nordmark * Assumes that the caller holds a reference on the IRE bucket and a read lock
791bd670b35SErik Nordmark * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
792bd670b35SErik Nordmark *
793bd670b35SErik Nordmark * Applies to IPv4 and IPv6.
794bd670b35SErik Nordmark *
795bd670b35SErik Nordmark * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
796bd670b35SErik Nordmark * address and bucket, we compare against ire_type for the orig_ire. We also
797bd670b35SErik Nordmark * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
798188e1664SErik Nordmark * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
799bd670b35SErik Nordmark *
800bd670b35SErik Nordmark * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
801bd670b35SErik Nordmark * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
802bd670b35SErik Nordmark * in which the zone has an IP address. We check this for the global zone
803bd670b35SErik Nordmark * even if no shared-IP zones are configured.
804c793af95Ssangeeta */
805c793af95Ssangeeta ire_t *
ire_round_robin(irb_t * irb_ptr,ire_ftable_args_t * margs,uint_t hash,ire_t * orig_ire,ip_stack_t * ipst)806bd670b35SErik Nordmark ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
807bd670b35SErik Nordmark ire_t *orig_ire, ip_stack_t *ipst)
808c793af95Ssangeeta {
809c793af95Ssangeeta ire_t *ire, *maybe_ire = NULL;
810bd670b35SErik Nordmark uint_t maybe_badcnt;
811bd670b35SErik Nordmark uint_t maxwalk;
812bd670b35SErik Nordmark
813bd670b35SErik Nordmark /* Fold in more bits from the hint/hash */
814bd670b35SErik Nordmark hash = hash ^ (hash >> 8) ^ (hash >> 16);
815c793af95Ssangeeta
816c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER);
817bd670b35SErik Nordmark maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */
818*82dec0a6SRobert Mustacchi if (maxwalk == 0) {
819*82dec0a6SRobert Mustacchi rw_exit(&irb_ptr->irb_lock);
820*82dec0a6SRobert Mustacchi return (NULL);
821*82dec0a6SRobert Mustacchi }
822*82dec0a6SRobert Mustacchi
823bd670b35SErik Nordmark hash %= maxwalk;
824bd670b35SErik Nordmark irb_refhold_locked(irb_ptr);
825c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock);
826c793af95Ssangeeta
827c793af95Ssangeeta /*
828c793af95Ssangeeta * Round-robin the routers list looking for a route that
829c793af95Ssangeeta * matches the passed in parameters.
830bd670b35SErik Nordmark * First we skip "hash" number of non-condemned IREs.
831bd670b35SErik Nordmark * Then we match the IRE.
832bd670b35SErik Nordmark * If we find an ire which has a non-zero ire_badcnt then we remember
833bd670b35SErik Nordmark * it and keep on looking for a lower ire_badcnt.
834bd670b35SErik Nordmark * If we come to the end of the list we continue (treat the
835bd670b35SErik Nordmark * bucket list as a circular list) but we match less than "max"
836bd670b35SErik Nordmark * entries.
837c793af95Ssangeeta */
838bd670b35SErik Nordmark ire = irb_ptr->irb_ire;
839bd670b35SErik Nordmark while (maxwalk > 0) {
840bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire))
841bd670b35SErik Nordmark goto next_ire_skip;
842c793af95Ssangeeta
843bd670b35SErik Nordmark /* Skip the first "hash" entries to do ECMP */
844bd670b35SErik Nordmark if (hash != 0) {
845bd670b35SErik Nordmark hash--;
846bd670b35SErik Nordmark goto next_ire_skip;
847bd670b35SErik Nordmark }
848bd670b35SErik Nordmark
849bd670b35SErik Nordmark /* See CGTP comment above */
850bd670b35SErik Nordmark if (ire->ire_type != orig_ire->ire_type ||
851188e1664SErik Nordmark ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
852c793af95Ssangeeta goto next_ire;
853c793af95Ssangeeta
854c793af95Ssangeeta /*
855bd670b35SErik Nordmark * Note: Since IPv6 has hash buckets instead of radix
856bd670b35SErik Nordmark * buckers we need to explicitly compare the addresses.
857bd670b35SErik Nordmark * That makes this less efficient since we will be called
858bd670b35SErik Nordmark * even if there is no alternatives just because the
859bd670b35SErik Nordmark * bucket has multiple IREs for different addresses.
860c793af95Ssangeeta */
861bd670b35SErik Nordmark if (ire->ire_ipversion == IPV6_VERSION) {
862bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
863bd670b35SErik Nordmark &ire->ire_addr_v6))
864c793af95Ssangeeta goto next_ire;
865c793af95Ssangeeta }
866c793af95Ssangeeta
867c793af95Ssangeeta /*
868bd670b35SErik Nordmark * For some reason find_best_route uses ire_mask. We do
869bd670b35SErik Nordmark * the same.
870bd670b35SErik Nordmark */
871bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION ?
872bd670b35SErik Nordmark !ire_match_args(ire, margs->ift_addr,
873bd670b35SErik Nordmark ire->ire_mask, margs->ift_gateway,
874bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid,
875bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags) :
876bd670b35SErik Nordmark !ire_match_args_v6(ire, &margs->ift_addr_v6,
877bd670b35SErik Nordmark &ire->ire_mask_v6, &margs->ift_gateway_v6,
878bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid,
879bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags))
880bd670b35SErik Nordmark goto next_ire;
881bd670b35SErik Nordmark
882bd670b35SErik Nordmark if (margs->ift_zoneid != ALL_ZONES &&
883bd670b35SErik Nordmark (ire->ire_type & IRE_OFFLINK)) {
884bd670b35SErik Nordmark /*
885bd670b35SErik Nordmark * When we're in a zone, we're only
886c793af95Ssangeeta * interested in routers that are
887c793af95Ssangeeta * reachable through ipifs within our zone.
888c793af95Ssangeeta */
889bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) {
890bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v4(
891bd670b35SErik Nordmark ire->ire_gateway_addr, margs->ift_zoneid,
892bd670b35SErik Nordmark ire->ire_ill, margs->ift_tsl, ipst,
893bd670b35SErik Nordmark B_TRUE))
894bd670b35SErik Nordmark goto next_ire;
895bd670b35SErik Nordmark } else {
896bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v6(
897bd670b35SErik Nordmark &ire->ire_gateway_addr_v6,
898bd670b35SErik Nordmark margs->ift_zoneid, ire->ire_ill,
899bd670b35SErik Nordmark margs->ift_tsl, ipst, B_TRUE))
900bd670b35SErik Nordmark goto next_ire;
901bd670b35SErik Nordmark }
902bd670b35SErik Nordmark }
903bd670b35SErik Nordmark mutex_enter(&ire->ire_lock);
904bd670b35SErik Nordmark /* Look for stale ire_badcnt and clear */
905bd670b35SErik Nordmark if (ire->ire_badcnt != 0 &&
906d3d50737SRafael Vanoni (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
907bd670b35SErik Nordmark ipst->ips_ip_ire_badcnt_lifetime))
908bd670b35SErik Nordmark ire->ire_badcnt = 0;
909bd670b35SErik Nordmark mutex_exit(&ire->ire_lock);
910e11c3f44Smeem
911bd670b35SErik Nordmark if (ire->ire_badcnt == 0) {
912bd670b35SErik Nordmark /* We found one with a zero badcnt; done */
913bd670b35SErik Nordmark ire_refhold(ire);
914bd670b35SErik Nordmark /*
915bd670b35SErik Nordmark * Care needed since irb_refrele grabs WLOCK to free
916bd670b35SErik Nordmark * the irb_t.
917bd670b35SErik Nordmark */
918bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) {
919bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
920bd670b35SErik Nordmark irb_refrele(irb_ptr);
921bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
922bd670b35SErik Nordmark } else {
923bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock);
924bd670b35SErik Nordmark irb_refrele(irb_ptr);
925bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock,
926bd670b35SErik Nordmark RW_READER);
927bd670b35SErik Nordmark }
928c793af95Ssangeeta return (ire);
929c793af95Ssangeeta }
930bd670b35SErik Nordmark /*
931bd670b35SErik Nordmark * keep looking to see if there is a better (lower
932bd670b35SErik Nordmark * badcnt) matching IRE, but save this one as a last resort.
933bd670b35SErik Nordmark * If we find a lower badcnt pick that one as the last* resort.
934bd670b35SErik Nordmark */
935bd670b35SErik Nordmark if (maybe_ire == NULL) {
936bd670b35SErik Nordmark maybe_ire = ire;
937bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt;
938bd670b35SErik Nordmark } else if (ire->ire_badcnt < maybe_badcnt) {
939bd670b35SErik Nordmark maybe_ire = ire;
940bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt;
941bd670b35SErik Nordmark }
942bd670b35SErik Nordmark
943c793af95Ssangeeta next_ire:
944bd670b35SErik Nordmark maxwalk--;
945bd670b35SErik Nordmark next_ire_skip:
946bd670b35SErik Nordmark ire = ire->ire_next;
947bd670b35SErik Nordmark if (ire == NULL)
948bd670b35SErik Nordmark ire = irb_ptr->irb_ire;
949c793af95Ssangeeta }
950c793af95Ssangeeta if (maybe_ire != NULL)
951bd670b35SErik Nordmark ire_refhold(maybe_ire);
952bd670b35SErik Nordmark
953bd670b35SErik Nordmark /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
954bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) {
955bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
956bd670b35SErik Nordmark irb_refrele(irb_ptr);
957bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
958bd670b35SErik Nordmark } else {
959bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock);
960bd670b35SErik Nordmark irb_refrele(irb_ptr);
961bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
962bd670b35SErik Nordmark }
963c793af95Ssangeeta return (maybe_ire);
964c793af95Ssangeeta }
9652679e103Ssowmini
9662679e103Ssowmini void
irb_refhold_rn(struct radix_node * rn)9672679e103Ssowmini irb_refhold_rn(struct radix_node *rn)
9682679e103Ssowmini {
9692679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0)
970bd670b35SErik Nordmark irb_refhold(&((rt_t *)(rn))->rt_irb);
9712679e103Ssowmini }
9722679e103Ssowmini
9732679e103Ssowmini void
irb_refrele_rn(struct radix_node * rn)9742679e103Ssowmini irb_refrele_rn(struct radix_node *rn)
9752679e103Ssowmini {
9762679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0)
9772679e103Ssowmini irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
9782679e103Ssowmini }
979bd670b35SErik Nordmark
98044b099c4SSowmini Varadhan
98144b099c4SSowmini Varadhan /*
98244b099c4SSowmini Varadhan * ip_select_src_ill() is used by ip_select_route() to find the src_ill
98344b099c4SSowmini Varadhan * to be used for source-aware routing table lookup. This function will
98444b099c4SSowmini Varadhan * ignore IPIF_UNNUMBERED interface addresses, and will only return a
98544b099c4SSowmini Varadhan * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
98644b099c4SSowmini Varadhan * interfaces).
98744b099c4SSowmini Varadhan */
98844b099c4SSowmini Varadhan static ill_t *
ip_select_src_ill(const in6_addr_t * v6src,zoneid_t zoneid,ip_stack_t * ipst)98944b099c4SSowmini Varadhan ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
99044b099c4SSowmini Varadhan {
99144b099c4SSowmini Varadhan ipif_t *ipif;
99244b099c4SSowmini Varadhan ill_t *ill;
99344b099c4SSowmini Varadhan boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
99444b099c4SSowmini Varadhan ipaddr_t v4src;
99544b099c4SSowmini Varadhan
99644b099c4SSowmini Varadhan if (isv6) {
99744b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
99844b099c4SSowmini Varadhan } else {
99944b099c4SSowmini Varadhan IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
100044b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
100144b099c4SSowmini Varadhan }
100244b099c4SSowmini Varadhan if (ipif == NULL)
100344b099c4SSowmini Varadhan return (NULL);
100444b099c4SSowmini Varadhan ill = ipif->ipif_ill;
100544b099c4SSowmini Varadhan ill_refhold(ill);
100644b099c4SSowmini Varadhan ipif_refrele(ipif);
100744b099c4SSowmini Varadhan return (ill);
100844b099c4SSowmini Varadhan }
100944b099c4SSowmini Varadhan
101044b099c4SSowmini Varadhan /*
101144b099c4SSowmini Varadhan * verify that v6src is configured on ill
101244b099c4SSowmini Varadhan */
101344b099c4SSowmini Varadhan static boolean_t
ip_verify_src_on_ill(const in6_addr_t v6src,ill_t * ill,zoneid_t zoneid)101444b099c4SSowmini Varadhan ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
101544b099c4SSowmini Varadhan {
101644b099c4SSowmini Varadhan ipif_t *ipif;
101744b099c4SSowmini Varadhan ip_stack_t *ipst;
101844b099c4SSowmini Varadhan ipaddr_t v4src;
101944b099c4SSowmini Varadhan
102044b099c4SSowmini Varadhan if (ill == NULL)
102144b099c4SSowmini Varadhan return (B_FALSE);
102244b099c4SSowmini Varadhan ipst = ill->ill_ipst;
102344b099c4SSowmini Varadhan
102444b099c4SSowmini Varadhan if (ill->ill_isv6) {
102544b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
102644b099c4SSowmini Varadhan } else {
102744b099c4SSowmini Varadhan IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
102844b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
102944b099c4SSowmini Varadhan }
103044b099c4SSowmini Varadhan
103144b099c4SSowmini Varadhan if (ipif != NULL) {
103244b099c4SSowmini Varadhan ipif_refrele(ipif);
103344b099c4SSowmini Varadhan return (B_TRUE);
103444b099c4SSowmini Varadhan } else {
103544b099c4SSowmini Varadhan return (B_FALSE);
103644b099c4SSowmini Varadhan }
103744b099c4SSowmini Varadhan }
103844b099c4SSowmini Varadhan
1039bd670b35SErik Nordmark /*
1040bd670b35SErik Nordmark * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
1041bd670b35SErik Nordmark * routes this routine sets up a ire_nce_cache as well. The caller needs to
1042bd670b35SErik Nordmark * lookup an nce for the multicast case.
104344b099c4SSowmini Varadhan *
104444b099c4SSowmini Varadhan * When src_multihoming is set to 2 (strict src multihoming) we use the source
104544b099c4SSowmini Varadhan * address to select the interface and route. If IP_BOUND_IF etc are
104644b099c4SSowmini Varadhan * specified, we require that they specify an interface on which the
104744b099c4SSowmini Varadhan * source address is assigned.
104844b099c4SSowmini Varadhan *
104944b099c4SSowmini Varadhan * When src_multihoming is set to 1 (preferred src aware route
105044b099c4SSowmini Varadhan * selection) the unicast lookup prefers a matching source
105144b099c4SSowmini Varadhan * (i.e., that the route points out an ill on which the source is assigned), but
105244b099c4SSowmini Varadhan * if no such route is found we fallback to not considering the source in the
105344b099c4SSowmini Varadhan * route lookup.
105444b099c4SSowmini Varadhan *
105544b099c4SSowmini Varadhan * We skip the src_multihoming check when the source isn't (yet) set, and
105644b099c4SSowmini Varadhan * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
105744b099c4SSowmini Varadhan * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
105844b099c4SSowmini Varadhan * when secpolicy_net_rawaccess().
1059bd670b35SErik Nordmark */
1060bd670b35SErik Nordmark ire_t *
ip_select_route(const in6_addr_t * v6dst,const in6_addr_t v6src,ip_xmit_attr_t * ixa,uint_t * generationp,in6_addr_t * setsrcp,int * errorp,boolean_t * multirtp)106144b099c4SSowmini Varadhan ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
106244b099c4SSowmini Varadhan ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
106344b099c4SSowmini Varadhan int *errorp, boolean_t *multirtp)
1064bd670b35SErik Nordmark {
1065bd670b35SErik Nordmark uint_t match_args;
1066bd670b35SErik Nordmark uint_t ire_type;
106744b099c4SSowmini Varadhan ill_t *ill = NULL;
1068bd670b35SErik Nordmark ire_t *ire;
1069bd670b35SErik Nordmark ip_stack_t *ipst = ixa->ixa_ipst;
1070bd670b35SErik Nordmark ipaddr_t v4dst;
1071bd670b35SErik Nordmark in6_addr_t v6nexthop;
1072bd670b35SErik Nordmark iaflags_t ixaflags = ixa->ixa_flags;
1073bd670b35SErik Nordmark nce_t *nce;
107444b099c4SSowmini Varadhan boolean_t preferred_src_aware = B_FALSE;
107544b099c4SSowmini Varadhan boolean_t verify_src;
107644b099c4SSowmini Varadhan boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
107744b099c4SSowmini Varadhan int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
107844b099c4SSowmini Varadhan
107944b099c4SSowmini Varadhan /*
108044b099c4SSowmini Varadhan * We only verify that the src has been configured on a selected
108144b099c4SSowmini Varadhan * interface if the src is not :: or INADDR_ANY, and if the
108244b099c4SSowmini Varadhan * IXAF_VERIFY_SOURCE flag is set.
108344b099c4SSowmini Varadhan */
108444b099c4SSowmini Varadhan verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
108544b099c4SSowmini Varadhan (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
1086bd670b35SErik Nordmark
1087bd670b35SErik Nordmark match_args = MATCH_IRE_SECATTR;
1088bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
1089bd670b35SErik Nordmark if (setsrcp != NULL)
1090bd670b35SErik Nordmark ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1091bd670b35SErik Nordmark if (errorp != NULL)
1092bd670b35SErik Nordmark ASSERT(*errorp == 0);
1093bd670b35SErik Nordmark
1094bd670b35SErik Nordmark /*
1095bd670b35SErik Nordmark * The content of the ixa will be different if IP_NEXTHOP,
1096bd670b35SErik Nordmark * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
1097bd670b35SErik Nordmark */
1098bd670b35SErik Nordmark
109944b099c4SSowmini Varadhan if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
1100bd670b35SErik Nordmark /* Pick up the IRE_MULTICAST for the ill */
1101bd670b35SErik Nordmark if (ixa->ixa_multicast_ifindex != 0) {
1102bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
110344b099c4SSowmini Varadhan isv6, ipst);
1104bd670b35SErik Nordmark } else if (ixaflags & IXAF_SCOPEID_SET) {
1105bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */
1106bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0);
1107bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
110844b099c4SSowmini Varadhan isv6, ipst);
1109bd670b35SErik Nordmark } else if (ixa->ixa_ifindex != 0) {
1110bd670b35SErik Nordmark /*
1111bd670b35SErik Nordmark * In the ipmp case, the ixa_ifindex is set to
1112bd670b35SErik Nordmark * point at an under_ill and we would return the
1113bd670b35SErik Nordmark * ire_multicast() corresponding to that under_ill.
1114bd670b35SErik Nordmark */
1115bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
111644b099c4SSowmini Varadhan isv6, ipst);
111744b099c4SSowmini Varadhan } else if (src_multihoming != 0 && verify_src) {
111844b099c4SSowmini Varadhan /* Look up the ill based on the source address */
111944b099c4SSowmini Varadhan ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
112044b099c4SSowmini Varadhan /*
112144b099c4SSowmini Varadhan * Since we looked up the ill from the source there
112244b099c4SSowmini Varadhan * is no need to verify that the source is on the ill
112344b099c4SSowmini Varadhan * below.
112444b099c4SSowmini Varadhan */
112544b099c4SSowmini Varadhan verify_src = B_FALSE;
112644b099c4SSowmini Varadhan if (ill != NULL && IS_VNI(ill)) {
112744b099c4SSowmini Varadhan ill_t *usesrc = ill;
112844b099c4SSowmini Varadhan
112944b099c4SSowmini Varadhan ill = ill_lookup_usesrc(usesrc);
113044b099c4SSowmini Varadhan ill_refrele(usesrc);
113144b099c4SSowmini Varadhan }
113244b099c4SSowmini Varadhan } else if (!isv6) {
1133bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY;
1134bd670b35SErik Nordmark
113544b099c4SSowmini Varadhan ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
113644b099c4SSowmini Varadhan ipst, multirtp, &v4setsrc);
1137bd670b35SErik Nordmark if (setsrcp != NULL)
1138bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1139bd670b35SErik Nordmark } else {
114044b099c4SSowmini Varadhan ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
114144b099c4SSowmini Varadhan ipst, multirtp, setsrcp);
1142bd670b35SErik Nordmark }
1143bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) {
1144bd670b35SErik Nordmark ill_refrele(ill);
1145bd670b35SErik Nordmark ill = NULL;
1146bd670b35SErik Nordmark }
1147bd670b35SErik Nordmark if (ill == NULL) {
1148bd670b35SErik Nordmark if (errorp != NULL)
1149bd670b35SErik Nordmark *errorp = ENXIO;
1150bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */
115144b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6);
1152bd670b35SErik Nordmark return (ire);
1153bd670b35SErik Nordmark }
1154bd670b35SErik Nordmark if (!(ill->ill_flags & ILLF_MULTICAST)) {
1155bd670b35SErik Nordmark ill_refrele(ill);
1156bd670b35SErik Nordmark if (errorp != NULL)
1157bd670b35SErik Nordmark *errorp = EHOSTUNREACH;
1158bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */
115944b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6);
116044b099c4SSowmini Varadhan return (ire);
116144b099c4SSowmini Varadhan }
116244b099c4SSowmini Varadhan /*
116344b099c4SSowmini Varadhan * If we are doing the strictest src_multihoming, then
116444b099c4SSowmini Varadhan * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
116544b099c4SSowmini Varadhan * an interface that is consistent with the source address.
116644b099c4SSowmini Varadhan */
116744b099c4SSowmini Varadhan if (verify_src && src_multihoming == 2 &&
116844b099c4SSowmini Varadhan !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
116944b099c4SSowmini Varadhan if (errorp != NULL)
117044b099c4SSowmini Varadhan *errorp = EADDRNOTAVAIL;
117144b099c4SSowmini Varadhan ill_refrele(ill);
117244b099c4SSowmini Varadhan /* Get a hold on the IRE_NOROUTE */
117344b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6);
1174bd670b35SErik Nordmark return (ire);
1175bd670b35SErik Nordmark }
1176bd670b35SErik Nordmark /* Get a refcnt on the single IRE_MULTICAST per ill */
1177bd670b35SErik Nordmark ire = ire_multicast(ill);
1178bd670b35SErik Nordmark ill_refrele(ill);
1179bd670b35SErik Nordmark if (generationp != NULL)
1180bd670b35SErik Nordmark *generationp = ire->ire_generation;
1181bd670b35SErik Nordmark if (errorp != NULL &&
1182bd670b35SErik Nordmark (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
1183bd670b35SErik Nordmark *errorp = EHOSTUNREACH;
1184bd670b35SErik Nordmark }
1185bd670b35SErik Nordmark return (ire);
1186bd670b35SErik Nordmark }
1187bd670b35SErik Nordmark
118844b099c4SSowmini Varadhan /* Now for unicast */
1189bd670b35SErik Nordmark if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
1190bd670b35SErik Nordmark if (ixaflags & IXAF_SCOPEID_SET) {
1191bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */
1192bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0);
1193bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
119444b099c4SSowmini Varadhan isv6, ipst);
1195bd670b35SErik Nordmark } else {
1196bd670b35SErik Nordmark ASSERT(ixa->ixa_ifindex != 0);
1197bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
119844b099c4SSowmini Varadhan isv6, ipst);
1199bd670b35SErik Nordmark }
1200bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) {
1201bd670b35SErik Nordmark ill_refrele(ill);
1202bd670b35SErik Nordmark ill = NULL;
1203bd670b35SErik Nordmark }
1204bd670b35SErik Nordmark if (ill == NULL) {
1205bd670b35SErik Nordmark if (errorp != NULL)
1206bd670b35SErik Nordmark *errorp = ENXIO;
1207bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */
120844b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6);
1209bd670b35SErik Nordmark return (ire);
1210bd670b35SErik Nordmark }
121144b099c4SSowmini Varadhan
121244b099c4SSowmini Varadhan match_args |= MATCH_IRE_ILL;
121344b099c4SSowmini Varadhan
1214bd670b35SErik Nordmark /*
1215bd670b35SErik Nordmark * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1216bd670b35SErik Nordmark * so for both of them we need to be able look for an under
1217bd670b35SErik Nordmark * interface.
1218bd670b35SErik Nordmark */
1219bd670b35SErik Nordmark if (IS_UNDER_IPMP(ill))
1220bd670b35SErik Nordmark match_args |= MATCH_IRE_TESTHIDDEN;
122144b099c4SSowmini Varadhan
122244b099c4SSowmini Varadhan /*
122344b099c4SSowmini Varadhan * If we are doing the strictest src_multihoming, then
122444b099c4SSowmini Varadhan * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
122544b099c4SSowmini Varadhan * an interface that is consistent with the source address.
122644b099c4SSowmini Varadhan */
122744b099c4SSowmini Varadhan if (src_multihoming == 2 &&
122844b099c4SSowmini Varadhan !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
122944b099c4SSowmini Varadhan if (errorp != NULL)
123044b099c4SSowmini Varadhan *errorp = EADDRNOTAVAIL;
123144b099c4SSowmini Varadhan ill_refrele(ill);
123244b099c4SSowmini Varadhan /* Get a hold on the IRE_NOROUTE */
123344b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6);
123444b099c4SSowmini Varadhan return (ire);
123544b099c4SSowmini Varadhan }
123644b099c4SSowmini Varadhan } else if (src_multihoming != 0 && verify_src) {
123744b099c4SSowmini Varadhan /* Look up the ill based on the source address */
123844b099c4SSowmini Varadhan ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
123944b099c4SSowmini Varadhan if (ill == NULL) {
124044b099c4SSowmini Varadhan char addrbuf[INET6_ADDRSTRLEN];
124144b099c4SSowmini Varadhan
124244b099c4SSowmini Varadhan ip3dbg(("%s not a valid src for unicast",
124344b099c4SSowmini Varadhan inet_ntop(AF_INET6, &v6src, addrbuf,
124444b099c4SSowmini Varadhan sizeof (addrbuf))));
124544b099c4SSowmini Varadhan if (errorp != NULL)
124644b099c4SSowmini Varadhan *errorp = EADDRNOTAVAIL;
124744b099c4SSowmini Varadhan /* Get a hold on the IRE_NOROUTE */
124844b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6);
124944b099c4SSowmini Varadhan return (ire);
125044b099c4SSowmini Varadhan }
125144b099c4SSowmini Varadhan match_args |= MATCH_IRE_SRC_ILL;
125244b099c4SSowmini Varadhan preferred_src_aware = (src_multihoming == 1);
1253bd670b35SErik Nordmark }
1254bd670b35SErik Nordmark
1255bd670b35SErik Nordmark if (ixaflags & IXAF_NEXTHOP_SET) {
1256bd670b35SErik Nordmark /* IP_NEXTHOP was set */
1257bd670b35SErik Nordmark v6nexthop = ixa->ixa_nexthop_v6;
1258bd670b35SErik Nordmark } else {
1259bd670b35SErik Nordmark v6nexthop = *v6dst;
1260bd670b35SErik Nordmark }
1261bd670b35SErik Nordmark
1262bd670b35SErik Nordmark ire_type = 0;
1263bd670b35SErik Nordmark
1264bd670b35SErik Nordmark /*
1265bd670b35SErik Nordmark * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1266bd670b35SErik Nordmark * we only look for an onlink IRE.
1267bd670b35SErik Nordmark */
1268bd670b35SErik Nordmark if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
1269bd670b35SErik Nordmark match_args |= MATCH_IRE_TYPE;
1270bd670b35SErik Nordmark ire_type = IRE_ONLINK;
1271bd670b35SErik Nordmark }
1272bd670b35SErik Nordmark
127344b099c4SSowmini Varadhan retry:
127444b099c4SSowmini Varadhan if (!isv6) {
1275bd670b35SErik Nordmark ipaddr_t v4nexthop;
1276bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY;
1277bd670b35SErik Nordmark
1278bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
1279bd670b35SErik Nordmark ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
12809e3469d3SErik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1281bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
1282bd670b35SErik Nordmark if (setsrcp != NULL)
1283bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1284bd670b35SErik Nordmark } else {
1285bd670b35SErik Nordmark ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
12869e3469d3SErik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1287bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
1288bd670b35SErik Nordmark }
1289bd670b35SErik Nordmark
1290bd670b35SErik Nordmark #ifdef DEBUG
1291bd670b35SErik Nordmark if (match_args & MATCH_IRE_TESTHIDDEN) {
1292bd670b35SErik Nordmark ip3dbg(("looking for hidden; dst %x ire %p\n",
1293bd670b35SErik Nordmark v4dst, (void *)ire));
1294bd670b35SErik Nordmark }
1295bd670b35SErik Nordmark #endif
129644b099c4SSowmini Varadhan if (ill != NULL) {
1297bd670b35SErik Nordmark ill_refrele(ill);
129844b099c4SSowmini Varadhan ill = NULL;
129944b099c4SSowmini Varadhan }
1300bd670b35SErik Nordmark if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1301bd670b35SErik Nordmark (ire->ire_type & IRE_MULTICAST)) {
130244b099c4SSowmini Varadhan if (preferred_src_aware) {
130344b099c4SSowmini Varadhan /*
130444b099c4SSowmini Varadhan * "Preferred Source Aware" send mode. If we cannot
130544b099c4SSowmini Varadhan * find an ire whose ire_ill had the desired source
130644b099c4SSowmini Varadhan * address retry after relaxing the ill matching
130744b099c4SSowmini Varadhan * constraint.
130844b099c4SSowmini Varadhan */
130944b099c4SSowmini Varadhan ire_refrele(ire);
131044b099c4SSowmini Varadhan preferred_src_aware = B_FALSE;
131144b099c4SSowmini Varadhan match_args &= ~MATCH_IRE_SRC_ILL;
131244b099c4SSowmini Varadhan goto retry;
131344b099c4SSowmini Varadhan }
1314bd670b35SErik Nordmark /* No ire_nce_cache */
1315bd670b35SErik Nordmark return (ire);
1316bd670b35SErik Nordmark }
1317bd670b35SErik Nordmark
1318bd670b35SErik Nordmark /* Setup ire_nce_cache if it doesn't exist or is condemned. */
1319bd670b35SErik Nordmark mutex_enter(&ire->ire_lock);
1320bd670b35SErik Nordmark nce = ire->ire_nce_cache;
1321bd670b35SErik Nordmark if (nce == NULL || nce->nce_is_condemned) {
1322bd670b35SErik Nordmark mutex_exit(&ire->ire_lock);
1323bd670b35SErik Nordmark (void) ire_revalidate_nce(ire);
1324bd670b35SErik Nordmark } else {
1325bd670b35SErik Nordmark mutex_exit(&ire->ire_lock);
1326bd670b35SErik Nordmark }
1327bd670b35SErik Nordmark return (ire);
1328bd670b35SErik Nordmark }
1329bd670b35SErik Nordmark
1330bd670b35SErik Nordmark /*
1331bd670b35SErik Nordmark * Find a route given some xmit attributes and a packet.
1332bd670b35SErik Nordmark * Generic for IPv4 and IPv6
1333bd670b35SErik Nordmark *
1334bd670b35SErik Nordmark * This never returns NULL. But when it returns the IRE_NOROUTE
1335bd670b35SErik Nordmark * it might set errorp.
1336bd670b35SErik Nordmark */
1337bd670b35SErik Nordmark ire_t *
ip_select_route_pkt(mblk_t * mp,ip_xmit_attr_t * ixa,uint_t * generationp,int * errorp,boolean_t * multirtp)1338bd670b35SErik Nordmark ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1339bd670b35SErik Nordmark int *errorp, boolean_t *multirtp)
1340bd670b35SErik Nordmark {
1341bd670b35SErik Nordmark if (ixa->ixa_flags & IXAF_IS_IPV4) {
1342bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr;
134344b099c4SSowmini Varadhan in6_addr_t v6dst, v6src;
1344bd670b35SErik Nordmark
1345bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
134644b099c4SSowmini Varadhan IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
1347bd670b35SErik Nordmark
134844b099c4SSowmini Varadhan return (ip_select_route(&v6dst, v6src, ixa, generationp,
1349bd670b35SErik Nordmark NULL, errorp, multirtp));
1350bd670b35SErik Nordmark } else {
1351bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1352bd670b35SErik Nordmark
135344b099c4SSowmini Varadhan return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
135444b099c4SSowmini Varadhan ixa, generationp, NULL, errorp, multirtp));
1355bd670b35SErik Nordmark }
1356bd670b35SErik Nordmark }
1357bd670b35SErik Nordmark
1358bd670b35SErik Nordmark ire_t *
ip_select_route_v4(ipaddr_t dst,ipaddr_t src,ip_xmit_attr_t * ixa,uint_t * generationp,ipaddr_t * v4setsrcp,int * errorp,boolean_t * multirtp)135944b099c4SSowmini Varadhan ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
136044b099c4SSowmini Varadhan uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1361bd670b35SErik Nordmark {
136244b099c4SSowmini Varadhan in6_addr_t v6dst, v6src;
1363bd670b35SErik Nordmark ire_t *ire;
1364bd670b35SErik Nordmark in6_addr_t setsrc;
1365bd670b35SErik Nordmark
1366bd670b35SErik Nordmark ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1367bd670b35SErik Nordmark
1368bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
136944b099c4SSowmini Varadhan IN6_IPADDR_TO_V4MAPPED(src, &v6src);
1370bd670b35SErik Nordmark
1371bd670b35SErik Nordmark setsrc = ipv6_all_zeros;
137244b099c4SSowmini Varadhan ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
1373bd670b35SErik Nordmark multirtp);
1374bd670b35SErik Nordmark if (v4setsrcp != NULL)
1375bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1376bd670b35SErik Nordmark return (ire);
1377bd670b35SErik Nordmark }
1378bd670b35SErik Nordmark
1379bd670b35SErik Nordmark /*
1380bd670b35SErik Nordmark * Recursively look for a route to the destination. Can also match on
1381bd670b35SErik Nordmark * the zoneid, ill, and label. Used for the data paths. See also
1382bd670b35SErik Nordmark * ire_route_recursive.
1383bd670b35SErik Nordmark *
13849e3469d3SErik Nordmark * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
13859e3469d3SErik Nordmark * create an IRE_IF_CLONE. This is used on the receive side when we are not
13869e3469d3SErik Nordmark * forwarding.
13879e3469d3SErik Nordmark * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
13889e3469d3SErik Nordmark * resolve the gateway.
13899e3469d3SErik Nordmark *
1390bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE
1391bd670b35SErik Nordmark * instead.
1392bd670b35SErik Nordmark *
1393bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1394bd670b35SErik Nordmark * is an error.
1395bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT.
1396bd670b35SErik Nordmark */
1397bd670b35SErik Nordmark ire_t *
ire_route_recursive_impl_v4(ire_t * ire,ipaddr_t nexthop,uint_t ire_type,const ill_t * ill_arg,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,ipaddr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)1398bd670b35SErik Nordmark ire_route_recursive_impl_v4(ire_t *ire,
1399bd670b35SErik Nordmark ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
1400bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
14019e3469d3SErik Nordmark uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1402bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1403bd670b35SErik Nordmark {
1404bd670b35SErik Nordmark int i, j;
1405bd670b35SErik Nordmark ire_t *ires[MAX_IRE_RECURSION];
1406bd670b35SErik Nordmark uint_t generation;
1407bd670b35SErik Nordmark uint_t generations[MAX_IRE_RECURSION];
1408bd670b35SErik Nordmark boolean_t need_refrele = B_FALSE;
1409bd670b35SErik Nordmark boolean_t invalidate = B_FALSE;
1410bd670b35SErik Nordmark ill_t *ill = NULL;
141101685f97SSowmini Varadhan uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
1412bd670b35SErik Nordmark
1413bd670b35SErik Nordmark if (setsrcp != NULL)
1414bd670b35SErik Nordmark ASSERT(*setsrcp == INADDR_ANY);
1415bd670b35SErik Nordmark if (gwattrp != NULL)
1416bd670b35SErik Nordmark ASSERT(*gwattrp == NULL);
1417bd670b35SErik Nordmark
1418bd670b35SErik Nordmark /*
1419bd670b35SErik Nordmark * We iterate up to three times to resolve a route, even though
1420bd670b35SErik Nordmark * we have four slots in the array. The extra slot is for an
1421bd670b35SErik Nordmark * IRE_IF_CLONE we might need to create.
1422bd670b35SErik Nordmark */
1423bd670b35SErik Nordmark i = 0;
1424bd670b35SErik Nordmark while (i < MAX_IRE_RECURSION - 1) {
1425bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */
1426bd670b35SErik Nordmark if (ire == NULL) {
1427bd670b35SErik Nordmark ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
142844b099c4SSowmini Varadhan (ill != NULL? ill : ill_arg), zoneid, tsl,
1429bd670b35SErik Nordmark match_args, xmit_hint, ipst, &generation);
1430bd670b35SErik Nordmark } else {
1431bd670b35SErik Nordmark /* Caller passed it; extra hold since we will rele */
1432bd670b35SErik Nordmark ire_refhold(ire);
1433bd670b35SErik Nordmark if (generationp != NULL)
1434bd670b35SErik Nordmark generation = *generationp;
1435bd670b35SErik Nordmark else
1436bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY;
1437bd670b35SErik Nordmark }
143801685f97SSowmini Varadhan if (ire == NULL) {
143901685f97SSowmini Varadhan if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
14409e3469d3SErik Nordmark ire = ires[0];
14419e3469d3SErik Nordmark ire_refhold(ire);
14429e3469d3SErik Nordmark } else {
1443bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE);
14449e3469d3SErik Nordmark }
1445bd670b35SErik Nordmark goto error;
1446bd670b35SErik Nordmark }
144701685f97SSowmini Varadhan
144801685f97SSowmini Varadhan /* Need to return the ire with RTF_REJECT|BLACKHOLE */
144901685f97SSowmini Varadhan if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
145001685f97SSowmini Varadhan goto error;
145101685f97SSowmini Varadhan
145201685f97SSowmini Varadhan ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1453fff7ec1dSSowmini Varadhan /*
1454fff7ec1dSSowmini Varadhan * Verify that the IRE_IF_CLONE has a consistent generation
1455fff7ec1dSSowmini Varadhan * number.
1456fff7ec1dSSowmini Varadhan */
1457fff7ec1dSSowmini Varadhan if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
1458fff7ec1dSSowmini Varadhan ire_refrele(ire);
1459fff7ec1dSSowmini Varadhan ire = NULL;
1460fff7ec1dSSowmini Varadhan continue;
1461fff7ec1dSSowmini Varadhan }
146201685f97SSowmini Varadhan
146301685f97SSowmini Varadhan /*
146401685f97SSowmini Varadhan * Don't allow anything unusual past the first iteration.
146501685f97SSowmini Varadhan * After the first lookup, we should no longer look for
146601685f97SSowmini Varadhan * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
146701685f97SSowmini Varadhan * routes.
146801685f97SSowmini Varadhan *
146901685f97SSowmini Varadhan * In addition, after we have found a direct IRE_OFFLINK,
147001685f97SSowmini Varadhan * we should only look for interface or clone routes.
147101685f97SSowmini Varadhan */
147201685f97SSowmini Varadhan match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
147301685f97SSowmini Varadhan
147401685f97SSowmini Varadhan if ((ire->ire_type & IRE_OFFLINK) &&
147501685f97SSowmini Varadhan !(ire->ire_flags & RTF_INDIRECT)) {
147601685f97SSowmini Varadhan ire_type = IRE_IF_ALL;
147701685f97SSowmini Varadhan } else {
147801685f97SSowmini Varadhan /*
147901685f97SSowmini Varadhan * no more local, loopback, broadcast routes
148001685f97SSowmini Varadhan */
148101685f97SSowmini Varadhan if (!(match_args & MATCH_IRE_TYPE))
148201685f97SSowmini Varadhan ire_type = (IRE_OFFLINK|IRE_ONLINK);
148301685f97SSowmini Varadhan ire_type &= ~maskoff;
1484bd670b35SErik Nordmark }
148501685f97SSowmini Varadhan match_args |= MATCH_IRE_TYPE;
148601685f97SSowmini Varadhan
1487bd670b35SErik Nordmark /* We have a usable IRE */
1488bd670b35SErik Nordmark ires[i] = ire;
1489bd670b35SErik Nordmark generations[i] = generation;
1490bd670b35SErik Nordmark i++;
1491bd670b35SErik Nordmark
1492bd670b35SErik Nordmark /* The first RTF_SETSRC address is passed back if setsrcp */
1493bd670b35SErik Nordmark if ((ire->ire_flags & RTF_SETSRC) &&
1494bd670b35SErik Nordmark setsrcp != NULL && *setsrcp == INADDR_ANY) {
1495bd670b35SErik Nordmark ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1496bd670b35SErik Nordmark *setsrcp = ire->ire_setsrc_addr;
1497bd670b35SErik Nordmark }
1498bd670b35SErik Nordmark
1499bd670b35SErik Nordmark /* The first ire_gw_secattr is passed back if gwattrp */
1500bd670b35SErik Nordmark if (ire->ire_gw_secattr != NULL &&
1501bd670b35SErik Nordmark gwattrp != NULL && *gwattrp == NULL)
1502bd670b35SErik Nordmark *gwattrp = ire->ire_gw_secattr;
1503bd670b35SErik Nordmark
1504bd670b35SErik Nordmark /*
1505bd670b35SErik Nordmark * Check if we have a short-cut pointer to an IRE for this
1506bd670b35SErik Nordmark * destination, and that the cached dependency isn't stale.
1507bd670b35SErik Nordmark * In that case we've rejoined an existing tree towards a
1508bd670b35SErik Nordmark * parent, thus we don't need to continue the loop to
1509bd670b35SErik Nordmark * discover the rest of the tree.
1510bd670b35SErik Nordmark */
1511bd670b35SErik Nordmark mutex_enter(&ire->ire_lock);
1512bd670b35SErik Nordmark if (ire->ire_dep_parent != NULL &&
1513bd670b35SErik Nordmark ire->ire_dep_parent->ire_generation ==
1514bd670b35SErik Nordmark ire->ire_dep_parent_generation) {
1515bd670b35SErik Nordmark mutex_exit(&ire->ire_lock);
1516bd670b35SErik Nordmark ire = NULL;
1517bd670b35SErik Nordmark goto done;
1518bd670b35SErik Nordmark }
1519bd670b35SErik Nordmark mutex_exit(&ire->ire_lock);
1520bd670b35SErik Nordmark
1521bd670b35SErik Nordmark /*
1522bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it
1523bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes
1524bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask.
1525bd670b35SErik Nordmark */
1526bd670b35SErik Nordmark if (ire->ire_nce_capable) {
1527bd670b35SErik Nordmark ire = NULL;
1528bd670b35SErik Nordmark goto done;
1529bd670b35SErik Nordmark }
1530bd670b35SErik Nordmark ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1531bd670b35SErik Nordmark /*
1532bd670b35SErik Nordmark * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1533bd670b35SErik Nordmark * particular destination
1534bd670b35SErik Nordmark */
1535bd670b35SErik Nordmark if (ire->ire_type & IRE_INTERFACE) {
1536bd670b35SErik Nordmark in6_addr_t v6nexthop;
1537bd670b35SErik Nordmark ire_t *clone;
1538bd670b35SErik Nordmark
1539bd670b35SErik Nordmark ASSERT(ire->ire_masklen != IPV4_ABITS);
1540bd670b35SErik Nordmark
1541bd670b35SErik Nordmark /*
1542bd670b35SErik Nordmark * In the case of ip_input and ILLF_FORWARDING not
15439e3469d3SErik Nordmark * being set, and in the case of RTM_GET, there is
15449e3469d3SErik Nordmark * no point in allocating an IRE_IF_CLONE. We return
15459e3469d3SErik Nordmark * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
15469e3469d3SErik Nordmark * result in a ire_dep_parent which is IRE_IF_*
15479e3469d3SErik Nordmark * without an IRE_IF_CLONE.
1548bd670b35SErik Nordmark * We recover from that when we need to send packets
1549bd670b35SErik Nordmark * by ensuring that the generations become
1550bd670b35SErik Nordmark * IRE_GENERATION_VERIFY in this case.
1551bd670b35SErik Nordmark */
15529e3469d3SErik Nordmark if (!(irr_flags & IRR_ALLOCATE)) {
1553bd670b35SErik Nordmark invalidate = B_TRUE;
1554bd670b35SErik Nordmark ire = NULL;
1555bd670b35SErik Nordmark goto done;
1556bd670b35SErik Nordmark }
1557bd670b35SErik Nordmark
1558bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
1559bd670b35SErik Nordmark
1560bd670b35SErik Nordmark clone = ire_create_if_clone(ire, &v6nexthop,
1561bd670b35SErik Nordmark &generation);
1562bd670b35SErik Nordmark if (clone == NULL) {
1563bd670b35SErik Nordmark /*
1564bd670b35SErik Nordmark * Temporary failure - no memory.
1565bd670b35SErik Nordmark * Don't want caller to cache IRE_NOROUTE.
1566bd670b35SErik Nordmark */
1567bd670b35SErik Nordmark invalidate = B_TRUE;
1568bd670b35SErik Nordmark ire = ire_blackhole(ipst, B_FALSE);
1569bd670b35SErik Nordmark goto error;
1570bd670b35SErik Nordmark }
1571bd670b35SErik Nordmark /*
1572bd670b35SErik Nordmark * Make clone next to last entry and the
1573bd670b35SErik Nordmark * IRE_INTERFACE the last in the dependency
1574bd670b35SErik Nordmark * chain since the clone depends on the
1575bd670b35SErik Nordmark * IRE_INTERFACE.
1576bd670b35SErik Nordmark */
1577bd670b35SErik Nordmark ASSERT(i >= 1);
1578bd670b35SErik Nordmark ASSERT(i < MAX_IRE_RECURSION);
1579bd670b35SErik Nordmark
1580bd670b35SErik Nordmark ires[i] = ires[i-1];
1581bd670b35SErik Nordmark generations[i] = generations[i-1];
1582bd670b35SErik Nordmark ires[i-1] = clone;
1583bd670b35SErik Nordmark generations[i-1] = generation;
1584bd670b35SErik Nordmark i++;
1585bd670b35SErik Nordmark
1586bd670b35SErik Nordmark ire = NULL;
1587bd670b35SErik Nordmark goto done;
1588bd670b35SErik Nordmark }
1589bd670b35SErik Nordmark
1590bd670b35SErik Nordmark /*
1591bd670b35SErik Nordmark * We only match on the type and optionally ILL when
1592bd670b35SErik Nordmark * recursing. The type match is used by some callers
1593bd670b35SErik Nordmark * to exclude certain types (such as IRE_IF_CLONE or
1594bd670b35SErik Nordmark * IRE_LOCAL|IRE_LOOPBACK).
159544b099c4SSowmini Varadhan *
159644b099c4SSowmini Varadhan * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
159744b099c4SSowmini Varadhan * ire->ire_ill, and we want to find the IRE_INTERFACE for
159844b099c4SSowmini Varadhan * ire_ill, so we set ill to the ire_ill;
1599bd670b35SErik Nordmark */
160001685f97SSowmini Varadhan match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1601bd670b35SErik Nordmark nexthop = ire->ire_gateway_addr;
1602bd670b35SErik Nordmark if (ill == NULL && ire->ire_ill != NULL) {
1603bd670b35SErik Nordmark ill = ire->ire_ill;
1604bd670b35SErik Nordmark need_refrele = B_TRUE;
1605bd670b35SErik Nordmark ill_refhold(ill);
1606bd670b35SErik Nordmark match_args |= MATCH_IRE_ILL;
1607bd670b35SErik Nordmark }
1608bd670b35SErik Nordmark ire = NULL;
1609bd670b35SErik Nordmark }
1610bd670b35SErik Nordmark ASSERT(ire == NULL);
1611bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE);
1612bd670b35SErik Nordmark
1613bd670b35SErik Nordmark error:
1614bd670b35SErik Nordmark ASSERT(ire != NULL);
1615bd670b35SErik Nordmark if (need_refrele)
1616bd670b35SErik Nordmark ill_refrele(ill);
1617bd670b35SErik Nordmark
1618bd670b35SErik Nordmark /*
1619bd670b35SErik Nordmark * In the case of MULTIRT we want to try a different IRE the next
1620bd670b35SErik Nordmark * time. We let the next packet retry in that case.
1621bd670b35SErik Nordmark */
1622bd670b35SErik Nordmark if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1623bd670b35SErik Nordmark (void) ire_no_good(ires[0]);
1624bd670b35SErik Nordmark
1625bd670b35SErik Nordmark cleanup:
1626bd670b35SErik Nordmark /* cleanup ires[i] */
1627bd670b35SErik Nordmark ire_dep_unbuild(ires, i);
1628bd670b35SErik Nordmark for (j = 0; j < i; j++)
1629bd670b35SErik Nordmark ire_refrele(ires[j]);
1630bd670b35SErik Nordmark
16319e3469d3SErik Nordmark ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
16329e3469d3SErik Nordmark (irr_flags & IRR_INCOMPLETE));
1633bd670b35SErik Nordmark /*
1634bd670b35SErik Nordmark * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1635bd670b35SErik Nordmark * ip_select_route since the reject or lack of memory might be gone.
1636bd670b35SErik Nordmark */
1637bd670b35SErik Nordmark if (generationp != NULL)
1638bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY;
1639bd670b35SErik Nordmark return (ire);
1640bd670b35SErik Nordmark
1641bd670b35SErik Nordmark done:
1642bd670b35SErik Nordmark ASSERT(ire == NULL);
1643bd670b35SErik Nordmark if (need_refrele) {
1644bd670b35SErik Nordmark ill_refrele(ill);
1645bd670b35SErik Nordmark ill = NULL;
1646bd670b35SErik Nordmark }
1647bd670b35SErik Nordmark
1648bd670b35SErik Nordmark /* Build dependencies */
1649188e1664SErik Nordmark if (i > 1 && !ire_dep_build(ires, generations, i)) {
1650bd670b35SErik Nordmark /* Something in chain was condemned; tear it apart */
1651bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE);
1652bd670b35SErik Nordmark goto cleanup;
1653bd670b35SErik Nordmark }
1654bd670b35SErik Nordmark
1655bd670b35SErik Nordmark /*
1656bd670b35SErik Nordmark * Release all refholds except the one for ires[0] that we
1657bd670b35SErik Nordmark * will return to the caller.
1658bd670b35SErik Nordmark */
1659bd670b35SErik Nordmark for (j = 1; j < i; j++)
1660bd670b35SErik Nordmark ire_refrele(ires[j]);
1661bd670b35SErik Nordmark
1662bd670b35SErik Nordmark if (invalidate) {
1663bd670b35SErik Nordmark /*
1664bd670b35SErik Nordmark * Since we needed to allocate but couldn't we need to make
1665bd670b35SErik Nordmark * sure that the dependency chain is rebuilt the next time.
1666bd670b35SErik Nordmark */
1667bd670b35SErik Nordmark ire_dep_invalidate_generations(ires[0]);
1668bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY;
1669bd670b35SErik Nordmark } else {
1670bd670b35SErik Nordmark /*
1671bd670b35SErik Nordmark * IREs can have been added or deleted while we did the
1672bd670b35SErik Nordmark * recursive lookup and we can't catch those until we've built
1673bd670b35SErik Nordmark * the dependencies. We verify the stored
1674bd670b35SErik Nordmark * ire_dep_parent_generation to catch any such changes and
1675bd670b35SErik Nordmark * return IRE_GENERATION_VERIFY (which will cause
1676bd670b35SErik Nordmark * ip_select_route to be called again so we can redo the
1677bd670b35SErik Nordmark * recursive lookup next time we send a packet.
1678bd670b35SErik Nordmark */
1679188e1664SErik Nordmark if (ires[0]->ire_dep_parent == NULL)
1680188e1664SErik Nordmark generation = ires[0]->ire_generation;
1681188e1664SErik Nordmark else
1682bd670b35SErik Nordmark generation = ire_dep_validate_generations(ires[0]);
1683bd670b35SErik Nordmark if (generations[0] != ires[0]->ire_generation) {
1684bd670b35SErik Nordmark /* Something changed at the top */
1685bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY;
1686bd670b35SErik Nordmark }
1687bd670b35SErik Nordmark }
1688bd670b35SErik Nordmark if (generationp != NULL)
1689bd670b35SErik Nordmark *generationp = generation;
1690bd670b35SErik Nordmark
1691bd670b35SErik Nordmark return (ires[0]);
1692bd670b35SErik Nordmark }
1693bd670b35SErik Nordmark
1694bd670b35SErik Nordmark ire_t *
ire_route_recursive_v4(ipaddr_t nexthop,uint_t ire_type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,ipaddr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)1695bd670b35SErik Nordmark ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1696bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
16979e3469d3SErik Nordmark uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1698bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1699bd670b35SErik Nordmark {
1700bd670b35SErik Nordmark return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
17019e3469d3SErik Nordmark zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1702bd670b35SErik Nordmark gwattrp, generationp));
1703bd670b35SErik Nordmark }
1704bd670b35SErik Nordmark
1705bd670b35SErik Nordmark /*
1706bd670b35SErik Nordmark * Recursively look for a route to the destination.
1707bd670b35SErik Nordmark * We only handle a destination match here, yet we have the same arguments
1708bd670b35SErik Nordmark * as the full match to allow function pointers to select between the two.
1709bd670b35SErik Nordmark *
1710bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE
1711bd670b35SErik Nordmark * instead.
1712bd670b35SErik Nordmark *
1713bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1714bd670b35SErik Nordmark * is an error.
1715bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT.
1716bd670b35SErik Nordmark */
1717bd670b35SErik Nordmark ire_t *
ire_route_recursive_dstonly_v4(ipaddr_t nexthop,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst)17189e3469d3SErik Nordmark ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
1719bd670b35SErik Nordmark uint32_t xmit_hint, ip_stack_t *ipst)
1720bd670b35SErik Nordmark {
1721bd670b35SErik Nordmark ire_t *ire;
1722bd670b35SErik Nordmark ire_t *ire1;
1723bd670b35SErik Nordmark uint_t generation;
1724bd670b35SErik Nordmark
1725bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */
1726bd670b35SErik Nordmark ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1727bd670b35SErik Nordmark &generation);
1728bd670b35SErik Nordmark ASSERT(ire != NULL);
1729fff7ec1dSSowmini Varadhan /*
1730fff7ec1dSSowmini Varadhan * If the IRE has a current cached parent we know that the whole
1731fff7ec1dSSowmini Varadhan * parent chain is current, hence we don't need to discover and
1732fff7ec1dSSowmini Varadhan * build any dependencies by doing a recursive lookup.
1733fff7ec1dSSowmini Varadhan */
1734fff7ec1dSSowmini Varadhan mutex_enter(&ire->ire_lock);
1735fff7ec1dSSowmini Varadhan if (ire->ire_dep_parent != NULL) {
1736fff7ec1dSSowmini Varadhan if (ire->ire_dep_parent->ire_generation ==
1737fff7ec1dSSowmini Varadhan ire->ire_dep_parent_generation) {
1738fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock);
1739fff7ec1dSSowmini Varadhan return (ire);
1740fff7ec1dSSowmini Varadhan }
1741fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock);
1742fff7ec1dSSowmini Varadhan } else {
1743fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock);
1744bd670b35SErik Nordmark /*
1745bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it
1746bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes
1747bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask.
1748bd670b35SErik Nordmark */
1749bd670b35SErik Nordmark if (ire->ire_nce_capable)
1750bd670b35SErik Nordmark return (ire);
1751bd670b35SErik Nordmark }
1752bd670b35SErik Nordmark
1753bd670b35SErik Nordmark /*
1754bd670b35SErik Nordmark * Fallback to loop in the normal code starting with the ire
1755bd670b35SErik Nordmark * we found. Normally this would return the same ire.
1756bd670b35SErik Nordmark */
1757bd670b35SErik Nordmark ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
17589e3469d3SErik Nordmark NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1759bd670b35SErik Nordmark &generation);
1760bd670b35SErik Nordmark ire_refrele(ire);
1761bd670b35SErik Nordmark return (ire1);
1762bd670b35SErik Nordmark }
1763fff7ec1dSSowmini Varadhan
1764fff7ec1dSSowmini Varadhan /*
1765fff7ec1dSSowmini Varadhan * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1766fff7ec1dSSowmini Varadhan * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1767fff7ec1dSSowmini Varadhan * are not consistent, and TRUE otherwise.
1768fff7ec1dSSowmini Varadhan */
1769fff7ec1dSSowmini Varadhan boolean_t
ire_clone_verify(ire_t * ire)1770fff7ec1dSSowmini Varadhan ire_clone_verify(ire_t *ire)
1771fff7ec1dSSowmini Varadhan {
1772fff7ec1dSSowmini Varadhan ASSERT((ire->ire_type & IRE_IF_CLONE) != 0);
1773fff7ec1dSSowmini Varadhan mutex_enter(&ire->ire_lock);
1774fff7ec1dSSowmini Varadhan if (ire->ire_dep_parent != NULL &&
1775fff7ec1dSSowmini Varadhan ire->ire_dep_parent->ire_generation !=
1776fff7ec1dSSowmini Varadhan ire->ire_dep_parent_generation) {
1777fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock);
1778fff7ec1dSSowmini Varadhan ire_delete(ire);
1779fff7ec1dSSowmini Varadhan return (B_FALSE);
1780fff7ec1dSSowmini Varadhan }
1781fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock);
1782fff7ec1dSSowmini Varadhan return (B_TRUE);
1783fff7ec1dSSowmini Varadhan }
1784