1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2021 Racktop Systems, Inc.
24 */
25
26 /*
27 * This file contains consumer routines of the IPv4 forwarding engine
28 */
29
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #include <sys/dlpi.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/policy.h>
38
39 #include <sys/systm.h>
40 #include <sys/strsun.h>
41 #include <sys/kmem.h>
42 #include <sys/param.h>
43 #include <sys/socket.h>
44 #include <sys/strsubr.h>
45 #include <net/if.h>
46 #include <net/route.h>
47 #include <netinet/in.h>
48 #include <net/if_dl.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51
52 #include <inet/ipsec_impl.h>
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ip6.h>
59 #include <inet/ip_ndp.h>
60 #include <inet/arp.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_ftable.h>
64 #include <inet/ip_rts.h>
65 #include <inet/nd.h>
66
67 #include <net/pfkeyv2.h>
68 #include <inet/sadb.h>
69 #include <inet/tcp.h>
70 #include <inet/ipclassifier.h>
71 #include <sys/zone.h>
72 #include <net/radix.h>
73 #include <sys/tsol/label.h>
74 #include <sys/tsol/tnet.h>
75
/*
 * Evaluates to 1 when ire is an IRE_DEFAULT, or when it is an IRE_INTERFACE
 * whose ire_addr is 0; evaluates to 0 otherwise.
 * Note: the argument is evaluated more than once.
 */
#define	IS_DEFAULT_ROUTE(ire)						\
	((((ire)->ire_type & IRE_DEFAULT) != 0) ||			\
	((((ire)->ire_type & IRE_INTERFACE) != 0) &&			\
	((ire)->ire_addr == 0)))
79
/*
 * Select the strict source multihoming setting for an address family:
 * the IPv6 one when isv6 is non-zero, the IPv4 one otherwise.
 * Both arguments are parenthesized in the expansion so that callers may
 * pass arbitrary expressions (e.g. a conditional, or pointer arithmetic).
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)					\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :		\
	(ipst)->ips_ip_strict_src_multihoming)
83
84 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
85 static void ire_del_host_redir(ire_t *, char *);
86 static boolean_t ire_find_best_route(struct radix_node *, void *);
87
88 /*
89 * Lookup a route in forwarding table. A specific lookup is indicated by
90 * passing the required parameters and indicating the match required in the
91 * flag field.
92 *
93 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
94 */
95 ire_t *
ire_ftable_lookup_v4(ipaddr_t addr,ipaddr_t mask,ipaddr_t gateway,int type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,int flags,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)96 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
97 int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
98 int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
99 {
100 ire_t *ire;
101 struct rt_sockaddr rdst, rmask;
102 struct rt_entry *rt;
103 ire_ftable_args_t margs;
104
105 ASSERT(ill == NULL || !ill->ill_isv6);
106
107 /*
108 * ire_match_args() will dereference ill if MATCH_IRE_ILL
109 * is set.
110 */
111 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
112 return (NULL);
113
114 bzero(&rdst, sizeof (rdst));
115 rdst.rt_sin_len = sizeof (rdst);
116 rdst.rt_sin_family = AF_INET;
117 rdst.rt_sin_addr.s_addr = addr;
118
119 bzero(&rmask, sizeof (rmask));
120 rmask.rt_sin_len = sizeof (rmask);
121 rmask.rt_sin_family = AF_INET;
122 rmask.rt_sin_addr.s_addr = mask;
123
124 bzero(&margs, sizeof (margs));
125 margs.ift_addr = addr;
126 margs.ift_mask = mask;
127 margs.ift_gateway = gateway;
128 margs.ift_type = type;
129 margs.ift_ill = ill;
130 margs.ift_zoneid = zoneid;
131 margs.ift_tsl = tsl;
132 margs.ift_flags = flags;
133
134 /*
135 * The flags argument passed to ire_ftable_lookup may cause the
136 * search to return, not the longest matching prefix, but the
137 * "best matching prefix", i.e., the longest prefix that also
138 * satisfies constraints imposed via the permutation of flags
139 * passed in. To achieve this, we invoke ire_match_args() on
140 * each matching leaf in the radix tree. ire_match_args is
141 * invoked by the callback function ire_find_best_route()
142 * We hold the global tree lock in read mode when calling
143 * rn_match_args. Before dropping the global tree lock, ensure
144 * that the radix node can't be deleted by incrementing ire_refcnt.
145 */
146 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
147 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
148 ipst->ips_ip_ftable, ire_find_best_route, &margs);
149 ire = margs.ift_best_ire;
150 if (rt == NULL) {
151 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
152 return (NULL);
153 }
154 ASSERT(ire != NULL);
155
156 DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
157
158 /*
159 * round-robin only if we have more than one route in the bucket.
160 * ips_ip_ecmp_behavior controls when we do ECMP
161 * 2: always
162 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
163 * 0: never
164 */
165 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
166 if (ipst->ips_ip_ecmp_behavior == 2 ||
167 (ipst->ips_ip_ecmp_behavior == 1 &&
168 IS_DEFAULT_ROUTE(ire))) {
169 ire_t *next_ire;
170
171 margs.ift_best_ire = NULL;
172 next_ire = ire_round_robin(ire->ire_bucket, &margs,
173 xmit_hint, ire, ipst);
174 if (next_ire == NULL) {
175 /* keep ire if next_ire is null */
176 goto done;
177 }
178 ire_refrele(ire);
179 ire = next_ire;
180 }
181 }
182
183 done:
184 /* Return generation before dropping lock */
185 if (generationp != NULL)
186 *generationp = ire->ire_generation;
187
188 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
189
190 /*
191 * For shared-IP zones we need additional checks to what was
192 * done in ire_match_args to make sure IRE_LOCALs are handled.
193 *
194 * When ip_restrict_interzone_loopback is set, then
195 * we ensure that IRE_LOCAL are only used for loopback
196 * between zones when the logical "Ethernet" would
197 * have looped them back. That is, if in the absense of
198 * the IRE_LOCAL we would have sent to packet out the
199 * same ill.
200 */
201 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
202 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
203 ipst->ips_ip_restrict_interzone_loopback) {
204 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
205 ASSERT(ire != NULL);
206 }
207 return (ire);
208 }
209
210 /*
211 * This function is called by
212 * ip_input/ire_route_recursive when doing a route lookup on only the
213 * destination address.
214 *
215 * The optimizations of this function over ire_ftable_lookup are:
216 * o removing unnecessary flag matching
217 * o doing longest prefix match instead of overloading it further
218 * with the unnecessary "best_prefix_match"
219 *
220 * If no route is found we return IRE_NOROUTE.
221 */
222 ire_t *
ire_ftable_lookup_simple_v4(ipaddr_t addr,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)223 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
224 uint_t *generationp)
225 {
226 ire_t *ire;
227 struct rt_sockaddr rdst;
228 struct rt_entry *rt;
229 irb_t *irb;
230
231 rdst.rt_sin_len = sizeof (rdst);
232 rdst.rt_sin_family = AF_INET;
233 rdst.rt_sin_addr.s_addr = addr;
234
235 /*
236 * This is basically inlining a simpler version of ire_match_args
237 */
238 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
239
240 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
241 ipst->ips_ip_ftable, NULL, NULL);
242
243 if (rt == NULL)
244 goto bad;
245
246 irb = &rt->rt_irb;
247 if (irb->irb_ire_cnt == 0)
248 goto bad;
249
250 rw_enter(&irb->irb_lock, RW_READER);
251 ire = irb->irb_ire;
252 if (ire == NULL) {
253 rw_exit(&irb->irb_lock);
254 goto bad;
255 }
256 while (IRE_IS_CONDEMNED(ire)) {
257 ire = ire->ire_next;
258 if (ire == NULL) {
259 rw_exit(&irb->irb_lock);
260 goto bad;
261 }
262 }
263
264 /* we have a ire that matches */
265 ire_refhold(ire);
266 rw_exit(&irb->irb_lock);
267
268 /*
269 * round-robin only if we have more than one route in the bucket.
270 * ips_ip_ecmp_behavior controls when we do ECMP
271 * 2: always
272 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
273 * 0: never
274 *
275 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
276 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
277 * and the IRE_INTERFACESs are likely to be shorter matches.
278 */
279 if (ire->ire_bucket->irb_ire_cnt > 1) {
280 if (ipst->ips_ip_ecmp_behavior == 2 ||
281 (ipst->ips_ip_ecmp_behavior == 1 &&
282 IS_DEFAULT_ROUTE(ire))) {
283 ire_t *next_ire;
284 ire_ftable_args_t margs;
285
286 bzero(&margs, sizeof (margs));
287 margs.ift_addr = addr;
288 margs.ift_zoneid = ALL_ZONES;
289
290 next_ire = ire_round_robin(ire->ire_bucket, &margs,
291 xmit_hint, ire, ipst);
292 if (next_ire == NULL) {
293 /* keep ire if next_ire is null */
294 if (generationp != NULL)
295 *generationp = ire->ire_generation;
296 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
297 return (ire);
298 }
299 ire_refrele(ire);
300 ire = next_ire;
301 }
302 }
303 /* Return generation before dropping lock */
304 if (generationp != NULL)
305 *generationp = ire->ire_generation;
306
307 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
308
309 /*
310 * Since we only did ALL_ZONES matches there is no special handling
311 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
312 */
313 return (ire);
314
315 bad:
316 if (generationp != NULL)
317 *generationp = IRE_GENERATION_VERIFY;
318
319 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
320 return (ire_reject(ipst, B_FALSE));
321 }
322
323 /*
324 * Find the ill matching a multicast group.
325 * Allows different routes for multicast addresses
326 * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
327 * which point at different interfaces. This is used when IP_MULTICAST_IF
328 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
329 * specify the interface to join on.
330 *
331 * Supports link-local addresses by using ire_route_recursive which follows
332 * the ill when recursing.
333 *
334 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
335 * and the MULTIRT property can be different for different groups, we
336 * extract RTF_MULTIRT from the special unicast route added for a group
337 * with CGTP and pass that back in the multirtp argument.
338 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
339 * We have a setsrcp argument for the same reason.
340 */
341 ill_t *
ire_lookup_multi_ill_v4(ipaddr_t group,zoneid_t zoneid,ip_stack_t * ipst,boolean_t * multirtp,ipaddr_t * setsrcp)342 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
343 boolean_t *multirtp, ipaddr_t *setsrcp)
344 {
345 ire_t *ire;
346 ill_t *ill;
347
348 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
349 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
350 ASSERT(ire != NULL);
351 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
352 ire_refrele(ire);
353 return (NULL);
354 }
355
356 if (multirtp != NULL)
357 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
358
359 ill = ire_nexthop_ill(ire);
360 ire_refrele(ire);
361 return (ill);
362 }
363
364 /*
365 * Delete the passed in ire if the gateway addr matches
366 */
367 void
ire_del_host_redir(ire_t * ire,char * gateway)368 ire_del_host_redir(ire_t *ire, char *gateway)
369 {
370 if ((ire->ire_flags & RTF_DYNAMIC) &&
371 (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
372 ire_delete(ire);
373 }
374
375 /*
376 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
377 * pointing at the specified gateway and
378 * delete them. This routine is called only
379 * when a default gateway is going away.
380 */
381 void
ire_delete_host_redirects(ipaddr_t gateway,ip_stack_t * ipst)382 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
383 {
384 struct rtfuncarg rtfarg;
385
386 bzero(&rtfarg, sizeof (rtfarg));
387 rtfarg.rt_func = ire_del_host_redir;
388 rtfarg.rt_arg = (void *)&gateway;
389 rtfarg.rt_zoneid = ALL_ZONES;
390 rtfarg.rt_ipst = ipst;
391 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
392 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
393 }
394
395 /*
396 * Obtain the rt_entry and rt_irb for the route to be added to
397 * the ips_ip_ftable.
398 * First attempt to add a node to the radix tree via rn_addroute. If the
399 * route already exists, return the bucket for the existing route.
400 *
401 * Locking notes: Need to hold the global radix tree lock in write mode to
402 * add a radix node. To prevent the node from being deleted, ire_get_bucket()
403 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
404 * while holding the irb_lock, but not the radix tree lock.
405 */
406 irb_t *
ire_get_bucket(ire_t * ire)407 ire_get_bucket(ire_t *ire)
408 {
409 struct radix_node *rn;
410 struct rt_entry *rt;
411 struct rt_sockaddr rmask, rdst;
412 irb_t *irb = NULL;
413 ip_stack_t *ipst = ire->ire_ipst;
414
415 ASSERT(ipst->ips_ip_ftable != NULL);
416
417 /* first try to see if route exists (based on rtalloc1) */
418 bzero(&rdst, sizeof (rdst));
419 rdst.rt_sin_len = sizeof (rdst);
420 rdst.rt_sin_family = AF_INET;
421 rdst.rt_sin_addr.s_addr = ire->ire_addr;
422
423 bzero(&rmask, sizeof (rmask));
424 rmask.rt_sin_len = sizeof (rmask);
425 rmask.rt_sin_family = AF_INET;
426 rmask.rt_sin_addr.s_addr = ire->ire_mask;
427
428 /*
429 * add the route. based on BSD's rtrequest1(RTM_ADD)
430 */
431 R_Malloc(rt, rt_entry_cache, sizeof (*rt));
432 /* kmem_alloc failed */
433 if (rt == NULL)
434 return (NULL);
435
436 bzero(rt, sizeof (*rt));
437 rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
438 rt->rt_dst = rdst;
439 irb = &rt->rt_irb;
440 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
441 irb->irb_ipst = ipst;
442 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
443 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
444 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
445 ipst->ips_ip_ftable, (struct radix_node *)rt);
446 if (rn == NULL) {
447 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
448 Free(rt, rt_entry_cache);
449 rt = NULL;
450 irb = NULL;
451 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
452 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
453 ipst->ips_ip_ftable);
454 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
455 /* found a non-root match */
456 rt = (struct rt_entry *)rn;
457 }
458 }
459 if (rt != NULL) {
460 irb = &rt->rt_irb;
461 irb_refhold(irb);
462 }
463 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
464 return (irb);
465 }
466
467 /*
468 * This function is used when the caller wants to know the outbound
469 * interface for a packet given only the address.
470 * If this is a offlink IP address and there are multiple
471 * routes to this destination, this routine will utilise the
472 * first route it finds to IP address
473 * Return values:
474 * 0 - FAILURE
475 * nonzero - ifindex
476 */
477 uint_t
ifindex_lookup(const struct sockaddr * ipaddr,zoneid_t zoneid)478 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
479 {
480 uint_t ifindex = 0;
481 ire_t *ire;
482 ill_t *ill;
483 netstack_t *ns;
484 ip_stack_t *ipst;
485
486 if (zoneid == ALL_ZONES)
487 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
488 else
489 ns = netstack_find_by_zoneid(zoneid);
490 ASSERT(ns != NULL);
491
492 /*
493 * For exclusive stacks we set the zoneid to zero
494 * since IP uses the global zoneid in the exclusive stacks.
495 */
496 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
497 zoneid = GLOBAL_ZONEID;
498 ipst = ns->netstack_ip;
499
500 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
501
502 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
503 ill = ire_nexthop_ill(ire);
504 if (ill != NULL) {
505 ifindex = ill->ill_phyint->phyint_ifindex;
506 ill_refrele(ill);
507 }
508 ire_refrele(ire);
509 }
510 netstack_rele(ns);
511 return (ifindex);
512 }
513
514 /*
515 * Routine to find the route to a destination. If a ifindex is supplied
516 * it tries to match the route to the corresponding ipif for the ifindex
517 */
518 static ire_t *
route_to_dst(const struct sockaddr * dst_addr,zoneid_t zoneid,ip_stack_t * ipst)519 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
520 {
521 ire_t *ire = NULL;
522 int match_flags;
523
524 match_flags = MATCH_IRE_DSTONLY;
525
526 /* XXX pass NULL tsl for now */
527
528 if (dst_addr->sa_family == AF_INET) {
529 ire = ire_route_recursive_v4(
530 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
531 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
532 NULL, NULL);
533 } else {
534 ire = ire_route_recursive_v6(
535 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
536 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
537 NULL, NULL);
538 }
539 ASSERT(ire != NULL);
540 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
541 ire_refrele(ire);
542 return (NULL);
543 }
544 return (ire);
545 }
546
547 /*
548 * This routine is called by IP Filter to send a packet out on the wire
549 * to a specified dstination (which may be onlink or offlink). The ifindex may
550 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
551 * an outgoing interface and requires the nexthop to be on that interface.
552 * IP WILL NOT DO the following to the data packet before sending it out:
553 * a. manipulate ttl
554 * b. ipsec work
555 * c. fragmentation
556 *
557 * If the packet has been prepared for hardware checksum then it will be
558 * passed off to ip_send_align_cksum() to check that the flags set on the
559 * packet are in alignment with the capabilities of the new outgoing NIC.
560 *
561 * Return values:
562 * 0: IP was able to send of the data pkt
563 * ECOMM: Could not send packet
564 * ENONET No route to dst. It is up to the caller
565 * to send icmp unreachable error message,
566 * EINPROGRESS The macaddr of the onlink dst or that
567 * of the offlink dst's nexthop needs to get
568 * resolved before packet can be sent to dst.
569 * Thus transmission is not guaranteed.
570 * Note: No longer have visibility to the ARP queue
571 * hence no EINPROGRESS.
572 */
573 int
ipfil_sendpkt(const struct sockaddr * dst_addr,mblk_t * mp,uint_t ifindex,zoneid_t zoneid)574 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
575 zoneid_t zoneid)
576 {
577 ipaddr_t nexthop;
578 netstack_t *ns;
579 ip_stack_t *ipst;
580 ip_xmit_attr_t ixas;
581 int error;
582
583 ASSERT(mp != NULL);
584
585 if (zoneid == ALL_ZONES)
586 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
587 else
588 ns = netstack_find_by_zoneid(zoneid);
589 ASSERT(ns != NULL);
590
591 /*
592 * For exclusive stacks we set the zoneid to zero
593 * since IP uses the global zoneid in the exclusive stacks.
594 */
595 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
596 zoneid = GLOBAL_ZONEID;
597 ipst = ns->netstack_ip;
598
599 ASSERT(dst_addr->sa_family == AF_INET ||
600 dst_addr->sa_family == AF_INET6);
601
602 bzero(&ixas, sizeof (ixas));
603 /*
604 * No IPsec, no fragmentation, and don't let any hooks see
605 * the packet.
606 */
607 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
608 ixas.ixa_cred = kcred;
609 ixas.ixa_cpid = NOPID;
610 ixas.ixa_tsl = NULL;
611 ixas.ixa_ipst = ipst;
612 ixas.ixa_ifindex = ifindex;
613
614 if (dst_addr->sa_family == AF_INET) {
615 ipha_t *ipha = (ipha_t *)mp->b_rptr;
616
617 ixas.ixa_flags |= IXAF_IS_IPV4;
618 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
619 if (nexthop != ipha->ipha_dst) {
620 ixas.ixa_flags |= IXAF_NEXTHOP_SET;
621 ixas.ixa_nexthop_v4 = nexthop;
622 }
623 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
624 } else {
625 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
626 in6_addr_t *nexthop6;
627
628 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
629 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
630 ixas.ixa_flags |= IXAF_NEXTHOP_SET;
631 ixas.ixa_nexthop_v6 = *nexthop6;
632 }
633 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
634 }
635 error = ip_output_simple(mp, &ixas);
636 ixa_cleanup(&ixas);
637
638 netstack_rele(ns);
639 switch (error) {
640 case 0:
641 break;
642
643 case EHOSTUNREACH:
644 case ENETUNREACH:
645 error = ENONET;
646 break;
647
648 default:
649 error = ECOMM;
650 break;
651 }
652 return (error);
653 }
654
655 /*
656 * callback function provided by ire_ftable_lookup when calling
657 * rn_match_args(). Invoke ire_match_args on each matching leaf node in
658 * the radix tree.
659 */
660 boolean_t
ire_find_best_route(struct radix_node * rn,void * arg)661 ire_find_best_route(struct radix_node *rn, void *arg)
662 {
663 struct rt_entry *rt = (struct rt_entry *)rn;
664 irb_t *irb_ptr;
665 ire_t *ire;
666 ire_ftable_args_t *margs = arg;
667 ipaddr_t match_mask;
668
669 ASSERT(rt != NULL);
670
671 irb_ptr = &rt->rt_irb;
672
673 if (irb_ptr->irb_ire_cnt == 0)
674 return (B_FALSE);
675
676 rw_enter(&irb_ptr->irb_lock, RW_READER);
677 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
678 if (IRE_IS_CONDEMNED(ire))
679 continue;
680 ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
681 if (margs->ift_flags & MATCH_IRE_MASK)
682 match_mask = margs->ift_mask;
683 else
684 match_mask = ire->ire_mask;
685
686 if (ire_match_args(ire, margs->ift_addr, match_mask,
687 margs->ift_gateway, margs->ift_type, margs->ift_ill,
688 margs->ift_zoneid, margs->ift_tsl,
689 margs->ift_flags)) {
690 ire_refhold(ire);
691 rw_exit(&irb_ptr->irb_lock);
692 margs->ift_best_ire = ire;
693 return (B_TRUE);
694 }
695 }
696 rw_exit(&irb_ptr->irb_lock);
697 return (B_FALSE);
698 }
699
700 /*
701 * ftable irb_t structures are dynamically allocated, and we need to
702 * check if the irb_t (and associated ftable tree attachment) needs to
703 * be cleaned up when the irb_refcnt goes to 0. The conditions that need
704 * be verified are:
705 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
706 * - no other threads holding references to ire's in the bucket,
707 * i.e., irb_nire == 0
708 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
709 * - need to hold the global tree lock and irb_lock in write mode.
710 */
711 void
irb_refrele_ftable(irb_t * irb)712 irb_refrele_ftable(irb_t *irb)
713 {
714 for (;;) {
715 rw_enter(&irb->irb_lock, RW_WRITER);
716 ASSERT(irb->irb_refcnt != 0);
717 if (irb->irb_refcnt != 1) {
718 /*
719 * Someone has a reference to this radix node
720 * or there is some bucket walker.
721 */
722 irb->irb_refcnt--;
723 rw_exit(&irb->irb_lock);
724 return;
725 } else {
726 /*
727 * There is no other walker, nor is there any
728 * other thread that holds a direct ref to this
729 * radix node. Do the clean up if needed. Call
730 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
731 */
732 if (irb->irb_marks & IRB_MARK_CONDEMNED) {
733 ire_t *ire_list;
734
735 ire_list = ire_unlink(irb);
736 rw_exit(&irb->irb_lock);
737
738 if (ire_list != NULL)
739 ire_cleanup(ire_list);
740 /*
741 * more CONDEMNED entries could have
742 * been added while we dropped the lock,
743 * so we have to re-check.
744 */
745 continue;
746 }
747
748 /*
749 * Now check if there are still any ires
750 * associated with this radix node.
751 */
752 if (irb->irb_nire != 0) {
753 /*
754 * someone is still holding on
755 * to ires in this bucket
756 */
757 irb->irb_refcnt--;
758 rw_exit(&irb->irb_lock);
759 return;
760 } else {
761 /*
762 * Everything is clear. Zero walkers,
763 * Zero threads with a ref to this
764 * radix node, Zero ires associated with
765 * this radix node. Due to lock order,
766 * check the above conditions again
767 * after grabbing all locks in the right order
768 */
769 rw_exit(&irb->irb_lock);
770 if (irb_inactive(irb))
771 return;
772 /*
773 * irb_inactive could not free the irb.
774 * See if there are any walkers, if not
775 * try to clean up again.
776 */
777 }
778 }
779 }
780 }
781
782 /*
783 * IRE iterator used by ire_ftable_lookup to process multiple equal
784 * routes. Given a starting point in the hash list (hash), walk the IREs
785 * in the bucket skipping deleted entries. We treat the bucket as a circular
786 * list for the purposes of walking it.
787 * Returns the IRE (held) that corresponds to the hash value. If that IRE is
788 * not applicable (ire_match_args failed) then it returns a subsequent one.
789 * If we fail to find an IRE we return NULL.
790 *
791 * Assumes that the caller holds a reference on the IRE bucket and a read lock
792 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
793 *
794 * Applies to IPv4 and IPv6.
795 *
796 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
797 * address and bucket, we compare against ire_type for the orig_ire. We also
798 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
799 * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
800 *
801 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
802 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
803 * in which the zone has an IP address. We check this for the global zone
804 * even if no shared-IP zones are configured.
805 */
806 ire_t *
ire_round_robin(irb_t * irb_ptr,ire_ftable_args_t * margs,uint_t hash,ire_t * orig_ire,ip_stack_t * ipst)807 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
808 ire_t *orig_ire, ip_stack_t *ipst)
809 {
810 ire_t *ire, *maybe_ire = NULL;
811 uint_t maybe_badcnt = 0;
812 uint_t maxwalk;
813
814 /* Fold in more bits from the hint/hash */
815 hash = hash ^ (hash >> 8) ^ (hash >> 16);
816
817 rw_enter(&irb_ptr->irb_lock, RW_WRITER);
818 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */
819 if (maxwalk == 0) {
820 rw_exit(&irb_ptr->irb_lock);
821 return (NULL);
822 }
823
824 hash %= maxwalk;
825 irb_refhold_locked(irb_ptr);
826 rw_exit(&irb_ptr->irb_lock);
827
828 /*
829 * Round-robin the routers list looking for a route that
830 * matches the passed in parameters.
831 * First we skip "hash" number of non-condemned IREs.
832 * Then we match the IRE.
833 * If we find an ire which has a non-zero ire_badcnt then we remember
834 * it and keep on looking for a lower ire_badcnt.
835 * If we come to the end of the list we continue (treat the
836 * bucket list as a circular list) but we match less than "max"
837 * entries.
838 */
839 ire = irb_ptr->irb_ire;
840 while (maxwalk > 0) {
841 if (IRE_IS_CONDEMNED(ire))
842 goto next_ire_skip;
843
844 /* Skip the first "hash" entries to do ECMP */
845 if (hash != 0) {
846 hash--;
847 goto next_ire_skip;
848 }
849
850 /* See CGTP comment above */
851 if (ire->ire_type != orig_ire->ire_type ||
852 ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
853 goto next_ire;
854
855 /*
856 * Note: Since IPv6 has hash buckets instead of radix
857 * buckers we need to explicitly compare the addresses.
858 * That makes this less efficient since we will be called
859 * even if there is no alternatives just because the
860 * bucket has multiple IREs for different addresses.
861 */
862 if (ire->ire_ipversion == IPV6_VERSION) {
863 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
864 &ire->ire_addr_v6))
865 goto next_ire;
866 }
867
868 /*
869 * For some reason find_best_route uses ire_mask. We do
870 * the same.
871 */
872 if (ire->ire_ipversion == IPV4_VERSION ?
873 !ire_match_args(ire, margs->ift_addr,
874 ire->ire_mask, margs->ift_gateway,
875 margs->ift_type, margs->ift_ill, margs->ift_zoneid,
876 margs->ift_tsl, margs->ift_flags) :
877 !ire_match_args_v6(ire, &margs->ift_addr_v6,
878 &ire->ire_mask_v6, &margs->ift_gateway_v6,
879 margs->ift_type, margs->ift_ill, margs->ift_zoneid,
880 margs->ift_tsl, margs->ift_flags))
881 goto next_ire;
882
883 if (margs->ift_zoneid != ALL_ZONES &&
884 (ire->ire_type & IRE_OFFLINK)) {
885 /*
886 * When we're in a zone, we're only
887 * interested in routers that are
888 * reachable through ipifs within our zone.
889 */
890 if (ire->ire_ipversion == IPV4_VERSION) {
891 if (!ire_gateway_ok_zone_v4(
892 ire->ire_gateway_addr, margs->ift_zoneid,
893 ire->ire_ill, margs->ift_tsl, ipst,
894 B_TRUE))
895 goto next_ire;
896 } else {
897 if (!ire_gateway_ok_zone_v6(
898 &ire->ire_gateway_addr_v6,
899 margs->ift_zoneid, ire->ire_ill,
900 margs->ift_tsl, ipst, B_TRUE))
901 goto next_ire;
902 }
903 }
904 mutex_enter(&ire->ire_lock);
905 /* Look for stale ire_badcnt and clear */
906 if (ire->ire_badcnt != 0 &&
907 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
908 ipst->ips_ip_ire_badcnt_lifetime))
909 ire->ire_badcnt = 0;
910 mutex_exit(&ire->ire_lock);
911
912 if (ire->ire_badcnt == 0) {
913 /* We found one with a zero badcnt; done */
914 ire_refhold(ire);
915 /*
916 * Care needed since irb_refrele grabs WLOCK to free
917 * the irb_t.
918 */
919 if (ire->ire_ipversion == IPV4_VERSION) {
920 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
921 irb_refrele(irb_ptr);
922 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
923 } else {
924 rw_exit(&ipst->ips_ip6_ire_head_lock);
925 irb_refrele(irb_ptr);
926 rw_enter(&ipst->ips_ip6_ire_head_lock,
927 RW_READER);
928 }
929 return (ire);
930 }
931 /*
932 * keep looking to see if there is a better (lower
933 * badcnt) matching IRE, but save this one as a last resort.
934 * If we find a lower badcnt pick that one as the last* resort.
935 */
936 if (maybe_ire == NULL) {
937 maybe_ire = ire;
938 maybe_badcnt = ire->ire_badcnt;
939 } else if (ire->ire_badcnt < maybe_badcnt) {
940 maybe_ire = ire;
941 maybe_badcnt = ire->ire_badcnt;
942 }
943
944 next_ire:
945 maxwalk--;
946 next_ire_skip:
947 ire = ire->ire_next;
948 if (ire == NULL)
949 ire = irb_ptr->irb_ire;
950 }
951 if (maybe_ire != NULL)
952 ire_refhold(maybe_ire);
953
954 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
955 if (ire->ire_ipversion == IPV4_VERSION) {
956 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
957 irb_refrele(irb_ptr);
958 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
959 } else {
960 rw_exit(&ipst->ips_ip6_ire_head_lock);
961 irb_refrele(irb_ptr);
962 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
963 }
964 return (maybe_ire);
965 }
966
967 void
irb_refhold_rn(struct radix_node * rn)968 irb_refhold_rn(struct radix_node *rn)
969 {
970 if ((rn->rn_flags & RNF_ROOT) == 0)
971 irb_refhold(&((rt_t *)(rn))->rt_irb);
972 }
973
974 void
irb_refrele_rn(struct radix_node * rn)975 irb_refrele_rn(struct radix_node *rn)
976 {
977 if ((rn->rn_flags & RNF_ROOT) == 0)
978 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
979 }
980
981
982 /*
983 * ip_select_src_ill() is used by ip_select_route() to find the src_ill
984 * to be used for source-aware routing table lookup. This function will
985 * ignore IPIF_UNNUMBERED interface addresses, and will only return a
986 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
987 * interfaces).
988 */
989 static ill_t *
ip_select_src_ill(const in6_addr_t * v6src,zoneid_t zoneid,ip_stack_t * ipst)990 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
991 {
992 ipif_t *ipif;
993 ill_t *ill;
994 boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
995 ipaddr_t v4src;
996
997 if (isv6) {
998 ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
999 } else {
1000 IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
1001 ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
1002 }
1003 if (ipif == NULL)
1004 return (NULL);
1005 ill = ipif->ipif_ill;
1006 ill_refhold(ill);
1007 ipif_refrele(ipif);
1008 return (ill);
1009 }
1010
1011 /*
1012 * verify that v6src is configured on ill
1013 */
1014 static boolean_t
ip_verify_src_on_ill(const in6_addr_t v6src,ill_t * ill,zoneid_t zoneid)1015 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
1016 {
1017 ipif_t *ipif;
1018 ip_stack_t *ipst;
1019 ipaddr_t v4src;
1020
1021 if (ill == NULL)
1022 return (B_FALSE);
1023 ipst = ill->ill_ipst;
1024
1025 if (ill->ill_isv6) {
1026 ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
1027 } else {
1028 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
1029 ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
1030 }
1031
1032 if (ipif != NULL) {
1033 ipif_refrele(ipif);
1034 return (B_TRUE);
1035 } else {
1036 return (B_FALSE);
1037 }
1038 }
1039
/*
 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
 * routes this routine sets up a ire_nce_cache as well. The caller needs to
 * lookup an nce for the multicast case.
 *
 * When src_multihoming is set to 2 (strict src multihoming) we use the source
 * address to select the interface and route. If IP_BOUND_IF etc are
 * specified, we require that they specify an interface on which the
 * source address is assigned.
 *
 * When src_multihoming is set to 1 (preferred src aware route
 * selection) the unicast lookup prefers a matching source
 * (i.e., that the route points out an ill on which the source is assigned), but
 * if no such route is found we fallback to not considering the source in the
 * route lookup.
 *
 * We skip the src_multihoming check when the source isn't (yet) set, and
 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
 * when secpolicy_net_rawaccess().
 *
 * Arguments:
 *   v6dst       - destination; for IPv4 (IXAF_IS_IPV4) the IPv4 address is
 *                 extracted from it with IN6_V4MAPPED_TO_IPADDR.
 *   v6src       - source address, passed by value. When it is unspecified
 *                 (V6_OR_V4_INADDR_ANY) no source verification is done.
 *   ixa         - transmit attributes; supplies the stack, zone, label,
 *                 xmit hint, flags and the ifindex/scopeid/nexthop values.
 *   generationp - if non-NULL, receives the generation of the returned ire
 *                 (directly in the multicast case, otherwise as set by
 *                 ire_route_recursive_v4/v6).
 *   setsrcp     - if non-NULL, must be the unspecified address on entry;
 *                 may be filled in by the lookup routines.
 *   errorp      - if non-NULL, must be 0 on entry. Set to ENXIO,
 *                 EHOSTUNREACH or EADDRNOTAVAIL on the early failure paths
 *                 below. It is not written when the unicast recursive
 *                 lookup itself returns a reject/blackhole ire.
 *   multirtp    - only passed through to ill_lookup_group_v4/v6 in the
 *                 multicast case.
 *
 * Returns an ire on every path (never tested for NULL here); the early
 * failure paths return the ire obtained from ire_reject().
 */
ire_t *
ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
    ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
    int *errorp, boolean_t *multirtp)
{
	uint_t		match_args;
	uint_t		ire_type;
	ill_t		*ill = NULL;
	ire_t		*ire;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipaddr_t	v4dst;
	in6_addr_t	v6nexthop;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce;
	boolean_t	preferred_src_aware = B_FALSE;
	boolean_t	verify_src;
	boolean_t	isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
	int		src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);

	/*
	 * We only verify that the src has been configured on a selected
	 * interface if the src is not :: or INADDR_ANY, and if the
	 * IXAF_VERIFY_SOURCE flag is set.
	 */
	verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
	    (ixa->ixa_flags & IXAF_VERIFY_SOURCE));

	match_args = MATCH_IRE_SECATTR;
	/* v4dst is only consulted below when !isv6 (and in the DEBUG print) */
	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
	if (setsrcp != NULL)
		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
	if (errorp != NULL)
		ASSERT(*errorp == 0);

	/*
	 * The content of the ixa will be different if IP_NEXTHOP,
	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
	 */

	if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
		/*
		 * Pick up the IRE_MULTICAST for the ill. The ill is chosen
		 * in this order of precedence: ixa_multicast_ifindex,
		 * ixa_scopeid, ixa_ifindex, the source address (when source
		 * multihoming applies), and finally a multicast group lookup.
		 */
		if (ixa->ixa_multicast_ifindex != 0) {
			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
			    isv6, ipst);
		} else if (ixaflags & IXAF_SCOPEID_SET) {
			/* sin6_scope_id takes precedence over ixa_ifindex */
			ASSERT(ixa->ixa_scopeid != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
			    isv6, ipst);
		} else if (ixa->ixa_ifindex != 0) {
			/*
			 * In the ipmp case, the ixa_ifindex is set to
			 * point at an under_ill and we would return the
			 * ire_multicast() corresponding to that under_ill.
			 */
			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
			    isv6, ipst);
		} else if (src_multihoming != 0 && verify_src) {
			/* Look up the ill based on the source address */
			ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
			/*
			 * Since we looked up the ill from the source there
			 * is no need to verify that the source is on the ill
			 * below.
			 */
			verify_src = B_FALSE;
			if (ill != NULL && IS_VNI(ill)) {
				ill_t *usesrc = ill;

				/*
				 * Replace the VNI ill with whatever
				 * ill_lookup_usesrc() returns for it, and
				 * drop our hold on the VNI ill.
				 */
				ill = ill_lookup_usesrc(usesrc);
				ill_refrele(usesrc);
			}
		} else if (!isv6) {
			ipaddr_t	v4setsrc = INADDR_ANY;

			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
			    ipst, multirtp, &v4setsrc);
			if (setsrcp != NULL)
				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
		} else {
			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
			    ipst, multirtp, setsrcp);
		}
		/* A VNI ill is treated the same as finding no ill at all */
		if (ill != NULL && IS_VNI(ill)) {
			ill_refrele(ill);
			ill = NULL;
		}
		if (ill == NULL) {
			if (errorp != NULL)
				*errorp = ENXIO;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		if (!(ill->ill_flags & ILLF_MULTICAST)) {
			ill_refrele(ill);
			if (errorp != NULL)
				*errorp = EHOSTUNREACH;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		/*
		 * If we are doing the strictest src_multihoming, then
		 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
		 * an interface that is consistent with the source address.
		 */
		if (verify_src && src_multihoming == 2 &&
		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
			if (errorp != NULL)
				*errorp = EADDRNOTAVAIL;
			ill_refrele(ill);
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		/* Get a refcnt on the single IRE_MULTICAST per ill */
		ire = ire_multicast(ill);
		ill_refrele(ill);
		if (generationp != NULL)
			*generationp = ire->ire_generation;
		/* The ire is still returned; errorp only flags the condition */
		if (errorp != NULL &&
		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
			*errorp = EHOSTUNREACH;
		}
		return (ire);
	}

	/* Now for unicast and broadcast */
	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
		if (ixaflags & IXAF_SCOPEID_SET) {
			/* sin6_scope_id takes precedence over ixa_ifindex */
			ASSERT(ixa->ixa_scopeid != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
			    isv6, ipst);
		} else {
			ASSERT(ixa->ixa_ifindex != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
			    isv6, ipst);
		}
		/* A VNI ill is treated the same as finding no ill at all */
		if (ill != NULL && IS_VNI(ill)) {
			ill_refrele(ill);
			ill = NULL;
		}
		if (ill == NULL) {
			if (errorp != NULL)
				*errorp = ENXIO;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}

		match_args |= MATCH_IRE_ILL;

		/*
		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
		 * so for both of them we need to be able look for an under
		 * interface.
		 */
		if (IS_UNDER_IPMP(ill))
			match_args |= MATCH_IRE_TESTHIDDEN;

		/*
		 * If we are doing the strictest src_multihoming, then
		 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
		 * an interface that is consistent with the source address.
		 */
		if (verify_src && src_multihoming == 2 &&
		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
			if (errorp != NULL)
				*errorp = EADDRNOTAVAIL;
			ill_refrele(ill);
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
	} else if (src_multihoming != 0 && verify_src) {
		/* Look up the ill based on the source address */
		ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
		if (ill == NULL) {
			char addrbuf[INET6_ADDRSTRLEN];

			ip3dbg(("%s not a valid src for unicast",
			    inet_ntop(AF_INET6, &v6src, addrbuf,
			    sizeof (addrbuf))));
			if (errorp != NULL)
				*errorp = EADDRNOTAVAIL;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		match_args |= MATCH_IRE_SRC_ILL;
		/* Only mode 1 is allowed to fall back to a source-blind lookup */
		preferred_src_aware = (src_multihoming == 1);
	}

	if (ixaflags & IXAF_NEXTHOP_SET) {
		/* IP_NEXTHOP was set */
		v6nexthop = ixa->ixa_nexthop_v6;
	} else {
		v6nexthop = *v6dst;
	}

	ire_type = 0;

	/*
	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
	 * we only look for an onlink IRE.
	 */
	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
		match_args |= MATCH_IRE_TYPE;
		ire_type = IRE_ONLINK;
	}

	/*
	 * We come back here at most once: the retry branch below clears
	 * preferred_src_aware before jumping.
	 *
	 * NOTE(review): on the retry, the IPv6 branch passes setsrcp to
	 * ire_route_recursive_v6() again without resetting it. The IPv4
	 * implementation in this file ASSERTs that its setsrcp is INADDR_ANY
	 * on entry; confirm the IPv6 counterpart tolerates a value left
	 * behind by the first pass.
	 */
retry:
	if (!isv6) {
		ipaddr_t	v4nexthop;
		ipaddr_t	v4setsrc = INADDR_ANY;

		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
		if (setsrcp != NULL)
			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
	} else {
		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
	}

#ifdef DEBUG
	if (match_args & MATCH_IRE_TESTHIDDEN) {
		ip3dbg(("looking for hidden; dst %x ire %p\n",
		    v4dst, (void *)ire));
	}
#endif
	/*
	 * The ill hold is dropped after the first lookup, so a retry runs
	 * with ill == NULL.
	 */
	if (ill != NULL) {
		ill_refrele(ill);
		ill = NULL;
	}
	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (ire->ire_type & IRE_MULTICAST)) {
		if (preferred_src_aware) {
			/*
			 * "Preferred Source Aware" send mode. If we cannot
			 * find an ire whose ire_ill had the desired source
			 * address retry after relaxing the ill matching
			 * constraint.
			 */
			ire_refrele(ire);
			preferred_src_aware = B_FALSE;
			match_args &= ~MATCH_IRE_SRC_ILL;
			goto retry;
		}
		/* No ire_nce_cache */
		return (ire);
	}

	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce == NULL || nce->nce_is_condemned) {
		/* ire_lock is dropped before ire_revalidate_nce() is called */
		mutex_exit(&ire->ire_lock);
		(void) ire_revalidate_nce(ire);
	} else {
		mutex_exit(&ire->ire_lock);
	}
	return (ire);
}
1330
1331 /*
1332 * Find a route given some xmit attributes and a packet.
1333 * Generic for IPv4 and IPv6
1334 *
1335 * This never returns NULL. But when it returns the IRE_NOROUTE
1336 * it might set errorp.
1337 */
1338 ire_t *
ip_select_route_pkt(mblk_t * mp,ip_xmit_attr_t * ixa,uint_t * generationp,int * errorp,boolean_t * multirtp)1339 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1340 int *errorp, boolean_t *multirtp)
1341 {
1342 if (ixa->ixa_flags & IXAF_IS_IPV4) {
1343 ipha_t *ipha = (ipha_t *)mp->b_rptr;
1344 in6_addr_t v6dst, v6src;
1345
1346 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1347 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
1348
1349 return (ip_select_route(&v6dst, v6src, ixa, generationp,
1350 NULL, errorp, multirtp));
1351 } else {
1352 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1353
1354 return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
1355 ixa, generationp, NULL, errorp, multirtp));
1356 }
1357 }
1358
1359 ire_t *
ip_select_route_v4(ipaddr_t dst,ipaddr_t src,ip_xmit_attr_t * ixa,uint_t * generationp,ipaddr_t * v4setsrcp,int * errorp,boolean_t * multirtp)1360 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
1361 uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1362 {
1363 in6_addr_t v6dst, v6src;
1364 ire_t *ire;
1365 in6_addr_t setsrc;
1366
1367 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1368
1369 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1370 IN6_IPADDR_TO_V4MAPPED(src, &v6src);
1371
1372 setsrc = ipv6_all_zeros;
1373 ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
1374 multirtp);
1375 if (v4setsrcp != NULL)
1376 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1377 return (ire);
1378 }
1379
/*
 * Recursively look for a route to the destination. Can also match on
 * the zoneid, ill, and label. Used for the data paths. See also
 * ire_route_recursive.
 *
 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
 * create an IRE_IF_CLONE. This is used on the receive side when we are not
 * forwarding.
 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
 * resolve the gateway.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 *
 * Arguments:
 *   ire         - optional starting IRE. When NULL the first IRE comes from
 *                 ire_ftable_lookup_v4(). When non-NULL an extra hold is
 *                 taken on it here, so the caller keeps its own hold.
 *   nexthop, ire_type, ill_arg, zoneid, tsl, match_args, xmit_hint, ipst
 *               - lookup arguments handed to ire_ftable_lookup_v4(); nexthop,
 *                 ire_type and match_args are updated between iterations.
 *   irr_flags   - IRR_ALLOCATE and/or IRR_INCOMPLETE, see above.
 *   setsrcp     - if non-NULL, must be INADDR_ANY on entry; receives the
 *                 first RTF_SETSRC address found along the chain.
 *   gwattrp     - if non-NULL, must be NULL on entry; receives the first
 *                 non-NULL ire_gw_secattr found along the chain.
 *   generationp - if non-NULL and ire was supplied, *generationp is read as
 *                 that IRE's generation. On return it holds the generation
 *                 for the returned IRE, or IRE_GENERATION_VERIFY.
 *
 * The returned IRE carries a hold for the caller.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	ill_t		*ill = NULL;
	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill != NULL? ill : ill_arg), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		if (ire == NULL) {
			/*
			 * Lookup failed. With IRR_INCOMPLETE and at least one
			 * IRE already collected, hand back the first one;
			 * otherwise hand back the reject IRE.
			 */
			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
				ire = ires[0];
				ire_refhold(ire);
			} else {
				ire = ire_reject(ipst, B_FALSE);
			}
			goto error;
		}

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST));	/* Not in ftable */
		/*
		 * Verify that the IRE_IF_CLONE has a consistent generation
		 * number.
		 */
		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
			/*
			 * ire_clone_verify() deleted the stale clone; redo
			 * this iteration (i is unchanged) with a new lookup.
			 */
			ire_refrele(ire);
			ire = NULL;
			continue;
		}

		/*
		 * Don't allow anything unusual past the first iteration.
		 * After the first lookup, we should no longer look for
		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
		 * routes.
		 *
		 * In addition, after we have found a direct IRE_OFFLINK,
		 * we should only look for interface or clone routes.
		 */
		match_args |= MATCH_IRE_DIRECT;	/* no more RTF_INDIRECTs */

		if ((ire->ire_type & IRE_OFFLINK) &&
		    !(ire->ire_flags & RTF_INDIRECT)) {
			ire_type = IRE_IF_ALL;
		} else {
			/*
			 * no more local, loopback, broadcast routes
			 */
			if (!(match_args & MATCH_IRE_TYPE))
				ire_type = (IRE_OFFLINK|IRE_ONLINK);
			ire_type &= ~maskoff;
		}
		match_args |= MATCH_IRE_TYPE;

		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			/* The hold lives on in ires[]; done expects NULL */
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 *
		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
		 * ire_ill, so we set ill to the ire_ill;
		 */
		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/* Force a new lookup on the next iteration */
		ire = NULL;
	}
	/* Ran out of iterations without completing the chain */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

	/*
	 * Failure exit: "ire" is the held IRE to return. Everything gathered
	 * in ires[0..i-1] is unlinked and released.
	 */
error:
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

	/*
	 * Also entered from "done" when ire_dep_build() fails; on that path
	 * the ill hold has already been dropped.
	 */
cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

	/*
	 * Success exit: ires[0..i-1] holds the chain, with i >= 1 since
	 * every jump here follows a store into ires[].
	 */
done:
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet.
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}
1694
1695 ire_t *
ire_route_recursive_v4(ipaddr_t nexthop,uint_t ire_type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,ipaddr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)1696 ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1697 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1698 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1699 tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1700 {
1701 return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
1702 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1703 gwattrp, generationp));
1704 }
1705
1706 /*
1707 * Recursively look for a route to the destination.
1708 * We only handle a destination match here, yet we have the same arguments
1709 * as the full match to allow function pointers to select between the two.
1710 *
1711 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1712 * instead.
1713 *
1714 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1715 * is an error.
1716 * Allow at most one RTF_INDIRECT.
1717 */
1718 ire_t *
ire_route_recursive_dstonly_v4(ipaddr_t nexthop,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst)1719 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
1720 uint32_t xmit_hint, ip_stack_t *ipst)
1721 {
1722 ire_t *ire;
1723 ire_t *ire1;
1724 uint_t generation;
1725
1726 /* ire_ftable_lookup handles round-robin/ECMP */
1727 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1728 &generation);
1729 ASSERT(ire != NULL);
1730 /*
1731 * If the IRE has a current cached parent we know that the whole
1732 * parent chain is current, hence we don't need to discover and
1733 * build any dependencies by doing a recursive lookup.
1734 */
1735 mutex_enter(&ire->ire_lock);
1736 if (ire->ire_dep_parent != NULL) {
1737 if (ire->ire_dep_parent->ire_generation ==
1738 ire->ire_dep_parent_generation) {
1739 mutex_exit(&ire->ire_lock);
1740 return (ire);
1741 }
1742 mutex_exit(&ire->ire_lock);
1743 } else {
1744 mutex_exit(&ire->ire_lock);
1745 /*
1746 * If this type should have an ire_nce_cache (even if it
1747 * doesn't yet have one) then we are done. Includes
1748 * IRE_INTERFACE with a full 32 bit mask.
1749 */
1750 if (ire->ire_nce_capable)
1751 return (ire);
1752 }
1753
1754 /*
1755 * Fallback to loop in the normal code starting with the ire
1756 * we found. Normally this would return the same ire.
1757 */
1758 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
1759 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1760 &generation);
1761 ire_refrele(ire);
1762 return (ire1);
1763 }
1764
1765 /*
1766 * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1767 * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1768 * are not consistent, and TRUE otherwise.
1769 */
1770 boolean_t
ire_clone_verify(ire_t * ire)1771 ire_clone_verify(ire_t *ire)
1772 {
1773 ASSERT((ire->ire_type & IRE_IF_CLONE) != 0);
1774 mutex_enter(&ire->ire_lock);
1775 if (ire->ire_dep_parent != NULL &&
1776 ire->ire_dep_parent->ire_generation !=
1777 ire->ire_dep_parent_generation) {
1778 mutex_exit(&ire->ire_lock);
1779 ire_delete(ire);
1780 return (B_FALSE);
1781 }
1782 mutex_exit(&ire->ire_lock);
1783 return (B_TRUE);
1784 }
1785