1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * This file contains consumer routines of the IPv4 forwarding engine
27 */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #include <sys/dlpi.h>
34 #include <sys/ddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/policy.h>
37
38 #include <sys/systm.h>
39 #include <sys/strsun.h>
40 #include <sys/kmem.h>
41 #include <sys/param.h>
42 #include <sys/socket.h>
43 #include <sys/strsubr.h>
44 #include <net/if.h>
45 #include <net/route.h>
46 #include <netinet/in.h>
47 #include <net/if_dl.h>
48 #include <netinet/ip6.h>
49 #include <netinet/icmp6.h>
50
51 #include <inet/ipsec_impl.h>
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/ip.h>
56 #include <inet/ip_impl.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_ndp.h>
59 #include <inet/arp.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_ftable.h>
63 #include <inet/ip_rts.h>
64 #include <inet/nd.h>
65
66 #include <net/pfkeyv2.h>
67 #include <inet/sadb.h>
68 #include <inet/tcp.h>
69 #include <inet/ipclassifier.h>
70 #include <sys/zone.h>
71 #include <net/radix.h>
72 #include <sys/tsol/label.h>
73 #include <sys/tsol/tnet.h>
74
/*
 * Non-zero when `ire' acts as a default route: it is either an IRE_DEFAULT
 * entry, or an IRE_INTERFACE entry whose destination address is 0.
 * The argument is evaluated more than once.
 */
#define	IS_DEFAULT_ROUTE(ire)						\
	((((ire)->ire_type & IRE_DEFAULT) != 0) ||			\
	((((ire)->ire_type & IRE_INTERFACE) != 0) &&			\
	((ire)->ire_addr == 0)))
78
/*
 * Select the strict source multihoming tunable of `ipst' for the address
 * family in use: the IPv6 tunable when `isv6' is non-zero, otherwise the
 * IPv4 one.  Both arguments are parenthesized so that compound expressions
 * (e.g. `a ? b : c' or `&stack') expand with the intended grouping.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)					\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :		\
	(ipst)->ips_ip_strict_src_multihoming)
82
83 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
84 static void ire_del_host_redir(ire_t *, char *);
85 static boolean_t ire_find_best_route(struct radix_node *, void *);
86
87 /*
88 * Lookup a route in forwarding table. A specific lookup is indicated by
89 * passing the required parameters and indicating the match required in the
90 * flag field.
91 *
92 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
93 */
94 ire_t *
ire_ftable_lookup_v4(ipaddr_t addr,ipaddr_t mask,ipaddr_t gateway,int type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,int flags,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)95 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
96 int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
97 int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
98 {
99 ire_t *ire;
100 struct rt_sockaddr rdst, rmask;
101 struct rt_entry *rt;
102 ire_ftable_args_t margs;
103
104 ASSERT(ill == NULL || !ill->ill_isv6);
105
106 /*
107 * ire_match_args() will dereference ill if MATCH_IRE_ILL
108 * is set.
109 */
110 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
111 return (NULL);
112
113 bzero(&rdst, sizeof (rdst));
114 rdst.rt_sin_len = sizeof (rdst);
115 rdst.rt_sin_family = AF_INET;
116 rdst.rt_sin_addr.s_addr = addr;
117
118 bzero(&rmask, sizeof (rmask));
119 rmask.rt_sin_len = sizeof (rmask);
120 rmask.rt_sin_family = AF_INET;
121 rmask.rt_sin_addr.s_addr = mask;
122
123 bzero(&margs, sizeof (margs));
124 margs.ift_addr = addr;
125 margs.ift_mask = mask;
126 margs.ift_gateway = gateway;
127 margs.ift_type = type;
128 margs.ift_ill = ill;
129 margs.ift_zoneid = zoneid;
130 margs.ift_tsl = tsl;
131 margs.ift_flags = flags;
132
133 /*
134 * The flags argument passed to ire_ftable_lookup may cause the
135 * search to return, not the longest matching prefix, but the
136 * "best matching prefix", i.e., the longest prefix that also
137 * satisfies constraints imposed via the permutation of flags
138 * passed in. To achieve this, we invoke ire_match_args() on
139 * each matching leaf in the radix tree. ire_match_args is
140 * invoked by the callback function ire_find_best_route()
141 * We hold the global tree lock in read mode when calling
142 * rn_match_args. Before dropping the global tree lock, ensure
143 * that the radix node can't be deleted by incrementing ire_refcnt.
144 */
145 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
146 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
147 ipst->ips_ip_ftable, ire_find_best_route, &margs);
148 ire = margs.ift_best_ire;
149 if (rt == NULL) {
150 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
151 return (NULL);
152 }
153 ASSERT(ire != NULL);
154
155 DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
156
157 /*
158 * round-robin only if we have more than one route in the bucket.
159 * ips_ip_ecmp_behavior controls when we do ECMP
160 * 2: always
161 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
162 * 0: never
163 */
164 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
165 if (ipst->ips_ip_ecmp_behavior == 2 ||
166 (ipst->ips_ip_ecmp_behavior == 1 &&
167 IS_DEFAULT_ROUTE(ire))) {
168 ire_t *next_ire;
169
170 margs.ift_best_ire = NULL;
171 next_ire = ire_round_robin(ire->ire_bucket, &margs,
172 xmit_hint, ire, ipst);
173 if (next_ire == NULL) {
174 /* keep ire if next_ire is null */
175 goto done;
176 }
177 ire_refrele(ire);
178 ire = next_ire;
179 }
180 }
181
182 done:
183 /* Return generation before dropping lock */
184 if (generationp != NULL)
185 *generationp = ire->ire_generation;
186
187 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
188
189 /*
190 * For shared-IP zones we need additional checks to what was
191 * done in ire_match_args to make sure IRE_LOCALs are handled.
192 *
193 * When ip_restrict_interzone_loopback is set, then
194 * we ensure that IRE_LOCAL are only used for loopback
195 * between zones when the logical "Ethernet" would
196 * have looped them back. That is, if in the absense of
197 * the IRE_LOCAL we would have sent to packet out the
198 * same ill.
199 */
200 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
201 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
202 ipst->ips_ip_restrict_interzone_loopback) {
203 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
204 ASSERT(ire != NULL);
205 }
206 return (ire);
207 }
208
209 /*
210 * This function is called by
211 * ip_input/ire_route_recursive when doing a route lookup on only the
212 * destination address.
213 *
214 * The optimizations of this function over ire_ftable_lookup are:
215 * o removing unnecessary flag matching
216 * o doing longest prefix match instead of overloading it further
217 * with the unnecessary "best_prefix_match"
218 *
219 * If no route is found we return IRE_NOROUTE.
220 */
221 ire_t *
ire_ftable_lookup_simple_v4(ipaddr_t addr,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)222 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
223 uint_t *generationp)
224 {
225 ire_t *ire;
226 struct rt_sockaddr rdst;
227 struct rt_entry *rt;
228 irb_t *irb;
229
230 rdst.rt_sin_len = sizeof (rdst);
231 rdst.rt_sin_family = AF_INET;
232 rdst.rt_sin_addr.s_addr = addr;
233
234 /*
235 * This is basically inlining a simpler version of ire_match_args
236 */
237 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
238
239 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
240 ipst->ips_ip_ftable, NULL, NULL);
241
242 if (rt == NULL)
243 goto bad;
244
245 irb = &rt->rt_irb;
246 if (irb->irb_ire_cnt == 0)
247 goto bad;
248
249 rw_enter(&irb->irb_lock, RW_READER);
250 ire = irb->irb_ire;
251 if (ire == NULL) {
252 rw_exit(&irb->irb_lock);
253 goto bad;
254 }
255 while (IRE_IS_CONDEMNED(ire)) {
256 ire = ire->ire_next;
257 if (ire == NULL) {
258 rw_exit(&irb->irb_lock);
259 goto bad;
260 }
261 }
262
263 /* we have a ire that matches */
264 ire_refhold(ire);
265 rw_exit(&irb->irb_lock);
266
267 /*
268 * round-robin only if we have more than one route in the bucket.
269 * ips_ip_ecmp_behavior controls when we do ECMP
270 * 2: always
271 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
272 * 0: never
273 *
274 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
275 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
276 * and the IRE_INTERFACESs are likely to be shorter matches.
277 */
278 if (ire->ire_bucket->irb_ire_cnt > 1) {
279 if (ipst->ips_ip_ecmp_behavior == 2 ||
280 (ipst->ips_ip_ecmp_behavior == 1 &&
281 IS_DEFAULT_ROUTE(ire))) {
282 ire_t *next_ire;
283 ire_ftable_args_t margs;
284
285 bzero(&margs, sizeof (margs));
286 margs.ift_addr = addr;
287 margs.ift_zoneid = ALL_ZONES;
288
289 next_ire = ire_round_robin(ire->ire_bucket, &margs,
290 xmit_hint, ire, ipst);
291 if (next_ire == NULL) {
292 /* keep ire if next_ire is null */
293 if (generationp != NULL)
294 *generationp = ire->ire_generation;
295 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
296 return (ire);
297 }
298 ire_refrele(ire);
299 ire = next_ire;
300 }
301 }
302 /* Return generation before dropping lock */
303 if (generationp != NULL)
304 *generationp = ire->ire_generation;
305
306 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
307
308 /*
309 * Since we only did ALL_ZONES matches there is no special handling
310 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
311 */
312 return (ire);
313
314 bad:
315 if (generationp != NULL)
316 *generationp = IRE_GENERATION_VERIFY;
317
318 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
319 return (ire_reject(ipst, B_FALSE));
320 }
321
322 /*
323 * Find the ill matching a multicast group.
324 * Allows different routes for multicast addresses
325 * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
326 * which point at different interfaces. This is used when IP_MULTICAST_IF
327 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
328 * specify the interface to join on.
329 *
330 * Supports link-local addresses by using ire_route_recursive which follows
331 * the ill when recursing.
332 *
333 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
334 * and the MULTIRT property can be different for different groups, we
335 * extract RTF_MULTIRT from the special unicast route added for a group
336 * with CGTP and pass that back in the multirtp argument.
337 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
338 * We have a setsrcp argument for the same reason.
339 */
340 ill_t *
ire_lookup_multi_ill_v4(ipaddr_t group,zoneid_t zoneid,ip_stack_t * ipst,boolean_t * multirtp,ipaddr_t * setsrcp)341 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
342 boolean_t *multirtp, ipaddr_t *setsrcp)
343 {
344 ire_t *ire;
345 ill_t *ill;
346
347 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
348 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
349 ASSERT(ire != NULL);
350 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
351 ire_refrele(ire);
352 return (NULL);
353 }
354
355 if (multirtp != NULL)
356 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
357
358 ill = ire_nexthop_ill(ire);
359 ire_refrele(ire);
360 return (ill);
361 }
362
363 /*
364 * Delete the passed in ire if the gateway addr matches
365 */
366 void
ire_del_host_redir(ire_t * ire,char * gateway)367 ire_del_host_redir(ire_t *ire, char *gateway)
368 {
369 if ((ire->ire_flags & RTF_DYNAMIC) &&
370 (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
371 ire_delete(ire);
372 }
373
374 /*
375 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
376 * pointing at the specified gateway and
377 * delete them. This routine is called only
378 * when a default gateway is going away.
379 */
380 void
ire_delete_host_redirects(ipaddr_t gateway,ip_stack_t * ipst)381 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
382 {
383 struct rtfuncarg rtfarg;
384
385 bzero(&rtfarg, sizeof (rtfarg));
386 rtfarg.rt_func = ire_del_host_redir;
387 rtfarg.rt_arg = (void *)&gateway;
388 rtfarg.rt_zoneid = ALL_ZONES;
389 rtfarg.rt_ipst = ipst;
390 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
391 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
392 }
393
394 /*
395 * Obtain the rt_entry and rt_irb for the route to be added to
396 * the ips_ip_ftable.
397 * First attempt to add a node to the radix tree via rn_addroute. If the
398 * route already exists, return the bucket for the existing route.
399 *
400 * Locking notes: Need to hold the global radix tree lock in write mode to
401 * add a radix node. To prevent the node from being deleted, ire_get_bucket()
402 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
403 * while holding the irb_lock, but not the radix tree lock.
404 */
405 irb_t *
ire_get_bucket(ire_t * ire)406 ire_get_bucket(ire_t *ire)
407 {
408 struct radix_node *rn;
409 struct rt_entry *rt;
410 struct rt_sockaddr rmask, rdst;
411 irb_t *irb = NULL;
412 ip_stack_t *ipst = ire->ire_ipst;
413
414 ASSERT(ipst->ips_ip_ftable != NULL);
415
416 /* first try to see if route exists (based on rtalloc1) */
417 bzero(&rdst, sizeof (rdst));
418 rdst.rt_sin_len = sizeof (rdst);
419 rdst.rt_sin_family = AF_INET;
420 rdst.rt_sin_addr.s_addr = ire->ire_addr;
421
422 bzero(&rmask, sizeof (rmask));
423 rmask.rt_sin_len = sizeof (rmask);
424 rmask.rt_sin_family = AF_INET;
425 rmask.rt_sin_addr.s_addr = ire->ire_mask;
426
427 /*
428 * add the route. based on BSD's rtrequest1(RTM_ADD)
429 */
430 R_Malloc(rt, rt_entry_cache, sizeof (*rt));
431 /* kmem_alloc failed */
432 if (rt == NULL)
433 return (NULL);
434
435 bzero(rt, sizeof (*rt));
436 rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
437 rt->rt_dst = rdst;
438 irb = &rt->rt_irb;
439 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
440 irb->irb_ipst = ipst;
441 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
442 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
443 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
444 ipst->ips_ip_ftable, (struct radix_node *)rt);
445 if (rn == NULL) {
446 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
447 Free(rt, rt_entry_cache);
448 rt = NULL;
449 irb = NULL;
450 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
451 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
452 ipst->ips_ip_ftable);
453 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
454 /* found a non-root match */
455 rt = (struct rt_entry *)rn;
456 }
457 }
458 if (rt != NULL) {
459 irb = &rt->rt_irb;
460 irb_refhold(irb);
461 }
462 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
463 return (irb);
464 }
465
466 /*
467 * This function is used when the caller wants to know the outbound
468 * interface for a packet given only the address.
469 * If this is a offlink IP address and there are multiple
470 * routes to this destination, this routine will utilise the
471 * first route it finds to IP address
472 * Return values:
473 * 0 - FAILURE
474 * nonzero - ifindex
475 */
476 uint_t
ifindex_lookup(const struct sockaddr * ipaddr,zoneid_t zoneid)477 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
478 {
479 uint_t ifindex = 0;
480 ire_t *ire;
481 ill_t *ill;
482 netstack_t *ns;
483 ip_stack_t *ipst;
484
485 if (zoneid == ALL_ZONES)
486 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
487 else
488 ns = netstack_find_by_zoneid(zoneid);
489 ASSERT(ns != NULL);
490
491 /*
492 * For exclusive stacks we set the zoneid to zero
493 * since IP uses the global zoneid in the exclusive stacks.
494 */
495 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
496 zoneid = GLOBAL_ZONEID;
497 ipst = ns->netstack_ip;
498
499 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
500
501 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
502 ill = ire_nexthop_ill(ire);
503 if (ill != NULL) {
504 ifindex = ill->ill_phyint->phyint_ifindex;
505 ill_refrele(ill);
506 }
507 ire_refrele(ire);
508 }
509 netstack_rele(ns);
510 return (ifindex);
511 }
512
513 /*
514 * Routine to find the route to a destination. If a ifindex is supplied
515 * it tries to match the route to the corresponding ipif for the ifindex
516 */
517 static ire_t *
route_to_dst(const struct sockaddr * dst_addr,zoneid_t zoneid,ip_stack_t * ipst)518 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
519 {
520 ire_t *ire = NULL;
521 int match_flags;
522
523 match_flags = MATCH_IRE_DSTONLY;
524
525 /* XXX pass NULL tsl for now */
526
527 if (dst_addr->sa_family == AF_INET) {
528 ire = ire_route_recursive_v4(
529 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
530 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
531 NULL, NULL);
532 } else {
533 ire = ire_route_recursive_v6(
534 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
535 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
536 NULL, NULL);
537 }
538 ASSERT(ire != NULL);
539 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
540 ire_refrele(ire);
541 return (NULL);
542 }
543 return (ire);
544 }
545
546 /*
547 * This routine is called by IP Filter to send a packet out on the wire
548 * to a specified dstination (which may be onlink or offlink). The ifindex may
549 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
550 * an outgoing interface and requires the nexthop to be on that interface.
551 * IP WILL NOT DO the following to the data packet before sending it out:
552 * a. manipulate ttl
553 * b. ipsec work
554 * c. fragmentation
555 *
556 * If the packet has been prepared for hardware checksum then it will be
557 * passed off to ip_send_align_cksum() to check that the flags set on the
558 * packet are in alignment with the capabilities of the new outgoing NIC.
559 *
560 * Return values:
561 * 0: IP was able to send of the data pkt
562 * ECOMM: Could not send packet
563 * ENONET No route to dst. It is up to the caller
564 * to send icmp unreachable error message,
565 * EINPROGRESS The macaddr of the onlink dst or that
566 * of the offlink dst's nexthop needs to get
567 * resolved before packet can be sent to dst.
568 * Thus transmission is not guaranteed.
569 * Note: No longer have visibility to the ARP queue
570 * hence no EINPROGRESS.
571 */
572 int
ipfil_sendpkt(const struct sockaddr * dst_addr,mblk_t * mp,uint_t ifindex,zoneid_t zoneid)573 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
574 zoneid_t zoneid)
575 {
576 ipaddr_t nexthop;
577 netstack_t *ns;
578 ip_stack_t *ipst;
579 ip_xmit_attr_t ixas;
580 int error;
581
582 ASSERT(mp != NULL);
583
584 if (zoneid == ALL_ZONES)
585 ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
586 else
587 ns = netstack_find_by_zoneid(zoneid);
588 ASSERT(ns != NULL);
589
590 /*
591 * For exclusive stacks we set the zoneid to zero
592 * since IP uses the global zoneid in the exclusive stacks.
593 */
594 if (ns->netstack_stackid != GLOBAL_NETSTACKID)
595 zoneid = GLOBAL_ZONEID;
596 ipst = ns->netstack_ip;
597
598 ASSERT(dst_addr->sa_family == AF_INET ||
599 dst_addr->sa_family == AF_INET6);
600
601 bzero(&ixas, sizeof (ixas));
602 /*
603 * No IPsec, no fragmentation, and don't let any hooks see
604 * the packet.
605 */
606 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
607 ixas.ixa_cred = kcred;
608 ixas.ixa_cpid = NOPID;
609 ixas.ixa_tsl = NULL;
610 ixas.ixa_ipst = ipst;
611 ixas.ixa_ifindex = ifindex;
612
613 if (dst_addr->sa_family == AF_INET) {
614 ipha_t *ipha = (ipha_t *)mp->b_rptr;
615
616 ixas.ixa_flags |= IXAF_IS_IPV4;
617 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
618 if (nexthop != ipha->ipha_dst) {
619 ixas.ixa_flags |= IXAF_NEXTHOP_SET;
620 ixas.ixa_nexthop_v4 = nexthop;
621 }
622 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
623 } else {
624 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
625 in6_addr_t *nexthop6;
626
627 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
628 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
629 ixas.ixa_flags |= IXAF_NEXTHOP_SET;
630 ixas.ixa_nexthop_v6 = *nexthop6;
631 }
632 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
633 }
634 error = ip_output_simple(mp, &ixas);
635 ixa_cleanup(&ixas);
636
637 netstack_rele(ns);
638 switch (error) {
639 case 0:
640 break;
641
642 case EHOSTUNREACH:
643 case ENETUNREACH:
644 error = ENONET;
645 break;
646
647 default:
648 error = ECOMM;
649 break;
650 }
651 return (error);
652 }
653
654 /*
655 * callback function provided by ire_ftable_lookup when calling
656 * rn_match_args(). Invoke ire_match_args on each matching leaf node in
657 * the radix tree.
658 */
659 boolean_t
ire_find_best_route(struct radix_node * rn,void * arg)660 ire_find_best_route(struct radix_node *rn, void *arg)
661 {
662 struct rt_entry *rt = (struct rt_entry *)rn;
663 irb_t *irb_ptr;
664 ire_t *ire;
665 ire_ftable_args_t *margs = arg;
666 ipaddr_t match_mask;
667
668 ASSERT(rt != NULL);
669
670 irb_ptr = &rt->rt_irb;
671
672 if (irb_ptr->irb_ire_cnt == 0)
673 return (B_FALSE);
674
675 rw_enter(&irb_ptr->irb_lock, RW_READER);
676 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
677 if (IRE_IS_CONDEMNED(ire))
678 continue;
679 ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0);
680 if (margs->ift_flags & MATCH_IRE_MASK)
681 match_mask = margs->ift_mask;
682 else
683 match_mask = ire->ire_mask;
684
685 if (ire_match_args(ire, margs->ift_addr, match_mask,
686 margs->ift_gateway, margs->ift_type, margs->ift_ill,
687 margs->ift_zoneid, margs->ift_tsl,
688 margs->ift_flags)) {
689 ire_refhold(ire);
690 rw_exit(&irb_ptr->irb_lock);
691 margs->ift_best_ire = ire;
692 return (B_TRUE);
693 }
694 }
695 rw_exit(&irb_ptr->irb_lock);
696 return (B_FALSE);
697 }
698
699 /*
700 * ftable irb_t structures are dynamically allocated, and we need to
701 * check if the irb_t (and associated ftable tree attachment) needs to
702 * be cleaned up when the irb_refcnt goes to 0. The conditions that need
703 * be verified are:
704 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
705 * - no other threads holding references to ire's in the bucket,
706 * i.e., irb_nire == 0
707 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
708 * - need to hold the global tree lock and irb_lock in write mode.
709 */
710 void
irb_refrele_ftable(irb_t * irb)711 irb_refrele_ftable(irb_t *irb)
712 {
713 for (;;) {
714 rw_enter(&irb->irb_lock, RW_WRITER);
715 ASSERT(irb->irb_refcnt != 0);
716 if (irb->irb_refcnt != 1) {
717 /*
718 * Someone has a reference to this radix node
719 * or there is some bucket walker.
720 */
721 irb->irb_refcnt--;
722 rw_exit(&irb->irb_lock);
723 return;
724 } else {
725 /*
726 * There is no other walker, nor is there any
727 * other thread that holds a direct ref to this
728 * radix node. Do the clean up if needed. Call
729 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
730 */
731 if (irb->irb_marks & IRB_MARK_CONDEMNED) {
732 ire_t *ire_list;
733
734 ire_list = ire_unlink(irb);
735 rw_exit(&irb->irb_lock);
736
737 if (ire_list != NULL)
738 ire_cleanup(ire_list);
739 /*
740 * more CONDEMNED entries could have
741 * been added while we dropped the lock,
742 * so we have to re-check.
743 */
744 continue;
745 }
746
747 /*
748 * Now check if there are still any ires
749 * associated with this radix node.
750 */
751 if (irb->irb_nire != 0) {
752 /*
753 * someone is still holding on
754 * to ires in this bucket
755 */
756 irb->irb_refcnt--;
757 rw_exit(&irb->irb_lock);
758 return;
759 } else {
760 /*
761 * Everything is clear. Zero walkers,
762 * Zero threads with a ref to this
763 * radix node, Zero ires associated with
764 * this radix node. Due to lock order,
765 * check the above conditions again
766 * after grabbing all locks in the right order
767 */
768 rw_exit(&irb->irb_lock);
769 if (irb_inactive(irb))
770 return;
771 /*
772 * irb_inactive could not free the irb.
773 * See if there are any walkers, if not
774 * try to clean up again.
775 */
776 }
777 }
778 }
779 }
780
781 /*
782 * IRE iterator used by ire_ftable_lookup to process multiple equal
783 * routes. Given a starting point in the hash list (hash), walk the IREs
784 * in the bucket skipping deleted entries. We treat the bucket as a circular
785 * list for the purposes of walking it.
786 * Returns the IRE (held) that corresponds to the hash value. If that IRE is
787 * not applicable (ire_match_args failed) then it returns a subsequent one.
788 * If we fail to find an IRE we return NULL.
789 *
790 * Assumes that the caller holds a reference on the IRE bucket and a read lock
791 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
792 *
793 * Applies to IPv4 and IPv6.
794 *
795 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
796 * address and bucket, we compare against ire_type for the orig_ire. We also
797 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
798 * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire.
799 *
800 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
801 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
802 * in which the zone has an IP address. We check this for the global zone
803 * even if no shared-IP zones are configured.
804 */
805 ire_t *
ire_round_robin(irb_t * irb_ptr,ire_ftable_args_t * margs,uint_t hash,ire_t * orig_ire,ip_stack_t * ipst)806 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
807 ire_t *orig_ire, ip_stack_t *ipst)
808 {
809 ire_t *ire, *maybe_ire = NULL;
810 uint_t maybe_badcnt;
811 uint_t maxwalk;
812
813 /* Fold in more bits from the hint/hash */
814 hash = hash ^ (hash >> 8) ^ (hash >> 16);
815
816 rw_enter(&irb_ptr->irb_lock, RW_WRITER);
817 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */
818 if (maxwalk == 0) {
819 rw_exit(&irb_ptr->irb_lock);
820 return (NULL);
821 }
822
823 hash %= maxwalk;
824 irb_refhold_locked(irb_ptr);
825 rw_exit(&irb_ptr->irb_lock);
826
827 /*
828 * Round-robin the routers list looking for a route that
829 * matches the passed in parameters.
830 * First we skip "hash" number of non-condemned IREs.
831 * Then we match the IRE.
832 * If we find an ire which has a non-zero ire_badcnt then we remember
833 * it and keep on looking for a lower ire_badcnt.
834 * If we come to the end of the list we continue (treat the
835 * bucket list as a circular list) but we match less than "max"
836 * entries.
837 */
838 ire = irb_ptr->irb_ire;
839 while (maxwalk > 0) {
840 if (IRE_IS_CONDEMNED(ire))
841 goto next_ire_skip;
842
843 /* Skip the first "hash" entries to do ECMP */
844 if (hash != 0) {
845 hash--;
846 goto next_ire_skip;
847 }
848
849 /* See CGTP comment above */
850 if (ire->ire_type != orig_ire->ire_type ||
851 ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
852 goto next_ire;
853
854 /*
855 * Note: Since IPv6 has hash buckets instead of radix
856 * buckers we need to explicitly compare the addresses.
857 * That makes this less efficient since we will be called
858 * even if there is no alternatives just because the
859 * bucket has multiple IREs for different addresses.
860 */
861 if (ire->ire_ipversion == IPV6_VERSION) {
862 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
863 &ire->ire_addr_v6))
864 goto next_ire;
865 }
866
867 /*
868 * For some reason find_best_route uses ire_mask. We do
869 * the same.
870 */
871 if (ire->ire_ipversion == IPV4_VERSION ?
872 !ire_match_args(ire, margs->ift_addr,
873 ire->ire_mask, margs->ift_gateway,
874 margs->ift_type, margs->ift_ill, margs->ift_zoneid,
875 margs->ift_tsl, margs->ift_flags) :
876 !ire_match_args_v6(ire, &margs->ift_addr_v6,
877 &ire->ire_mask_v6, &margs->ift_gateway_v6,
878 margs->ift_type, margs->ift_ill, margs->ift_zoneid,
879 margs->ift_tsl, margs->ift_flags))
880 goto next_ire;
881
882 if (margs->ift_zoneid != ALL_ZONES &&
883 (ire->ire_type & IRE_OFFLINK)) {
884 /*
885 * When we're in a zone, we're only
886 * interested in routers that are
887 * reachable through ipifs within our zone.
888 */
889 if (ire->ire_ipversion == IPV4_VERSION) {
890 if (!ire_gateway_ok_zone_v4(
891 ire->ire_gateway_addr, margs->ift_zoneid,
892 ire->ire_ill, margs->ift_tsl, ipst,
893 B_TRUE))
894 goto next_ire;
895 } else {
896 if (!ire_gateway_ok_zone_v6(
897 &ire->ire_gateway_addr_v6,
898 margs->ift_zoneid, ire->ire_ill,
899 margs->ift_tsl, ipst, B_TRUE))
900 goto next_ire;
901 }
902 }
903 mutex_enter(&ire->ire_lock);
904 /* Look for stale ire_badcnt and clear */
905 if (ire->ire_badcnt != 0 &&
906 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
907 ipst->ips_ip_ire_badcnt_lifetime))
908 ire->ire_badcnt = 0;
909 mutex_exit(&ire->ire_lock);
910
911 if (ire->ire_badcnt == 0) {
912 /* We found one with a zero badcnt; done */
913 ire_refhold(ire);
914 /*
915 * Care needed since irb_refrele grabs WLOCK to free
916 * the irb_t.
917 */
918 if (ire->ire_ipversion == IPV4_VERSION) {
919 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
920 irb_refrele(irb_ptr);
921 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
922 } else {
923 rw_exit(&ipst->ips_ip6_ire_head_lock);
924 irb_refrele(irb_ptr);
925 rw_enter(&ipst->ips_ip6_ire_head_lock,
926 RW_READER);
927 }
928 return (ire);
929 }
930 /*
931 * keep looking to see if there is a better (lower
932 * badcnt) matching IRE, but save this one as a last resort.
933 * If we find a lower badcnt pick that one as the last* resort.
934 */
935 if (maybe_ire == NULL) {
936 maybe_ire = ire;
937 maybe_badcnt = ire->ire_badcnt;
938 } else if (ire->ire_badcnt < maybe_badcnt) {
939 maybe_ire = ire;
940 maybe_badcnt = ire->ire_badcnt;
941 }
942
943 next_ire:
944 maxwalk--;
945 next_ire_skip:
946 ire = ire->ire_next;
947 if (ire == NULL)
948 ire = irb_ptr->irb_ire;
949 }
950 if (maybe_ire != NULL)
951 ire_refhold(maybe_ire);
952
953 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
954 if (ire->ire_ipversion == IPV4_VERSION) {
955 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
956 irb_refrele(irb_ptr);
957 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
958 } else {
959 rw_exit(&ipst->ips_ip6_ire_head_lock);
960 irb_refrele(irb_ptr);
961 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
962 }
963 return (maybe_ire);
964 }
965
966 void
irb_refhold_rn(struct radix_node * rn)967 irb_refhold_rn(struct radix_node *rn)
968 {
969 if ((rn->rn_flags & RNF_ROOT) == 0)
970 irb_refhold(&((rt_t *)(rn))->rt_irb);
971 }
972
973 void
irb_refrele_rn(struct radix_node * rn)974 irb_refrele_rn(struct radix_node *rn)
975 {
976 if ((rn->rn_flags & RNF_ROOT) == 0)
977 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
978 }
979
980
981 /*
982 * ip_select_src_ill() is used by ip_select_route() to find the src_ill
983 * to be used for source-aware routing table lookup. This function will
984 * ignore IPIF_UNNUMBERED interface addresses, and will only return a
985 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED
986 * interfaces).
987 */
988 static ill_t *
ip_select_src_ill(const in6_addr_t * v6src,zoneid_t zoneid,ip_stack_t * ipst)989 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst)
990 {
991 ipif_t *ipif;
992 ill_t *ill;
993 boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src);
994 ipaddr_t v4src;
995
996 if (isv6) {
997 ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst);
998 } else {
999 IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
1000 ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst);
1001 }
1002 if (ipif == NULL)
1003 return (NULL);
1004 ill = ipif->ipif_ill;
1005 ill_refhold(ill);
1006 ipif_refrele(ipif);
1007 return (ill);
1008 }
1009
1010 /*
1011 * verify that v6src is configured on ill
1012 */
1013 static boolean_t
ip_verify_src_on_ill(const in6_addr_t v6src,ill_t * ill,zoneid_t zoneid)1014 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid)
1015 {
1016 ipif_t *ipif;
1017 ip_stack_t *ipst;
1018 ipaddr_t v4src;
1019
1020 if (ill == NULL)
1021 return (B_FALSE);
1022 ipst = ill->ill_ipst;
1023
1024 if (ill->ill_isv6) {
1025 ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst);
1026 } else {
1027 IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
1028 ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst);
1029 }
1030
1031 if (ipif != NULL) {
1032 ipif_refrele(ipif);
1033 return (B_TRUE);
1034 } else {
1035 return (B_FALSE);
1036 }
1037 }
1038
1039 /*
1040 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
1041 * routes this routine sets up a ire_nce_cache as well. The caller needs to
1042 * lookup an nce for the multicast case.
1043 *
1044 * When src_multihoming is set to 2 (strict src multihoming) we use the source
1045 * address to select the interface and route. If IP_BOUND_IF etc are
1046 * specified, we require that they specify an interface on which the
1047 * source address is assigned.
1048 *
1049 * When src_multihoming is set to 1 (preferred src aware route
1050 * selection) the unicast lookup prefers a matching source
1051 * (i.e., that the route points out an ill on which the source is assigned), but
1052 * if no such route is found we fallback to not considering the source in the
1053 * route lookup.
1054 *
1055 * We skip the src_multihoming check when the source isn't (yet) set, and
1056 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send
1057 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO
1058 * when secpolicy_net_rawaccess().
1059 */
1060 ire_t *
ip_select_route(const in6_addr_t * v6dst,const in6_addr_t v6src,ip_xmit_attr_t * ixa,uint_t * generationp,in6_addr_t * setsrcp,int * errorp,boolean_t * multirtp)1061 ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
1062 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1063 int *errorp, boolean_t *multirtp)
1064 {
1065 uint_t match_args;
1066 uint_t ire_type;
1067 ill_t *ill = NULL;
1068 ire_t *ire;
1069 ip_stack_t *ipst = ixa->ixa_ipst;
1070 ipaddr_t v4dst;
1071 in6_addr_t v6nexthop;
1072 iaflags_t ixaflags = ixa->ixa_flags;
1073 nce_t *nce;
1074 boolean_t preferred_src_aware = B_FALSE;
1075 boolean_t verify_src;
1076 boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
1077 int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);
1078
1079 /*
1080 * We only verify that the src has been configured on a selected
1081 * interface if the src is not :: or INADDR_ANY, and if the
1082 * IXAF_VERIFY_SOURCE flag is set.
1083 */
1084 verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
1085 (ixa->ixa_flags & IXAF_VERIFY_SOURCE));
1086
1087 match_args = MATCH_IRE_SECATTR;
1088 IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
1089 if (setsrcp != NULL)
1090 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1091 if (errorp != NULL)
1092 ASSERT(*errorp == 0);
1093
1094 /*
1095 * The content of the ixa will be different if IP_NEXTHOP,
1096 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
1097 */
1098
1099 if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
1100 /* Pick up the IRE_MULTICAST for the ill */
1101 if (ixa->ixa_multicast_ifindex != 0) {
1102 ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
1103 isv6, ipst);
1104 } else if (ixaflags & IXAF_SCOPEID_SET) {
1105 /* sin6_scope_id takes precedence over ixa_ifindex */
1106 ASSERT(ixa->ixa_scopeid != 0);
1107 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1108 isv6, ipst);
1109 } else if (ixa->ixa_ifindex != 0) {
1110 /*
1111 * In the ipmp case, the ixa_ifindex is set to
1112 * point at an under_ill and we would return the
1113 * ire_multicast() corresponding to that under_ill.
1114 */
1115 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1116 isv6, ipst);
1117 } else if (src_multihoming != 0 && verify_src) {
1118 /* Look up the ill based on the source address */
1119 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1120 /*
1121 * Since we looked up the ill from the source there
1122 * is no need to verify that the source is on the ill
1123 * below.
1124 */
1125 verify_src = B_FALSE;
1126 if (ill != NULL && IS_VNI(ill)) {
1127 ill_t *usesrc = ill;
1128
1129 ill = ill_lookup_usesrc(usesrc);
1130 ill_refrele(usesrc);
1131 }
1132 } else if (!isv6) {
1133 ipaddr_t v4setsrc = INADDR_ANY;
1134
1135 ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
1136 ipst, multirtp, &v4setsrc);
1137 if (setsrcp != NULL)
1138 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1139 } else {
1140 ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
1141 ipst, multirtp, setsrcp);
1142 }
1143 if (ill != NULL && IS_VNI(ill)) {
1144 ill_refrele(ill);
1145 ill = NULL;
1146 }
1147 if (ill == NULL) {
1148 if (errorp != NULL)
1149 *errorp = ENXIO;
1150 /* Get a hold on the IRE_NOROUTE */
1151 ire = ire_reject(ipst, isv6);
1152 return (ire);
1153 }
1154 if (!(ill->ill_flags & ILLF_MULTICAST)) {
1155 ill_refrele(ill);
1156 if (errorp != NULL)
1157 *errorp = EHOSTUNREACH;
1158 /* Get a hold on the IRE_NOROUTE */
1159 ire = ire_reject(ipst, isv6);
1160 return (ire);
1161 }
1162 /*
1163 * If we are doing the strictest src_multihoming, then
1164 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
1165 * an interface that is consistent with the source address.
1166 */
1167 if (verify_src && src_multihoming == 2 &&
1168 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1169 if (errorp != NULL)
1170 *errorp = EADDRNOTAVAIL;
1171 ill_refrele(ill);
1172 /* Get a hold on the IRE_NOROUTE */
1173 ire = ire_reject(ipst, isv6);
1174 return (ire);
1175 }
1176 /* Get a refcnt on the single IRE_MULTICAST per ill */
1177 ire = ire_multicast(ill);
1178 ill_refrele(ill);
1179 if (generationp != NULL)
1180 *generationp = ire->ire_generation;
1181 if (errorp != NULL &&
1182 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
1183 *errorp = EHOSTUNREACH;
1184 }
1185 return (ire);
1186 }
1187
1188 /* Now for unicast */
1189 if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
1190 if (ixaflags & IXAF_SCOPEID_SET) {
1191 /* sin6_scope_id takes precedence over ixa_ifindex */
1192 ASSERT(ixa->ixa_scopeid != 0);
1193 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
1194 isv6, ipst);
1195 } else {
1196 ASSERT(ixa->ixa_ifindex != 0);
1197 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
1198 isv6, ipst);
1199 }
1200 if (ill != NULL && IS_VNI(ill)) {
1201 ill_refrele(ill);
1202 ill = NULL;
1203 }
1204 if (ill == NULL) {
1205 if (errorp != NULL)
1206 *errorp = ENXIO;
1207 /* Get a hold on the IRE_NOROUTE */
1208 ire = ire_reject(ipst, isv6);
1209 return (ire);
1210 }
1211
1212 match_args |= MATCH_IRE_ILL;
1213
1214 /*
1215 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
1216 * so for both of them we need to be able look for an under
1217 * interface.
1218 */
1219 if (IS_UNDER_IPMP(ill))
1220 match_args |= MATCH_IRE_TESTHIDDEN;
1221
1222 /*
1223 * If we are doing the strictest src_multihoming, then
1224 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
1225 * an interface that is consistent with the source address.
1226 */
1227 if (src_multihoming == 2 &&
1228 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
1229 if (errorp != NULL)
1230 *errorp = EADDRNOTAVAIL;
1231 ill_refrele(ill);
1232 /* Get a hold on the IRE_NOROUTE */
1233 ire = ire_reject(ipst, isv6);
1234 return (ire);
1235 }
1236 } else if (src_multihoming != 0 && verify_src) {
1237 /* Look up the ill based on the source address */
1238 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
1239 if (ill == NULL) {
1240 char addrbuf[INET6_ADDRSTRLEN];
1241
1242 ip3dbg(("%s not a valid src for unicast",
1243 inet_ntop(AF_INET6, &v6src, addrbuf,
1244 sizeof (addrbuf))));
1245 if (errorp != NULL)
1246 *errorp = EADDRNOTAVAIL;
1247 /* Get a hold on the IRE_NOROUTE */
1248 ire = ire_reject(ipst, isv6);
1249 return (ire);
1250 }
1251 match_args |= MATCH_IRE_SRC_ILL;
1252 preferred_src_aware = (src_multihoming == 1);
1253 }
1254
1255 if (ixaflags & IXAF_NEXTHOP_SET) {
1256 /* IP_NEXTHOP was set */
1257 v6nexthop = ixa->ixa_nexthop_v6;
1258 } else {
1259 v6nexthop = *v6dst;
1260 }
1261
1262 ire_type = 0;
1263
1264 /*
1265 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
1266 * we only look for an onlink IRE.
1267 */
1268 if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
1269 match_args |= MATCH_IRE_TYPE;
1270 ire_type = IRE_ONLINK;
1271 }
1272
1273 retry:
1274 if (!isv6) {
1275 ipaddr_t v4nexthop;
1276 ipaddr_t v4setsrc = INADDR_ANY;
1277
1278 IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
1279 ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
1280 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1281 ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
1282 if (setsrcp != NULL)
1283 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
1284 } else {
1285 ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
1286 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
1287 ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
1288 }
1289
1290 #ifdef DEBUG
1291 if (match_args & MATCH_IRE_TESTHIDDEN) {
1292 ip3dbg(("looking for hidden; dst %x ire %p\n",
1293 v4dst, (void *)ire));
1294 }
1295 #endif
1296 if (ill != NULL) {
1297 ill_refrele(ill);
1298 ill = NULL;
1299 }
1300 if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1301 (ire->ire_type & IRE_MULTICAST)) {
1302 if (preferred_src_aware) {
1303 /*
1304 * "Preferred Source Aware" send mode. If we cannot
1305 * find an ire whose ire_ill had the desired source
1306 * address retry after relaxing the ill matching
1307 * constraint.
1308 */
1309 ire_refrele(ire);
1310 preferred_src_aware = B_FALSE;
1311 match_args &= ~MATCH_IRE_SRC_ILL;
1312 goto retry;
1313 }
1314 /* No ire_nce_cache */
1315 return (ire);
1316 }
1317
1318 /* Setup ire_nce_cache if it doesn't exist or is condemned. */
1319 mutex_enter(&ire->ire_lock);
1320 nce = ire->ire_nce_cache;
1321 if (nce == NULL || nce->nce_is_condemned) {
1322 mutex_exit(&ire->ire_lock);
1323 (void) ire_revalidate_nce(ire);
1324 } else {
1325 mutex_exit(&ire->ire_lock);
1326 }
1327 return (ire);
1328 }
1329
1330 /*
1331 * Find a route given some xmit attributes and a packet.
1332 * Generic for IPv4 and IPv6
1333 *
1334 * This never returns NULL. But when it returns the IRE_NOROUTE
1335 * it might set errorp.
1336 */
1337 ire_t *
ip_select_route_pkt(mblk_t * mp,ip_xmit_attr_t * ixa,uint_t * generationp,int * errorp,boolean_t * multirtp)1338 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
1339 int *errorp, boolean_t *multirtp)
1340 {
1341 if (ixa->ixa_flags & IXAF_IS_IPV4) {
1342 ipha_t *ipha = (ipha_t *)mp->b_rptr;
1343 in6_addr_t v6dst, v6src;
1344
1345 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
1346 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
1347
1348 return (ip_select_route(&v6dst, v6src, ixa, generationp,
1349 NULL, errorp, multirtp));
1350 } else {
1351 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1352
1353 return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src,
1354 ixa, generationp, NULL, errorp, multirtp));
1355 }
1356 }
1357
1358 ire_t *
ip_select_route_v4(ipaddr_t dst,ipaddr_t src,ip_xmit_attr_t * ixa,uint_t * generationp,ipaddr_t * v4setsrcp,int * errorp,boolean_t * multirtp)1359 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa,
1360 uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
1361 {
1362 in6_addr_t v6dst, v6src;
1363 ire_t *ire;
1364 in6_addr_t setsrc;
1365
1366 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
1367
1368 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
1369 IN6_IPADDR_TO_V4MAPPED(src, &v6src);
1370
1371 setsrc = ipv6_all_zeros;
1372 ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp,
1373 multirtp);
1374 if (v4setsrcp != NULL)
1375 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
1376 return (ire);
1377 }
1378
1379 /*
1380 * Recursively look for a route to the destination. Can also match on
1381 * the zoneid, ill, and label. Used for the data paths. See also
1382 * ire_route_recursive.
1383 *
1384 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1385 * create an IRE_IF_CLONE. This is used on the receive side when we are not
1386 * forwarding.
1387 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1388 * resolve the gateway.
1389 *
1390 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1391 * instead.
1392 *
1393 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1394 * is an error.
1395 * Allow at most one RTF_INDIRECT.
1396 */
1397 ire_t *
ire_route_recursive_impl_v4(ire_t * ire,ipaddr_t nexthop,uint_t ire_type,const ill_t * ill_arg,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,ipaddr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)1398 ire_route_recursive_impl_v4(ire_t *ire,
1399 ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
1400 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1401 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1402 tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1403 {
1404 int i, j;
1405 ire_t *ires[MAX_IRE_RECURSION];
1406 uint_t generation;
1407 uint_t generations[MAX_IRE_RECURSION];
1408 boolean_t need_refrele = B_FALSE;
1409 boolean_t invalidate = B_FALSE;
1410 ill_t *ill = NULL;
1411 uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);
1412
1413 if (setsrcp != NULL)
1414 ASSERT(*setsrcp == INADDR_ANY);
1415 if (gwattrp != NULL)
1416 ASSERT(*gwattrp == NULL);
1417
1418 /*
1419 * We iterate up to three times to resolve a route, even though
1420 * we have four slots in the array. The extra slot is for an
1421 * IRE_IF_CLONE we might need to create.
1422 */
1423 i = 0;
1424 while (i < MAX_IRE_RECURSION - 1) {
1425 /* ire_ftable_lookup handles round-robin/ECMP */
1426 if (ire == NULL) {
1427 ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
1428 (ill != NULL? ill : ill_arg), zoneid, tsl,
1429 match_args, xmit_hint, ipst, &generation);
1430 } else {
1431 /* Caller passed it; extra hold since we will rele */
1432 ire_refhold(ire);
1433 if (generationp != NULL)
1434 generation = *generationp;
1435 else
1436 generation = IRE_GENERATION_VERIFY;
1437 }
1438 if (ire == NULL) {
1439 if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
1440 ire = ires[0];
1441 ire_refhold(ire);
1442 } else {
1443 ire = ire_reject(ipst, B_FALSE);
1444 }
1445 goto error;
1446 }
1447
1448 /* Need to return the ire with RTF_REJECT|BLACKHOLE */
1449 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1450 goto error;
1451
1452 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1453 /*
1454 * Verify that the IRE_IF_CLONE has a consistent generation
1455 * number.
1456 */
1457 if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
1458 ire_refrele(ire);
1459 ire = NULL;
1460 continue;
1461 }
1462
1463 /*
1464 * Don't allow anything unusual past the first iteration.
1465 * After the first lookup, we should no longer look for
1466 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
1467 * routes.
1468 *
1469 * In addition, after we have found a direct IRE_OFFLINK,
1470 * we should only look for interface or clone routes.
1471 */
1472 match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1473
1474 if ((ire->ire_type & IRE_OFFLINK) &&
1475 !(ire->ire_flags & RTF_INDIRECT)) {
1476 ire_type = IRE_IF_ALL;
1477 } else {
1478 /*
1479 * no more local, loopback, broadcast routes
1480 */
1481 if (!(match_args & MATCH_IRE_TYPE))
1482 ire_type = (IRE_OFFLINK|IRE_ONLINK);
1483 ire_type &= ~maskoff;
1484 }
1485 match_args |= MATCH_IRE_TYPE;
1486
1487 /* We have a usable IRE */
1488 ires[i] = ire;
1489 generations[i] = generation;
1490 i++;
1491
1492 /* The first RTF_SETSRC address is passed back if setsrcp */
1493 if ((ire->ire_flags & RTF_SETSRC) &&
1494 setsrcp != NULL && *setsrcp == INADDR_ANY) {
1495 ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
1496 *setsrcp = ire->ire_setsrc_addr;
1497 }
1498
1499 /* The first ire_gw_secattr is passed back if gwattrp */
1500 if (ire->ire_gw_secattr != NULL &&
1501 gwattrp != NULL && *gwattrp == NULL)
1502 *gwattrp = ire->ire_gw_secattr;
1503
1504 /*
1505 * Check if we have a short-cut pointer to an IRE for this
1506 * destination, and that the cached dependency isn't stale.
1507 * In that case we've rejoined an existing tree towards a
1508 * parent, thus we don't need to continue the loop to
1509 * discover the rest of the tree.
1510 */
1511 mutex_enter(&ire->ire_lock);
1512 if (ire->ire_dep_parent != NULL &&
1513 ire->ire_dep_parent->ire_generation ==
1514 ire->ire_dep_parent_generation) {
1515 mutex_exit(&ire->ire_lock);
1516 ire = NULL;
1517 goto done;
1518 }
1519 mutex_exit(&ire->ire_lock);
1520
1521 /*
1522 * If this type should have an ire_nce_cache (even if it
1523 * doesn't yet have one) then we are done. Includes
1524 * IRE_INTERFACE with a full 32 bit mask.
1525 */
1526 if (ire->ire_nce_capable) {
1527 ire = NULL;
1528 goto done;
1529 }
1530 ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1531 /*
1532 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1533 * particular destination
1534 */
1535 if (ire->ire_type & IRE_INTERFACE) {
1536 in6_addr_t v6nexthop;
1537 ire_t *clone;
1538
1539 ASSERT(ire->ire_masklen != IPV4_ABITS);
1540
1541 /*
1542 * In the case of ip_input and ILLF_FORWARDING not
1543 * being set, and in the case of RTM_GET, there is
1544 * no point in allocating an IRE_IF_CLONE. We return
1545 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1546 * result in a ire_dep_parent which is IRE_IF_*
1547 * without an IRE_IF_CLONE.
1548 * We recover from that when we need to send packets
1549 * by ensuring that the generations become
1550 * IRE_GENERATION_VERIFY in this case.
1551 */
1552 if (!(irr_flags & IRR_ALLOCATE)) {
1553 invalidate = B_TRUE;
1554 ire = NULL;
1555 goto done;
1556 }
1557
1558 IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
1559
1560 clone = ire_create_if_clone(ire, &v6nexthop,
1561 &generation);
1562 if (clone == NULL) {
1563 /*
1564 * Temporary failure - no memory.
1565 * Don't want caller to cache IRE_NOROUTE.
1566 */
1567 invalidate = B_TRUE;
1568 ire = ire_blackhole(ipst, B_FALSE);
1569 goto error;
1570 }
1571 /*
1572 * Make clone next to last entry and the
1573 * IRE_INTERFACE the last in the dependency
1574 * chain since the clone depends on the
1575 * IRE_INTERFACE.
1576 */
1577 ASSERT(i >= 1);
1578 ASSERT(i < MAX_IRE_RECURSION);
1579
1580 ires[i] = ires[i-1];
1581 generations[i] = generations[i-1];
1582 ires[i-1] = clone;
1583 generations[i-1] = generation;
1584 i++;
1585
1586 ire = NULL;
1587 goto done;
1588 }
1589
1590 /*
1591 * We only match on the type and optionally ILL when
1592 * recursing. The type match is used by some callers
1593 * to exclude certain types (such as IRE_IF_CLONE or
1594 * IRE_LOCAL|IRE_LOOPBACK).
1595 *
1596 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1597 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1598 * ire_ill, so we set ill to the ire_ill;
1599 */
1600 match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1601 nexthop = ire->ire_gateway_addr;
1602 if (ill == NULL && ire->ire_ill != NULL) {
1603 ill = ire->ire_ill;
1604 need_refrele = B_TRUE;
1605 ill_refhold(ill);
1606 match_args |= MATCH_IRE_ILL;
1607 }
1608 ire = NULL;
1609 }
1610 ASSERT(ire == NULL);
1611 ire = ire_reject(ipst, B_FALSE);
1612
1613 error:
1614 ASSERT(ire != NULL);
1615 if (need_refrele)
1616 ill_refrele(ill);
1617
1618 /*
1619 * In the case of MULTIRT we want to try a different IRE the next
1620 * time. We let the next packet retry in that case.
1621 */
1622 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1623 (void) ire_no_good(ires[0]);
1624
1625 cleanup:
1626 /* cleanup ires[i] */
1627 ire_dep_unbuild(ires, i);
1628 for (j = 0; j < i; j++)
1629 ire_refrele(ires[j]);
1630
1631 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1632 (irr_flags & IRR_INCOMPLETE));
1633 /*
1634 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1635 * ip_select_route since the reject or lack of memory might be gone.
1636 */
1637 if (generationp != NULL)
1638 *generationp = IRE_GENERATION_VERIFY;
1639 return (ire);
1640
1641 done:
1642 ASSERT(ire == NULL);
1643 if (need_refrele) {
1644 ill_refrele(ill);
1645 ill = NULL;
1646 }
1647
1648 /* Build dependencies */
1649 if (i > 1 && !ire_dep_build(ires, generations, i)) {
1650 /* Something in chain was condemned; tear it apart */
1651 ire = ire_reject(ipst, B_FALSE);
1652 goto cleanup;
1653 }
1654
1655 /*
1656 * Release all refholds except the one for ires[0] that we
1657 * will return to the caller.
1658 */
1659 for (j = 1; j < i; j++)
1660 ire_refrele(ires[j]);
1661
1662 if (invalidate) {
1663 /*
1664 * Since we needed to allocate but couldn't we need to make
1665 * sure that the dependency chain is rebuilt the next time.
1666 */
1667 ire_dep_invalidate_generations(ires[0]);
1668 generation = IRE_GENERATION_VERIFY;
1669 } else {
1670 /*
1671 * IREs can have been added or deleted while we did the
1672 * recursive lookup and we can't catch those until we've built
1673 * the dependencies. We verify the stored
1674 * ire_dep_parent_generation to catch any such changes and
1675 * return IRE_GENERATION_VERIFY (which will cause
1676 * ip_select_route to be called again so we can redo the
1677 * recursive lookup next time we send a packet.
1678 */
1679 if (ires[0]->ire_dep_parent == NULL)
1680 generation = ires[0]->ire_generation;
1681 else
1682 generation = ire_dep_validate_generations(ires[0]);
1683 if (generations[0] != ires[0]->ire_generation) {
1684 /* Something changed at the top */
1685 generation = IRE_GENERATION_VERIFY;
1686 }
1687 }
1688 if (generationp != NULL)
1689 *generationp = generation;
1690
1691 return (ires[0]);
1692 }
1693
1694 ire_t *
ire_route_recursive_v4(ipaddr_t nexthop,uint_t ire_type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,ipaddr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)1695 ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
1696 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1697 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
1698 tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1699 {
1700 return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
1701 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1702 gwattrp, generationp));
1703 }
1704
1705 /*
1706 * Recursively look for a route to the destination.
1707 * We only handle a destination match here, yet we have the same arguments
1708 * as the full match to allow function pointers to select between the two.
1709 *
1710 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1711 * instead.
1712 *
1713 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1714 * is an error.
1715 * Allow at most one RTF_INDIRECT.
1716 */
1717 ire_t *
ire_route_recursive_dstonly_v4(ipaddr_t nexthop,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst)1718 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
1719 uint32_t xmit_hint, ip_stack_t *ipst)
1720 {
1721 ire_t *ire;
1722 ire_t *ire1;
1723 uint_t generation;
1724
1725 /* ire_ftable_lookup handles round-robin/ECMP */
1726 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
1727 &generation);
1728 ASSERT(ire != NULL);
1729 /*
1730 * If the IRE has a current cached parent we know that the whole
1731 * parent chain is current, hence we don't need to discover and
1732 * build any dependencies by doing a recursive lookup.
1733 */
1734 mutex_enter(&ire->ire_lock);
1735 if (ire->ire_dep_parent != NULL) {
1736 if (ire->ire_dep_parent->ire_generation ==
1737 ire->ire_dep_parent_generation) {
1738 mutex_exit(&ire->ire_lock);
1739 return (ire);
1740 }
1741 mutex_exit(&ire->ire_lock);
1742 } else {
1743 mutex_exit(&ire->ire_lock);
1744 /*
1745 * If this type should have an ire_nce_cache (even if it
1746 * doesn't yet have one) then we are done. Includes
1747 * IRE_INTERFACE with a full 32 bit mask.
1748 */
1749 if (ire->ire_nce_capable)
1750 return (ire);
1751 }
1752
1753 /*
1754 * Fallback to loop in the normal code starting with the ire
1755 * we found. Normally this would return the same ire.
1756 */
1757 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
1758 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1759 &generation);
1760 ire_refrele(ire);
1761 return (ire1);
1762 }
1763
1764 /*
1765 * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE
1766 * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they
1767 * are not consistent, and TRUE otherwise.
1768 */
1769 boolean_t
ire_clone_verify(ire_t * ire)1770 ire_clone_verify(ire_t *ire)
1771 {
1772 ASSERT((ire->ire_type & IRE_IF_CLONE) != 0);
1773 mutex_enter(&ire->ire_lock);
1774 if (ire->ire_dep_parent != NULL &&
1775 ire->ire_dep_parent->ire_generation !=
1776 ire->ire_dep_parent_generation) {
1777 mutex_exit(&ire->ire_lock);
1778 ire_delete(ire);
1779 return (B_FALSE);
1780 }
1781 mutex_exit(&ire->ire_lock);
1782 return (B_TRUE);
1783 }
1784