1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 1990 Mentat Inc.
24 */
25
26 /*
27 * This file contains routines that manipulate Internet Routing Entries (IREs).
28 */
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/ddi.h>
33 #include <sys/cmn_err.h>
34
35 #include <sys/systm.h>
36 #include <sys/param.h>
37 #include <sys/socket.h>
38 #include <net/if.h>
39 #include <net/route.h>
40 #include <netinet/in.h>
41 #include <net/if_dl.h>
42 #include <netinet/ip6.h>
43 #include <netinet/icmp6.h>
44
45 #include <inet/common.h>
46 #include <inet/mi.h>
47 #include <inet/ip.h>
48 #include <inet/ip6.h>
49 #include <inet/ip_ndp.h>
50 #include <inet/ip_if.h>
51 #include <inet/ip_ire.h>
52 #include <inet/ipclassifier.h>
53 #include <inet/nd.h>
54 #include <inet/tunables.h>
55 #include <sys/kmem.h>
56 #include <sys/zone.h>
57
58 #include <sys/tsol/label.h>
59 #include <sys/tsol/tnet.h>
60
61 #define IS_DEFAULT_ROUTE_V6(ire) \
62 (((ire)->ire_type & IRE_DEFAULT) || \
63 (((ire)->ire_type & IRE_INTERFACE) && \
64 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
65
66 static ire_t ire_null;
67
68 static ire_t *
69 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
70 const in6_addr_t *gateway, int type, const ill_t *ill,
71 zoneid_t zoneid, const ts_label_t *tsl, int flags,
72 ip_stack_t *ipst);
73
74 /*
75 * Initialize the ire that is specific to IPv6 part and call
76 * ire_init_common to finish it.
77 * Returns zero or errno.
78 */
79 int
ire_init_v6(ire_t * ire,const in6_addr_t * v6addr,const in6_addr_t * v6mask,const in6_addr_t * v6gateway,ushort_t type,ill_t * ill,zoneid_t zoneid,uint_t flags,tsol_gc_t * gc,ip_stack_t * ipst)80 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
81 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
82 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
83 {
84 int error;
85
86 /*
87 * Reject IRE security attmakeribute creation/initialization
88 * if system is not running in Trusted mode.
89 */
90 if (gc != NULL && !is_system_labeled())
91 return (EINVAL);
92
93 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
94 if (v6addr != NULL)
95 ire->ire_addr_v6 = *v6addr;
96 if (v6gateway != NULL)
97 ire->ire_gateway_addr_v6 = *v6gateway;
98
99 /* Make sure we don't have stray values in some fields */
100 switch (type) {
101 case IRE_LOOPBACK:
102 case IRE_HOST:
103 case IRE_LOCAL:
104 case IRE_IF_CLONE:
105 ire->ire_mask_v6 = ipv6_all_ones;
106 ire->ire_masklen = IPV6_ABITS;
107 break;
108 case IRE_PREFIX:
109 case IRE_DEFAULT:
110 case IRE_IF_RESOLVER:
111 case IRE_IF_NORESOLVER:
112 if (v6mask != NULL) {
113 ire->ire_mask_v6 = *v6mask;
114 ire->ire_masklen =
115 ip_mask_to_plen_v6(&ire->ire_mask_v6);
116 }
117 break;
118 case IRE_MULTICAST:
119 case IRE_NOROUTE:
120 ASSERT(v6mask == NULL);
121 break;
122 default:
123 ASSERT(0);
124 return (EINVAL);
125 }
126
127 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
128 gc, ipst);
129 if (error != NULL)
130 return (error);
131
132 /* Determine which function pointers to use */
133 ire->ire_postfragfn = ip_xmit; /* Common case */
134
135 switch (ire->ire_type) {
136 case IRE_LOCAL:
137 ire->ire_sendfn = ire_send_local_v6;
138 ire->ire_recvfn = ire_recv_local_v6;
139 ASSERT(ire->ire_ill != NULL);
140 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
141 ire->ire_recvfn = ire_recv_noaccept_v6;
142 break;
143 case IRE_LOOPBACK:
144 ire->ire_sendfn = ire_send_local_v6;
145 ire->ire_recvfn = ire_recv_loopback_v6;
146 break;
147 case IRE_MULTICAST:
148 ire->ire_postfragfn = ip_postfrag_loopcheck;
149 ire->ire_sendfn = ire_send_multicast_v6;
150 ire->ire_recvfn = ire_recv_multicast_v6;
151 break;
152 default:
153 /*
154 * For IRE_IF_ALL and IRE_OFFLINK we forward received
155 * packets by default.
156 */
157 ire->ire_sendfn = ire_send_wire_v6;
158 ire->ire_recvfn = ire_recv_forward_v6;
159 break;
160 }
161 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
162 ire->ire_sendfn = ire_send_noroute_v6;
163 ire->ire_recvfn = ire_recv_noroute_v6;
164 } else if (ire->ire_flags & RTF_MULTIRT) {
165 ire->ire_postfragfn = ip_postfrag_multirt_v6;
166 ire->ire_sendfn = ire_send_multirt_v6;
167 ire->ire_recvfn = ire_recv_multirt_v6;
168 }
169 ire->ire_nce_capable = ire_determine_nce_capable(ire);
170 return (0);
171 }
172
173 /*
174 * ire_create_v6 is called to allocate and initialize a new IRE.
175 *
176 * NOTE : This is called as writer sometimes though not required
177 * by this function.
178 */
179 /* ARGSUSED */
180 ire_t *
ire_create_v6(const in6_addr_t * v6addr,const in6_addr_t * v6mask,const in6_addr_t * v6gateway,ushort_t type,ill_t * ill,zoneid_t zoneid,uint_t flags,tsol_gc_t * gc,ip_stack_t * ipst)181 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
182 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
183 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
184 {
185 ire_t *ire;
186 int error;
187
188 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
189
190 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
191 if (ire == NULL) {
192 DTRACE_PROBE(kmem__cache__alloc);
193 return (NULL);
194 }
195 *ire = ire_null;
196
197 error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
198 type, ill, zoneid, flags, gc, ipst);
199
200 if (error != 0) {
201 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
202 kmem_cache_free(ire_cache, ire);
203 return (NULL);
204 }
205 return (ire);
206 }
207
208 /*
209 * Find the ill matching a multicast group.
210 * Allows different routes for multicast addresses
211 * in the unicast routing table (akin to FF::0/8 but could be more specific)
212 * which point at different interfaces. This is used when IPV6_MULTICAST_IF
213 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
214 * specify the interface to join on.
215 *
216 * Supports link-local addresses by using ire_route_recursive which follows
217 * the ill when recursing.
218 *
219 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
220 * and the MULTIRT property can be different for different groups, we
221 * extract RTF_MULTIRT from the special unicast route added for a group
222 * with CGTP and pass that back in the multirtp argument.
223 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
224 * We have a setsrcp argument for the same reason.
225 */
226 ill_t *
ire_lookup_multi_ill_v6(const in6_addr_t * group,zoneid_t zoneid,ip_stack_t * ipst,boolean_t * multirtp,in6_addr_t * setsrcp)227 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
228 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
229 {
230 ire_t *ire;
231 ill_t *ill;
232
233 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
234 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
235 ASSERT(ire != NULL);
236
237 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
238 ire_refrele(ire);
239 return (NULL);
240 }
241
242 if (multirtp != NULL)
243 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
244
245 ill = ire_nexthop_ill(ire);
246 ire_refrele(ire);
247 return (ill);
248 }
249
250 /*
251 * This function takes a mask and returns number of bits set in the
252 * mask (the represented prefix length). Assumes a contiguous mask.
253 */
254 int
ip_mask_to_plen_v6(const in6_addr_t * v6mask)255 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
256 {
257 int bits;
258 int plen = IPV6_ABITS;
259 int i;
260
261 for (i = 3; i >= 0; i--) {
262 if (v6mask->s6_addr32[i] == 0) {
263 plen -= 32;
264 continue;
265 }
266 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
267 if (bits == 0)
268 break;
269 plen -= bits;
270 }
271
272 return (plen);
273 }
274
275 /*
276 * Convert a prefix length to the mask for that prefix.
277 * Returns the argument bitmask.
278 */
279 in6_addr_t *
ip_plen_to_mask_v6(uint_t plen,in6_addr_t * bitmask)280 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
281 {
282 uint32_t *ptr;
283
284 if (plen < 0 || plen > IPV6_ABITS)
285 return (NULL);
286 *bitmask = ipv6_all_zeros;
287 if (plen == 0)
288 return (bitmask);
289
290 ptr = (uint32_t *)bitmask;
291 while (plen > 32) {
292 *ptr++ = 0xffffffffU;
293 plen -= 32;
294 }
295 *ptr = htonl(0xffffffffU << (32 - plen));
296 return (bitmask);
297 }
298
299 /*
300 * Add a fully initialized IPv6 IRE to the forwarding table.
301 * This returns NULL on failure, or a held IRE on success.
302 * Normally the returned IRE is the same as the argument. But a different
303 * IRE will be returned if the added IRE is deemed identical to an existing
304 * one. In that case ire_identical_ref will be increased.
305 * The caller always needs to do an ire_refrele() on the returned IRE.
306 */
ire_t *
ire_add_v6(ire_t *ire)
{
	ire_t	*ire1;
	int	mask_table_index;
	irb_t	*irb_ptr;
	ire_t	**irep;
	int	match_flags;
	int	error;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);

	/* Make sure the address is properly masked. */
	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);

	/*
	 * The forwarding table is an array indexed by prefix length;
	 * each per-prefix-length hash table is allocated lazily here on
	 * first insertion.
	 */
	mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
	if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
		irb_t *ptr;
		int i;

		ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
		    sizeof (irb_t)));
		if (ptr == NULL) {
			ire_delete(ire);
			return (NULL);
		}
		for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
			rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
			ptr[i].irb_ipst = ipst;
		}
		/*
		 * Publish the new hash table under ips_ire_ft_init_lock;
		 * re-check in case another thread installed one first.
		 */
		mutex_enter(&ipst->ips_ire_ft_init_lock);
		if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
		    NULL) {
			ipst->ips_ip_forwarding_table_v6[mask_table_index] =
			    ptr;
			mutex_exit(&ipst->ips_ire_ft_init_lock);
		} else {
			/*
			 * Some other thread won the race in
			 * initializing the forwarding table at the
			 * same index.
			 */
			mutex_exit(&ipst->ips_ire_ft_init_lock);
			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
				rw_destroy(&ptr[i].irb_lock);
			}
			mi_free(ptr);
		}
	}
	irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
	    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
	    ipst->ips_ip6_ftable_hash_size)]);

	match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
	if (ire->ire_ill != NULL)
		match_flags |= MATCH_IRE_ILL;
	/*
	 * Start the atomic add of the ire. Grab the bucket lock and the
	 * ill lock. Check for condemned.
	 */
	error = ire_atomic_start(irb_ptr, ire);
	if (error != 0) {
		ire_delete(ire);
		return (NULL);
	}

	/*
	 * If we are creating a hidden IRE, make sure we search for
	 * hidden IREs when searching for duplicates below.
	 * Otherwise, we might find an IRE on some other interface
	 * that's not marked hidden.
	 */
	if (ire->ire_testhidden)
		match_flags |= MATCH_IRE_TESTHIDDEN;

	/*
	 * Atomically check for duplicate and insert in the table.
	 */
	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		if (IRE_IS_CONDEMNED(ire1))
			continue;
		/*
		 * Here we need an exact match on zoneid, i.e.,
		 * ire_match_args doesn't fit.
		 */
		if (ire1->ire_zoneid != ire->ire_zoneid)
			continue;

		if (ire1->ire_type != ire->ire_type)
			continue;

		/*
		 * Note: We do not allow multiple routes that differ only
		 * in the gateway security attributes; such routes are
		 * considered duplicates.
		 * To change that we explicitly have to treat them as
		 * different here.
		 */
		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
		    ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
		    match_flags)) {
			/*
			 * Return the old ire after doing a REFHOLD.
			 * As most of the callers continue to use the IRE
			 * after adding, we return a held ire. This will
			 * avoid a lookup in the caller again. If the callers
			 * don't want to use it, they need to do a REFRELE.
			 *
			 * We only allow exactly one IRE_IF_CLONE for any dst,
			 * so, if the is an IF_CLONE, return the ire without
			 * an identical_ref, but with an ire_ref held.
			 */
			if (ire->ire_type != IRE_IF_CLONE) {
				atomic_add_32(&ire1->ire_identical_ref, 1);
				DTRACE_PROBE2(ire__add__exist, ire_t *, ire1,
				    ire_t *, ire);
			}
			ip1dbg(("found dup ire existing %p new %p",
			    (void *)ire1, (void *)ire));
			ire_refhold(ire1);
			ire_atomic_end(irb_ptr, ire);
			/* The new ire is redundant; destroy it. */
			ire_delete(ire);
			return (ire1);
		}
	}

	/*
	 * Normally we do head insertion since most things do not care about
	 * the order of the IREs in the bucket.
	 * However, due to shared-IP zones (and restrict_interzone_loopback)
	 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
	 * address. For that reason we do tail insertion for IRE_IF_CLONE.
	 */
	irep = (ire_t **)irb_ptr;
	if (ire->ire_type & IRE_IF_CLONE) {
		while ((ire1 = *irep) != NULL)
			irep = &ire1->ire_next;
	}
	/* Insert at *irep */
	ire1 = *irep;
	if (ire1 != NULL)
		ire1->ire_ptpn = &ire->ire_next;
	ire->ire_next = ire1;
	/* Link the new one in. */
	ire->ire_ptpn = irep;
	/*
	 * ire_walk routines de-reference ire_next without holding
	 * a lock. Before we point to the new ire, we want to make
	 * sure the store that sets the ire_next of the new ire
	 * reaches global visibility, so that ire_walk routines
	 * don't see a truncated list of ires i.e if the ire_next
	 * of the new ire gets set after we do "*irep = ire" due
	 * to re-ordering, the ire_walk thread will see a NULL
	 * once it accesses the ire_next of the new ire.
	 * membar_producer() makes sure that the following store
	 * happens *after* all of the above stores.
	 */
	membar_producer();
	*irep = ire;
	ire->ire_bucket = irb_ptr;
	/*
	 * We return a bumped up IRE above. Keep it symmetrical
	 * so that the callers will always have to release. This
	 * helps the callers of this function because they continue
	 * to use the IRE after adding and hence they don't have to
	 * lookup again after we return the IRE.
	 *
	 * NOTE : We don't have to use atomics as this is appearing
	 * in the list for the first time and no one else can bump
	 * up the reference count on this yet.
	 */
	ire_refhold_locked(ire);
	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
	irb_ptr->irb_ire_cnt++;

	/* Account the new IRE against its outgoing ill, if it has one. */
	if (ire->ire_ill != NULL) {
		DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
		    (char *), "ire", (void *), ire);
		ire->ire_ill->ill_ire_cnt++;
		ASSERT(ire->ire_ill->ill_ire_cnt != 0);	/* Wraparound */
	}
	ire_atomic_end(irb_ptr, ire);

	/* Make any caching of the IREs be notified or updated */
	ire_flush_cache_v6(ire, IRE_FLUSH_ADD);

	return (ire);
}
497
498 /*
499 * Search for all HOST REDIRECT routes that are
500 * pointing at the specified gateway and
501 * delete them. This routine is called only
502 * when a default gateway is going away.
503 */
504 static void
ire_delete_host_redirects_v6(const in6_addr_t * gateway,ip_stack_t * ipst)505 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
506 {
507 irb_t *irb_ptr;
508 irb_t *irb;
509 ire_t *ire;
510 in6_addr_t gw_addr_v6;
511 int i;
512
513 /* get the hash table for HOST routes */
514 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
515 if (irb_ptr == NULL)
516 return;
517 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
518 irb = &irb_ptr[i];
519 irb_refhold(irb);
520 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
521 if (!(ire->ire_flags & RTF_DYNAMIC))
522 continue;
523 mutex_enter(&ire->ire_lock);
524 gw_addr_v6 = ire->ire_gateway_addr_v6;
525 mutex_exit(&ire->ire_lock);
526 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
527 ire_delete(ire);
528 }
529 irb_refrele(irb);
530 }
531 }
532
533 /*
534 * Delete the specified IRE.
535 * All calls should use ire_delete().
536 * Sometimes called as writer though not required by this function.
537 *
538 * NOTE : This function is called only if the ire was added
539 * in the list.
540 */
void
ire_delete_v6(ire_t *ire)
{
	in6_addr_t gw_addr_v6;
	ip_stack_t *ipst = ire->ire_ipst;

	/*
	 * Make sure ire_generation increases from ire_flush_cache happen
	 * after any lookup/reader has read ire_generation.
	 * Since the rw_enter makes us wait until any lookup/reader has
	 * completed we can exit the lock immediately.
	 */
	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
	rw_exit(&ipst->ips_ip6_ire_head_lock);

	ASSERT(ire->ire_refcnt >= 1);
	ASSERT(ire->ire_ipversion == IPV6_VERSION);

	/* Bump generations so cached references notice the deletion. */
	ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);

	if (ire->ire_type == IRE_DEFAULT) {
		/*
		 * when a default gateway is going away
		 * delete all the host redirects pointing at that
		 * gateway.
		 */
		/* Copy the gateway address under ire_lock before use. */
		mutex_enter(&ire->ire_lock);
		gw_addr_v6 = ire->ire_gateway_addr_v6;
		mutex_exit(&ire->ire_lock);
		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
	}

	/*
	 * If we are deleting an IRE_INTERFACE then we make sure we also
	 * delete any IRE_IF_CLONE that has been created from it.
	 * Those are always in ire_dep_children.
	 */
	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
		ire_dep_delete_if_clone(ire);

	/* Remove from parent dependencies and child */
	rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
	if (ire->ire_dep_parent != NULL) {
		ire_dep_remove(ire);
	}
	while (ire->ire_dep_children != NULL)
		ire_dep_remove(ire->ire_dep_children);
	rw_exit(&ipst->ips_ire_dep_lock);
}
590
591 /*
592 * When an IRE is added or deleted this routine is called to make sure
593 * any caching of IRE information is notified or updated.
594 *
595 * The flag argument indicates if the flush request is due to addition
596 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
597 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
598 */
599 void
ire_flush_cache_v6(ire_t * ire,int flag)600 ire_flush_cache_v6(ire_t *ire, int flag)
601 {
602 ip_stack_t *ipst = ire->ire_ipst;
603
604 /*
605 * IRE_IF_CLONE ire's don't provide any new information
606 * than the parent from which they are cloned, so don't
607 * perturb the generation numbers.
608 */
609 if (ire->ire_type & IRE_IF_CLONE)
610 return;
611
612 /*
613 * Ensure that an ire_add during a lookup serializes the updates of
614 * the generation numbers under ire_head_lock so that the lookup gets
615 * either the old ire and old generation number, or a new ire and new
616 * generation number.
617 */
618 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
619
620 /*
621 * If a route was just added, we need to notify everybody that
622 * has cached an IRE_NOROUTE since there might now be a better
623 * route for them.
624 */
625 if (flag == IRE_FLUSH_ADD) {
626 ire_increment_generation(ipst->ips_ire_reject_v6);
627 ire_increment_generation(ipst->ips_ire_blackhole_v6);
628 }
629
630 /* Adding a default can't otherwise provide a better route */
631 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
632 rw_exit(&ipst->ips_ip6_ire_head_lock);
633 return;
634 }
635
636 switch (flag) {
637 case IRE_FLUSH_DELETE:
638 case IRE_FLUSH_GWCHANGE:
639 /*
640 * Update ire_generation for all ire_dep_children chains
641 * starting with this IRE
642 */
643 ire_dep_incr_generation(ire);
644 break;
645 case IRE_FLUSH_ADD: {
646 in6_addr_t addr;
647 in6_addr_t mask;
648 ip_stack_t *ipst = ire->ire_ipst;
649 uint_t masklen;
650
651 /*
652 * Find an IRE which is a shorter match than the ire to be added
653 * For any such IRE (which we repeat) we update the
654 * ire_generation the same way as in the delete case.
655 */
656 addr = ire->ire_addr_v6;
657 mask = ire->ire_mask_v6;
658 masklen = ip_mask_to_plen_v6(&mask);
659
660 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
661 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
662 while (ire != NULL) {
663 /* We need to handle all in the same bucket */
664 irb_increment_generation(ire->ire_bucket);
665
666 mask = ire->ire_mask_v6;
667 ASSERT(masklen > ip_mask_to_plen_v6(&mask));
668 masklen = ip_mask_to_plen_v6(&mask);
669 ire_refrele(ire);
670 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
671 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
672 }
673 }
674 break;
675 }
676 rw_exit(&ipst->ips_ip6_ire_head_lock);
677 }
678
679 /*
680 * Matches the arguments passed with the values in the ire.
681 *
682 * Note: for match types that match using "ill" passed in, ill
683 * must be checked for non-NULL before calling this routine.
684 */
boolean_t
ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
    const ts_label_t *tsl, int match_flags)
{
	in6_addr_t masked_addr;
	in6_addr_t gw_addr_v6;
	ill_t *ire_ill = NULL, *dst_ill;
	ip_stack_t *ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);
	ASSERT(addr != NULL);
	ASSERT(mask != NULL);
	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
	    (ill != NULL && ill->ill_isv6));

	/*
	 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
	 * is in fact hidden, to ensure the caller gets the right one.
	 */
	if (ire->ire_testhidden) {
		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
			return (B_FALSE);
	}

	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
	    ire->ire_zoneid != ALL_ZONES) {
		/*
		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
		 * does not match that of ire_zoneid, a failure to
		 * match is reported at this point. Otherwise, since some IREs
		 * that are available in the global zone can be used in local
		 * zones, additional checks need to be performed:
		 *
		 * IRE_LOOPBACK
		 * entries should never be matched in this situation.
		 * Each zone has its own IRE_LOOPBACK.
		 *
		 * IRE_LOCAL
		 * We allow them for any zoneid. ire_route_recursive
		 * does additional checks when
		 * ip_restrict_interzone_loopback is set.
		 *
		 * If ill_usesrc_ifindex is set
		 * Then we check if the zone has a valid source address
		 * on the usesrc ill.
		 *
		 * If ire_ill is set, then check that the zone has an ipif
		 * on that ill.
		 *
		 * Outside of this function (in ire_round_robin) we check
		 * that any IRE_OFFLINK has a gateway that reachable from the
		 * zone when we have multiple choices (ECMP).
		 */
		if (match_flags & MATCH_IRE_ZONEONLY)
			return (B_FALSE);
		if (ire->ire_type & IRE_LOOPBACK)
			return (B_FALSE);

		if (ire->ire_type & IRE_LOCAL)
			goto matchit;

		/*
		 * The normal case of IRE_ONLINK has a matching zoneid.
		 * Here we handle the case when shared-IP zones have been
		 * configured with IP addresses on vniN. In that case it
		 * is ok for traffic from a zone to use IRE_ONLINK routes
		 * if the ill has a usesrc pointing at vniN
		 * Applies to IRE_INTERFACE.
		 */
		dst_ill = ire->ire_ill;
		if (ire->ire_type & IRE_ONLINK) {
			uint_t	ifindex;

			/*
			 * Note there is no IRE_INTERFACE on vniN thus
			 * can't do an IRE lookup for a matching route.
			 */
			ifindex = dst_ill->ill_usesrc_ifindex;
			if (ifindex == 0)
				return (B_FALSE);

			/*
			 * If there is a usable source address in the
			 * zone, then it's ok to return this IRE_INTERFACE
			 */
			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
			    zoneid, ipst)) {
				ip3dbg(("ire_match_args: no usrsrc for zone"
				    " dst_ill %p\n", (void *)dst_ill));
				return (B_FALSE);
			}
		}
		/*
		 * For example, with
		 * route add 11.0.0.0 gw1 -ifp bge0
		 * route add 11.0.0.0 gw2 -ifp bge1
		 * this code would differentiate based on
		 * where the sending zone has addresses.
		 * Only if the zone has an address on bge0 can it use the first
		 * route. It isn't clear if this behavior is documented
		 * anywhere.
		 */
		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
			ipif_t	*tipif;

			/*
			 * Look for an UP ipif on the route's ill that the
			 * zone can use; reject the IRE if none exists.
			 */
			mutex_enter(&dst_ill->ill_lock);
			for (tipif = dst_ill->ill_ipif;
			    tipif != NULL; tipif = tipif->ipif_next) {
				if (!IPIF_IS_CONDEMNED(tipif) &&
				    (tipif->ipif_flags & IPIF_UP) &&
				    (tipif->ipif_zoneid == zoneid ||
				    tipif->ipif_zoneid == ALL_ZONES))
					break;
			}
			mutex_exit(&dst_ill->ill_lock);
			if (tipif == NULL)
				return (B_FALSE);
		}
	}

matchit:
	ire_ill = ire->ire_ill;
	/* Snapshot the gateway address under ire_lock if we'll compare it. */
	if (match_flags & MATCH_IRE_GW) {
		mutex_enter(&ire->ire_lock);
		gw_addr_v6 = ire->ire_gateway_addr_v6;
		mutex_exit(&ire->ire_lock);
	}
	if (match_flags & MATCH_IRE_ILL) {

		/*
		 * If asked to match an ill, we *must* match
		 * on the ire_ill for ipmp test addresses, or
		 * any of the ill in the group for data addresses.
		 * If we don't, we may as well fail.
		 * However, we need an exception for IRE_LOCALs to ensure
		 * we loopback packets even sent to test addresses on different
		 * interfaces in the group.
		 */
		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
		    !(ire->ire_type & IRE_LOCAL)) {
			if (ire->ire_ill != ill)
				return (B_FALSE);
		} else {
			match_flags &= ~MATCH_IRE_TESTHIDDEN;
			/*
			 * We know that ill is not NULL, but ire_ill could be
			 * NULL
			 */
			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
				return (B_FALSE);
		}
	}
	if (match_flags & MATCH_IRE_SRC_ILL) {
		if (ire_ill == NULL)
			return (B_FALSE);
		if (!IS_ON_SAME_LAN(ill, ire_ill)) {
			/* Fall back to matching via the usesrc linkage. */
			if (ire_ill->ill_usesrc_ifindex == 0 ||
			    (ire_ill->ill_usesrc_ifindex !=
			    ill->ill_phyint->phyint_ifindex))
				return (B_FALSE);
		}
	}

	/* No ire_addr_v6 bits set past the mask */
	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
	    ire->ire_addr_v6));
	V6_MASK_COPY(*addr, *mask, masked_addr);
	/*
	 * Final combined predicate: address-under-mask must match, and
	 * each optional criterion (gateway, type, testhidden, exact mask,
	 * security attributes) only applies when its match_flags bit is set.
	 */
	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
	    ((!(match_flags & MATCH_IRE_GW)) ||
	    ((!(match_flags & MATCH_IRE_DIRECT)) ||
	    !(ire->ire_flags & RTF_INDIRECT)) &&
	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
	    ((!(match_flags & MATCH_IRE_MASK)) ||
	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
	    (!is_system_labeled()) ||
	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
		/* We found the matched IRE */
		return (B_TRUE);
	}
	return (B_FALSE);
}
871
872 /*
873 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
874 * gateway address. If ill is non-NULL we also match on it.
875 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
876 */
877 boolean_t
ire_gateway_ok_zone_v6(const in6_addr_t * gateway,zoneid_t zoneid,ill_t * ill,const ts_label_t * tsl,ip_stack_t * ipst,boolean_t lock_held)878 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
879 const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
880 {
881 ire_t *ire;
882 uint_t match_flags;
883
884 if (lock_held)
885 ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
886 else
887 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
888
889 match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
890 if (ill != NULL)
891 match_flags |= MATCH_IRE_ILL;
892
893 ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
894 &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
895 ipst);
896
897 if (!lock_held)
898 rw_exit(&ipst->ips_ip6_ire_head_lock);
899 if (ire != NULL) {
900 ire_refrele(ire);
901 return (B_TRUE);
902 } else {
903 return (B_FALSE);
904 }
905 }
906
907 /*
908 * Lookup a route in forwarding table.
909 * specific lookup is indicated by passing the
910 * required parameters and indicating the
911 * match required in flag field.
912 *
913 * Supports link-local addresses by following the ipif/ill when recursing.
914 */
915 ire_t *
ire_ftable_lookup_v6(const in6_addr_t * addr,const in6_addr_t * mask,const in6_addr_t * gateway,int type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,int flags,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)916 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
917 const in6_addr_t *gateway, int type, const ill_t *ill,
918 zoneid_t zoneid, const ts_label_t *tsl, int flags,
919 uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
920 {
921 ire_t *ire = NULL;
922
923 ASSERT(addr != NULL);
924 ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
925 ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
926 ASSERT(ill == NULL || ill->ill_isv6);
927
928 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
929
930 /*
931 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
932 * or MATCH_IRE_SRC_ILL is set.
933 */
934 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
935 return (NULL);
936
937 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
938 ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
939 tsl, flags, ipst);
940 if (ire == NULL) {
941 rw_exit(&ipst->ips_ip6_ire_head_lock);
942 return (NULL);
943 }
944
945 /*
946 * round-robin only if we have more than one route in the bucket.
947 * ips_ip_ecmp_behavior controls when we do ECMP
948 * 2: always
949 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
950 * 0: never
951 *
952 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
953 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
954 * and the IRE_INTERFACESs are likely to be shorter matches.
955 */
956 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
957 if (ipst->ips_ip_ecmp_behavior == 2 ||
958 (ipst->ips_ip_ecmp_behavior == 1 &&
959 IS_DEFAULT_ROUTE_V6(ire))) {
960 ire_t *next_ire;
961 ire_ftable_args_t margs;
962
963 bzero(&margs, sizeof (margs));
964 margs.ift_addr_v6 = *addr;
965 if (mask != NULL)
966 margs.ift_mask_v6 = *mask;
967 if (gateway != NULL)
968 margs.ift_gateway_v6 = *gateway;
969 margs.ift_type = type;
970 margs.ift_ill = ill;
971 margs.ift_zoneid = zoneid;
972 margs.ift_tsl = tsl;
973 margs.ift_flags = flags;
974
975 next_ire = ire_round_robin(ire->ire_bucket, &margs,
976 xmit_hint, ire, ipst);
977 if (next_ire == NULL) {
978 /* keep ire if next_ire is null */
979 goto done;
980 }
981 ire_refrele(ire);
982 ire = next_ire;
983 }
984 }
985
986 done:
987 /* Return generation before dropping lock */
988 if (generationp != NULL)
989 *generationp = ire->ire_generation;
990
991 rw_exit(&ipst->ips_ip6_ire_head_lock);
992
993 /*
994 * For shared-IP zones we need additional checks to what was
995 * done in ire_match_args to make sure IRE_LOCALs are handled.
996 *
997 * When ip_restrict_interzone_loopback is set, then
998 * we ensure that IRE_LOCAL are only used for loopback
999 * between zones when the logical "Ethernet" would
 * have looped them back. That is, if in the absence of
 * the IRE_LOCAL we would have sent the packet out the
1002 * same ill.
1003 */
1004 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
1005 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
1006 ipst->ips_ip_restrict_interzone_loopback) {
1007 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
1008 ASSERT(ire != NULL);
1009 }
1010
1011 return (ire);
1012 }
1013
1014 /*
1015 * Look up a single ire. The caller holds either the read or write lock.
1016 */
ire_t *
ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, int flags,
    ip_stack_t *ipst)
{
	irb_t *irb_ptr;
	ire_t *ire = NULL;
	int i;

	/* Caller must hold ips_ip6_ire_head_lock (reader or writer). */
	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));

	/*
	 * If the mask is known, the lookup
	 * is simple, if the mask is not known
	 * we need to search.
	 */
	if (flags & MATCH_IRE_MASK) {
		uint_t masklen;

		/*
		 * Known mask: a single hash bucket at the mask's prefix
		 * length can hold any matching entry.
		 */
		masklen = ip_mask_to_plen_v6(mask);
		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
			return (NULL);
		}
		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
		    ipst->ips_ip6_ftable_hash_size)]);
		rw_enter(&irb_ptr->irb_lock, RW_READER);
		for (ire = irb_ptr->irb_ire; ire != NULL;
		    ire = ire->ire_next) {
			/* Skip entries that are being deleted */
			if (IRE_IS_CONDEMNED(ire))
				continue;
			if (ire_match_args_v6(ire, addr, mask, gateway, type,
			    ill, zoneid, tsl, flags))
				goto found_ire;
		}
		rw_exit(&irb_ptr->irb_lock);
	} else {
		uint_t masklen;

		/*
		 * In this case we don't know the mask, we need to
		 * search the table assuming different mask sizes.
		 */
		if (flags & MATCH_IRE_SHORTERMASK) {
			masklen = ip_mask_to_plen_v6(mask);
			if (masklen == 0) {
				/* Nothing shorter than zero */
				return (NULL);
			}
			/* Start strictly below the given prefix length */
			masklen--;
		} else {
			masklen = IP6_MASK_TABLE_SIZE - 1;
		}

		/* Walk from the longest prefix down to /0 */
		for (i = masklen; i >= 0; i--) {
			in6_addr_t tmpmask;

			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
				continue;
			(void) ip_plen_to_mask_v6(i, &tmpmask);
			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
			    ipst->ips_ip6_ftable_hash_size)];
			rw_enter(&irb_ptr->irb_lock, RW_READER);
			for (ire = irb_ptr->irb_ire; ire != NULL;
			    ire = ire->ire_next) {
				if (IRE_IS_CONDEMNED(ire))
					continue;
				/* Match against the entry's own mask */
				if (ire_match_args_v6(ire, addr,
				    &ire->ire_mask_v6, gateway, type, ill,
				    zoneid, tsl, flags))
					goto found_ire;
			}
			rw_exit(&irb_ptr->irb_lock);
		}
	}
	ASSERT(ire == NULL);
	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
	return (NULL);

found_ire:
	/* Take a reference before dropping the bucket lock */
	ire_refhold(ire);
	rw_exit(&irb_ptr->irb_lock);
	return (ire);
}
1103
1104
1105 /*
1106 * This function is called by
1107 * ip_input/ire_route_recursive when doing a route lookup on only the
1108 * destination address.
1109 *
1110 * The optimizations of this function over ire_ftable_lookup are:
1111 * o removing unnecessary flag matching
1112 * o doing longest prefix match instead of overloading it further
1113 * with the unnecessary "best_prefix_match"
1114 *
1115 * If no route is found we return IRE_NOROUTE.
1116 */
1117 ire_t *
ire_ftable_lookup_simple_v6(const in6_addr_t * addr,uint32_t xmit_hint,ip_stack_t * ipst,uint_t * generationp)1118 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
1119 ip_stack_t *ipst, uint_t *generationp)
1120 {
1121 ire_t *ire;
1122
1123 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
1124 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
1125 if (ire == NULL) {
1126 ire = ire_reject(ipst, B_TRUE);
1127 if (generationp != NULL)
1128 *generationp = IRE_GENERATION_VERIFY;
1129 }
1130 /* ftable_lookup did round robin */
1131 return (ire);
1132 }
1133
1134 ire_t *
ip_select_route_v6(const in6_addr_t * dst,const in6_addr_t src,ip_xmit_attr_t * ixa,uint_t * generationp,in6_addr_t * setsrcp,int * errorp,boolean_t * multirtp)1135 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src,
1136 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1137 int *errorp, boolean_t *multirtp)
1138 {
1139 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1140
1141 return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp,
1142 multirtp));
1143 }
1144
1145 /*
1146 * Recursively look for a route to the destination. Can also match on
1147 * the zoneid, ill, and label. Used for the data paths. See also
1148 * ire_route_recursive_dstonly.
1149 *
1150 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1151 * create an IRE_IF_CLONE. This is used on the receive side when we are not
1152 * forwarding.
1153 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1154 * resolve the gateway.
1155 *
1156 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1157 * instead.
1158 *
1159 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1160 * is an error.
1161 * Allow at most one RTF_INDIRECT.
1162 */
ire_t *
ire_route_recursive_impl_v6(ire_t *ire,
    const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
    in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int i, j;
	in6_addr_t v6nexthop = *nexthop;
	ire_t *ires[MAX_IRE_RECURSION];	/* dependency chain; [0] is the top */
	uint_t generation;
	uint_t generations[MAX_IRE_RECURSION];
	boolean_t need_refrele = B_FALSE;
	boolean_t invalidate = B_FALSE;
	ill_t *ill = NULL;
	uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK);

	if (setsrcp != NULL)
		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
			    (ill != NULL ? ill : ill_arg), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}

		if (ire == NULL) {
			/*
			 * With IRR_INCOMPLETE we return the partially
			 * resolved head of the chain instead of failing.
			 */
			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
				ire = ires[0];
				ire_refhold(ire);
			} else {
				ire = ire_reject(ipst, B_TRUE);
			}
			goto error;
		}

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
		/*
		 * Verify that the IRE_IF_CLONE has a consistent generation
		 * number.
		 */
		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
			/* Stale clone; drop it and look again */
			ire_refrele(ire);
			ire = NULL;
			continue;
		}

		/*
		 * Don't allow anything unusual past the first iteration.
		 * After the first lookup, we should no longer look for
		 * (IRE_LOCAL|IRE_LOOPBACK) or RTF_INDIRECT routes.
		 *
		 * In addition, after we have found a direct IRE_OFFLINK,
		 * we should only look for interface or clone routes.
		 */
		match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
		if ((ire->ire_type & IRE_OFFLINK) &&
		    !(ire->ire_flags & RTF_INDIRECT)) {
			ire_type = IRE_IF_ALL;
		} else {
			if (!(match_args & MATCH_IRE_TYPE))
				ire_type = (IRE_OFFLINK|IRE_ONLINK);
			ire_type &= ~maskoff; /* no more LOCAL, LOOPBACK */
		}
		match_args |= MATCH_IRE_TYPE;
		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
			    &ire->ire_setsrc_addr_v6));
			*setsrcp = ire->ire_setsrc_addr_v6;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 128 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			ire_t *clone;

			ASSERT(ire->ire_masklen != IPV6_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_TRUE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 *
		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
		 * ire_ill, so we set ill to the ire_ill
		 */
		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
		v6nexthop = ire->ire_gateway_addr_v6;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		ire = NULL;
	}
	ASSERT(ire == NULL);
	/* Recursion limit reached without resolving; fail the lookup */
	ire = ire_reject(ipst, B_TRUE);

error:
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	ASSERT(ire == NULL);
	if (need_refrele)
		ill_refrele(ill);

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_blackhole(ipst, B_TRUE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}
1451
1452 ire_t *
ire_route_recursive_v6(const in6_addr_t * nexthop,uint_t ire_type,const ill_t * ill,zoneid_t zoneid,const ts_label_t * tsl,uint_t match_args,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst,in6_addr_t * setsrcp,tsol_ire_gw_secattr_t ** gwattrp,uint_t * generationp)1453 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
1454 const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1455 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1456 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1457 {
1458 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
1459 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1460 gwattrp, generationp));
1461 }
1462
1463 /*
1464 * Recursively look for a route to the destination.
1465 * We only handle a destination match here, yet we have the same arguments
1466 * as the full match to allow function pointers to select between the two.
1467 *
1468 * Note that this function never returns NULL. It returns an IRE_NOROUTE
1469 * instead.
1470 *
1471 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1472 * is an error.
1473 * Allow at most one RTF_INDIRECT.
1474 */
1475 ire_t *
ire_route_recursive_dstonly_v6(const in6_addr_t * nexthop,uint_t irr_flags,uint32_t xmit_hint,ip_stack_t * ipst)1476 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags,
1477 uint32_t xmit_hint, ip_stack_t *ipst)
1478 {
1479 ire_t *ire;
1480 ire_t *ire1;
1481 uint_t generation;
1482
1483 /* ire_ftable_lookup handles round-robin/ECMP */
1484 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
1485 &generation);
1486 ASSERT(ire != NULL);
1487
1488 /*
1489 * If the IRE has a current cached parent we know that the whole
1490 * parent chain is current, hence we don't need to discover and
1491 * build any dependencies by doing a recursive lookup.
1492 */
1493 mutex_enter(&ire->ire_lock);
1494 if (ire->ire_dep_parent != NULL) {
1495 if (ire->ire_dep_parent->ire_generation ==
1496 ire->ire_dep_parent_generation) {
1497 mutex_exit(&ire->ire_lock);
1498 return (ire);
1499 }
1500 mutex_exit(&ire->ire_lock);
1501 } else {
1502 mutex_exit(&ire->ire_lock);
1503 /*
1504 * If this type should have an ire_nce_cache (even if it
1505 * doesn't yet have one) then we are done. Includes
1506 * IRE_INTERFACE with a full 128 bit mask.
1507 */
1508 if (ire->ire_nce_capable)
1509 return (ire);
1510 }
1511
1512 /*
1513 * Fallback to loop in the normal code starting with the ire
1514 * we found. Normally this would return the same ire.
1515 */
1516 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
1517 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1518 &generation);
1519 ire_refrele(ire);
1520 return (ire1);
1521 }
1522