xref: /titanic_51/usr/src/uts/common/inet/ip/ip6_ire.c (revision f4ce81cfdad23bacfdb147be77d8d5fbe7673847)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 1990 Mentat Inc.
27  */
28 
29 /*
30  * This file contains routines that manipulate Internet Routing Entries (IREs).
31  */
32 #include <sys/types.h>
33 #include <sys/stream.h>
34 #include <sys/stropts.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 
38 #include <sys/systm.h>
39 #include <sys/param.h>
40 #include <sys/socket.h>
41 #include <net/if.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
44 #include <net/if_dl.h>
45 #include <netinet/ip6.h>
46 #include <netinet/icmp6.h>
47 
48 #include <inet/common.h>
49 #include <inet/mi.h>
50 #include <inet/ip.h>
51 #include <inet/ip6.h>
52 #include <inet/ip_ndp.h>
53 #include <inet/ip_if.h>
54 #include <inet/ip_ire.h>
55 #include <inet/ipclassifier.h>
56 #include <inet/nd.h>
57 #include <sys/kmem.h>
58 #include <sys/zone.h>
59 
60 #include <sys/tsol/label.h>
61 #include <sys/tsol/tnet.h>
62 
63 #define	IS_DEFAULT_ROUTE_V6(ire)	\
64 	(((ire)->ire_type & IRE_DEFAULT) || \
65 	    (((ire)->ire_type & IRE_INTERFACE) && \
66 	    (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
67 
68 static	ire_t	ire_null;
69 
70 static ire_t *
71 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
72     const in6_addr_t *gateway, int type, const ill_t *ill,
73     zoneid_t zoneid, const ts_label_t *tsl, int flags,
74     ip_stack_t *ipst);
75 
76 /*
77  * Initialize the IPv6-specific part of the ire and call
78  * ire_init_common to finish it.
79  * Returns zero or errno.
80  */
81 int
82 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
83     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
84     zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
85 {
86 	int error;
87 
88 	/*
89 	 * Reject IRE security attribute creation/initialization
90 	 * if system is not running in Trusted mode.
91 	 */
92 	if (gc != NULL && !is_system_labeled())
93 		return (EINVAL);
94 
95 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
96 	if (v6addr != NULL)
97 		ire->ire_addr_v6 = *v6addr;
98 	if (v6gateway != NULL)
99 		ire->ire_gateway_addr_v6 = *v6gateway;
100 
101 	/* Make sure we don't have stray values in some fields */
102 	switch (type) {
103 	case IRE_LOOPBACK:
104 		ire->ire_gateway_addr_v6 = ire->ire_addr_v6;
105 		/* FALLTHRU */
106 	case IRE_HOST:
107 	case IRE_LOCAL:
108 	case IRE_IF_CLONE:
109 		ire->ire_mask_v6 = ipv6_all_ones;
110 		ire->ire_masklen = IPV6_ABITS;
111 		break;
112 	case IRE_PREFIX:
113 	case IRE_DEFAULT:
114 	case IRE_IF_RESOLVER:
115 	case IRE_IF_NORESOLVER:
116 		if (v6mask != NULL) {
117 			ire->ire_mask_v6 = *v6mask;
118 			ire->ire_masklen =
119 			    ip_mask_to_plen_v6(&ire->ire_mask_v6);
120 		}
121 		break;
122 	case IRE_MULTICAST:
123 	case IRE_NOROUTE:
124 		ASSERT(v6mask == NULL);
125 		break;
126 	default:
127 		ASSERT(0);
128 		return (EINVAL);
129 	}
130 
131 	error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
132 	    gc, ipst);
133 	if (error != 0)
134 		return (error);
135 
136 	/* Determine which function pointers to use */
137 	ire->ire_postfragfn = ip_xmit;		/* Common case */
138 
139 	switch (ire->ire_type) {
140 	case IRE_LOCAL:
141 		ire->ire_sendfn = ire_send_local_v6;
142 		ire->ire_recvfn = ire_recv_local_v6;
143 #ifdef SO_VRRP
144 		ASSERT(ire->ire_ill != NULL);
145 		if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) {
146 			ire->ire_noaccept = B_TRUE;
147 			ire->ire_recvfn = ire_recv_noaccept_v6;
148 		}
149 #endif
150 		break;
151 	case IRE_LOOPBACK:
152 		ire->ire_sendfn = ire_send_local_v6;
153 		ire->ire_recvfn = ire_recv_loopback_v6;
154 		break;
155 	case IRE_MULTICAST:
156 		ire->ire_postfragfn = ip_postfrag_loopcheck;
157 		ire->ire_sendfn = ire_send_multicast_v6;
158 		ire->ire_recvfn = ire_recv_multicast_v6;
159 		break;
160 	default:
161 		/*
162 		 * For IRE_IF_ALL and IRE_OFFLINK we forward received
163 		 * packets by default.
164 		 */
165 		ire->ire_sendfn = ire_send_wire_v6;
166 		ire->ire_recvfn = ire_recv_forward_v6;
167 		break;
168 	}
169 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
170 		ire->ire_sendfn = ire_send_noroute_v6;
171 		ire->ire_recvfn = ire_recv_noroute_v6;
172 	} else if (ire->ire_flags & RTF_MULTIRT) {
173 		ire->ire_postfragfn = ip_postfrag_multirt_v6;
174 		ire->ire_sendfn = ire_send_multirt_v6;
175 		ire->ire_recvfn = ire_recv_multirt_v6;
176 	}
177 	ire->ire_nce_capable = ire_determine_nce_capable(ire);
178 	return (0);
179 }
180 
181 /*
182  * ire_create_v6 is called to allocate and initialize a new IRE.
183  *
184  * NOTE : This is called as writer sometimes though not required
185  * by this function.
186  */
187 /* ARGSUSED */
188 ire_t *
189 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
190     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
191     uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
192 {
193 	ire_t	*ire;
194 	int	error;
195 
196 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
197 
198 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
199 	if (ire == NULL) {
200 		DTRACE_PROBE(kmem__cache__alloc);
201 		return (NULL);
202 	}
203 	*ire = ire_null;
204 
205 	error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
206 	    type, ill, zoneid, flags, gc, ipst);
207 
208 	if (error != 0) {
209 		DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
210 		kmem_cache_free(ire_cache, ire);
211 		return (NULL);
212 	}
213 	return (ire);
214 }
215 
216 /*
217  * Find the ill matching a multicast group.
218  * Allows different routes for multicast addresses
219  * in the unicast routing table (akin to ff00::/8 but could be more specific)
220  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
221  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
222  * specify the interface to join on.
223  *
224  * Supports link-local addresses by using ire_route_recursive which follows
225  * the ill when recursing.
226  *
227  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
228  * and the MULTIRT property can be different for different groups, we
229  * extract RTF_MULTIRT from the special unicast route added for a group
230  * with CGTP and pass that back in the multirtp argument.
231  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
232  * We have a setsrcp argument for the same reason.
233  */
234 ill_t *
235 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
236     ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
237 {
238 	ire_t	*ire;
239 	ill_t	*ill;
240 
241 	ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
242 	    MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
243 	ASSERT(ire != NULL);
244 
245 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
246 		ire_refrele(ire);
247 		return (NULL);
248 	}
249 
250 	if (multirtp != NULL)
251 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
252 
253 	ill = ire_nexthop_ill(ire);
254 	ire_refrele(ire);
255 	return (ill);
256 }
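/*
 * Illustrative caller sketch (assumed, not taken from this file): when
 * sending to a multicast group with no IPV6_MULTICAST_IF set, the transmit
 * path can resolve the outgoing ill and pick up the CGTP and RTF_SETSRC
 * properties in one call:
 *
 *	boolean_t	multirt = B_FALSE;
 *	in6_addr_t	setsrc = ipv6_all_zeros;
 *	ill_t		*ill;
 *
 *	ill = ire_lookup_multi_ill_v6(&group, zoneid, ipst, &multirt,
 *	    &setsrc);
 *	if (ill == NULL)
 *		return (EHOSTUNREACH);
 *	(use multirt to pick ixa_postfragfn and setsrc as the source
 *	address, then ill_refrele(ill) when done)
 */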
257 
258 /*
259  * This function takes a mask and returns number of bits set in the
260  * mask (the represented prefix length).  Assumes a contiguous mask.
261  */
262 int
263 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
264 {
265 	int		bits;
266 	int		plen = IPV6_ABITS;
267 	int		i;
268 
269 	for (i = 3; i >= 0; i--) {
270 		if (v6mask->s6_addr32[i] == 0) {
271 			plen -= 32;
272 			continue;
273 		}
274 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
275 		if (bits == 0)
276 			break;
277 		plen -= bits;
278 	}
279 
280 	return (plen);
281 }
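/*
 * For example, the /64 mask ffff:ffff:ffff:ffff:: has s6_addr32[3] and
 * s6_addr32[2] equal to zero (each subtracting 32 from plen), while
 * ffs(0xffffffff) - 1 is zero for s6_addr32[1], so the loop stops and
 * the function returns 128 - 32 - 32 = 64.
 */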
282 
283 /*
284  * Convert a prefix length to the mask for that prefix.
285  * Returns the argument bitmask.
286  */
287 in6_addr_t *
288 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
289 {
290 	uint32_t *ptr;
291 
292 	if (plen < 0 || plen > IPV6_ABITS)
293 		return (NULL);
294 	*bitmask = ipv6_all_zeros;
295 	if (plen == 0)
296 		return (bitmask);
297 
298 	ptr = (uint32_t *)bitmask;
299 	while (plen > 32) {
300 		*ptr++ = 0xffffffffU;
301 		plen -= 32;
302 	}
303 	*ptr = htonl(0xffffffffU << (32 - plen));
304 	return (bitmask);
305 }
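/*
 * The two conversions above are inverses for contiguous masks; a minimal
 * sanity-check sketch (assumed, not from this file):
 *
 *	in6_addr_t	mask;
 *
 *	(void) ip_plen_to_mask_v6(64, &mask);
 *	ASSERT(ip_mask_to_plen_v6(&mask) == 64);
 */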
306 
307 /*
308  * Add a fully initialized IPv6 IRE to the forwarding table.
309  * This returns NULL on failure, or a held IRE on success.
310  * Normally the returned IRE is the same as the argument. But a different
311  * IRE will be returned if the added IRE is deemed identical to an existing
312  * one. In that case ire_identical_ref will be increased.
313  * The caller always needs to do an ire_refrele() on the returned IRE.
314  */
315 ire_t *
316 ire_add_v6(ire_t *ire)
317 {
318 	ire_t	*ire1;
319 	int	mask_table_index;
320 	irb_t	*irb_ptr;
321 	ire_t	**irep;
322 	int	match_flags;
323 	int	error;
324 	ip_stack_t	*ipst = ire->ire_ipst;
325 
326 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
327 
328 	/* Make sure the address is properly masked. */
329 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
330 
331 	mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
332 	if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
333 		irb_t *ptr;
334 		int i;
335 
336 		ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
337 		    sizeof (irb_t)));
338 		if (ptr == NULL) {
339 			ire_delete(ire);
340 			return (NULL);
341 		}
342 		for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
343 			rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
344 		}
345 		mutex_enter(&ipst->ips_ire_ft_init_lock);
346 		if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
347 		    NULL) {
348 			ipst->ips_ip_forwarding_table_v6[mask_table_index] =
349 			    ptr;
350 			mutex_exit(&ipst->ips_ire_ft_init_lock);
351 		} else {
352 			/*
353 			 * Some other thread won the race in
354 			 * initializing the forwarding table at the
355 			 * same index.
356 			 */
357 			mutex_exit(&ipst->ips_ire_ft_init_lock);
358 			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
359 				rw_destroy(&ptr[i].irb_lock);
360 			}
361 			mi_free(ptr);
362 		}
363 	}
364 	irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
365 	    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
366 	    ipst->ips_ip6_ftable_hash_size)]);
367 
368 	match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
369 	if (ire->ire_ill != NULL)
370 		match_flags |= MATCH_IRE_ILL;
371 	/*
372 	 * Start the atomic add of the ire. Grab the bucket lock and the
373 	 * ill lock. Check for condemned.
374 	 */
375 	error = ire_atomic_start(irb_ptr, ire);
376 	if (error != 0) {
377 		ire_delete(ire);
378 		return (NULL);
379 	}
380 
381 	/*
382 	 * If we are creating a hidden IRE, make sure we search for
383 	 * hidden IREs when searching for duplicates below.
384 	 * Otherwise, we might find an IRE on some other interface
385 	 * that's not marked hidden.
386 	 */
387 	if (ire->ire_testhidden)
388 		match_flags |= MATCH_IRE_TESTHIDDEN;
389 
390 	/*
391 	 * Atomically check for duplicate and insert in the table.
392 	 */
393 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
394 		if (IRE_IS_CONDEMNED(ire1))
395 			continue;
396 		/*
397 		 * Here we need an exact match on zoneid, i.e.,
398 		 * ire_match_args doesn't fit.
399 		 */
400 		if (ire1->ire_zoneid != ire->ire_zoneid)
401 			continue;
402 
403 		if (ire1->ire_type != ire->ire_type)
404 			continue;
405 
406 		/*
407 		 * Note: We do not allow multiple routes that differ only
408 		 * in the gateway security attributes; such routes are
409 		 * considered duplicates.
410 		 * To change that we explicitly have to treat them as
411 		 * different here.
412 		 */
413 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
414 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
415 		    ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
416 		    match_flags)) {
417 			/*
418 			 * Return the old ire after doing a REFHOLD.
419 			 * As most of the callers continue to use the IRE
420 			 * after adding, we return a held ire. This will
421 			 * avoid a lookup in the caller again. If the callers
422 			 * don't want to use it, they need to do a REFRELE.
423 			 */
424 			ip1dbg(("found dup ire existing %p new %p",
425 			    (void *)ire1, (void *)ire));
426 			ire_refhold(ire1);
427 			atomic_add_32(&ire1->ire_identical_ref, 1);
428 			ire_atomic_end(irb_ptr, ire);
429 			ire_delete(ire);
430 			return (ire1);
431 		}
432 	}
433 
434 	/*
435 	 * Normally we do head insertion since most things do not care about
436 	 * the order of the IREs in the bucket.
437 	 * However, due to shared-IP zones (and restrict_interzone_loopback)
438 	 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
439 	 * address. For that reason we do tail insertion for IRE_IF_CLONE.
440 	 */
441 	irep = (ire_t **)irb_ptr;
442 	if (ire->ire_type & IRE_IF_CLONE) {
443 		while ((ire1 = *irep) != NULL)
444 			irep = &ire1->ire_next;
445 	}
446 	/* Insert at *irep */
447 	ire1 = *irep;
448 	if (ire1 != NULL)
449 		ire1->ire_ptpn = &ire->ire_next;
450 	ire->ire_next = ire1;
451 	/* Link the new one in. */
452 	ire->ire_ptpn = irep;
453 	/*
454 	 * ire_walk routines de-reference ire_next without holding
455 	 * a lock. Before we point to the new ire, we want to make
456 	 * sure the store that sets the ire_next of the new ire
457 	 * reaches global visibility, so that ire_walk routines
458 	 * don't see a truncated list of ires, i.e., if the ire_next
459 	 * of the new ire gets set after we do "*irep = ire" due
460 	 * to re-ordering, the ire_walk thread will see a NULL
461 	 * once it accesses the ire_next of the new ire.
462 	 * membar_producer() makes sure that the following store
463 	 * happens *after* all of the above stores.
464 	 */
465 	membar_producer();
466 	*irep = ire;
467 	ire->ire_bucket = irb_ptr;
468 	/*
469 	 * We return a bumped up IRE above. Keep it symmetrical
470 	 * so that the callers will always have to release. This
471 	 * helps the callers of this function because they continue
472 	 * to use the IRE after adding and hence they don't have to
473 	 * lookup again after we return the IRE.
474 	 *
475 	 * NOTE : We don't have to use atomics as this is appearing
476 	 * in the list for the first time and no one else can bump
477 	 * up the reference count on this yet.
478 	 */
479 	ire_refhold_locked(ire);
480 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
481 	irb_ptr->irb_ire_cnt++;
482 
483 	if (ire->ire_ill != NULL) {
484 		DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
485 		    (char *), "ire", (void *), ire);
486 		ire->ire_ill->ill_ire_cnt++;
487 		ASSERT(ire->ire_ill->ill_ire_cnt != 0);	/* Wraparound */
488 	}
489 	ire_atomic_end(irb_ptr, ire);
490 
491 	/* Make any caching of the IREs be notified or updated */
492 	ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
493 
494 	return (ire);
495 }
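/*
 * Caller contract sketch for ire_add_v6 (assumed example, not from this
 * file): the IRE passed in must not be referenced again since it may have
 * been freed as a duplicate; only the returned, held IRE may be used and
 * it must eventually be released:
 *
 *	ire = ire_create_v6(&addr, &mask, &gw, IRE_PREFIX, ill, zoneid,
 *	    flags, NULL, ipst);
 *	if (ire == NULL)
 *		return (ENOMEM);
 *	ire = ire_add_v6(ire);
 *	if (ire == NULL)
 *		return (ENOMEM);
 *	(use ire, then ire_refrele(ire) when done)
 */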
496 
497 /*
498  * Search for all HOST REDIRECT routes that are
499  * pointing at the specified gateway and
500  * delete them. This routine is called only
501  * when a default gateway is going away.
502  */
503 static void
504 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
505 {
506 	irb_t *irb_ptr;
507 	irb_t *irb;
508 	ire_t *ire;
509 	in6_addr_t gw_addr_v6;
510 	int i;
511 
512 	/* get the hash table for HOST routes */
513 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
514 	if (irb_ptr == NULL)
515 		return;
516 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
517 		irb = &irb_ptr[i];
518 		irb_refhold(irb);
519 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
520 			if (!(ire->ire_flags & RTF_DYNAMIC))
521 				continue;
522 			mutex_enter(&ire->ire_lock);
523 			gw_addr_v6 = ire->ire_gateway_addr_v6;
524 			mutex_exit(&ire->ire_lock);
525 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
526 				ire_delete(ire);
527 		}
528 		irb_refrele(irb);
529 	}
530 }
531 
532 /*
533  * Delete the specified IRE.
534  * All calls should use ire_delete().
535  * Sometimes called as writer though not required by this function.
536  *
537  * NOTE : This function is called only if the ire was added
538  * in the list.
539  */
540 void
541 ire_delete_v6(ire_t *ire)
542 {
543 	in6_addr_t gw_addr_v6;
544 	ip_stack_t	*ipst = ire->ire_ipst;
545 
546 	/*
547 	 * Make sure the ire_generation increments done by ire_flush_cache happen
548 	 * after any lookup/reader has read ire_generation.
549 	 * Since the rw_enter makes us wait until any lookup/reader has
550 	 * completed we can exit the lock immediately.
551 	 */
552 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
553 	rw_exit(&ipst->ips_ip6_ire_head_lock);
554 
555 	ASSERT(ire->ire_refcnt >= 1);
556 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
557 
558 	ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
559 
560 	if (ire->ire_type == IRE_DEFAULT) {
561 		/*
562 		 * when a default gateway is going away
563 		 * delete all the host redirects pointing at that
564 		 * gateway.
565 		 */
566 		mutex_enter(&ire->ire_lock);
567 		gw_addr_v6 = ire->ire_gateway_addr_v6;
568 		mutex_exit(&ire->ire_lock);
569 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
570 	}
571 
572 	/*
573 	 * If we are deleting an IRE_INTERFACE then we make sure we also
574 	 * delete any IRE_IF_CLONE that has been created from it.
575 	 * Those are always in ire_dep_children.
576 	 */
577 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
578 		ire_dep_delete_if_clone(ire);
579 
580 	/* Remove from parent dependencies and child */
581 	rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
582 	if (ire->ire_dep_parent != NULL) {
583 		ire_dep_remove(ire);
584 	}
585 	while (ire->ire_dep_children != NULL)
586 		ire_dep_remove(ire->ire_dep_children);
587 	rw_exit(&ipst->ips_ire_dep_lock);
588 }
589 
590 /*
591  * When an IRE is added or deleted this routine is called to make sure
592  * any caching of IRE information is notified or updated.
593  *
594  * The flag argument indicates if the flush request is due to addition
595  * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
596  * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
597  */
598 void
599 ire_flush_cache_v6(ire_t *ire, int flag)
600 {
601 	ip_stack_t *ipst = ire->ire_ipst;
602 
603 	/*
604 	 * IRE_IF_CLONE ire's don't provide any more information
605 	 * than the parent from which they are cloned, so don't
606 	 * perturb the generation numbers.
607 	 */
608 	if (ire->ire_type & IRE_IF_CLONE)
609 		return;
610 
611 	/*
612 	 * Ensure that an ire_add during a lookup serializes the updates of
613 	 * the generation numbers under ire_head_lock so that the lookup gets
614 	 * either the old ire and old generation number, or a new ire and new
615 	 * generation number.
616 	 */
617 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
618 
619 	/*
620 	 * If a route was just added, we need to notify everybody that
621 	 * has cached an IRE_NOROUTE since there might now be a better
622 	 * route for them.
623 	 */
624 	if (flag == IRE_FLUSH_ADD) {
625 		ire_increment_generation(ipst->ips_ire_reject_v6);
626 		ire_increment_generation(ipst->ips_ire_blackhole_v6);
627 	}
628 
629 	/* Adding a default can't otherwise provide a better route */
630 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
631 		rw_exit(&ipst->ips_ip6_ire_head_lock);
632 		return;
633 	}
634 
635 	switch (flag) {
636 	case IRE_FLUSH_DELETE:
637 	case IRE_FLUSH_GWCHANGE:
638 		/*
639 		 * Update ire_generation for all ire_dep_children chains
640 		 * starting with this IRE
641 		 */
642 		ire_dep_incr_generation(ire);
643 		break;
644 	case IRE_FLUSH_ADD: {
645 		in6_addr_t	addr;
646 		in6_addr_t	mask;
647 		ip_stack_t	*ipst = ire->ire_ipst;
648 		uint_t		masklen;
649 
650 		/*
651 		 * Find an IRE which is a shorter match than the ire to be added.
652 		 * For any such IRE (repeating the lookup to find them all) we update the
653 		 * ire_generation the same way as in the delete case.
654 		 */
655 		addr = ire->ire_addr_v6;
656 		mask = ire->ire_mask_v6;
657 		masklen = ip_mask_to_plen_v6(&mask);
658 
659 		ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
660 		    ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
661 		while (ire != NULL) {
662 			/* We need to handle all in the same bucket */
663 			irb_increment_generation(ire->ire_bucket);
664 
665 			mask = ire->ire_mask_v6;
666 			ASSERT(masklen > ip_mask_to_plen_v6(&mask));
667 			masklen = ip_mask_to_plen_v6(&mask);
668 			ire_refrele(ire);
669 			ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
670 			    NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
671 		}
672 		}
673 		break;
674 	}
675 	rw_exit(&ipst->ips_ip6_ire_head_lock);
676 }
677 
678 /*
679  * Matches the arguments passed with the values in the ire.
680  *
681  * Note: for match types that match using "ill" passed in, ill
682  * must be checked for non-NULL before calling this routine.
683  */
684 boolean_t
685 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
686     const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
687     const ts_label_t *tsl, int match_flags)
688 {
689 	in6_addr_t masked_addr;
690 	in6_addr_t gw_addr_v6;
691 	ill_t *ire_ill = NULL, *dst_ill;
692 	ip_stack_t *ipst = ire->ire_ipst;
693 
694 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
695 	ASSERT(addr != NULL);
696 	ASSERT(mask != NULL);
697 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
698 	ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
699 	    (ill != NULL && ill->ill_isv6));
700 
701 	/*
702 	 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
703 	 * is in fact hidden, to ensure the caller gets the right one.
704 	 */
705 	if (ire->ire_testhidden) {
706 		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
707 			return (B_FALSE);
708 	}
709 
710 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
711 	    ire->ire_zoneid != ALL_ZONES) {
712 		/*
713 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
714 		 * does not match that of ire_zoneid, a failure to
715 		 * match is reported at this point. Otherwise, since some IREs
716 		 * that are available in the global zone can be used in local
717 		 * zones, additional checks need to be performed:
718 		 *
719 		 * IRE_LOOPBACK
720 		 *	entries should never be matched in this situation.
721 		 *	Each zone has its own IRE_LOOPBACK.
722 		 *
723 		 * IRE_LOCAL
724 		 *	We allow them for any zoneid. ire_route_recursive
725 		 *	does additional checks when
726 		 *	ip_restrict_interzone_loopback is set.
727 		 *
728 		 * If ill_usesrc_ifindex is set
729 		 *	Then we check if the zone has a valid source address
730 		 *	on the usesrc ill.
731 		 *
732 		 * If ire_ill is set, then check that the zone has an ipif
733 		 *	on that ill.
734 		 *
735 		 * Outside of this function (in ire_round_robin) we check
736 		 * that any IRE_OFFLINK has a gateway that is reachable from the
737 		 * zone when we have multiple choices (ECMP).
738 		 */
739 		if (match_flags & MATCH_IRE_ZONEONLY)
740 			return (B_FALSE);
741 		if (ire->ire_type & IRE_LOOPBACK)
742 			return (B_FALSE);
743 
744 		if (ire->ire_type & IRE_LOCAL)
745 			goto matchit;
746 
747 		/*
748 		 * The normal case of IRE_ONLINK has a matching zoneid.
749 		 * Here we handle the case when shared-IP zones have been
750 		 * configured with IP addresses on vniN. In that case it
751 		 * is ok for traffic from a zone to use IRE_ONLINK routes
752 		 * if the ill has a usesrc pointing at vniN
753 		 * Applies to IRE_INTERFACE.
754 		 */
755 		dst_ill = ire->ire_ill;
756 		if (ire->ire_type & IRE_ONLINK) {
757 			uint_t	ifindex;
758 
759 			/*
760 			 * Note there is no IRE_INTERFACE on vniN thus
761 			 * can't do an IRE lookup for a matching route.
762 			 */
763 			ifindex = dst_ill->ill_usesrc_ifindex;
764 			if (ifindex == 0)
765 				return (B_FALSE);
766 
767 			/*
768 			 * If there is a usable source address in the
769 			 * zone, then it's ok to return this IRE_INTERFACE
770 			 */
771 			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
772 			    zoneid, ipst)) {
773 				ip3dbg(("ire_match_args: no usesrc for zone"
774 				    " dst_ill %p\n", (void *)dst_ill));
775 				return (B_FALSE);
776 			}
777 		}
778 		/*
779 		 * For example, with
780 		 * route add 11.0.0.0 gw1 -ifp bge0
781 		 * route add 11.0.0.0 gw2 -ifp bge1
782 		 * this code would differentiate based on
783 		 * where the sending zone has addresses.
784 		 * Only if the zone has an address on bge0 can it use the first
785 		 * route. It isn't clear if this behavior is documented
786 		 * anywhere.
787 		 */
788 		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
789 			ipif_t	*tipif;
790 
791 			mutex_enter(&dst_ill->ill_lock);
792 			for (tipif = dst_ill->ill_ipif;
793 			    tipif != NULL; tipif = tipif->ipif_next) {
794 				if (!IPIF_IS_CONDEMNED(tipif) &&
795 				    (tipif->ipif_flags & IPIF_UP) &&
796 				    (tipif->ipif_zoneid == zoneid ||
797 				    tipif->ipif_zoneid == ALL_ZONES))
798 					break;
799 			}
800 			mutex_exit(&dst_ill->ill_lock);
801 			if (tipif == NULL)
802 				return (B_FALSE);
803 		}
804 	}
805 
806 matchit:
807 	if (match_flags & MATCH_IRE_GW) {
808 		mutex_enter(&ire->ire_lock);
809 		gw_addr_v6 = ire->ire_gateway_addr_v6;
810 		mutex_exit(&ire->ire_lock);
811 	}
812 	if (match_flags & MATCH_IRE_ILL) {
813 		ire_ill = ire->ire_ill;
814 
815 		/*
816 		 * If asked to match an ill, we *must* match
817 		 * on the ire_ill for ipmp test addresses, or
818 		 * any of the ills in the group for data addresses.
819 		 * If we don't, we may as well fail.
820 		 * However, we need an exception for IRE_LOCALs to ensure
821 		 * we loop back packets even when sent to test addresses on different
822 		 * interfaces in the group.
823 		 */
824 		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
825 		    !(ire->ire_type & IRE_LOCAL)) {
826 			if (ire->ire_ill != ill)
827 				return (B_FALSE);
828 		} else  {
829 			match_flags &= ~MATCH_IRE_TESTHIDDEN;
830 			/*
831 			 * We know that ill is not NULL, but ire_ill could be
832 			 * NULL
833 			 */
834 			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
835 				return (B_FALSE);
836 		}
837 	}
838 	/* No ire_addr_v6 bits set past the mask */
839 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
840 	    ire->ire_addr_v6));
841 	V6_MASK_COPY(*addr, *mask, masked_addr);
842 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
843 	    ((!(match_flags & MATCH_IRE_GW)) ||
844 	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
845 	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
846 	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
847 	    ((!(match_flags & MATCH_IRE_MASK)) ||
848 	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
849 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
850 	    (!is_system_labeled()) ||
851 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
852 		/* We found the matched IRE */
853 		return (B_TRUE);
854 	}
855 	return (B_FALSE);
856 }
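/*
 * Illustrative call (assumed, not from this file): check whether an IRE is
 * an interface route for a given prefix on a specific ill, ignoring the
 * gateway:
 *
 *	if (ire_match_args_v6(ire, &prefix, &prefix_mask, NULL,
 *	    IRE_INTERFACE, ill, zoneid, NULL,
 *	    MATCH_IRE_TYPE | MATCH_IRE_MASK | MATCH_IRE_ILL))
 *		(matched)
 */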
857 
858 /*
859  * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
860  * gateway address. If ill is non-NULL we also match on it.
861  * The caller must hold ips_ip6_ire_head_lock as a reader if lock_held is set.
862  */
863 boolean_t
864 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
865     const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
866 {
867 	ire_t	*ire;
868 	uint_t	match_flags;
869 
870 	if (lock_held)
871 		ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
872 	else
873 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
874 
875 	match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
876 	if (ill != NULL)
877 		match_flags |= MATCH_IRE_ILL;
878 
879 	ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
880 	    &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
881 	    ipst);
882 
883 	if (!lock_held)
884 		rw_exit(&ipst->ips_ip6_ire_head_lock);
885 	if (ire != NULL) {
886 		ire_refrele(ire);
887 		return (B_TRUE);
888 	} else {
889 		return (B_FALSE);
890 	}
891 }
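/*
 * Illustrative use (assumed, not from this file): before accepting an
 * offlink route for a non-global zone, a caller can verify that the zone
 * can actually reach the proposed gateway on-link:
 *
 *	if (!ire_gateway_ok_zone_v6(&gw, zoneid, NULL, NULL, ipst, B_FALSE))
 *		return (ENETUNREACH);
 */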
892 
893 /*
894  * Look up a route in the forwarding table.
895  * A specific lookup is indicated by passing the
896  * required parameters and indicating the
897  * match required in the flag field.
898  *
899  * Supports link-local addresses by following the ipif/ill when recursing.
900  */
901 ire_t *
902 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
903     const in6_addr_t *gateway, int type, const ill_t *ill,
904     zoneid_t zoneid, const ts_label_t *tsl, int flags,
905     uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
906 {
907 	ire_t *ire = NULL;
908 
909 	ASSERT(addr != NULL);
910 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
911 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
912 	ASSERT(ill == NULL || ill->ill_isv6);
913 
914 	ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
915 
916 	/*
917 	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
918 	 * is set.
919 	 */
920 	if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
921 		return (NULL);
922 
923 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
924 	ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
925 	    tsl, flags, ipst);
926 	if (ire == NULL) {
927 		rw_exit(&ipst->ips_ip6_ire_head_lock);
928 		return (NULL);
929 	}
930 
931 	/*
932 	 * round-robin only if we have more than one route in the bucket.
933 	 * ips_ip_ecmp_behavior controls when we do ECMP
934 	 *	2:	always
935 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
936 	 *	0:	never
937 	 *
938 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
939 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
940 	 * and the IRE_INTERFACEs are likely to be shorter matches.
941 	 */
942 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
943 		if (ipst->ips_ip_ecmp_behavior == 2 ||
944 		    (ipst->ips_ip_ecmp_behavior == 1 &&
945 		    IS_DEFAULT_ROUTE_V6(ire))) {
946 			ire_t	*next_ire;
947 			ire_ftable_args_t margs;
948 
949 			(void) memset(&margs, 0, sizeof (margs));
950 			margs.ift_addr_v6 = *addr;
951 			if (mask != NULL)
952 				margs.ift_mask_v6 = *mask;
953 			if (gateway != NULL)
954 				margs.ift_gateway_v6 = *gateway;
955 			margs.ift_type = type;
956 			margs.ift_ill = ill;
957 			margs.ift_zoneid = zoneid;
958 			margs.ift_tsl = tsl;
959 			margs.ift_flags = flags;
960 
961 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
962 			    xmit_hint, ire, ipst);
963 			if (next_ire == NULL) {
964 				/* keep ire if next_ire is null */
965 				goto done;
966 			}
967 			ire_refrele(ire);
968 			ire = next_ire;
969 		}
970 	}
971 
972 done:
973 	/* Return generation before dropping lock */
974 	if (generationp != NULL)
975 		*generationp = ire->ire_generation;
976 
977 	rw_exit(&ipst->ips_ip6_ire_head_lock);
978 
979 	/*
980 	 * For shared-IP zones we need additional checks to what was
981 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
982 	 *
983 	 * When ip_restrict_interzone_loopback is set, then
984 	 * we ensure that IRE_LOCAL are only used for loopback
985 	 * between zones when the logical "Ethernet" would
986 	 * have looped them back. That is, if in the absence of
987 	 * the IRE_LOCAL we would have sent the packet out the
988 	 * same ill.
989 	 */
990 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
991 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
992 	    ipst->ips_ip_restrict_interzone_loopback) {
993 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
994 		ASSERT(ire != NULL);
995 	}
996 
997 	return (ire);
998 }
999 
1000 /*
1001  * Look up a single ire. The caller holds either the read or write lock.
1002  */
1003 ire_t *
1004 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1005     const in6_addr_t *gateway, int type, const ill_t *ill,
1006     zoneid_t zoneid, const ts_label_t *tsl, int flags,
1007     ip_stack_t *ipst)
1008 {
1009 	irb_t *irb_ptr;
1010 	ire_t *ire = NULL;
1011 	int i;
1012 
1013 	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
1014 
1015 	/*
1016 	 * If the mask is known, the lookup
1017 	 * is simple; if the mask is not known
1018 	 * we need to search.
1019 	 */
1020 	if (flags & MATCH_IRE_MASK) {
1021 		uint_t masklen;
1022 
1023 		masklen = ip_mask_to_plen_v6(mask);
1024 		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
1025 			return (NULL);
1026 		}
1027 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
1028 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
1029 		    ipst->ips_ip6_ftable_hash_size)]);
1030 		rw_enter(&irb_ptr->irb_lock, RW_READER);
1031 		for (ire = irb_ptr->irb_ire; ire != NULL;
1032 		    ire = ire->ire_next) {
1033 			if (IRE_IS_CONDEMNED(ire))
1034 				continue;
1035 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1036 			    ill, zoneid, tsl, flags))
1037 				goto found_ire;
1038 		}
1039 		rw_exit(&irb_ptr->irb_lock);
1040 	} else {
1041 		uint_t masklen;
1042 
1043 		/*
1044 		 * In this case we don't know the mask, we need to
1045 		 * search the table assuming different mask sizes.
1046 		 */
1047 		if (flags & MATCH_IRE_SHORTERMASK) {
1048 			masklen = ip_mask_to_plen_v6(mask);
1049 			if (masklen == 0) {
1050 				/* Nothing shorter than zero */
1051 				return (NULL);
1052 			}
1053 			masklen--;
1054 		} else {
1055 			masklen = IP6_MASK_TABLE_SIZE - 1;
1056 		}
1057 
1058 		for (i = masklen; i >= 0; i--) {
1059 			in6_addr_t tmpmask;
1060 
1061 			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
1062 				continue;
1063 			(void) ip_plen_to_mask_v6(i, &tmpmask);
1064 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
1065 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1066 			    ipst->ips_ip6_ftable_hash_size)];
1067 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1068 			for (ire = irb_ptr->irb_ire; ire != NULL;
1069 			    ire = ire->ire_next) {
1070 				if (IRE_IS_CONDEMNED(ire))
1071 					continue;
1072 				if (ire_match_args_v6(ire, addr,
1073 				    &ire->ire_mask_v6, gateway, type, ill,
1074 				    zoneid, tsl, flags))
1075 					goto found_ire;
1076 			}
1077 			rw_exit(&irb_ptr->irb_lock);
1078 		}
1079 	}
1080 	ASSERT(ire == NULL);
1081 	ip1dbg(("ire_ftable_lookup_impl_v6: returning NULL ire"));
1082 	return (NULL);
1083 
1084 found_ire:
1085 	ire_refhold(ire);
1086 	rw_exit(&irb_ptr->irb_lock);
1087 	return (ire);
1088 }
1089 
1090 
1091 /*
1092  * This function is called by
1093  * ip_input/ire_route_recursive when doing a route lookup on only the
1094  * destination address.
1095  *
1096  * The optimizations of this function over ire_ftable_lookup are:
1097  *	o removing unnecessary flag matching
1098  *	o doing longest prefix match instead of overloading it further
1099  *	  with the unnecessary "best_prefix_match"
1100  *
1101  * If no route is found we return IRE_NOROUTE.
1102  */
1103 ire_t *
1104 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
1105     ip_stack_t *ipst, uint_t *generationp)
1106 {
1107 	ire_t	*ire;
1108 
1109 	ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
1110 	    MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
1111 	if (ire == NULL) {
1112 		ire = ire_reject(ipst, B_TRUE);
1113 		if (generationp != NULL)
1114 			*generationp = IRE_GENERATION_VERIFY;
1115 	}
1116 	/* ftable_lookup did round robin */
1117 	return (ire);
1118 }
1119 
1120 ire_t *
1121 ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa,
1122     uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
1123 {
1124 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1125 
1126 	return (ip_select_route(dst, ixa, generationp, setsrcp, errorp,
1127 	    multirtp));
1128 }
1129 
1130 /*
1131  * Recursively look for a route to the destination. Can also match on
1132  * the zoneid, ill, and label. Used for the data paths. See also
1133  * ire_route_recursive_dstonly.
1134  *
1135  * If ill is set this means we will match it by adding MATCH_IRE_ILL.
1136  *
1137  * If allocate is not set then we will only inspect the existing IREs; never
1138  * create an IRE_IF_CLONE. This is used on the receive side when we are not
1139  * forwarding.
1140  *
1141  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1142  * instead.
1143  *
1144  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1145  * is an error.
1146  * Allow at most one RTF_INDIRECT.
1147  */
1148 ire_t *
1149 ire_route_recursive_impl_v6(ire_t *ire,
1150     const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
1151     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1152     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
1153     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1154 {
1155 	int		i, j;
1156 	in6_addr_t	v6nexthop = *nexthop;
1157 	ire_t		*ires[MAX_IRE_RECURSION];
1158 	uint_t		generation;
1159 	uint_t		generations[MAX_IRE_RECURSION];
1160 	boolean_t	need_refrele = B_FALSE;
1161 	boolean_t	invalidate = B_FALSE;
1162 	int		prefs[MAX_IRE_RECURSION];
1163 	ill_t		*ill = NULL;
1164 
1165 	if (setsrcp != NULL)
1166 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1167 	if (gwattrp != NULL)
1168 		ASSERT(*gwattrp == NULL);
1169 
1170 	if (ill_arg != NULL)
1171 		match_args |= MATCH_IRE_ILL;
1172 
1173 	/*
1174 	 * We iterate up to three times to resolve a route, even though
1175 	 * we have four slots in the array. The extra slot is for an
1176 	 * IRE_IF_CLONE we might need to create.
1177 	 */
1178 	i = 0;
1179 	while (i < MAX_IRE_RECURSION - 1) {
1180 		/* ire_ftable_lookup handles round-robin/ECMP */
1181 		if (ire == NULL) {
1182 			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
1183 			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
1184 			    match_args, xmit_hint, ipst, &generation);
1185 		} else {
1186 			/* Caller passed it; extra hold since we will rele */
1187 			ire_refhold(ire);
1188 			if (generationp != NULL)
1189 				generation = *generationp;
1190 			else
1191 				generation = IRE_GENERATION_VERIFY;
1192 		}
1193 
1194 		if (ire == NULL)
1195 			ire = ire_reject(ipst, B_TRUE);
1196 
1197 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
1198 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1199 			goto error;
1200 
1201 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1202 
1203 		prefs[i] = ire_pref(ire);
1204 		if (i != 0) {
1205 			/*
1206 			 * Don't allow anything unusual past the first
1207 			 * iteration.
1208 			 */
1209 			if ((ire->ire_type &
1210 			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
1211 			    prefs[i] <= prefs[i-1]) {
1212 				ire_refrele(ire);
1213 				ire = ire_reject(ipst, B_TRUE);
1214 				goto error;
1215 			}
1216 		}
1217 		/* We have a usable IRE */
1218 		ires[i] = ire;
1219 		generations[i] = generation;
1220 		i++;
1221 
1222 		/* The first RTF_SETSRC address is passed back if setsrcp */
1223 		if ((ire->ire_flags & RTF_SETSRC) &&
1224 		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
1225 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1226 			    &ire->ire_setsrc_addr_v6));
1227 			*setsrcp = ire->ire_setsrc_addr_v6;
1228 		}
1229 
1230 		/* The first ire_gw_secattr is passed back if gwattrp */
1231 		if (ire->ire_gw_secattr != NULL &&
1232 		    gwattrp != NULL && *gwattrp == NULL)
1233 			*gwattrp = ire->ire_gw_secattr;
1234 
1235 		/*
1236 		 * Check if we have a short-cut pointer to an IRE for this
1237 		 * destination, and that the cached dependency isn't stale.
1238 		 * In that case we've rejoined an existing tree towards a
1239 		 * parent, thus we don't need to continue the loop to
1240 		 * discover the rest of the tree.
1241 		 */
1242 		mutex_enter(&ire->ire_lock);
1243 		if (ire->ire_dep_parent != NULL &&
1244 		    ire->ire_dep_parent->ire_generation ==
1245 		    ire->ire_dep_parent_generation) {
1246 			mutex_exit(&ire->ire_lock);
1247 			ire = NULL;
1248 			goto done;
1249 		}
1250 		mutex_exit(&ire->ire_lock);
1251 
1252 		/*
1253 		 * If this type should have an ire_nce_cache (even if it
1254 		 * doesn't yet have one) then we are done. Includes
1255 		 * IRE_INTERFACE with a full 128 bit mask.
1256 		 */
1257 		if (ire->ire_nce_capable) {
1258 			ire = NULL;
1259 			goto done;
1260 		}
1261 
1262 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1263 		/*
1264 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1265 		 * particular destination
1266 		 */
1267 		if (ire->ire_type & IRE_INTERFACE) {
1268 			ire_t		*clone;
1269 
1270 			ASSERT(ire->ire_masklen != IPV6_ABITS);
1271 
1272 			/*
1273 			 * In the case of ip_input and ILLF_FORWARDING not
1274 			 * being set, and in the case of RTM_GET,
1275 			 * there is no point in allocating
1276 			 * an IRE_IF_CLONE. We return the IRE_INTERFACE.
1277 			 * Note that !allocate can result in a ire_dep_parent
1278 			 * which is IRE_IF_* without an IRE_IF_CLONE.
1279 			 * We recover from that when we need to send packets
1280 			 * by ensuring that the generations become
1281 			 * IRE_GENERATION_VERIFY in this case.
1282 			 */
1283 			if (!allocate) {
1284 				invalidate = B_TRUE;
1285 				ire = NULL;
1286 				goto done;
1287 			}
1288 
1289 			clone = ire_create_if_clone(ire, &v6nexthop,
1290 			    &generation);
1291 			if (clone == NULL) {
1292 				/*
1293 				 * Temporary failure - no memory.
1294 				 * Don't want caller to cache IRE_NOROUTE.
1295 				 */
1296 				invalidate = B_TRUE;
1297 				ire = ire_blackhole(ipst, B_TRUE);
1298 				goto error;
1299 			}
1300 			/*
1301 			 * Make clone next to last entry and the
1302 			 * IRE_INTERFACE the last in the dependency
1303 			 * chain since the clone depends on the
1304 			 * IRE_INTERFACE.
1305 			 */
1306 			ASSERT(i >= 1);
1307 			ASSERT(i < MAX_IRE_RECURSION);
1308 
1309 			ires[i] = ires[i-1];
1310 			generations[i] = generations[i-1];
1311 			ires[i-1] = clone;
1312 			generations[i-1] = generation;
1313 			i++;
1314 
1315 			ire = NULL;
1316 			goto done;
1317 		}
1318 
1319 		/*
1320 		 * We only match on the type and optionally ILL when
1321 		 * recursing. The type match is used by some callers
1322 		 * to exclude certain types (such as IRE_IF_CLONE or
1323 		 * IRE_LOCAL|IRE_LOOPBACK).
1324 		 */
1325 		match_args &= MATCH_IRE_TYPE;
1326 		v6nexthop = ire->ire_gateway_addr_v6;
1327 		if (ill == NULL && ire->ire_ill != NULL) {
1328 			ill = ire->ire_ill;
1329 			need_refrele = B_TRUE;
1330 			ill_refhold(ill);
1331 			match_args |= MATCH_IRE_ILL;
1332 		}
1333 
1334 		ire = NULL;
1335 	}
1336 	ASSERT(ire == NULL);
1337 	ire = ire_reject(ipst, B_TRUE);
1338 
1339 error:
1340 	ASSERT(ire != NULL);
1341 	if (need_refrele)
1342 		ill_refrele(ill);
1343 
1344 	/*
1345 	 * In the case of MULTIRT we want to try a different IRE the next
1346 	 * time. We let the next packet retry in that case.
1347 	 */
1348 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1349 		(void) ire_no_good(ires[0]);
1350 
1351 cleanup:
1352 	/* cleanup ires[i] */
1353 	ire_dep_unbuild(ires, i);
1354 	for (j = 0; j < i; j++)
1355 		ire_refrele(ires[j]);
1356 
1357 	ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
1358 	/*
1359 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1360 	 * ip_select_route since the reject or lack of memory might be gone.
1361 	 */
1362 	if (generationp != NULL)
1363 		*generationp = IRE_GENERATION_VERIFY;
1364 	return (ire);
1365 
1366 done:
1367 	ASSERT(ire == NULL);
1368 	if (need_refrele)
1369 		ill_refrele(ill);
1370 
1371 	/* Build dependencies */
1372 	if (!ire_dep_build(ires, generations, i)) {
1373 		/* Something in chain was condemned; tear it apart */
1374 		ire = ire_blackhole(ipst, B_TRUE);
1375 		goto cleanup;
1376 	}
1377 
1378 	/*
1379 	 * Release all refholds except the one for ires[0] that we
1380 	 * will return to the caller.
1381 	 */
1382 	for (j = 1; j < i; j++)
1383 		ire_refrele(ires[j]);
1384 
1385 	if (invalidate) {
1386 		/*
1387 		 * Since we needed to allocate but couldn't we need to make
1388 		 * sure that the dependency chain is rebuilt the next time.
1389 		 */
1390 		ire_dep_invalidate_generations(ires[0]);
1391 		generation = IRE_GENERATION_VERIFY;
1392 	} else {
1393 		/*
1394 		 * IREs can have been added or deleted while we did the
1395 		 * recursive lookup and we can't catch those until we've built
1396 		 * the dependencies. We verify the stored
1397 		 * ire_dep_parent_generation to catch any such changes and
1398 		 * return IRE_GENERATION_VERIFY (which will cause
1399 		 * ip_select_route to be called again so we can redo the
1400 		 * recursive lookup next time we send a packet).
1401 		 */
1402 		generation = ire_dep_validate_generations(ires[0]);
1403 		if (generations[0] != ires[0]->ire_generation) {
1404 			/* Something changed at the top */
1405 			generation = IRE_GENERATION_VERIFY;
1406 		}
1407 	}
1408 	if (generationp != NULL)
1409 		*generationp = generation;
1410 
1411 	return (ires[0]);
1412 }
1413 
1414 ire_t *
1415 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
1416     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1417     boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
1418     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1419 {
1420 	return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
1421 	    zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
1422 	    gwattrp, generationp));
1423 }
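/*
 * Illustrative send-side call (assumed, not from this file): resolve a
 * destination for transmit, collecting any RTF_SETSRC source address and
 * gateway security attributes along the chain:
 *
 *	in6_addr_t		setsrc = ipv6_all_zeros;
 *	tsol_ire_gw_secattr_t	*attrp = NULL;
 *	uint_t			generation;
 *
 *	ire = ire_route_recursive_v6(&dst, 0, NULL, zoneid, tsl, 0,
 *	    B_TRUE, xmit_hint, ipst, &setsrc, &attrp, &generation);
 *
 * The returned ire is never NULL; RTF_REJECT/RTF_BLACKHOLE entries signal
 * that there is no usable route, and the ire must be released with
 * ire_refrele().
 */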
1424 
1425 /*
1426  * Recursively look for a route to the destination.
1427  * We only handle a destination match here, yet we have the same arguments
1428  * as the full match to allow function pointers to select between the two.
1429  *
1430  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1431  * instead.
1432  *
1433  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1434  * is an error.
1435  * Allow at most one RTF_INDIRECT.
1436  */
1437 ire_t *
1438 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate,
1439     uint32_t xmit_hint, ip_stack_t *ipst)
1440 {
1441 	ire_t	*ire;
1442 	ire_t	*ire1;
1443 	uint_t	generation;
1444 
1445 	/* ire_ftable_lookup handles round-robin/ECMP */
1446 	ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
1447 	    &generation);
1448 	ASSERT(ire != NULL);
1449 
1450 	/*
1451 	 * If this type should have an ire_nce_cache (even if it
1452 	 * doesn't yet have one) then we are done. Includes
1453 	 * IRE_INTERFACE with a full 128 bit mask.
1454 	 */
1455 	if (ire->ire_nce_capable)
1456 		return (ire);
1457 
1458 	/*
1459 	 * If the IRE has a current cached parent we know that the whole
1460 	 * parent chain is current, hence we don't need to discover and
1461 	 * build any dependencies by doing a recursive lookup.
1462 	 */
1463 	mutex_enter(&ire->ire_lock);
1464 	if (ire->ire_dep_parent != NULL &&
1465 	    ire->ire_dep_parent->ire_generation ==
1466 	    ire->ire_dep_parent_generation) {
1467 		mutex_exit(&ire->ire_lock);
1468 		return (ire);
1469 	}
1470 	mutex_exit(&ire->ire_lock);
1471 
1472 	/*
1473 	 * Fall back to the loop in the normal code starting with the ire
1474 	 * we found. Normally this would return the same ire.
1475 	 */
1476 	ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
1477 	    NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
1478 	    &generation);
1479 	ire_refrele(ire);
1480 	return (ire1);
1481 }
1482