xref: /titanic_52/usr/src/uts/common/inet/ip/ip6_ire.c (revision f6e214c7418f43af38bd8c3a557e3d0a1d311cfa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 1990 Mentat Inc.
24  */
25 
26 /*
27  * This file contains routines that manipulate Internet Routing Entries (IREs).
28  */
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/ddi.h>
33 #include <sys/cmn_err.h>
34 
35 #include <sys/systm.h>
36 #include <sys/param.h>
37 #include <sys/socket.h>
38 #include <net/if.h>
39 #include <net/route.h>
40 #include <netinet/in.h>
41 #include <net/if_dl.h>
42 #include <netinet/ip6.h>
43 #include <netinet/icmp6.h>
44 
45 #include <inet/common.h>
46 #include <inet/mi.h>
47 #include <inet/ip.h>
48 #include <inet/ip6.h>
49 #include <inet/ip_ndp.h>
50 #include <inet/ip_if.h>
51 #include <inet/ip_ire.h>
52 #include <inet/ipclassifier.h>
53 #include <inet/nd.h>
54 #include <inet/tunables.h>
55 #include <sys/kmem.h>
56 #include <sys/zone.h>
57 
58 #include <sys/tsol/label.h>
59 #include <sys/tsol/tnet.h>
60 
/*
 * True when the IRE is a default route: either an explicit IRE_DEFAULT,
 * or an interface route (IRE_INTERFACE) whose destination address is the
 * unspecified address.  The argument is evaluated more than once, so it
 * must not have side effects.
 */
#define	IS_DEFAULT_ROUTE_V6(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && \
	    (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))

/*
 * Template that ire_create_v6() copies over every freshly allocated IRE
 * before initializing it.  Never written to in this file; being a static
 * object without an initializer it starts out all zeroes.
 */
static	ire_t	ire_null;

/*
 * Forward declaration: the single-IRE lookup is used by routines that
 * appear before its definition in this file.
 */
static ire_t *
ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, int flags,
    ip_stack_t *ipst);
73 
74 /*
75  * Initialize the ire that is specific to IPv6 part and call
76  * ire_init_common to finish it.
77  * Returns zero or errno.
78  */
79 int
80 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
81     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
82     zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
83 {
84 	int error;
85 
86 	/*
87 	 * Reject IRE security attmakeribute creation/initialization
88 	 * if system is not running in Trusted mode.
89 	 */
90 	if (gc != NULL && !is_system_labeled())
91 		return (EINVAL);
92 
93 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
94 	if (v6addr != NULL)
95 		ire->ire_addr_v6 = *v6addr;
96 	if (v6gateway != NULL)
97 		ire->ire_gateway_addr_v6 = *v6gateway;
98 
99 	/* Make sure we don't have stray values in some fields */
100 	switch (type) {
101 	case IRE_LOOPBACK:
102 	case IRE_HOST:
103 	case IRE_LOCAL:
104 	case IRE_IF_CLONE:
105 		ire->ire_mask_v6 = ipv6_all_ones;
106 		ire->ire_masklen = IPV6_ABITS;
107 		break;
108 	case IRE_PREFIX:
109 	case IRE_DEFAULT:
110 	case IRE_IF_RESOLVER:
111 	case IRE_IF_NORESOLVER:
112 		if (v6mask != NULL) {
113 			ire->ire_mask_v6 = *v6mask;
114 			ire->ire_masklen =
115 			    ip_mask_to_plen_v6(&ire->ire_mask_v6);
116 		}
117 		break;
118 	case IRE_MULTICAST:
119 	case IRE_NOROUTE:
120 		ASSERT(v6mask == NULL);
121 		break;
122 	default:
123 		ASSERT(0);
124 		return (EINVAL);
125 	}
126 
127 	error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
128 	    gc, ipst);
129 	if (error != NULL)
130 		return (error);
131 
132 	/* Determine which function pointers to use */
133 	ire->ire_postfragfn = ip_xmit;		/* Common case */
134 
135 	switch (ire->ire_type) {
136 	case IRE_LOCAL:
137 		ire->ire_sendfn = ire_send_local_v6;
138 		ire->ire_recvfn = ire_recv_local_v6;
139 		ASSERT(ire->ire_ill != NULL);
140 		if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
141 			ire->ire_recvfn = ire_recv_noaccept_v6;
142 		break;
143 	case IRE_LOOPBACK:
144 		ire->ire_sendfn = ire_send_local_v6;
145 		ire->ire_recvfn = ire_recv_loopback_v6;
146 		break;
147 	case IRE_MULTICAST:
148 		ire->ire_postfragfn = ip_postfrag_loopcheck;
149 		ire->ire_sendfn = ire_send_multicast_v6;
150 		ire->ire_recvfn = ire_recv_multicast_v6;
151 		break;
152 	default:
153 		/*
154 		 * For IRE_IF_ALL and IRE_OFFLINK we forward received
155 		 * packets by default.
156 		 */
157 		ire->ire_sendfn = ire_send_wire_v6;
158 		ire->ire_recvfn = ire_recv_forward_v6;
159 		break;
160 	}
161 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
162 		ire->ire_sendfn = ire_send_noroute_v6;
163 		ire->ire_recvfn = ire_recv_noroute_v6;
164 	} else if (ire->ire_flags & RTF_MULTIRT) {
165 		ire->ire_postfragfn = ip_postfrag_multirt_v6;
166 		ire->ire_sendfn = ire_send_multirt_v6;
167 		ire->ire_recvfn = ire_recv_multirt_v6;
168 	}
169 	ire->ire_nce_capable = ire_determine_nce_capable(ire);
170 	return (0);
171 }
172 
173 /*
174  * ire_create_v6 is called to allocate and initialize a new IRE.
175  *
176  * NOTE : This is called as writer sometimes though not required
177  * by this function.
178  */
179 /* ARGSUSED */
180 ire_t *
181 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
182     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
183     uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
184 {
185 	ire_t	*ire;
186 	int	error;
187 
188 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
189 
190 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
191 	if (ire == NULL) {
192 		DTRACE_PROBE(kmem__cache__alloc);
193 		return (NULL);
194 	}
195 	*ire = ire_null;
196 
197 	error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
198 	    type, ill, zoneid, flags, gc, ipst);
199 
200 	if (error != 0) {
201 		DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
202 		kmem_cache_free(ire_cache, ire);
203 		return (NULL);
204 	}
205 	return (ire);
206 }
207 
208 /*
209  * Find the ill matching a multicast group.
210  * Allows different routes for multicast addresses
211  * in the unicast routing table (akin to FF::0/8 but could be more specific)
212  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
213  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
214  * specify the interface to join on.
215  *
216  * Supports link-local addresses by using ire_route_recursive which follows
217  * the ill when recursing.
218  *
219  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
220  * and the MULTIRT property can be different for different groups, we
221  * extract RTF_MULTIRT from the special unicast route added for a group
222  * with CGTP and pass that back in the multirtp argument.
223  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
224  * We have a setsrcp argument for the same reason.
225  */
226 ill_t *
227 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
228     ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
229 {
230 	ire_t	*ire;
231 	ill_t	*ill;
232 
233 	ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
234 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
235 	ASSERT(ire != NULL);
236 
237 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
238 		ire_refrele(ire);
239 		return (NULL);
240 	}
241 
242 	if (multirtp != NULL)
243 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
244 
245 	ill = ire_nexthop_ill(ire);
246 	ire_refrele(ire);
247 	return (ill);
248 }
249 
250 /*
251  * This function takes a mask and returns number of bits set in the
252  * mask (the represented prefix length).  Assumes a contiguous mask.
253  */
254 int
255 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
256 {
257 	int		bits;
258 	int		plen = IPV6_ABITS;
259 	int		i;
260 
261 	for (i = 3; i >= 0; i--) {
262 		if (v6mask->s6_addr32[i] == 0) {
263 			plen -= 32;
264 			continue;
265 		}
266 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
267 		if (bits == 0)
268 			break;
269 		plen -= bits;
270 	}
271 
272 	return (plen);
273 }
274 
275 /*
276  * Convert a prefix length to the mask for that prefix.
277  * Returns the argument bitmask.
278  */
279 in6_addr_t *
280 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
281 {
282 	uint32_t *ptr;
283 
284 	if (plen < 0 || plen > IPV6_ABITS)
285 		return (NULL);
286 	*bitmask = ipv6_all_zeros;
287 	if (plen == 0)
288 		return (bitmask);
289 
290 	ptr = (uint32_t *)bitmask;
291 	while (plen > 32) {
292 		*ptr++ = 0xffffffffU;
293 		plen -= 32;
294 	}
295 	*ptr = htonl(0xffffffffU << (32 - plen));
296 	return (bitmask);
297 }
298 
/*
 * Add a fully initialized IPv6 IRE to the forwarding table.
 * This returns NULL on failure, or a held IRE on success.
 * Normally the returned IRE is the same as the argument. But a different
 * IRE will be returned if the added IRE is deemed identical to an existing
 * one. In that case ire_identical_ref will be increased (except for
 * IRE_IF_CLONE) and the IRE passed in is disposed of with ire_delete().
 * On failure the IRE passed in is likewise handed to ire_delete().
 * The caller always needs to do an ire_refrele() on the returned IRE.
 */
ire_t *
ire_add_v6(ire_t *ire)
{
	ire_t	*ire1;
	int	mask_table_index;
	irb_t	*irb_ptr;
	ire_t	**irep;
	int	match_flags;
	int	error;
	ip_stack_t	*ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);

	/* Make sure the address is properly masked. */
	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);

	/* The forwarding table is indexed by prefix length. */
	mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
	if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
		irb_t *ptr;
		int i;

		/*
		 * No hash table for this prefix length yet. Build one
		 * outside of the lock and install it only if nobody else
		 * has done so in the meantime.
		 */
		ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
		    sizeof (irb_t)));
		if (ptr == NULL) {
			ire_delete(ire);
			return (NULL);
		}
		for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
			rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
			ptr[i].irb_ipst = ipst;
		}
		mutex_enter(&ipst->ips_ire_ft_init_lock);
		if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
		    NULL) {
			ipst->ips_ip_forwarding_table_v6[mask_table_index] =
			    ptr;
			mutex_exit(&ipst->ips_ire_ft_init_lock);
		} else {
			/*
			 * Some other thread won the race in
			 * initializing the forwarding table at the
			 * same index.
			 */
			mutex_exit(&ipst->ips_ire_ft_init_lock);
			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
				rw_destroy(&ptr[i].irb_lock);
			}
			mi_free(ptr);
		}
	}
	/* Pick the bucket for this address/mask within that table. */
	irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
	    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
	    ipst->ips_ip6_ftable_hash_size)]);

	match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
	if (ire->ire_ill != NULL)
		match_flags |= MATCH_IRE_ILL;
	/*
	 * Start the atomic add of the ire. Grab the bucket lock and the
	 * ill lock. Check for condemned.
	 */
	error = ire_atomic_start(irb_ptr, ire);
	if (error != 0) {
		ire_delete(ire);
		return (NULL);
	}

	/*
	 * If we are creating a hidden IRE, make sure we search for
	 * hidden IREs when searching for duplicates below.
	 * Otherwise, we might find an IRE on some other interface
	 * that's not marked hidden.
	 */
	if (ire->ire_testhidden)
		match_flags |= MATCH_IRE_TESTHIDDEN;

	/*
	 * Atomically check for duplicate and insert in the table.
	 */
	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
		if (IRE_IS_CONDEMNED(ire1))
			continue;
		/*
		 * Here we need an exact match on zoneid, i.e.,
		 * ire_match_args doesn't fit.
		 */
		if (ire1->ire_zoneid != ire->ire_zoneid)
			continue;

		/* Exact type match; ire_match_args_v6 only tests overlap. */
		if (ire1->ire_type != ire->ire_type)
			continue;

		/*
		 * Note: We do not allow multiple routes that differ only
		 * in the gateway security attributes; such routes are
		 * considered duplicates.
		 * To change that we explicitly have to treat them as
		 * different here.
		 */
		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
		    ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
		    match_flags)) {
			/*
			 * Return the old ire after doing a REFHOLD.
			 * As most of the callers continue to use the IRE
			 * after adding, we return a held ire. This will
			 * avoid a lookup in the caller again. If the callers
			 * don't want to use it, they need to do a REFRELE.
			 *
			 * We only allow exactly one IRE_IF_CLONE for any dst,
			 * so, if this is an IF_CLONE, return the ire without
			 * an identical_ref, but with an ire_ref held.
			 */
			if (ire->ire_type != IRE_IF_CLONE) {
				atomic_add_32(&ire1->ire_identical_ref, 1);
				DTRACE_PROBE2(ire__add__exist, ire_t *, ire1,
				    ire_t *, ire);
			}
			ip1dbg(("found dup ire existing %p new %p",
			    (void *)ire1, (void *)ire));
			ire_refhold(ire1);
			ire_atomic_end(irb_ptr, ire);
			/* The new IRE is redundant; get rid of it. */
			ire_delete(ire);
			return (ire1);
		}
	}

	/*
	 * Normally we do head insertion since most things do not care about
	 * the order of the IREs in the bucket.
	 * However, due to shared-IP zones (and restrict_interzone_loopback)
	 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
	 * address. For that reason we do tail insertion for IRE_IF_CLONE.
	 */
	/*
	 * NOTE(review): the cast treats the bucket itself as the location
	 * of its head pointer, which presumably relies on irb_ire being the
	 * first member of irb_t -- confirm against the irb_t definition.
	 */
	irep = (ire_t **)irb_ptr;
	if (ire->ire_type & IRE_IF_CLONE) {
		/* Walk to the ire_next slot of the last IRE in the bucket. */
		while ((ire1 = *irep) != NULL)
			irep = &ire1->ire_next;
	}
	/* Insert at *irep */
	ire1 = *irep;
	if (ire1 != NULL)
		ire1->ire_ptpn = &ire->ire_next;
	ire->ire_next = ire1;
	/* Link the new one in. */
	ire->ire_ptpn = irep;
	/*
	 * ire_walk routines de-reference ire_next without holding
	 * a lock. Before we point to the new ire, we want to make
	 * sure the store that sets the ire_next of the new ire
	 * reaches global visibility, so that ire_walk routines
	 * don't see a truncated list of ires i.e if the ire_next
	 * of the new ire gets set after we do "*irep = ire" due
	 * to re-ordering, the ire_walk thread will see a NULL
	 * once it accesses the ire_next of the new ire.
	 * membar_producer() makes sure that the following store
	 * happens *after* all of the above stores.
	 */
	membar_producer();
	*irep = ire;
	ire->ire_bucket = irb_ptr;
	/*
	 * We return a bumped up IRE above. Keep it symmetrical
	 * so that the callers will always have to release. This
	 * helps the callers of this function because they continue
	 * to use the IRE after adding and hence they don't have to
	 * lookup again after we return the IRE.
	 *
	 * NOTE : We don't have to use atomics as this is appearing
	 * in the list for the first time and no one else can bump
	 * up the reference count on this yet.
	 */
	ire_refhold_locked(ire);
	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
	irb_ptr->irb_ire_cnt++;

	/* Account for the new IRE on the ill it refers to, if any. */
	if (ire->ire_ill != NULL) {
		DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
		    (char *), "ire", (void *), ire);
		ire->ire_ill->ill_ire_cnt++;
		ASSERT(ire->ire_ill->ill_ire_cnt != 0);	/* Wraparound */
	}
	ire_atomic_end(irb_ptr, ire);

	/* Make any caching of the IREs be notified or updated */
	ire_flush_cache_v6(ire, IRE_FLUSH_ADD);

	return (ire);
}
497 
498 /*
499  * Search for all HOST REDIRECT routes that are
500  * pointing at the specified gateway and
501  * delete them. This routine is called only
502  * when a default gateway is going away.
503  */
504 static void
505 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
506 {
507 	irb_t *irb_ptr;
508 	irb_t *irb;
509 	ire_t *ire;
510 	in6_addr_t gw_addr_v6;
511 	int i;
512 
513 	/* get the hash table for HOST routes */
514 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
515 	if (irb_ptr == NULL)
516 		return;
517 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
518 		irb = &irb_ptr[i];
519 		irb_refhold(irb);
520 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
521 			if (!(ire->ire_flags & RTF_DYNAMIC))
522 				continue;
523 			mutex_enter(&ire->ire_lock);
524 			gw_addr_v6 = ire->ire_gateway_addr_v6;
525 			mutex_exit(&ire->ire_lock);
526 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
527 				ire_delete(ire);
528 		}
529 		irb_refrele(irb);
530 	}
531 }
532 
533 /*
534  * Delete the specified IRE.
535  * All calls should use ire_delete().
536  * Sometimes called as writer though not required by this function.
537  *
538  * NOTE : This function is called only if the ire was added
539  * in the list.
540  */
541 void
542 ire_delete_v6(ire_t *ire)
543 {
544 	in6_addr_t gw_addr_v6;
545 	ip_stack_t	*ipst = ire->ire_ipst;
546 
547 	/*
548 	 * Make sure ire_generation increases from ire_flush_cache happen
549 	 * after any lookup/reader has read ire_generation.
550 	 * Since the rw_enter makes us wait until any lookup/reader has
551 	 * completed we can exit the lock immediately.
552 	 */
553 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
554 	rw_exit(&ipst->ips_ip6_ire_head_lock);
555 
556 	ASSERT(ire->ire_refcnt >= 1);
557 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
558 
559 	ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
560 
561 	if (ire->ire_type == IRE_DEFAULT) {
562 		/*
563 		 * when a default gateway is going away
564 		 * delete all the host redirects pointing at that
565 		 * gateway.
566 		 */
567 		mutex_enter(&ire->ire_lock);
568 		gw_addr_v6 = ire->ire_gateway_addr_v6;
569 		mutex_exit(&ire->ire_lock);
570 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
571 	}
572 
573 	/*
574 	 * If we are deleting an IRE_INTERFACE then we make sure we also
575 	 * delete any IRE_IF_CLONE that has been created from it.
576 	 * Those are always in ire_dep_children.
577 	 */
578 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
579 		ire_dep_delete_if_clone(ire);
580 
581 	/* Remove from parent dependencies and child */
582 	rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
583 	if (ire->ire_dep_parent != NULL) {
584 		ire_dep_remove(ire);
585 	}
586 	while (ire->ire_dep_children != NULL)
587 		ire_dep_remove(ire->ire_dep_children);
588 	rw_exit(&ipst->ips_ire_dep_lock);
589 }
590 
591 /*
592  * When an IRE is added or deleted this routine is called to make sure
593  * any caching of IRE information is notified or updated.
594  *
595  * The flag argument indicates if the flush request is due to addition
596  * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
597  * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
598  */
599 void
600 ire_flush_cache_v6(ire_t *ire, int flag)
601 {
602 	ip_stack_t *ipst = ire->ire_ipst;
603 
604 	/*
605 	 * IRE_IF_CLONE ire's don't provide any new information
606 	 * than the parent from which they are cloned, so don't
607 	 * perturb the generation numbers.
608 	 */
609 	if (ire->ire_type & IRE_IF_CLONE)
610 		return;
611 
612 	/*
613 	 * Ensure that an ire_add during a lookup serializes the updates of
614 	 * the generation numbers under ire_head_lock so that the lookup gets
615 	 * either the old ire and old generation number, or a new ire and new
616 	 * generation number.
617 	 */
618 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
619 
620 	/*
621 	 * If a route was just added, we need to notify everybody that
622 	 * has cached an IRE_NOROUTE since there might now be a better
623 	 * route for them.
624 	 */
625 	if (flag == IRE_FLUSH_ADD) {
626 		ire_increment_generation(ipst->ips_ire_reject_v6);
627 		ire_increment_generation(ipst->ips_ire_blackhole_v6);
628 	}
629 
630 	/* Adding a default can't otherwise provide a better route */
631 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
632 		rw_exit(&ipst->ips_ip6_ire_head_lock);
633 		return;
634 	}
635 
636 	switch (flag) {
637 	case IRE_FLUSH_DELETE:
638 	case IRE_FLUSH_GWCHANGE:
639 		/*
640 		 * Update ire_generation for all ire_dep_children chains
641 		 * starting with this IRE
642 		 */
643 		ire_dep_incr_generation(ire);
644 		break;
645 	case IRE_FLUSH_ADD: {
646 		in6_addr_t	addr;
647 		in6_addr_t	mask;
648 		ip_stack_t	*ipst = ire->ire_ipst;
649 		uint_t		masklen;
650 
651 		/*
652 		 * Find an IRE which is a shorter match than the ire to be added
653 		 * For any such IRE (which we repeat) we update the
654 		 * ire_generation the same way as in the delete case.
655 		 */
656 		addr = ire->ire_addr_v6;
657 		mask = ire->ire_mask_v6;
658 		masklen = ip_mask_to_plen_v6(&mask);
659 
660 		ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
661 		    ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
662 		while (ire != NULL) {
663 			/* We need to handle all in the same bucket */
664 			irb_increment_generation(ire->ire_bucket);
665 
666 			mask = ire->ire_mask_v6;
667 			ASSERT(masklen > ip_mask_to_plen_v6(&mask));
668 			masklen = ip_mask_to_plen_v6(&mask);
669 			ire_refrele(ire);
670 			ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
671 			    NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
672 		}
673 		}
674 		break;
675 	}
676 	rw_exit(&ipst->ips_ip6_ire_head_lock);
677 }
678 
679 /*
680  * Matches the arguments passed with the values in the ire.
681  *
682  * Note: for match types that match using "ill" passed in, ill
683  * must be checked for non-NULL before calling this routine.
684  */
685 boolean_t
686 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
687     const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
688     const ts_label_t *tsl, int match_flags)
689 {
690 	in6_addr_t masked_addr;
691 	in6_addr_t gw_addr_v6;
692 	ill_t *ire_ill = NULL, *dst_ill;
693 	ip_stack_t *ipst = ire->ire_ipst;
694 
695 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
696 	ASSERT(addr != NULL);
697 	ASSERT(mask != NULL);
698 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
699 	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
700 	    (ill != NULL && ill->ill_isv6));
701 
702 	/*
703 	 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
704 	 * is in fact hidden, to ensure the caller gets the right one.
705 	 */
706 	if (ire->ire_testhidden) {
707 		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
708 			return (B_FALSE);
709 	}
710 
711 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
712 	    ire->ire_zoneid != ALL_ZONES) {
713 		/*
714 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
715 		 * does not match that of ire_zoneid, a failure to
716 		 * match is reported at this point. Otherwise, since some IREs
717 		 * that are available in the global zone can be used in local
718 		 * zones, additional checks need to be performed:
719 		 *
720 		 * IRE_LOOPBACK
721 		 *	entries should never be matched in this situation.
722 		 *	Each zone has its own IRE_LOOPBACK.
723 		 *
724 		 * IRE_LOCAL
725 		 *	We allow them for any zoneid. ire_route_recursive
726 		 *	does additional checks when
727 		 *	ip_restrict_interzone_loopback is set.
728 		 *
729 		 * If ill_usesrc_ifindex is set
730 		 *	Then we check if the zone has a valid source address
731 		 *	on the usesrc ill.
732 		 *
733 		 * If ire_ill is set, then check that the zone has an ipif
734 		 *	on that ill.
735 		 *
736 		 * Outside of this function (in ire_round_robin) we check
737 		 * that any IRE_OFFLINK has a gateway that reachable from the
738 		 * zone when we have multiple choices (ECMP).
739 		 */
740 		if (match_flags & MATCH_IRE_ZONEONLY)
741 			return (B_FALSE);
742 		if (ire->ire_type & IRE_LOOPBACK)
743 			return (B_FALSE);
744 
745 		if (ire->ire_type & IRE_LOCAL)
746 			goto matchit;
747 
748 		/*
749 		 * The normal case of IRE_ONLINK has a matching zoneid.
750 		 * Here we handle the case when shared-IP zones have been
751 		 * configured with IP addresses on vniN. In that case it
752 		 * is ok for traffic from a zone to use IRE_ONLINK routes
753 		 * if the ill has a usesrc pointing at vniN
754 		 * Applies to IRE_INTERFACE.
755 		 */
756 		dst_ill = ire->ire_ill;
757 		if (ire->ire_type & IRE_ONLINK) {
758 			uint_t	ifindex;
759 
760 			/*
761 			 * Note there is no IRE_INTERFACE on vniN thus
762 			 * can't do an IRE lookup for a matching route.
763 			 */
764 			ifindex = dst_ill->ill_usesrc_ifindex;
765 			if (ifindex == 0)
766 				return (B_FALSE);
767 
768 			/*
769 			 * If there is a usable source address in the
770 			 * zone, then it's ok to return this IRE_INTERFACE
771 			 */
772 			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
773 			    zoneid, ipst)) {
774 				ip3dbg(("ire_match_args: no usrsrc for zone"
775 				    " dst_ill %p\n", (void *)dst_ill));
776 				return (B_FALSE);
777 			}
778 		}
779 		/*
780 		 * For example, with
781 		 * route add 11.0.0.0 gw1 -ifp bge0
782 		 * route add 11.0.0.0 gw2 -ifp bge1
783 		 * this code would differentiate based on
784 		 * where the sending zone has addresses.
785 		 * Only if the zone has an address on bge0 can it use the first
786 		 * route. It isn't clear if this behavior is documented
787 		 * anywhere.
788 		 */
789 		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
790 			ipif_t	*tipif;
791 
792 			mutex_enter(&dst_ill->ill_lock);
793 			for (tipif = dst_ill->ill_ipif;
794 			    tipif != NULL; tipif = tipif->ipif_next) {
795 				if (!IPIF_IS_CONDEMNED(tipif) &&
796 				    (tipif->ipif_flags & IPIF_UP) &&
797 				    (tipif->ipif_zoneid == zoneid ||
798 				    tipif->ipif_zoneid == ALL_ZONES))
799 					break;
800 			}
801 			mutex_exit(&dst_ill->ill_lock);
802 			if (tipif == NULL)
803 				return (B_FALSE);
804 		}
805 	}
806 
807 matchit:
808 	ire_ill = ire->ire_ill;
809 	if (match_flags & MATCH_IRE_GW) {
810 		mutex_enter(&ire->ire_lock);
811 		gw_addr_v6 = ire->ire_gateway_addr_v6;
812 		mutex_exit(&ire->ire_lock);
813 	}
814 	if (match_flags & MATCH_IRE_ILL) {
815 
816 		/*
817 		 * If asked to match an ill, we *must* match
818 		 * on the ire_ill for ipmp test addresses, or
819 		 * any of the ill in the group for data addresses.
820 		 * If we don't, we may as well fail.
821 		 * However, we need an exception for IRE_LOCALs to ensure
822 		 * we loopback packets even sent to test addresses on different
823 		 * interfaces in the group.
824 		 */
825 		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
826 		    !(ire->ire_type & IRE_LOCAL)) {
827 			if (ire->ire_ill != ill)
828 				return (B_FALSE);
829 		} else  {
830 			match_flags &= ~MATCH_IRE_TESTHIDDEN;
831 			/*
832 			 * We know that ill is not NULL, but ire_ill could be
833 			 * NULL
834 			 */
835 			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
836 				return (B_FALSE);
837 		}
838 	}
839 	if (match_flags & MATCH_IRE_SRC_ILL) {
840 		if (ire_ill == NULL)
841 			return (B_FALSE);
842 		if (!IS_ON_SAME_LAN(ill, ire_ill)) {
843 			if (ire_ill->ill_usesrc_ifindex == 0 ||
844 			    (ire_ill->ill_usesrc_ifindex !=
845 			    ill->ill_phyint->phyint_ifindex))
846 				return (B_FALSE);
847 		}
848 	}
849 
850 	/* No ire_addr_v6 bits set past the mask */
851 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
852 	    ire->ire_addr_v6));
853 	V6_MASK_COPY(*addr, *mask, masked_addr);
854 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
855 	    ((!(match_flags & MATCH_IRE_GW)) ||
856 	    ((!(match_flags & MATCH_IRE_DIRECT)) ||
857 	    !(ire->ire_flags & RTF_INDIRECT)) &&
858 	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
859 	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
860 	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
861 	    ((!(match_flags & MATCH_IRE_MASK)) ||
862 	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
863 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
864 	    (!is_system_labeled()) ||
865 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
866 		/* We found the matched IRE */
867 		return (B_TRUE);
868 	}
869 	return (B_FALSE);
870 }
871 
872 /*
873  * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
874  * gateway address. If ill is non-NULL we also match on it.
875  * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
876  */
877 boolean_t
878 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
879     const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
880 {
881 	ire_t	*ire;
882 	uint_t	match_flags;
883 
884 	if (lock_held)
885 		ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
886 	else
887 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
888 
889 	match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
890 	if (ill != NULL)
891 		match_flags |= MATCH_IRE_ILL;
892 
893 	ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
894 	    &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
895 	    ipst);
896 
897 	if (!lock_held)
898 		rw_exit(&ipst->ips_ip6_ire_head_lock);
899 	if (ire != NULL) {
900 		ire_refrele(ire);
901 		return (B_TRUE);
902 	} else {
903 		return (B_FALSE);
904 	}
905 }
906 
907 /*
908  * Lookup a route in forwarding table.
909  * specific lookup is indicated by passing the
910  * required parameters and indicating the
911  * match required in flag field.
912  *
913  * Supports link-local addresses by following the ipif/ill when recursing.
914  */
915 ire_t *
916 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
917     const in6_addr_t *gateway, int type, const ill_t *ill,
918     zoneid_t zoneid, const ts_label_t *tsl, int flags,
919     uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
920 {
921 	ire_t *ire = NULL;
922 
	/*
	 * Argument sanity checks: a mask/gateway pointer must accompany
	 * MATCH_IRE_MASK/MATCH_IRE_GW, any ill passed in must be an IPv6
	 * ill, and addr must not be an IPv4-mapped address.
	 */
923 	ASSERT(addr != NULL);
924 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
925 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
926 	ASSERT(ill == NULL || ill->ill_isv6);
927 
928 	ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
929 
930 	/*
931 	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
932 	 * or MATCH_IRE_SRC_ILL is set.
	 * Bail out with NULL rather than pass a NULL ill down.
933 	 */
934 	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
935 		return (NULL);
936 
	/*
	 * The table head lock is held as reader across the lookup, the
	 * optional round-robin selection and the read of ire_generation.
	 * ire_ftable_lookup_impl_v6() returns its match with a reference
	 * held, or NULL.
	 */
937 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
938 	ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
939 	    tsl, flags, ipst);
940 	if (ire == NULL) {
941 		rw_exit(&ipst->ips_ip6_ire_head_lock);
942 		return (NULL);
943 	}
944 
945 	/*
946 	 * round-robin only if we have more than one route in the bucket.
947 	 * ips_ip_ecmp_behavior controls when we do ECMP
948 	 *	2:	always
949 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
950 	 *	0:	never
951 	 *
952 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
953 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
954 	 * and the IRE_INTERFACEs are likely to be shorter matches.
	 *
	 * Round-robin is also skipped when the caller matched on
	 * MATCH_IRE_GW.
955 	 */
956 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
957 		if (ipst->ips_ip_ecmp_behavior == 2 ||
958 		    (ipst->ips_ip_ecmp_behavior == 1 &&
959 		    IS_DEFAULT_ROUTE_V6(ire))) {
960 			ire_t	*next_ire;
961 			ire_ftable_args_t margs;
962 
			/*
			 * Package the original match criteria so that
			 * ire_round_robin() can apply them to the other
			 * entries in the bucket. An absent mask/gateway
			 * stays zeroed from the bzero().
			 */
963 			bzero(&margs, sizeof (margs));
964 			margs.ift_addr_v6 = *addr;
965 			if (mask != NULL)
966 				margs.ift_mask_v6 = *mask;
967 			if (gateway != NULL)
968 				margs.ift_gateway_v6 = *gateway;
969 			margs.ift_type = type;
970 			margs.ift_ill = ill;
971 			margs.ift_zoneid = zoneid;
972 			margs.ift_tsl = tsl;
973 			margs.ift_flags = flags;
974 
975 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
976 			    xmit_hint, ire, ipst);
977 			if (next_ire == NULL) {
978 				/* keep ire if next_ire is null */
979 				goto done;
980 			}
			/*
			 * Switch to the selected ire and drop our reference
			 * on the first match (presumably next_ire comes back
			 * held from ire_round_robin() - TODO confirm there).
			 */
981 			ire_refrele(ire);
982 			ire = next_ire;
983 		}
984 	}
985 
986 done:
987 	/* Return generation before dropping lock */
988 	if (generationp != NULL)
989 		*generationp = ire->ire_generation;
990 
991 	rw_exit(&ipst->ips_ip6_ire_head_lock);
992 
993 	/*
994 	 * For shared-IP zones we need additional checks to what was
995 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
996 	 *
997 	 * When ip_restrict_interzone_loopback is set, then
998 	 * we ensure that IRE_LOCAL are only used for loopback
999 	 * between zones when the logical "Ethernet" would
1000 	 * have looped them back. That is, if in the absence of
1001 	 * the IRE_LOCAL we would have sent the packet out the
1002 	 * same ill.
	 *
	 * The check applies only to an IRE_LOCAL owned by a specific zone
	 * other than the caller's. ire_alt_local() then supplies the ire
	 * that is returned instead; it is asserted to be non-NULL.
1003 	 */
1004 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
1005 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
1006 	    ipst->ips_ip_restrict_interzone_loopback) {
1007 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
1008 		ASSERT(ire != NULL);
1009 	}
1010 
1011 	return (ire);
1012 }
1013 
1014 /*
1015  * Look up a single ire. The caller holds either the read or write lock.
 * The lock in question is ipst->ips_ip6_ire_head_lock (asserted below).
 *
 * With MATCH_IRE_MASK only the one bucket selected by addr and mask is
 * searched. Otherwise the per-prefix-length tables are walked from the
 * longest candidate prefix length down to 0 and the first match wins;
 * with MATCH_IRE_SHORTERMASK the walk starts one bit shorter than mask.
 *
 * Condemned ires are skipped. Returns the matching ire with a reference
 * held (ire_refhold), or NULL if nothing matched.
1016  */
1017 ire_t *
1018 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1019     const in6_addr_t *gateway, int type, const ill_t *ill,
1020     zoneid_t zoneid, const ts_label_t *tsl, int flags,
1021     ip_stack_t *ipst)
1022 {
1023 	irb_t *irb_ptr;
1024 	ire_t *ire = NULL;
1025 	int i;
1026 
1027 	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
1028 
1029 	/*
1030 	 * If the mask is known, the lookup
1031 	 * is simple, if the mask is not known
1032 	 * we need to search.
1033 	 */
1034 	if (flags & MATCH_IRE_MASK) {
1035 		uint_t masklen;
1036 
1037 		masklen = ip_mask_to_plen_v6(mask);
		/* No table exists for this prefix length: nothing to match */
1038 		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
1039 			return (NULL);
1040 		}
1041 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
1042 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
1043 		    ipst->ips_ip6_ftable_hash_size)]);
		/* The bucket lock is held as reader while walking its list */
1044 		rw_enter(&irb_ptr->irb_lock, RW_READER);
1045 		for (ire = irb_ptr->irb_ire; ire != NULL;
1046 		    ire = ire->ire_next) {
1047 			if (IRE_IS_CONDEMNED(ire))
1048 				continue;
1049 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1050 			    ill, zoneid, tsl, flags))
1051 				goto found_ire;
1052 		}
1053 		rw_exit(&irb_ptr->irb_lock);
1054 	} else {
1055 		uint_t masklen;
1056 
1057 		/*
1058 		 * In this case we don't know the mask, we need to
1059 		 * search the table assuming different mask sizes.
1060 		 */
1061 		if (flags & MATCH_IRE_SHORTERMASK) {
			/*
			 * NOTE(review): mask is dereferenced here although
			 * MATCH_IRE_MASK is clear; callers that set
			 * MATCH_IRE_SHORTERMASK presumably must pass a
			 * non-NULL mask - confirm against callers.
			 */
1062 			masklen = ip_mask_to_plen_v6(mask);
1063 			if (masklen == 0) {
1064 				/* Nothing shorter than zero */
1065 				return (NULL);
1066 			}
1067 			masklen--;
1068 		} else {
			/* Start at the last index of the per-length table */
1069 			masklen = IP6_MASK_TABLE_SIZE - 1;
1070 		}
1071 
		/* Longest prefix first; the first match ends the search */
1072 		for (i = masklen; i >= 0; i--) {
1073 			in6_addr_t tmpmask;
1074 
1075 			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
1076 				continue;
			/* Hash addr under the mask for this prefix length */
1077 			(void) ip_plen_to_mask_v6(i, &tmpmask);
1078 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
1079 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1080 			    ipst->ips_ip6_ftable_hash_size)];
1081 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1082 			for (ire = irb_ptr->irb_ire; ire != NULL;
1083 			    ire = ire->ire_next) {
1084 				if (IRE_IS_CONDEMNED(ire))
1085 					continue;
				/*
				 * The caller gave no usable mask, so each
				 * entry is matched against its own mask.
				 */
1086 				if (ire_match_args_v6(ire, addr,
1087 				    &ire->ire_mask_v6, gateway, type, ill,
1088 				    zoneid, tsl, flags))
1089 					goto found_ire;
1090 			}
1091 			rw_exit(&irb_ptr->irb_lock);
1092 		}
1093 	}
1094 	ASSERT(ire == NULL);
1095 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
1096 	return (NULL);
1097 
1098 found_ire:
	/*
	 * Reached with irb_ptr->irb_lock still held: take the reference
	 * first, then drop the bucket lock.
	 */
1099 	ire_refhold(ire);
1100 	rw_exit(&irb_ptr->irb_lock);
1101 	return (ire);
1102 }
1103 
1104 
1105 /*
1106  * This function is called by
1107  * ip_input/ire_route_recursive when doing a route lookup on only the
1108  * destination address.
1109  *
1110  * The optimizations of this function over ire_ftable_lookup are:
1111  *	o removing unnecessary flag matching
1112  *	o doing longest prefix match instead of overloading it further
1113  *	  with the unnecessary "best_prefix_match"
1114  *
1115  * If no route is found we return IRE_NOROUTE.
 * That ire is obtained from ire_reject(), and in that case *generationp
 * (if supplied) is set to IRE_GENERATION_VERIFY. Otherwise *generationp
 * is whatever ire_ftable_lookup_v6() stored.
 *
 * As written, this is a call to ire_ftable_lookup_v6() with no mask,
 * gateway, type, ill or label, with ALL_ZONES, and with
 * MATCH_IRE_DSTONLY as the match flags.
1116  */
1117 ire_t *
1118 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
1119     ip_stack_t *ipst, uint_t *generationp)
1120 {
1121 	ire_t	*ire;
1122 
1123 	ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
1124 	    MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
1125 	if (ire == NULL) {
		/* Never return NULL: substitute the reject ire */
1126 		ire = ire_reject(ipst, B_TRUE);
1127 		if (generationp != NULL)
1128 			*generationp = IRE_GENERATION_VERIFY;
1129 	}
1130 	/* ftable_lookup did round robin */
1131 	return (ire);
1132 }
1133 
/*
 * IPv6 entry point for route selection. Asserts that ixa is not marked
 * IXAF_IS_IPV4 and then hands every argument, unchanged and in the same
 * order, to ip_select_route(), returning its result.
 * Note that src is passed by value while dst is passed by pointer.
 */
1134 ire_t *
1135 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src,
1136     ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1137     int *errorp, boolean_t *multirtp)
1138 {
1139 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1140 
1141 	return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp,
1142 	    multirtp));
1143 }
1144 
1145 /*
1146  * Recursively look for a route to the destination. Can also match on
1147  * the zoneid, ill, and label. Used for the data paths. See also
1148  * ire_route_recursive_dstonly.
1149  *
1150  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1151  * create an IRE_IF_CLONE. This is used on the receive side when we are not
1152  * forwarding.
1153  * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1154  * resolve the gateway.
1155  *
1156  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1157  * instead.
1158  *
1159  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1160  * is an error.
1161  * Allow at most one RTF_INDIRECT.
 *
 * If 'ire' is non-NULL it stands in for the result of the first lookup:
 * an extra reference is taken on it here, and *generationp (if supplied)
 * is read as its generation. If 'ire' is NULL the first ire is found with
 * ire_ftable_lookup_v6().
 *
 * On return *generationp (if supplied) holds the resulting generation.
 * It is IRE_GENERATION_VERIFY on every error path, when no IRE_IF_CLONE
 * was created because IRR_ALLOCATE was clear, and when the top ire's
 * generation changed during the lookup.
 *
 * *setsrcp and *gwattrp (if supplied) must be unspecified/NULL on entry
 * and receive the first RTF_SETSRC address and the first ire_gw_secattr
 * found along the chain.
 *
 * The returned ire carries a reference for the caller.
1162  */
1163 ire_t *
1164 ire_route_recursive_impl_v6(ire_t *ire,
1165     const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
1166     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1167     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1168     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1169 {
1170 	int		i, j;
1171 	in6_addr_t	v6nexthop = *nexthop;
1172 	ire_t		*ires[MAX_IRE_RECURSION];
1173 	uint_t		generation;
1174 	uint_t		generations[MAX_IRE_RECURSION];
1175 	boolean_t	need_refrele = B_FALSE;
1176 	boolean_t	invalidate = B_FALSE;
1177 	ill_t		*ill = NULL;
1178 	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK);
1179 
1180 	if (setsrcp != NULL)
1181 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1182 	if (gwattrp != NULL)
1183 		ASSERT(*gwattrp == NULL);
1184 
1185 	/*
1186 	 * We iterate up to three times to resolve a route, even though
1187 	 * we have four slots in the array. The extra slot is for an
1188 	 * IRE_IF_CLONE we might need to create.
	 *
	 * Throughout, i counts the held entries stored in ires[], with the
	 * matching generations kept in generations[].
1189 	 */
1190 	i = 0;
1191 	while (i < MAX_IRE_RECURSION - 1) {
1192 		/* ire_ftable_lookup handles round-robin/ECMP */
1193 		if (ire == NULL) {
			/*
			 * Mask and gateway are passed as 0 (NULL). The ill
			 * learned from an earlier iteration, if any, takes
			 * precedence over the caller's ill_arg.
			 */
1194 			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
1195 			    (ill != NULL ? ill : ill_arg), zoneid, tsl,
1196 			    match_args, xmit_hint, ipst, &generation);
1197 		} else {
1198 			/* Caller passed it; extra hold since we will rele */
1199 			ire_refhold(ire);
1200 			if (generationp != NULL)
1201 				generation = *generationp;
1202 			else
1203 				generation = IRE_GENERATION_VERIFY;
1204 		}
1205 
1206 		if (ire == NULL) {
			/*
			 * Lookup failed. With IRR_INCOMPLETE and at least one
			 * ire already found, return that first ire; it gets
			 * an extra hold because the cleanup code releases
			 * every ires[] entry. Otherwise return the reject
			 * ire.
			 */
1207 			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
1208 				ire = ires[0];
1209 				ire_refhold(ire);
1210 			} else {
1211 				ire = ire_reject(ipst, B_TRUE);
1212 			}
1213 			goto error;
1214 		}
1215 
1216 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
1217 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1218 			goto error;
1219 
1220 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1221 
1222 		/*
1223 		 * Don't allow anything unusual past the first iteration.
1224 		 * After the first lookup, we should no longer look for
1225 		 * (IRE_LOCAL|IRE_LOOPBACK) or RTF_INDIRECT routes.
1226 		 *
1227 		 * In addition, after we have found a direct IRE_OFFLINK,
1228 		 * we should only look for interface or clone routes.
1229 		 */
1230 		match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1231 		if ((ire->ire_type & IRE_OFFLINK) &&
1232 		    !(ire->ire_flags & RTF_INDIRECT)) {
1233 			ire_type = IRE_IF_ALL;
1234 		} else {
			/*
			 * If the caller did not restrict the type, start from
			 * all offlink and onlink types before masking.
			 */
1235 			if (!(match_args & MATCH_IRE_TYPE))
1236 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1237 			ire_type &= ~maskoff; /* no more LOCAL, LOOPBACK */
1238 		}
1239 		match_args |= MATCH_IRE_TYPE;
1240 		/* We have a usable IRE */
1241 		ires[i] = ire;
1242 		generations[i] = generation;
1243 		i++;
1244 
1245 		/* The first RTF_SETSRC address is passed back if setsrcp */
1246 		if ((ire->ire_flags & RTF_SETSRC) &&
1247 		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
1248 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1249 			    &ire->ire_setsrc_addr_v6));
1250 			*setsrcp = ire->ire_setsrc_addr_v6;
1251 		}
1252 
1253 		/* The first ire_gw_secattr is passed back if gwattrp */
1254 		if (ire->ire_gw_secattr != NULL &&
1255 		    gwattrp != NULL && *gwattrp == NULL)
1256 			*gwattrp = ire->ire_gw_secattr;
1257 
1258 		/*
1259 		 * Check if we have a short-cut pointer to an IRE for this
1260 		 * destination, and that the cached dependency isn't stale.
1261 		 * In that case we've rejoined an existing tree towards a
1262 		 * parent, thus we don't need to continue the loop to
1263 		 * discover the rest of the tree.
		 *
		 * ire stays recorded in ires[]; the local pointer is cleared
		 * only because the done path asserts ire == NULL.
1264 		 */
1265 		mutex_enter(&ire->ire_lock);
1266 		if (ire->ire_dep_parent != NULL &&
1267 		    ire->ire_dep_parent->ire_generation ==
1268 		    ire->ire_dep_parent_generation) {
1269 			mutex_exit(&ire->ire_lock);
1270 			ire = NULL;
1271 			goto done;
1272 		}
1273 		mutex_exit(&ire->ire_lock);
1274 
1275 		/*
1276 		 * If this type should have an ire_nce_cache (even if it
1277 		 * doesn't yet have one) then we are done. Includes
1278 		 * IRE_INTERFACE with a full 128 bit mask.
1279 		 */
1280 		if (ire->ire_nce_capable) {
1281 			ire = NULL;
1282 			goto done;
1283 		}
1284 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1285 		/*
1286 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1287 		 * particular destination
1288 		 */
1289 		if (ire->ire_type & IRE_INTERFACE) {
1290 			ire_t		*clone;
1291 
1292 			ASSERT(ire->ire_masklen != IPV6_ABITS);
1293 
1294 			/*
1295 			 * In the case of ip_input and ILLF_FORWARDING not
1296 			 * being set, and in the case of RTM_GET, there is
1297 			 * no point in allocating an IRE_IF_CLONE. We return
1298 			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1299 			 * result in a ire_dep_parent which is IRE_IF_*
1300 			 * without an IRE_IF_CLONE.
1301 			 * We recover from that when we need to send packets
1302 			 * by ensuring that the generations become
1303 			 * IRE_GENERATION_VERIFY in this case.
1304 			 */
1305 			if (!(irr_flags & IRR_ALLOCATE)) {
1306 				invalidate = B_TRUE;
1307 				ire = NULL;
1308 				goto done;
1309 			}
1310 
			/* generation is overwritten with the clone's value */
1311 			clone = ire_create_if_clone(ire, &v6nexthop,
1312 			    &generation);
1313 			if (clone == NULL) {
1314 				/*
1315 				 * Temporary failure - no memory.
1316 				 * Don't want caller to cache IRE_NOROUTE.
1317 				 */
1318 				invalidate = B_TRUE;
1319 				ire = ire_blackhole(ipst, B_TRUE);
1320 				goto error;
1321 			}
1322 			/*
1323 			 * Make clone next to last entry and the
1324 			 * IRE_INTERFACE the last in the dependency
1325 			 * chain since the clone depends on the
1326 			 * IRE_INTERFACE.
1327 			 */
1328 			ASSERT(i >= 1);
1329 			ASSERT(i < MAX_IRE_RECURSION);
1330 
			/*
			 * Shift the IRE_INTERFACE (currently ires[i-1]) up one
			 * slot and put the clone in its old place.
			 */
1331 			ires[i] = ires[i-1];
1332 			generations[i] = generations[i-1];
1333 			ires[i-1] = clone;
1334 			generations[i-1] = generation;
1335 			i++;
1336 
1337 			ire = NULL;
1338 			goto done;
1339 		}
1340 
1341 		/*
1342 		 * We only match on the type and optionally ILL when
1343 		 * recursing. The type match is used by some callers
1344 		 * to exclude certain types (such as IRE_IF_CLONE or
1345 		 * IRE_LOCAL|IRE_LOOPBACK).
1346 		 *
1347 		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1348 		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1349 		 * ire_ill, so we set ill to the ire_ill
		 *
		 * The next iteration looks up this ire's gateway address.
		 * The ill is latched only once and held until the done or
		 * error path releases it (need_refrele).
1350 		 */
1351 		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1352 		v6nexthop = ire->ire_gateway_addr_v6;
1353 		if (ill == NULL && ire->ire_ill != NULL) {
1354 			ill = ire->ire_ill;
1355 			need_refrele = B_TRUE;
1356 			ill_refhold(ill);
1357 			match_args |= MATCH_IRE_ILL;
1358 		}
1359 		ire = NULL;
1360 	}
	/* Ran out of iterations without completing the chain */
1361 	ASSERT(ire == NULL);
1362 	ire = ire_reject(ipst, B_TRUE);
1363 
1364 error:
	/*
	 * Here ire is the ire to return; it is not one of the references
	 * counted in ires[0..i-1], all of which are released below.
	 */
1365 	ASSERT(ire != NULL);
1366 	if (need_refrele)
1367 		ill_refrele(ill);
1368 
1369 	/*
1370 	 * In the case of MULTIRT we want to try a different IRE the next
1371 	 * time. We let the next packet retry in that case.
1372 	 */
1373 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1374 		(void) ire_no_good(ires[0]);
1375 
1376 cleanup:
1377 	/* cleanup ires[i] */
1378 	ire_dep_unbuild(ires, i);
1379 	for (j = 0; j < i; j++)
1380 		ire_refrele(ires[j]);
1381 
1382 	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1383 	    (irr_flags & IRR_INCOMPLETE));
1384 	/*
1385 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1386 	 * ip_select_route since the reject or lack of memory might be gone.
1387 	 */
1388 	if (generationp != NULL)
1389 		*generationp = IRE_GENERATION_VERIFY;
1390 	return (ire);
1391 
1392 done:
	/* Success path: ires[0..i-1] hold the chain, ires[0] is returned */
1393 	ASSERT(ire == NULL);
1394 	if (need_refrele)
1395 		ill_refrele(ill);
1396 
1397 	/* Build dependencies */
	/* (only attempted when the chain has more than one entry) */
1398 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
1399 		/* Something in chain was condemned; tear it apart */
1400 		ire = ire_blackhole(ipst, B_TRUE);
1401 		goto cleanup;
1402 	}
1403 
1404 	/*
1405 	 * Release all refholds except the one for ires[0] that we
1406 	 * will return to the caller.
1407 	 */
1408 	for (j = 1; j < i; j++)
1409 		ire_refrele(ires[j]);
1410 
1411 	if (invalidate) {
1412 		/*
1413 		 * Since we needed to allocate but couldn't we need to make
1414 		 * sure that the dependency chain is rebuilt the next time.
		 * (On this path invalidate is set only by the !IRR_ALLOCATE
		 * case above; the allocation failure case goes to error.)
1415 		 */
1416 		ire_dep_invalidate_generations(ires[0]);
1417 		generation = IRE_GENERATION_VERIFY;
1418 	} else {
1419 		/*
1420 		 * IREs can have been added or deleted while we did the
1421 		 * recursive lookup and we can't catch those until we've built
1422 		 * the dependencies. We verify the stored
1423 		 * ire_dep_parent_generation to catch any such changes and
1424 		 * return IRE_GENERATION_VERIFY (which will cause
1425 		 * ip_select_route to be called again so we can redo the
1426 		 * recursive lookup next time we send a packet).
1427 		 */
1428 		if (ires[0]->ire_dep_parent == NULL)
1429 			generation = ires[0]->ire_generation;
1430 		else
1431 			generation = ire_dep_validate_generations(ires[0]);
1432 		if (generations[0] != ires[0]->ire_generation) {
1433 			/* Something changed at the top */
1434 			generation = IRE_GENERATION_VERIFY;
1435 		}
1436 	}
1437 	if (generationp != NULL)
1438 		*generationp = generation;
1439 
1440 	return (ires[0]);
1441 }
1442 
/*
 * Recursive route lookup with no starting ire: calls
 * ire_route_recursive_impl_v6() with NULL as its first argument, so the
 * first ire is found by a table lookup on nexthop. All other arguments
 * are passed through unchanged and the result is returned as-is.
 */
1443 ire_t *
1444 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
1445     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1446     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1447     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1448 {
1449 	return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
1450 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1451 	    gwattrp, generationp));
1452 }
1453 
1454 /*
1455  * Recursively look for a route to the destination.
1456  * We only handle a destination match here, yet we have the same arguments
1457  * as the full match to allow function pointers to select between the two.
 * NOTE(review): as written this function takes only nexthop, irr_flags,
 * xmit_hint and ipst, so the "same arguments" statement above looks
 * stale - confirm against the callers.
1458  *
1459  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1460  * instead.
1461  *
1462  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1463  * is an error.
1464  * Allow at most one RTF_INDIRECT.
 *
 * The generation of the result is kept in a local variable only; it is
 * not passed back to the caller.
1465  */
1466 ire_t *
1467 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags,
1468     uint32_t xmit_hint, ip_stack_t *ipst)
1469 {
1470 	ire_t	*ire;
1471 	ire_t	*ire1;
1472 	uint_t	generation;
1473 
1474 	/* ire_ftable_lookup handles round-robin/ECMP */
1475 	ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
1476 	    &generation);
1477 	ASSERT(ire != NULL);
1478 
1479 	/*
1480 	 * If this type should have an ire_nce_cache (even if it
1481 	 * doesn't yet have one) then we are done. Includes
1482 	 * IRE_INTERFACE with a full 128 bit mask.
1483 	 */
1484 	if (ire->ire_nce_capable)
1485 		return (ire);
1486 
1487 	/*
1488 	 * If the IRE has a current cached parent we know that the whole
1489 	 * parent chain is current, hence we don't need to discover and
1490 	 * build any dependencies by doing a recursive lookup.
1491 	 */
1492 	mutex_enter(&ire->ire_lock);
1493 	if (ire->ire_dep_parent != NULL &&
1494 	    ire->ire_dep_parent->ire_generation ==
1495 	    ire->ire_dep_parent_generation) {
1496 		mutex_exit(&ire->ire_lock);
1497 		return (ire);
1498 	}
1499 	mutex_exit(&ire->ire_lock);
1500 
1501 	/*
1502 	 * Fallback to loop in the normal code starting with the ire
1503 	 * we found. Normally this would return the same ire.
	 *
	 * ire_route_recursive_impl_v6() takes its own hold on the ire we
	 * pass in, so the reference from the lookup above is dropped once
	 * it returns; ire1 carries the reference handed to our caller.
1504 	 */
1505 	ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
1506 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1507 	    &generation);
1508 	ire_refrele(ire);
1509 	return (ire1);
1510 }
1511