xref: /titanic_41/usr/src/uts/common/inet/ip/ip6_ire.c (revision c61d8baa39b367ff34e75d386cd4982896306860)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 1990 Mentat Inc.
27  */
28 
29 /*
30  * This file contains routines that manipulate Internet Routing Entries (IREs).
31  */
32 #include <sys/types.h>
33 #include <sys/stream.h>
34 #include <sys/stropts.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 
38 #include <sys/systm.h>
39 #include <sys/param.h>
40 #include <sys/socket.h>
41 #include <net/if.h>
42 #include <net/route.h>
43 #include <netinet/in.h>
44 #include <net/if_dl.h>
45 #include <netinet/ip6.h>
46 #include <netinet/icmp6.h>
47 
48 #include <inet/common.h>
49 #include <inet/mi.h>
50 #include <inet/ip.h>
51 #include <inet/ip6.h>
52 #include <inet/ip_ndp.h>
53 #include <inet/ip_if.h>
54 #include <inet/ip_ire.h>
55 #include <inet/ipclassifier.h>
56 #include <inet/nd.h>
57 #include <inet/tunables.h>
58 #include <sys/kmem.h>
59 #include <sys/zone.h>
60 
61 #include <sys/tsol/label.h>
62 #include <sys/tsol/tnet.h>
63 
64 #define	IS_DEFAULT_ROUTE_V6(ire)	\
65 	(((ire)->ire_type & IRE_DEFAULT) || \
66 	    (((ire)->ire_type & IRE_INTERFACE) && \
67 	    (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
68 
69 static	ire_t	ire_null;
70 
71 static ire_t *
72 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
73     const in6_addr_t *gateway, int type, const ill_t *ill,
74     zoneid_t zoneid, const ts_label_t *tsl, int flags,
75     ip_stack_t *ipst);
76 
77 /*
78  * Initialize the ire that is specific to IPv6 part and call
79  * ire_init_common to finish it.
80  * Returns zero or errno.
81  */
82 int
83 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
84     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
85     zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
86 {
87 	int error;
88 
89 	/*
90 	 * Reject IRE security attmakeribute creation/initialization
91 	 * if system is not running in Trusted mode.
92 	 */
93 	if (gc != NULL && !is_system_labeled())
94 		return (EINVAL);
95 
96 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
97 	if (v6addr != NULL)
98 		ire->ire_addr_v6 = *v6addr;
99 	if (v6gateway != NULL)
100 		ire->ire_gateway_addr_v6 = *v6gateway;
101 
102 	/* Make sure we don't have stray values in some fields */
103 	switch (type) {
104 	case IRE_LOOPBACK:
105 	case IRE_HOST:
106 	case IRE_LOCAL:
107 	case IRE_IF_CLONE:
108 		ire->ire_mask_v6 = ipv6_all_ones;
109 		ire->ire_masklen = IPV6_ABITS;
110 		break;
111 	case IRE_PREFIX:
112 	case IRE_DEFAULT:
113 	case IRE_IF_RESOLVER:
114 	case IRE_IF_NORESOLVER:
115 		if (v6mask != NULL) {
116 			ire->ire_mask_v6 = *v6mask;
117 			ire->ire_masklen =
118 			    ip_mask_to_plen_v6(&ire->ire_mask_v6);
119 		}
120 		break;
121 	case IRE_MULTICAST:
122 	case IRE_NOROUTE:
123 		ASSERT(v6mask == NULL);
124 		break;
125 	default:
126 		ASSERT(0);
127 		return (EINVAL);
128 	}
129 
130 	error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
131 	    gc, ipst);
132 	if (error != NULL)
133 		return (error);
134 
135 	/* Determine which function pointers to use */
136 	ire->ire_postfragfn = ip_xmit;		/* Common case */
137 
138 	switch (ire->ire_type) {
139 	case IRE_LOCAL:
140 		ire->ire_sendfn = ire_send_local_v6;
141 		ire->ire_recvfn = ire_recv_local_v6;
142 		ASSERT(ire->ire_ill != NULL);
143 		if (ire->ire_ill->ill_flags & ILLF_NOACCEPT)
144 			ire->ire_recvfn = ire_recv_noaccept_v6;
145 		break;
146 	case IRE_LOOPBACK:
147 		ire->ire_sendfn = ire_send_local_v6;
148 		ire->ire_recvfn = ire_recv_loopback_v6;
149 		break;
150 	case IRE_MULTICAST:
151 		ire->ire_postfragfn = ip_postfrag_loopcheck;
152 		ire->ire_sendfn = ire_send_multicast_v6;
153 		ire->ire_recvfn = ire_recv_multicast_v6;
154 		break;
155 	default:
156 		/*
157 		 * For IRE_IF_ALL and IRE_OFFLINK we forward received
158 		 * packets by default.
159 		 */
160 		ire->ire_sendfn = ire_send_wire_v6;
161 		ire->ire_recvfn = ire_recv_forward_v6;
162 		break;
163 	}
164 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
165 		ire->ire_sendfn = ire_send_noroute_v6;
166 		ire->ire_recvfn = ire_recv_noroute_v6;
167 	} else if (ire->ire_flags & RTF_MULTIRT) {
168 		ire->ire_postfragfn = ip_postfrag_multirt_v6;
169 		ire->ire_sendfn = ire_send_multirt_v6;
170 		ire->ire_recvfn = ire_recv_multirt_v6;
171 	}
172 	ire->ire_nce_capable = ire_determine_nce_capable(ire);
173 	return (0);
174 }
175 
176 /*
177  * ire_create_v6 is called to allocate and initialize a new IRE.
178  *
179  * NOTE : This is called as writer sometimes though not required
180  * by this function.
181  */
182 /* ARGSUSED */
183 ire_t *
184 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
185     const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
186     uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
187 {
188 	ire_t	*ire;
189 	int	error;
190 
191 	ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
192 
193 	ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
194 	if (ire == NULL) {
195 		DTRACE_PROBE(kmem__cache__alloc);
196 		return (NULL);
197 	}
198 	*ire = ire_null;
199 
200 	error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
201 	    type, ill, zoneid, flags, gc, ipst);
202 
203 	if (error != 0) {
204 		DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
205 		kmem_cache_free(ire_cache, ire);
206 		return (NULL);
207 	}
208 	return (ire);
209 }
210 
211 /*
212  * Find the ill matching a multicast group.
213  * Allows different routes for multicast addresses
214  * in the unicast routing table (akin to FF::0/8 but could be more specific)
215  * which point at different interfaces. This is used when IPV6_MULTICAST_IF
216  * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
217  * specify the interface to join on.
218  *
219  * Supports link-local addresses by using ire_route_recursive which follows
220  * the ill when recursing.
221  *
222  * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
223  * and the MULTIRT property can be different for different groups, we
224  * extract RTF_MULTIRT from the special unicast route added for a group
225  * with CGTP and pass that back in the multirtp argument.
226  * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
227  * We have a setsrcp argument for the same reason.
228  */
229 ill_t *
230 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
231     ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
232 {
233 	ire_t	*ire;
234 	ill_t	*ill;
235 
236 	ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
237 	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
238 	ASSERT(ire != NULL);
239 
240 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
241 		ire_refrele(ire);
242 		return (NULL);
243 	}
244 
245 	if (multirtp != NULL)
246 		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
247 
248 	ill = ire_nexthop_ill(ire);
249 	ire_refrele(ire);
250 	return (ill);
251 }
252 
253 /*
254  * This function takes a mask and returns number of bits set in the
255  * mask (the represented prefix length).  Assumes a contiguous mask.
256  */
257 int
258 ip_mask_to_plen_v6(const in6_addr_t *v6mask)
259 {
260 	int		bits;
261 	int		plen = IPV6_ABITS;
262 	int		i;
263 
264 	for (i = 3; i >= 0; i--) {
265 		if (v6mask->s6_addr32[i] == 0) {
266 			plen -= 32;
267 			continue;
268 		}
269 		bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
270 		if (bits == 0)
271 			break;
272 		plen -= bits;
273 	}
274 
275 	return (plen);
276 }
277 
278 /*
279  * Convert a prefix length to the mask for that prefix.
280  * Returns the argument bitmask.
281  */
282 in6_addr_t *
283 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
284 {
285 	uint32_t *ptr;
286 
287 	if (plen < 0 || plen > IPV6_ABITS)
288 		return (NULL);
289 	*bitmask = ipv6_all_zeros;
290 	if (plen == 0)
291 		return (bitmask);
292 
293 	ptr = (uint32_t *)bitmask;
294 	while (plen > 32) {
295 		*ptr++ = 0xffffffffU;
296 		plen -= 32;
297 	}
298 	*ptr = htonl(0xffffffffU << (32 - plen));
299 	return (bitmask);
300 }
301 
302 /*
303  * Add a fully initialized IPv6 IRE to the forwarding table.
304  * This returns NULL on failure, or a held IRE on success.
305  * Normally the returned IRE is the same as the argument. But a different
306  * IRE will be returned if the added IRE is deemed identical to an existing
307  * one. In that case ire_identical_ref will be increased.
308  * The caller always needs to do an ire_refrele() on the returned IRE.
309  */
310 ire_t *
311 ire_add_v6(ire_t *ire)
312 {
313 	ire_t	*ire1;
314 	int	mask_table_index;
315 	irb_t	*irb_ptr;
316 	ire_t	**irep;
317 	int	match_flags;
318 	int	error;
319 	ip_stack_t	*ipst = ire->ire_ipst;
320 
321 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
322 
323 	/* Make sure the address is properly masked. */
324 	V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
325 
326 	mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
327 	if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
328 		irb_t *ptr;
329 		int i;
330 
331 		ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
332 		    sizeof (irb_t)));
333 		if (ptr == NULL) {
334 			ire_delete(ire);
335 			return (NULL);
336 		}
337 		for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
338 			rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
339 			ptr[i].irb_ipst = ipst;
340 		}
341 		mutex_enter(&ipst->ips_ire_ft_init_lock);
342 		if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
343 		    NULL) {
344 			ipst->ips_ip_forwarding_table_v6[mask_table_index] =
345 			    ptr;
346 			mutex_exit(&ipst->ips_ire_ft_init_lock);
347 		} else {
348 			/*
349 			 * Some other thread won the race in
350 			 * initializing the forwarding table at the
351 			 * same index.
352 			 */
353 			mutex_exit(&ipst->ips_ire_ft_init_lock);
354 			for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
355 				rw_destroy(&ptr[i].irb_lock);
356 			}
357 			mi_free(ptr);
358 		}
359 	}
360 	irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
361 	    IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
362 	    ipst->ips_ip6_ftable_hash_size)]);
363 
364 	match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
365 	if (ire->ire_ill != NULL)
366 		match_flags |= MATCH_IRE_ILL;
367 	/*
368 	 * Start the atomic add of the ire. Grab the bucket lock and the
369 	 * ill lock. Check for condemned.
370 	 */
371 	error = ire_atomic_start(irb_ptr, ire);
372 	if (error != 0) {
373 		ire_delete(ire);
374 		return (NULL);
375 	}
376 
377 	/*
378 	 * If we are creating a hidden IRE, make sure we search for
379 	 * hidden IREs when searching for duplicates below.
380 	 * Otherwise, we might find an IRE on some other interface
381 	 * that's not marked hidden.
382 	 */
383 	if (ire->ire_testhidden)
384 		match_flags |= MATCH_IRE_TESTHIDDEN;
385 
386 	/*
387 	 * Atomically check for duplicate and insert in the table.
388 	 */
389 	for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
390 		if (IRE_IS_CONDEMNED(ire1))
391 			continue;
392 		/*
393 		 * Here we need an exact match on zoneid, i.e.,
394 		 * ire_match_args doesn't fit.
395 		 */
396 		if (ire1->ire_zoneid != ire->ire_zoneid)
397 			continue;
398 
399 		if (ire1->ire_type != ire->ire_type)
400 			continue;
401 
402 		/*
403 		 * Note: We do not allow multiple routes that differ only
404 		 * in the gateway security attributes; such routes are
405 		 * considered duplicates.
406 		 * To change that we explicitly have to treat them as
407 		 * different here.
408 		 */
409 		if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
410 		    &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
411 		    ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
412 		    match_flags)) {
413 			/*
414 			 * Return the old ire after doing a REFHOLD.
415 			 * As most of the callers continue to use the IRE
416 			 * after adding, we return a held ire. This will
417 			 * avoid a lookup in the caller again. If the callers
418 			 * don't want to use it, they need to do a REFRELE.
419 			 */
420 			ip1dbg(("found dup ire existing %p new %p",
421 			    (void *)ire1, (void *)ire));
422 			ire_refhold(ire1);
423 			atomic_add_32(&ire1->ire_identical_ref, 1);
424 			ire_atomic_end(irb_ptr, ire);
425 			ire_delete(ire);
426 			return (ire1);
427 		}
428 	}
429 
430 	/*
431 	 * Normally we do head insertion since most things do not care about
432 	 * the order of the IREs in the bucket.
433 	 * However, due to shared-IP zones (and restrict_interzone_loopback)
434 	 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
435 	 * address. For that reason we do tail insertion for IRE_IF_CLONE.
436 	 */
437 	irep = (ire_t **)irb_ptr;
438 	if (ire->ire_type & IRE_IF_CLONE) {
439 		while ((ire1 = *irep) != NULL)
440 			irep = &ire1->ire_next;
441 	}
442 	/* Insert at *irep */
443 	ire1 = *irep;
444 	if (ire1 != NULL)
445 		ire1->ire_ptpn = &ire->ire_next;
446 	ire->ire_next = ire1;
447 	/* Link the new one in. */
448 	ire->ire_ptpn = irep;
449 	/*
450 	 * ire_walk routines de-reference ire_next without holding
451 	 * a lock. Before we point to the new ire, we want to make
452 	 * sure the store that sets the ire_next of the new ire
453 	 * reaches global visibility, so that ire_walk routines
454 	 * don't see a truncated list of ires i.e if the ire_next
455 	 * of the new ire gets set after we do "*irep = ire" due
456 	 * to re-ordering, the ire_walk thread will see a NULL
457 	 * once it accesses the ire_next of the new ire.
458 	 * membar_producer() makes sure that the following store
459 	 * happens *after* all of the above stores.
460 	 */
461 	membar_producer();
462 	*irep = ire;
463 	ire->ire_bucket = irb_ptr;
464 	/*
465 	 * We return a bumped up IRE above. Keep it symmetrical
466 	 * so that the callers will always have to release. This
467 	 * helps the callers of this function because they continue
468 	 * to use the IRE after adding and hence they don't have to
469 	 * lookup again after we return the IRE.
470 	 *
471 	 * NOTE : We don't have to use atomics as this is appearing
472 	 * in the list for the first time and no one else can bump
473 	 * up the reference count on this yet.
474 	 */
475 	ire_refhold_locked(ire);
476 	BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
477 	irb_ptr->irb_ire_cnt++;
478 
479 	if (ire->ire_ill != NULL) {
480 		DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
481 		    (char *), "ire", (void *), ire);
482 		ire->ire_ill->ill_ire_cnt++;
483 		ASSERT(ire->ire_ill->ill_ire_cnt != 0);	/* Wraparound */
484 	}
485 	ire_atomic_end(irb_ptr, ire);
486 
487 	/* Make any caching of the IREs be notified or updated */
488 	ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
489 
490 	return (ire);
491 }
492 
493 /*
494  * Search for all HOST REDIRECT routes that are
495  * pointing at the specified gateway and
496  * delete them. This routine is called only
497  * when a default gateway is going away.
498  */
499 static void
500 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
501 {
502 	irb_t *irb_ptr;
503 	irb_t *irb;
504 	ire_t *ire;
505 	in6_addr_t gw_addr_v6;
506 	int i;
507 
508 	/* get the hash table for HOST routes */
509 	irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)];
510 	if (irb_ptr == NULL)
511 		return;
512 	for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
513 		irb = &irb_ptr[i];
514 		irb_refhold(irb);
515 		for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
516 			if (!(ire->ire_flags & RTF_DYNAMIC))
517 				continue;
518 			mutex_enter(&ire->ire_lock);
519 			gw_addr_v6 = ire->ire_gateway_addr_v6;
520 			mutex_exit(&ire->ire_lock);
521 			if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
522 				ire_delete(ire);
523 		}
524 		irb_refrele(irb);
525 	}
526 }
527 
528 /*
529  * Delete the specified IRE.
530  * All calls should use ire_delete().
531  * Sometimes called as writer though not required by this function.
532  *
533  * NOTE : This function is called only if the ire was added
534  * in the list.
535  */
536 void
537 ire_delete_v6(ire_t *ire)
538 {
539 	in6_addr_t gw_addr_v6;
540 	ip_stack_t	*ipst = ire->ire_ipst;
541 
542 	/*
543 	 * Make sure ire_generation increases from ire_flush_cache happen
544 	 * after any lookup/reader has read ire_generation.
545 	 * Since the rw_enter makes us wait until any lookup/reader has
546 	 * completed we can exit the lock immediately.
547 	 */
548 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
549 	rw_exit(&ipst->ips_ip6_ire_head_lock);
550 
551 	ASSERT(ire->ire_refcnt >= 1);
552 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
553 
554 	ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
555 
556 	if (ire->ire_type == IRE_DEFAULT) {
557 		/*
558 		 * when a default gateway is going away
559 		 * delete all the host redirects pointing at that
560 		 * gateway.
561 		 */
562 		mutex_enter(&ire->ire_lock);
563 		gw_addr_v6 = ire->ire_gateway_addr_v6;
564 		mutex_exit(&ire->ire_lock);
565 		ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
566 	}
567 
568 	/*
569 	 * If we are deleting an IRE_INTERFACE then we make sure we also
570 	 * delete any IRE_IF_CLONE that has been created from it.
571 	 * Those are always in ire_dep_children.
572 	 */
573 	if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
574 		ire_dep_delete_if_clone(ire);
575 
576 	/* Remove from parent dependencies and child */
577 	rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
578 	if (ire->ire_dep_parent != NULL) {
579 		ire_dep_remove(ire);
580 	}
581 	while (ire->ire_dep_children != NULL)
582 		ire_dep_remove(ire->ire_dep_children);
583 	rw_exit(&ipst->ips_ire_dep_lock);
584 }
585 
586 /*
587  * When an IRE is added or deleted this routine is called to make sure
588  * any caching of IRE information is notified or updated.
589  *
590  * The flag argument indicates if the flush request is due to addition
591  * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
592  * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
593  */
594 void
595 ire_flush_cache_v6(ire_t *ire, int flag)
596 {
597 	ip_stack_t *ipst = ire->ire_ipst;
598 
599 	/*
600 	 * IRE_IF_CLONE ire's don't provide any new information
601 	 * than the parent from which they are cloned, so don't
602 	 * perturb the generation numbers.
603 	 */
604 	if (ire->ire_type & IRE_IF_CLONE)
605 		return;
606 
607 	/*
608 	 * Ensure that an ire_add during a lookup serializes the updates of
609 	 * the generation numbers under ire_head_lock so that the lookup gets
610 	 * either the old ire and old generation number, or a new ire and new
611 	 * generation number.
612 	 */
613 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
614 
615 	/*
616 	 * If a route was just added, we need to notify everybody that
617 	 * has cached an IRE_NOROUTE since there might now be a better
618 	 * route for them.
619 	 */
620 	if (flag == IRE_FLUSH_ADD) {
621 		ire_increment_generation(ipst->ips_ire_reject_v6);
622 		ire_increment_generation(ipst->ips_ire_blackhole_v6);
623 	}
624 
625 	/* Adding a default can't otherwise provide a better route */
626 	if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
627 		rw_exit(&ipst->ips_ip6_ire_head_lock);
628 		return;
629 	}
630 
631 	switch (flag) {
632 	case IRE_FLUSH_DELETE:
633 	case IRE_FLUSH_GWCHANGE:
634 		/*
635 		 * Update ire_generation for all ire_dep_children chains
636 		 * starting with this IRE
637 		 */
638 		ire_dep_incr_generation(ire);
639 		break;
640 	case IRE_FLUSH_ADD: {
641 		in6_addr_t	addr;
642 		in6_addr_t	mask;
643 		ip_stack_t	*ipst = ire->ire_ipst;
644 		uint_t		masklen;
645 
646 		/*
647 		 * Find an IRE which is a shorter match than the ire to be added
648 		 * For any such IRE (which we repeat) we update the
649 		 * ire_generation the same way as in the delete case.
650 		 */
651 		addr = ire->ire_addr_v6;
652 		mask = ire->ire_mask_v6;
653 		masklen = ip_mask_to_plen_v6(&mask);
654 
655 		ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
656 		    ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
657 		while (ire != NULL) {
658 			/* We need to handle all in the same bucket */
659 			irb_increment_generation(ire->ire_bucket);
660 
661 			mask = ire->ire_mask_v6;
662 			ASSERT(masklen > ip_mask_to_plen_v6(&mask));
663 			masklen = ip_mask_to_plen_v6(&mask);
664 			ire_refrele(ire);
665 			ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
666 			    NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
667 		}
668 		}
669 		break;
670 	}
671 	rw_exit(&ipst->ips_ip6_ire_head_lock);
672 }
673 
674 /*
675  * Matches the arguments passed with the values in the ire.
676  *
677  * Note: for match types that match using "ill" passed in, ill
678  * must be checked for non-NULL before calling this routine.
679  */
680 boolean_t
681 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
682     const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
683     const ts_label_t *tsl, int match_flags)
684 {
685 	in6_addr_t masked_addr;
686 	in6_addr_t gw_addr_v6;
687 	ill_t *ire_ill = NULL, *dst_ill;
688 	ip_stack_t *ipst = ire->ire_ipst;
689 
690 	ASSERT(ire->ire_ipversion == IPV6_VERSION);
691 	ASSERT(addr != NULL);
692 	ASSERT(mask != NULL);
693 	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
694 	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
695 	    (ill != NULL && ill->ill_isv6));
696 
697 	/*
698 	 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
699 	 * is in fact hidden, to ensure the caller gets the right one.
700 	 */
701 	if (ire->ire_testhidden) {
702 		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
703 			return (B_FALSE);
704 	}
705 
706 	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
707 	    ire->ire_zoneid != ALL_ZONES) {
708 		/*
709 		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
710 		 * does not match that of ire_zoneid, a failure to
711 		 * match is reported at this point. Otherwise, since some IREs
712 		 * that are available in the global zone can be used in local
713 		 * zones, additional checks need to be performed:
714 		 *
715 		 * IRE_LOOPBACK
716 		 *	entries should never be matched in this situation.
717 		 *	Each zone has its own IRE_LOOPBACK.
718 		 *
719 		 * IRE_LOCAL
720 		 *	We allow them for any zoneid. ire_route_recursive
721 		 *	does additional checks when
722 		 *	ip_restrict_interzone_loopback is set.
723 		 *
724 		 * If ill_usesrc_ifindex is set
725 		 *	Then we check if the zone has a valid source address
726 		 *	on the usesrc ill.
727 		 *
728 		 * If ire_ill is set, then check that the zone has an ipif
729 		 *	on that ill.
730 		 *
731 		 * Outside of this function (in ire_round_robin) we check
732 		 * that any IRE_OFFLINK has a gateway that reachable from the
733 		 * zone when we have multiple choices (ECMP).
734 		 */
735 		if (match_flags & MATCH_IRE_ZONEONLY)
736 			return (B_FALSE);
737 		if (ire->ire_type & IRE_LOOPBACK)
738 			return (B_FALSE);
739 
740 		if (ire->ire_type & IRE_LOCAL)
741 			goto matchit;
742 
743 		/*
744 		 * The normal case of IRE_ONLINK has a matching zoneid.
745 		 * Here we handle the case when shared-IP zones have been
746 		 * configured with IP addresses on vniN. In that case it
747 		 * is ok for traffic from a zone to use IRE_ONLINK routes
748 		 * if the ill has a usesrc pointing at vniN
749 		 * Applies to IRE_INTERFACE.
750 		 */
751 		dst_ill = ire->ire_ill;
752 		if (ire->ire_type & IRE_ONLINK) {
753 			uint_t	ifindex;
754 
755 			/*
756 			 * Note there is no IRE_INTERFACE on vniN thus
757 			 * can't do an IRE lookup for a matching route.
758 			 */
759 			ifindex = dst_ill->ill_usesrc_ifindex;
760 			if (ifindex == 0)
761 				return (B_FALSE);
762 
763 			/*
764 			 * If there is a usable source address in the
765 			 * zone, then it's ok to return this IRE_INTERFACE
766 			 */
767 			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
768 			    zoneid, ipst)) {
769 				ip3dbg(("ire_match_args: no usrsrc for zone"
770 				    " dst_ill %p\n", (void *)dst_ill));
771 				return (B_FALSE);
772 			}
773 		}
774 		/*
775 		 * For example, with
776 		 * route add 11.0.0.0 gw1 -ifp bge0
777 		 * route add 11.0.0.0 gw2 -ifp bge1
778 		 * this code would differentiate based on
779 		 * where the sending zone has addresses.
780 		 * Only if the zone has an address on bge0 can it use the first
781 		 * route. It isn't clear if this behavior is documented
782 		 * anywhere.
783 		 */
784 		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
785 			ipif_t	*tipif;
786 
787 			mutex_enter(&dst_ill->ill_lock);
788 			for (tipif = dst_ill->ill_ipif;
789 			    tipif != NULL; tipif = tipif->ipif_next) {
790 				if (!IPIF_IS_CONDEMNED(tipif) &&
791 				    (tipif->ipif_flags & IPIF_UP) &&
792 				    (tipif->ipif_zoneid == zoneid ||
793 				    tipif->ipif_zoneid == ALL_ZONES))
794 					break;
795 			}
796 			mutex_exit(&dst_ill->ill_lock);
797 			if (tipif == NULL)
798 				return (B_FALSE);
799 		}
800 	}
801 
802 matchit:
803 	ire_ill = ire->ire_ill;
804 	if (match_flags & MATCH_IRE_GW) {
805 		mutex_enter(&ire->ire_lock);
806 		gw_addr_v6 = ire->ire_gateway_addr_v6;
807 		mutex_exit(&ire->ire_lock);
808 	}
809 	if (match_flags & MATCH_IRE_ILL) {
810 
811 		/*
812 		 * If asked to match an ill, we *must* match
813 		 * on the ire_ill for ipmp test addresses, or
814 		 * any of the ill in the group for data addresses.
815 		 * If we don't, we may as well fail.
816 		 * However, we need an exception for IRE_LOCALs to ensure
817 		 * we loopback packets even sent to test addresses on different
818 		 * interfaces in the group.
819 		 */
820 		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
821 		    !(ire->ire_type & IRE_LOCAL)) {
822 			if (ire->ire_ill != ill)
823 				return (B_FALSE);
824 		} else  {
825 			match_flags &= ~MATCH_IRE_TESTHIDDEN;
826 			/*
827 			 * We know that ill is not NULL, but ire_ill could be
828 			 * NULL
829 			 */
830 			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
831 				return (B_FALSE);
832 		}
833 	}
834 	if (match_flags & MATCH_IRE_SRC_ILL) {
835 		if (ire_ill == NULL)
836 			return (B_FALSE);
837 		if (!IS_ON_SAME_LAN(ill, ire_ill)) {
838 			if (ire_ill->ill_usesrc_ifindex == 0 ||
839 			    (ire_ill->ill_usesrc_ifindex !=
840 			    ill->ill_phyint->phyint_ifindex))
841 				return (B_FALSE);
842 		}
843 	}
844 
845 	/* No ire_addr_v6 bits set past the mask */
846 	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
847 	    ire->ire_addr_v6));
848 	V6_MASK_COPY(*addr, *mask, masked_addr);
849 	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
850 	    ((!(match_flags & MATCH_IRE_GW)) ||
851 	    ((!(match_flags & MATCH_IRE_DIRECT)) ||
852 	    !(ire->ire_flags & RTF_INDIRECT)) &&
853 	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
854 	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
855 	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
856 	    ((!(match_flags & MATCH_IRE_MASK)) ||
857 	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
858 	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
859 	    (!is_system_labeled()) ||
860 	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
861 		/* We found the matched IRE */
862 		return (B_TRUE);
863 	}
864 	return (B_FALSE);
865 }
866 
867 /*
868  * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
869  * gateway address. If ill is non-NULL we also match on it.
870  * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
871  */
872 boolean_t
873 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
874     const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
875 {
876 	ire_t	*ire;
877 	uint_t	match_flags;
878 
879 	if (lock_held)
880 		ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
881 	else
882 		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
883 
884 	match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
885 	if (ill != NULL)
886 		match_flags |= MATCH_IRE_ILL;
887 
888 	ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
889 	    &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
890 	    ipst);
891 
892 	if (!lock_held)
893 		rw_exit(&ipst->ips_ip6_ire_head_lock);
894 	if (ire != NULL) {
895 		ire_refrele(ire);
896 		return (B_TRUE);
897 	} else {
898 		return (B_FALSE);
899 	}
900 }
901 
902 /*
903  * Lookup a route in forwarding table.
904  * specific lookup is indicated by passing the
905  * required parameters and indicating the
906  * match required in flag field.
907  *
908  * Supports link-local addresses by following the ipif/ill when recursing.
909  */
910 ire_t *
911 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
912     const in6_addr_t *gateway, int type, const ill_t *ill,
913     zoneid_t zoneid, const ts_label_t *tsl, int flags,
914     uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
915 {
916 	ire_t *ire = NULL;
917 
918 	ASSERT(addr != NULL);
919 	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
920 	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
921 	ASSERT(ill == NULL || ill->ill_isv6);
922 
923 	ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
924 
925 	/*
926 	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
927 	 * or MATCH_IRE_SRC_ILL is set.
928 	 */
929 	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
930 		return (NULL);
931 
932 	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
933 	ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
934 	    tsl, flags, ipst);
935 	if (ire == NULL) {
936 		rw_exit(&ipst->ips_ip6_ire_head_lock);
937 		return (NULL);
938 	}
939 
940 	/*
941 	 * round-robin only if we have more than one route in the bucket.
942 	 * ips_ip_ecmp_behavior controls when we do ECMP
943 	 *	2:	always
944 	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
945 	 *	0:	never
946 	 *
947 	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
948 	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
949 	 * and the IRE_INTERFACESs are likely to be shorter matches.
950 	 */
951 	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
952 		if (ipst->ips_ip_ecmp_behavior == 2 ||
953 		    (ipst->ips_ip_ecmp_behavior == 1 &&
954 		    IS_DEFAULT_ROUTE_V6(ire))) {
955 			ire_t	*next_ire;
956 			ire_ftable_args_t margs;
957 
958 			bzero(&margs, sizeof (margs));
959 			margs.ift_addr_v6 = *addr;
960 			if (mask != NULL)
961 				margs.ift_mask_v6 = *mask;
962 			if (gateway != NULL)
963 				margs.ift_gateway_v6 = *gateway;
964 			margs.ift_type = type;
965 			margs.ift_ill = ill;
966 			margs.ift_zoneid = zoneid;
967 			margs.ift_tsl = tsl;
968 			margs.ift_flags = flags;
969 
970 			next_ire = ire_round_robin(ire->ire_bucket, &margs,
971 			    xmit_hint, ire, ipst);
972 			if (next_ire == NULL) {
973 				/* keep ire if next_ire is null */
974 				goto done;
975 			}
976 			ire_refrele(ire);
977 			ire = next_ire;
978 		}
979 	}
980 
981 done:
982 	/* Return generation before dropping lock */
983 	if (generationp != NULL)
984 		*generationp = ire->ire_generation;
985 
986 	rw_exit(&ipst->ips_ip6_ire_head_lock);
987 
988 	/*
989 	 * For shared-IP zones we need additional checks to what was
990 	 * done in ire_match_args to make sure IRE_LOCALs are handled.
991 	 *
992 	 * When ip_restrict_interzone_loopback is set, then
993 	 * we ensure that IRE_LOCAL are only used for loopback
994 	 * between zones when the logical "Ethernet" would
995 	 * have looped them back. That is, if in the absence of
996 	 * the IRE_LOCAL we would have sent the packet out the
997 	 * same ill.
998 	 */
999 	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
1000 	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
1001 	    ipst->ips_ip_restrict_interzone_loopback) {
1002 		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
1003 		ASSERT(ire != NULL);
1004 	}
1005 
1006 	return (ire);
1007 }
1008 
1009 /*
1010  * Look up a single ire. The caller holds either the read or write lock.
1011  */
1012 ire_t *
1013 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
1014     const in6_addr_t *gateway, int type, const ill_t *ill,
1015     zoneid_t zoneid, const ts_label_t *tsl, int flags,
1016     ip_stack_t *ipst)
1017 {
1018 	irb_t *irb_ptr;
1019 	ire_t *ire = NULL;
1020 	int i;
1021 
1022 	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
1023 
1024 	/*
1025 	 * If the mask is known, the lookup
1026 	 * is simple, if the mask is not known
1027 	 * we need to search.
1028 	 */
1029 	if (flags & MATCH_IRE_MASK) {
1030 		uint_t masklen;
1031 
1032 		masklen = ip_mask_to_plen_v6(mask);
1033 		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
1034 			return (NULL);
1035 		}
1036 		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
1037 		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
1038 		    ipst->ips_ip6_ftable_hash_size)]);
1039 		rw_enter(&irb_ptr->irb_lock, RW_READER);
1040 		for (ire = irb_ptr->irb_ire; ire != NULL;
1041 		    ire = ire->ire_next) {
1042 			if (IRE_IS_CONDEMNED(ire))
1043 				continue;
1044 			if (ire_match_args_v6(ire, addr, mask, gateway, type,
1045 			    ill, zoneid, tsl, flags))
1046 				goto found_ire;
1047 		}
1048 		rw_exit(&irb_ptr->irb_lock);
1049 	} else {
1050 		uint_t masklen;
1051 
1052 		/*
1053 		 * In this case we don't know the mask, we need to
1054 		 * search the table assuming different mask sizes.
1055 		 */
1056 		if (flags & MATCH_IRE_SHORTERMASK) {
1057 			masklen = ip_mask_to_plen_v6(mask);
1058 			if (masklen == 0) {
1059 				/* Nothing shorter than zero */
1060 				return (NULL);
1061 			}
1062 			masklen--;
1063 		} else {
1064 			masklen = IP6_MASK_TABLE_SIZE - 1;
1065 		}
1066 
1067 		for (i = masklen; i >= 0; i--) {
1068 			in6_addr_t tmpmask;
1069 
1070 			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
1071 				continue;
1072 			(void) ip_plen_to_mask_v6(i, &tmpmask);
1073 			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
1074 			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
1075 			    ipst->ips_ip6_ftable_hash_size)];
1076 			rw_enter(&irb_ptr->irb_lock, RW_READER);
1077 			for (ire = irb_ptr->irb_ire; ire != NULL;
1078 			    ire = ire->ire_next) {
1079 				if (IRE_IS_CONDEMNED(ire))
1080 					continue;
1081 				if (ire_match_args_v6(ire, addr,
1082 				    &ire->ire_mask_v6, gateway, type, ill,
1083 				    zoneid, tsl, flags))
1084 					goto found_ire;
1085 			}
1086 			rw_exit(&irb_ptr->irb_lock);
1087 		}
1088 	}
1089 	ASSERT(ire == NULL);
1090 	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
1091 	return (NULL);
1092 
1093 found_ire:
1094 	ire_refhold(ire);
1095 	rw_exit(&irb_ptr->irb_lock);
1096 	return (ire);
1097 }
1098 
1099 
1100 /*
1101  * This function is called by
1102  * ip_input/ire_route_recursive when doing a route lookup on only the
1103  * destination address.
1104  *
1105  * The optimizations of this function over ire_ftable_lookup are:
1106  *	o removing unnecessary flag matching
1107  *	o doing longest prefix match instead of overloading it further
1108  *	  with the unnecessary "best_prefix_match"
1109  *
1110  * If no route is found we return IRE_NOROUTE.
1111  */
1112 ire_t *
1113 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
1114     ip_stack_t *ipst, uint_t *generationp)
1115 {
1116 	ire_t	*ire;
1117 
1118 	ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
1119 	    MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
1120 	if (ire == NULL) {
1121 		ire = ire_reject(ipst, B_TRUE);
1122 		if (generationp != NULL)
1123 			*generationp = IRE_GENERATION_VERIFY;
1124 	}
1125 	/* ftable_lookup did round robin */
1126 	return (ire);
1127 }
1128 
1129 ire_t *
1130 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src,
1131     ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
1132     int *errorp, boolean_t *multirtp)
1133 {
1134 	ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
1135 
1136 	return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp,
1137 	    multirtp));
1138 }
1139 
1140 /*
1141  * Recursively look for a route to the destination. Can also match on
1142  * the zoneid, ill, and label. Used for the data paths. See also
1143  * ire_route_recursive_dstonly.
1144  *
1145  * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never
1146  * create an IRE_IF_CLONE. This is used on the receive side when we are not
1147  * forwarding.
1148  * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
1149  * resolve the gateway.
1150  *
1151  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1152  * instead.
1153  *
1154  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1155  * is an error.
1156  * Allow at most one RTF_INDIRECT.
1157  */
1158 ire_t *
1159 ire_route_recursive_impl_v6(ire_t *ire,
1160     const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
1161     zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1162     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1163     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1164 {
1165 	int		i, j;
1166 	in6_addr_t	v6nexthop = *nexthop;
1167 	ire_t		*ires[MAX_IRE_RECURSION];
1168 	uint_t		generation;
1169 	uint_t		generations[MAX_IRE_RECURSION];
1170 	boolean_t	need_refrele = B_FALSE;
1171 	boolean_t	invalidate = B_FALSE;
1172 	ill_t		*ill = NULL;
1173 	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK);
1174 
1175 	if (setsrcp != NULL)
1176 		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
1177 	if (gwattrp != NULL)
1178 		ASSERT(*gwattrp == NULL);
1179 
1180 	/*
1181 	 * We iterate up to three times to resolve a route, even though
1182 	 * we have four slots in the array. The extra slot is for an
1183 	 * IRE_IF_CLONE we might need to create.
1184 	 */
1185 	i = 0;
1186 	while (i < MAX_IRE_RECURSION - 1) {
1187 		/* ire_ftable_lookup handles round-robin/ECMP */
1188 		if (ire == NULL) {
1189 			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
1190 			    (ill != NULL ? ill : ill_arg), zoneid, tsl,
1191 			    match_args, xmit_hint, ipst, &generation);
1192 		} else {
1193 			/* Caller passed it; extra hold since we will rele */
1194 			ire_refhold(ire);
1195 			if (generationp != NULL)
1196 				generation = *generationp;
1197 			else
1198 				generation = IRE_GENERATION_VERIFY;
1199 		}
1200 
1201 		if (ire == NULL) {
1202 			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
1203 				ire = ires[0];
1204 				ire_refhold(ire);
1205 			} else {
1206 				ire = ire_reject(ipst, B_TRUE);
1207 			}
1208 			goto error;
1209 		}
1210 
1211 		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
1212 		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
1213 			goto error;
1214 
1215 		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
1216 
1217 		/*
1218 		 * Don't allow anything unusual past the first iteration.
1219 		 * After the first lookup, we should no longer look for
1220 		 * (IRE_LOCAL|IRE_LOOPBACK) or RTF_INDIRECT routes.
1221 		 *
1222 		 * In addition, after we have found a direct IRE_OFFLINK,
1223 		 * we should only look for interface or clone routes.
1224 		 */
1225 		match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */
1226 		if ((ire->ire_type & IRE_OFFLINK) &&
1227 		    !(ire->ire_flags & RTF_INDIRECT)) {
1228 			ire_type = IRE_IF_ALL;
1229 		} else {
1230 			if (!(match_args & MATCH_IRE_TYPE))
1231 				ire_type = (IRE_OFFLINK|IRE_ONLINK);
1232 			ire_type &= ~maskoff; /* no more LOCAL, LOOPBACK */
1233 		}
1234 		match_args |= MATCH_IRE_TYPE;
1235 		/* We have a usable IRE */
1236 		ires[i] = ire;
1237 		generations[i] = generation;
1238 		i++;
1239 
1240 		/* The first RTF_SETSRC address is passed back if setsrcp */
1241 		if ((ire->ire_flags & RTF_SETSRC) &&
1242 		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
1243 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
1244 			    &ire->ire_setsrc_addr_v6));
1245 			*setsrcp = ire->ire_setsrc_addr_v6;
1246 		}
1247 
1248 		/* The first ire_gw_secattr is passed back if gwattrp */
1249 		if (ire->ire_gw_secattr != NULL &&
1250 		    gwattrp != NULL && *gwattrp == NULL)
1251 			*gwattrp = ire->ire_gw_secattr;
1252 
1253 		/*
1254 		 * Check if we have a short-cut pointer to an IRE for this
1255 		 * destination, and that the cached dependency isn't stale.
1256 		 * In that case we've rejoined an existing tree towards a
1257 		 * parent, thus we don't need to continue the loop to
1258 		 * discover the rest of the tree.
1259 		 */
1260 		mutex_enter(&ire->ire_lock);
1261 		if (ire->ire_dep_parent != NULL &&
1262 		    ire->ire_dep_parent->ire_generation ==
1263 		    ire->ire_dep_parent_generation) {
1264 			mutex_exit(&ire->ire_lock);
1265 			ire = NULL;
1266 			goto done;
1267 		}
1268 		mutex_exit(&ire->ire_lock);
1269 
1270 		/*
1271 		 * If this type should have an ire_nce_cache (even if it
1272 		 * doesn't yet have one) then we are done. Includes
1273 		 * IRE_INTERFACE with a full 128 bit mask.
1274 		 */
1275 		if (ire->ire_nce_capable) {
1276 			ire = NULL;
1277 			goto done;
1278 		}
1279 		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
1280 		/*
1281 		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
1282 		 * particular destination
1283 		 */
1284 		if (ire->ire_type & IRE_INTERFACE) {
1285 			ire_t		*clone;
1286 
1287 			ASSERT(ire->ire_masklen != IPV6_ABITS);
1288 
1289 			/*
1290 			 * In the case of ip_input and ILLF_FORWARDING not
1291 			 * being set, and in the case of RTM_GET, there is
1292 			 * no point in allocating an IRE_IF_CLONE. We return
1293 			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
1294 			 * result in a ire_dep_parent which is IRE_IF_*
1295 			 * without an IRE_IF_CLONE.
1296 			 * We recover from that when we need to send packets
1297 			 * by ensuring that the generations become
1298 			 * IRE_GENERATION_VERIFY in this case.
1299 			 */
1300 			if (!(irr_flags & IRR_ALLOCATE)) {
1301 				invalidate = B_TRUE;
1302 				ire = NULL;
1303 				goto done;
1304 			}
1305 
1306 			clone = ire_create_if_clone(ire, &v6nexthop,
1307 			    &generation);
1308 			if (clone == NULL) {
1309 				/*
1310 				 * Temporary failure - no memory.
1311 				 * Don't want caller to cache IRE_NOROUTE.
1312 				 */
1313 				invalidate = B_TRUE;
1314 				ire = ire_blackhole(ipst, B_TRUE);
1315 				goto error;
1316 			}
1317 			/*
1318 			 * Make clone next to last entry and the
1319 			 * IRE_INTERFACE the last in the dependency
1320 			 * chain since the clone depends on the
1321 			 * IRE_INTERFACE.
1322 			 */
1323 			ASSERT(i >= 1);
1324 			ASSERT(i < MAX_IRE_RECURSION);
1325 
1326 			ires[i] = ires[i-1];
1327 			generations[i] = generations[i-1];
1328 			ires[i-1] = clone;
1329 			generations[i-1] = generation;
1330 			i++;
1331 
1332 			ire = NULL;
1333 			goto done;
1334 		}
1335 
1336 		/*
1337 		 * We only match on the type and optionally ILL when
1338 		 * recursing. The type match is used by some callers
1339 		 * to exclude certain types (such as IRE_IF_CLONE or
1340 		 * IRE_LOCAL|IRE_LOOPBACK).
1341 		 *
1342 		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
1343 		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
1344 		 * ire_ill, so we set ill to the ire_ill
1345 		 */
1346 		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
1347 		v6nexthop = ire->ire_gateway_addr_v6;
1348 		if (ill == NULL && ire->ire_ill != NULL) {
1349 			ill = ire->ire_ill;
1350 			need_refrele = B_TRUE;
1351 			ill_refhold(ill);
1352 			match_args |= MATCH_IRE_ILL;
1353 		}
1354 		ire = NULL;
1355 	}
1356 	ASSERT(ire == NULL);
1357 	ire = ire_reject(ipst, B_TRUE);
1358 
1359 error:
1360 	ASSERT(ire != NULL);
1361 	if (need_refrele)
1362 		ill_refrele(ill);
1363 
1364 	/*
1365 	 * In the case of MULTIRT we want to try a different IRE the next
1366 	 * time. We let the next packet retry in that case.
1367 	 */
1368 	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
1369 		(void) ire_no_good(ires[0]);
1370 
1371 cleanup:
1372 	/* cleanup ires[i] */
1373 	ire_dep_unbuild(ires, i);
1374 	for (j = 0; j < i; j++)
1375 		ire_refrele(ires[j]);
1376 
1377 	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
1378 	    (irr_flags & IRR_INCOMPLETE));
1379 	/*
1380 	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
1381 	 * ip_select_route since the reject or lack of memory might be gone.
1382 	 */
1383 	if (generationp != NULL)
1384 		*generationp = IRE_GENERATION_VERIFY;
1385 	return (ire);
1386 
1387 done:
1388 	ASSERT(ire == NULL);
1389 	if (need_refrele)
1390 		ill_refrele(ill);
1391 
1392 	/* Build dependencies */
1393 	if (i > 1 && !ire_dep_build(ires, generations, i)) {
1394 		/* Something in chain was condemned; tear it apart */
1395 		ire = ire_blackhole(ipst, B_TRUE);
1396 		goto cleanup;
1397 	}
1398 
1399 	/*
1400 	 * Release all refholds except the one for ires[0] that we
1401 	 * will return to the caller.
1402 	 */
1403 	for (j = 1; j < i; j++)
1404 		ire_refrele(ires[j]);
1405 
1406 	if (invalidate) {
1407 		/*
1408 		 * Since we needed to allocate but couldn't we need to make
1409 		 * sure that the dependency chain is rebuilt the next time.
1410 		 */
1411 		ire_dep_invalidate_generations(ires[0]);
1412 		generation = IRE_GENERATION_VERIFY;
1413 	} else {
1414 		/*
1415 		 * IREs can have been added or deleted while we did the
1416 		 * recursive lookup and we can't catch those until we've built
1417 		 * the dependencies. We verify the stored
1418 		 * ire_dep_parent_generation to catch any such changes and
1419 		 * return IRE_GENERATION_VERIFY (which will cause
1420 		 * ip_select_route to be called again so we can redo the
1421 		 * recursive lookup next time we send a packet.
1422 		 */
1423 		if (ires[0]->ire_dep_parent == NULL)
1424 			generation = ires[0]->ire_generation;
1425 		else
1426 			generation = ire_dep_validate_generations(ires[0]);
1427 		if (generations[0] != ires[0]->ire_generation) {
1428 			/* Something changed at the top */
1429 			generation = IRE_GENERATION_VERIFY;
1430 		}
1431 	}
1432 	if (generationp != NULL)
1433 		*generationp = generation;
1434 
1435 	return (ires[0]);
1436 }
1437 
1438 ire_t *
1439 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
1440     const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
1441     uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst,
1442     in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
1443 {
1444 	return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
1445 	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
1446 	    gwattrp, generationp));
1447 }
1448 
1449 /*
1450  * Recursively look for a route to the destination.
1451  * We only handle a destination match here, yet we have the same arguments
1452  * as the full match to allow function pointers to select between the two.
1453  *
1454  * Note that this function never returns NULL. It returns an IRE_NOROUTE
1455  * instead.
1456  *
1457  * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
1458  * is an error.
1459  * Allow at most one RTF_INDIRECT.
1460  */
1461 ire_t *
1462 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags,
1463     uint32_t xmit_hint, ip_stack_t *ipst)
1464 {
1465 	ire_t	*ire;
1466 	ire_t	*ire1;
1467 	uint_t	generation;
1468 
1469 	/* ire_ftable_lookup handles round-robin/ECMP */
1470 	ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
1471 	    &generation);
1472 	ASSERT(ire != NULL);
1473 
1474 	/*
1475 	 * If this type should have an ire_nce_cache (even if it
1476 	 * doesn't yet have one) then we are done. Includes
1477 	 * IRE_INTERFACE with a full 128 bit mask.
1478 	 */
1479 	if (ire->ire_nce_capable)
1480 		return (ire);
1481 
1482 	/*
1483 	 * If the IRE has a current cached parent we know that the whole
1484 	 * parent chain is current, hence we don't need to discover and
1485 	 * build any dependencies by doing a recursive lookup.
1486 	 */
1487 	mutex_enter(&ire->ire_lock);
1488 	if (ire->ire_dep_parent != NULL &&
1489 	    ire->ire_dep_parent->ire_generation ==
1490 	    ire->ire_dep_parent_generation) {
1491 		mutex_exit(&ire->ire_lock);
1492 		return (ire);
1493 	}
1494 	mutex_exit(&ire->ire_lock);
1495 
1496 	/*
1497 	 * Fallback to loop in the normal code starting with the ire
1498 	 * we found. Normally this would return the same ire.
1499 	 */
1500 	ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
1501 	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
1502 	    &generation);
1503 	ire_refrele(ire);
1504 	return (ire1);
1505 }
1506