xref: /titanic_51/usr/src/uts/common/inet/ip/tnet.c (revision 0c240c64cf90f44c2fdf3439010f6e8b33d85e7d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/strsubr.h>
29 #include <sys/stropts.h>
30 #include <sys/sunddi.h>
31 #include <sys/cred.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/errno.h>
35 #include <sys/disp.h>
36 #include <netinet/in.h>
37 #include <netinet/in_systm.h>
38 #include <netinet/ip.h>
39 #include <netinet/ip_icmp.h>
40 #include <netinet/tcp.h>
41 #include <inet/common.h>
42 #include <inet/ipclassifier.h>
43 #include <inet/ip.h>
44 #include <inet/mib2.h>
45 #include <inet/nd.h>
46 #include <inet/tcp.h>
47 #include <inet/ip_rts.h>
48 #include <inet/ip_ire.h>
49 #include <inet/ip_if.h>
50 #include <sys/modhash.h>
51 
52 #include <sys/tsol/label.h>
53 #include <sys/tsol/label_macro.h>
54 #include <sys/tsol/tnet.h>
55 #include <sys/tsol/tndb.h>
56 #include <sys/strsun.h>
57 
58 /* tunable for strict error-reply behavior (TCP RST and ICMP Unreachable) */
59 int tsol_strict_error;
60 
61 /*
62  * Some notes on the Trusted Solaris IRE gateway security attributes:
63  *
64  * When running in Trusted mode, the routing subsystem determines whether or
65  * not a packet can be delivered to an off-link host (not directly reachable
66  * through an interface) based on the accreditation checks of the packet's
67  * security attributes against those associated with the next-hop gateway.
68  *
69  * The next-hop gateway's security attributes can be derived from two sources
70  * (in order of preference): route-related and the host database.  A Trusted
71  * system must be configured with at least the host database containing an
72  * entry for the next-hop gateway, or otherwise no accreditation checks can
73  * be performed, which may result in the inability to send packets to any
74  * off-link destination host.
75  *
76  * The major differences between the two sources are the number and type of
77  * security attributes used for accreditation checks.  A host database entry
78  * can contain at most one set of security attributes, specific only to the
79  * next-hop gateway.  In contrast, route-related security attributes are made
80  * up of a collection of security attributes for the distant networks, and
81  * are grouped together per next-hop gateway used to reach those networks.
82  * This is the preferred method, and the routing subsystem will fallback to
83  * the host database entry only if there are no route-related attributes
84  * associated with the next-hop gateway.
85  *
86  * In Trusted mode, all of the IRE entries (except LOCAL/LOOPBACK/BROADCAST/
87  * INTERFACE type) are initialized to contain a placeholder to store this
88  * information.  The ire_gw_secattr structure gets allocated, initialized
89  * and associated with the IRE during the time of the IRE creation.  The
90  * initialization process also includes resolving the host database entry
91  * of the next-hop gateway for fallback purposes.  It does not include any
92  * route-related attribute setup, as that process comes separately as part
93  * of the route requests (add/change) made to the routing subsystem.
94  *
95  * The underlying logic which involves associating IREs with the gateway
96  * security attributes is represented by the following data structures:
97  *
98  * tsol_gcdb_t, or "gcdb"
99  *
100  *	- This is a system-wide collection of records containing the
101  *	  currently used route-related security attributes, which are fed
102  *	  through the routing socket interface, e.g. "route add/change".
103  *
104  * tsol_gc_t, or "gc"
105  *
106  *	- This is the gateway credential structure, and it provides the
107  *	  only mechanism to access the contents of gcdb.  More than one gc
108  *	  entry may refer to the same gcdb record.  gc's in the system are
109  *	  grouped according to the next-hop gateway address.
110  *
111  * tsol_gcgrp_t, or "gcgrp"
112  *
113  *	- Group of gateway credentials, and is unique per next-hop gateway
114  *	  address.  When the group is not empty, i.e. when gcgrp_count is
115  *	  greater than zero, it contains one or more gc's, each pointing to
116  *	  a gcdb record which indicates the gateway security attributes
117  *	  associated with the next-hop gateway.
118  *
119  * The fields of the tsol_ire_gw_secattr_t used from within the IRE are:
120  *
121  * igsa_lock
122  *
123  *	- Lock that protects all fields within tsol_ire_gw_secattr_t.
124  *
125  * igsa_rhc
126  *
127  *	- Remote host cache database entry of next-hop gateway.  This is
128  *	  used in the case when there are no route-related attributes
129  *	  configured for the IRE.
130  *
131  * igsa_gc
132  *
133  *	- A set of route-related attributes that only get set for prefix
134  *	  IREs.  If this is non-NULL, the prefix IRE has been associated
135  *	  with a set of gateway security attributes by way of route add/
136  *	  change functionality.
137  */
138 
139 static kmem_cache_t *ire_gw_secattr_cache;
140 
141 #define	GCDB_HASH_SIZE	101
142 #define	GCGRP_HASH_SIZE	101
143 
/*
 * Drop one reference on the gcdb record p.  When the count reaches zero the
 * record is handed to gcdb_inactive(), which removes it from gcdb_hash and
 * frees it.  The macro acquires and releases gcdb_lock itself.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely as the body of an if/else.  The
 * argument is evaluated more than once; pass an expression without side
 * effects.
 */
#define	GCDB_REFRELE(p) do {			\
	mutex_enter(&gcdb_lock);		\
	ASSERT((p)->gcdb_refcnt > 0);		\
	if (--((p)->gcdb_refcnt) == 0)		\
		gcdb_inactive(p);		\
	ASSERT(MUTEX_HELD(&gcdb_lock));		\
	mutex_exit(&gcdb_lock);			\
} while (0)
152 
/*
 * Bucket counts and handles for the three hashes created in tnet_init():
 * gcdb_hash is keyed by route security attributes, gcgrp4_hash and
 * gcgrp6_hash by IPv4 and IPv6 gateway address respectively.
 */
153 static int gcdb_hash_size = GCDB_HASH_SIZE;
154 static int gcgrp_hash_size = GCGRP_HASH_SIZE;
155 static mod_hash_t *gcdb_hash;
156 static mod_hash_t *gcgrp4_hash;
157 static mod_hash_t *gcgrp6_hash;
158 
/*
 * gcdb_lock is held around gcdb_hash lookups/updates and gcdb_refcnt changes;
 * gcgrp_lock is held around the gcgrp hashes and gcgrp_refcnt changes.
 * gcgrp_lock is not static, so it is visible outside this file.
 */
159 static kmutex_t gcdb_lock;
160 kmutex_t gcgrp_lock;
161 
/* gcdb hash callbacks and internal gcdb helpers */
162 static uint_t gcdb_hash_by_secattr(void *, mod_hash_key_t);
163 static int gcdb_hash_cmp(mod_hash_key_t, mod_hash_key_t);
164 static tsol_gcdb_t *gcdb_lookup(struct rtsa_s *, boolean_t);
165 static void gcdb_inactive(tsol_gcdb_t *);
166 
/* gcgrp hash callbacks (shared by the IPv4 and IPv6 hashes) */
167 static uint_t gcgrp_hash_by_addr(void *, mod_hash_key_t);
168 static int gcgrp_hash_cmp(mod_hash_key_t, mod_hash_key_t);
169 
/* kmem cache callbacks for ire_gw_secattr_cache */
170 static int ire_gw_secattr_constructor(void *, void *, int);
171 static void ire_gw_secattr_destructor(void *, void *);
172 
173 void
174 tnet_init(void)
175 {
176 	ire_gw_secattr_cache = kmem_cache_create("ire_gw_secattr_cache",
177 	    sizeof (tsol_ire_gw_secattr_t), 64, ire_gw_secattr_constructor,
178 	    ire_gw_secattr_destructor, NULL, NULL, NULL, 0);
179 
180 	gcdb_hash = mod_hash_create_extended("gcdb_hash",
181 	    gcdb_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
182 	    gcdb_hash_by_secattr, NULL, gcdb_hash_cmp, KM_SLEEP);
183 
184 	gcgrp4_hash = mod_hash_create_extended("gcgrp4_hash",
185 	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
186 	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
187 
188 	gcgrp6_hash = mod_hash_create_extended("gcgrp6_hash",
189 	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
190 	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
191 
192 	mutex_init(&gcdb_lock, NULL, MUTEX_DEFAULT, NULL);
193 	mutex_init(&gcgrp_lock, NULL, MUTEX_DEFAULT, NULL);
194 }
195 
196 void
197 tnet_fini(void)
198 {
199 	kmem_cache_destroy(ire_gw_secattr_cache);
200 	mod_hash_destroy_hash(gcdb_hash);
201 	mod_hash_destroy_hash(gcgrp4_hash);
202 	mod_hash_destroy_hash(gcgrp6_hash);
203 	mutex_destroy(&gcdb_lock);
204 	mutex_destroy(&gcgrp_lock);
205 }
206 
207 /* ARGSUSED */
208 static int
209 ire_gw_secattr_constructor(void *buf, void *cdrarg, int kmflags)
210 {
211 	tsol_ire_gw_secattr_t *attrp = buf;
212 
213 	mutex_init(&attrp->igsa_lock, NULL, MUTEX_DEFAULT, NULL);
214 
215 	attrp->igsa_rhc = NULL;
216 	attrp->igsa_gc = NULL;
217 
218 	return (0);
219 }
220 
221 /* ARGSUSED */
222 static void
223 ire_gw_secattr_destructor(void *buf, void *cdrarg)
224 {
225 	tsol_ire_gw_secattr_t *attrp = (tsol_ire_gw_secattr_t *)buf;
226 
227 	mutex_destroy(&attrp->igsa_lock);
228 }
229 
230 tsol_ire_gw_secattr_t *
231 ire_gw_secattr_alloc(int kmflags)
232 {
233 	return (kmem_cache_alloc(ire_gw_secattr_cache, kmflags));
234 }
235 
236 void
237 ire_gw_secattr_free(tsol_ire_gw_secattr_t *attrp)
238 {
239 	ASSERT(MUTEX_NOT_HELD(&attrp->igsa_lock));
240 
241 	if (attrp->igsa_rhc != NULL) {
242 		TNRHC_RELE(attrp->igsa_rhc);
243 		attrp->igsa_rhc = NULL;
244 	}
245 
246 	if (attrp->igsa_gc != NULL) {
247 		GC_REFRELE(attrp->igsa_gc);
248 		attrp->igsa_gc = NULL;
249 	}
250 
251 	ASSERT(attrp->igsa_rhc == NULL);
252 	ASSERT(attrp->igsa_gc == NULL);
253 
254 	kmem_cache_free(ire_gw_secattr_cache, attrp);
255 }
256 
257 /* ARGSUSED */
258 static uint_t
259 gcdb_hash_by_secattr(void *hash_data, mod_hash_key_t key)
260 {
261 	const struct rtsa_s *rp = (struct rtsa_s *)key;
262 	const uint32_t *up, *ue;
263 	uint_t hash;
264 	int i;
265 
266 	ASSERT(rp != NULL);
267 
268 	/* See comments in hash_bylabel in zone.c for details */
269 	hash = rp->rtsa_doi + (rp->rtsa_doi << 1);
270 	up = (const uint32_t *)&rp->rtsa_slrange;
271 	ue = up + sizeof (rp->rtsa_slrange) / sizeof (*up);
272 	i = 1;
273 	while (up < ue) {
274 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
275 		hash += *up + (*up << ((i % 16) + 1));
276 		up++;
277 		i++;
278 	}
279 	return (hash);
280 }
281 
282 static int
283 gcdb_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
284 {
285 	struct rtsa_s *rp1 = (struct rtsa_s *)key1;
286 	struct rtsa_s *rp2 = (struct rtsa_s *)key2;
287 
288 	ASSERT(rp1 != NULL && rp2 != NULL);
289 
290 	if (blequal(&rp1->rtsa_slrange.lower_bound,
291 	    &rp2->rtsa_slrange.lower_bound) &&
292 	    blequal(&rp1->rtsa_slrange.upper_bound,
293 	    &rp2->rtsa_slrange.upper_bound) &&
294 	    rp1->rtsa_doi == rp2->rtsa_doi)
295 		return (0);
296 
297 	/* No match; not found */
298 	return (-1);
299 }
300 
301 /* ARGSUSED */
302 static uint_t
303 gcgrp_hash_by_addr(void *hash_data, mod_hash_key_t key)
304 {
305 	tsol_gcgrp_addr_t *ga = (tsol_gcgrp_addr_t *)key;
306 	uint_t		idx = 0;
307 	uint32_t	*ap;
308 
309 	ASSERT(ga != NULL);
310 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
311 
312 	ap = (uint32_t *)&ga->ga_addr.s6_addr32[0];
313 	idx ^= *ap++;
314 	idx ^= *ap++;
315 	idx ^= *ap++;
316 	idx ^= *ap;
317 
318 	return (idx);
319 }
320 
321 static int
322 gcgrp_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
323 {
324 	tsol_gcgrp_addr_t *ga1 = (tsol_gcgrp_addr_t *)key1;
325 	tsol_gcgrp_addr_t *ga2 = (tsol_gcgrp_addr_t *)key2;
326 
327 	ASSERT(ga1 != NULL && ga2 != NULL);
328 
329 	/* Address family must match */
330 	if (ga1->ga_af != ga2->ga_af)
331 		return (-1);
332 
333 	if (ga1->ga_addr.s6_addr32[0] == ga2->ga_addr.s6_addr32[0] &&
334 	    ga1->ga_addr.s6_addr32[1] == ga2->ga_addr.s6_addr32[1] &&
335 	    ga1->ga_addr.s6_addr32[2] == ga2->ga_addr.s6_addr32[2] &&
336 	    ga1->ga_addr.s6_addr32[3] == ga2->ga_addr.s6_addr32[3])
337 		return (0);
338 
339 	/* No match; not found */
340 	return (-1);
341 }
342 
/*
 * Names for the rtsa_mask flag bits.
 * NOTE(review): the layout (a leading base byte followed by bit-number/name
 * pairs) looks like a "%b"-style bit-field format string; no user of this
 * macro is visible in this part of the file -- confirm before relying on it.
 */
343 #define	RTSAFLAGS	"\20\11cipso\3doi\2max_sl\1min_sl"
344 
345 int
346 rtsa_validate(const struct rtsa_s *rp)
347 {
348 	uint32_t mask = rp->rtsa_mask;
349 
350 	/* RTSA_CIPSO must be set, and DOI must not be zero */
351 	if ((mask & RTSA_CIPSO) == 0 || rp->rtsa_doi == 0) {
352 		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
353 		    "rtsa(1) lacks flag or has 0 doi.",
354 		    rtsa_s *, rp);
355 		return (EINVAL);
356 	}
357 	/*
358 	 * SL range must be specified, and it must have its
359 	 * upper bound dominating its lower bound.
360 	 */
361 	if ((mask & RTSA_SLRANGE) != RTSA_SLRANGE ||
362 	    !bldominates(&rp->rtsa_slrange.upper_bound,
363 	    &rp->rtsa_slrange.lower_bound)) {
364 		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
365 		    "rtsa(1) min_sl and max_sl not set or max_sl is "
366 		    "not dominating.", rtsa_s *, rp);
367 		return (EINVAL);
368 	}
369 	return (0);
370 }
371 
372 /*
373  * A brief explanation of the reference counting scheme:
374  *
375  * Apart from dynamic references due to reference holds done
376  * actively by threads, we have the following references:
377  *
378  * gcdb_refcnt:
379  *	- Every tsol_gc_t pointing to a tsol_gcdb_t contributes a reference
380  *	  to the gcdb_refcnt.
381  *
382  * gc_refcnt:
383  *	- A prefix IRE that points to an igsa_gc contributes a reference
384  *	  to the gc_refcnt.
385  *
386  * gcgrp_refcnt:
387  *	- Every tsol_gc_t in the chain headed by tsol_gcgrp_t contributes
388  *	  a reference to the gcgrp_refcnt.
389  */
390 static tsol_gcdb_t *
391 gcdb_lookup(struct rtsa_s *rp, boolean_t alloc)
392 {
393 	tsol_gcdb_t *gcdb = NULL;
394 
395 	if (rtsa_validate(rp) != 0)
396 		return (NULL);
397 
398 	mutex_enter(&gcdb_lock);
399 	/* Find a copy in the cache; otherwise, create one and cache it */
400 	if (mod_hash_find(gcdb_hash, (mod_hash_key_t)rp,
401 	    (mod_hash_val_t *)&gcdb) == 0) {
402 		gcdb->gcdb_refcnt++;
403 		ASSERT(gcdb->gcdb_refcnt != 0);
404 
405 		DTRACE_PROBE2(tx__gcdb__log__info__gcdb__lookup, char *,
406 		    "gcdb(1) is in gcdb_hash(global)", tsol_gcdb_t *, gcdb);
407 	} else if (alloc) {
408 		gcdb = kmem_zalloc(sizeof (*gcdb), KM_NOSLEEP);
409 		if (gcdb != NULL) {
410 			gcdb->gcdb_refcnt = 1;
411 			gcdb->gcdb_mask = rp->rtsa_mask;
412 			gcdb->gcdb_doi = rp->rtsa_doi;
413 			gcdb->gcdb_slrange = rp->rtsa_slrange;
414 
415 			if (mod_hash_insert(gcdb_hash,
416 			    (mod_hash_key_t)&gcdb->gcdb_attr,
417 			    (mod_hash_val_t)gcdb) != 0) {
418 				mutex_exit(&gcdb_lock);
419 				kmem_free(gcdb, sizeof (*gcdb));
420 				return (NULL);
421 			}
422 
423 			DTRACE_PROBE2(tx__gcdb__log__info__gcdb__insert, char *,
424 			    "gcdb(1) inserted in gcdb_hash(global)",
425 			    tsol_gcdb_t *, gcdb);
426 		}
427 	}
428 	mutex_exit(&gcdb_lock);
429 	return (gcdb);
430 }
431 
432 static void
433 gcdb_inactive(tsol_gcdb_t *gcdb)
434 {
435 	ASSERT(MUTEX_HELD(&gcdb_lock));
436 	ASSERT(gcdb != NULL && gcdb->gcdb_refcnt == 0);
437 
438 	(void) mod_hash_remove(gcdb_hash, (mod_hash_key_t)&gcdb->gcdb_attr,
439 	    (mod_hash_val_t *)&gcdb);
440 
441 	DTRACE_PROBE2(tx__gcdb__log__info__gcdb__remove, char *,
442 	    "gcdb(1) removed from gcdb_hash(global)",
443 	    tsol_gcdb_t *, gcdb);
444 	kmem_free(gcdb, sizeof (*gcdb));
445 }
446 
447 tsol_gc_t *
448 gc_create(struct rtsa_s *rp, tsol_gcgrp_t *gcgrp, boolean_t *gcgrp_xtrarefp)
449 {
450 	tsol_gc_t *gc;
451 	tsol_gcdb_t *gcdb;
452 
453 	*gcgrp_xtrarefp = B_TRUE;
454 
455 	rw_enter(&gcgrp->gcgrp_rwlock, RW_WRITER);
456 	if ((gcdb = gcdb_lookup(rp, B_TRUE)) == NULL) {
457 		rw_exit(&gcgrp->gcgrp_rwlock);
458 		return (NULL);
459 	}
460 
461 	for (gc = gcgrp->gcgrp_head; gc != NULL; gc = gc->gc_next) {
462 		if (gc->gc_db == gcdb) {
463 			ASSERT(gc->gc_grp == gcgrp);
464 
465 			gc->gc_refcnt++;
466 			ASSERT(gc->gc_refcnt != 0);
467 
468 			GCDB_REFRELE(gcdb);
469 
470 			DTRACE_PROBE3(tx__gcdb__log__info__gc__create,
471 			    char *, "found gc(1) in gcgrp(2)",
472 			    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
473 			rw_exit(&gcgrp->gcgrp_rwlock);
474 			return (gc);
475 		}
476 	}
477 
478 	gc = kmem_zalloc(sizeof (*gc), KM_NOSLEEP);
479 	if (gc != NULL) {
480 		if (gcgrp->gcgrp_head == NULL) {
481 			gcgrp->gcgrp_head = gcgrp->gcgrp_tail = gc;
482 		} else {
483 			gcgrp->gcgrp_tail->gc_next = gc;
484 			gc->gc_prev = gcgrp->gcgrp_tail;
485 			gcgrp->gcgrp_tail = gc;
486 		}
487 		gcgrp->gcgrp_count++;
488 		ASSERT(gcgrp->gcgrp_count != 0);
489 
490 		/* caller has incremented gcgrp reference for us */
491 		gc->gc_grp = gcgrp;
492 
493 		gc->gc_db = gcdb;
494 		gc->gc_refcnt = 1;
495 
496 		DTRACE_PROBE3(tx__gcdb__log__info__gc__create, char *,
497 		    "added gc(1) to gcgrp(2)", tsol_gc_t *, gc,
498 		    tsol_gcgrp_t *, gcgrp);
499 
500 		*gcgrp_xtrarefp = B_FALSE;
501 	}
502 	rw_exit(&gcgrp->gcgrp_rwlock);
503 
504 	return (gc);
505 }
506 
507 void
508 gc_inactive(tsol_gc_t *gc)
509 {
510 	tsol_gcgrp_t *gcgrp = gc->gc_grp;
511 
512 	ASSERT(gcgrp != NULL);
513 	ASSERT(RW_WRITE_HELD(&gcgrp->gcgrp_rwlock));
514 	ASSERT(gc->gc_refcnt == 0);
515 
516 	if (gc->gc_prev != NULL)
517 		gc->gc_prev->gc_next = gc->gc_next;
518 	else
519 		gcgrp->gcgrp_head = gc->gc_next;
520 	if (gc->gc_next != NULL)
521 		gc->gc_next->gc_prev = gc->gc_prev;
522 	else
523 		gcgrp->gcgrp_tail = gc->gc_prev;
524 	ASSERT(gcgrp->gcgrp_count > 0);
525 	gcgrp->gcgrp_count--;
526 
527 	/* drop lock before it's destroyed */
528 	rw_exit(&gcgrp->gcgrp_rwlock);
529 
530 	DTRACE_PROBE3(tx__gcdb__log__info__gc__remove, char *,
531 	    "removed inactive gc(1) from gcgrp(2)",
532 	    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
533 
534 	GCGRP_REFRELE(gcgrp);
535 
536 	gc->gc_grp = NULL;
537 	gc->gc_prev = gc->gc_next = NULL;
538 
539 	if (gc->gc_db != NULL)
540 		GCDB_REFRELE(gc->gc_db);
541 
542 	kmem_free(gc, sizeof (*gc));
543 }
544 
545 tsol_gcgrp_t *
546 gcgrp_lookup(tsol_gcgrp_addr_t *ga, boolean_t alloc)
547 {
548 	tsol_gcgrp_t *gcgrp = NULL;
549 	mod_hash_t *hashp;
550 
551 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
552 
553 	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
554 
555 	mutex_enter(&gcgrp_lock);
556 	if (mod_hash_find(hashp, (mod_hash_key_t)ga,
557 	    (mod_hash_val_t *)&gcgrp) == 0) {
558 		gcgrp->gcgrp_refcnt++;
559 		ASSERT(gcgrp->gcgrp_refcnt != 0);
560 
561 		DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__lookup, char *,
562 		    "found gcgrp(1) in hash(2)", tsol_gcgrp_t *, gcgrp,
563 		    mod_hash_t *, hashp);
564 
565 	} else if (alloc) {
566 		gcgrp = kmem_zalloc(sizeof (*gcgrp), KM_NOSLEEP);
567 		if (gcgrp != NULL) {
568 			gcgrp->gcgrp_refcnt = 1;
569 			rw_init(&gcgrp->gcgrp_rwlock, NULL, RW_DEFAULT, NULL);
570 			bcopy(ga, &gcgrp->gcgrp_addr, sizeof (*ga));
571 
572 			if (mod_hash_insert(hashp,
573 			    (mod_hash_key_t)&gcgrp->gcgrp_addr,
574 			    (mod_hash_val_t)gcgrp) != 0) {
575 				mutex_exit(&gcgrp_lock);
576 				kmem_free(gcgrp, sizeof (*gcgrp));
577 				return (NULL);
578 			}
579 
580 			DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__insert,
581 			    char *, "inserted gcgrp(1) in hash(2)",
582 			    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
583 		}
584 	}
585 	mutex_exit(&gcgrp_lock);
586 	return (gcgrp);
587 }
588 
589 void
590 gcgrp_inactive(tsol_gcgrp_t *gcgrp)
591 {
592 	tsol_gcgrp_addr_t *ga;
593 	mod_hash_t *hashp;
594 
595 	ASSERT(MUTEX_HELD(&gcgrp_lock));
596 	ASSERT(gcgrp != NULL && gcgrp->gcgrp_refcnt == 0);
597 	ASSERT(gcgrp->gcgrp_head == NULL && gcgrp->gcgrp_count == 0);
598 
599 	ga = &gcgrp->gcgrp_addr;
600 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
601 
602 	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
603 	(void) mod_hash_remove(hashp, (mod_hash_key_t)ga,
604 	    (mod_hash_val_t *)&gcgrp);
605 	rw_destroy(&gcgrp->gcgrp_rwlock);
606 
607 	DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__remove, char *,
608 	    "removed inactive gcgrp(1) from hash(2)",
609 	    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
610 
611 	kmem_free(gcgrp, sizeof (*gcgrp));
612 }
613 
614 
615 /*
616  * Assign a sensitivity label to inbound traffic which arrived without
617  * an explicit on-the-wire label.
618  *
619  * In the case of CIPSO-type hosts, we assume packets arriving without
620  * a label are at the most sensitive label known for the host, most
621  * likely involving out-of-band key management traffic (such as IKE,
622  * etc.,)
623  */
624 static boolean_t
625 tsol_find_unlabeled_label(tsol_tpc_t *rhtp, bslabel_t *sl, uint32_t *doi)
626 {
627 	*doi = rhtp->tpc_tp.tp_doi;
628 	switch (rhtp->tpc_tp.host_type) {
629 	case UNLABELED:
630 		*sl = rhtp->tpc_tp.tp_def_label;
631 		break;
632 	case SUN_CIPSO:
633 		*sl = rhtp->tpc_tp.tp_sl_range_cipso.upper_bound;
634 		break;
635 	default:
636 		return (B_FALSE);
637 	}
638 	setbltype(sl, SUN_SL_ID);
639 	return (B_TRUE);
640 }
641 
642 /*
643  * Converts CIPSO option to sensitivity label.
644  * Validity checks based on restrictions defined in
645  * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2) (draft-ietf-cipso-ipsecurity)
646  */
647 static boolean_t
648 cipso_to_sl(const uchar_t *option, bslabel_t *sl)
649 {
650 	const struct cipso_option *co = (const struct cipso_option *)option;
651 	const struct cipso_tag_type_1 *tt1;
652 
653 	tt1 = (struct cipso_tag_type_1 *)&co->cipso_tag_type[0];
654 	if (tt1->tag_type != 1 ||
655 	    tt1->tag_length < TSOL_TT1_MIN_LENGTH ||
656 	    tt1->tag_length > TSOL_TT1_MAX_LENGTH ||
657 	    tt1->tag_length + TSOL_CIPSO_TAG_OFFSET > co->cipso_length)
658 		return (B_FALSE);
659 
660 	bsllow(sl);	/* assumed: sets compartments to all zeroes */
661 	LCLASS_SET((_bslabel_impl_t *)sl, tt1->tag_sl);
662 	bcopy(tt1->tag_cat, &((_bslabel_impl_t *)sl)->compartments,
663 	    tt1->tag_length - TSOL_TT1_MIN_LENGTH);
664 	return (B_TRUE);
665 }
666 
667 /*
668  * If present, parse the CIPSO label in the incoming packet and
669  * construct a ts_label_t that reflects the CIPSO label and put it in
670  * the ip_recv_attr_t. Later as the packet flows up through the stack any
671  * code that needs to examine the packet label can inspect the label
672  * from the ira_tsl. This function is
673  * called right in ip_input for all packets, i.e. locally destined and
674  * to be forwarded packets. The forwarding path needs to examine the label
675  * to determine how to forward the packet.
676  *
677  * This routine pulls all message text up into the first mblk.
678  * For IPv4, only the first 20 bytes of the IP header are guaranteed
679  * to exist. For IPv6, only the IPv6 header is guaranteed to exist.
680  */
681 boolean_t
682 tsol_get_pkt_label(mblk_t *mp, int version, ip_recv_attr_t *ira)
683 {
684 	tsol_tpc_t	*src_rhtp = NULL;
685 	uchar_t		*opt_ptr = NULL;
686 	const ipha_t	*ipha;
687 	bslabel_t	sl;
688 	uint32_t	doi;
689 	tsol_ip_label_t	label_type;
690 	uint32_t	label_flags = 0; /* flags to set in label */
691 	const cipso_option_t *co;
692 	const void	*src;
693 	const ip6_t	*ip6h;
694 	cred_t		*credp;
695 	int 		proto;
696 
697 	ASSERT(DB_TYPE(mp) == M_DATA);
698 
699 	if (mp->b_cont != NULL && !pullupmsg(mp, -1))
700 		return (B_FALSE);
701 
702 	if (version == IPV4_VERSION) {
703 		ASSERT(MBLKL(mp) >= IP_SIMPLE_HDR_LENGTH);
704 		ipha = (const ipha_t *)mp->b_rptr;
705 		src = &ipha->ipha_src;
706 		if (!tsol_get_option_v4(mp, &label_type, &opt_ptr))
707 			return (B_FALSE);
708 	} else {
709 		ASSERT(MBLKL(mp) >= IPV6_HDR_LEN);
710 		ip6h = (const ip6_t *)mp->b_rptr;
711 		src = &ip6h->ip6_src;
712 		if (!tsol_get_option_v6(mp, &label_type, &opt_ptr))
713 			return (B_FALSE);
714 	}
715 
716 	switch (label_type) {
717 	case OPT_CIPSO:
718 		/*
719 		 * Convert the CIPSO label to the internal format
720 		 * and attach it to the dblk cred.
721 		 * Validity checks based on restrictions defined in
722 		 * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2)
723 		 * (draft-ietf-cipso-ipsecurity)
724 		 */
725 		if (version == IPV6_VERSION && ip6opt_ls == 0)
726 			return (B_FALSE);
727 		co = (const struct cipso_option *)opt_ptr;
728 		if ((co->cipso_length <
729 		    TSOL_CIPSO_TAG_OFFSET + TSOL_TT1_MIN_LENGTH) ||
730 		    (co->cipso_length > IP_MAX_OPT_LENGTH))
731 			return (B_FALSE);
732 		bcopy(co->cipso_doi, &doi, sizeof (doi));
733 		doi = ntohl(doi);
734 		if (!cipso_to_sl(opt_ptr, &sl))
735 			return (B_FALSE);
736 		setbltype(&sl, SUN_SL_ID);
737 
738 		/*
739 		 * If the source was unlabeled, then flag as such,
740 		 * (since CIPSO routers may add headers)
741 		 */
742 
743 		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
744 			return (B_FALSE);
745 
746 		if (src_rhtp->tpc_tp.host_type == UNLABELED)
747 			label_flags = TSLF_UNLABELED;
748 
749 		TPC_RELE(src_rhtp);
750 
751 		break;
752 
753 	case OPT_NONE:
754 		/*
755 		 * Handle special cases that may not be labeled, even
756 		 * though the sending system may otherwise be configured as
757 		 * labeled.
758 		 *	- IGMP
759 		 *	- IPv4 ICMP Router Discovery
760 		 *	- IPv6 Neighbor Discovery
761 		 *	- IPsec ESP
762 		 */
763 		if (version == IPV4_VERSION) {
764 			proto = ipha->ipha_protocol;
765 			if (proto == IPPROTO_IGMP)
766 				return (B_TRUE);
767 			if (proto == IPPROTO_ICMP) {
768 				const struct icmp *icmp = (const struct icmp *)
769 				    (mp->b_rptr + IPH_HDR_LENGTH(ipha));
770 
771 				if ((uchar_t *)icmp + ICMP_MINLEN > mp->b_wptr)
772 					return (B_FALSE);
773 				if (icmp->icmp_type == ICMP_ROUTERADVERT ||
774 				    icmp->icmp_type == ICMP_ROUTERSOLICIT)
775 					return (B_TRUE);
776 			}
777 		} else {
778 			proto = ip6h->ip6_nxt;
779 			if (proto == IPPROTO_ICMPV6) {
780 				const icmp6_t *icmp6 = (const icmp6_t *)
781 				    (mp->b_rptr + IPV6_HDR_LEN);
782 
783 				if ((uchar_t *)icmp6 + ICMP6_MINLEN >
784 				    mp->b_wptr)
785 					return (B_FALSE);
786 				if (icmp6->icmp6_type >= MLD_LISTENER_QUERY &&
787 				    icmp6->icmp6_type <= ICMP6_MAX_INFO_TYPE)
788 					return (B_TRUE);
789 			}
790 		}
791 
792 		/*
793 		 * Look up the tnrhtp database and get the implicit label
794 		 * that is associated with the sending host and attach
795 		 * it to the packet.
796 		 */
797 		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
798 			return (B_FALSE);
799 
800 		/*
801 		 * If peer is label-aware, mark as "implicit" rather than
802 		 * "unlabeled" to cause appropriate mac-exempt processing
803 		 * to happen.
804 		 */
805 		if (src_rhtp->tpc_tp.host_type == SUN_CIPSO)
806 			label_flags = TSLF_IMPLICIT_IN;
807 		else if (src_rhtp->tpc_tp.host_type == UNLABELED)
808 			label_flags = TSLF_UNLABELED;
809 		else {
810 			DTRACE_PROBE2(tx__get__pkt__label, char *,
811 			    "template(1) has unknown hosttype",
812 			    tsol_tpc_t *, src_rhtp);
813 		}
814 
815 
816 		if (!tsol_find_unlabeled_label(src_rhtp, &sl, &doi)) {
817 			TPC_RELE(src_rhtp);
818 			return (B_FALSE);
819 		}
820 		TPC_RELE(src_rhtp);
821 		break;
822 
823 	default:
824 		return (B_FALSE);
825 	}
826 
827 	if (ira->ira_cred == NULL) {
828 		credp = newcred_from_bslabel(&sl, doi, KM_NOSLEEP);
829 		if (credp == NULL)
830 			return (B_FALSE);
831 	} else {
832 		cred_t	*newcr;
833 
834 		newcr = copycred_from_bslabel(ira->ira_cred, &sl, doi,
835 		    KM_NOSLEEP);
836 		if (newcr == NULL)
837 			return (B_FALSE);
838 		if (ira->ira_free_flags & IRA_FREE_CRED) {
839 			crfree(ira->ira_cred);
840 			ira->ira_free_flags &= ~IRA_FREE_CRED;
841 			ira->ira_cred = NULL;
842 		}
843 		credp = newcr;
844 	}
845 
846 	/*
847 	 * Put the label in ira_tsl for convinience, while keeping
848 	 * the cred in ira_cred for getpeerucred which is used to get
849 	 * labels with TX.
850 	 * Note: no explicit refcnt/free_flag for ira_tsl. The free_flag
851 	 * for IRA_FREE_CRED is sufficient for both.
852 	 */
853 	ira->ira_tsl = crgetlabel(credp);
854 	ira->ira_cred = credp;
855 	ira->ira_free_flags |= IRA_FREE_CRED;
856 
857 	ira->ira_tsl->tsl_flags |= label_flags;
858 	return (B_TRUE);
859 }
860 
861 /*
862  * This routine determines whether the given packet should be accepted locally.
863  * It does a range/set check on the packet's label by looking up the given
864  * address in the remote host database.
865  */
866 boolean_t
867 tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
868     ip_recv_attr_t *ira, const conn_t *connp)
869 {
870 	const cred_t *credp;
871 	ts_label_t *plabel, *conn_plabel;
872 	tsol_tpc_t *tp;
873 	boolean_t retv;
874 	const bslabel_t *label, *conn_label;
875 	boolean_t shared_addr = (ira->ira_flags & IRAF_TX_SHARED_ADDR);
876 
877 	/*
878 	 * tsol_get_pkt_label intentionally avoids the labeling process for:
879 	 *	- IPv6 router and neighbor discovery as well as redirects.
880 	 *	- MLD packets. (Anything between ICMPv6 code 130 and 138.)
881 	 *	- IGMP packets.
882 	 *	- IPv4 router discovery.
883 	 * In those cases ire_cred is NULL.
884 	 */
885 	credp = ira->ira_cred;
886 	if (credp == NULL)
887 		return (B_TRUE);
888 
889 	/*
890 	 * If this packet is from the inside (not a remote host) and has the
891 	 * same zoneid as the selected destination, then no checks are
892 	 * necessary.  Membership in the zone is enough proof.  This is
893 	 * intended to be a hot path through this function.
894 	 * Note: Using crgetzone here is ok since the peer is local.
895 	 */
896 	if (!crisremote(credp) &&
897 	    crgetzone(credp) == crgetzone(connp->conn_cred))
898 		return (B_TRUE);
899 
900 	plabel = ira->ira_tsl;
901 	conn_plabel = crgetlabel(connp->conn_cred);
902 	ASSERT(plabel != NULL && conn_plabel != NULL);
903 
904 	label = label2bslabel(plabel);
905 	conn_label = label2bslabel(conn_plabel);
906 
907 
908 	/*
909 	 * Implicitly labeled packets from label-aware sources
910 	 * go only to privileged receivers
911 	 */
912 	if ((plabel->tsl_flags & TSLF_IMPLICIT_IN) &&
913 	    (connp->conn_mac_mode != CONN_MAC_IMPLICIT)) {
914 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac_impl,
915 		    char *,
916 		    "implicitly labeled packet mp(1) for conn(2) "
917 		    "which isn't in implicit mac mode",
918 		    mblk_t *, mp, conn_t *, connp);
919 
920 		return (B_FALSE);
921 	}
922 
923 
924 	/*
925 	 * MLPs are always validated using the range and set of the local
926 	 * address, even when the remote host is unlabeled.
927 	 */
928 	if (connp->conn_mlp_type == mlptBoth ||
929 	/* LINTED: no consequent */
930 	    connp->conn_mlp_type == (shared_addr ? mlptShared : mlptPrivate)) {
931 		;
932 
933 	/*
934 	 * If this is a packet from an unlabeled sender, then we must apply
935 	 * different rules.  If the label is equal to the zone's label, then
936 	 * it's allowed.  If it's not equal, but the zone is either the global
937 	 * zone or the label is dominated by the zone's label, then allow it
938 	 * as long as it's in the range configured for the destination.
939 	 */
940 	} else if (plabel->tsl_flags & TSLF_UNLABELED) {
941 		if (plabel->tsl_doi == conn_plabel->tsl_doi &&
942 		    blequal(label, conn_label))
943 			return (B_TRUE);
944 
945 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
946 		    (!connp->conn_zone_is_global &&
947 		    (plabel->tsl_doi != conn_plabel->tsl_doi ||
948 		    !bldominates(conn_label, label)))) {
949 			DTRACE_PROBE3(
950 			    tx__ip__log__drop__receivelocal__mac_unl,
951 			    char *,
952 			    "unlabeled packet mp(1) fails mac for conn(2)",
953 			    mblk_t *, mp, conn_t *, connp);
954 			return (B_FALSE);
955 		}
956 
957 	/*
958 	 * If this is a packet from a labeled sender, verify the
959 	 * label on the packet matches the connection label.
960 	 */
961 	} else {
962 		if (plabel->tsl_doi != conn_plabel->tsl_doi ||
963 		    !blequal(label, conn_label)) {
964 			DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac__slp,
965 			    char *,
966 			    "packet mp(1) failed label match to SLP conn(2)",
967 			    mblk_t *, mp, conn_t *, connp);
968 			return (B_FALSE);
969 		}
970 		/*
971 		 * No further checks will be needed if this is a zone-
972 		 * specific address because (1) The process for bringing up
973 		 * the interface ensures the zone's label is within the zone-
974 		 * specific address's valid label range; (2) For cases where
975 		 * the conn is bound to the unspecified addresses, ip fanout
976 		 * logic ensures conn's zoneid equals the dest addr's zoneid;
977 		 * (3) Mac-exempt and mlp logic above already handle all
978 		 * cases where the zone label may not be the same as the
979 		 * conn label.
980 		 */
981 		if (!shared_addr)
982 			return (B_TRUE);
983 	}
984 
985 	tp = find_tpc(addr, version, B_FALSE);
986 	if (tp == NULL) {
987 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__no__tnr,
988 		    char *, "dropping mp(1), host(2) lacks entry",
989 		    mblk_t *, mp, void *, addr);
990 		return (B_FALSE);
991 	}
992 
993 	/*
994 	 * The local host address should not be unlabeled at this point.  The
995 	 * only way this can happen is that the destination isn't unicast.  We
996 	 * assume that the packet should not have had a label, and thus should
997 	 * have been handled by the TSLF_UNLABELED logic above.
998 	 */
999 	if (tp->tpc_tp.host_type == UNLABELED) {
1000 		retv = B_FALSE;
1001 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__flag, char *,
1002 		    "mp(1) unlabeled source, but tp is not unlabeled.",
1003 		    mblk_t *, mp, tsol_tpc_t *, tp);
1004 
1005 	} else if (tp->tpc_tp.host_type != SUN_CIPSO) {
1006 		retv = B_FALSE;
1007 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__tptype, char *,
1008 		    "delivering mp(1), found unrecognized tpc(2) type.",
1009 		    mblk_t *, mp, tsol_tpc_t *, tp);
1010 
1011 	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
1012 		retv = B_FALSE;
1013 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
1014 		    "mp(1) could not be delievered to tp(2), doi mismatch",
1015 		    mblk_t *, mp, tsol_tpc_t *, tp);
1016 
1017 	} else if (!_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) &&
1018 	    !blinlset(label, tp->tpc_tp.tp_sl_set_cipso)) {
1019 		retv = B_FALSE;
1020 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
1021 		    "mp(1) could not be delievered to tp(2), bad mac",
1022 		    mblk_t *, mp, tsol_tpc_t *, tp);
1023 	} else {
1024 		retv = B_TRUE;
1025 	}
1026 
1027 	TPC_RELE(tp);
1028 
1029 	return (retv);
1030 }
1031 
1032 boolean_t
1033 tsol_can_accept_raw(mblk_t *mp, ip_recv_attr_t *ira, boolean_t check_host)
1034 {
1035 	ts_label_t	*plabel = NULL;
1036 	tsol_tpc_t	*src_rhtp, *dst_rhtp;
1037 	boolean_t	retv;
1038 
1039 	plabel = ira->ira_tsl;
1040 
1041 	/* We are bootstrapping or the internal template was never deleted */
1042 	if (plabel == NULL)
1043 		return (B_TRUE);
1044 
1045 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1046 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
1047 
1048 		src_rhtp = find_tpc(&ipha->ipha_src, IPV4_VERSION,
1049 		    B_FALSE);
1050 		if (src_rhtp == NULL)
1051 			return (B_FALSE);
1052 		dst_rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION,
1053 		    B_FALSE);
1054 	} else {
1055 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1056 
1057 		src_rhtp = find_tpc(&ip6h->ip6_src, IPV6_VERSION,
1058 		    B_FALSE);
1059 		if (src_rhtp == NULL)
1060 			return (B_FALSE);
1061 		dst_rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION,
1062 		    B_FALSE);
1063 	}
1064 	if (dst_rhtp == NULL) {
1065 		TPC_RELE(src_rhtp);
1066 		return (B_FALSE);
1067 	}
1068 
1069 	if (label2doi(plabel) != src_rhtp->tpc_tp.tp_doi) {
1070 		retv = B_FALSE;
1071 
1072 	/*
1073 	 * Check that the packet's label is in the correct range for labeled
1074 	 * sender, or is equal to the default label for unlabeled sender.
1075 	 */
1076 	} else if ((src_rhtp->tpc_tp.host_type != UNLABELED &&
1077 	    !_blinrange(label2bslabel(plabel),
1078 	    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
1079 	    !blinlset(label2bslabel(plabel),
1080 	    src_rhtp->tpc_tp.tp_sl_set_cipso)) ||
1081 	    (src_rhtp->tpc_tp.host_type == UNLABELED &&
1082 	    !blequal(&plabel->tsl_label, &src_rhtp->tpc_tp.tp_def_label))) {
1083 		retv = B_FALSE;
1084 
1085 	} else if (check_host) {
1086 		retv = B_TRUE;
1087 
1088 	/*
1089 	 * Until we have SL range in the Zone structure, pass it
1090 	 * when our own address lookup returned an internal entry.
1091 	 */
1092 	} else switch (dst_rhtp->tpc_tp.host_type) {
1093 	case UNLABELED:
1094 		retv = B_TRUE;
1095 		break;
1096 
1097 	case SUN_CIPSO:
1098 		retv = _blinrange(label2bslabel(plabel),
1099 		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) ||
1100 		    blinlset(label2bslabel(plabel),
1101 		    dst_rhtp->tpc_tp.tp_sl_set_cipso);
1102 		break;
1103 
1104 	default:
1105 		retv = B_FALSE;
1106 	}
1107 	TPC_RELE(src_rhtp);
1108 	TPC_RELE(dst_rhtp);
1109 	return (retv);
1110 }
1111 
1112 /*
1113  * This routine determines whether a response to a failed packet delivery or
1114  * connection should be sent back.  By default, the policy is to allow such
1115  * messages to be sent at all times, as these messages reveal little useful
1116  * information and are healthy parts of TCP/IP networking.
1117  *
1118  * If tsol_strict_error is set, then we do strict tests: if the packet label is
1119  * within the label range/set of this host/zone, return B_TRUE; otherwise
1120  * return B_FALSE, which causes the packet to be dropped silently.
1121  *
1122  * Note that tsol_get_pkt_label will cause the packet to drop if the sender is
1123  * marked as labeled in the remote host database, but the packet lacks a label.
1124  * This means that we don't need to do a lookup on the source; the
1125  * TSLF_UNLABELED flag is sufficient.
1126  */
1127 boolean_t
1128 tsol_can_reply_error(const mblk_t *mp, ip_recv_attr_t *ira)
1129 {
1130 	ts_label_t	*plabel = NULL;
1131 	tsol_tpc_t	*rhtp;
1132 	const ipha_t	*ipha;
1133 	const ip6_t	*ip6h;
1134 	boolean_t	retv;
1135 	bslabel_t	*pktbs;
1136 
1137 	/* Caller must pull up at least the IP header */
1138 	ASSERT(MBLKL(mp) >= (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ?
1139 	    sizeof (*ipha) : sizeof (*ip6h)));
1140 
1141 	if (!tsol_strict_error)
1142 		return (B_TRUE);
1143 
1144 	plabel = ira->ira_tsl;
1145 
1146 	/* We are bootstrapping or the internal template was never deleted */
1147 	if (plabel == NULL)
1148 		return (B_TRUE);
1149 
1150 	if (plabel->tsl_flags & TSLF_IMPLICIT_IN) {
1151 		DTRACE_PROBE3(tx__ip__log__drop__replyerror__unresolved__label,
1152 		    char *,
1153 		    "cannot send error report for packet mp(1) with "
1154 		    "unresolved security label sl(2)",
1155 		    mblk_t *, mp, ts_label_t *, plabel);
1156 		return (B_FALSE);
1157 	}
1158 
1159 
1160 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1161 		ipha = (const ipha_t *)mp->b_rptr;
1162 		rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION, B_FALSE);
1163 	} else {
1164 		ip6h = (const ip6_t *)mp->b_rptr;
1165 		rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION, B_FALSE);
1166 	}
1167 
1168 	if (rhtp == NULL || label2doi(plabel) != rhtp->tpc_tp.tp_doi) {
1169 		retv = B_FALSE;
1170 	} else {
1171 		/*
1172 		 * If we're in the midst of forwarding, then the destination
1173 		 * address might not be labeled.  In that case, allow unlabeled
1174 		 * packets through only if the default label is the same, and
1175 		 * labeled ones if they dominate.
1176 		 */
1177 		pktbs = label2bslabel(plabel);
1178 		switch (rhtp->tpc_tp.host_type) {
1179 		case UNLABELED:
1180 			if (plabel->tsl_flags & TSLF_UNLABELED) {
1181 				retv = blequal(pktbs,
1182 				    &rhtp->tpc_tp.tp_def_label);
1183 			} else {
1184 				retv = bldominates(pktbs,
1185 				    &rhtp->tpc_tp.tp_def_label);
1186 			}
1187 			break;
1188 
1189 		case SUN_CIPSO:
1190 			retv = _blinrange(pktbs,
1191 			    &rhtp->tpc_tp.tp_sl_range_cipso) ||
1192 			    blinlset(pktbs, rhtp->tpc_tp.tp_sl_set_cipso);
1193 			break;
1194 
1195 		default:
1196 			retv = B_FALSE;
1197 			break;
1198 		}
1199 	}
1200 
1201 	if (rhtp != NULL)
1202 		TPC_RELE(rhtp);
1203 
1204 	return (retv);
1205 }
1206 
1207 /*
1208  * Finds the zone associated with the receive attributes.  Returns GLOBAL_ZONEID
1209  * if the zone cannot be located.
1210  *
1211  * This is used by the classifier when the packet matches an ALL_ZONES IRE, and
1212  * there's no MLP defined.
1213  *
1214  * Note that we assume that this is only invoked in the ALL_ZONES case.
1215  * Handling other cases would require handling exclusive IP zones where either
1216  * this routine or the callers would have to map from
1217  * the zoneid (zone->zone_id) to what IP uses in conn_zoneid etc.
1218  */
1219 zoneid_t
1220 tsol_attr_to_zoneid(const ip_recv_attr_t *ira)
1221 {
1222 	zone_t *zone;
1223 	ts_label_t *label;
1224 
1225 	if ((label = ira->ira_tsl) != NULL) {
1226 		zone = zone_find_by_label(label);
1227 		if (zone != NULL) {
1228 			zoneid_t zoneid = zone->zone_id;
1229 
1230 			zone_rele(zone);
1231 			return (zoneid);
1232 		}
1233 	}
1234 	return (GLOBAL_ZONEID);
1235 }
1236 
1237 int
1238 tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
1239 {
1240 	int		error = 0;
1241 	tsol_ire_gw_secattr_t *attrp = NULL;
1242 	tsol_tnrhc_t	*gw_rhc = NULL;
1243 	tsol_gcgrp_t	*gcgrp = NULL;
1244 	tsol_gc_t	*gc = NULL;
1245 	in_addr_t	ga_addr4;
1246 	void		*paddr = NULL;
1247 
1248 	/* Not in Trusted mode or IRE is local/loopback/broadcast/interface */
1249 	if (!is_system_labeled() ||
1250 	    (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
1251 	    IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)))
1252 		goto done;
1253 
1254 	/*
1255 	 * If we don't have a label to compare with, or the IRE does not
1256 	 * contain any gateway security attributes, there's not much that
1257 	 * we can do.  We let the former case pass, and the latter fail,
1258 	 * since the IRE doesn't qualify for a match due to the lack of
1259 	 * security attributes.
1260 	 */
1261 	if (tsl == NULL || ire->ire_gw_secattr == NULL) {
1262 		if (tsl != NULL) {
1263 			DTRACE_PROBE3(
1264 			    tx__ip__log__drop__irematch__nogwsec, char *,
1265 			    "ire(1) lacks ire_gw_secattr when matching "
1266 			    "label(2)", ire_t *, ire, ts_label_t *, tsl);
1267 			error = EACCES;
1268 		}
1269 		goto done;
1270 	}
1271 
1272 	attrp = ire->ire_gw_secattr;
1273 
1274 	/*
1275 	 * The possible lock order scenarios related to the tsol gateway
1276 	 * attribute locks are documented at the beginning of ip.c in the
1277 	 * lock order scenario section.
1278 	 */
1279 	mutex_enter(&attrp->igsa_lock);
1280 
1281 	/*
1282 	 * We seek the group
1283 	 * structure which contains all security credentials of the gateway.
1284 	 * An offline IRE is associated with at most one gateway credential.
1285 	 */
1286 	if ((gc = attrp->igsa_gc) != NULL) {
1287 		gcgrp = gc->gc_grp;
1288 		ASSERT(gcgrp != NULL);
1289 		rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1290 		GCGRP_REFHOLD(gcgrp);
1291 	}
1292 
1293 	if ((gw_rhc = attrp->igsa_rhc) != NULL) {
1294 		/*
1295 		 * If our cached entry has grown stale, then discard it so we
1296 		 * can get a new one.
1297 		 */
1298 		if (gw_rhc->rhc_invalid || gw_rhc->rhc_tpc->tpc_invalid) {
1299 			TNRHC_RELE(gw_rhc);
1300 			attrp->igsa_rhc = gw_rhc = NULL;
1301 		} else {
1302 			TNRHC_HOLD(gw_rhc)
1303 		}
1304 	}
1305 
1306 	/* Last attempt at loading the template had failed; try again */
1307 	if (gw_rhc == NULL) {
1308 		if (gcgrp != NULL) {
1309 			tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;
1310 
1311 			if (ire->ire_ipversion == IPV4_VERSION) {
1312 				ASSERT(ga->ga_af == AF_INET);
1313 				IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
1314 				paddr = &ga_addr4;
1315 			} else {
1316 				ASSERT(ga->ga_af == AF_INET6);
1317 				paddr = &ga->ga_addr;
1318 			}
1319 		} else if (ire->ire_type & IRE_OFFLINK) {
1320 			if (ire->ire_ipversion == IPV6_VERSION)
1321 				paddr = &ire->ire_gateway_addr_v6;
1322 			else if (ire->ire_ipversion == IPV4_VERSION)
1323 				paddr = &ire->ire_gateway_addr;
1324 		}
1325 
1326 		/* We've found a gateway address to do the template lookup */
1327 		if (paddr != NULL) {
1328 			ASSERT(gw_rhc == NULL);
1329 			gw_rhc = find_rhc(paddr, ire->ire_ipversion, B_FALSE);
1330 			if (gw_rhc != NULL) {
1331 				/*
1332 				 * Note that if the lookup above returned an
1333 				 * internal template, we'll use it for the
1334 				 * time being, and do another lookup next
1335 				 * time around.
1336 				 */
1337 				/* Another thread has loaded the template? */
1338 				if (attrp->igsa_rhc != NULL) {
1339 					TNRHC_RELE(gw_rhc)
1340 					/* reload, it could be different */
1341 					gw_rhc = attrp->igsa_rhc;
1342 				} else {
1343 					attrp->igsa_rhc = gw_rhc;
1344 				}
1345 				/*
1346 				 * Hold an extra reference just like we did
1347 				 * above prior to dropping the igsa_lock.
1348 				 */
1349 				TNRHC_HOLD(gw_rhc)
1350 			}
1351 		}
1352 	}
1353 
1354 	mutex_exit(&attrp->igsa_lock);
1355 	/* Gateway template not found */
1356 	if (gw_rhc == NULL) {
1357 		/*
1358 		 * If destination address is directly reachable through an
1359 		 * interface rather than through a learned route, pass it.
1360 		 */
1361 		if (paddr != NULL) {
1362 			DTRACE_PROBE3(
1363 			    tx__ip__log__drop__irematch__nogwtmpl, char *,
1364 			    "ire(1), label(2) off-link with no gw_rhc",
1365 			    ire_t *, ire, ts_label_t *, tsl);
1366 			error = EINVAL;
1367 		}
1368 		goto done;
1369 	}
1370 
1371 	if (gc != NULL) {
1372 
1373 		tsol_gcdb_t *gcdb;
1374 		/*
1375 		 * In the case of IRE_CACHE we've got one or more gateway
1376 		 * security credentials to compare against the passed in label.
1377 		 * Perform label range comparison against each security
1378 		 * credential of the gateway. In the case of a prefix ire
1379 		 * we need to match against the security attributes of
1380 		 * just the route itself, so the loop is executed only once.
1381 		 */
1382 		ASSERT(gcgrp != NULL);
1383 		gcdb = gc->gc_db;
1384 		if (tsl->tsl_doi != gcdb->gcdb_doi ||
1385 		    !_blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange)) {
1386 			DTRACE_PROBE3(
1387 			    tx__ip__log__drop__irematch__nogcmatched,
1388 			    char *, "ire(1), tsl(2): all gc failed match",
1389 			    ire_t *, ire, ts_label_t *, tsl);
1390 			error = EACCES;
1391 		}
1392 	} else {
1393 		/*
1394 		 * We didn't find any gateway credentials in the IRE
1395 		 * attributes; fall back to the gateway's template for
1396 		 * label range checks, if we are required to do so.
1397 		 */
1398 		ASSERT(gw_rhc != NULL);
1399 		switch (gw_rhc->rhc_tpc->tpc_tp.host_type) {
1400 		case SUN_CIPSO:
1401 			if (tsl->tsl_doi != gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
1402 			    (!_blinrange(&tsl->tsl_label,
1403 			    &gw_rhc->rhc_tpc->tpc_tp.tp_sl_range_cipso) &&
1404 			    !blinlset(&tsl->tsl_label,
1405 			    gw_rhc->rhc_tpc->tpc_tp.tp_sl_set_cipso))) {
1406 				error = EACCES;
1407 				DTRACE_PROBE4(
1408 				    tx__ip__log__drop__irematch__deftmpl,
1409 				    char *, "ire(1), tsl(2), gw_rhc(3) "
1410 				    "failed match (cipso gw)",
1411 				    ire_t *, ire, ts_label_t *, tsl,
1412 				    tsol_tnrhc_t *, gw_rhc);
1413 			}
1414 			break;
1415 
1416 		case UNLABELED:
1417 			if (tsl->tsl_doi != gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
1418 			    (!_blinrange(&tsl->tsl_label,
1419 			    &gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_range) &&
1420 			    !blinlset(&tsl->tsl_label,
1421 			    gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_set))) {
1422 				error = EACCES;
1423 				DTRACE_PROBE4(
1424 				    tx__ip__log__drop__irematch__deftmpl,
1425 				    char *, "ire(1), tsl(2), gw_rhc(3) "
1426 				    "failed match (unlabeled gw)",
1427 				    ire_t *, ire, ts_label_t *, tsl,
1428 				    tsol_tnrhc_t *, gw_rhc);
1429 			}
1430 			break;
1431 		}
1432 	}
1433 
1434 done:
1435 
1436 	if (gcgrp != NULL) {
1437 		rw_exit(&gcgrp->gcgrp_rwlock);
1438 		GCGRP_REFRELE(gcgrp);
1439 	}
1440 
1441 	if (gw_rhc != NULL)
1442 		TNRHC_RELE(gw_rhc)
1443 
1444 	return (error);
1445 }
1446 
1447 /*
1448  * Performs label accreditation checks for packet forwarding.
1449  * Add or remove a CIPSO option as needed.
1450  *
1451  * Returns a pointer to the modified mblk if allowed for forwarding,
1452  * or NULL if the packet must be dropped.
1453  */
1454 mblk_t *
1455 tsol_ip_forward(ire_t *ire, mblk_t *mp, const ip_recv_attr_t *ira)
1456 {
1457 	tsol_ire_gw_secattr_t *attrp = NULL;
1458 	ipha_t		*ipha;
1459 	ip6_t		*ip6h;
1460 	const void	*pdst;
1461 	const void	*psrc;
1462 	boolean_t	off_link;
1463 	tsol_tpc_t	*dst_rhtp, *gw_rhtp;
1464 	tsol_ip_label_t label_type;
1465 	uchar_t		*opt_ptr = NULL;
1466 	ts_label_t	*tsl;
1467 	uint8_t		proto;
1468 	int		af, adjust;
1469 	uint16_t	iplen;
1470 	boolean_t	need_tpc_rele = B_FALSE;
1471 	ipaddr_t	*gw;
1472 	ip_stack_t	*ipst = ire->ire_ipst;
1473 	int		err;
1474 	ts_label_t	*effective_tsl = NULL;
1475 
1476 	ASSERT(ire != NULL && mp != NULL);
1477 	/*
1478 	 * Note that the ire is the first one found, i.e., an IRE_OFFLINK if
1479 	 * the destination is offlink.
1480 	 */
1481 
1482 	af = (ire->ire_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6;
1483 
1484 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1485 		ASSERT(ire->ire_ipversion == IPV4_VERSION);
1486 		ipha = (ipha_t *)mp->b_rptr;
1487 		psrc = &ipha->ipha_src;
1488 		pdst = &ipha->ipha_dst;
1489 		proto = ipha->ipha_protocol;
1490 		if (!tsol_get_option_v4(mp, &label_type, &opt_ptr))
1491 			return (NULL);
1492 	} else {
1493 		ASSERT(ire->ire_ipversion == IPV6_VERSION);
1494 		ip6h = (ip6_t *)mp->b_rptr;
1495 		psrc = &ip6h->ip6_src;
1496 		pdst = &ip6h->ip6_dst;
1497 		proto = ip6h->ip6_nxt;
1498 
1499 		if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
1500 		    proto != IPPROTO_ICMPV6) {
1501 			uint8_t *nexthdrp;
1502 			uint16_t hdr_len;
1503 
1504 			if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len,
1505 			    &nexthdrp)) {
1506 				/* malformed packet; drop it */
1507 				return (NULL);
1508 			}
1509 			proto = *nexthdrp;
1510 		}
1511 		if (!tsol_get_option_v6(mp, &label_type, &opt_ptr))
1512 			return (NULL);
1513 	}
1514 	/*
1515 	 * off_link is TRUE if destination not directly reachable.
1516 	 */
1517 	off_link = (ire->ire_type & IRE_OFFLINK);
1518 
1519 	if ((tsl = ira->ira_tsl) == NULL)
1520 		return (mp);
1521 
1522 	if (tsl->tsl_flags & TSLF_IMPLICIT_IN) {
1523 		DTRACE_PROBE3(tx__ip__log__drop__forward__unresolved__label,
1524 		    char *,
1525 		    "cannot forward packet mp(1) with unresolved "
1526 		    "security label sl(2)",
1527 		    mblk_t *, mp, ts_label_t *, tsl);
1528 
1529 		return (NULL);
1530 	}
1531 
1532 
1533 	ASSERT(psrc != NULL && pdst != NULL);
1534 	dst_rhtp = find_tpc(pdst, ire->ire_ipversion, B_FALSE);
1535 
1536 	if (dst_rhtp == NULL) {
1537 		/*
1538 		 * Without a template we do not know if forwarding
1539 		 * violates MAC
1540 		 */
1541 		DTRACE_PROBE3(tx__ip__log__drop__forward__nodst, char *,
1542 		    "mp(1) dropped, no template for destination ip4|6(2)",
1543 		    mblk_t *, mp, void *, pdst);
1544 		return (NULL);
1545 	}
1546 
1547 	/*
1548 	 * Gateway template must have existed for off-link destinations,
1549 	 * since tsol_ire_match_gwattr has ensured such condition.
1550 	 */
1551 	if (ire->ire_ipversion == IPV4_VERSION && off_link) {
1552 		/*
1553 		 * Surya note: first check if we can get the gw_rhtp from
1554 		 * the ire_gw_secattr->igsa_rhc; if this is null, then
1555 		 * do a lookup based on the ire_addr (address of gw)
1556 		 */
1557 		if (ire->ire_gw_secattr != NULL &&
1558 		    ire->ire_gw_secattr->igsa_rhc != NULL) {
1559 			attrp = ire->ire_gw_secattr;
1560 			gw_rhtp = attrp->igsa_rhc->rhc_tpc;
1561 		} else  {
1562 			gw = &ire->ire_gateway_addr;
1563 			gw_rhtp = find_tpc(gw, ire->ire_ipversion, B_FALSE);
1564 			need_tpc_rele = B_TRUE;
1565 		}
1566 		if (gw_rhtp == NULL) {
1567 			DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1568 			    "mp(1) dropped, no gateway in ire attributes(2)",
1569 			    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1570 			mp = NULL;
1571 			goto keep_label;
1572 		}
1573 	}
1574 	if (ire->ire_ipversion == IPV6_VERSION &&
1575 	    ((attrp = ire->ire_gw_secattr) == NULL || attrp->igsa_rhc == NULL ||
1576 	    (gw_rhtp = attrp->igsa_rhc->rhc_tpc) == NULL) && off_link) {
1577 		DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1578 		    "mp(1) dropped, no gateway in ire attributes(2)",
1579 		    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1580 		mp = NULL;
1581 		goto keep_label;
1582 	}
1583 
1584 	/*
1585 	 * Check that the label for the packet is acceptable
1586 	 * by destination host; otherwise, drop it.
1587 	 */
1588 	switch (dst_rhtp->tpc_tp.host_type) {
1589 	case SUN_CIPSO:
1590 		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1591 		    (!_blinrange(&tsl->tsl_label,
1592 		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) &&
1593 		    !blinlset(&tsl->tsl_label,
1594 		    dst_rhtp->tpc_tp.tp_sl_set_cipso))) {
1595 			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1596 			    "labeled packet mp(1) dropped, label(2) fails "
1597 			    "destination(3) accredation check",
1598 			    mblk_t *, mp, ts_label_t *, tsl,
1599 			    tsol_tpc_t *, dst_rhtp);
1600 			mp = NULL;
1601 			goto keep_label;
1602 		}
1603 		break;
1604 
1605 
1606 	case UNLABELED:
1607 		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1608 		    !blequal(&dst_rhtp->tpc_tp.tp_def_label,
1609 		    &tsl->tsl_label)) {
1610 			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1611 			    "unlabeled packet mp(1) dropped, label(2) fails "
1612 			    "destination(3) accredation check",
1613 			    mblk_t *, mp, ts_label_t *, tsl,
1614 			    tsol_tpc_t *, dst_rhtp);
1615 			mp = NULL;
1616 			goto keep_label;
1617 		}
1618 		break;
1619 	}
1620 	if (label_type == OPT_CIPSO) {
1621 		/*
1622 		 * We keep the label on any of the following cases:
1623 		 *
1624 		 *   1. The destination is labeled (on/off-link).
1625 		 *   2. The unlabeled destination is off-link,
1626 		 *	and the next hop gateway is labeled.
1627 		 */
1628 		if (dst_rhtp->tpc_tp.host_type != UNLABELED ||
1629 		    (off_link &&
1630 		    gw_rhtp->tpc_tp.host_type != UNLABELED))
1631 			goto keep_label;
1632 
1633 		/*
1634 		 * Strip off the CIPSO option from the packet because: the
1635 		 * unlabeled destination host is directly reachable through
1636 		 * an interface (on-link); or, the unlabeled destination host
1637 		 * is not directly reachable (off-link), and the next hop
1638 		 * gateway is unlabeled.
1639 		 */
1640 		adjust = (af == AF_INET) ? tsol_remove_secopt(ipha, MBLKL(mp)) :
1641 		    tsol_remove_secopt_v6(ip6h, MBLKL(mp));
1642 
1643 		ASSERT(adjust <= 0);
1644 		if (adjust != 0) {
1645 
1646 			/* adjust is negative */
1647 			ASSERT((mp->b_wptr + adjust) >= mp->b_rptr);
1648 			mp->b_wptr += adjust;
1649 			/*
1650 			 * Note that caller adjusts ira_pktlen and
1651 			 * ira_ip_hdr_length
1652 			 *
1653 			 * For AF_INET6 note that tsol_remove_secopt_v6
1654 			 * adjusted ip6_plen.
1655 			 */
1656 			if (af == AF_INET) {
1657 				ipha = (ipha_t *)mp->b_rptr;
1658 				iplen = ntohs(ipha->ipha_length) + adjust;
1659 				ipha->ipha_length = htons(iplen);
1660 				ipha->ipha_hdr_checksum = 0;
1661 				ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1662 			}
1663 			DTRACE_PROBE3(tx__ip__log__info__forward__adjust,
1664 			    char *,
1665 			    "mp(1) adjusted(2) for CIPSO option removal",
1666 			    mblk_t *, mp, int, adjust);
1667 		}
1668 		goto keep_label;
1669 	}
1670 
1671 	ASSERT(label_type == OPT_NONE);
1672 	ASSERT(dst_rhtp != NULL);
1673 
1674 	/*
1675 	 * We need to add CIPSO option if the destination or the next hop
1676 	 * gateway is labeled.  Otherwise, pass the packet as is.
1677 	 */
1678 	if (dst_rhtp->tpc_tp.host_type == UNLABELED &&
1679 	    (!off_link || gw_rhtp->tpc_tp.host_type == UNLABELED))
1680 		goto keep_label;
1681 
1682 	/*
1683 	 * Since we are forwarding packets we use GLOBAL_ZONEID for
1684 	 * the IRE lookup in tsol_check_label.
1685 	 * Since mac_exempt is false the zoneid isn't used for anything
1686 	 * but the IRE lookup, hence we set zone_is_global to false.
1687 	 */
1688 	if (af == AF_INET) {
1689 		err = tsol_check_label_v4(tsl, GLOBAL_ZONEID, &mp,
1690 		    CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl);
1691 	} else {
1692 		err = tsol_check_label_v6(tsl, GLOBAL_ZONEID, &mp,
1693 		    CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl);
1694 	}
1695 	if (err != 0) {
1696 		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1697 		ip_drop_output("tsol_check_label", mp, NULL);
1698 		freemsg(mp);
1699 		mp = NULL;
1700 		goto keep_label;
1701 	}
1702 
1703 	/*
1704 	 * The effective_tsl must never affect the routing decision, hence
1705 	 * we ignore it here.
1706 	 */
1707 	if (effective_tsl != NULL)
1708 		label_rele(effective_tsl);
1709 
1710 	if (af == AF_INET) {
1711 		ipha = (ipha_t *)mp->b_rptr;
1712 		ipha->ipha_hdr_checksum = 0;
1713 		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1714 	}
1715 
1716 keep_label:
1717 	TPC_RELE(dst_rhtp);
1718 	if (need_tpc_rele && gw_rhtp != NULL)
1719 		TPC_RELE(gw_rhtp);
1720 	return (mp);
1721 }
1722 
1723 /*
1724  * Name:	tsol_pmtu_adjust()
1725  *
1726  * Returns the adjusted mtu after removing security option.
1727  * Removes/subtracts the option if the packet's cred indicates an unlabeled
1728  * sender or if pkt_diff indicates this system enlarged the packet.
1729  */
1730 uint32_t
1731 tsol_pmtu_adjust(mblk_t *mp, uint32_t mtu, int pkt_diff, int af)
1732 {
1733 	int		label_adj = 0;
1734 	uint32_t	min_mtu = IP_MIN_MTU;
1735 	tsol_tpc_t	*src_rhtp;
1736 	void		*src;
1737 
1738 	/*
1739 	 * Note: label_adj is non-positive, indicating the number of
1740 	 * bytes removed by removing the security option from the
1741 	 * header.
1742 	 */
1743 	if (af == AF_INET6) {
1744 		ip6_t	*ip6h;
1745 
1746 		min_mtu = IPV6_MIN_MTU;
1747 		ip6h = (ip6_t *)mp->b_rptr;
1748 		src = &ip6h->ip6_src;
1749 		if ((src_rhtp = find_tpc(src, IPV6_VERSION, B_FALSE)) == NULL)
1750 			return (mtu);
1751 		if (pkt_diff > 0 || src_rhtp->tpc_tp.host_type == UNLABELED) {
1752 			label_adj = tsol_remove_secopt_v6(
1753 			    (ip6_t *)mp->b_rptr, MBLKL(mp));
1754 		}
1755 	} else {
1756 		ipha_t    *ipha;
1757 
1758 		ASSERT(af == AF_INET);
1759 		ipha = (ipha_t *)mp->b_rptr;
1760 		src = &ipha->ipha_src;
1761 		if ((src_rhtp = find_tpc(src, IPV4_VERSION, B_FALSE)) == NULL)
1762 			return (mtu);
1763 		if (pkt_diff > 0 || src_rhtp->tpc_tp.host_type == UNLABELED)
1764 			label_adj = tsol_remove_secopt(
1765 			    (ipha_t *)mp->b_rptr, MBLKL(mp));
1766 	}
1767 	/*
1768 	 * Make pkt_diff non-negative and the larger of the bytes
1769 	 * previously added (if any) or just removed, since label
1770 	 * addition + subtraction may not be completely idempotent.
1771 	 */
1772 	if (pkt_diff < -label_adj)
1773 		pkt_diff = -label_adj;
1774 	if (pkt_diff > 0 && pkt_diff < mtu)
1775 		mtu -= pkt_diff;
1776 
1777 	TPC_RELE(src_rhtp);
1778 	return (MAX(mtu, min_mtu));
1779 }
1780 
1781 /*
1782  * Name:	tsol_rtsa_init()
1783  *
1784  * Normal:	Sanity checks on the route security attributes provided by
1785  *		user.  Convert it into a route security parameter list to
1786  *		be returned to caller.
1787  *
1788  * Output:	EINVAL if bad security attributes in the routing message
1789  *		ENOMEM if unable to allocate data structures
1790  *		0 otherwise.
1791  *
1792  * Note:	On input, cp must point to the end of any addresses in
1793  *		the rt_msghdr_t structure.
1794  */
1795 int
1796 tsol_rtsa_init(rt_msghdr_t *rtm, tsol_rtsecattr_t *sp, caddr_t cp)
1797 {
1798 	uint_t	sacnt;
1799 	int	err;
1800 	caddr_t	lim;
1801 	tsol_rtsecattr_t *tp;
1802 
1803 	ASSERT((cp >= (caddr_t)&rtm[1]) && sp != NULL);
1804 
1805 	/*
1806 	 * In theory, we could accept as many security attributes configured
1807 	 * per route destination.  However, the current design is limited
1808 	 * such that at most only one set security attributes is allowed to
1809 	 * be associated with a prefix IRE.  We therefore assert for now.
1810 	 */
1811 	/* LINTED */
1812 	ASSERT(TSOL_RTSA_REQUEST_MAX == 1);
1813 
1814 	sp->rtsa_cnt = 0;
1815 	lim = (caddr_t)rtm + rtm->rtm_msglen;
1816 	ASSERT(cp <= lim);
1817 
1818 	if ((lim - cp) < sizeof (rtm_ext_t) ||
1819 	    ((rtm_ext_t *)cp)->rtmex_type != RTMEX_GATEWAY_SECATTR)
1820 		return (0);
1821 
1822 	if (((rtm_ext_t *)cp)->rtmex_len < sizeof (tsol_rtsecattr_t))
1823 		return (EINVAL);
1824 
1825 	cp += sizeof (rtm_ext_t);
1826 
1827 	if ((lim - cp) < sizeof (*tp) ||
1828 	    (tp = (tsol_rtsecattr_t *)cp, (sacnt = tp->rtsa_cnt) == 0) ||
1829 	    (lim - cp) < TSOL_RTSECATTR_SIZE(sacnt))
1830 		return (EINVAL);
1831 
1832 	/*
1833 	 * Trying to add route security attributes when system
1834 	 * labeling service is not available, or when user supllies
1835 	 * more than the maximum number of security attributes
1836 	 * allowed per request.
1837 	 */
1838 	if ((sacnt > 0 && !is_system_labeled()) ||
1839 	    sacnt > TSOL_RTSA_REQUEST_MAX)
1840 		return (EINVAL);
1841 
1842 	/* Ensure valid credentials */
1843 	if ((err = rtsa_validate(&((tsol_rtsecattr_t *)cp)->
1844 	    rtsa_attr[0])) != 0) {
1845 		cp += sizeof (*sp);
1846 		return (err);
1847 	}
1848 
1849 	bcopy(cp, sp, sizeof (*sp));
1850 	cp += sizeof (*sp);
1851 	return (0);
1852 }
1853 
1854 int
1855 tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc)
1856 {
1857 	tsol_ire_gw_secattr_t *attrp;
1858 	boolean_t exists = B_FALSE;
1859 	in_addr_t ga_addr4;
1860 	void *paddr = NULL;
1861 	tsol_gcgrp_t *gcgrp = NULL;
1862 
1863 	ASSERT(ire != NULL);
1864 
1865 	/*
1866 	 * The only time that attrp can be NULL is when this routine is
1867 	 * called for the first time during the creation/initialization
1868 	 * of the corresponding IRE.  It will only get cleared when the
1869 	 * IRE is deleted.
1870 	 */
1871 	if ((attrp = ire->ire_gw_secattr) == NULL) {
1872 		attrp = ire_gw_secattr_alloc(KM_NOSLEEP);
1873 		if (attrp == NULL)
1874 			return (ENOMEM);
1875 		ire->ire_gw_secattr = attrp;
1876 	} else {
1877 		exists = B_TRUE;
1878 		mutex_enter(&attrp->igsa_lock);
1879 
1880 		if (attrp->igsa_rhc != NULL) {
1881 			TNRHC_RELE(attrp->igsa_rhc);
1882 			attrp->igsa_rhc = NULL;
1883 		}
1884 
1885 		if (attrp->igsa_gc != NULL)
1886 			GC_REFRELE(attrp->igsa_gc);
1887 	}
1888 	ASSERT(!exists || MUTEX_HELD(&attrp->igsa_lock));
1889 
1890 	/*
1891 	 * References already held by caller and we keep them;
1892 	 * note that gc may be set to NULL to clear out igsa_gc.
1893 	 */
1894 	attrp->igsa_gc = gc;
1895 
1896 	if (gc != NULL) {
1897 		gcgrp = gc->gc_grp;
1898 		ASSERT(gcgrp != NULL);
1899 	}
1900 
1901 	/*
1902 	 * Intialize the template for gateway; we use the gateway's
1903 	 * address found in either the passed in gateway credential
1904 	 * or group pointer, or the ire_gateway_addr{_v6} field.
1905 	 */
1906 	if (gcgrp != NULL) {
1907 		tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;
1908 
1909 		/*
1910 		 * Caller is holding a reference, and that we don't
1911 		 * need to hold any lock to access the address.
1912 		 */
1913 		if (ipversion == IPV4_VERSION) {
1914 			ASSERT(ga->ga_af == AF_INET);
1915 			IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
1916 			paddr = &ga_addr4;
1917 		} else {
1918 			ASSERT(ga->ga_af == AF_INET6);
1919 			paddr = &ga->ga_addr;
1920 		}
1921 	} else if (ire->ire_type & IRE_OFFLINK) {
1922 		if (ipversion == IPV6_VERSION)
1923 			paddr = &ire->ire_gateway_addr_v6;
1924 		else if (ipversion == IPV4_VERSION)
1925 			paddr = &ire->ire_gateway_addr;
1926 	}
1927 
1928 	/*
1929 	 * Lookup the gateway template; note that we could get an internal
1930 	 * template here, which we cache anyway.  During IRE matching, we'll
1931 	 * try to update this gateway template cache and hopefully get a
1932 	 * real one.
1933 	 */
1934 	if (paddr != NULL) {
1935 		attrp->igsa_rhc = find_rhc(paddr, ipversion, B_FALSE);
1936 	}
1937 
1938 	if (exists)
1939 		mutex_exit(&attrp->igsa_lock);
1940 
1941 	return (0);
1942 }
1943 
1944 /*
1945  * This function figures the type of MLP that we'll be using based on the
1946  * address that the user is binding and the zone.  If the address is
1947  * unspecified, then we're looking at both private and shared.  If it's one
1948  * of the zone's private addresses, then it's private only.  If it's one
1949  * of the global addresses, then it's shared only. Multicast addresses are
1950  * treated same as unspecified address.
1951  *
1952  * If we can't figure out what it is, then return mlptSingle.  That's actually
1953  * an error case.
1954  *
1955  * The callers are assumed to pass in zone->zone_id and not the zoneid that
1956  * is stored in a conn_t (since the latter will be GLOBAL_ZONEID in an
1957  * exclusive stack zone).
1958  */
1959 mlp_type_t
1960 tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr,
1961     ip_stack_t *ipst)
1962 {
1963 	in_addr_t in4;
1964 	ire_t *ire;
1965 	ipif_t *ipif;
1966 	zoneid_t addrzone;
1967 	zoneid_t ip_zoneid;
1968 
1969 	ASSERT(addr != NULL);
1970 
1971 	/*
1972 	 * For exclusive stacks we set the zoneid to zero
1973 	 * to operate as if in the global zone for IRE and conn_t comparisons.
1974 	 */
1975 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1976 		ip_zoneid = GLOBAL_ZONEID;
1977 	else
1978 		ip_zoneid = zoneid;
1979 
1980 	if (version == IPV6_VERSION &&
1981 	    IN6_IS_ADDR_V4MAPPED((const in6_addr_t *)addr)) {
1982 		IN6_V4MAPPED_TO_IPADDR((const in6_addr_t *)addr, in4);
1983 		addr = &in4;
1984 		version = IPV4_VERSION;
1985 	}
1986 
1987 	/* Check whether the IRE_LOCAL (or ipif) is ALL_ZONES */
1988 	if (version == IPV4_VERSION) {
1989 		in4 = *(const in_addr_t *)addr;
1990 		if ((in4 == INADDR_ANY) || CLASSD(in4)) {
1991 			return (mlptBoth);
1992 		}
1993 		ire = ire_ftable_lookup_v4(in4, 0, 0, IRE_LOCAL|IRE_LOOPBACK,
1994 		    NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY,
1995 		    0, ipst, NULL);
1996 	} else {
1997 		if (IN6_IS_ADDR_UNSPECIFIED((const in6_addr_t *)addr) ||
1998 		    IN6_IS_ADDR_MULTICAST((const in6_addr_t *)addr)) {
1999 			return (mlptBoth);
2000 		}
2001 		ire = ire_ftable_lookup_v6(addr, 0, 0, IRE_LOCAL|IRE_LOOPBACK,
2002 		    NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY,
2003 		    0, ipst, NULL);
2004 	}
2005 	/*
2006 	 * If we can't find the IRE, then we have to behave exactly like
2007 	 * ip_laddr_verify_{v4,v6}.  That means looking up the IPIF so that
2008 	 * users can bind to addresses on "down" interfaces.
2009 	 *
2010 	 * If we can't find that either, then the bind is going to fail, so
2011 	 * just give up.  Note that there's a miniscule chance that the address
2012 	 * is in transition, but we don't bother handling that.
2013 	 */
2014 	if (ire == NULL) {
2015 		if (version == IPV4_VERSION)
2016 			ipif = ipif_lookup_addr(*(const in_addr_t *)addr, NULL,
2017 			    ip_zoneid, ipst);
2018 		else
2019 			ipif = ipif_lookup_addr_v6((const in6_addr_t *)addr,
2020 			    NULL, ip_zoneid, ipst);
2021 		if (ipif == NULL) {
2022 			return (mlptSingle);
2023 		}
2024 		addrzone = ipif->ipif_zoneid;
2025 		ipif_refrele(ipif);
2026 	} else {
2027 		addrzone = ire->ire_zoneid;
2028 		ire_refrele(ire);
2029 	}
2030 	return (addrzone == ALL_ZONES ? mlptShared : mlptPrivate);
2031 }
2032 
2033 /*
2034  * Since we are configuring local interfaces, and we know trusted
2035  * extension CDE requires local interfaces to be cipso host type in
2036  * order to function correctly, we'll associate a cipso template
2037  * to each local interface and let the interface come up.  Configuring
2038  * a local interface to be "unlabeled" host type is a configuration error.
2039  * We'll override that error and make the interface host type to be cipso
2040  * here.
2041  *
2042  * The code is optimized for the usual "success" case and unwinds things on
2043  * error.  We don't want to go to the trouble and expense of formatting the
2044  * interface name for the usual case where everything is configured correctly.
2045  */
2046 boolean_t
2047 tsol_check_interface_address(const ipif_t *ipif)
2048 {
2049 	tsol_tpc_t *tp;
2050 	char addrbuf[INET6_ADDRSTRLEN];
2051 	int af;
2052 	const void *addr;
2053 	zone_t *zone;
2054 	ts_label_t *plabel;
2055 	const bslabel_t *label;
2056 	char ifbuf[LIFNAMSIZ + 10];
2057 	const char *ifname;
2058 	boolean_t retval;
2059 	tsol_rhent_t rhent;
2060 	netstack_t *ns = ipif->ipif_ill->ill_ipst->ips_netstack;
2061 
2062 	if (IN6_IS_ADDR_V4MAPPED(&ipif->ipif_v6lcl_addr)) {
2063 		af = AF_INET;
2064 		addr = &V4_PART_OF_V6(ipif->ipif_v6lcl_addr);
2065 	} else {
2066 		af = AF_INET6;
2067 		addr = &ipif->ipif_v6lcl_addr;
2068 	}
2069 
2070 	tp = find_tpc(&ipif->ipif_v6lcl_addr, IPV6_VERSION, B_FALSE);
2071 
2072 	/* assumes that ALL_ZONES implies that there is no exclusive stack */
2073 	if (ipif->ipif_zoneid == ALL_ZONES) {
2074 		zone = NULL;
2075 	} else if (ns->netstack_stackid == GLOBAL_NETSTACKID) {
2076 		/* Shared stack case */
2077 		zone = zone_find_by_id(ipif->ipif_zoneid);
2078 	} else {
2079 		/* Exclusive stack case */
2080 		zone = zone_find_by_id(crgetzoneid(ipif->ipif_ill->ill_credp));
2081 	}
2082 	if (zone != NULL) {
2083 		plabel = zone->zone_slabel;
2084 		ASSERT(plabel != NULL);
2085 		label = label2bslabel(plabel);
2086 	}
2087 
2088 	/*
2089 	 * If it's CIPSO and an all-zones address, then we're done.
2090 	 * If it's a CIPSO zone specific address, the zone's label
2091 	 * must be in the range or set specified in the template.
2092 	 * When the remote host entry is missing or the template
2093 	 * type is incorrect for this interface, we create a
2094 	 * CIPSO host entry in kernel and allow the interface to be
2095 	 * brought up as CIPSO type.
2096 	 */
2097 	if (tp != NULL && (
2098 	    /* The all-zones case */
2099 	    (tp->tpc_tp.host_type == SUN_CIPSO &&
2100 	    tp->tpc_tp.tp_doi == default_doi &&
2101 	    ipif->ipif_zoneid == ALL_ZONES) ||
2102 	    /* The local-zone case */
2103 	    (zone != NULL && plabel->tsl_doi == tp->tpc_tp.tp_doi &&
2104 	    ((tp->tpc_tp.host_type == SUN_CIPSO &&
2105 	    (_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) ||
2106 	    blinlset(label, tp->tpc_tp.tp_sl_set_cipso))))))) {
2107 		if (zone != NULL)
2108 			zone_rele(zone);
2109 		TPC_RELE(tp);
2110 		return (B_TRUE);
2111 	}
2112 
2113 	ifname = ipif->ipif_ill->ill_name;
2114 	if (ipif->ipif_id != 0) {
2115 		(void) snprintf(ifbuf, sizeof (ifbuf), "%s:%u", ifname,
2116 		    ipif->ipif_id);
2117 		ifname = ifbuf;
2118 	}
2119 	(void) inet_ntop(af, addr, addrbuf, sizeof (addrbuf));
2120 
2121 	if (tp == NULL) {
2122 		cmn_err(CE_NOTE, "template entry for %s missing. Default to "
2123 		    "CIPSO type for %s", ifname, addrbuf);
2124 		retval = B_TRUE;
2125 	} else if (tp->tpc_tp.host_type == UNLABELED) {
2126 		cmn_err(CE_NOTE, "template type for %s incorrectly configured. "
2127 		    "Change to CIPSO type for %s", ifname, addrbuf);
2128 		retval = B_TRUE;
2129 	} else if (ipif->ipif_zoneid == ALL_ZONES) {
2130 		if (tp->tpc_tp.host_type != SUN_CIPSO) {
2131 			cmn_err(CE_NOTE, "%s failed: %s isn't set to CIPSO for "
2132 			    "all-zones. Converted to CIPSO.", ifname, addrbuf);
2133 			retval = B_TRUE;
2134 		} else {
2135 			cmn_err(CE_NOTE, "%s failed: %s has wrong DOI %d "
2136 			    "instead of %d", ifname, addrbuf,
2137 			    tp->tpc_tp.tp_doi, default_doi);
2138 			retval = B_FALSE;
2139 		}
2140 	} else if (zone == NULL) {
2141 		cmn_err(CE_NOTE, "%s failed: zoneid %d unknown",
2142 		    ifname, ipif->ipif_zoneid);
2143 		retval = B_FALSE;
2144 	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
2145 		cmn_err(CE_NOTE, "%s failed: zone %s has DOI %d but %s has "
2146 		    "DOI %d", ifname, zone->zone_name, plabel->tsl_doi,
2147 		    addrbuf, tp->tpc_tp.tp_doi);
2148 		retval = B_FALSE;
2149 	} else {
2150 		cmn_err(CE_NOTE, "%s failed: zone %s label incompatible with "
2151 		    "%s", ifname, zone->zone_name, addrbuf);
2152 		tsol_print_label(label, "zone label");
2153 		retval = B_FALSE;
2154 	}
2155 
2156 	if (zone != NULL)
2157 		zone_rele(zone);
2158 	if (tp != NULL)
2159 		TPC_RELE(tp);
2160 	if (retval) {
2161 		/*
2162 		 * we've corrected a config error and let the interface
2163 		 * come up as cipso. Need to insert an rhent.
2164 		 */
2165 		if ((rhent.rh_address.ta_family = af) == AF_INET) {
2166 			rhent.rh_prefix = 32;
2167 			rhent.rh_address.ta_addr_v4 = *(struct in_addr *)addr;
2168 		} else {
2169 			rhent.rh_prefix = 128;
2170 			rhent.rh_address.ta_addr_v6 = *(in6_addr_t *)addr;
2171 		}
2172 		(void) strcpy(rhent.rh_template, "cipso");
2173 		if (tnrh_load(&rhent) != 0) {
2174 			cmn_err(CE_NOTE, "%s failed: Cannot insert CIPSO "
2175 			    "template for local addr %s", ifname, addrbuf);
2176 			retval = B_FALSE;
2177 		}
2178 	}
2179 	return (retval);
2180 }
2181