xref: /titanic_50/usr/src/uts/common/inet/ip/tnet.c (revision 65d6e08afd923e8496fff598c19c151fd4d0ce64)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/sunddi.h>
33 #include <sys/cred.h>
34 #include <sys/debug.h>
35 #include <sys/kmem.h>
36 #include <sys/errno.h>
37 #include <sys/disp.h>
38 #include <netinet/in.h>
39 #include <netinet/in_systm.h>
40 #include <netinet/ip.h>
41 #include <netinet/ip_icmp.h>
42 #include <netinet/tcp.h>
43 #include <inet/common.h>
44 #include <inet/ipclassifier.h>
45 #include <inet/ip.h>
46 #include <inet/mib2.h>
47 #include <inet/nd.h>
48 #include <inet/tcp.h>
49 #include <inet/ip_rts.h>
50 #include <inet/ip_ire.h>
51 #include <inet/ip_if.h>
52 #include <sys/modhash.h>
53 
54 #include <sys/tsol/label.h>
55 #include <sys/tsol/label_macro.h>
56 #include <sys/tsol/tnet.h>
57 #include <sys/tsol/tndb.h>
58 #include <sys/strsun.h>
59 
/*
 * Tunable for strict error-reply behavior (TCP RST and ICMP Unreachable).
 * It has no explicit initializer, so it starts out as zero.
 */
int tsol_strict_error;
62 
63 /*
64  * Some notes on the Trusted Solaris IRE gateway security attributes:
65  *
66  * When running in Trusted mode, the routing subsystem determines whether or
67  * not a packet can be delivered to an off-link host (not directly reachable
68  * through an interface) based on the accreditation checks of the packet's
69  * security attributes against those associated with the next-hop gateway.
70  *
71  * The next-hop gateway's security attributes can be derived from two sources
72  * (in order of preference): route-related and the host database.  A Trusted
73  * system must be configured with at least the host database containing an
74  * entry for the next-hop gateway, or otherwise no accreditation checks can
75  * be performed, which may result in the inability to send packets to any
76  * off-link destination host.
77  *
78  * The major differences between the two sources are the number and type of
79  * security attributes used for accreditation checks.  A host database entry
80  * can contain at most one set of security attributes, specific only to the
 * next-hop gateway.  In contrast, route-related security attributes are made
82  * up of a collection of security attributes for the distant networks, and
83  * are grouped together per next-hop gateway used to reach those networks.
 * This is the preferred method, and the routing subsystem will fall back to
85  * the host database entry only if there are no route-related attributes
86  * associated with the next-hop gateway.
87  *
88  * In Trusted mode, all of the IRE entries (except LOCAL/LOOPBACK/BROADCAST/
89  * INTERFACE type) are initialized to contain a placeholder to store this
90  * information.  The ire_gw_secattr structure gets allocated, initialized
91  * and associated with the IRE during the time of the IRE creation.  The
92  * initialization process also includes resolving the host database entry
93  * of the next-hop gateway for fallback purposes.  It does not include any
94  * route-related attribute setup, as that process comes separately as part
95  * of the route requests (add/change) made to the routing subsystem.
96  *
97  * The underlying logic which involves associating IREs with the gateway
98  * security attributes are represented by the following data structures:
99  *
100  * tsol_gcdb_t, or "gcdb"
101  *
102  *	- This is a system-wide collection of records containing the
103  *	  currently used route-related security attributes, which are fed
104  *	  through the routing socket interface, e.g. "route add/change".
105  *
106  * tsol_gc_t, or "gc"
107  *
 *	- This is the gateway credential structure, and it provides the
 *	  only mechanism to access the contents of gcdb.  More than one gc
 *	  entry may refer to the same gcdb record.  gc's in the system are
 *	  grouped according to the next-hop gateway address.
112  *
113  * tsol_gcgrp_t, or "gcgrp"
114  *
115  *	- Group of gateway credentials, and is unique per next-hop gateway
116  *	  address.  When the group is not empty, i.e. when gcgrp_count is
117  *	  greater than zero, it contains one or more gc's, each pointing to
118  *	  a gcdb record which indicates the gateway security attributes
119  *	  associated with the next-hop gateway.
120  *
121  * The fields of the tsol_ire_gw_secattr_t used from within the IRE are:
122  *
123  * igsa_lock
124  *
125  *	- Lock that protects all fields within tsol_ire_gw_secattr_t.
126  *
127  * igsa_rhc
128  *
129  *	- Remote host cache database entry of next-hop gateway.  This is
130  *	  used in the case when there are no route-related attributes
131  *	  configured for the IRE.
132  *
133  * igsa_gc
134  *
135  *	- A set of route-related attributes that only get set for prefix
136  *	  IREs.  If this is non-NULL, the prefix IRE has been associated
137  *	  with a set of gateway security attributes by way of route add/
138  *	  change functionality.  This field stays NULL for IRE_CACHEs.
139  *
140  * igsa_gcgrp
141  *
142  *	- Group of gc's which only gets set for IRE_CACHEs.  Each of the gc
143  *	  points to a gcdb record that contains the security attributes
144  *	  used to perform the credential checks of the packet which uses
145  *	  the IRE.  If the group is not empty, the list of gc's can be
146  *	  traversed starting at gcgrp_head.  This field stays NULL for
147  *	  prefix IREs.
148  */
149 
/* kmem cache from which tsol_ire_gw_secattr_t structures are allocated */
static kmem_cache_t *ire_gw_secattr_cache;

/* Default sizes of the gcdb and gcgrp hash tables */
#define	GCDB_HASH_SIZE	101
#define	GCGRP_HASH_SIZE	101
154 
/*
 * Drop one reference on the gcdb record `p'.  gcdb_lock is taken around
 * the decrement; when the count reaches zero the record is handed to
 * gcdb_inactive(), which expects the lock to be held.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely as the body of an if/else.
 * Note that `p' is evaluated more than once.
 */
#define	GCDB_REFRELE(p) do {			\
	mutex_enter(&gcdb_lock);		\
	ASSERT((p)->gcdb_refcnt > 0);		\
	if (--((p)->gcdb_refcnt) == 0)		\
		gcdb_inactive(p);		\
	ASSERT(MUTEX_HELD(&gcdb_lock));		\
	mutex_exit(&gcdb_lock);			\
	/* CONSTCOND */				\
} while (0)
163 
/* Table sizes passed to mod_hash_create_extended() by tnet_init() */
static int gcdb_hash_size = GCDB_HASH_SIZE;
static int gcgrp_hash_size = GCGRP_HASH_SIZE;
/* gcdb records, keyed by their security attributes */
static mod_hash_t *gcdb_hash;
/* gcgrp entries keyed by gateway address; one table per address family */
static mod_hash_t *gcgrp4_hash;
static mod_hash_t *gcgrp6_hash;

/* Held around gcdb_hash lookups/updates and gcdb_refcnt changes */
static kmutex_t gcdb_lock;
/* Held around gcgrp hash lookups/updates; not static, unlike gcdb_lock */
kmutex_t gcgrp_lock;

/* gcdb hash callbacks and helpers */
static uint_t gcdb_hash_by_secattr(void *, mod_hash_key_t);
static int gcdb_hash_cmp(mod_hash_key_t, mod_hash_key_t);
static tsol_gcdb_t *gcdb_lookup(struct rtsa_s *, boolean_t);
static void gcdb_inactive(tsol_gcdb_t *);

/* gcgrp hash callbacks */
static uint_t gcgrp_hash_by_addr(void *, mod_hash_key_t);
static int gcgrp_hash_cmp(mod_hash_key_t, mod_hash_key_t);

/* kmem cache constructor/destructor for tsol_ire_gw_secattr_t */
static int ire_gw_secattr_constructor(void *, void *, int);
static void ire_gw_secattr_destructor(void *, void *);
183 
184 void
185 tnet_init(void)
186 {
187 	ire_gw_secattr_cache = kmem_cache_create("ire_gw_secattr_cache",
188 	    sizeof (tsol_ire_gw_secattr_t), 64, ire_gw_secattr_constructor,
189 	    ire_gw_secattr_destructor, NULL, NULL, NULL, 0);
190 
191 	gcdb_hash = mod_hash_create_extended("gcdb_hash",
192 	    gcdb_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
193 	    gcdb_hash_by_secattr, NULL, gcdb_hash_cmp, KM_SLEEP);
194 
195 	gcgrp4_hash = mod_hash_create_extended("gcgrp4_hash",
196 	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
197 	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
198 
199 	gcgrp6_hash = mod_hash_create_extended("gcgrp6_hash",
200 	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
201 	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
202 
203 	mutex_init(&gcdb_lock, NULL, MUTEX_DEFAULT, NULL);
204 	mutex_init(&gcgrp_lock, NULL, MUTEX_DEFAULT, NULL);
205 }
206 
207 void
208 tnet_fini(void)
209 {
210 	kmem_cache_destroy(ire_gw_secattr_cache);
211 	mod_hash_destroy_hash(gcdb_hash);
212 	mod_hash_destroy_hash(gcgrp4_hash);
213 	mod_hash_destroy_hash(gcgrp6_hash);
214 	mutex_destroy(&gcdb_lock);
215 	mutex_destroy(&gcgrp_lock);
216 }
217 
218 /* ARGSUSED */
219 static int
220 ire_gw_secattr_constructor(void *buf, void *cdrarg, int kmflags)
221 {
222 	tsol_ire_gw_secattr_t *attrp = buf;
223 
224 	mutex_init(&attrp->igsa_lock, NULL, MUTEX_DEFAULT, NULL);
225 
226 	attrp->igsa_rhc = NULL;
227 	attrp->igsa_gc = NULL;
228 	attrp->igsa_gcgrp = NULL;
229 
230 	return (0);
231 }
232 
233 /* ARGSUSED */
234 static void
235 ire_gw_secattr_destructor(void *buf, void *cdrarg)
236 {
237 	tsol_ire_gw_secattr_t *attrp = (tsol_ire_gw_secattr_t *)buf;
238 
239 	mutex_destroy(&attrp->igsa_lock);
240 }
241 
242 tsol_ire_gw_secattr_t *
243 ire_gw_secattr_alloc(int kmflags)
244 {
245 	return (kmem_cache_alloc(ire_gw_secattr_cache, kmflags));
246 }
247 
248 void
249 ire_gw_secattr_free(tsol_ire_gw_secattr_t *attrp)
250 {
251 	ASSERT(MUTEX_NOT_HELD(&attrp->igsa_lock));
252 
253 	if (attrp->igsa_rhc != NULL) {
254 		TNRHC_RELE(attrp->igsa_rhc);
255 		attrp->igsa_rhc = NULL;
256 	}
257 
258 	if (attrp->igsa_gc != NULL) {
259 		GC_REFRELE(attrp->igsa_gc);
260 		attrp->igsa_gc = NULL;
261 	}
262 	if (attrp->igsa_gcgrp != NULL) {
263 		GCGRP_REFRELE(attrp->igsa_gcgrp);
264 		attrp->igsa_gcgrp = NULL;
265 	}
266 
267 	ASSERT(attrp->igsa_rhc == NULL);
268 	ASSERT(attrp->igsa_gc == NULL);
269 	ASSERT(attrp->igsa_gcgrp == NULL);
270 
271 	kmem_cache_free(ire_gw_secattr_cache, attrp);
272 }
273 
274 /* ARGSUSED */
275 static uint_t
276 gcdb_hash_by_secattr(void *hash_data, mod_hash_key_t key)
277 {
278 	const struct rtsa_s *rp = (struct rtsa_s *)key;
279 	const uint32_t *up, *ue;
280 	uint_t hash;
281 	int i;
282 
283 	ASSERT(rp != NULL);
284 
285 	/* See comments in hash_bylabel in zone.c for details */
286 	hash = rp->rtsa_doi + (rp->rtsa_doi << 1);
287 	up = (const uint32_t *)&rp->rtsa_slrange;
288 	ue = up + sizeof (rp->rtsa_slrange) / sizeof (*up);
289 	i = 1;
290 	while (up < ue) {
291 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
292 		hash += *up + (*up << ((i % 16) + 1));
293 		up++;
294 		i++;
295 	}
296 	return (hash);
297 }
298 
299 static int
300 gcdb_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
301 {
302 	struct rtsa_s *rp1 = (struct rtsa_s *)key1;
303 	struct rtsa_s *rp2 = (struct rtsa_s *)key2;
304 
305 	ASSERT(rp1 != NULL && rp2 != NULL);
306 
307 	if (blequal(&rp1->rtsa_slrange.lower_bound,
308 	    &rp2->rtsa_slrange.lower_bound) &&
309 	    blequal(&rp1->rtsa_slrange.upper_bound,
310 	    &rp2->rtsa_slrange.upper_bound) &&
311 	    rp1->rtsa_doi == rp2->rtsa_doi)
312 		return (0);
313 
314 	/* No match; not found */
315 	return (-1);
316 }
317 
318 /* ARGSUSED */
319 static uint_t
320 gcgrp_hash_by_addr(void *hash_data, mod_hash_key_t key)
321 {
322 	tsol_gcgrp_addr_t *ga = (tsol_gcgrp_addr_t *)key;
323 	uint_t		idx = 0;
324 	uint32_t	*ap;
325 
326 	ASSERT(ga != NULL);
327 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
328 
329 	ap = (uint32_t *)&ga->ga_addr.s6_addr32[0];
330 	idx ^= *ap++;
331 	idx ^= *ap++;
332 	idx ^= *ap++;
333 	idx ^= *ap;
334 
335 	return (idx);
336 }
337 
338 static int
339 gcgrp_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
340 {
341 	tsol_gcgrp_addr_t *ga1 = (tsol_gcgrp_addr_t *)key1;
342 	tsol_gcgrp_addr_t *ga2 = (tsol_gcgrp_addr_t *)key2;
343 
344 	ASSERT(ga1 != NULL && ga2 != NULL);
345 
346 	/* Address family must match */
347 	if (ga1->ga_af != ga2->ga_af)
348 		return (-1);
349 
350 	if (ga1->ga_addr.s6_addr32[0] == ga2->ga_addr.s6_addr32[0] &&
351 	    ga1->ga_addr.s6_addr32[1] == ga2->ga_addr.s6_addr32[1] &&
352 	    ga1->ga_addr.s6_addr32[2] == ga2->ga_addr.s6_addr32[2] &&
353 	    ga1->ga_addr.s6_addr32[3] == ga2->ga_addr.s6_addr32[3])
354 		return (0);
355 
356 	/* No match; not found */
357 	return (-1);
358 }
359 
360 #define	RTSAFLAGS	"\20\11cipso\3doi\2max_sl\1min_sl"
361 
362 int
363 rtsa_validate(const struct rtsa_s *rp)
364 {
365 	uint32_t mask = rp->rtsa_mask;
366 
367 	/* RTSA_CIPSO must be set, and DOI must not be zero */
368 	if ((mask & RTSA_CIPSO) == 0 || rp->rtsa_doi == 0) {
369 		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
370 		    "rtsa(1) lacks flag or has 0 doi.",
371 		    rtsa_s *, rp);
372 		return (EINVAL);
373 	}
374 	/*
375 	 * SL range must be specified, and it must have its
376 	 * upper bound dominating its lower bound.
377 	 */
378 	if ((mask & RTSA_SLRANGE) != RTSA_SLRANGE ||
379 	    !bldominates(&rp->rtsa_slrange.upper_bound,
380 	    &rp->rtsa_slrange.lower_bound)) {
381 		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
382 		    "rtsa(1) min_sl and max_sl not set or max_sl is "
383 		    "not dominating.", rtsa_s *, rp);
384 		return (EINVAL);
385 	}
386 	return (0);
387 }
388 
389 /*
390  * A brief explanation of the reference counting scheme:
391  *
392  * Prefix IREs have a non-NULL igsa_gc and a NULL igsa_gcgrp;
393  * IRE_CACHEs have it vice-versa.
394  *
 * Apart from dynamic references due to reference holds done
396  * actively by threads, we have the following references:
397  *
398  * gcdb_refcnt:
399  *	- Every tsol_gc_t pointing to a tsol_gcdb_t contributes a reference
400  *	  to the gcdb_refcnt.
401  *
402  * gc_refcnt:
403  *	- A prefix IRE that points to an igsa_gc contributes a reference
404  *	  to the gc_refcnt.
405  *
406  * gcgrp_refcnt:
407  *	- An IRE_CACHE that points to an igsa_gcgrp contributes a reference
408  *	  to the gcgrp_refcnt of the associated tsol_gcgrp_t.
409  *	- Every tsol_gc_t in the chain headed by tsol_gcgrp_t contributes
410  *	  a reference to the gcgrp_refcnt.
411  */
412 static tsol_gcdb_t *
413 gcdb_lookup(struct rtsa_s *rp, boolean_t alloc)
414 {
415 	tsol_gcdb_t *gcdb = NULL;
416 
417 	if (rtsa_validate(rp) != 0)
418 		return (NULL);
419 
420 	mutex_enter(&gcdb_lock);
421 	/* Find a copy in the cache; otherwise, create one and cache it */
422 	if (mod_hash_find(gcdb_hash, (mod_hash_key_t)rp,
423 	    (mod_hash_val_t *)&gcdb) == 0) {
424 		gcdb->gcdb_refcnt++;
425 		ASSERT(gcdb->gcdb_refcnt != 0);
426 
427 		DTRACE_PROBE2(tx__gcdb__log__info__gcdb__lookup, char *,
428 		    "gcdb(1) is in gcdb_hash(global)", tsol_gcdb_t *, gcdb);
429 	} else if (alloc) {
430 		gcdb = kmem_zalloc(sizeof (*gcdb), KM_NOSLEEP);
431 		if (gcdb != NULL) {
432 			gcdb->gcdb_refcnt = 1;
433 			gcdb->gcdb_mask = rp->rtsa_mask;
434 			gcdb->gcdb_doi = rp->rtsa_doi;
435 			gcdb->gcdb_slrange = rp->rtsa_slrange;
436 
437 			if (mod_hash_insert(gcdb_hash,
438 			    (mod_hash_key_t)&gcdb->gcdb_attr,
439 			    (mod_hash_val_t)gcdb) != 0) {
440 				mutex_exit(&gcdb_lock);
441 				kmem_free(gcdb, sizeof (*gcdb));
442 				return (NULL);
443 			}
444 
445 			DTRACE_PROBE2(tx__gcdb__log__info__gcdb__insert, char *,
446 			    "gcdb(1) inserted in gcdb_hash(global)",
447 			    tsol_gcdb_t *, gcdb);
448 		}
449 	}
450 	mutex_exit(&gcdb_lock);
451 	return (gcdb);
452 }
453 
454 static void
455 gcdb_inactive(tsol_gcdb_t *gcdb)
456 {
457 	ASSERT(MUTEX_HELD(&gcdb_lock));
458 	ASSERT(gcdb != NULL && gcdb->gcdb_refcnt == 0);
459 
460 	(void) mod_hash_remove(gcdb_hash, (mod_hash_key_t)&gcdb->gcdb_attr,
461 	    (mod_hash_val_t *)&gcdb);
462 
463 	DTRACE_PROBE2(tx__gcdb__log__info__gcdb__remove, char *,
464 	    "gcdb(1) removed from gcdb_hash(global)",
465 	    tsol_gcdb_t *, gcdb);
466 	kmem_free(gcdb, sizeof (*gcdb));
467 }
468 
469 tsol_gc_t *
470 gc_create(struct rtsa_s *rp, tsol_gcgrp_t *gcgrp, boolean_t *gcgrp_xtrarefp)
471 {
472 	tsol_gc_t *gc;
473 	tsol_gcdb_t *gcdb;
474 
475 	*gcgrp_xtrarefp = B_TRUE;
476 
477 	rw_enter(&gcgrp->gcgrp_rwlock, RW_WRITER);
478 	if ((gcdb = gcdb_lookup(rp, B_TRUE)) == NULL) {
479 		rw_exit(&gcgrp->gcgrp_rwlock);
480 		return (NULL);
481 	}
482 
483 	for (gc = gcgrp->gcgrp_head; gc != NULL; gc = gc->gc_next) {
484 		if (gc->gc_db == gcdb) {
485 			ASSERT(gc->gc_grp == gcgrp);
486 
487 			gc->gc_refcnt++;
488 			ASSERT(gc->gc_refcnt != 0);
489 
490 			GCDB_REFRELE(gcdb);
491 
492 			DTRACE_PROBE3(tx__gcdb__log__info__gc__create,
493 			    char *, "found gc(1) in gcgrp(2)",
494 			    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
495 			rw_exit(&gcgrp->gcgrp_rwlock);
496 			return (gc);
497 		}
498 	}
499 
500 	gc = kmem_zalloc(sizeof (*gc), KM_NOSLEEP);
501 	if (gc != NULL) {
502 		if (gcgrp->gcgrp_head == NULL) {
503 			gcgrp->gcgrp_head = gcgrp->gcgrp_tail = gc;
504 		} else {
505 			gcgrp->gcgrp_tail->gc_next = gc;
506 			gc->gc_prev = gcgrp->gcgrp_tail;
507 			gcgrp->gcgrp_tail = gc;
508 		}
509 		gcgrp->gcgrp_count++;
510 		ASSERT(gcgrp->gcgrp_count != 0);
511 
512 		/* caller has incremented gcgrp reference for us */
513 		gc->gc_grp = gcgrp;
514 
515 		gc->gc_db = gcdb;
516 		gc->gc_refcnt = 1;
517 
518 		DTRACE_PROBE3(tx__gcdb__log__info__gc__create, char *,
519 		    "added gc(1) to gcgrp(2)", tsol_gc_t *, gc,
520 		    tsol_gcgrp_t *, gcgrp);
521 
522 		*gcgrp_xtrarefp = B_FALSE;
523 	}
524 	rw_exit(&gcgrp->gcgrp_rwlock);
525 
526 	return (gc);
527 }
528 
529 void
530 gc_inactive(tsol_gc_t *gc)
531 {
532 	tsol_gcgrp_t *gcgrp = gc->gc_grp;
533 
534 	ASSERT(gcgrp != NULL);
535 	ASSERT(RW_WRITE_HELD(&gcgrp->gcgrp_rwlock));
536 	ASSERT(gc->gc_refcnt == 0);
537 
538 	if (gc->gc_prev != NULL)
539 		gc->gc_prev->gc_next = gc->gc_next;
540 	else
541 		gcgrp->gcgrp_head = gc->gc_next;
542 	if (gc->gc_next != NULL)
543 		gc->gc_next->gc_prev = gc->gc_prev;
544 	else
545 		gcgrp->gcgrp_tail = gc->gc_prev;
546 	ASSERT(gcgrp->gcgrp_count > 0);
547 	gcgrp->gcgrp_count--;
548 
549 	/* drop lock before it's destroyed */
550 	rw_exit(&gcgrp->gcgrp_rwlock);
551 
552 	DTRACE_PROBE3(tx__gcdb__log__info__gc__remove, char *,
553 	    "removed inactive gc(1) from gcgrp(2)",
554 	    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
555 
556 	GCGRP_REFRELE(gcgrp);
557 
558 	gc->gc_grp = NULL;
559 	gc->gc_prev = gc->gc_next = NULL;
560 
561 	if (gc->gc_db != NULL)
562 		GCDB_REFRELE(gc->gc_db);
563 
564 	kmem_free(gc, sizeof (*gc));
565 }
566 
567 tsol_gcgrp_t *
568 gcgrp_lookup(tsol_gcgrp_addr_t *ga, boolean_t alloc)
569 {
570 	tsol_gcgrp_t *gcgrp = NULL;
571 	mod_hash_t *hashp;
572 
573 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
574 
575 	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
576 
577 	mutex_enter(&gcgrp_lock);
578 	if (mod_hash_find(hashp, (mod_hash_key_t)ga,
579 	    (mod_hash_val_t *)&gcgrp) == 0) {
580 		gcgrp->gcgrp_refcnt++;
581 		ASSERT(gcgrp->gcgrp_refcnt != 0);
582 
583 		DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__lookup, char *,
584 		    "found gcgrp(1) in hash(2)", tsol_gcgrp_t *, gcgrp,
585 		    mod_hash_t *, hashp);
586 
587 	} else if (alloc) {
588 		gcgrp = kmem_zalloc(sizeof (*gcgrp), KM_NOSLEEP);
589 		if (gcgrp != NULL) {
590 			gcgrp->gcgrp_refcnt = 1;
591 			rw_init(&gcgrp->gcgrp_rwlock, NULL, RW_DEFAULT, NULL);
592 			bcopy(ga, &gcgrp->gcgrp_addr, sizeof (*ga));
593 
594 			if (mod_hash_insert(hashp,
595 			    (mod_hash_key_t)&gcgrp->gcgrp_addr,
596 			    (mod_hash_val_t)gcgrp) != 0) {
597 				mutex_exit(&gcgrp_lock);
598 				kmem_free(gcgrp, sizeof (*gcgrp));
599 				return (NULL);
600 			}
601 
602 			DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__insert,
603 			    char *, "inserted gcgrp(1) in hash(2)",
604 			    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
605 		}
606 	}
607 	mutex_exit(&gcgrp_lock);
608 	return (gcgrp);
609 }
610 
611 void
612 gcgrp_inactive(tsol_gcgrp_t *gcgrp)
613 {
614 	tsol_gcgrp_addr_t *ga;
615 	mod_hash_t *hashp;
616 
617 	ASSERT(MUTEX_HELD(&gcgrp_lock));
618 	ASSERT(!RW_LOCK_HELD(&gcgrp->gcgrp_rwlock));
619 	ASSERT(gcgrp != NULL && gcgrp->gcgrp_refcnt == 0);
620 	ASSERT(gcgrp->gcgrp_head == NULL && gcgrp->gcgrp_count == 0);
621 
622 	ga = &gcgrp->gcgrp_addr;
623 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
624 
625 	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
626 	(void) mod_hash_remove(hashp, (mod_hash_key_t)ga,
627 	    (mod_hash_val_t *)&gcgrp);
628 	rw_destroy(&gcgrp->gcgrp_rwlock);
629 
630 	DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__remove, char *,
631 	    "removed inactive gcgrp(1) from hash(2)",
632 	    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
633 
634 	kmem_free(gcgrp, sizeof (*gcgrp));
635 }
636 
637 /*
638  * Converts CIPSO option to sensitivity label.
639  * Validity checks based on restrictions defined in
640  * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2) (draft-ietf-cipso-ipsecurity)
641  */
642 static boolean_t
643 cipso_to_sl(const uchar_t *option, bslabel_t *sl)
644 {
645 	const struct cipso_option *co = (const struct cipso_option *)option;
646 	const struct cipso_tag_type_1 *tt1;
647 
648 	tt1 = (struct cipso_tag_type_1 *)&co->cipso_tag_type[0];
649 	if (tt1->tag_type != 1 ||
650 	    tt1->tag_length < TSOL_TT1_MIN_LENGTH ||
651 	    tt1->tag_length > TSOL_TT1_MAX_LENGTH ||
652 	    tt1->tag_length + TSOL_CIPSO_TAG_OFFSET > co->cipso_length)
653 		return (B_FALSE);
654 
655 	bsllow(sl);	/* assumed: sets compartments to all zeroes */
656 	LCLASS_SET((_bslabel_impl_t *)sl, tt1->tag_sl);
657 	bcopy(tt1->tag_cat, &((_bslabel_impl_t *)sl)->compartments,
658 	    tt1->tag_length - TSOL_TT1_MIN_LENGTH);
659 	return (B_TRUE);
660 }
661 
662 /*
663  * Parse the CIPSO label in the incoming packet and construct a ts_label_t
664  * that reflects the CIPSO label and attach it to the dblk cred. Later as
665  * the mblk flows up through the stack any code that needs to examine the
666  * packet label can inspect the label from the dblk cred. This function is
667  * called right in ip_rput for all packets, i.e. locally destined and
668  * to be forwarded packets. The forwarding path needs to examine the label
669  * to determine how to forward the packet.
670  *
671  * For IPv4, IP header options have been pulled up, but other headers might not
672  * have been.  For IPv6, any hop-by-hop options have been pulled up, but any
673  * other headers might not be present.
674  */
675 boolean_t
676 tsol_get_pkt_label(mblk_t *mp, int version)
677 {
678 	tsol_tpc_t	*src_rhtp;
679 	uchar_t		*opt_ptr = NULL;
680 	const ipha_t	*ipha;
681 	bslabel_t	sl;
682 	uint32_t	doi;
683 	tsol_ip_label_t	label_type;
684 	const cipso_option_t *co;
685 	const void	*src;
686 	const ip6_t	*ip6h;
687 
688 	ASSERT(DB_TYPE(mp) == M_DATA);
689 
690 	if (version == IPV4_VERSION) {
691 		ipha = (const ipha_t *)mp->b_rptr;
692 		src = &ipha->ipha_src;
693 		label_type = tsol_get_option(mp, &opt_ptr);
694 	} else {
695 		uchar_t		*after_secopt;
696 		boolean_t	hbh_needed;
697 		const uchar_t	*ip6hbh;
698 		size_t		optlen;
699 
700 		label_type = OPT_NONE;
701 		ip6h = (const ip6_t *)mp->b_rptr;
702 		src = &ip6h->ip6_src;
703 		if (ip6h->ip6_nxt == IPPROTO_HOPOPTS) {
704 			ip6hbh = (const uchar_t *)&ip6h[1];
705 			optlen = (ip6hbh[1] + 1) << 3;
706 			ASSERT(ip6hbh + optlen <= mp->b_wptr);
707 			opt_ptr = tsol_find_secopt_v6(ip6hbh, optlen,
708 			    &after_secopt, &hbh_needed);
709 			/* tsol_find_secopt_v6 guarantees some sanity */
710 			if (opt_ptr != NULL &&
711 			    (optlen = opt_ptr[1]) >= 8) {
712 				opt_ptr += 2;
713 				bcopy(opt_ptr, &doi, sizeof (doi));
714 				doi = ntohl(doi);
715 				if (doi == IP6LS_DOI_V4 &&
716 				    opt_ptr[4] == IP6LS_TT_V4 &&
717 				    opt_ptr[5] <= optlen - 4 &&
718 				    opt_ptr[7] <= optlen - 6) {
719 					opt_ptr += sizeof (doi) + 2;
720 					label_type = OPT_CIPSO;
721 				}
722 			}
723 		}
724 	}
725 
726 	switch (label_type) {
727 	case OPT_CIPSO:
728 		/*
729 		 * Convert the CIPSO label to the internal format
730 		 * and attach it to the dblk cred.
731 		 * Validity checks based on restrictions defined in
732 		 * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2)
733 		 * (draft-ietf-cipso-ipsecurity)
734 		 */
735 		if (version == IPV6_VERSION && ip6opt_ls == 0)
736 			return (B_FALSE);
737 		co = (const struct cipso_option *)opt_ptr;
738 		if ((co->cipso_length <
739 		    TSOL_CIPSO_TAG_OFFSET + TSOL_TT1_MIN_LENGTH) ||
740 		    (co->cipso_length > IP_MAX_OPT_LENGTH))
741 			return (B_FALSE);
742 		bcopy(co->cipso_doi, &doi, sizeof (doi));
743 		doi = ntohl(doi);
744 		if (!cipso_to_sl(opt_ptr, &sl))
745 			return (B_FALSE);
746 		setbltype(&sl, SUN_SL_ID);
747 		break;
748 
749 	case OPT_NONE:
750 		/*
751 		 * Handle special cases that are not currently labeled, even
752 		 * though the sending system may otherwise be configured as
753 		 * labeled.
754 		 *	- IGMP
755 		 *	- IPv4 ICMP Router Discovery
756 		 *	- IPv6 Neighbor Discovery
757 		 */
758 		if (version == IPV4_VERSION) {
759 			if (ipha->ipha_protocol == IPPROTO_IGMP)
760 				return (B_TRUE);
761 			if (ipha->ipha_protocol == IPPROTO_ICMP) {
762 				const struct icmp *icmp = (const struct icmp *)
763 				    (mp->b_rptr + IPH_HDR_LENGTH(ipha));
764 
765 				if ((uchar_t *)icmp > mp->b_wptr) {
766 					if (!pullupmsg(mp,
767 					    (uchar_t *)icmp - mp->b_rptr + 1))
768 						return (B_FALSE);
769 					icmp = (const struct icmp *)
770 					    (mp->b_rptr +
771 					    IPH_HDR_LENGTH(ipha));
772 				}
773 				if (icmp->icmp_type == ICMP_ROUTERADVERT ||
774 				    icmp->icmp_type == ICMP_ROUTERSOLICIT)
775 					return (B_TRUE);
776 			}
777 			src = &ipha->ipha_src;
778 		} else {
779 			if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
780 				const icmp6_t *icmp6 = (const icmp6_t *)
781 				    (mp->b_rptr + IPV6_HDR_LEN);
782 
783 				if ((uchar_t *)icmp6 + ICMP6_MINLEN >
784 				    mp->b_wptr) {
785 					if (!pullupmsg(mp,
786 					    (uchar_t *)icmp6 - mp->b_rptr +
787 					    ICMP6_MINLEN))
788 						return (B_FALSE);
789 					icmp6 = (const icmp6_t *)
790 					    (mp->b_rptr + IPV6_HDR_LEN);
791 				}
792 				if (icmp6->icmp6_type >= MLD_LISTENER_QUERY &&
793 				    icmp6->icmp6_type <= ICMP6_MAX_INFO_TYPE)
794 					return (B_TRUE);
795 			}
796 			src = &ip6h->ip6_src;
797 		}
798 
799 		/*
800 		 * Look up the tnrhtp database and get the implicit label
801 		 * that is associated with this unlabeled host and attach
802 		 * it to the packet.
803 		 */
804 		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
805 			return (B_FALSE);
806 
807 		/* If the sender is labeled, drop the unlabeled packet. */
808 		if (src_rhtp->tpc_tp.host_type != UNLABELED) {
809 			TPC_RELE(src_rhtp);
810 			pr_addr_dbg("unlabeled packet forged from %s\n",
811 			    version == IPV4_VERSION ? AF_INET : AF_INET6, src);
812 			return (B_FALSE);
813 		}
814 
815 		sl = src_rhtp->tpc_tp.tp_def_label;
816 		setbltype(&sl, SUN_SL_ID);
817 		doi = src_rhtp->tpc_tp.tp_doi;
818 		TPC_RELE(src_rhtp);
819 		break;
820 
821 	default:
822 		return (B_FALSE);
823 	}
824 
825 	/* Make sure no other thread is messing with this mblk */
826 	ASSERT(DB_REF(mp) == 1);
827 	if (DB_CRED(mp) == NULL) {
828 		DB_CRED(mp) = newcred_from_bslabel(&sl, doi, KM_NOSLEEP);
829 		if (DB_CRED(mp) == NULL)
830 			return (B_FALSE);
831 	} else {
832 		cred_t	*newcr;
833 
834 		newcr = copycred_from_bslabel(DB_CRED(mp), &sl, doi,
835 		    KM_NOSLEEP);
836 		if (newcr == NULL)
837 			return (B_FALSE);
838 		crfree(DB_CRED(mp));
839 		DB_CRED(mp) = newcr;
840 	}
841 
842 	/*
843 	 * If the source was unlabeled, then flag as such,
844 	 * while remembering that CIPSO routers add headers.
845 	 */
846 	if (label_type == OPT_NONE)
847 		crgetlabel(DB_CRED(mp))->tsl_flags |= TSLF_UNLABELED;
848 	else if (label_type == OPT_CIPSO) {
849 		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
850 			return (B_FALSE);
851 		if (src_rhtp->tpc_tp.host_type == UNLABELED)
852 		    crgetlabel(DB_CRED(mp))->tsl_flags |=
853 		    TSLF_UNLABELED;
854 		TPC_RELE(src_rhtp);
855 	}
856 
857 	return (B_TRUE);
858 }
859 
860 /*
861  * This routine determines whether the given packet should be accepted locally.
862  * It does a range/set check on the packet's label by looking up the given
863  * address in the remote host database.
864  */
865 boolean_t
866 tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
867     boolean_t shared_addr, const conn_t *connp)
868 {
869 	const cred_t *credp;
870 	ts_label_t *plabel, *conn_plabel;
871 	tsol_tpc_t *tp;
872 	boolean_t retv;
873 	const bslabel_t *label, *conn_label;
874 
875 	/*
876 	 * The cases in which this can happen are:
877 	 *	- IPv6 Router Alert, where ip_rput_data_v6 deliberately skips
878 	 *	  over the label attachment process.
879 	 *	- MLD output looped-back to ourselves.
880 	 *	- IPv4 Router Discovery, where tsol_get_pkt_label intentionally
881 	 *	  avoids the labeling process.
882 	 * We trust that all valid paths in the code set the cred pointer when
883 	 * needed.
884 	 */
885 	if ((credp = DB_CRED(mp)) == NULL)
886 		return (B_TRUE);
887 
888 	/*
889 	 * If this packet is from the inside (not a remote host) and has the
890 	 * same zoneid as the selected destination, then no checks are
891 	 * necessary.  Membership in the zone is enough proof.  This is
892 	 * intended to be a hot path through this function.
893 	 */
894 	if (!crisremote(credp) &&
895 	    crgetzone(credp) == crgetzone(connp->conn_cred))
896 		return (B_TRUE);
897 
898 	plabel = crgetlabel(credp);
899 	conn_plabel = crgetlabel(connp->conn_cred);
900 	ASSERT(plabel != NULL && conn_plabel != NULL);
901 
902 	label = label2bslabel(plabel);
903 	conn_label = label2bslabel(crgetlabel(connp->conn_cred));
904 
905 	/*
906 	 * MLPs are always validated using the range and set of the local
907 	 * address, even when the remote host is unlabeled.
908 	 */
909 	if (connp->conn_mlp_type == mlptBoth ||
910 	/* LINTED: no consequent */
911 	    connp->conn_mlp_type == (shared_addr ? mlptShared : mlptPrivate)) {
912 		;
913 
914 	/*
915 	 * If this is a packet from an unlabeled sender, then we must apply
916 	 * different rules.  If the label is equal to the zone's label, then
917 	 * it's allowed.  If it's not equal, but the zone is either the global
918 	 * zone or the label is dominated by the zone's label, then allow it
919 	 * as long as it's in the range configured for the destination.
920 	 */
921 	} else if (plabel->tsl_flags & TSLF_UNLABELED) {
922 		if (plabel->tsl_doi == conn_plabel->tsl_doi &&
923 		    blequal(label, conn_label))
924 			return (B_TRUE);
925 
926 		if (!connp->conn_mac_exempt ||
927 		    (connp->conn_zoneid != GLOBAL_ZONEID &&
928 		    (plabel->tsl_doi != conn_plabel->tsl_doi ||
929 		    !bldominates(conn_label, label)))) {
930 			DTRACE_PROBE3(
931 			    tx__ip__log__drop__receivelocal__mac_unl,
932 			    char *,
933 			    "unlabeled packet mp(1) fails mac for conn(2)",
934 			    mblk_t *, mp, conn_t *, connp);
935 			return (B_FALSE);
936 		}
937 
938 	/*
939 	 * If this is a private address and the connection is SLP for private
940 	 * addresses, then the only thing that matters is the label on the
941 	 * zone, which is the same as the label on the connection.  We don't
942 	 * care (and don't have to care) about the tnrhdb.
943 	 */
944 	} else if (!shared_addr) {
945 		/*
946 		 * Since this is a zone-specific address, we know that any MLP
947 		 * case should have been handled up above.  That means this
948 		 * connection must not be MLP for zone-specific addresses.  We
949 		 * assert that to be true.
950 		 */
951 		ASSERT(connp->conn_mlp_type == mlptSingle ||
952 		    connp->conn_mlp_type == mlptShared);
953 		if (plabel->tsl_doi == conn_plabel->tsl_doi &&
954 		    blequal(label, conn_label))
955 			return (B_TRUE);
956 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac__slp,
957 		    char *, "packet mp(1) fails exactly SLP match conn(2)",
958 		    mblk_t *, mp, conn_t *, connp);
959 		return (B_FALSE);
960 	}
961 
962 	tp = find_tpc(addr, version, B_FALSE);
963 	if (tp == NULL) {
964 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__no__tnr,
965 		    char *, "dropping mp(1), host(2) lacks entry",
966 		    mblk_t *, mp, void *, addr);
967 		return (B_FALSE);
968 	}
969 
970 	/*
971 	 * The local host address should not be unlabeled at this point.  The
972 	 * only way this can happen is that the destination isn't unicast.  We
973 	 * assume that the packet should not have had a label, and thus should
974 	 * have been handled by the TSLF_UNLABELED logic above.
975 	 */
976 	if (tp->tpc_tp.host_type == UNLABELED) {
977 		retv = B_FALSE;
978 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__flag, char *,
979 		    "mp(1) unlabeled source, but tp is not unlabeled.",
980 		    mblk_t *, mp, tsol_tpc_t *, tp);
981 
982 	} else if (tp->tpc_tp.host_type != SUN_CIPSO) {
983 		retv = B_FALSE;
984 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__tptype, char *,
985 		    "delivering mp(1), found unrecognized tpc(2) type.",
986 		    mblk_t *, mp, tsol_tpc_t *, tp);
987 
988 	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
989 		retv = B_FALSE;
990 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
991 		    "mp(1) could not be delievered to tp(2), doi mismatch",
992 		    mblk_t *, mp, tsol_tpc_t *, tp);
993 
994 	} else if (!_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) &&
995 	    !blinlset(label, tp->tpc_tp.tp_sl_set_cipso)) {
996 		retv = B_FALSE;
997 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
998 		    "mp(1) could not be delievered to tp(2), bad mac",
999 		    mblk_t *, mp, tsol_tpc_t *, tp);
1000 	} else {
1001 		retv = B_TRUE;
1002 	}
1003 
1004 	TPC_RELE(tp);
1005 
1006 	return (retv);
1007 }
1008 
1009 boolean_t
1010 tsol_can_accept_raw(mblk_t *mp, boolean_t check_host)
1011 {
1012 	ts_label_t	*plabel = NULL;
1013 	tsol_tpc_t	*src_rhtp, *dst_rhtp;
1014 	boolean_t	retv;
1015 
1016 	if (DB_CRED(mp) != NULL)
1017 		plabel = crgetlabel(DB_CRED(mp));
1018 
1019 	/* We are bootstrapping or the internal template was never deleted */
1020 	if (plabel == NULL)
1021 		return (B_TRUE);
1022 
1023 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1024 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
1025 
1026 		src_rhtp = find_tpc(&ipha->ipha_src, IPV4_VERSION,
1027 		    B_FALSE);
1028 		if (src_rhtp == NULL)
1029 			return (B_FALSE);
1030 		dst_rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION,
1031 		    B_FALSE);
1032 	} else {
1033 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1034 
1035 		src_rhtp = find_tpc(&ip6h->ip6_src, IPV6_VERSION,
1036 		    B_FALSE);
1037 		if (src_rhtp == NULL)
1038 			return (B_FALSE);
1039 		dst_rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION,
1040 		    B_FALSE);
1041 	}
1042 	if (dst_rhtp == NULL) {
1043 		TPC_RELE(src_rhtp);
1044 		return (B_FALSE);
1045 	}
1046 
1047 	if (label2doi(plabel) != src_rhtp->tpc_tp.tp_doi) {
1048 		retv = B_FALSE;
1049 
1050 	/*
1051 	 * Check that the packet's label is in the correct range for labeled
1052 	 * sender, or is equal to the default label for unlabeled sender.
1053 	 */
1054 	} else if ((src_rhtp->tpc_tp.host_type != UNLABELED &&
1055 	    !_blinrange(label2bslabel(plabel),
1056 	    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
1057 	    !blinlset(label2bslabel(plabel),
1058 	    src_rhtp->tpc_tp.tp_sl_set_cipso)) ||
1059 	    (src_rhtp->tpc_tp.host_type == UNLABELED &&
1060 	    !blequal(&plabel->tsl_label, &src_rhtp->tpc_tp.tp_def_label))) {
1061 		retv = B_FALSE;
1062 
1063 	} else if (check_host) {
1064 		retv = B_TRUE;
1065 
1066 	/*
1067 	 * Until we have SL range in the Zone structure, pass it
1068 	 * when our own address lookup returned an internal entry.
1069 	 */
1070 	} else switch (dst_rhtp->tpc_tp.host_type) {
1071 	case UNLABELED:
1072 		retv = B_TRUE;
1073 		break;
1074 
1075 	case SUN_CIPSO:
1076 		retv = _blinrange(label2bslabel(plabel),
1077 		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) ||
1078 		    blinlset(label2bslabel(plabel),
1079 		    dst_rhtp->tpc_tp.tp_sl_set_cipso);
1080 		break;
1081 
1082 	default:
1083 		retv = B_FALSE;
1084 	}
1085 	TPC_RELE(src_rhtp);
1086 	TPC_RELE(dst_rhtp);
1087 	return (retv);
1088 }
1089 
1090 /*
1091  * This routine determines whether a response to a failed packet delivery or
1092  * connection should be sent back.  By default, the policy is to allow such
1093  * messages to be sent at all times, as these messages reveal little useful
1094  * information and are healthy parts of TCP/IP networking.
1095  *
1096  * If tsol_strict_error is set, then we do strict tests: if the packet label is
1097  * within the label range/set of this host/zone, return B_TRUE; otherwise
1098  * return B_FALSE, which causes the packet to be dropped silently.
1099  *
1100  * Note that tsol_get_pkt_label will cause the packet to drop if the sender is
1101  * marked as labeled in the remote host database, but the packet lacks a label.
1102  * This means that we don't need to do a lookup on the source; the
1103  * TSLF_UNLABELED flag is sufficient.
1104  */
1105 boolean_t
1106 tsol_can_reply_error(const mblk_t *mp)
1107 {
1108 	ts_label_t	*plabel = NULL;
1109 	tsol_tpc_t	*rhtp;
1110 	const ipha_t	*ipha;
1111 	const ip6_t	*ip6h;
1112 	boolean_t	retv;
1113 	bslabel_t	*pktbs;
1114 
1115 	/* Caller must pull up at least the IP header */
1116 	ASSERT(MBLKL(mp) >= (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ?
1117 	    sizeof (*ipha) : sizeof (*ip6h)));
1118 
1119 	if (!tsol_strict_error)
1120 		return (B_TRUE);
1121 
1122 	if (DB_CRED(mp) != NULL)
1123 		plabel = crgetlabel(DB_CRED(mp));
1124 
1125 	/* We are bootstrapping or the internal template was never deleted */
1126 	if (plabel == NULL)
1127 		return (B_TRUE);
1128 
1129 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1130 		ipha = (const ipha_t *)mp->b_rptr;
1131 		rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION, B_FALSE);
1132 	} else {
1133 		ip6h = (const ip6_t *)mp->b_rptr;
1134 		rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION, B_FALSE);
1135 	}
1136 
1137 	if (rhtp == NULL || label2doi(plabel) != rhtp->tpc_tp.tp_doi) {
1138 		retv = B_FALSE;
1139 	} else {
1140 		/*
1141 		 * If we're in the midst of forwarding, then the destination
1142 		 * address might not be labeled.  In that case, allow unlabeled
1143 		 * packets through only if the default label is the same, and
1144 		 * labeled ones if they dominate.
1145 		 */
1146 		pktbs = label2bslabel(plabel);
1147 		switch (rhtp->tpc_tp.host_type) {
1148 		case UNLABELED:
1149 			if (plabel->tsl_flags & TSLF_UNLABELED) {
1150 				retv = blequal(pktbs,
1151 				    &rhtp->tpc_tp.tp_def_label);
1152 			} else {
1153 				retv = bldominates(pktbs,
1154 				    &rhtp->tpc_tp.tp_def_label);
1155 			}
1156 			break;
1157 
1158 		case SUN_CIPSO:
1159 			retv = _blinrange(pktbs,
1160 			    &rhtp->tpc_tp.tp_sl_range_cipso) ||
1161 			    blinlset(pktbs, rhtp->tpc_tp.tp_sl_set_cipso);
1162 			break;
1163 
1164 		default:
1165 			retv = B_FALSE;
1166 			break;
1167 		}
1168 	}
1169 
1170 	if (rhtp != NULL)
1171 		TPC_RELE(rhtp);
1172 
1173 	return (retv);
1174 }
1175 
1176 /*
1177  * Finds the zone associated with the given packet.  Returns GLOBAL_ZONEID if
1178  * the zone cannot be located.
1179  *
1180  * This is used by the classifier when the packet matches an ALL_ZONES IRE, and
1181  * there's no MLP defined.
1182  */
1183 zoneid_t
1184 tsol_packet_to_zoneid(const mblk_t *mp)
1185 {
1186 	cred_t *cr = DB_CRED(mp);
1187 	zone_t *zone;
1188 	ts_label_t *label;
1189 
1190 	if (cr != NULL) {
1191 		if ((label = crgetlabel(cr)) != NULL) {
1192 			zone = zone_find_by_label(label);
1193 			if (zone != NULL) {
1194 				zoneid_t zoneid = zone->zone_id;
1195 
1196 				zone_rele(zone);
1197 				return (zoneid);
1198 			}
1199 		}
1200 	}
1201 	return (GLOBAL_ZONEID);
1202 }
1203 
1204 int
1205 tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
1206 {
1207 	int		error = 0;
1208 	tsol_ire_gw_secattr_t *attrp = NULL;
1209 	tsol_tnrhc_t	*gw_rhc = NULL;
1210 	tsol_gcgrp_t	*gcgrp = NULL;
1211 	tsol_gc_t	*gc = NULL;
1212 	in_addr_t	ga_addr4;
1213 	void		*paddr = NULL;
1214 
1215 	/* Not in Trusted mode or IRE is local/loopback/broadcast/interface */
1216 	if (!is_system_labeled() ||
1217 	    (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
1218 	    IRE_INTERFACE)))
1219 		goto done;
1220 
1221 	/*
1222 	 * If we don't have a label to compare with, or the IRE does not
1223 	 * contain any gateway security attributes, there's not much that
1224 	 * we can do.  We let the former case pass, and the latter fail,
1225 	 * since the IRE doesn't qualify for a match due to the lack of
1226 	 * security attributes.
1227 	 */
1228 	if (tsl == NULL || ire->ire_gw_secattr == NULL) {
1229 		if (tsl != NULL) {
1230 			DTRACE_PROBE3(
1231 			tx__ip__log__drop__irematch__nogwsec, char *,
1232 			"ire(1) lacks ire_gw_secattr when matching label(2)",
1233 			ire_t *, ire, ts_label_t *, tsl);
1234 			error = EACCES;
1235 		}
1236 		goto done;
1237 	}
1238 
1239 	attrp = ire->ire_gw_secattr;
1240 
1241 	/*
1242 	 * The possible lock order scenarios related to the tsol gateway
1243 	 * attribute locks are documented at the beginning of ip.c in the
1244 	 * lock order scenario section.
1245 	 */
1246 	mutex_enter(&attrp->igsa_lock);
1247 
1248 	/*
1249 	 * Depending on the IRE type (prefix vs. cache), we seek the group
1250 	 * structure which contains all security credentials of the gateway.
1251 	 * A prefix IRE is associated with at most one gateway credential,
1252 	 * while a cache IRE is associated with every credentials that the
1253 	 * gateway has.
1254 	 */
1255 	if ((gc = attrp->igsa_gc) != NULL) {			/* prefix */
1256 		gcgrp = gc->gc_grp;
1257 		ASSERT(gcgrp != NULL);
1258 		rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1259 	} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {	/* cache */
1260 		rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
1261 		gc = gcgrp->gcgrp_head;
1262 		if (gc == NULL) {
1263 			/* gc group is empty, so the drop lock now */
1264 			ASSERT(gcgrp->gcgrp_count == 0);
1265 			rw_exit(&gcgrp->gcgrp_rwlock);
1266 			gcgrp = NULL;
1267 		}
1268 	}
1269 
1270 	if (gcgrp != NULL)
1271 		GCGRP_REFHOLD(gcgrp);
1272 
1273 	if ((gw_rhc = attrp->igsa_rhc) != NULL) {
1274 		/*
1275 		 * If our cached entry has grown stale, then discard it so we
1276 		 * can get a new one.
1277 		 */
1278 		if (gw_rhc->rhc_invalid || gw_rhc->rhc_tpc->tpc_invalid) {
1279 			TNRHC_RELE(gw_rhc);
1280 			attrp->igsa_rhc = gw_rhc = NULL;
1281 		} else {
1282 			TNRHC_HOLD(gw_rhc)
1283 		}
1284 	}
1285 
1286 	/* Last attempt at loading the template had failed; try again */
1287 	if (gw_rhc == NULL) {
1288 		if (gcgrp != NULL) {
1289 			tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;
1290 
1291 			if (ire->ire_ipversion == IPV4_VERSION) {
1292 				ASSERT(ga->ga_af == AF_INET);
1293 				IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
1294 				paddr = &ga_addr4;
1295 			} else {
1296 				ASSERT(ga->ga_af == AF_INET6);
1297 				paddr = &ga->ga_addr;
1298 			}
1299 		} else if (ire->ire_ipversion == IPV6_VERSION &&
1300 		    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
1301 			paddr = &ire->ire_gateway_addr_v6;
1302 		} else if (ire->ire_ipversion == IPV4_VERSION &&
1303 		    ire->ire_gateway_addr != INADDR_ANY) {
1304 			paddr = &ire->ire_gateway_addr;
1305 		}
1306 
1307 		/* We've found a gateway address to do the template lookup */
1308 		if (paddr != NULL) {
1309 			ASSERT(gw_rhc == NULL);
1310 			gw_rhc = find_rhc(paddr, ire->ire_ipversion, B_FALSE);
1311 			if (gw_rhc != NULL) {
1312 				/*
1313 				 * Note that if the lookup above returned an
1314 				 * internal template, we'll use it for the
1315 				 * time being, and do another lookup next
1316 				 * time around.
1317 				 */
1318 				/* Another thread has loaded the template? */
1319 				if (attrp->igsa_rhc != NULL) {
1320 					TNRHC_RELE(gw_rhc)
1321 					/* reload, it could be different */
1322 					gw_rhc = attrp->igsa_rhc;
1323 				} else {
1324 					attrp->igsa_rhc = gw_rhc;
1325 				}
1326 				/*
1327 				 * Hold an extra reference just like we did
1328 				 * above prior to dropping the igsa_lock.
1329 				 */
1330 				TNRHC_HOLD(gw_rhc)
1331 			}
1332 		}
1333 	}
1334 
1335 	mutex_exit(&attrp->igsa_lock);
1336 	/* Gateway template not found */
1337 	if (gw_rhc == NULL) {
1338 		/*
1339 		 * If destination address is directly reachable through an
1340 		 * interface rather than through a learned route, pass it.
1341 		 */
1342 		if (paddr != NULL) {
1343 			DTRACE_PROBE3(
1344 			    tx__ip__log__drop__irematch__nogwtmpl, char *,
1345 			    "ire(1), label(2) off-link with no gw_rhc",
1346 			    ire_t *, ire, ts_label_t *, tsl);
1347 			error = EINVAL;
1348 		}
1349 		goto done;
1350 	}
1351 
1352 	if (gc != NULL) {
1353 		tsol_gcdb_t *gcdb;
1354 		/*
1355 		 * In the case of IRE_CACHE we've got one or more gateway
1356 		 * security credentials to compare against the passed in label.
1357 		 * Perform label range comparison against each security
1358 		 * credential of the gateway. In the case of a prefix ire
1359 		 * we need to match against the security attributes of
1360 		 * just the route itself, so the loop is executed only once.
1361 		 */
1362 		ASSERT(gcgrp != NULL);
1363 		do {
1364 			gcdb = gc->gc_db;
1365 			if (tsl->tsl_doi == gcdb->gcdb_doi &&
1366 			    _blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange))
1367 				break;
1368 			if (ire->ire_type == IRE_CACHE)
1369 				gc = gc->gc_next;
1370 			else
1371 				gc = NULL;
1372 		} while (gc != NULL);
1373 
1374 		if (gc == NULL) {
1375 			DTRACE_PROBE3(
1376 			    tx__ip__log__drop__irematch__nogcmatched,
1377 			    char *, "ire(1), tsl(2): all gc failed match",
1378 			    ire_t *, ire, ts_label_t *, tsl);
1379 			error = EACCES;
1380 		}
1381 	} else {
1382 		/*
1383 		 * We didn't find any gateway credentials in the IRE
1384 		 * attributes; fall back to the gateway's template for
1385 		 * label range checks, if we are required to do so.
1386 		 */
1387 		ASSERT(gw_rhc != NULL);
1388 		switch (gw_rhc->rhc_tpc->tpc_tp.host_type) {
1389 		case SUN_CIPSO:
1390 			if (tsl->tsl_doi !=
1391 			    gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
1392 			    (!_blinrange(&tsl->tsl_label,
1393 			    &gw_rhc->rhc_tpc->tpc_tp.
1394 			    tp_sl_range_cipso) &&
1395 			    !blinlset(&tsl->tsl_label,
1396 			    gw_rhc->rhc_tpc->tpc_tp.tp_sl_set_cipso))) {
1397 				error = EACCES;
1398 				DTRACE_PROBE4(
1399 				    tx__ip__log__drop__irematch__deftmpl,
1400 				    char *, "ire(1), tsl(2), gw_rhc(3) "
1401 				    "failed match (cipso gw)",
1402 				    ire_t *, ire, ts_label_t *, tsl,
1403 				    tsol_tnrhc_t *, gw_rhc);
1404 			}
1405 			break;
1406 
1407 		case UNLABELED:
1408 			if (tsl->tsl_doi !=
1409 				gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
1410 			    (!_blinrange(&tsl->tsl_label,
1411 			    &gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_range) &&
1412 			    !blinlset(&tsl->tsl_label,
1413 			    gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_set))) {
1414 				error = EACCES;
1415 				DTRACE_PROBE4(
1416 				    tx__ip__log__drop__irematch__deftmpl,
1417 				    char *, "ire(1), tsl(2), gw_rhc(3) "
1418 				    "failed match (unlabeled gw)",
1419 				    ire_t *, ire, ts_label_t *, tsl,
1420 				    tsol_tnrhc_t *, gw_rhc);
1421 			}
1422 			break;
1423 		}
1424 	}
1425 
1426 done:
1427 
1428 	if (gcgrp != NULL) {
1429 		rw_exit(&gcgrp->gcgrp_rwlock);
1430 		GCGRP_REFRELE(gcgrp);
1431 	}
1432 
1433 	if (gw_rhc != NULL)
1434 		TNRHC_RELE(gw_rhc)
1435 
1436 	return (error);
1437 }
1438 
1439 /*
1440  * Performs label accreditation checks for packet forwarding.
1441  *
1442  * Returns a pointer to the modified mblk if allowed for forwarding,
1443  * or NULL if the packet must be dropped.
1444  */
1445 mblk_t *
1446 tsol_ip_forward(ire_t *ire, mblk_t *mp)
1447 {
1448 	tsol_ire_gw_secattr_t *attrp = NULL;
1449 	ipha_t		*ipha;
1450 	ip6_t		*ip6h;
1451 	const void	*pdst;
1452 	const void	*psrc;
1453 	boolean_t	off_link;
1454 	tsol_tpc_t	*dst_rhtp, *gw_rhtp;
1455 	tsol_ip_label_t label_type;
1456 	uchar_t		*opt_ptr = NULL;
1457 	ts_label_t	*tsl;
1458 	uint8_t		proto;
1459 	int		af, adjust;
1460 	uint16_t	iplen;
1461 	boolean_t	need_tpc_rele = B_FALSE;
1462 	ipaddr_t	*gw;
1463 
1464 	ASSERT(ire != NULL && mp != NULL);
1465 	ASSERT(ire->ire_stq != NULL);
1466 
1467 	af = (ire->ire_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6;
1468 
1469 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1470 		ASSERT(ire->ire_ipversion == IPV4_VERSION);
1471 		ipha = (ipha_t *)mp->b_rptr;
1472 		psrc = &ipha->ipha_src;
1473 		pdst = &ipha->ipha_dst;
1474 		proto = ipha->ipha_protocol;
1475 
1476 		/*
1477 		 * off_link is TRUE if destination not directly reachable.
1478 		 * Surya note: we avoid creation of per-dst IRE_CACHE entries
1479 		 * for forwarded packets, so we set off_link to be TRUE
1480 		 * if the packet dst is different from the ire_addr of
1481 		 * the ire for the nexthop.
1482 		 */
1483 		off_link = ((ipha->ipha_dst != ire->ire_addr) ||
1484 		    (ire->ire_gateway_addr != INADDR_ANY));
1485 	} else {
1486 		ASSERT(ire->ire_ipversion == IPV6_VERSION);
1487 		ip6h = (ip6_t *)mp->b_rptr;
1488 		psrc = &ip6h->ip6_src;
1489 		pdst = &ip6h->ip6_dst;
1490 		proto = ip6h->ip6_nxt;
1491 
1492 		if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
1493 		    proto != IPPROTO_ICMPV6) {
1494 			uint8_t *nexthdrp;
1495 			uint16_t hdr_len;
1496 
1497 			if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len,
1498 			    &nexthdrp)) {
1499 				/* malformed packet; drop it */
1500 				return (NULL);
1501 			}
1502 			proto = *nexthdrp;
1503 		}
1504 
1505 		/* destination not directly reachable? */
1506 		off_link = !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
1507 	}
1508 
1509 	if ((tsl = MBLK_GETLABEL(mp)) == NULL)
1510 		return (mp);
1511 
1512 	label_type = tsol_get_option(mp, &opt_ptr);
1513 
1514 	ASSERT(psrc != NULL && pdst != NULL);
1515 	dst_rhtp = find_tpc(pdst, ire->ire_ipversion, B_FALSE);
1516 
1517 	if (dst_rhtp == NULL) {
1518 		/*
1519 		 * Without a template we do not know if forwarding
1520 		 * violates MAC
1521 		 */
1522 		DTRACE_PROBE3(tx__ip__log__drop__forward__nodst, char *,
1523 		    "mp(1) dropped, no template for destination ip4|6(2)",
1524 		    mblk_t *, mp, void *, pdst);
1525 		return (NULL);
1526 	}
1527 
1528 	/*
1529 	 * Gateway template must have existed for off-link destinations,
1530 	 * since tsol_ire_match_gwattr has ensured such condition.
1531 	 */
1532 	if (ire->ire_ipversion == IPV4_VERSION && off_link) {
1533 		/*
1534 		 * Surya note: first check if we can get the gw_rhtp from
1535 		 * the ire_gw_secattr->igsa_rhc; if this is null, then
1536 		 * do a lookup based on the ire_addr (address of gw)
1537 		 */
1538 		if (ire->ire_gw_secattr != NULL &&
1539 		    ire->ire_gw_secattr->igsa_rhc != NULL) {
1540 			attrp = ire->ire_gw_secattr;
1541 			gw_rhtp = attrp->igsa_rhc->rhc_tpc;
1542 		} else  {
1543 			/*
1544 			 * use the ire_addr if this is the IRE_CACHE of nexthop
1545 			 */
1546 			gw = (ire->ire_gateway_addr == NULL? &ire->ire_addr :
1547 			    &ire->ire_gateway_addr);
1548 			gw_rhtp = find_tpc(gw, ire->ire_ipversion, B_FALSE);
1549 			need_tpc_rele = B_TRUE;
1550 		}
1551 		if (gw_rhtp == NULL) {
1552 			DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1553 			    "mp(1) dropped, no gateway in ire attributes(2)",
1554 			    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1555 			mp = NULL;
1556 			goto keep_label;
1557 		}
1558 	}
1559 	if (ire->ire_ipversion == IPV6_VERSION &&
1560 	    ((attrp = ire->ire_gw_secattr) == NULL || attrp->igsa_rhc == NULL ||
1561 	    (gw_rhtp = attrp->igsa_rhc->rhc_tpc) == NULL) && off_link) {
1562 		DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1563 		    "mp(1) dropped, no gateway in ire attributes(2)",
1564 		    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1565 		mp = NULL;
1566 		goto keep_label;
1567 	}
1568 
1569 	/*
1570 	 * Check that the label for the packet is acceptable
1571 	 * by destination host; otherwise, drop it.
1572 	 */
1573 	switch (dst_rhtp->tpc_tp.host_type) {
1574 	case SUN_CIPSO:
1575 		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1576 		    (!_blinrange(&tsl->tsl_label,
1577 		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) &&
1578 		    !blinlset(&tsl->tsl_label,
1579 		    dst_rhtp->tpc_tp.tp_sl_set_cipso))) {
1580 			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1581 			    "labeled packet mp(1) dropped, label(2) fails "
1582 			    "destination(3) accredation check",
1583 			    mblk_t *, mp, ts_label_t *, tsl,
1584 			    tsol_tpc_t *, dst_rhtp);
1585 			mp = NULL;
1586 			goto keep_label;
1587 		}
1588 		break;
1589 
1590 
1591 	case UNLABELED:
1592 		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1593 		    !blequal(&dst_rhtp->tpc_tp.tp_def_label,
1594 		    &tsl->tsl_label)) {
1595 			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1596 			    "unlabeled packet mp(1) dropped, label(2) fails "
1597 			    "destination(3) accredation check",
1598 			    mblk_t *, mp, ts_label_t *, tsl,
1599 			    tsol_tpc_t *, dst_rhtp);
1600 			mp = NULL;
1601 			goto keep_label;
1602 		}
1603 		break;
1604 	}
1605 	if (label_type == OPT_CIPSO) {
1606 		/*
1607 		 * We keep the label on any of the following cases:
1608 		 *
1609 		 *   1. The destination is labeled (on/off-link).
1610 		 *   2. The unlabeled destination is off-link,
1611 		 *	and the next hop gateway is labeled.
1612 		 */
1613 		if (dst_rhtp->tpc_tp.host_type != UNLABELED ||
1614 		    (off_link &&
1615 		    gw_rhtp->tpc_tp.host_type != UNLABELED))
1616 			goto keep_label;
1617 
1618 		/*
1619 		 * Strip off the CIPSO option from the packet because: the
1620 		 * unlabeled destination host is directly reachable through
1621 		 * an interface (on-link); or, the unlabeled destination host
1622 		 * is not directly reachable (off-link), and the next hop
1623 		 * gateway is unlabeled.
1624 		 */
1625 		adjust = (af == AF_INET) ? tsol_remove_secopt(ipha, MBLKL(mp)) :
1626 		    tsol_remove_secopt_v6(ip6h, MBLKL(mp));
1627 
1628 		ASSERT(adjust <= 0);
1629 		if (adjust != 0) {
1630 
1631 			/* adjust is negative */
1632 			ASSERT((mp->b_wptr + adjust) >= mp->b_rptr);
1633 			mp->b_wptr += adjust;
1634 
1635 			if (af == AF_INET) {
1636 				ipha = (ipha_t *)mp->b_rptr;
1637 				iplen = ntohs(ipha->ipha_length) + adjust;
1638 				ipha->ipha_length = htons(iplen);
1639 				ipha->ipha_hdr_checksum = 0;
1640 				ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1641 			}
1642 			DTRACE_PROBE3(tx__ip__log__info__forward__adjust,
1643 			    char *,
1644 			    "mp(1) adjusted(2) for CIPSO option removal",
1645 			    mblk_t *, mp, int, adjust);
1646 		}
1647 		goto keep_label;
1648 	}
1649 
1650 	ASSERT(label_type == OPT_NONE);
1651 	ASSERT(dst_rhtp != NULL);
1652 
1653 	/*
1654 	 * We need to add CIPSO option if the destination or the next hop
1655 	 * gateway is labeled.  Otherwise, pass the packet as is.
1656 	 */
1657 	if (dst_rhtp->tpc_tp.host_type == UNLABELED &&
1658 	    (!off_link || gw_rhtp->tpc_tp.host_type == UNLABELED))
1659 		goto keep_label;
1660 
1661 	if ((af == AF_INET &&
1662 	    tsol_check_label(DB_CRED(mp), &mp, &adjust, B_FALSE) != 0) ||
1663 	    (af == AF_INET6 &&
1664 	    tsol_check_label_v6(DB_CRED(mp), &mp, &adjust, B_FALSE) != 0)) {
1665 		mp = NULL;
1666 		goto keep_label;
1667 	}
1668 
1669 	ASSERT(adjust != -1);
1670 	if (adjust != 0) {
1671 		if (af == AF_INET) {
1672 			ipha = (ipha_t *)mp->b_rptr;
1673 			iplen = ntohs(ipha->ipha_length) + adjust;
1674 			ipha->ipha_length = htons(iplen);
1675 			ipha->ipha_hdr_checksum = 0;
1676 			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1677 		}
1678 
1679 		DTRACE_PROBE3(tx__ip__log__info__forward__adjust, char *,
1680 		    "mp(1) adjusted(2) for CIPSO option removal",
1681 		    mblk_t *, mp, int, adjust);
1682 	}
1683 
1684 keep_label:
1685 	TPC_RELE(dst_rhtp);
1686 	if (need_tpc_rele && gw_rhtp != NULL)
1687 		TPC_RELE(gw_rhtp);
1688 	return (mp);
1689 }
1690 
1691 /*
1692  * Name:	tsol_rtsa_init()
1693  *
1694  * Normal:	Sanity checks on the route security attributes provided by
1695  *		user.  Convert it into a route security parameter list to
1696  *		be returned to caller.
1697  *
1698  * Output:	EINVAL if bad security attributes in the routing message
1699  *		ENOMEM if unable to allocate data structures
1700  *		0 otherwise.
1701  *
1702  * Note:	On input, cp must point to the end of any addresses in
1703  *		the rt_msghdr_t structure.
1704  */
1705 int
1706 tsol_rtsa_init(rt_msghdr_t *rtm, tsol_rtsecattr_t *sp, caddr_t cp)
1707 {
1708 	uint_t	sacnt;
1709 	int	err;
1710 	caddr_t	lim;
1711 	tsol_rtsecattr_t *tp;
1712 
1713 	ASSERT((cp >= (caddr_t)&rtm[1]) && sp != NULL);
1714 
1715 	/*
1716 	 * In theory, we could accept as many security attributes configured
1717 	 * per route destination.  However, the current design is limited
1718 	 * such that at most only one set security attributes is allowed to
1719 	 * be associated with a prefix IRE.  We therefore assert for now.
1720 	 */
1721 	/* LINTED */
1722 	ASSERT(TSOL_RTSA_REQUEST_MAX == 1);
1723 
1724 	sp->rtsa_cnt = 0;
1725 	lim = (caddr_t)rtm + rtm->rtm_msglen;
1726 	ASSERT(cp <= lim);
1727 
1728 	if ((lim - cp) < sizeof (rtm_ext_t) ||
1729 	    ((rtm_ext_t *)cp)->rtmex_type != RTMEX_GATEWAY_SECATTR)
1730 		return (0);
1731 
1732 	if (((rtm_ext_t *)cp)->rtmex_len < sizeof (tsol_rtsecattr_t))
1733 		return (EINVAL);
1734 
1735 	cp += sizeof (rtm_ext_t);
1736 
1737 	if ((lim - cp) < sizeof (*tp) ||
1738 	    (tp = (tsol_rtsecattr_t *)cp, (sacnt = tp->rtsa_cnt) == 0) ||
1739 	    (lim - cp) < TSOL_RTSECATTR_SIZE(sacnt))
1740 		return (EINVAL);
1741 
1742 	/*
1743 	 * Trying to add route security attributes when system
1744 	 * labeling service is not available, or when user supllies
1745 	 * more than the maximum number of security attributes
1746 	 * allowed per request.
1747 	 */
1748 	if ((sacnt > 0 && !is_system_labeled()) ||
1749 	    sacnt > TSOL_RTSA_REQUEST_MAX)
1750 		return (EINVAL);
1751 
1752 	/* Ensure valid credentials */
1753 	if ((err = rtsa_validate(&((tsol_rtsecattr_t *)cp)->
1754 	    rtsa_attr[0])) != 0) {
1755 		cp += sizeof (*sp);
1756 		return (err);
1757 	}
1758 
1759 	bcopy(cp, sp, sizeof (*sp));
1760 	cp += sizeof (*sp);
1761 	return (0);
1762 }
1763 
1764 int
1765 tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc,
1766     tsol_gcgrp_t *gcgrp)
1767 {
1768 	tsol_ire_gw_secattr_t *attrp;
1769 	boolean_t exists = B_FALSE;
1770 	in_addr_t ga_addr4;
1771 	void *paddr = NULL;
1772 
1773 	ASSERT(ire != NULL);
1774 
1775 	/*
1776 	 * The only time that attrp can be NULL is when this routine is
1777 	 * called for the first time during the creation/initialization
1778 	 * of the corresponding IRE.  It will only get cleared when the
1779 	 * IRE is deleted.
1780 	 */
1781 	if ((attrp = ire->ire_gw_secattr) == NULL) {
1782 		attrp = ire_gw_secattr_alloc(KM_NOSLEEP);
1783 		if (attrp == NULL)
1784 			return (ENOMEM);
1785 		ire->ire_gw_secattr = attrp;
1786 	} else {
1787 		exists = B_TRUE;
1788 		mutex_enter(&attrp->igsa_lock);
1789 
1790 		if (attrp->igsa_rhc != NULL) {
1791 			TNRHC_RELE(attrp->igsa_rhc);
1792 			attrp->igsa_rhc = NULL;
1793 		}
1794 
1795 		if (attrp->igsa_gc != NULL)
1796 			GC_REFRELE(attrp->igsa_gc);
1797 		if (attrp->igsa_gcgrp != NULL)
1798 			GCGRP_REFRELE(attrp->igsa_gcgrp);
1799 	}
1800 	ASSERT(!exists || MUTEX_HELD(&attrp->igsa_lock));
1801 
1802 	/*
1803 	 * References already held by caller and we keep them;
1804 	 * note that both gc and gcgrp may be set to NULL to
1805 	 * clear out igsa_gc and igsa_gcgrp, respectively.
1806 	 */
1807 	attrp->igsa_gc = gc;
1808 	attrp->igsa_gcgrp = gcgrp;
1809 
1810 	if (gcgrp == NULL && gc != NULL) {
1811 		gcgrp = gc->gc_grp;
1812 		ASSERT(gcgrp != NULL);
1813 	}
1814 
1815 	/*
1816 	 * Intialize the template for gateway; we use the gateway's
1817 	 * address found in either the passed in gateway credential
1818 	 * or group pointer, or the ire_gateway_addr{_v6} field.
1819 	 */
1820 	if (gcgrp != NULL) {
1821 		tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;
1822 
1823 		/*
1824 		 * Caller is holding a reference, and that we don't
1825 		 * need to hold any lock to access the address.
1826 		 */
1827 		if (ipversion == IPV4_VERSION) {
1828 			ASSERT(ga->ga_af == AF_INET);
1829 			IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
1830 			paddr = &ga_addr4;
1831 		} else {
1832 			ASSERT(ga->ga_af == AF_INET6);
1833 			paddr = &ga->ga_addr;
1834 		}
1835 	} else if (ipversion == IPV6_VERSION &&
1836 	    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
1837 		paddr = &ire->ire_gateway_addr_v6;
1838 	} else if (ipversion == IPV4_VERSION &&
1839 	    ire->ire_gateway_addr != INADDR_ANY) {
1840 		paddr = &ire->ire_gateway_addr;
1841 	}
1842 
1843 	/*
1844 	 * Lookup the gateway template; note that we could get an internal
1845 	 * template here, which we cache anyway.  During IRE matching, we'll
1846 	 * try to update this gateway template cache and hopefully get a
1847 	 * real one.
1848 	 */
1849 	if (paddr != NULL) {
1850 		attrp->igsa_rhc = find_rhc(paddr, ipversion, B_FALSE);
1851 	}
1852 
1853 	if (exists)
1854 		mutex_exit(&attrp->igsa_lock);
1855 
1856 	return (0);
1857 }
1858 
1859 /*
1860  * This function figures the type of MLP that we'll be using based on the
1861  * address that the user is binding and the zone.  If the address is
1862  * unspecified, then we're looking at both private and shared.  If it's one
1863  * of the zone's private addresses, then it's private only.  If it's one
1864  * of the global addresses, then it's shared only.
1865  *
1866  * If we can't figure out what it is, then return mlptSingle.  That's actually
1867  * an error case.
1868  */
1869 mlp_type_t
1870 tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr)
1871 {
1872 	in_addr_t in4;
1873 	ire_t *ire;
1874 	ipif_t *ipif;
1875 	zoneid_t addrzone;
1876 
1877 	ASSERT(addr != NULL);
1878 
1879 	if (version == IPV6_VERSION &&
1880 	    IN6_IS_ADDR_V4MAPPED((const in6_addr_t *)addr)) {
1881 		IN6_V4MAPPED_TO_IPADDR((const in6_addr_t *)addr, in4);
1882 		addr = &in4;
1883 		version = IPV4_VERSION;
1884 	}
1885 
1886 	if (version == IPV4_VERSION) {
1887 		in4 = *(const in_addr_t *)addr;
1888 		if (in4 == INADDR_ANY)
1889 			return (mlptBoth);
1890 		ire = ire_cache_lookup(in4, zoneid, NULL);
1891 	} else {
1892 		if (IN6_IS_ADDR_UNSPECIFIED((const in6_addr_t *)addr))
1893 			return (mlptBoth);
1894 		ire = ire_cache_lookup_v6(addr, zoneid, NULL);
1895 	}
1896 	/*
1897 	 * If we can't find the IRE, then we have to behave exactly like
1898 	 * ip_bind_laddr{,_v6}.  That means looking up the IPIF so that users
1899 	 * can bind to addresses on "down" interfaces.
1900 	 *
1901 	 * If we can't find that either, then the bind is going to fail, so
1902 	 * just give up.  Note that there's a miniscule chance that the address
1903 	 * is in transition, but we don't bother handling that.
1904 	 */
1905 	if (ire == NULL) {
1906 		if (version == IPV4_VERSION)
1907 			ipif = ipif_lookup_addr(*(const in_addr_t *)addr, NULL,
1908 			    zoneid, NULL, NULL, NULL, NULL);
1909 		else
1910 			ipif = ipif_lookup_addr_v6((const in6_addr_t *)addr,
1911 			    NULL, zoneid, NULL, NULL, NULL, NULL);
1912 		if (ipif == NULL)
1913 			return (mlptSingle);
1914 		addrzone = ipif->ipif_zoneid;
1915 		ipif_refrele(ipif);
1916 	} else {
1917 		addrzone = ire->ire_zoneid;
1918 		ire_refrele(ire);
1919 	}
1920 	return (addrzone == ALL_ZONES ? mlptShared : mlptPrivate);
1921 }
1922 
1923 /*
1924  * Since we are configuring local interfaces, and we know trusted
1925  * extension CDE requires local interfaces to be cipso host type in
1926  * order to function correctly, we'll associate a cipso template
1927  * to each local interface and let the interface come up.  Configuring
1928  * a local interface to be "unlabeled" host type is a configuration error.
1929  * We'll override that error and make the interface host type to be cipso
1930  * here.
1931  *
1932  * The code is optimized for the usual "success" case and unwinds things on
1933  * error.  We don't want to go to the trouble and expense of formatting the
1934  * interface name for the usual case where everything is configured correctly.
1935  */
1936 boolean_t
1937 tsol_check_interface_address(const ipif_t *ipif)
1938 {
1939 	tsol_tpc_t *tp;
1940 	char addrbuf[INET6_ADDRSTRLEN];
1941 	int af;
1942 	const void *addr;
1943 	zone_t *zone;
1944 	ts_label_t *plabel;
1945 	const bslabel_t *label;
1946 	char ifbuf[LIFNAMSIZ + 10];
1947 	const char *ifname;
1948 	boolean_t retval;
1949 	tsol_rhent_t rhent;
1950 
1951 	if (IN6_IS_ADDR_V4MAPPED(&ipif->ipif_v6lcl_addr)) {
1952 		af = AF_INET;
1953 		addr = &V4_PART_OF_V6(ipif->ipif_v6lcl_addr);
1954 	} else {
1955 		af = AF_INET6;
1956 		addr = &ipif->ipif_v6lcl_addr;
1957 	}
1958 
1959 	tp = find_tpc(&ipif->ipif_v6lcl_addr, IPV6_VERSION, B_FALSE);
1960 	zone = ipif->ipif_zoneid == ALL_ZONES ? NULL :
1961 	    zone_find_by_id(ipif->ipif_zoneid);
1962 	if (zone != NULL) {
1963 		plabel = zone->zone_slabel;
1964 		ASSERT(plabel != NULL);
1965 		label = label2bslabel(plabel);
1966 	}
1967 
1968 	/*
1969 	 * If it's CIPSO and an all-zones address, then we're done.
1970 	 * If it's a CIPSO zone specific address, the zone's label
1971 	 * must be in the range or set specified in the template.
1972 	 * When the remote host entry is missing or the template
1973 	 * type is incorrect for this interface, we create a
1974 	 * CIPSO host entry in kernel and allow the interface to be
1975 	 * brought up as CIPSO type.
1976 	 */
1977 	if (tp != NULL && (
1978 	    /* The all-zones case */
1979 	    (tp->tpc_tp.host_type == SUN_CIPSO &&
1980 	    tp->tpc_tp.tp_doi == default_doi &&
1981 	    ipif->ipif_zoneid == ALL_ZONES) ||
1982 	    /* The local-zone case */
1983 	    (zone != NULL && plabel->tsl_doi == tp->tpc_tp.tp_doi &&
1984 	    ((tp->tpc_tp.host_type == SUN_CIPSO &&
1985 	    (_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) ||
1986 	    blinlset(label, tp->tpc_tp.tp_sl_set_cipso))))))) {
1987 		if (zone != NULL)
1988 			zone_rele(zone);
1989 		TPC_RELE(tp);
1990 		return (B_TRUE);
1991 	}
1992 
1993 	ifname = ipif->ipif_ill->ill_name;
1994 	if (ipif->ipif_id != 0) {
1995 		(void) snprintf(ifbuf, sizeof (ifbuf), "%s:%u", ifname,
1996 		    ipif->ipif_id);
1997 		ifname = ifbuf;
1998 	}
1999 	(void) inet_ntop(af, addr, addrbuf, sizeof (addrbuf));
2000 
2001 	if (tp == NULL) {
2002 		cmn_err(CE_NOTE, "template entry for %s missing. Default to "
2003 		    "CIPSO type for %s", ifname, addrbuf);
2004 		retval = B_TRUE;
2005 	} else if (tp->tpc_tp.host_type == UNLABELED) {
2006 		cmn_err(CE_NOTE, "template type for %s incorrectly configured. "
2007 		    "Change to CIPSO type for %s", ifname, addrbuf);
2008 		retval = B_TRUE;
2009 	} else if (ipif->ipif_zoneid == ALL_ZONES) {
2010 		if (tp->tpc_tp.host_type != SUN_CIPSO) {
2011 			cmn_err(CE_NOTE, "%s failed: %s isn't set to CIPSO for "
2012 			    "all-zones. Converted to CIPSO.", ifname, addrbuf);
2013 			retval = B_TRUE;
2014 		} else {
2015 			cmn_err(CE_NOTE, "%s failed: %s has wrong DOI %d "
2016 			    "instead of %d", ifname, addrbuf,
2017 			    tp->tpc_tp.tp_doi, default_doi);
2018 			retval = B_FALSE;
2019 		}
2020 	} else if (zone == NULL) {
2021 		cmn_err(CE_NOTE, "%s failed: zoneid %d unknown",
2022 		    ifname, ipif->ipif_zoneid);
2023 		retval = B_FALSE;
2024 	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
2025 		cmn_err(CE_NOTE, "%s failed: zone %s has DOI %d but %s has "
2026 		    "DOI %d", ifname, zone->zone_name, plabel->tsl_doi,
2027 		    addrbuf, tp->tpc_tp.tp_doi);
2028 		retval = B_FALSE;
2029 	} else {
2030 		cmn_err(CE_NOTE, "%s failed: zone %s label incompatible with "
2031 		    "%s", ifname, zone->zone_name, addrbuf);
2032 		tsol_print_label(label, "zone label");
2033 		retval = B_FALSE;
2034 	}
2035 
2036 	if (zone != NULL)
2037 		zone_rele(zone);
2038 	if (tp != NULL)
2039 		TPC_RELE(tp);
2040 	if (retval) {
2041 		/*
2042 		 * we've corrected a config error and let the interface
2043 		 * come up as cipso. Need to insert an rhent.
2044 		 */
2045 		if ((rhent.rh_address.ta_family = af) == AF_INET) {
2046 			rhent.rh_prefix = 32;
2047 			rhent.rh_address.ta_addr_v4 = *(struct in_addr *)addr;
2048 		} else {
2049 			rhent.rh_prefix = 128;
2050 			rhent.rh_address.ta_addr_v6 = *(in6_addr_t *)addr;
2051 		}
2052 		(void) strcpy(rhent.rh_template, "cipso");
2053 		if (tnrh_load(&rhent) != 0) {
2054 			cmn_err(CE_NOTE, "%s failed: Cannot insert CIPSO "
2055 			    "template for local addr %s", ifname, addrbuf);
2056 			retval = B_FALSE;
2057 		}
2058 	}
2059 	return (retval);
2060 }
2061