xref: /titanic_41/usr/src/uts/common/inet/ip/tnet.c (revision 98c507c4288789fc67365c4cb51f80eb641e7182)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/strsubr.h>
29 #include <sys/stropts.h>
30 #include <sys/sunddi.h>
31 #include <sys/cred.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/errno.h>
35 #include <sys/disp.h>
36 #include <netinet/in.h>
37 #include <netinet/in_systm.h>
38 #include <netinet/ip.h>
39 #include <netinet/ip_icmp.h>
40 #include <netinet/tcp.h>
41 #include <inet/common.h>
42 #include <inet/ipclassifier.h>
43 #include <inet/ip.h>
44 #include <inet/mib2.h>
45 #include <inet/nd.h>
46 #include <inet/tcp.h>
47 #include <inet/ip_rts.h>
48 #include <inet/ip_ire.h>
49 #include <inet/ip_if.h>
50 #include <sys/modhash.h>
51 
52 #include <sys/tsol/label.h>
53 #include <sys/tsol/label_macro.h>
54 #include <sys/tsol/tnet.h>
55 #include <sys/tsol/tndb.h>
56 #include <sys/strsun.h>
57 
58 /* tunable for strict error-reply behavior (TCP RST and ICMP Unreachable) */
59 int tsol_strict_error;
60 
61 /*
62  * Some notes on the Trusted Solaris IRE gateway security attributes:
63  *
64  * When running in Trusted mode, the routing subsystem determines whether or
65  * not a packet can be delivered to an off-link host (not directly reachable
66  * through an interface) based on the accreditation checks of the packet's
67  * security attributes against those associated with the next-hop gateway.
68  *
69  * The next-hop gateway's security attributes can be derived from two sources
70  * (in order of preference): route-related and the host database.  A Trusted
71  * system must be configured with at least the host database containing an
72  * entry for the next-hop gateway, or otherwise no accreditation checks can
73  * be performed, which may result in the inability to send packets to any
74  * off-link destination host.
75  *
76  * The major differences between the two sources are the number and type of
77  * security attributes used for accreditation checks.  A host database entry
78  * can contain at most one set of security attributes, specific only to the
79  * next-hop gateway.  In contrast, route-related security attributes are made
80  * up of a collection of security attributes for the distant networks, and
81  * are grouped together per next-hop gateway used to reach those networks.
82  * This is the preferred method, and the routing subsystem will fallback to
83  * the host database entry only if there are no route-related attributes
84  * associated with the next-hop gateway.
85  *
86  * In Trusted mode, all of the IRE entries (except LOCAL/LOOPBACK/BROADCAST/
87  * INTERFACE type) are initialized to contain a placeholder to store this
88  * information.  The ire_gw_secattr structure gets allocated, initialized
89  * and associated with the IRE during the time of the IRE creation.  The
90  * initialization process also includes resolving the host database entry
91  * of the next-hop gateway for fallback purposes.  It does not include any
92  * route-related attribute setup, as that process comes separately as part
93  * of the route requests (add/change) made to the routing subsystem.
94  *
95  * The underlying logic which involves associating IREs with the gateway
96  * security attributes is represented by the following data structures:
97  *
98  * tsol_gcdb_t, or "gcdb"
99  *
100  *	- This is a system-wide collection of records containing the
101  *	  currently used route-related security attributes, which are fed
102  *	  through the routing socket interface, e.g. "route add/change".
103  *
104  * tsol_gc_t, or "gc"
105  *
106  *	- This is the gateway credential structure, and it provides the
107  *	  only mechanism to access the contents of gcdb.  More than one gc
108  *	  entry may refer to the same gcdb record.  gc's in the system are
109  *	  grouped according to the next-hop gateway address.
110  *
111  * tsol_gcgrp_t, or "gcgrp"
112  *
113  *	- Group of gateway credentials, and is unique per next-hop gateway
114  *	  address.  When the group is not empty, i.e. when gcgrp_count is
115  *	  greater than zero, it contains one or more gc's, each pointing to
116  *	  a gcdb record which indicates the gateway security attributes
117  *	  associated with the next-hop gateway.
118  *
119  * The fields of the tsol_ire_gw_secattr_t used from within the IRE are:
120  *
121  * igsa_lock
122  *
123  *	- Lock that protects all fields within tsol_ire_gw_secattr_t.
124  *
125  * igsa_rhc
126  *
127  *	- Remote host cache database entry of next-hop gateway.  This is
128  *	  used in the case when there are no route-related attributes
129  *	  configured for the IRE.
130  *
131  * igsa_gc
132  *
133  *	- A set of route-related attributes that only get set for prefix
134  *	  IREs.  If this is non-NULL, the prefix IRE has been associated
135  *	  with a set of gateway security attributes by way of route add/
136  *	  change functionality.  This field stays NULL for IRE_CACHEs.
137  *
138  * igsa_gcgrp
139  *
140  *	- Group of gc's which only gets set for IRE_CACHEs.  Each of the gc
141  *	  points to a gcdb record that contains the security attributes
142  *	  used to perform the credential checks of the packet which uses
143  *	  the IRE.  If the group is not empty, the list of gc's can be
144  *	  traversed starting at gcgrp_head.  This field stays NULL for
145  *	  prefix IREs.
146  */
147 
148 static kmem_cache_t *ire_gw_secattr_cache;
149 
150 #define	GCDB_HASH_SIZE	101
151 #define	GCGRP_HASH_SIZE	101
152 
/*
 * Drop one reference on the gcdb record `p'.  gcdb_lock is taken for the
 * duration; when the count reaches zero the record is handed to
 * gcdb_inactive(), which removes it from gcdb_hash and frees it, so `p'
 * must not be used by the caller afterwards.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (safe in an unbraced if/else), and `p' is evaluated
 * exactly once.
 */
#define	GCDB_REFRELE(p) do {					\
	tsol_gcdb_t *gcdb_rr_ = (p);				\
	mutex_enter(&gcdb_lock);				\
	ASSERT(gcdb_rr_->gcdb_refcnt > 0);			\
	if (--(gcdb_rr_->gcdb_refcnt) == 0)			\
		gcdb_inactive(gcdb_rr_);			\
	ASSERT(MUTEX_HELD(&gcdb_lock));				\
	mutex_exit(&gcdb_lock);					\
	/* CONSTCOND */						\
} while (0)
161 
/* Bucket counts handed to mod_hash_create_extended() in tnet_init(). */
static int gcdb_hash_size = GCDB_HASH_SIZE;
static int gcgrp_hash_size = GCGRP_HASH_SIZE;
/* gcdb records, keyed by their security attributes (struct rtsa_s). */
static mod_hash_t *gcdb_hash;
/* gcgrp records keyed by gateway address; one table per address family. */
static mod_hash_t *gcgrp4_hash;
static mod_hash_t *gcgrp6_hash;

/* Held around gcdb_hash lookups/updates and gcdb_refcnt changes. */
static kmutex_t gcdb_lock;
/*
 * Held around gcgrp4_hash/gcgrp6_hash lookups/updates and the gcgrp_refcnt
 * increment in gcgrp_lookup(); gcgrp_inactive() asserts that it is held.
 */
kmutex_t gcgrp_lock;
170 
171 static uint_t gcdb_hash_by_secattr(void *, mod_hash_key_t);
172 static int gcdb_hash_cmp(mod_hash_key_t, mod_hash_key_t);
173 static tsol_gcdb_t *gcdb_lookup(struct rtsa_s *, boolean_t);
174 static void gcdb_inactive(tsol_gcdb_t *);
175 
176 static uint_t gcgrp_hash_by_addr(void *, mod_hash_key_t);
177 static int gcgrp_hash_cmp(mod_hash_key_t, mod_hash_key_t);
178 
179 static int ire_gw_secattr_constructor(void *, void *, int);
180 static void ire_gw_secattr_destructor(void *, void *);
181 
182 void
183 tnet_init(void)
184 {
185 	ire_gw_secattr_cache = kmem_cache_create("ire_gw_secattr_cache",
186 	    sizeof (tsol_ire_gw_secattr_t), 64, ire_gw_secattr_constructor,
187 	    ire_gw_secattr_destructor, NULL, NULL, NULL, 0);
188 
189 	gcdb_hash = mod_hash_create_extended("gcdb_hash",
190 	    gcdb_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
191 	    gcdb_hash_by_secattr, NULL, gcdb_hash_cmp, KM_SLEEP);
192 
193 	gcgrp4_hash = mod_hash_create_extended("gcgrp4_hash",
194 	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
195 	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
196 
197 	gcgrp6_hash = mod_hash_create_extended("gcgrp6_hash",
198 	    gcgrp_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
199 	    gcgrp_hash_by_addr, NULL, gcgrp_hash_cmp, KM_SLEEP);
200 
201 	mutex_init(&gcdb_lock, NULL, MUTEX_DEFAULT, NULL);
202 	mutex_init(&gcgrp_lock, NULL, MUTEX_DEFAULT, NULL);
203 }
204 
/*
 * Undo tnet_init(): destroy the gateway security attribute cache, the
 * gcdb and gcgrp hash tables, and the two global locks.
 */
void
tnet_fini(void)
{
	kmem_cache_destroy(ire_gw_secattr_cache);
	mod_hash_destroy_hash(gcdb_hash);
	mod_hash_destroy_hash(gcgrp4_hash);
	mod_hash_destroy_hash(gcgrp6_hash);
	/* The locks go last, after the tables they guard. */
	mutex_destroy(&gcdb_lock);
	mutex_destroy(&gcgrp_lock);
}
215 
216 /* ARGSUSED */
217 static int
218 ire_gw_secattr_constructor(void *buf, void *cdrarg, int kmflags)
219 {
220 	tsol_ire_gw_secattr_t *attrp = buf;
221 
222 	mutex_init(&attrp->igsa_lock, NULL, MUTEX_DEFAULT, NULL);
223 
224 	attrp->igsa_rhc = NULL;
225 	attrp->igsa_gc = NULL;
226 	attrp->igsa_gcgrp = NULL;
227 
228 	return (0);
229 }
230 
231 /* ARGSUSED */
232 static void
233 ire_gw_secattr_destructor(void *buf, void *cdrarg)
234 {
235 	tsol_ire_gw_secattr_t *attrp = (tsol_ire_gw_secattr_t *)buf;
236 
237 	mutex_destroy(&attrp->igsa_lock);
238 }
239 
240 tsol_ire_gw_secattr_t *
241 ire_gw_secattr_alloc(int kmflags)
242 {
243 	return (kmem_cache_alloc(ire_gw_secattr_cache, kmflags));
244 }
245 
246 void
247 ire_gw_secattr_free(tsol_ire_gw_secattr_t *attrp)
248 {
249 	ASSERT(MUTEX_NOT_HELD(&attrp->igsa_lock));
250 
251 	if (attrp->igsa_rhc != NULL) {
252 		TNRHC_RELE(attrp->igsa_rhc);
253 		attrp->igsa_rhc = NULL;
254 	}
255 
256 	if (attrp->igsa_gc != NULL) {
257 		GC_REFRELE(attrp->igsa_gc);
258 		attrp->igsa_gc = NULL;
259 	}
260 	if (attrp->igsa_gcgrp != NULL) {
261 		GCGRP_REFRELE(attrp->igsa_gcgrp);
262 		attrp->igsa_gcgrp = NULL;
263 	}
264 
265 	ASSERT(attrp->igsa_rhc == NULL);
266 	ASSERT(attrp->igsa_gc == NULL);
267 	ASSERT(attrp->igsa_gcgrp == NULL);
268 
269 	kmem_cache_free(ire_gw_secattr_cache, attrp);
270 }
271 
272 /* ARGSUSED */
273 static uint_t
274 gcdb_hash_by_secattr(void *hash_data, mod_hash_key_t key)
275 {
276 	const struct rtsa_s *rp = (struct rtsa_s *)key;
277 	const uint32_t *up, *ue;
278 	uint_t hash;
279 	int i;
280 
281 	ASSERT(rp != NULL);
282 
283 	/* See comments in hash_bylabel in zone.c for details */
284 	hash = rp->rtsa_doi + (rp->rtsa_doi << 1);
285 	up = (const uint32_t *)&rp->rtsa_slrange;
286 	ue = up + sizeof (rp->rtsa_slrange) / sizeof (*up);
287 	i = 1;
288 	while (up < ue) {
289 		/* using 2^n + 1, 1 <= n <= 16 as source of many primes */
290 		hash += *up + (*up << ((i % 16) + 1));
291 		up++;
292 		i++;
293 	}
294 	return (hash);
295 }
296 
297 static int
298 gcdb_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
299 {
300 	struct rtsa_s *rp1 = (struct rtsa_s *)key1;
301 	struct rtsa_s *rp2 = (struct rtsa_s *)key2;
302 
303 	ASSERT(rp1 != NULL && rp2 != NULL);
304 
305 	if (blequal(&rp1->rtsa_slrange.lower_bound,
306 	    &rp2->rtsa_slrange.lower_bound) &&
307 	    blequal(&rp1->rtsa_slrange.upper_bound,
308 	    &rp2->rtsa_slrange.upper_bound) &&
309 	    rp1->rtsa_doi == rp2->rtsa_doi)
310 		return (0);
311 
312 	/* No match; not found */
313 	return (-1);
314 }
315 
316 /* ARGSUSED */
317 static uint_t
318 gcgrp_hash_by_addr(void *hash_data, mod_hash_key_t key)
319 {
320 	tsol_gcgrp_addr_t *ga = (tsol_gcgrp_addr_t *)key;
321 	uint_t		idx = 0;
322 	uint32_t	*ap;
323 
324 	ASSERT(ga != NULL);
325 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
326 
327 	ap = (uint32_t *)&ga->ga_addr.s6_addr32[0];
328 	idx ^= *ap++;
329 	idx ^= *ap++;
330 	idx ^= *ap++;
331 	idx ^= *ap;
332 
333 	return (idx);
334 }
335 
336 static int
337 gcgrp_hash_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
338 {
339 	tsol_gcgrp_addr_t *ga1 = (tsol_gcgrp_addr_t *)key1;
340 	tsol_gcgrp_addr_t *ga2 = (tsol_gcgrp_addr_t *)key2;
341 
342 	ASSERT(ga1 != NULL && ga2 != NULL);
343 
344 	/* Address family must match */
345 	if (ga1->ga_af != ga2->ga_af)
346 		return (-1);
347 
348 	if (ga1->ga_addr.s6_addr32[0] == ga2->ga_addr.s6_addr32[0] &&
349 	    ga1->ga_addr.s6_addr32[1] == ga2->ga_addr.s6_addr32[1] &&
350 	    ga1->ga_addr.s6_addr32[2] == ga2->ga_addr.s6_addr32[2] &&
351 	    ga1->ga_addr.s6_addr32[3] == ga2->ga_addr.s6_addr32[3])
352 		return (0);
353 
354 	/* No match; not found */
355 	return (-1);
356 }
357 
358 #define	RTSAFLAGS	"\20\11cipso\3doi\2max_sl\1min_sl"
359 
360 int
361 rtsa_validate(const struct rtsa_s *rp)
362 {
363 	uint32_t mask = rp->rtsa_mask;
364 
365 	/* RTSA_CIPSO must be set, and DOI must not be zero */
366 	if ((mask & RTSA_CIPSO) == 0 || rp->rtsa_doi == 0) {
367 		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
368 		    "rtsa(1) lacks flag or has 0 doi.",
369 		    rtsa_s *, rp);
370 		return (EINVAL);
371 	}
372 	/*
373 	 * SL range must be specified, and it must have its
374 	 * upper bound dominating its lower bound.
375 	 */
376 	if ((mask & RTSA_SLRANGE) != RTSA_SLRANGE ||
377 	    !bldominates(&rp->rtsa_slrange.upper_bound,
378 	    &rp->rtsa_slrange.lower_bound)) {
379 		DTRACE_PROBE2(tx__gcdb__log__error__rtsa__validate, char *,
380 		    "rtsa(1) min_sl and max_sl not set or max_sl is "
381 		    "not dominating.", rtsa_s *, rp);
382 		return (EINVAL);
383 	}
384 	return (0);
385 }
386 
387 /*
388  * A brief explanation of the reference counting scheme:
389  *
390  * Prefix IREs have a non-NULL igsa_gc and a NULL igsa_gcgrp;
391  * IRE_CACHEs have it vice-versa.
392  *
393  * Apart from dynamic references due to reference holds done
394  * actively by threads, we have the following references:
395  *
396  * gcdb_refcnt:
397  *	- Every tsol_gc_t pointing to a tsol_gcdb_t contributes a reference
398  *	  to the gcdb_refcnt.
399  *
400  * gc_refcnt:
401  *	- A prefix IRE that points to an igsa_gc contributes a reference
402  *	  to the gc_refcnt.
403  *
404  * gcgrp_refcnt:
405  *	- An IRE_CACHE that points to an igsa_gcgrp contributes a reference
406  *	  to the gcgrp_refcnt of the associated tsol_gcgrp_t.
407  *	- Every tsol_gc_t in the chain headed by tsol_gcgrp_t contributes
408  *	  a reference to the gcgrp_refcnt.
409  */
410 static tsol_gcdb_t *
411 gcdb_lookup(struct rtsa_s *rp, boolean_t alloc)
412 {
413 	tsol_gcdb_t *gcdb = NULL;
414 
415 	if (rtsa_validate(rp) != 0)
416 		return (NULL);
417 
418 	mutex_enter(&gcdb_lock);
419 	/* Find a copy in the cache; otherwise, create one and cache it */
420 	if (mod_hash_find(gcdb_hash, (mod_hash_key_t)rp,
421 	    (mod_hash_val_t *)&gcdb) == 0) {
422 		gcdb->gcdb_refcnt++;
423 		ASSERT(gcdb->gcdb_refcnt != 0);
424 
425 		DTRACE_PROBE2(tx__gcdb__log__info__gcdb__lookup, char *,
426 		    "gcdb(1) is in gcdb_hash(global)", tsol_gcdb_t *, gcdb);
427 	} else if (alloc) {
428 		gcdb = kmem_zalloc(sizeof (*gcdb), KM_NOSLEEP);
429 		if (gcdb != NULL) {
430 			gcdb->gcdb_refcnt = 1;
431 			gcdb->gcdb_mask = rp->rtsa_mask;
432 			gcdb->gcdb_doi = rp->rtsa_doi;
433 			gcdb->gcdb_slrange = rp->rtsa_slrange;
434 
435 			if (mod_hash_insert(gcdb_hash,
436 			    (mod_hash_key_t)&gcdb->gcdb_attr,
437 			    (mod_hash_val_t)gcdb) != 0) {
438 				mutex_exit(&gcdb_lock);
439 				kmem_free(gcdb, sizeof (*gcdb));
440 				return (NULL);
441 			}
442 
443 			DTRACE_PROBE2(tx__gcdb__log__info__gcdb__insert, char *,
444 			    "gcdb(1) inserted in gcdb_hash(global)",
445 			    tsol_gcdb_t *, gcdb);
446 		}
447 	}
448 	mutex_exit(&gcdb_lock);
449 	return (gcdb);
450 }
451 
452 static void
453 gcdb_inactive(tsol_gcdb_t *gcdb)
454 {
455 	ASSERT(MUTEX_HELD(&gcdb_lock));
456 	ASSERT(gcdb != NULL && gcdb->gcdb_refcnt == 0);
457 
458 	(void) mod_hash_remove(gcdb_hash, (mod_hash_key_t)&gcdb->gcdb_attr,
459 	    (mod_hash_val_t *)&gcdb);
460 
461 	DTRACE_PROBE2(tx__gcdb__log__info__gcdb__remove, char *,
462 	    "gcdb(1) removed from gcdb_hash(global)",
463 	    tsol_gcdb_t *, gcdb);
464 	kmem_free(gcdb, sizeof (*gcdb));
465 }
466 
467 tsol_gc_t *
468 gc_create(struct rtsa_s *rp, tsol_gcgrp_t *gcgrp, boolean_t *gcgrp_xtrarefp)
469 {
470 	tsol_gc_t *gc;
471 	tsol_gcdb_t *gcdb;
472 
473 	*gcgrp_xtrarefp = B_TRUE;
474 
475 	rw_enter(&gcgrp->gcgrp_rwlock, RW_WRITER);
476 	if ((gcdb = gcdb_lookup(rp, B_TRUE)) == NULL) {
477 		rw_exit(&gcgrp->gcgrp_rwlock);
478 		return (NULL);
479 	}
480 
481 	for (gc = gcgrp->gcgrp_head; gc != NULL; gc = gc->gc_next) {
482 		if (gc->gc_db == gcdb) {
483 			ASSERT(gc->gc_grp == gcgrp);
484 
485 			gc->gc_refcnt++;
486 			ASSERT(gc->gc_refcnt != 0);
487 
488 			GCDB_REFRELE(gcdb);
489 
490 			DTRACE_PROBE3(tx__gcdb__log__info__gc__create,
491 			    char *, "found gc(1) in gcgrp(2)",
492 			    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
493 			rw_exit(&gcgrp->gcgrp_rwlock);
494 			return (gc);
495 		}
496 	}
497 
498 	gc = kmem_zalloc(sizeof (*gc), KM_NOSLEEP);
499 	if (gc != NULL) {
500 		if (gcgrp->gcgrp_head == NULL) {
501 			gcgrp->gcgrp_head = gcgrp->gcgrp_tail = gc;
502 		} else {
503 			gcgrp->gcgrp_tail->gc_next = gc;
504 			gc->gc_prev = gcgrp->gcgrp_tail;
505 			gcgrp->gcgrp_tail = gc;
506 		}
507 		gcgrp->gcgrp_count++;
508 		ASSERT(gcgrp->gcgrp_count != 0);
509 
510 		/* caller has incremented gcgrp reference for us */
511 		gc->gc_grp = gcgrp;
512 
513 		gc->gc_db = gcdb;
514 		gc->gc_refcnt = 1;
515 
516 		DTRACE_PROBE3(tx__gcdb__log__info__gc__create, char *,
517 		    "added gc(1) to gcgrp(2)", tsol_gc_t *, gc,
518 		    tsol_gcgrp_t *, gcgrp);
519 
520 		*gcgrp_xtrarefp = B_FALSE;
521 	}
522 	rw_exit(&gcgrp->gcgrp_rwlock);
523 
524 	return (gc);
525 }
526 
527 void
528 gc_inactive(tsol_gc_t *gc)
529 {
530 	tsol_gcgrp_t *gcgrp = gc->gc_grp;
531 
532 	ASSERT(gcgrp != NULL);
533 	ASSERT(RW_WRITE_HELD(&gcgrp->gcgrp_rwlock));
534 	ASSERT(gc->gc_refcnt == 0);
535 
536 	if (gc->gc_prev != NULL)
537 		gc->gc_prev->gc_next = gc->gc_next;
538 	else
539 		gcgrp->gcgrp_head = gc->gc_next;
540 	if (gc->gc_next != NULL)
541 		gc->gc_next->gc_prev = gc->gc_prev;
542 	else
543 		gcgrp->gcgrp_tail = gc->gc_prev;
544 	ASSERT(gcgrp->gcgrp_count > 0);
545 	gcgrp->gcgrp_count--;
546 
547 	/* drop lock before it's destroyed */
548 	rw_exit(&gcgrp->gcgrp_rwlock);
549 
550 	DTRACE_PROBE3(tx__gcdb__log__info__gc__remove, char *,
551 	    "removed inactive gc(1) from gcgrp(2)",
552 	    tsol_gc_t *, gc, tsol_gcgrp_t *, gcgrp);
553 
554 	GCGRP_REFRELE(gcgrp);
555 
556 	gc->gc_grp = NULL;
557 	gc->gc_prev = gc->gc_next = NULL;
558 
559 	if (gc->gc_db != NULL)
560 		GCDB_REFRELE(gc->gc_db);
561 
562 	kmem_free(gc, sizeof (*gc));
563 }
564 
565 tsol_gcgrp_t *
566 gcgrp_lookup(tsol_gcgrp_addr_t *ga, boolean_t alloc)
567 {
568 	tsol_gcgrp_t *gcgrp = NULL;
569 	mod_hash_t *hashp;
570 
571 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
572 
573 	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
574 
575 	mutex_enter(&gcgrp_lock);
576 	if (mod_hash_find(hashp, (mod_hash_key_t)ga,
577 	    (mod_hash_val_t *)&gcgrp) == 0) {
578 		gcgrp->gcgrp_refcnt++;
579 		ASSERT(gcgrp->gcgrp_refcnt != 0);
580 
581 		DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__lookup, char *,
582 		    "found gcgrp(1) in hash(2)", tsol_gcgrp_t *, gcgrp,
583 		    mod_hash_t *, hashp);
584 
585 	} else if (alloc) {
586 		gcgrp = kmem_zalloc(sizeof (*gcgrp), KM_NOSLEEP);
587 		if (gcgrp != NULL) {
588 			gcgrp->gcgrp_refcnt = 1;
589 			rw_init(&gcgrp->gcgrp_rwlock, NULL, RW_DEFAULT, NULL);
590 			bcopy(ga, &gcgrp->gcgrp_addr, sizeof (*ga));
591 
592 			if (mod_hash_insert(hashp,
593 			    (mod_hash_key_t)&gcgrp->gcgrp_addr,
594 			    (mod_hash_val_t)gcgrp) != 0) {
595 				mutex_exit(&gcgrp_lock);
596 				kmem_free(gcgrp, sizeof (*gcgrp));
597 				return (NULL);
598 			}
599 
600 			DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__insert,
601 			    char *, "inserted gcgrp(1) in hash(2)",
602 			    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
603 		}
604 	}
605 	mutex_exit(&gcgrp_lock);
606 	return (gcgrp);
607 }
608 
609 void
610 gcgrp_inactive(tsol_gcgrp_t *gcgrp)
611 {
612 	tsol_gcgrp_addr_t *ga;
613 	mod_hash_t *hashp;
614 
615 	ASSERT(MUTEX_HELD(&gcgrp_lock));
616 	ASSERT(!RW_LOCK_HELD(&gcgrp->gcgrp_rwlock));
617 	ASSERT(gcgrp != NULL && gcgrp->gcgrp_refcnt == 0);
618 	ASSERT(gcgrp->gcgrp_head == NULL && gcgrp->gcgrp_count == 0);
619 
620 	ga = &gcgrp->gcgrp_addr;
621 	ASSERT(ga->ga_af == AF_INET || ga->ga_af == AF_INET6);
622 
623 	hashp = (ga->ga_af == AF_INET) ? gcgrp4_hash : gcgrp6_hash;
624 	(void) mod_hash_remove(hashp, (mod_hash_key_t)ga,
625 	    (mod_hash_val_t *)&gcgrp);
626 	rw_destroy(&gcgrp->gcgrp_rwlock);
627 
628 	DTRACE_PROBE3(tx__gcdb__log__info__gcgrp__remove, char *,
629 	    "removed inactive gcgrp(1) from hash(2)",
630 	    tsol_gcgrp_t *, gcgrp, mod_hash_t *, hashp);
631 
632 	kmem_free(gcgrp, sizeof (*gcgrp));
633 }
634 
635 
636 /*
637  * Assign a sensitivity label to inbound traffic which arrived without
638  * an explicit on-the-wire label.
639  *
640  * In the case of CIPSO-type hosts, we assume packets arriving without
641  * a label are at the most sensitive label known for the host, most
642  * likely involving out-of-band key management traffic (such as IKE,
643  * etc.,)
644  */
645 static boolean_t
646 tsol_find_unlabeled_label(tsol_tpc_t *rhtp, bslabel_t *sl, uint32_t *doi)
647 {
648 	*doi = rhtp->tpc_tp.tp_doi;
649 	switch (rhtp->tpc_tp.host_type) {
650 	case UNLABELED:
651 		*sl = rhtp->tpc_tp.tp_def_label;
652 		break;
653 	case SUN_CIPSO:
654 		*sl = rhtp->tpc_tp.tp_sl_range_cipso.upper_bound;
655 		break;
656 	default:
657 		return (B_FALSE);
658 	}
659 	setbltype(sl, SUN_SL_ID);
660 	return (B_TRUE);
661 }
662 
663 /*
664  * Converts CIPSO option to sensitivity label.
665  * Validity checks based on restrictions defined in
666  * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2) (draft-ietf-cipso-ipsecurity)
667  */
668 static boolean_t
669 cipso_to_sl(const uchar_t *option, bslabel_t *sl)
670 {
671 	const struct cipso_option *co = (const struct cipso_option *)option;
672 	const struct cipso_tag_type_1 *tt1;
673 
674 	tt1 = (struct cipso_tag_type_1 *)&co->cipso_tag_type[0];
675 	if (tt1->tag_type != 1 ||
676 	    tt1->tag_length < TSOL_TT1_MIN_LENGTH ||
677 	    tt1->tag_length > TSOL_TT1_MAX_LENGTH ||
678 	    tt1->tag_length + TSOL_CIPSO_TAG_OFFSET > co->cipso_length)
679 		return (B_FALSE);
680 
681 	bsllow(sl);	/* assumed: sets compartments to all zeroes */
682 	LCLASS_SET((_bslabel_impl_t *)sl, tt1->tag_sl);
683 	bcopy(tt1->tag_cat, &((_bslabel_impl_t *)sl)->compartments,
684 	    tt1->tag_length - TSOL_TT1_MIN_LENGTH);
685 	return (B_TRUE);
686 }
687 
/*
 * If present, parse a CIPSO label in the incoming packet and
 * construct a ts_label_t that reflects the CIPSO label and attach it
 * to the dblk cred.  Later as the mblk flows up through the stack any
 * code that needs to examine the packet label can inspect the label
 * from the dblk cred. This function is called right in ip_rput for
 * all packets, i.e. locally destined and to be forwarded packets. The
 * forwarding path needs to examine the label to determine how to
 * forward the packet.
 *
 * This routine pulls all message text up into the first mblk.
 * For IPv4, only the first 20 bytes of the IP header are guaranteed
 * to exist. For IPv6, only the IPv6 header is guaranteed to exist.
 *
 * mp		M_DATA message holding the packet; its cred is replaced
 * version	IPV4_VERSION selects IPv4 parsing; any other value is
 *		handled as IPv6
 *
 * Returns B_TRUE when a label-bearing cred was attached to mp, or when
 * the packet is one of the unlabeled special cases that are let through
 * without a label (see the OPT_NONE case).  Returns B_FALSE when the
 * pullup fails, the label option is missing or malformed, the source has
 * no usable template, or the cred cannot be allocated.
 */
boolean_t
tsol_get_pkt_label(mblk_t *mp, int version)
{
	tsol_tpc_t	*src_rhtp = NULL;
	uchar_t		*opt_ptr = NULL;
	const ipha_t	*ipha;		/* set and used for IPv4 only */
	bslabel_t	sl;
	uint32_t	doi;
	tsol_ip_label_t	label_type;
	uint32_t	label_flags = 0; /* flags to set in label */
	const cipso_option_t *co;
	const void	*src;
	const ip6_t	*ip6h;		/* set and used for IPv6 only */
	cred_t		*credp;
	pid_t		cpid;
	int 		proto;

	ASSERT(DB_TYPE(mp) == M_DATA);

	/* Make the whole packet contiguous so headers can be read directly */
	if (mp->b_cont != NULL && !pullupmsg(mp, -1))
		return (B_FALSE);

	/* Locate the source address and the label option, if any */
	if (version == IPV4_VERSION) {
		ASSERT(MBLKL(mp) >= IP_SIMPLE_HDR_LENGTH);
		ipha = (const ipha_t *)mp->b_rptr;
		src = &ipha->ipha_src;
		if (!tsol_get_option_v4(mp, &label_type, &opt_ptr))
			return (B_FALSE);
	} else {
		ASSERT(MBLKL(mp) >= IPV6_HDR_LEN);
		ip6h = (const ip6_t *)mp->b_rptr;
		src = &ip6h->ip6_src;
		if (!tsol_get_option_v6(mp, &label_type, &opt_ptr))
			return (B_FALSE);
	}

	switch (label_type) {
	case OPT_CIPSO:
		/*
		 * Convert the CIPSO label to the internal format
		 * and attach it to the dblk cred.
		 * Validity checks based on restrictions defined in
		 * COMMERCIAL IP SECURITY OPTION (CIPSO 2.2)
		 * (draft-ietf-cipso-ipsecurity)
		 */
		if (version == IPV6_VERSION && ip6opt_ls == 0)
			return (B_FALSE);
		co = (const struct cipso_option *)opt_ptr;
		/* The option must be able to hold at least a minimal tag */
		if ((co->cipso_length <
		    TSOL_CIPSO_TAG_OFFSET + TSOL_TT1_MIN_LENGTH) ||
		    (co->cipso_length > IP_MAX_OPT_LENGTH))
			return (B_FALSE);
		/* bcopy: the DOI field is copied rather than read in place */
		bcopy(co->cipso_doi, &doi, sizeof (doi));
		doi = ntohl(doi);
		if (!cipso_to_sl(opt_ptr, &sl))
			return (B_FALSE);
		setbltype(&sl, SUN_SL_ID);

		/*
		 * If the source was unlabeled, then flag as such,
		 * (since CIPSO routers may add headers)
		 */

		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
			return (B_FALSE);

		if (src_rhtp->tpc_tp.host_type == UNLABELED)
			label_flags = TSLF_UNLABELED;

		TPC_RELE(src_rhtp);

		break;

	case OPT_NONE:
		/*
		 * Handle special cases that may not be labeled, even
		 * though the sending system may otherwise be configured as
		 * labeled.
		 *	- IGMP
		 *	- IPv4 ICMP Router Discovery
		 *	- IPv6 Neighbor Discovery
		 *	- IPsec ESP
		 *
		 * These return B_TRUE without attaching any cred to mp.
		 */
		if (version == IPV4_VERSION) {
			proto = ipha->ipha_protocol;
			if (proto == IPPROTO_IGMP)
				return (B_TRUE);
			if (proto == IPPROTO_ICMP) {
				const struct icmp *icmp = (const struct icmp *)
				    (mp->b_rptr + IPH_HDR_LENGTH(ipha));

				/* Truncated ICMP header: reject */
				if ((uchar_t *)icmp + ICMP_MINLEN > mp->b_wptr)
					return (B_FALSE);
				if (icmp->icmp_type == ICMP_ROUTERADVERT ||
				    icmp->icmp_type == ICMP_ROUTERSOLICIT)
					return (B_TRUE);
			}
		} else {
			proto = ip6h->ip6_nxt;
			if (proto == IPPROTO_ICMPV6) {
				const icmp6_t *icmp6 = (const icmp6_t *)
				    (mp->b_rptr + IPV6_HDR_LEN);

				/* Truncated ICMPv6 header: reject */
				if ((uchar_t *)icmp6 + ICMP6_MINLEN >
				    mp->b_wptr)
					return (B_FALSE);
				if (icmp6->icmp6_type >= MLD_LISTENER_QUERY &&
				    icmp6->icmp6_type <= ICMP6_MAX_INFO_TYPE)
					return (B_TRUE);
			}
		}

		/*
		 * Look up the tnrhtp database and get the implicit label
		 * that is associated with the sending host and attach
		 * it to the packet.
		 */
		if ((src_rhtp = find_tpc(src, version, B_FALSE)) == NULL)
			return (B_FALSE);

		/*
		 * If peer is label-aware, mark as "implicit" rather than
		 * "unlabeled" to cause appropriate mac-exempt processing
		 * to happen.
		 */
		if (src_rhtp->tpc_tp.host_type == SUN_CIPSO)
			label_flags = TSLF_IMPLICIT_IN;
		else if (src_rhtp->tpc_tp.host_type == UNLABELED)
			label_flags = TSLF_UNLABELED;
		else {
			DTRACE_PROBE2(tx__get__pkt__label, char *,
			    "template(1) has unknown hosttype",
			    tsol_tpc_t *, src_rhtp);
		}


		/* Fails for host types other than UNLABELED and SUN_CIPSO */
		if (!tsol_find_unlabeled_label(src_rhtp, &sl, &doi)) {
			TPC_RELE(src_rhtp);
			return (B_FALSE);
		}
		TPC_RELE(src_rhtp);
		break;

	default:
		return (B_FALSE);
	}

	/* Make sure no other thread is messing with this mblk */
	ASSERT(DB_REF(mp) == 1);
	/* Preserve db_cpid */
	credp = msg_extractcred(mp, &cpid);
	if (credp == NULL) {
		/* No cred on the message yet: build one from the label */
		credp = newcred_from_bslabel(&sl, doi, KM_NOSLEEP);
	} else {
		cred_t	*newcr;

		/* Copy the existing cred with the new label, drop the old */
		newcr = copycred_from_bslabel(credp, &sl, doi,
		    KM_NOSLEEP);
		crfree(credp);
		credp = newcr;
	}
	if (credp == NULL)
		return (B_FALSE);

	crgetlabel(credp)->tsl_flags |= label_flags;

	mblk_setcred(mp, credp, cpid);
	crfree(credp);			/* mblk has ref on cred */

	return (B_TRUE);
}
873 
874 /*
875  * This routine determines whether the given packet should be accepted locally.
876  * It does a range/set check on the packet's label by looking up the given
877  * address in the remote host database.
878  */
879 boolean_t
880 tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
881     boolean_t shared_addr, const conn_t *connp)
882 {
883 	const cred_t *credp;
884 	ts_label_t *plabel, *conn_plabel;
885 	tsol_tpc_t *tp;
886 	boolean_t retv;
887 	const bslabel_t *label, *conn_label;
888 
889 	/*
890 	 * The cases in which this can happen are:
891 	 *	- IPv6 Router Alert, where ip_rput_data_v6 deliberately skips
892 	 *	  over the label attachment process.
893 	 *	- MLD output looped-back to ourselves.
894 	 *	- IPv4 Router Discovery, where tsol_get_pkt_label intentionally
895 	 *	  avoids the labeling process.
896 	 * We trust that all valid paths in the code set the cred pointer when
897 	 * needed.
898 	 */
899 	if ((credp = msg_getcred(mp, NULL)) == NULL)
900 		return (B_TRUE);
901 
902 	/*
903 	 * If this packet is from the inside (not a remote host) and has the
904 	 * same zoneid as the selected destination, then no checks are
905 	 * necessary.  Membership in the zone is enough proof.  This is
906 	 * intended to be a hot path through this function.
907 	 */
908 	if (!crisremote(credp) &&
909 	    crgetzone(credp) == crgetzone(connp->conn_cred))
910 		return (B_TRUE);
911 
912 	plabel = crgetlabel(credp);
913 	conn_plabel = crgetlabel(connp->conn_cred);
914 	ASSERT(plabel != NULL && conn_plabel != NULL);
915 
916 	label = label2bslabel(plabel);
917 	conn_label = label2bslabel(crgetlabel(connp->conn_cred));
918 
919 
920 	/*
921 	 * Implicitly labeled packets from label-aware sources
922 	 * go only to privileged receivers
923 	 */
924 	if ((plabel->tsl_flags & TSLF_IMPLICIT_IN) &&
925 	    (connp->conn_mac_mode != CONN_MAC_IMPLICIT)) {
926 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac_impl,
927 		    char *,
928 		    "implicitly labeled packet mp(1) for conn(2) "
929 		    "which isn't in implicit mac mode",
930 		    mblk_t *, mp, conn_t *, connp);
931 
932 		return (B_FALSE);
933 	}
934 
935 
936 	/*
937 	 * MLPs are always validated using the range and set of the local
938 	 * address, even when the remote host is unlabeled.
939 	 */
940 	if (connp->conn_mlp_type == mlptBoth ||
941 	/* LINTED: no consequent */
942 	    connp->conn_mlp_type == (shared_addr ? mlptShared : mlptPrivate)) {
943 		;
944 
945 	/*
946 	 * If this is a packet from an unlabeled sender, then we must apply
947 	 * different rules.  If the label is equal to the zone's label, then
948 	 * it's allowed.  If it's not equal, but the zone is either the global
949 	 * zone or the label is dominated by the zone's label, then allow it
950 	 * as long as it's in the range configured for the destination.
951 	 */
952 	} else if (plabel->tsl_flags & TSLF_UNLABELED) {
953 		if (plabel->tsl_doi == conn_plabel->tsl_doi &&
954 		    blequal(label, conn_label))
955 			return (B_TRUE);
956 
957 		/*
958 		 * conn_zoneid is global for an exclusive stack, thus we use
959 		 * conn_cred to get the zoneid
960 		 */
961 		if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
962 		    (crgetzoneid(connp->conn_cred) != GLOBAL_ZONEID &&
963 		    (plabel->tsl_doi != conn_plabel->tsl_doi ||
964 		    !bldominates(conn_label, label)))) {
965 			DTRACE_PROBE3(
966 			    tx__ip__log__drop__receivelocal__mac_unl,
967 			    char *,
968 			    "unlabeled packet mp(1) fails mac for conn(2)",
969 			    mblk_t *, mp, conn_t *, connp);
970 			return (B_FALSE);
971 		}
972 
973 	/*
974 	 * If this is a packet from a labeled sender, verify the
975 	 * label on the packet matches the connection label.
976 	 */
977 	} else {
978 		if (plabel->tsl_doi != conn_plabel->tsl_doi ||
979 		    !blequal(label, conn_label)) {
980 			DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac__slp,
981 			    char *,
982 			    "packet mp(1) failed label match to SLP conn(2)",
983 			    mblk_t *, mp, conn_t *, connp);
984 			return (B_FALSE);
985 		}
986 		/*
987 		 * No further checks will be needed if this is a zone-
988 		 * specific address because (1) The process for bringing up
989 		 * the interface ensures the zone's label is within the zone-
990 		 * specific address's valid label range; (2) For cases where
991 		 * the conn is bound to the unspecified addresses, ip fanout
992 		 * logic ensures conn's zoneid equals the dest addr's zoneid;
993 		 * (3) Mac-exempt and mlp logic above already handle all
994 		 * cases where the zone label may not be the same as the
995 		 * conn label.
996 		 */
997 		if (!shared_addr)
998 			return (B_TRUE);
999 	}
1000 
1001 	tp = find_tpc(addr, version, B_FALSE);
1002 	if (tp == NULL) {
1003 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__no__tnr,
1004 		    char *, "dropping mp(1), host(2) lacks entry",
1005 		    mblk_t *, mp, void *, addr);
1006 		return (B_FALSE);
1007 	}
1008 
1009 	/*
1010 	 * The local host address should not be unlabeled at this point.  The
1011 	 * only way this can happen is that the destination isn't unicast.  We
1012 	 * assume that the packet should not have had a label, and thus should
1013 	 * have been handled by the TSLF_UNLABELED logic above.
1014 	 */
1015 	if (tp->tpc_tp.host_type == UNLABELED) {
1016 		retv = B_FALSE;
1017 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__flag, char *,
1018 		    "mp(1) unlabeled source, but tp is not unlabeled.",
1019 		    mblk_t *, mp, tsol_tpc_t *, tp);
1020 
1021 	} else if (tp->tpc_tp.host_type != SUN_CIPSO) {
1022 		retv = B_FALSE;
1023 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__tptype, char *,
1024 		    "delivering mp(1), found unrecognized tpc(2) type.",
1025 		    mblk_t *, mp, tsol_tpc_t *, tp);
1026 
1027 	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
1028 		retv = B_FALSE;
1029 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
1030 		    "mp(1) could not be delievered to tp(2), doi mismatch",
1031 		    mblk_t *, mp, tsol_tpc_t *, tp);
1032 
1033 	} else if (!_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) &&
1034 	    !blinlset(label, tp->tpc_tp.tp_sl_set_cipso)) {
1035 		retv = B_FALSE;
1036 		DTRACE_PROBE3(tx__ip__log__drop__receivelocal__mac, char *,
1037 		    "mp(1) could not be delievered to tp(2), bad mac",
1038 		    mblk_t *, mp, tsol_tpc_t *, tp);
1039 	} else {
1040 		retv = B_TRUE;
1041 	}
1042 
1043 	TPC_RELE(tp);
1044 
1045 	return (retv);
1046 }
1047 
1048 boolean_t
1049 tsol_can_accept_raw(mblk_t *mp, boolean_t check_host)
1050 {
1051 	ts_label_t	*plabel = NULL;
1052 	tsol_tpc_t	*src_rhtp, *dst_rhtp;
1053 	boolean_t	retv;
1054 	cred_t		*credp;
1055 
1056 	credp = msg_getcred(mp, NULL);
1057 	if (credp != NULL)
1058 		plabel = crgetlabel(credp);
1059 
1060 	/* We are bootstrapping or the internal template was never deleted */
1061 	if (plabel == NULL)
1062 		return (B_TRUE);
1063 
1064 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1065 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
1066 
1067 		src_rhtp = find_tpc(&ipha->ipha_src, IPV4_VERSION,
1068 		    B_FALSE);
1069 		if (src_rhtp == NULL)
1070 			return (B_FALSE);
1071 		dst_rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION,
1072 		    B_FALSE);
1073 	} else {
1074 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1075 
1076 		src_rhtp = find_tpc(&ip6h->ip6_src, IPV6_VERSION,
1077 		    B_FALSE);
1078 		if (src_rhtp == NULL)
1079 			return (B_FALSE);
1080 		dst_rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION,
1081 		    B_FALSE);
1082 	}
1083 	if (dst_rhtp == NULL) {
1084 		TPC_RELE(src_rhtp);
1085 		return (B_FALSE);
1086 	}
1087 
1088 	if (label2doi(plabel) != src_rhtp->tpc_tp.tp_doi) {
1089 		retv = B_FALSE;
1090 
1091 	/*
1092 	 * Check that the packet's label is in the correct range for labeled
1093 	 * sender, or is equal to the default label for unlabeled sender.
1094 	 */
1095 	} else if ((src_rhtp->tpc_tp.host_type != UNLABELED &&
1096 	    !_blinrange(label2bslabel(plabel),
1097 	    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
1098 	    !blinlset(label2bslabel(plabel),
1099 	    src_rhtp->tpc_tp.tp_sl_set_cipso)) ||
1100 	    (src_rhtp->tpc_tp.host_type == UNLABELED &&
1101 	    !blequal(&plabel->tsl_label, &src_rhtp->tpc_tp.tp_def_label))) {
1102 		retv = B_FALSE;
1103 
1104 	} else if (check_host) {
1105 		retv = B_TRUE;
1106 
1107 	/*
1108 	 * Until we have SL range in the Zone structure, pass it
1109 	 * when our own address lookup returned an internal entry.
1110 	 */
1111 	} else switch (dst_rhtp->tpc_tp.host_type) {
1112 	case UNLABELED:
1113 		retv = B_TRUE;
1114 		break;
1115 
1116 	case SUN_CIPSO:
1117 		retv = _blinrange(label2bslabel(plabel),
1118 		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) ||
1119 		    blinlset(label2bslabel(plabel),
1120 		    dst_rhtp->tpc_tp.tp_sl_set_cipso);
1121 		break;
1122 
1123 	default:
1124 		retv = B_FALSE;
1125 	}
1126 	TPC_RELE(src_rhtp);
1127 	TPC_RELE(dst_rhtp);
1128 	return (retv);
1129 }
1130 
1131 /*
1132  * This routine determines whether a response to a failed packet delivery or
1133  * connection should be sent back.  By default, the policy is to allow such
1134  * messages to be sent at all times, as these messages reveal little useful
1135  * information and are healthy parts of TCP/IP networking.
1136  *
1137  * If tsol_strict_error is set, then we do strict tests: if the packet label is
1138  * within the label range/set of this host/zone, return B_TRUE; otherwise
1139  * return B_FALSE, which causes the packet to be dropped silently.
1140  *
1141  * Note that tsol_get_pkt_label will cause the packet to drop if the sender is
1142  * marked as labeled in the remote host database, but the packet lacks a label.
1143  * This means that we don't need to do a lookup on the source; the
1144  * TSLF_UNLABELED flag is sufficient.
1145  */
1146 boolean_t
1147 tsol_can_reply_error(const mblk_t *mp)
1148 {
1149 	ts_label_t	*plabel = NULL;
1150 	tsol_tpc_t	*rhtp;
1151 	const ipha_t	*ipha;
1152 	const ip6_t	*ip6h;
1153 	boolean_t	retv;
1154 	bslabel_t	*pktbs;
1155 	cred_t		*credp;
1156 
1157 	/* Caller must pull up at least the IP header */
1158 	ASSERT(MBLKL(mp) >= (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ?
1159 	    sizeof (*ipha) : sizeof (*ip6h)));
1160 
1161 	if (!tsol_strict_error)
1162 		return (B_TRUE);
1163 
1164 	credp = msg_getcred(mp, NULL);
1165 	if (credp != NULL)
1166 		plabel = crgetlabel(credp);
1167 
1168 	/* We are bootstrapping or the internal template was never deleted */
1169 	if (plabel == NULL)
1170 		return (B_TRUE);
1171 
1172 	if (plabel->tsl_flags & TSLF_IMPLICIT_IN) {
1173 		DTRACE_PROBE3(tx__ip__log__drop__replyerror__unresolved__label,
1174 		    char *,
1175 		    "cannot send error report for packet mp(1) with "
1176 		    "unresolved security label sl(2)",
1177 		    mblk_t *, mp, ts_label_t *, plabel);
1178 		return (B_FALSE);
1179 	}
1180 
1181 
1182 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1183 		ipha = (const ipha_t *)mp->b_rptr;
1184 		rhtp = find_tpc(&ipha->ipha_dst, IPV4_VERSION, B_FALSE);
1185 	} else {
1186 		ip6h = (const ip6_t *)mp->b_rptr;
1187 		rhtp = find_tpc(&ip6h->ip6_dst, IPV6_VERSION, B_FALSE);
1188 	}
1189 
1190 	if (rhtp == NULL || label2doi(plabel) != rhtp->tpc_tp.tp_doi) {
1191 		retv = B_FALSE;
1192 	} else {
1193 		/*
1194 		 * If we're in the midst of forwarding, then the destination
1195 		 * address might not be labeled.  In that case, allow unlabeled
1196 		 * packets through only if the default label is the same, and
1197 		 * labeled ones if they dominate.
1198 		 */
1199 		pktbs = label2bslabel(plabel);
1200 		switch (rhtp->tpc_tp.host_type) {
1201 		case UNLABELED:
1202 			if (plabel->tsl_flags & TSLF_UNLABELED) {
1203 				retv = blequal(pktbs,
1204 				    &rhtp->tpc_tp.tp_def_label);
1205 			} else {
1206 				retv = bldominates(pktbs,
1207 				    &rhtp->tpc_tp.tp_def_label);
1208 			}
1209 			break;
1210 
1211 		case SUN_CIPSO:
1212 			retv = _blinrange(pktbs,
1213 			    &rhtp->tpc_tp.tp_sl_range_cipso) ||
1214 			    blinlset(pktbs, rhtp->tpc_tp.tp_sl_set_cipso);
1215 			break;
1216 
1217 		default:
1218 			retv = B_FALSE;
1219 			break;
1220 		}
1221 	}
1222 
1223 	if (rhtp != NULL)
1224 		TPC_RELE(rhtp);
1225 
1226 	return (retv);
1227 }
1228 
1229 /*
1230  * Finds the zone associated with the given packet.  Returns GLOBAL_ZONEID if
1231  * the zone cannot be located.
1232  *
1233  * This is used by the classifier when the packet matches an ALL_ZONES IRE, and
1234  * there's no MLP defined.
1235  *
1236  * Note that we assume that this is only invoked in the ALL_ZONES case.
1237  * Handling other cases would require handle exclusive stack zones where either
1238  * this routine or the callers would have to map from
1239  * the zoneid (zone->zone_id) to what IP uses in conn_zoneid etc.
1240  */
1241 zoneid_t
1242 tsol_packet_to_zoneid(const mblk_t *mp)
1243 {
1244 	cred_t *cr = msg_getcred(mp, NULL);
1245 	zone_t *zone;
1246 	ts_label_t *label;
1247 
1248 	if (cr != NULL) {
1249 		if ((label = crgetlabel(cr)) != NULL) {
1250 			zone = zone_find_by_label(label);
1251 			if (zone != NULL) {
1252 				zoneid_t zoneid = zone->zone_id;
1253 
1254 				zone_rele(zone);
1255 				return (zoneid);
1256 			}
1257 		}
1258 	}
1259 	return (GLOBAL_ZONEID);
1260 }
1261 
/*
 * Checks whether the label tsl is acceptable for the route described by ire,
 * using the security attributes of the route's gateway.
 *
 * Returns 0 when the IRE passes.  That includes the cases where the system
 * is not labeled, the IRE is local/loopback/broadcast/interface, tsl is NULL,
 * or no gateway template is available and no gateway address was found to
 * look one up with.  Returns EACCES when tsl is given but the IRE has no
 * gateway security attributes, or when the label fails the DOI/range checks.
 * Returns EINVAL when a gateway address was found but no template could be
 * obtained for it.
 */
int
tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
{
	int		error = 0;
	tsol_ire_gw_secattr_t *attrp = NULL;
	tsol_tnrhc_t	*gw_rhc = NULL;
	tsol_gcgrp_t	*gcgrp = NULL;
	tsol_gc_t	*gc = NULL;
	in_addr_t	ga_addr4;
	void		*paddr = NULL;

	/* Not in Trusted mode or IRE is local/loopback/broadcast/interface */
	if (!is_system_labeled() ||
	    (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
	    IRE_INTERFACE)))
		goto done;

	/*
	 * If we don't have a label to compare with, or the IRE does not
	 * contain any gateway security attributes, there's not much that
	 * we can do.  We let the former case pass, and the latter fail,
	 * since the IRE doesn't qualify for a match due to the lack of
	 * security attributes.
	 */
	if (tsl == NULL || ire->ire_gw_secattr == NULL) {
		if (tsl != NULL) {
			DTRACE_PROBE3(
			    tx__ip__log__drop__irematch__nogwsec, char *,
			    "ire(1) lacks ire_gw_secattr when matching "
			    "label(2)", ire_t *, ire, ts_label_t *, tsl);
			error = EACCES;
		}
		goto done;
	}

	attrp = ire->ire_gw_secattr;

	/*
	 * The possible lock order scenarios related to the tsol gateway
	 * attribute locks are documented at the beginning of ip.c in the
	 * lock order scenario section.
	 */
	mutex_enter(&attrp->igsa_lock);

	/*
	 * Depending on the IRE type (prefix vs. cache), we seek the group
	 * structure which contains all security credentials of the gateway.
	 * A prefix IRE is associated with at most one gateway credential,
	 * while a cache IRE is associated with every credentials that the
	 * gateway has.
	 */
	if ((gc = attrp->igsa_gc) != NULL) {			/* prefix */
		gcgrp = gc->gc_grp;
		ASSERT(gcgrp != NULL);
		rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
	} else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {	/* cache */
		rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
		gc = gcgrp->gcgrp_head;
		if (gc == NULL) {
			/* gc group is empty, so drop the lock now */
			ASSERT(gcgrp->gcgrp_count == 0);
			rw_exit(&gcgrp->gcgrp_rwlock);
			gcgrp = NULL;
		}
	}

	/*
	 * From here on a non-NULL gcgrp means its rwlock is held as reader;
	 * take a reference as well.  Both are dropped at "done".
	 */
	if (gcgrp != NULL)
		GCGRP_REFHOLD(gcgrp);

	if ((gw_rhc = attrp->igsa_rhc) != NULL) {
		/*
		 * If our cached entry has grown stale, then discard it so we
		 * can get a new one.
		 */
		if (gw_rhc->rhc_invalid || gw_rhc->rhc_tpc->tpc_invalid) {
			TNRHC_RELE(gw_rhc);
			attrp->igsa_rhc = gw_rhc = NULL;
		} else {
			/* Local hold; released at "done". */
			TNRHC_HOLD(gw_rhc)
		}
	}

	/* Last attempt at loading the template had failed; try again */
	if (gw_rhc == NULL) {
		/*
		 * Pick the gateway address: the group's address if we have a
		 * group, else the IRE's gateway address if one is set.
		 */
		if (gcgrp != NULL) {
			tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;

			if (ire->ire_ipversion == IPV4_VERSION) {
				ASSERT(ga->ga_af == AF_INET);
				IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
				paddr = &ga_addr4;
			} else {
				ASSERT(ga->ga_af == AF_INET6);
				paddr = &ga->ga_addr;
			}
		} else if (ire->ire_ipversion == IPV6_VERSION &&
		    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
			paddr = &ire->ire_gateway_addr_v6;
		} else if (ire->ire_ipversion == IPV4_VERSION &&
		    ire->ire_gateway_addr != INADDR_ANY) {
			paddr = &ire->ire_gateway_addr;
		}

		/* We've found a gateway address to do the template lookup */
		if (paddr != NULL) {
			ASSERT(gw_rhc == NULL);
			gw_rhc = find_rhc(paddr, ire->ire_ipversion, B_FALSE);
			if (gw_rhc != NULL) {
				/*
				 * Note that if the lookup above returned an
				 * internal template, we'll use it for the
				 * time being, and do another lookup next
				 * time around.
				 */
				/* Another thread has loaded the template? */
				if (attrp->igsa_rhc != NULL) {
					TNRHC_RELE(gw_rhc)
					/* reload, it could be different */
					gw_rhc = attrp->igsa_rhc;
				} else {
					attrp->igsa_rhc = gw_rhc;
				}
				/*
				 * Hold an extra reference just like we did
				 * above prior to dropping the igsa_lock.
				 */
				TNRHC_HOLD(gw_rhc)
			}
		}
	}

	mutex_exit(&attrp->igsa_lock);
	/* Gateway template not found */
	if (gw_rhc == NULL) {
		/*
		 * If destination address is directly reachable through an
		 * interface rather than through a learned route, pass it.
		 * (paddr stays NULL when no gateway address was found above.)
		 */
		if (paddr != NULL) {
			DTRACE_PROBE3(
			    tx__ip__log__drop__irematch__nogwtmpl, char *,
			    "ire(1), label(2) off-link with no gw_rhc",
			    ire_t *, ire, ts_label_t *, tsl);
			error = EINVAL;
		}
		goto done;
	}

	if (gc != NULL) {
		tsol_gcdb_t *gcdb;
		/*
		 * In the case of IRE_CACHE we've got one or more gateway
		 * security credentials to compare against the passed in label.
		 * Perform label range comparison against each security
		 * credential of the gateway. In the case of a prefix ire
		 * we need to match against the security attributes of
		 * just the route itself, so the loop is executed only once.
		 */
		ASSERT(gcgrp != NULL);
		do {
			gcdb = gc->gc_db;
			if (tsl->tsl_doi == gcdb->gcdb_doi &&
			    _blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange))
				break;
			if (ire->ire_type == IRE_CACHE)
				gc = gc->gc_next;
			else
				gc = NULL;
		} while (gc != NULL);

		/* gc is NULL here only if no credential matched. */
		if (gc == NULL) {
			DTRACE_PROBE3(
			    tx__ip__log__drop__irematch__nogcmatched,
			    char *, "ire(1), tsl(2): all gc failed match",
			    ire_t *, ire, ts_label_t *, tsl);
			error = EACCES;
		}
	} else {
		/*
		 * We didn't find any gateway credentials in the IRE
		 * attributes; fall back to the gateway's template for
		 * label range checks, if we are required to do so.
		 * Host types other than SUN_CIPSO and UNLABELED are not
		 * checked here and leave error unchanged.
		 */
		ASSERT(gw_rhc != NULL);
		switch (gw_rhc->rhc_tpc->tpc_tp.host_type) {
		case SUN_CIPSO:
			if (tsl->tsl_doi != gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
			    (!_blinrange(&tsl->tsl_label,
			    &gw_rhc->rhc_tpc->tpc_tp.tp_sl_range_cipso) &&
			    !blinlset(&tsl->tsl_label,
			    gw_rhc->rhc_tpc->tpc_tp.tp_sl_set_cipso))) {
				error = EACCES;
				DTRACE_PROBE4(
				    tx__ip__log__drop__irematch__deftmpl,
				    char *, "ire(1), tsl(2), gw_rhc(3) "
				    "failed match (cipso gw)",
				    ire_t *, ire, ts_label_t *, tsl,
				    tsol_tnrhc_t *, gw_rhc);
			}
			break;

		case UNLABELED:
			/* Unlabeled gateways use the tp_gw_sl_* range and set */
			if (tsl->tsl_doi != gw_rhc->rhc_tpc->tpc_tp.tp_doi ||
			    (!_blinrange(&tsl->tsl_label,
			    &gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_range) &&
			    !blinlset(&tsl->tsl_label,
			    gw_rhc->rhc_tpc->tpc_tp.tp_gw_sl_set))) {
				error = EACCES;
				DTRACE_PROBE4(
				    tx__ip__log__drop__irematch__deftmpl,
				    char *, "ire(1), tsl(2), gw_rhc(3) "
				    "failed match (unlabeled gw)",
				    ire_t *, ire, ts_label_t *, tsl,
				    tsol_tnrhc_t *, gw_rhc);
			}
			break;
		}
	}

done:

	/* Drop the group's reader lock and reference, if still held. */
	if (gcgrp != NULL) {
		rw_exit(&gcgrp->gcgrp_rwlock);
		GCGRP_REFRELE(gcgrp);
	}

	/* Drop the local hold taken on the gateway template entry. */
	if (gw_rhc != NULL)
		TNRHC_RELE(gw_rhc)

	return (error);
}
1493 
1494 /*
1495  * Performs label accreditation checks for packet forwarding.
1496  *
1497  * Returns a pointer to the modified mblk if allowed for forwarding,
1498  * or NULL if the packet must be dropped.
1499  */
1500 mblk_t *
1501 tsol_ip_forward(ire_t *ire, mblk_t *mp)
1502 {
1503 	tsol_ire_gw_secattr_t *attrp = NULL;
1504 	ipha_t		*ipha;
1505 	ip6_t		*ip6h;
1506 	const void	*pdst;
1507 	const void	*psrc;
1508 	boolean_t	off_link;
1509 	tsol_tpc_t	*dst_rhtp, *gw_rhtp;
1510 	tsol_ip_label_t label_type;
1511 	uchar_t		*opt_ptr = NULL;
1512 	ts_label_t	*tsl;
1513 	uint8_t		proto;
1514 	int		af, adjust;
1515 	uint16_t	iplen;
1516 	boolean_t	need_tpc_rele = B_FALSE;
1517 	ipaddr_t	*gw;
1518 	ip_stack_t	*ipst = ire->ire_ipst;
1519 	cred_t		*credp;
1520 	pid_t		pid;
1521 
1522 	ASSERT(ire != NULL && mp != NULL);
1523 	ASSERT(ire->ire_stq != NULL);
1524 
1525 	af = (ire->ire_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6;
1526 
1527 	if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
1528 		ASSERT(ire->ire_ipversion == IPV4_VERSION);
1529 		ipha = (ipha_t *)mp->b_rptr;
1530 		psrc = &ipha->ipha_src;
1531 		pdst = &ipha->ipha_dst;
1532 		proto = ipha->ipha_protocol;
1533 
1534 		/*
1535 		 * off_link is TRUE if destination not directly reachable.
1536 		 * Surya note: we avoid creation of per-dst IRE_CACHE entries
1537 		 * for forwarded packets, so we set off_link to be TRUE
1538 		 * if the packet dst is different from the ire_addr of
1539 		 * the ire for the nexthop.
1540 		 */
1541 		off_link = ((ipha->ipha_dst != ire->ire_addr) ||
1542 		    (ire->ire_gateway_addr != INADDR_ANY));
1543 		if (!tsol_get_option_v4(mp, &label_type, &opt_ptr))
1544 			return (NULL);
1545 	} else {
1546 		ASSERT(ire->ire_ipversion == IPV6_VERSION);
1547 		ip6h = (ip6_t *)mp->b_rptr;
1548 		psrc = &ip6h->ip6_src;
1549 		pdst = &ip6h->ip6_dst;
1550 		proto = ip6h->ip6_nxt;
1551 
1552 		if (proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
1553 		    proto != IPPROTO_ICMPV6) {
1554 			uint8_t *nexthdrp;
1555 			uint16_t hdr_len;
1556 
1557 			if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len,
1558 			    &nexthdrp)) {
1559 				/* malformed packet; drop it */
1560 				return (NULL);
1561 			}
1562 			proto = *nexthdrp;
1563 		}
1564 
1565 		/* destination not directly reachable? */
1566 		off_link = !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
1567 		if (!tsol_get_option_v6(mp, &label_type, &opt_ptr))
1568 			return (NULL);
1569 	}
1570 
1571 	if ((tsl = msg_getlabel(mp)) == NULL)
1572 		return (mp);
1573 
1574 	if (tsl->tsl_flags & TSLF_IMPLICIT_IN) {
1575 		DTRACE_PROBE3(tx__ip__log__drop__forward__unresolved__label,
1576 		    char *,
1577 		    "cannot forward packet mp(1) with unresolved "
1578 		    "security label sl(2)",
1579 		    mblk_t *, mp, ts_label_t *, tsl);
1580 
1581 		return (NULL);
1582 	}
1583 
1584 
1585 	ASSERT(psrc != NULL && pdst != NULL);
1586 	dst_rhtp = find_tpc(pdst, ire->ire_ipversion, B_FALSE);
1587 
1588 	if (dst_rhtp == NULL) {
1589 		/*
1590 		 * Without a template we do not know if forwarding
1591 		 * violates MAC
1592 		 */
1593 		DTRACE_PROBE3(tx__ip__log__drop__forward__nodst, char *,
1594 		    "mp(1) dropped, no template for destination ip4|6(2)",
1595 		    mblk_t *, mp, void *, pdst);
1596 		return (NULL);
1597 	}
1598 
1599 	/*
1600 	 * Gateway template must have existed for off-link destinations,
1601 	 * since tsol_ire_match_gwattr has ensured such condition.
1602 	 */
1603 	if (ire->ire_ipversion == IPV4_VERSION && off_link) {
1604 		/*
1605 		 * Surya note: first check if we can get the gw_rhtp from
1606 		 * the ire_gw_secattr->igsa_rhc; if this is null, then
1607 		 * do a lookup based on the ire_addr (address of gw)
1608 		 */
1609 		if (ire->ire_gw_secattr != NULL &&
1610 		    ire->ire_gw_secattr->igsa_rhc != NULL) {
1611 			attrp = ire->ire_gw_secattr;
1612 			gw_rhtp = attrp->igsa_rhc->rhc_tpc;
1613 		} else  {
1614 			/*
1615 			 * use the ire_addr if this is the IRE_CACHE of nexthop
1616 			 */
1617 			gw = (ire->ire_gateway_addr == NULL? &ire->ire_addr :
1618 			    &ire->ire_gateway_addr);
1619 			gw_rhtp = find_tpc(gw, ire->ire_ipversion, B_FALSE);
1620 			need_tpc_rele = B_TRUE;
1621 		}
1622 		if (gw_rhtp == NULL) {
1623 			DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1624 			    "mp(1) dropped, no gateway in ire attributes(2)",
1625 			    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1626 			mp = NULL;
1627 			goto keep_label;
1628 		}
1629 	}
1630 	if (ire->ire_ipversion == IPV6_VERSION &&
1631 	    ((attrp = ire->ire_gw_secattr) == NULL || attrp->igsa_rhc == NULL ||
1632 	    (gw_rhtp = attrp->igsa_rhc->rhc_tpc) == NULL) && off_link) {
1633 		DTRACE_PROBE3(tx__ip__log__drop__forward__nogw, char *,
1634 		    "mp(1) dropped, no gateway in ire attributes(2)",
1635 		    mblk_t *, mp, tsol_ire_gw_secattr_t *, attrp);
1636 		mp = NULL;
1637 		goto keep_label;
1638 	}
1639 
1640 	/*
1641 	 * Check that the label for the packet is acceptable
1642 	 * by destination host; otherwise, drop it.
1643 	 */
1644 	switch (dst_rhtp->tpc_tp.host_type) {
1645 	case SUN_CIPSO:
1646 		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1647 		    (!_blinrange(&tsl->tsl_label,
1648 		    &dst_rhtp->tpc_tp.tp_sl_range_cipso) &&
1649 		    !blinlset(&tsl->tsl_label,
1650 		    dst_rhtp->tpc_tp.tp_sl_set_cipso))) {
1651 			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1652 			    "labeled packet mp(1) dropped, label(2) fails "
1653 			    "destination(3) accredation check",
1654 			    mblk_t *, mp, ts_label_t *, tsl,
1655 			    tsol_tpc_t *, dst_rhtp);
1656 			mp = NULL;
1657 			goto keep_label;
1658 		}
1659 		break;
1660 
1661 
1662 	case UNLABELED:
1663 		if (tsl->tsl_doi != dst_rhtp->tpc_tp.tp_doi ||
1664 		    !blequal(&dst_rhtp->tpc_tp.tp_def_label,
1665 		    &tsl->tsl_label)) {
1666 			DTRACE_PROBE4(tx__ip__log__drop__forward__mac, char *,
1667 			    "unlabeled packet mp(1) dropped, label(2) fails "
1668 			    "destination(3) accredation check",
1669 			    mblk_t *, mp, ts_label_t *, tsl,
1670 			    tsol_tpc_t *, dst_rhtp);
1671 			mp = NULL;
1672 			goto keep_label;
1673 		}
1674 		break;
1675 	}
1676 	if (label_type == OPT_CIPSO) {
1677 		/*
1678 		 * We keep the label on any of the following cases:
1679 		 *
1680 		 *   1. The destination is labeled (on/off-link).
1681 		 *   2. The unlabeled destination is off-link,
1682 		 *	and the next hop gateway is labeled.
1683 		 */
1684 		if (dst_rhtp->tpc_tp.host_type != UNLABELED ||
1685 		    (off_link &&
1686 		    gw_rhtp->tpc_tp.host_type != UNLABELED))
1687 			goto keep_label;
1688 
1689 		/*
1690 		 * Strip off the CIPSO option from the packet because: the
1691 		 * unlabeled destination host is directly reachable through
1692 		 * an interface (on-link); or, the unlabeled destination host
1693 		 * is not directly reachable (off-link), and the next hop
1694 		 * gateway is unlabeled.
1695 		 */
1696 		adjust = (af == AF_INET) ? tsol_remove_secopt(ipha, MBLKL(mp)) :
1697 		    tsol_remove_secopt_v6(ip6h, MBLKL(mp));
1698 
1699 		ASSERT(adjust <= 0);
1700 		if (adjust != 0) {
1701 
1702 			/* adjust is negative */
1703 			ASSERT((mp->b_wptr + adjust) >= mp->b_rptr);
1704 			mp->b_wptr += adjust;
1705 
1706 			if (af == AF_INET) {
1707 				ipha = (ipha_t *)mp->b_rptr;
1708 				iplen = ntohs(ipha->ipha_length) + adjust;
1709 				ipha->ipha_length = htons(iplen);
1710 				ipha->ipha_hdr_checksum = 0;
1711 				ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1712 			}
1713 			DTRACE_PROBE3(tx__ip__log__info__forward__adjust,
1714 			    char *,
1715 			    "mp(1) adjusted(2) for CIPSO option removal",
1716 			    mblk_t *, mp, int, adjust);
1717 		}
1718 		goto keep_label;
1719 	}
1720 
1721 	ASSERT(label_type == OPT_NONE);
1722 	ASSERT(dst_rhtp != NULL);
1723 
1724 	/*
1725 	 * We need to add CIPSO option if the destination or the next hop
1726 	 * gateway is labeled.  Otherwise, pass the packet as is.
1727 	 */
1728 	if (dst_rhtp->tpc_tp.host_type == UNLABELED &&
1729 	    (!off_link || gw_rhtp->tpc_tp.host_type == UNLABELED))
1730 		goto keep_label;
1731 
1732 
1733 	credp = msg_getcred(mp, &pid);
1734 	if ((af == AF_INET &&
1735 	    tsol_check_label(credp, &mp, CONN_MAC_DEFAULT, ipst, pid) != 0) ||
1736 	    (af == AF_INET6 &&
1737 	    tsol_check_label_v6(credp, &mp, CONN_MAC_DEFAULT, ipst,
1738 	    pid) != 0)) {
1739 		mp = NULL;
1740 		goto keep_label;
1741 	}
1742 
1743 	if (af == AF_INET) {
1744 		ipha = (ipha_t *)mp->b_rptr;
1745 		ipha->ipha_hdr_checksum = 0;
1746 		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1747 	}
1748 
1749 keep_label:
1750 	TPC_RELE(dst_rhtp);
1751 	if (need_tpc_rele && gw_rhtp != NULL)
1752 		TPC_RELE(gw_rhtp);
1753 	return (mp);
1754 }
1755 
1756 /*
1757  * Name:	tsol_pmtu_adjust()
1758  *
1759  * Returns the adjusted mtu after removing security option.
1760  * Removes/subtracts the option if the packet's cred indicates an unlabeled
1761  * sender or if pkt_diff indicates this system enlarged the packet.
1762  */
1763 uint32_t
1764 tsol_pmtu_adjust(mblk_t *mp, uint32_t mtu, int pkt_diff, int af)
1765 {
1766 	int		label_adj = 0;
1767 	uint32_t	min_mtu = IP_MIN_MTU;
1768 	tsol_tpc_t	*src_rhtp;
1769 	void		*src;
1770 
1771 	/*
1772 	 * Note: label_adj is non-positive, indicating the number of
1773 	 * bytes removed by removing the security option from the
1774 	 * header.
1775 	 */
1776 	if (af == AF_INET6) {
1777 		ip6_t	*ip6h;
1778 
1779 		min_mtu = IPV6_MIN_MTU;
1780 		ip6h = (ip6_t *)mp->b_rptr;
1781 		src = &ip6h->ip6_src;
1782 		if ((src_rhtp = find_tpc(src, IPV6_VERSION, B_FALSE)) == NULL)
1783 			return (mtu);
1784 		if (pkt_diff > 0 || src_rhtp->tpc_tp.host_type == UNLABELED) {
1785 			label_adj = tsol_remove_secopt_v6(
1786 			    (ip6_t *)mp->b_rptr, MBLKL(mp));
1787 		}
1788 	} else {
1789 		ipha_t    *ipha;
1790 
1791 		ASSERT(af == AF_INET);
1792 		ipha = (ipha_t *)mp->b_rptr;
1793 		src = &ipha->ipha_src;
1794 		if ((src_rhtp = find_tpc(src, IPV4_VERSION, B_FALSE)) == NULL)
1795 			return (mtu);
1796 		if (pkt_diff > 0 || src_rhtp->tpc_tp.host_type == UNLABELED)
1797 			label_adj = tsol_remove_secopt(
1798 			    (ipha_t *)mp->b_rptr, MBLKL(mp));
1799 	}
1800 	/*
1801 	 * Make pkt_diff non-negative and the larger of the bytes
1802 	 * previously added (if any) or just removed, since label
1803 	 * addition + subtraction may not be completely idempotent.
1804 	 */
1805 	if (pkt_diff < -label_adj)
1806 		pkt_diff = -label_adj;
1807 	if (pkt_diff > 0 && pkt_diff < mtu)
1808 		mtu -= pkt_diff;
1809 
1810 	TPC_RELE(src_rhtp);
1811 	return (MAX(mtu, min_mtu));
1812 }
1813 
1814 /*
1815  * Name:	tsol_rtsa_init()
1816  *
1817  * Normal:	Sanity checks on the route security attributes provided by
1818  *		user.  Convert it into a route security parameter list to
1819  *		be returned to caller.
1820  *
1821  * Output:	EINVAL if bad security attributes in the routing message
1822  *		ENOMEM if unable to allocate data structures
1823  *		0 otherwise.
1824  *
1825  * Note:	On input, cp must point to the end of any addresses in
1826  *		the rt_msghdr_t structure.
1827  */
1828 int
1829 tsol_rtsa_init(rt_msghdr_t *rtm, tsol_rtsecattr_t *sp, caddr_t cp)
1830 {
1831 	uint_t	sacnt;
1832 	int	err;
1833 	caddr_t	lim;
1834 	tsol_rtsecattr_t *tp;
1835 
1836 	ASSERT((cp >= (caddr_t)&rtm[1]) && sp != NULL);
1837 
1838 	/*
1839 	 * In theory, we could accept as many security attributes configured
1840 	 * per route destination.  However, the current design is limited
1841 	 * such that at most only one set security attributes is allowed to
1842 	 * be associated with a prefix IRE.  We therefore assert for now.
1843 	 */
1844 	/* LINTED */
1845 	ASSERT(TSOL_RTSA_REQUEST_MAX == 1);
1846 
1847 	sp->rtsa_cnt = 0;
1848 	lim = (caddr_t)rtm + rtm->rtm_msglen;
1849 	ASSERT(cp <= lim);
1850 
1851 	if ((lim - cp) < sizeof (rtm_ext_t) ||
1852 	    ((rtm_ext_t *)cp)->rtmex_type != RTMEX_GATEWAY_SECATTR)
1853 		return (0);
1854 
1855 	if (((rtm_ext_t *)cp)->rtmex_len < sizeof (tsol_rtsecattr_t))
1856 		return (EINVAL);
1857 
1858 	cp += sizeof (rtm_ext_t);
1859 
1860 	if ((lim - cp) < sizeof (*tp) ||
1861 	    (tp = (tsol_rtsecattr_t *)cp, (sacnt = tp->rtsa_cnt) == 0) ||
1862 	    (lim - cp) < TSOL_RTSECATTR_SIZE(sacnt))
1863 		return (EINVAL);
1864 
1865 	/*
1866 	 * Trying to add route security attributes when system
1867 	 * labeling service is not available, or when user supllies
1868 	 * more than the maximum number of security attributes
1869 	 * allowed per request.
1870 	 */
1871 	if ((sacnt > 0 && !is_system_labeled()) ||
1872 	    sacnt > TSOL_RTSA_REQUEST_MAX)
1873 		return (EINVAL);
1874 
1875 	/* Ensure valid credentials */
1876 	if ((err = rtsa_validate(&((tsol_rtsecattr_t *)cp)->
1877 	    rtsa_attr[0])) != 0) {
1878 		cp += sizeof (*sp);
1879 		return (err);
1880 	}
1881 
1882 	bcopy(cp, sp, sizeof (*sp));
1883 	cp += sizeof (*sp);
1884 	return (0);
1885 }
1886 
1887 int
1888 tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc,
1889     tsol_gcgrp_t *gcgrp)
1890 {
1891 	tsol_ire_gw_secattr_t *attrp;
1892 	boolean_t exists = B_FALSE;
1893 	in_addr_t ga_addr4;
1894 	void *paddr = NULL;
1895 
1896 	ASSERT(ire != NULL);
1897 
1898 	/*
1899 	 * The only time that attrp can be NULL is when this routine is
1900 	 * called for the first time during the creation/initialization
1901 	 * of the corresponding IRE.  It will only get cleared when the
1902 	 * IRE is deleted.
1903 	 */
1904 	if ((attrp = ire->ire_gw_secattr) == NULL) {
1905 		attrp = ire_gw_secattr_alloc(KM_NOSLEEP);
1906 		if (attrp == NULL)
1907 			return (ENOMEM);
1908 		ire->ire_gw_secattr = attrp;
1909 	} else {
1910 		exists = B_TRUE;
1911 		mutex_enter(&attrp->igsa_lock);
1912 
1913 		if (attrp->igsa_rhc != NULL) {
1914 			TNRHC_RELE(attrp->igsa_rhc);
1915 			attrp->igsa_rhc = NULL;
1916 		}
1917 
1918 		if (attrp->igsa_gc != NULL)
1919 			GC_REFRELE(attrp->igsa_gc);
1920 		if (attrp->igsa_gcgrp != NULL)
1921 			GCGRP_REFRELE(attrp->igsa_gcgrp);
1922 	}
1923 	ASSERT(!exists || MUTEX_HELD(&attrp->igsa_lock));
1924 
1925 	/*
1926 	 * References already held by caller and we keep them;
1927 	 * note that both gc and gcgrp may be set to NULL to
1928 	 * clear out igsa_gc and igsa_gcgrp, respectively.
1929 	 */
1930 	attrp->igsa_gc = gc;
1931 	attrp->igsa_gcgrp = gcgrp;
1932 
1933 	if (gcgrp == NULL && gc != NULL) {
1934 		gcgrp = gc->gc_grp;
1935 		ASSERT(gcgrp != NULL);
1936 	}
1937 
1938 	/*
1939 	 * Intialize the template for gateway; we use the gateway's
1940 	 * address found in either the passed in gateway credential
1941 	 * or group pointer, or the ire_gateway_addr{_v6} field.
1942 	 */
1943 	if (gcgrp != NULL) {
1944 		tsol_gcgrp_addr_t *ga = &gcgrp->gcgrp_addr;
1945 
1946 		/*
1947 		 * Caller is holding a reference, and that we don't
1948 		 * need to hold any lock to access the address.
1949 		 */
1950 		if (ipversion == IPV4_VERSION) {
1951 			ASSERT(ga->ga_af == AF_INET);
1952 			IN6_V4MAPPED_TO_IPADDR(&ga->ga_addr, ga_addr4);
1953 			paddr = &ga_addr4;
1954 		} else {
1955 			ASSERT(ga->ga_af == AF_INET6);
1956 			paddr = &ga->ga_addr;
1957 		}
1958 	} else if (ipversion == IPV6_VERSION &&
1959 	    !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
1960 		paddr = &ire->ire_gateway_addr_v6;
1961 	} else if (ipversion == IPV4_VERSION &&
1962 	    ire->ire_gateway_addr != INADDR_ANY) {
1963 		paddr = &ire->ire_gateway_addr;
1964 	}
1965 
1966 	/*
1967 	 * Lookup the gateway template; note that we could get an internal
1968 	 * template here, which we cache anyway.  During IRE matching, we'll
1969 	 * try to update this gateway template cache and hopefully get a
1970 	 * real one.
1971 	 */
1972 	if (paddr != NULL) {
1973 		attrp->igsa_rhc = find_rhc(paddr, ipversion, B_FALSE);
1974 	}
1975 
1976 	if (exists)
1977 		mutex_exit(&attrp->igsa_lock);
1978 
1979 	return (0);
1980 }
1981 
1982 /*
1983  * This function figures the type of MLP that we'll be using based on the
1984  * address that the user is binding and the zone.  If the address is
1985  * unspecified, then we're looking at both private and shared.  If it's one
1986  * of the zone's private addresses, then it's private only.  If it's one
1987  * of the global addresses, then it's shared only. Multicast addresses are
1988  * treated same as unspecified address.
1989  *
1990  * If we can't figure out what it is, then return mlptSingle.  That's actually
1991  * an error case.
1992  *
1993  * The callers are assume to pass in zone->zone_id and not the zoneid that
1994  * is stored in a conn_t (since the latter will be GLOBAL_ZONEID in an
1995  * exclusive stack zone).
1996  */
1997 mlp_type_t
1998 tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr,
1999     ip_stack_t *ipst)
2000 {
2001 	in_addr_t in4;
2002 	ire_t *ire;
2003 	ipif_t *ipif;
2004 	zoneid_t addrzone;
2005 	zoneid_t ip_zoneid;
2006 
2007 	ASSERT(addr != NULL);
2008 
2009 	/*
2010 	 * For exclusive stacks we set the zoneid to zero
2011 	 * to operate as if in the global zone for IRE and conn_t comparisons.
2012 	 */
2013 	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
2014 		ip_zoneid = GLOBAL_ZONEID;
2015 	else
2016 		ip_zoneid = zoneid;
2017 
2018 	if (version == IPV6_VERSION &&
2019 	    IN6_IS_ADDR_V4MAPPED((const in6_addr_t *)addr)) {
2020 		IN6_V4MAPPED_TO_IPADDR((const in6_addr_t *)addr, in4);
2021 		addr = &in4;
2022 		version = IPV4_VERSION;
2023 	}
2024 
2025 	if (version == IPV4_VERSION) {
2026 		in4 = *(const in_addr_t *)addr;
2027 		if ((in4 == INADDR_ANY) || CLASSD(in4)) {
2028 			return (mlptBoth);
2029 		}
2030 		ire = ire_cache_lookup(in4, ip_zoneid, NULL, ipst);
2031 	} else {
2032 		if (IN6_IS_ADDR_UNSPECIFIED((const in6_addr_t *)addr) ||
2033 		    IN6_IS_ADDR_MULTICAST((const in6_addr_t *)addr)) {
2034 			return (mlptBoth);
2035 		}
2036 		ire = ire_cache_lookup_v6(addr, ip_zoneid, NULL, ipst);
2037 	}
2038 	/*
2039 	 * If we can't find the IRE, then we have to behave exactly like
2040 	 * ip_bind_laddr{,_v6}.  That means looking up the IPIF so that users
2041 	 * can bind to addresses on "down" interfaces.
2042 	 *
2043 	 * If we can't find that either, then the bind is going to fail, so
2044 	 * just give up.  Note that there's a miniscule chance that the address
2045 	 * is in transition, but we don't bother handling that.
2046 	 */
2047 	if (ire == NULL) {
2048 		if (version == IPV4_VERSION)
2049 			ipif = ipif_lookup_addr(*(const in_addr_t *)addr, NULL,
2050 			    ip_zoneid, NULL, NULL, NULL, NULL, ipst);
2051 		else
2052 			ipif = ipif_lookup_addr_v6((const in6_addr_t *)addr,
2053 			    NULL, ip_zoneid, NULL, NULL, NULL, NULL, ipst);
2054 		if (ipif == NULL) {
2055 			return (mlptSingle);
2056 		}
2057 		addrzone = ipif->ipif_zoneid;
2058 		ipif_refrele(ipif);
2059 	} else {
2060 		addrzone = ire->ire_zoneid;
2061 		ire_refrele(ire);
2062 	}
2063 	return (addrzone == ALL_ZONES ? mlptShared : mlptPrivate);
2064 }
2065 
2066 /*
2067  * Since we are configuring local interfaces, and we know trusted
2068  * extension CDE requires local interfaces to be cipso host type in
2069  * order to function correctly, we'll associate a cipso template
2070  * to each local interface and let the interface come up.  Configuring
2071  * a local interface to be "unlabeled" host type is a configuration error.
2072  * We'll override that error and make the interface host type to be cipso
2073  * here.
2074  *
2075  * The code is optimized for the usual "success" case and unwinds things on
2076  * error.  We don't want to go to the trouble and expense of formatting the
2077  * interface name for the usual case where everything is configured correctly.
2078  */
2079 boolean_t
2080 tsol_check_interface_address(const ipif_t *ipif)
2081 {
2082 	tsol_tpc_t *tp;
2083 	char addrbuf[INET6_ADDRSTRLEN];
2084 	int af;
2085 	const void *addr;
2086 	zone_t *zone;
2087 	ts_label_t *plabel;
2088 	const bslabel_t *label;
2089 	char ifbuf[LIFNAMSIZ + 10];
2090 	const char *ifname;
2091 	boolean_t retval;
2092 	tsol_rhent_t rhent;
2093 	netstack_t *ns = ipif->ipif_ill->ill_ipst->ips_netstack;
2094 
2095 	if (IN6_IS_ADDR_V4MAPPED(&ipif->ipif_v6lcl_addr)) {
2096 		af = AF_INET;
2097 		addr = &V4_PART_OF_V6(ipif->ipif_v6lcl_addr);
2098 	} else {
2099 		af = AF_INET6;
2100 		addr = &ipif->ipif_v6lcl_addr;
2101 	}
2102 
2103 	tp = find_tpc(&ipif->ipif_v6lcl_addr, IPV6_VERSION, B_FALSE);
2104 
2105 	/* assumes that ALL_ZONES implies that there is no exclusive stack */
2106 	if (ipif->ipif_zoneid == ALL_ZONES) {
2107 		zone = NULL;
2108 	} else if (ns->netstack_stackid == GLOBAL_NETSTACKID) {
2109 		/* Shared stack case */
2110 		zone = zone_find_by_id(ipif->ipif_zoneid);
2111 	} else {
2112 		/* Exclusive stack case */
2113 		zone = zone_find_by_id(crgetzoneid(ipif->ipif_ill->ill_credp));
2114 	}
2115 	if (zone != NULL) {
2116 		plabel = zone->zone_slabel;
2117 		ASSERT(plabel != NULL);
2118 		label = label2bslabel(plabel);
2119 	}
2120 
2121 	/*
2122 	 * If it's CIPSO and an all-zones address, then we're done.
2123 	 * If it's a CIPSO zone specific address, the zone's label
2124 	 * must be in the range or set specified in the template.
2125 	 * When the remote host entry is missing or the template
2126 	 * type is incorrect for this interface, we create a
2127 	 * CIPSO host entry in kernel and allow the interface to be
2128 	 * brought up as CIPSO type.
2129 	 */
2130 	if (tp != NULL && (
2131 	    /* The all-zones case */
2132 	    (tp->tpc_tp.host_type == SUN_CIPSO &&
2133 	    tp->tpc_tp.tp_doi == default_doi &&
2134 	    ipif->ipif_zoneid == ALL_ZONES) ||
2135 	    /* The local-zone case */
2136 	    (zone != NULL && plabel->tsl_doi == tp->tpc_tp.tp_doi &&
2137 	    ((tp->tpc_tp.host_type == SUN_CIPSO &&
2138 	    (_blinrange(label, &tp->tpc_tp.tp_sl_range_cipso) ||
2139 	    blinlset(label, tp->tpc_tp.tp_sl_set_cipso))))))) {
2140 		if (zone != NULL)
2141 			zone_rele(zone);
2142 		TPC_RELE(tp);
2143 		return (B_TRUE);
2144 	}
2145 
2146 	ifname = ipif->ipif_ill->ill_name;
2147 	if (ipif->ipif_id != 0) {
2148 		(void) snprintf(ifbuf, sizeof (ifbuf), "%s:%u", ifname,
2149 		    ipif->ipif_id);
2150 		ifname = ifbuf;
2151 	}
2152 	(void) inet_ntop(af, addr, addrbuf, sizeof (addrbuf));
2153 
2154 	if (tp == NULL) {
2155 		cmn_err(CE_NOTE, "template entry for %s missing. Default to "
2156 		    "CIPSO type for %s", ifname, addrbuf);
2157 		retval = B_TRUE;
2158 	} else if (tp->tpc_tp.host_type == UNLABELED) {
2159 		cmn_err(CE_NOTE, "template type for %s incorrectly configured. "
2160 		    "Change to CIPSO type for %s", ifname, addrbuf);
2161 		retval = B_TRUE;
2162 	} else if (ipif->ipif_zoneid == ALL_ZONES) {
2163 		if (tp->tpc_tp.host_type != SUN_CIPSO) {
2164 			cmn_err(CE_NOTE, "%s failed: %s isn't set to CIPSO for "
2165 			    "all-zones. Converted to CIPSO.", ifname, addrbuf);
2166 			retval = B_TRUE;
2167 		} else {
2168 			cmn_err(CE_NOTE, "%s failed: %s has wrong DOI %d "
2169 			    "instead of %d", ifname, addrbuf,
2170 			    tp->tpc_tp.tp_doi, default_doi);
2171 			retval = B_FALSE;
2172 		}
2173 	} else if (zone == NULL) {
2174 		cmn_err(CE_NOTE, "%s failed: zoneid %d unknown",
2175 		    ifname, ipif->ipif_zoneid);
2176 		retval = B_FALSE;
2177 	} else if (plabel->tsl_doi != tp->tpc_tp.tp_doi) {
2178 		cmn_err(CE_NOTE, "%s failed: zone %s has DOI %d but %s has "
2179 		    "DOI %d", ifname, zone->zone_name, plabel->tsl_doi,
2180 		    addrbuf, tp->tpc_tp.tp_doi);
2181 		retval = B_FALSE;
2182 	} else {
2183 		cmn_err(CE_NOTE, "%s failed: zone %s label incompatible with "
2184 		    "%s", ifname, zone->zone_name, addrbuf);
2185 		tsol_print_label(label, "zone label");
2186 		retval = B_FALSE;
2187 	}
2188 
2189 	if (zone != NULL)
2190 		zone_rele(zone);
2191 	if (tp != NULL)
2192 		TPC_RELE(tp);
2193 	if (retval) {
2194 		/*
2195 		 * we've corrected a config error and let the interface
2196 		 * come up as cipso. Need to insert an rhent.
2197 		 */
2198 		if ((rhent.rh_address.ta_family = af) == AF_INET) {
2199 			rhent.rh_prefix = 32;
2200 			rhent.rh_address.ta_addr_v4 = *(struct in_addr *)addr;
2201 		} else {
2202 			rhent.rh_prefix = 128;
2203 			rhent.rh_address.ta_addr_v6 = *(in6_addr_t *)addr;
2204 		}
2205 		(void) strcpy(rhent.rh_template, "cipso");
2206 		if (tnrh_load(&rhent) != 0) {
2207 			cmn_err(CE_NOTE, "%s failed: Cannot insert CIPSO "
2208 			    "template for local addr %s", ifname, addrbuf);
2209 			retval = B_FALSE;
2210 		}
2211 	}
2212 	return (retval);
2213 }
2214