xref: /illumos-gate/usr/src/uts/common/inet/iptun/iptun.c (revision be4c8f742bc67a43d01e3ea82a814b7d6503dbfd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * iptun - IP Tunneling Driver
28  *
29  * This module is a GLDv3 driver that implements virtual datalinks over IP
30  * (a.k.a, IP tunneling).  The datalinks are managed through a dld ioctl
31  * interface (see iptun_ctl.c), and registered with GLDv3 using
32  * mac_register().  It implements the logic for various forms of IP (IPv4 or
33  * IPv6) encapsulation within IP (IPv4 or IPv6) by interacting with the ip
34  * module below it.  Each virtual IP tunnel datalink has a conn_t associated
35  * with it representing the "outer" IP connection.
36  *
37  * The module implements the following locking semantics:
38  *
39  * Lookups and deletions in iptun_hash are synchronized using iptun_hash_lock.
40  * See comments above iptun_hash_lock for details.
41  *
42  * No locks are ever held while calling up to GLDv3.  The general architecture
43  * of GLDv3 requires this, as the mac perimeter (essentially a lock) for a
44  * given link will be held while making downcalls (iptun_m_*() callbacks).
45  * Because we need to hold locks while handling downcalls, holding these locks
46  * while issuing upcalls results in deadlock scenarios.  See the block comment
47  * above iptun_task_cb() for details on how we safely issue upcalls without
48  * holding any locks.
49  *
 * The contents of each iptun_t are protected by its iptun_lock mutex, which
 * is acquired in iptun_enter() (called by iptun_enter_by_linkid()) and
 * released in iptun_exit().
53  *
54  * See comments in iptun_delete() and iptun_free() for details on how the
55  * iptun_t is deleted safely.
56  */
57 
58 #include <sys/types.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/modhash.h>
62 #include <sys/list.h>
63 #include <sys/strsun.h>
64 #include <sys/file.h>
65 #include <sys/systm.h>
66 #include <sys/tihdr.h>
67 #include <sys/param.h>
68 #include <sys/mac_provider.h>
69 #include <sys/mac_ipv4.h>
70 #include <sys/mac_ipv6.h>
71 #include <sys/mac_6to4.h>
72 #include <sys/tsol/tnet.h>
73 #include <sys/sunldi.h>
74 #include <netinet/in.h>
75 #include <netinet/ip6.h>
76 #include <inet/ip.h>
77 #include <inet/ip_ire.h>
78 #include <inet/ipsec_impl.h>
79 #include <sys/tsol/label.h>
80 #include <sys/tsol/tnet.h>
81 #include <inet/iptun.h>
82 #include "iptun_impl.h"
83 
/*
 * Do the tunnel type and address family match?  IPv4 and 6to4 tunnels take
 * AF_INET addresses; IPv6 tunnels take AF_INET6 addresses.
 *
 * Both arguments are parenthesized so that arbitrary expressions can be
 * passed safely.  Note that each argument may be evaluated more than once,
 * so arguments must not have side effects.
 */
#define	IPTUN_ADDR_MATCH(iptun_type, family)				\
	(((iptun_type) == IPTUN_TYPE_IPV4 && (family) == AF_INET) ||	\
	((iptun_type) == IPTUN_TYPE_IPV6 && (family) == AF_INET6) ||	\
	((iptun_type) == IPTUN_TYPE_6TO4 && (family) == AF_INET))

/* Convert an integral key (such as a link ID) into a mod_hash key. */
#define	IPTUN_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* Bounds on the tunnel link MTU for each outer IP version. */
#define	IPTUN_MIN_IPV4_MTU	576		/* ip.h still uses 68 (!) */
#define	IPTUN_MIN_IPV6_MTU	IPV6_MIN_MTU
#define	IPTUN_MAX_IPV4_MTU	(IP_MAXPACKET - sizeof (ipha_t))
#define	IPTUN_MAX_IPV6_MTU	(IP_MAXPACKET - sizeof (ip6_t) -	\
				    sizeof (iptun_encaplim_t))

/* Valid range for the hoplimit link property. */
#define	IPTUN_MIN_HOPLIMIT	1
#define	IPTUN_MAX_HOPLIMIT	UINT8_MAX

/* Valid range for the encaplimit link property. */
#define	IPTUN_MIN_ENCAPLIMIT	0
#define	IPTUN_MAX_ENCAPLIMIT	UINT8_MAX

/* The bits of an ipsec_req_t AH/ESP request that ask for (or forbid) IPsec. */
#define	IPTUN_IPSEC_REQ_MASK	(IPSEC_PREF_REQUIRED | IPSEC_PREF_NEVER)
105 
/*
 * Template used by iptun_headergen() to initialize the encapsulation limit
 * portion of an IPv6 tunnel's outer header.  The initializer is positional;
 * the per-field notes below are inferred from the constant names and should
 * be confirmed against the iptun_encaplim_t definition.
 */
static iptun_encaplim_t	iptun_encaplim_init = {
	{ IPPROTO_NONE, 0 },		/* presumably next header / length */
	IP6OPT_TUNNEL_LIMIT,		/* presumably the option type */
	1,				/* presumably the option length */
	IPTUN_DEFAULT_ENCAPLIMIT,	/* filled in with actual value later */
	IP6OPT_PADN,			/* presumably a trailing PadN option */
	1,
	0
};
115 
/*
 * Table containing per-iptun-type information.
 * Since IPv6 can run over all of these we have the IPv6 min as the min MTU.
 *
 * Each row is a positional iptun_typeinfo_t initializer.  Judging by the
 * values, the columns are presumably: tunnel type, MAC plugin identifier,
 * outer IP version, minimum MTU, maximum MTU, and whether the tunnel type
 * has a remote address (confirm against the iptun_typeinfo_t definition).
 * The final IPTUN_TYPE_UNKNOWN row appears to act as a terminator.
 */
static iptun_typeinfo_t	iptun_type_table[] = {
	{ IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION,
	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV4_MTU,	B_TRUE },
	{ IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION,
	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV6_MTU,	B_TRUE },
	{ IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION,
	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV4_MTU,	B_FALSE },
	{ IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE }
};
129 
/*
 * iptun_hash is an iptun_t lookup table by link ID protected by
 * iptun_hash_lock.  While the hash table's integrity is maintained via
 * internal locking in the mod_hash_*() functions, we need additional locking
 * so that an iptun_t cannot be deleted after a hash lookup has returned an
 * iptun_t and before iptun_lock has been entered.  As such, we use
 * iptun_hash_lock when doing lookups and removals from iptun_hash.
 */
mod_hash_t	*iptun_hash;
static kmutex_t	iptun_hash_lock;

/* Value reported by iptun_count(). */
static uint_t	iptun_tunnelcount;	/* total for all stacks */
/* NOTE(review): presumably the kmem cache for iptun_t allocations - confirm */
kmem_cache_t	*iptun_cache;
/* Task queue on which iptun_task_dispatch() schedules iptun_task_cb(). */
ddi_taskq_t 	*iptun_taskq;
144 
/*
 * The kinds of deferred mac upcalls that iptun_task_cb() knows how to issue.
 */
typedef enum {
	IPTUN_TASK_MTU_UPDATE,	/* tell mac about new tunnel link MTU */
	IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */
	IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */
	IPTUN_TASK_LINK_UPDATE,	/* tell mac about new link state */
	IPTUN_TASK_PDATA_UPDATE	/* tell mac about updated plugin data */
} iptun_task_t;

/*
 * Argument handed to iptun_task_cb() through the task queue.  It is
 * allocated in iptun_task_dispatch() and freed by iptun_task_cb().  The
 * tunnel is identified by link ID rather than by pointer, so the callback
 * looks the tunnel up again when it runs.
 */
typedef struct iptun_task_data_s {
	iptun_task_t	itd_task;
	datalink_id_t	itd_linkid;
} iptun_task_data_t;
157 
/* Forward declarations for functions used before they are defined. */
static void iptun_task_dispatch(iptun_t *, iptun_task_t);
static int iptun_enter(iptun_t *);
static void iptun_exit(iptun_t *);
static void iptun_headergen(iptun_t *, boolean_t);
static void iptun_drop_pkt(mblk_t *, uint64_t *);
static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *);
static void iptun_output(iptun_t *, mblk_t *);
static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
static void iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *);

static void iptun_output_6to4(iptun_t *, mblk_t *);
static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *);
static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    ip_recv_attr_t *);

static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    ixa_notify_arg_t);

/* The mac callback vector; its initializer is not in this part of the file. */
static mac_callbacks_t iptun_m_callbacks;
181 
182 static int
183 iptun_m_getstat(void *arg, uint_t stat, uint64_t *val)
184 {
185 	iptun_t	*iptun = arg;
186 	int	err = 0;
187 
188 	switch (stat) {
189 	case MAC_STAT_IERRORS:
190 		*val = iptun->iptun_ierrors;
191 		break;
192 	case MAC_STAT_OERRORS:
193 		*val = iptun->iptun_oerrors;
194 		break;
195 	case MAC_STAT_RBYTES:
196 		*val = iptun->iptun_rbytes;
197 		break;
198 	case MAC_STAT_IPACKETS:
199 		*val = iptun->iptun_ipackets;
200 		break;
201 	case MAC_STAT_OBYTES:
202 		*val = iptun->iptun_obytes;
203 		break;
204 	case MAC_STAT_OPACKETS:
205 		*val = iptun->iptun_opackets;
206 		break;
207 	case MAC_STAT_NORCVBUF:
208 		*val = iptun->iptun_norcvbuf;
209 		break;
210 	case MAC_STAT_NOXMTBUF:
211 		*val = iptun->iptun_noxmtbuf;
212 		break;
213 	default:
214 		err = ENOTSUP;
215 	}
216 
217 	return (err);
218 }
219 
220 static int
221 iptun_m_start(void *arg)
222 {
223 	iptun_t	*iptun = arg;
224 	int	err;
225 
226 	if ((err = iptun_enter(iptun)) == 0) {
227 		iptun->iptun_flags |= IPTUN_MAC_STARTED;
228 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
229 		iptun_exit(iptun);
230 	}
231 	return (err);
232 }
233 
234 static void
235 iptun_m_stop(void *arg)
236 {
237 	iptun_t *iptun = arg;
238 
239 	if (iptun_enter(iptun) == 0) {
240 		iptun->iptun_flags &= ~IPTUN_MAC_STARTED;
241 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
242 		iptun_exit(iptun);
243 	}
244 }
245 
246 /*
247  * iptun_m_setpromisc() does nothing and always succeeds.  This is because a
248  * tunnel data-link only ever receives packets that are destined exclusively
249  * for the local address of the tunnel.
250  */
251 /* ARGSUSED */
252 static int
253 iptun_m_setpromisc(void *arg, boolean_t on)
254 {
255 	return (0);
256 }
257 
258 /* ARGSUSED */
259 static int
260 iptun_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
261 {
262 	return (ENOTSUP);
263 }
264 
265 /*
266  * iptun_m_unicst() sets the local address.
267  */
268 /* ARGSUSED */
269 static int
270 iptun_m_unicst(void *arg, const uint8_t *addrp)
271 {
272 	iptun_t			*iptun = arg;
273 	int			err;
274 	struct sockaddr_storage	ss;
275 	struct sockaddr_in	*sin;
276 	struct sockaddr_in6	*sin6;
277 
278 	if ((err = iptun_enter(iptun)) == 0) {
279 		switch (iptun->iptun_typeinfo->iti_ipvers) {
280 		case IPV4_VERSION:
281 			sin = (struct sockaddr_in *)&ss;
282 			sin->sin_family = AF_INET;
283 			bcopy(addrp, &sin->sin_addr, sizeof (in_addr_t));
284 			break;
285 		case IPV6_VERSION:
286 			sin6 = (struct sockaddr_in6 *)&ss;
287 			sin6->sin6_family = AF_INET6;
288 			bcopy(addrp, &sin6->sin6_addr, sizeof (in6_addr_t));
289 			break;
290 		default:
291 			ASSERT(0);
292 		}
293 		err = iptun_setladdr(iptun, &ss);
294 		iptun_exit(iptun);
295 	}
296 	return (err);
297 }
298 
299 static mblk_t *
300 iptun_m_tx(void *arg, mblk_t *mpchain)
301 {
302 	mblk_t	*mp, *nmp;
303 	iptun_t	*iptun = arg;
304 
305 	if (!IS_IPTUN_RUNNING(iptun)) {
306 		iptun_drop_pkt(mpchain, &iptun->iptun_noxmtbuf);
307 		return (NULL);
308 	}
309 
310 	for (mp = mpchain; mp != NULL; mp = nmp) {
311 		nmp = mp->b_next;
312 		mp->b_next = NULL;
313 		iptun_output(iptun, mp);
314 	}
315 
316 	return (NULL);
317 }
318 
319 /* ARGSUSED */
320 static int
321 iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
322     uint_t pr_valsize, const void *pr_val)
323 {
324 	iptun_t		*iptun = barg;
325 	uint32_t	value = *(uint32_t *)pr_val;
326 	int		err;
327 
328 	/*
329 	 * We need to enter this iptun_t since we'll be modifying the outer
330 	 * header.
331 	 */
332 	if ((err = iptun_enter(iptun)) != 0)
333 		return (err);
334 
335 	switch (pr_num) {
336 	case MAC_PROP_IPTUN_HOPLIMIT:
337 		if (value < IPTUN_MIN_HOPLIMIT || value > IPTUN_MAX_HOPLIMIT) {
338 			err = EINVAL;
339 			break;
340 		}
341 		if (value != iptun->iptun_hoplimit) {
342 			iptun->iptun_hoplimit = (uint8_t)value;
343 			iptun_headergen(iptun, B_TRUE);
344 		}
345 		break;
346 	case MAC_PROP_IPTUN_ENCAPLIMIT:
347 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6 ||
348 		    value > IPTUN_MAX_ENCAPLIMIT) {
349 			err = EINVAL;
350 			break;
351 		}
352 		if (value != iptun->iptun_encaplimit) {
353 			iptun->iptun_encaplimit = (uint8_t)value;
354 			iptun_headergen(iptun, B_TRUE);
355 		}
356 		break;
357 	case MAC_PROP_MTU: {
358 		uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);
359 
360 		if (value < iptun->iptun_typeinfo->iti_minmtu ||
361 		    value > maxmtu) {
362 			err = EINVAL;
363 			break;
364 		}
365 		iptun->iptun_flags |= IPTUN_FIXED_MTU;
366 		if (value != iptun->iptun_mtu) {
367 			iptun->iptun_mtu = value;
368 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
369 		}
370 		break;
371 	}
372 	default:
373 		err = EINVAL;
374 	}
375 	iptun_exit(iptun);
376 	return (err);
377 }
378 
379 /* ARGSUSED */
380 static int
381 iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
382     uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm)
383 {
384 	iptun_t			*iptun = barg;
385 	mac_propval_range_t	range;
386 	boolean_t		is_default = (pr_flags & MAC_PROP_DEFAULT);
387 	boolean_t		is_possible = (pr_flags & MAC_PROP_POSSIBLE);
388 	int			err;
389 
390 	if ((err = iptun_enter(iptun)) != 0)
391 		return (err);
392 
393 	if ((pr_flags & ~(MAC_PROP_DEFAULT | MAC_PROP_POSSIBLE)) != 0) {
394 		err = ENOTSUP;
395 		goto done;
396 	}
397 	if (is_default && is_possible) {
398 		err = EINVAL;
399 		goto done;
400 	}
401 
402 	*perm = MAC_PROP_PERM_RW;
403 
404 	if (is_possible) {
405 		if (pr_valsize < sizeof (mac_propval_range_t)) {
406 			err = EINVAL;
407 			goto done;
408 		}
409 		range.mpr_count = 1;
410 		range.mpr_type = MAC_PROPVAL_UINT32;
411 	} else if (pr_valsize < sizeof (uint32_t)) {
412 		err = EINVAL;
413 		goto done;
414 	}
415 
416 	switch (pr_num) {
417 	case MAC_PROP_IPTUN_HOPLIMIT:
418 		if (is_possible) {
419 			range.range_uint32[0].mpur_min = IPTUN_MIN_HOPLIMIT;
420 			range.range_uint32[0].mpur_max = IPTUN_MAX_HOPLIMIT;
421 		} else if (is_default) {
422 			*(uint32_t *)pr_val = IPTUN_DEFAULT_HOPLIMIT;
423 		} else {
424 			*(uint32_t *)pr_val = iptun->iptun_hoplimit;
425 		}
426 		break;
427 	case MAC_PROP_IPTUN_ENCAPLIMIT:
428 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6) {
429 			err = ENOTSUP;
430 			goto done;
431 		}
432 		if (is_possible) {
433 			range.range_uint32[0].mpur_min = IPTUN_MIN_ENCAPLIMIT;
434 			range.range_uint32[0].mpur_max = IPTUN_MAX_ENCAPLIMIT;
435 		} else if (is_default) {
436 			*(uint32_t *)pr_val = IPTUN_DEFAULT_ENCAPLIMIT;
437 		} else {
438 			*(uint32_t *)pr_val = iptun->iptun_encaplimit;
439 		}
440 		break;
441 	case MAC_PROP_MTU: {
442 		uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);
443 
444 		if (is_possible) {
445 			range.range_uint32[0].mpur_min =
446 			    iptun->iptun_typeinfo->iti_minmtu;
447 			range.range_uint32[0].mpur_max = maxmtu;
448 		} else {
449 			/*
450 			 * The MAC module knows the current value and should
451 			 * never call us for it.  There is also no default
452 			 * MTU, as by default, it is a dynamic property.
453 			 */
454 			err = ENOTSUP;
455 			goto done;
456 		}
457 		break;
458 	}
459 	default:
460 		err = EINVAL;
461 		goto done;
462 	}
463 	if (is_possible)
464 		bcopy(&range, pr_val, sizeof (range));
465 done:
466 	iptun_exit(iptun);
467 	return (err);
468 }
469 
470 uint_t
471 iptun_count(void)
472 {
473 	return (iptun_tunnelcount);
474 }
475 
476 /*
477  * Enter an iptun_t exclusively.  This is essentially just a mutex, but we
478  * don't allow iptun_enter() to succeed on a tunnel if it's in the process of
479  * being deleted.
480  */
481 static int
482 iptun_enter(iptun_t *iptun)
483 {
484 	mutex_enter(&iptun->iptun_lock);
485 	while (iptun->iptun_flags & IPTUN_DELETE_PENDING)
486 		cv_wait(&iptun->iptun_enter_cv, &iptun->iptun_lock);
487 	if (iptun->iptun_flags & IPTUN_CONDEMNED) {
488 		mutex_exit(&iptun->iptun_lock);
489 		return (ENOENT);
490 	}
491 	return (0);
492 }
493 
494 /*
495  * Exit the tunnel entered in iptun_enter().
496  */
497 static void
498 iptun_exit(iptun_t *iptun)
499 {
500 	mutex_exit(&iptun->iptun_lock);
501 }
502 
503 /*
504  * Enter the IP tunnel instance by datalink ID.
505  */
506 static int
507 iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun)
508 {
509 	int err;
510 
511 	mutex_enter(&iptun_hash_lock);
512 	if (mod_hash_find(iptun_hash, IPTUN_HASH_KEY(linkid),
513 	    (mod_hash_val_t *)iptun) == 0)
514 		err = iptun_enter(*iptun);
515 	else
516 		err = ENOENT;
517 	if (err != 0)
518 		*iptun = NULL;
519 	mutex_exit(&iptun_hash_lock);
520 	return (err);
521 }
522 
523 /*
524  * Handle tasks that were deferred through the iptun_taskq because they require
525  * calling up to the mac module, and we can't call up to the mac module while
526  * holding locks.
527  *
528  * This is tricky to get right without introducing race conditions and
529  * deadlocks with the mac module, as we cannot issue an upcall while in the
530  * iptun_t.  The reason is that upcalls may try and enter the mac perimeter,
531  * while iptun callbacks (such as iptun_m_setprop()) called from the mac
532  * module will already have the perimeter held, and will then try and enter
533  * the iptun_t.  You can see the lock ordering problem with this; this will
534  * deadlock.
535  *
536  * The safe way to do this is to enter the iptun_t in question and copy the
537  * information we need out of it so that we can exit it and know that the
538  * information being passed up to the upcalls won't be subject to modification
539  * by other threads.  The problem now is that we need to exit it prior to
540  * issuing the upcall, but once we do this, a thread could come along and
541  * delete the iptun_t and thus the mac handle required to issue the upcall.
542  * To prevent this, we set the IPTUN_UPCALL_PENDING flag prior to exiting the
543  * iptun_t.  This flag is the condition associated with iptun_upcall_cv, which
544  * iptun_delete() will cv_wait() on.  When the upcall completes, we clear
545  * IPTUN_UPCALL_PENDING and cv_signal() any potentially waiting
546  * iptun_delete().  We can thus still safely use iptun->iptun_mh after having
547  * exited the iptun_t.
548  */
549 static void
550 iptun_task_cb(void *arg)
551 {
552 	iptun_task_data_t	*itd = arg;
553 	iptun_task_t		task = itd->itd_task;
554 	datalink_id_t		linkid = itd->itd_linkid;
555 	iptun_t			*iptun;
556 	uint32_t		mtu;
557 	iptun_addr_t		addr;
558 	link_state_t		linkstate;
559 	size_t			header_size;
560 	iptun_header_t		header;
561 
562 	kmem_free(itd, sizeof (*itd));
563 
564 	/*
565 	 * Note that if the lookup fails, it's because the tunnel was deleted
566 	 * between the time the task was dispatched and now.  That isn't an
567 	 * error.
568 	 */
569 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
570 		return;
571 
572 	iptun->iptun_flags |= IPTUN_UPCALL_PENDING;
573 
574 	switch (task) {
575 	case IPTUN_TASK_MTU_UPDATE:
576 		mtu = iptun->iptun_mtu;
577 		break;
578 	case IPTUN_TASK_LADDR_UPDATE:
579 		addr = iptun->iptun_laddr;
580 		break;
581 	case IPTUN_TASK_RADDR_UPDATE:
582 		addr = iptun->iptun_raddr;
583 		break;
584 	case IPTUN_TASK_LINK_UPDATE:
585 		linkstate = IS_IPTUN_RUNNING(iptun) ?
586 		    LINK_STATE_UP : LINK_STATE_DOWN;
587 		break;
588 	case IPTUN_TASK_PDATA_UPDATE:
589 		header_size = iptun->iptun_header_size;
590 		header = iptun->iptun_header;
591 		break;
592 	default:
593 		ASSERT(0);
594 	}
595 
596 	iptun_exit(iptun);
597 
598 	switch (task) {
599 	case IPTUN_TASK_MTU_UPDATE:
600 		(void) mac_maxsdu_update(iptun->iptun_mh, mtu);
601 		break;
602 	case IPTUN_TASK_LADDR_UPDATE:
603 		mac_unicst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
604 		break;
605 	case IPTUN_TASK_RADDR_UPDATE:
606 		mac_dst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
607 		break;
608 	case IPTUN_TASK_LINK_UPDATE:
609 		mac_link_update(iptun->iptun_mh, linkstate);
610 		break;
611 	case IPTUN_TASK_PDATA_UPDATE:
612 		if (mac_pdata_update(iptun->iptun_mh,
613 		    header_size == 0 ? NULL : &header, header_size) != 0)
614 			atomic_inc_64(&iptun->iptun_taskq_fail);
615 		break;
616 	}
617 
618 	mutex_enter(&iptun->iptun_lock);
619 	iptun->iptun_flags &= ~IPTUN_UPCALL_PENDING;
620 	cv_signal(&iptun->iptun_upcall_cv);
621 	mutex_exit(&iptun->iptun_lock);
622 }
623 
624 static void
625 iptun_task_dispatch(iptun_t *iptun, iptun_task_t iptun_task)
626 {
627 	iptun_task_data_t *itd;
628 
629 	itd = kmem_alloc(sizeof (*itd), KM_NOSLEEP);
630 	if (itd == NULL) {
631 		atomic_inc_64(&iptun->iptun_taskq_fail);
632 		return;
633 	}
634 	itd->itd_task = iptun_task;
635 	itd->itd_linkid = iptun->iptun_linkid;
636 	if (ddi_taskq_dispatch(iptun_taskq, iptun_task_cb, itd, DDI_NOSLEEP)) {
637 		atomic_inc_64(&iptun->iptun_taskq_fail);
638 		kmem_free(itd, sizeof (*itd));
639 	}
640 }
641 
642 /*
643  * Convert an iptun_addr_t to sockaddr_storage.
644  */
645 static void
646 iptun_getaddr(iptun_addr_t *iptun_addr, struct sockaddr_storage *ss)
647 {
648 	struct sockaddr_in	*sin;
649 	struct sockaddr_in6	*sin6;
650 
651 	bzero(ss, sizeof (*ss));
652 	switch (iptun_addr->ia_family) {
653 	case AF_INET:
654 		sin = (struct sockaddr_in *)ss;
655 		sin->sin_addr.s_addr = iptun_addr->ia_addr.iau_addr4;
656 		break;
657 	case AF_INET6:
658 		sin6 = (struct sockaddr_in6 *)ss;
659 		sin6->sin6_addr = iptun_addr->ia_addr.iau_addr6;
660 		break;
661 	default:
662 		ASSERT(0);
663 	}
664 	ss->ss_family = iptun_addr->ia_family;
665 }
666 
667 /*
668  * General purpose function to set an IP tunnel source or destination address.
669  */
670 static int
671 iptun_setaddr(iptun_type_t iptun_type, iptun_addr_t *iptun_addr,
672     const struct sockaddr_storage *ss)
673 {
674 	if (!IPTUN_ADDR_MATCH(iptun_type, ss->ss_family))
675 		return (EINVAL);
676 
677 	switch (ss->ss_family) {
678 	case AF_INET: {
679 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
680 
681 		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
682 		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
683 		    CLASSD(sin->sin_addr.s_addr)) {
684 			return (EADDRNOTAVAIL);
685 		}
686 		iptun_addr->ia_addr.iau_addr4 = sin->sin_addr.s_addr;
687 		break;
688 	}
689 	case AF_INET6: {
690 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
691 
692 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
693 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) ||
694 		    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
695 			return (EADDRNOTAVAIL);
696 		}
697 		iptun_addr->ia_addr.iau_addr6 = sin6->sin6_addr;
698 		break;
699 	}
700 	default:
701 		return (EAFNOSUPPORT);
702 	}
703 	iptun_addr->ia_family = ss->ss_family;
704 	return (0);
705 }
706 
707 static int
708 iptun_setladdr(iptun_t *iptun, const struct sockaddr_storage *laddr)
709 {
710 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
711 	    &iptun->iptun_laddr, laddr));
712 }
713 
714 static int
715 iptun_setraddr(iptun_t *iptun, const struct sockaddr_storage *raddr)
716 {
717 	if (!(iptun->iptun_typeinfo->iti_hasraddr))
718 		return (EINVAL);
719 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
720 	    &iptun->iptun_raddr, raddr));
721 }
722 
723 static boolean_t
724 iptun_canbind(iptun_t *iptun)
725 {
726 	/*
727 	 * A tunnel may bind when its source address has been set, and if its
728 	 * tunnel type requires one, also its destination address.
729 	 */
730 	return ((iptun->iptun_flags & IPTUN_LADDR) &&
731 	    ((iptun->iptun_flags & IPTUN_RADDR) ||
732 	    !(iptun->iptun_typeinfo->iti_hasraddr)));
733 }
734 
735 /*
736  * Verify that the local address is valid, and insert in the fanout
737  */
738 static int
739 iptun_bind(iptun_t *iptun)
740 {
741 	conn_t			*connp = iptun->iptun_connp;
742 	int			error = 0;
743 	ip_xmit_attr_t		*ixa;
744 	iulp_t			uinfo;
745 	ip_stack_t		*ipst = connp->conn_netstack->netstack_ip;
746 
747 	/* Get an exclusive ixa for this thread, and replace conn_ixa */
748 	ixa = conn_get_ixa(connp, B_TRUE);
749 	if (ixa == NULL)
750 		return (ENOMEM);
751 	ASSERT(ixa->ixa_refcnt >= 2);
752 	ASSERT(ixa == connp->conn_ixa);
753 
754 	/* We create PMTU state including for 6to4 */
755 	ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
756 
757 	ASSERT(iptun_canbind(iptun));
758 
759 	mutex_enter(&connp->conn_lock);
760 	/*
761 	 * Note that conn_proto can't be set since the upper protocol
762 	 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
763 	 * ipcl_iptun_classify doesn't use conn_proto.
764 	 */
765 	connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers;
766 
767 	switch (iptun->iptun_typeinfo->iti_type) {
768 	case IPTUN_TYPE_IPV4:
769 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
770 		    &connp->conn_laddr_v6);
771 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4,
772 		    &connp->conn_faddr_v6);
773 		ixa->ixa_flags |= IXAF_IS_IPV4;
774 		if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp),
775 		    ipst, B_FALSE) != IPVL_UNICAST_UP) {
776 			mutex_exit(&connp->conn_lock);
777 			error = EADDRNOTAVAIL;
778 			goto done;
779 		}
780 		break;
781 	case IPTUN_TYPE_IPV6:
782 		connp->conn_laddr_v6 = iptun->iptun_laddr6;
783 		connp->conn_faddr_v6 = iptun->iptun_raddr6;
784 		ixa->ixa_flags &= ~IXAF_IS_IPV4;
785 		/* We use a zero scopeid for now */
786 		if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp),
787 		    ipst, B_FALSE, 0) != IPVL_UNICAST_UP) {
788 			mutex_exit(&connp->conn_lock);
789 			error = EADDRNOTAVAIL;
790 			goto done;
791 		}
792 		break;
793 	case IPTUN_TYPE_6TO4:
794 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
795 		    &connp->conn_laddr_v6);
796 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6);
797 		ixa->ixa_flags |= IXAF_IS_IPV4;
798 		mutex_exit(&connp->conn_lock);
799 
800 		switch (ip_laddr_verify_v4(iptun->iptun_laddr4,
801 		    IPCL_ZONEID(connp), ipst, B_FALSE)) {
802 		case IPVL_UNICAST_UP:
803 		case IPVL_UNICAST_DOWN:
804 			break;
805 		default:
806 			error = EADDRNOTAVAIL;
807 			goto done;
808 		}
809 		goto insert;
810 	}
811 
812 	/* In case previous destination was multirt */
813 	ip_attr_newdst(ixa);
814 
815 	/*
816 	 * When we set a tunnel's destination address, we do not
817 	 * care if the destination is reachable.  Transient routing
818 	 * issues should not inhibit the creation of a tunnel
819 	 * interface, for example. Thus we pass B_FALSE here.
820 	 */
821 	connp->conn_saddr_v6 = connp->conn_laddr_v6;
822 	mutex_exit(&connp->conn_lock);
823 
824 	/* As long as the MTU is large we avoid fragmentation */
825 	ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF;
826 
827 	/* We handle IPsec in iptun_output_common */
828 	error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
829 	    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
830 	    &connp->conn_saddr_v6, &uinfo, 0);
831 
832 	if (error != 0)
833 		goto done;
834 
835 	/* saddr shouldn't change since it was already set */
836 	ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
837 	    &connp->conn_saddr_v6));
838 
839 	/* We set IXAF_VERIFY_PMTU to catch PMTU increases */
840 	ixa->ixa_flags |= IXAF_VERIFY_PMTU;
841 	ASSERT(uinfo.iulp_mtu != 0);
842 
843 	/*
844 	 * Allow setting new policies.
845 	 * The addresses/ports are already set, thus the IPsec policy calls
846 	 * can handle their passed-in conn's.
847 	 */
848 	connp->conn_policy_cached = B_FALSE;
849 
850 insert:
851 	error = ipcl_conn_insert(connp);
852 	if (error != 0)
853 		goto done;
854 
855 	/* Record this as the "last" send even though we haven't sent any */
856 	connp->conn_v6lastdst = connp->conn_faddr_v6;
857 
858 	iptun->iptun_flags |= IPTUN_BOUND;
859 	/*
860 	 * Now that we're bound with ip below us, this is a good
861 	 * time to initialize the destination path MTU and to
862 	 * re-calculate the tunnel's link MTU.
863 	 */
864 	(void) iptun_update_mtu(iptun, ixa, 0);
865 
866 	if (IS_IPTUN_RUNNING(iptun))
867 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
868 
869 done:
870 	ixa_refrele(ixa);
871 	return (error);
872 }
873 
874 static void
875 iptun_unbind(iptun_t *iptun)
876 {
877 	ASSERT(iptun->iptun_flags & IPTUN_BOUND);
878 	ASSERT(mutex_owned(&iptun->iptun_lock) ||
879 	    (iptun->iptun_flags & IPTUN_CONDEMNED));
880 	ip_unbind(iptun->iptun_connp);
881 	iptun->iptun_flags &= ~IPTUN_BOUND;
882 	if (!(iptun->iptun_flags & IPTUN_CONDEMNED))
883 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
884 }
885 
886 /*
887  * Re-generate the template data-link header for a given IP tunnel given the
888  * tunnel's current parameters.
889  */
890 static void
891 iptun_headergen(iptun_t *iptun, boolean_t update_mac)
892 {
893 	switch (iptun->iptun_typeinfo->iti_ipvers) {
894 	case IPV4_VERSION:
895 		/*
896 		 * We only need to use a custom IP header if the administrator
897 		 * has supplied a non-default hoplimit.
898 		 */
899 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT) {
900 			iptun->iptun_header_size = 0;
901 			break;
902 		}
903 		iptun->iptun_header_size = sizeof (ipha_t);
904 		iptun->iptun_header4.ipha_version_and_hdr_length =
905 		    IP_SIMPLE_HDR_VERSION;
906 		iptun->iptun_header4.ipha_fragment_offset_and_flags =
907 		    htons(IPH_DF);
908 		iptun->iptun_header4.ipha_ttl = iptun->iptun_hoplimit;
909 		break;
910 	case IPV6_VERSION: {
911 		ip6_t	*ip6hp = &iptun->iptun_header6.it6h_ip6h;
912 
913 		/*
914 		 * We only need to use a custom IPv6 header if either the
915 		 * administrator has supplied a non-default hoplimit, or we
916 		 * need to include an encapsulation limit option in the outer
917 		 * header.
918 		 */
919 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT &&
920 		    iptun->iptun_encaplimit == 0) {
921 			iptun->iptun_header_size = 0;
922 			break;
923 		}
924 
925 		(void) memset(ip6hp, 0, sizeof (*ip6hp));
926 		if (iptun->iptun_encaplimit == 0) {
927 			iptun->iptun_header_size = sizeof (ip6_t);
928 			ip6hp->ip6_nxt = IPPROTO_NONE;
929 		} else {
930 			iptun_encaplim_t	*iel;
931 
932 			iptun->iptun_header_size = sizeof (iptun_ipv6hdrs_t);
933 			/*
934 			 * The mac_ipv6 plugin requires ip6_plen to be in host
935 			 * byte order and reflect the extension headers
936 			 * present in the template.  The actual network byte
937 			 * order ip6_plen will be set on a per-packet basis on
938 			 * transmit.
939 			 */
940 			ip6hp->ip6_plen = sizeof (*iel);
941 			ip6hp->ip6_nxt = IPPROTO_DSTOPTS;
942 			iel = &iptun->iptun_header6.it6h_encaplim;
943 			*iel = iptun_encaplim_init;
944 			iel->iel_telopt.ip6ot_encap_limit =
945 			    iptun->iptun_encaplimit;
946 		}
947 
948 		ip6hp->ip6_hlim = iptun->iptun_hoplimit;
949 		break;
950 	}
951 	}
952 
953 	if (update_mac)
954 		iptun_task_dispatch(iptun, IPTUN_TASK_PDATA_UPDATE);
955 }
956 
957 /*
958  * Insert inbound and outbound IPv4 and IPv6 policy into the given policy
959  * head.
960  */
961 static boolean_t
962 iptun_insert_simple_policies(ipsec_policy_head_t *ph, ipsec_act_t *actp,
963     uint_t n, netstack_t *ns)
964 {
965 	int f = IPSEC_AF_V4;
966 
967 	if (!ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) ||
968 	    !ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns))
969 		return (B_FALSE);
970 
971 	f = IPSEC_AF_V6;
972 	return (ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) &&
973 	    ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns));
974 }
975 
976 /*
977  * Used to set IPsec policy when policy is set through the IPTUN_CREATE or
978  * IPTUN_MODIFY ioctls.
979  */
980 static int
981 iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr)
982 {
983 	int		rc = 0;
984 	uint_t		nact;
985 	ipsec_act_t	*actp = NULL;
986 	boolean_t	clear_all, old_policy = B_FALSE;
987 	ipsec_tun_pol_t	*itp;
988 	char		name[MAXLINKNAMELEN];
989 	uint64_t	gen;
990 	netstack_t	*ns = iptun->iptun_ns;
991 
992 	/* Can't specify self-encap on a tunnel. */
993 	if (ipsr->ipsr_self_encap_req != 0)
994 		return (EINVAL);
995 
996 	/*
997 	 * If it's a "clear-all" entry, unset the security flags and resume
998 	 * normal cleartext (or inherit-from-global) policy.
999 	 */
1000 	clear_all = ((ipsr->ipsr_ah_req & IPTUN_IPSEC_REQ_MASK) == 0 &&
1001 	    (ipsr->ipsr_esp_req & IPTUN_IPSEC_REQ_MASK) == 0);
1002 
1003 	ASSERT(mutex_owned(&iptun->iptun_lock));
1004 	itp = iptun->iptun_itp;
1005 	if (itp == NULL) {
1006 		if (clear_all)
1007 			goto bail;
1008 		if ((rc = dls_mgmt_get_linkinfo(iptun->iptun_linkid, name, NULL,
1009 		    NULL, NULL)) != 0)
1010 			goto bail;
1011 		ASSERT(name[0] != '\0');
1012 		if ((itp = create_tunnel_policy(name, &rc, &gen, ns)) == NULL)
1013 			goto bail;
1014 		iptun->iptun_itp = itp;
1015 	}
1016 
1017 	/* Allocate the actvec now, before holding itp or polhead locks. */
1018 	ipsec_actvec_from_req(ipsr, &actp, &nact, ns);
1019 	if (actp == NULL) {
1020 		rc = ENOMEM;
1021 		goto bail;
1022 	}
1023 
1024 	/*
1025 	 * Just write on the active polhead.  Save the primary/secondary stuff
1026 	 * for spdsock operations.
1027 	 *
1028 	 * Mutex because we need to write to the polhead AND flags atomically.
1029 	 * Other threads will acquire the polhead lock as a reader if the
1030 	 * (unprotected) flag is set.
1031 	 */
1032 	mutex_enter(&itp->itp_lock);
1033 	if (itp->itp_flags & ITPF_P_TUNNEL) {
1034 		/* Oops, we lost a race.  Let's get out of here. */
1035 		rc = EBUSY;
1036 		goto mutex_bail;
1037 	}
1038 	old_policy = ((itp->itp_flags & ITPF_P_ACTIVE) != 0);
1039 
1040 	if (old_policy) {
1041 		ITPF_CLONE(itp->itp_flags);
1042 		rc = ipsec_copy_polhead(itp->itp_policy, itp->itp_inactive, ns);
1043 		if (rc != 0) {
1044 			/* inactive has already been cleared. */
1045 			itp->itp_flags &= ~ITPF_IFLAGS;
1046 			goto mutex_bail;
1047 		}
1048 		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1049 		ipsec_polhead_flush(itp->itp_policy, ns);
1050 	} else {
1051 		/* Else assume itp->itp_policy is already flushed. */
1052 		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1053 	}
1054 
1055 	if (clear_all) {
1056 		ASSERT(avl_numnodes(&itp->itp_policy->iph_rulebyid) == 0);
1057 		itp->itp_flags &= ~ITPF_PFLAGS;
1058 		rw_exit(&itp->itp_policy->iph_lock);
1059 		old_policy = B_FALSE;	/* Clear out the inactive one too. */
1060 		goto recover_bail;
1061 	}
1062 
1063 	if (iptun_insert_simple_policies(itp->itp_policy, actp, nact, ns)) {
1064 		rw_exit(&itp->itp_policy->iph_lock);
1065 		/*
1066 		 * Adjust MTU and make sure the DL side knows what's up.
1067 		 */
1068 		itp->itp_flags = ITPF_P_ACTIVE;
1069 		(void) iptun_update_mtu(iptun, NULL, 0);
1070 		old_policy = B_FALSE;	/* Blank out inactive - we succeeded */
1071 	} else {
1072 		rw_exit(&itp->itp_policy->iph_lock);
1073 		rc = ENOMEM;
1074 	}
1075 
1076 recover_bail:
1077 	if (old_policy) {
1078 		/* Recover policy in in active polhead. */
1079 		ipsec_swap_policy(itp->itp_policy, itp->itp_inactive, ns);
1080 		ITPF_SWAP(itp->itp_flags);
1081 	}
1082 
1083 	/* Clear policy in inactive polhead. */
1084 	itp->itp_flags &= ~ITPF_IFLAGS;
1085 	rw_enter(&itp->itp_inactive->iph_lock, RW_WRITER);
1086 	ipsec_polhead_flush(itp->itp_inactive, ns);
1087 	rw_exit(&itp->itp_inactive->iph_lock);
1088 
1089 mutex_bail:
1090 	mutex_exit(&itp->itp_lock);
1091 
1092 bail:
1093 	if (actp != NULL)
1094 		ipsec_actvec_free(actp, nact);
1095 
1096 	return (rc);
1097 }
1098 
1099 static iptun_typeinfo_t *
1100 iptun_gettypeinfo(iptun_type_t type)
1101 {
1102 	int i;
1103 
1104 	for (i = 0; iptun_type_table[i].iti_type != IPTUN_TYPE_UNKNOWN; i++) {
1105 		if (iptun_type_table[i].iti_type == type)
1106 			break;
1107 	}
1108 	return (&iptun_type_table[i]);
1109 }
1110 
1111 /*
1112  * Set the parameters included in ik on the tunnel iptun.  Parameters that can
1113  * only be set at creation time are set in iptun_create().
1114  */
1115 static int
1116 iptun_setparams(iptun_t *iptun, const iptun_kparams_t *ik)
1117 {
1118 	int		err = 0;
1119 	netstack_t	*ns = iptun->iptun_ns;
1120 	iptun_addr_t	orig_laddr, orig_raddr;
1121 	uint_t		orig_flags = iptun->iptun_flags;
1122 
1123 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR) {
1124 		if (orig_flags & IPTUN_LADDR)
1125 			orig_laddr = iptun->iptun_laddr;
1126 		if ((err = iptun_setladdr(iptun, &ik->iptun_kparam_laddr)) != 0)
1127 			return (err);
1128 		iptun->iptun_flags |= IPTUN_LADDR;
1129 	}
1130 
1131 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) {
1132 		if (orig_flags & IPTUN_RADDR)
1133 			orig_raddr = iptun->iptun_raddr;
1134 		if ((err = iptun_setraddr(iptun, &ik->iptun_kparam_raddr)) != 0)
1135 			goto done;
1136 		iptun->iptun_flags |= IPTUN_RADDR;
1137 	}
1138 
1139 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_SECINFO) {
1140 		/*
1141 		 * Set IPsec policy originating from the ifconfig(1M) command
1142 		 * line.  This is traditionally called "simple" policy because
1143 		 * the ipsec_req_t (iptun_kparam_secinfo) can only describe a
1144 		 * simple policy of "do ESP on everything" and/or "do AH on
1145 		 * everything" (as opposed to the rich policy that can be
1146 		 * defined with ipsecconf(1M)).
1147 		 */
1148 		if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
1149 			/*
1150 			 * Can't set security properties for automatic
1151 			 * tunnels.
1152 			 */
1153 			err = EINVAL;
1154 			goto done;
1155 		}
1156 
1157 		if (!ipsec_loaded(ns->netstack_ipsec)) {
1158 			/* If IPsec can be loaded, try and load it now. */
1159 			if (ipsec_failed(ns->netstack_ipsec)) {
1160 				err = EPROTONOSUPPORT;
1161 				goto done;
1162 			}
1163 			ipsec_loader_loadnow(ns->netstack_ipsec);
1164 			/*
1165 			 * ipsec_loader_loadnow() returns while IPsec is
1166 			 * loaded asynchronously.  While a method exists to
1167 			 * wait for IPsec to load (ipsec_loader_wait()), it
1168 			 * requires use of a STREAMS queue to do a qwait().
1169 			 * We're not in STREAMS context here, and so we can't
1170 			 * use it.  This is not a problem in practice because
1171 			 * in the vast majority of cases, key management and
1172 			 * global policy will have loaded before any tunnels
1173 			 * are plumbed, and so IPsec will already have been
1174 			 * loaded.
1175 			 */
1176 			err = EAGAIN;
1177 			goto done;
1178 		}
1179 
1180 		err = iptun_set_sec_simple(iptun, &ik->iptun_kparam_secinfo);
1181 		if (err == 0) {
1182 			iptun->iptun_flags |= IPTUN_SIMPLE_POLICY;
1183 			iptun->iptun_simple_policy = ik->iptun_kparam_secinfo;
1184 		}
1185 	}
1186 done:
1187 	if (err != 0) {
1188 		/* Restore original source and destination. */
1189 		if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR &&
1190 		    (orig_flags & IPTUN_LADDR))
1191 			iptun->iptun_laddr = orig_laddr;
1192 		if ((ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) &&
1193 		    (orig_flags & IPTUN_RADDR))
1194 			iptun->iptun_raddr = orig_raddr;
1195 		iptun->iptun_flags = orig_flags;
1196 	}
1197 	return (err);
1198 }
1199 
1200 static int
1201 iptun_register(iptun_t *iptun)
1202 {
1203 	mac_register_t	*mac;
1204 	int		err;
1205 
1206 	ASSERT(!(iptun->iptun_flags & IPTUN_MAC_REGISTERED));
1207 
1208 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
1209 		return (EINVAL);
1210 
1211 	mac->m_type_ident = iptun->iptun_typeinfo->iti_ident;
1212 	mac->m_driver = iptun;
1213 	mac->m_dip = iptun_dip;
1214 	mac->m_instance = (uint_t)-1;
1215 	mac->m_src_addr = (uint8_t *)&iptun->iptun_laddr.ia_addr;
1216 	mac->m_dst_addr = iptun->iptun_typeinfo->iti_hasraddr ?
1217 	    (uint8_t *)&iptun->iptun_raddr.ia_addr : NULL;
1218 	mac->m_callbacks = &iptun_m_callbacks;
1219 	mac->m_min_sdu = iptun->iptun_typeinfo->iti_minmtu;
1220 	mac->m_max_sdu = iptun->iptun_mtu;
1221 	if (iptun->iptun_header_size != 0) {
1222 		mac->m_pdata = &iptun->iptun_header;
1223 		mac->m_pdata_size = iptun->iptun_header_size;
1224 	}
1225 	if ((err = mac_register(mac, &iptun->iptun_mh)) == 0)
1226 		iptun->iptun_flags |= IPTUN_MAC_REGISTERED;
1227 	mac_free(mac);
1228 	return (err);
1229 }
1230 
1231 static int
1232 iptun_unregister(iptun_t *iptun)
1233 {
1234 	int err;
1235 
1236 	ASSERT(iptun->iptun_flags & IPTUN_MAC_REGISTERED);
1237 	if ((err = mac_unregister(iptun->iptun_mh)) == 0)
1238 		iptun->iptun_flags &= ~IPTUN_MAC_REGISTERED;
1239 	return (err);
1240 }
1241 
1242 static conn_t *
1243 iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
1244 {
1245 	conn_t *connp;
1246 
1247 	if ((connp = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ns)) == NULL)
1248 		return (NULL);
1249 
1250 	connp->conn_flags |= IPCL_IPTUN;
1251 	connp->conn_iptun = iptun;
1252 	connp->conn_recv = iptun_input;
1253 	connp->conn_recvicmp = iptun_input_icmp;
1254 	connp->conn_verifyicmp = iptun_verifyicmp;
1255 
1256 	/*
1257 	 * Register iptun_notify to listen to capability changes detected by IP.
1258 	 * This upcall is made in the context of the call to conn_ip_output.
1259 	 */
1260 	connp->conn_ixa->ixa_notify = iptun_notify;
1261 	connp->conn_ixa->ixa_notify_cookie = iptun;
1262 
1263 	/*
1264 	 * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done
1265 	 * for all other conn_t's.
1266 	 *
1267 	 * Note that there's an important distinction between iptun_zoneid and
1268 	 * conn_zoneid.  The conn_zoneid is set to GLOBAL_ZONEID in non-global
1269 	 * exclusive stack zones to make the ip module believe that the
1270 	 * non-global zone is actually a global zone.  Therefore, when
1271 	 * interacting with the ip module, we must always use conn_zoneid.
1272 	 */
1273 	connp->conn_zoneid = (ns->netstack_stackid == GLOBAL_NETSTACKID) ?
1274 	    crgetzoneid(credp) : GLOBAL_ZONEID;
1275 	connp->conn_cred = credp;
1276 	/* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */
1277 	crhold(connp->conn_cred);
1278 	connp->conn_cpid = NOPID;
1279 
1280 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1281 	connp->conn_ixa->ixa_zoneid = connp->conn_zoneid;
1282 	ASSERT(connp->conn_ref == 1);
1283 
1284 	/* Cache things in ixa without an extra refhold */
1285 	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1286 	connp->conn_ixa->ixa_cred = connp->conn_cred;
1287 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1288 	if (is_system_labeled())
1289 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1290 
1291 	/*
1292 	 * Have conn_ip_output drop packets should our outer source
1293 	 * go invalid
1294 	 */
1295 	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1296 
1297 	switch (iptun->iptun_typeinfo->iti_ipvers) {
1298 	case IPV4_VERSION:
1299 		connp->conn_family = AF_INET6;
1300 		break;
1301 	case IPV6_VERSION:
1302 		connp->conn_family = AF_INET;
1303 		break;
1304 	}
1305 	mutex_enter(&connp->conn_lock);
1306 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1307 	mutex_exit(&connp->conn_lock);
1308 	return (connp);
1309 }
1310 
1311 static void
1312 iptun_conn_destroy(conn_t *connp)
1313 {
1314 	ip_quiesce_conn(connp);
1315 	connp->conn_iptun = NULL;
1316 	ASSERT(connp->conn_ref == 1);
1317 	CONN_DEC_REF(connp);
1318 }
1319 
1320 static iptun_t *
1321 iptun_alloc(void)
1322 {
1323 	iptun_t *iptun;
1324 
1325 	if ((iptun = kmem_cache_alloc(iptun_cache, KM_NOSLEEP)) != NULL) {
1326 		bzero(iptun, sizeof (*iptun));
1327 		atomic_inc_32(&iptun_tunnelcount);
1328 	}
1329 	return (iptun);
1330 }
1331 
1332 static void
1333 iptun_free(iptun_t *iptun)
1334 {
1335 	ASSERT(iptun->iptun_flags & IPTUN_CONDEMNED);
1336 
1337 	if (iptun->iptun_flags & IPTUN_HASH_INSERTED) {
1338 		iptun_stack_t	*iptuns = iptun->iptun_iptuns;
1339 
1340 		mutex_enter(&iptun_hash_lock);
1341 		VERIFY(mod_hash_remove(iptun_hash,
1342 		    IPTUN_HASH_KEY(iptun->iptun_linkid),
1343 		    (mod_hash_val_t *)&iptun) == 0);
1344 		mutex_exit(&iptun_hash_lock);
1345 		iptun->iptun_flags &= ~IPTUN_HASH_INSERTED;
1346 		mutex_enter(&iptuns->iptuns_lock);
1347 		list_remove(&iptuns->iptuns_iptunlist, iptun);
1348 		mutex_exit(&iptuns->iptuns_lock);
1349 	}
1350 
1351 	if (iptun->iptun_flags & IPTUN_BOUND)
1352 		iptun_unbind(iptun);
1353 
1354 	/*
1355 	 * After iptun_unregister(), there will be no threads executing a
1356 	 * downcall from the mac module, including in the tx datapath.
1357 	 */
1358 	if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
1359 		VERIFY(iptun_unregister(iptun) == 0);
1360 
1361 	if (iptun->iptun_itp != NULL) {
1362 		/*
1363 		 * Remove from the AVL tree, AND release the reference iptun_t
1364 		 * itself holds on the ITP.
1365 		 */
1366 		itp_unlink(iptun->iptun_itp, iptun->iptun_ns);
1367 		ITP_REFRELE(iptun->iptun_itp, iptun->iptun_ns);
1368 		iptun->iptun_itp = NULL;
1369 		iptun->iptun_flags &= ~IPTUN_SIMPLE_POLICY;
1370 	}
1371 
1372 	/*
1373 	 * After ipcl_conn_destroy(), there will be no threads executing an
1374 	 * upcall from ip (i.e., iptun_input()), and it is then safe to free
1375 	 * the iptun_t.
1376 	 */
1377 	if (iptun->iptun_connp != NULL) {
1378 		iptun_conn_destroy(iptun->iptun_connp);
1379 		iptun->iptun_connp = NULL;
1380 	}
1381 
1382 	kmem_cache_free(iptun_cache, iptun);
1383 	atomic_dec_32(&iptun_tunnelcount);
1384 }
1385 
1386 int
1387 iptun_create(iptun_kparams_t *ik, cred_t *credp)
1388 {
1389 	iptun_t		*iptun = NULL;
1390 	int		err = 0, mherr;
1391 	char		linkname[MAXLINKNAMELEN];
1392 	ipsec_tun_pol_t	*itp;
1393 	netstack_t	*ns = NULL;
1394 	iptun_stack_t	*iptuns;
1395 	datalink_id_t	tmpid;
1396 	zoneid_t	zoneid = crgetzoneid(credp);
1397 	boolean_t	link_created = B_FALSE;
1398 
1399 	/* The tunnel type is mandatory */
1400 	if (!(ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE))
1401 		return (EINVAL);
1402 
1403 	/*
1404 	 * Is the linkid that the caller wishes to associate with this new
1405 	 * tunnel assigned to this zone?
1406 	 */
1407 	if (zone_check_datalink(&zoneid, ik->iptun_kparam_linkid) != 0) {
1408 		if (zoneid != GLOBAL_ZONEID)
1409 			return (EINVAL);
1410 	} else if (zoneid == GLOBAL_ZONEID) {
1411 		return (EINVAL);
1412 	}
1413 
1414 	/*
1415 	 * Make sure that we're not trying to create a tunnel that has already
1416 	 * been created.
1417 	 */
1418 	if (iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun) == 0) {
1419 		iptun_exit(iptun);
1420 		iptun = NULL;
1421 		err = EEXIST;
1422 		goto done;
1423 	}
1424 
1425 	ns = netstack_find_by_cred(credp);
1426 	iptuns = ns->netstack_iptun;
1427 
1428 	if ((iptun = iptun_alloc()) == NULL) {
1429 		err = ENOMEM;
1430 		goto done;
1431 	}
1432 
1433 	iptun->iptun_linkid = ik->iptun_kparam_linkid;
1434 	iptun->iptun_zoneid = zoneid;
1435 	iptun->iptun_ns = ns;
1436 
1437 	iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type);
1438 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_UNKNOWN) {
1439 		err = EINVAL;
1440 		goto done;
1441 	}
1442 
1443 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_IMPLICIT)
1444 		iptun->iptun_flags |= IPTUN_IMPLICIT;
1445 
1446 	if ((err = iptun_setparams(iptun, ik)) != 0)
1447 		goto done;
1448 
1449 	iptun->iptun_hoplimit = IPTUN_DEFAULT_HOPLIMIT;
1450 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_IPV6)
1451 		iptun->iptun_encaplimit = IPTUN_DEFAULT_ENCAPLIMIT;
1452 
1453 	iptun_headergen(iptun, B_FALSE);
1454 
1455 	iptun->iptun_connp = iptun_conn_create(iptun, ns, credp);
1456 	if (iptun->iptun_connp == NULL) {
1457 		err = ENOMEM;
1458 		goto done;
1459 	}
1460 
1461 	iptun->iptun_mtu = iptun->iptun_typeinfo->iti_maxmtu;
1462 	iptun->iptun_dpmtu = iptun->iptun_mtu;
1463 
1464 	/*
1465 	 * Find an ITP based on linkname.  If we have parms already set via
1466 	 * the iptun_setparams() call above, it may have created an ITP for
1467 	 * us.  We always try get_tunnel_policy() for DEBUG correctness
1468 	 * checks, and we may wish to refactor this to only check when
1469 	 * iptun_itp is NULL.
1470 	 */
1471 	if ((err = dls_mgmt_get_linkinfo(iptun->iptun_linkid, linkname, NULL,
1472 	    NULL, NULL)) != 0)
1473 		goto done;
1474 	if ((itp = get_tunnel_policy(linkname, ns)) != NULL)
1475 		iptun->iptun_itp = itp;
1476 
1477 	/*
1478 	 * See if we have the necessary IP addresses assigned to this tunnel
1479 	 * to try and bind them with ip underneath us.  If we're not ready to
1480 	 * bind yet, then we'll defer the bind operation until the addresses
1481 	 * are modified.
1482 	 */
1483 	if (iptun_canbind(iptun) && ((err = iptun_bind(iptun)) != 0))
1484 		goto done;
1485 
1486 	if ((err = iptun_register(iptun)) != 0)
1487 		goto done;
1488 
1489 	err = dls_devnet_create(iptun->iptun_mh, iptun->iptun_linkid,
1490 	    iptun->iptun_zoneid);
1491 	if (err != 0)
1492 		goto done;
1493 	link_created = B_TRUE;
1494 
1495 	/*
1496 	 * We hash by link-id as that is the key used by all other iptun
1497 	 * interfaces (modify, delete, etc.).
1498 	 */
1499 	if ((mherr = mod_hash_insert(iptun_hash,
1500 	    IPTUN_HASH_KEY(iptun->iptun_linkid), (mod_hash_val_t)iptun)) == 0) {
1501 		mutex_enter(&iptuns->iptuns_lock);
1502 		list_insert_head(&iptuns->iptuns_iptunlist, iptun);
1503 		mutex_exit(&iptuns->iptuns_lock);
1504 		iptun->iptun_flags |= IPTUN_HASH_INSERTED;
1505 	} else if (mherr == MH_ERR_NOMEM) {
1506 		err = ENOMEM;
1507 	} else if (mherr == MH_ERR_DUPLICATE) {
1508 		err = EEXIST;
1509 	} else {
1510 		err = EINVAL;
1511 	}
1512 
1513 done:
1514 	if (iptun == NULL && ns != NULL)
1515 		netstack_rele(ns);
1516 	if (err != 0 && iptun != NULL) {
1517 		if (link_created) {
1518 			(void) dls_devnet_destroy(iptun->iptun_mh, &tmpid,
1519 			    B_TRUE);
1520 		}
1521 		iptun->iptun_flags |= IPTUN_CONDEMNED;
1522 		iptun_free(iptun);
1523 	}
1524 	return (err);
1525 }
1526 
1527 int
1528 iptun_delete(datalink_id_t linkid, cred_t *credp)
1529 {
1530 	int	err;
1531 	iptun_t	*iptun = NULL;
1532 
1533 	if ((err = iptun_enter_by_linkid(linkid, &iptun)) != 0)
1534 		return (err);
1535 
1536 	/* One cannot delete a tunnel that belongs to another zone. */
1537 	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1538 		iptun_exit(iptun);
1539 		return (EACCES);
1540 	}
1541 
1542 	/*
1543 	 * We need to exit iptun in order to issue calls up the stack such as
1544 	 * dls_devnet_destroy().  If we call up while still in iptun, deadlock
1545 	 * with calls coming down the stack is possible.  We prevent other
1546 	 * threads from entering this iptun after we've exited it by setting
1547 	 * the IPTUN_DELETE_PENDING flag.  This will cause callers of
1548 	 * iptun_enter() to block waiting on iptun_enter_cv.  The assumption
1549 	 * here is that the functions we're calling while IPTUN_DELETE_PENDING
1550 	 * is set dont resuult in an iptun_enter() call, as that would result
1551 	 * in deadlock.
1552 	 */
1553 	iptun->iptun_flags |= IPTUN_DELETE_PENDING;
1554 
1555 	/* Wait for any pending upcall to the mac module to complete. */
1556 	while (iptun->iptun_flags & IPTUN_UPCALL_PENDING)
1557 		cv_wait(&iptun->iptun_upcall_cv, &iptun->iptun_lock);
1558 
1559 	iptun_exit(iptun);
1560 
1561 	if ((err = dls_devnet_destroy(iptun->iptun_mh, &linkid, B_TRUE)) == 0) {
1562 		/*
1563 		 * mac_disable() will fail with EBUSY if there are references
1564 		 * to the iptun MAC.  If there are none, then mac_disable()
1565 		 * will assure that none can be acquired until the MAC is
1566 		 * unregistered.
1567 		 *
1568 		 * XXX CR 6791335 prevents us from calling mac_disable() prior
1569 		 * to dls_devnet_destroy(), so we unfortunately need to
1570 		 * attempt to re-create the devnet node if mac_disable()
1571 		 * fails.
1572 		 */
1573 		if ((err = mac_disable(iptun->iptun_mh)) != 0) {
1574 			(void) dls_devnet_create(iptun->iptun_mh, linkid,
1575 			    iptun->iptun_zoneid);
1576 		}
1577 	}
1578 
1579 	/*
1580 	 * Now that we know the fate of this iptun_t, we need to clear
1581 	 * IPTUN_DELETE_PENDING, and set IPTUN_CONDEMNED if the iptun_t is
1582 	 * slated to be freed.  Either way, we need to signal the threads
1583 	 * waiting in iptun_enter() so that they can either fail if
1584 	 * IPTUN_CONDEMNED is set, or continue if it's not.
1585 	 */
1586 	mutex_enter(&iptun->iptun_lock);
1587 	iptun->iptun_flags &= ~IPTUN_DELETE_PENDING;
1588 	if (err == 0)
1589 		iptun->iptun_flags |= IPTUN_CONDEMNED;
1590 	cv_broadcast(&iptun->iptun_enter_cv);
1591 	mutex_exit(&iptun->iptun_lock);
1592 
1593 	/*
1594 	 * Note that there is no danger in calling iptun_free() after having
1595 	 * dropped the iptun_lock since callers of iptun_enter() at this point
1596 	 * are doing so from iptun_enter_by_linkid() (mac_disable() got rid of
1597 	 * threads entering from mac callbacks which call iptun_enter()
1598 	 * directly) which holds iptun_hash_lock, and iptun_free() grabs this
1599 	 * lock in order to remove the iptun_t from the hash table.
1600 	 */
1601 	if (err == 0)
1602 		iptun_free(iptun);
1603 
1604 	return (err);
1605 }
1606 
1607 int
1608 iptun_modify(const iptun_kparams_t *ik, cred_t *credp)
1609 {
1610 	iptun_t		*iptun;
1611 	boolean_t	laddr_change = B_FALSE, raddr_change = B_FALSE;
1612 	int		err;
1613 
1614 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1615 		return (err);
1616 
1617 	/* One cannot modify a tunnel that belongs to another zone. */
1618 	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1619 		err = EACCES;
1620 		goto done;
1621 	}
1622 
1623 	/* The tunnel type cannot be changed */
1624 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE) {
1625 		err = EINVAL;
1626 		goto done;
1627 	}
1628 
1629 	if ((err = iptun_setparams(iptun, ik)) != 0)
1630 		goto done;
1631 	iptun_headergen(iptun, B_FALSE);
1632 
1633 	/*
1634 	 * If any of the tunnel's addresses has been modified and the tunnel
1635 	 * has the necessary addresses assigned to it, we need to try to bind
1636 	 * with ip underneath us.  If we're not ready to bind yet, then we'll
1637 	 * try again when the addresses are modified later.
1638 	 */
1639 	laddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR);
1640 	raddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR);
1641 	if (laddr_change || raddr_change) {
1642 		if (iptun->iptun_flags & IPTUN_BOUND)
1643 			iptun_unbind(iptun);
1644 		if (iptun_canbind(iptun) && (err = iptun_bind(iptun)) != 0) {
1645 			if (laddr_change)
1646 				iptun->iptun_flags &= ~IPTUN_LADDR;
1647 			if (raddr_change)
1648 				iptun->iptun_flags &= ~IPTUN_RADDR;
1649 			goto done;
1650 		}
1651 	}
1652 
1653 	if (laddr_change)
1654 		iptun_task_dispatch(iptun, IPTUN_TASK_LADDR_UPDATE);
1655 	if (raddr_change)
1656 		iptun_task_dispatch(iptun, IPTUN_TASK_RADDR_UPDATE);
1657 
1658 done:
1659 	iptun_exit(iptun);
1660 	return (err);
1661 }
1662 
1663 /* Given an IP tunnel's datalink id, fill in its parameters. */
1664 int
1665 iptun_info(iptun_kparams_t *ik, cred_t *credp)
1666 {
1667 	iptun_t	*iptun;
1668 	int	err;
1669 
1670 	/* Is the tunnel link visible from the caller's zone? */
1671 	if (!dls_devnet_islinkvisible(ik->iptun_kparam_linkid,
1672 	    crgetzoneid(credp)))
1673 		return (ENOENT);
1674 
1675 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1676 		return (err);
1677 
1678 	bzero(ik, sizeof (iptun_kparams_t));
1679 
1680 	ik->iptun_kparam_linkid = iptun->iptun_linkid;
1681 	ik->iptun_kparam_type = iptun->iptun_typeinfo->iti_type;
1682 	ik->iptun_kparam_flags |= IPTUN_KPARAM_TYPE;
1683 
1684 	if (iptun->iptun_flags & IPTUN_LADDR) {
1685 		iptun_getaddr(&iptun->iptun_laddr, &ik->iptun_kparam_laddr);
1686 		ik->iptun_kparam_flags |= IPTUN_KPARAM_LADDR;
1687 	}
1688 	if (iptun->iptun_flags & IPTUN_RADDR) {
1689 		iptun_getaddr(&iptun->iptun_raddr, &ik->iptun_kparam_raddr);
1690 		ik->iptun_kparam_flags |= IPTUN_KPARAM_RADDR;
1691 	}
1692 
1693 	if (iptun->iptun_flags & IPTUN_IMPLICIT)
1694 		ik->iptun_kparam_flags |= IPTUN_KPARAM_IMPLICIT;
1695 
1696 	if (iptun->iptun_itp != NULL) {
1697 		mutex_enter(&iptun->iptun_itp->itp_lock);
1698 		if (iptun->iptun_itp->itp_flags & ITPF_P_ACTIVE) {
1699 			ik->iptun_kparam_flags |= IPTUN_KPARAM_IPSECPOL;
1700 			if (iptun->iptun_flags & IPTUN_SIMPLE_POLICY) {
1701 				ik->iptun_kparam_flags |= IPTUN_KPARAM_SECINFO;
1702 				ik->iptun_kparam_secinfo =
1703 				    iptun->iptun_simple_policy;
1704 			}
1705 		}
1706 		mutex_exit(&iptun->iptun_itp->itp_lock);
1707 	}
1708 
1709 done:
1710 	iptun_exit(iptun);
1711 	return (err);
1712 }
1713 
1714 int
1715 iptun_set_6to4relay(netstack_t *ns, ipaddr_t relay_addr)
1716 {
1717 	if (relay_addr == INADDR_BROADCAST || CLASSD(relay_addr))
1718 		return (EADDRNOTAVAIL);
1719 	ns->netstack_iptun->iptuns_relay_rtr_addr = relay_addr;
1720 	return (0);
1721 }
1722 
1723 void
1724 iptun_get_6to4relay(netstack_t *ns, ipaddr_t *relay_addr)
1725 {
1726 	*relay_addr = ns->netstack_iptun->iptuns_relay_rtr_addr;
1727 }
1728 
1729 void
1730 iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp)
1731 {
1732 	iptun_t	*iptun;
1733 
1734 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
1735 		return;
1736 	if (iptun->iptun_itp != itp) {
1737 		ASSERT(iptun->iptun_itp == NULL);
1738 		ITP_REFHOLD(itp);
1739 		iptun->iptun_itp = itp;
1740 	}
1741 	/*
1742 	 * IPsec policy means IPsec overhead, which means lower MTU.
1743 	 * Refresh the MTU for this tunnel.
1744 	 */
1745 	(void) iptun_update_mtu(iptun, NULL, 0);
1746 	iptun_exit(iptun);
1747 }
1748 
1749 /*
1750  * Obtain the path MTU to the tunnel destination.
1751  * Can return zero in some cases.
1752  */
1753 static uint32_t
1754 iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1755 {
1756 	uint32_t	pmtu = 0;
1757 	conn_t		*connp = iptun->iptun_connp;
1758 	boolean_t	need_rele = B_FALSE;
1759 
1760 	/*
1761 	 * We only obtain the pmtu for tunnels that have a remote tunnel
1762 	 * address.
1763 	 */
1764 	if (!(iptun->iptun_flags & IPTUN_RADDR))
1765 		return (0);
1766 
1767 	if (ixa == NULL) {
1768 		ixa = conn_get_ixa(connp, B_FALSE);
1769 		if (ixa == NULL)
1770 			return (0);
1771 		need_rele = B_TRUE;
1772 	}
1773 	/*
1774 	 * Guard against ICMP errors before we have sent, as well as against
1775 	 * and a thread which held conn_ixa.
1776 	 */
1777 	if (ixa->ixa_ire != NULL) {
1778 		pmtu = ip_get_pmtu(ixa);
1779 
1780 		/*
1781 		 * For both IPv4 and IPv6 we can have indication that the outer
1782 		 * header needs fragmentation.
1783 		 */
1784 		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1785 			/* Must allow fragmentation in ip_output */
1786 			ixa->ixa_flags &= ~IXAF_DONTFRAG;
1787 		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1788 			ixa->ixa_flags |= IXAF_DONTFRAG;
1789 		} else {
1790 			/* ip_get_pmtu might have set this - we don't want it */
1791 			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1792 		}
1793 	}
1794 
1795 	if (need_rele)
1796 		ixa_refrele(ixa);
1797 	return (pmtu);
1798 }
1799 
1800 /*
1801  * Update the ip_xmit_attr_t to capture the current lower path mtu as known
1802  * by ip.
1803  */
1804 static void
1805 iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1806 {
1807 	uint32_t	pmtu;
1808 	conn_t		*connp = iptun->iptun_connp;
1809 	boolean_t	need_rele = B_FALSE;
1810 
1811 	/* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */
1812 	if (!(iptun->iptun_flags & IPTUN_RADDR))
1813 		return;
1814 
1815 	if (ixa == NULL) {
1816 		ixa = conn_get_ixa(connp, B_FALSE);
1817 		if (ixa == NULL)
1818 			return;
1819 		need_rele = B_TRUE;
1820 	}
1821 	/*
1822 	 * Guard against ICMP errors before we have sent, as well as against
1823 	 * and a thread which held conn_ixa.
1824 	 */
1825 	if (ixa->ixa_ire != NULL) {
1826 		pmtu = ip_get_pmtu(ixa);
1827 		/*
1828 		 * Update ixa_fragsize and ixa_pmtu.
1829 		 */
1830 		ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
1831 
1832 		/*
1833 		 * For both IPv4 and IPv6 we can have indication that the outer
1834 		 * header needs fragmentation.
1835 		 */
1836 		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1837 			/* Must allow fragmentation in ip_output */
1838 			ixa->ixa_flags &= ~IXAF_DONTFRAG;
1839 		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1840 			ixa->ixa_flags |= IXAF_DONTFRAG;
1841 		} else {
1842 			/* ip_get_pmtu might have set this - we don't want it */
1843 			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1844 		}
1845 	}
1846 
1847 	if (need_rele)
1848 		ixa_refrele(ixa);
1849 }
1850 
1851 /*
1852  * There is nothing that iptun can verify in addition to IP having
1853  * verified the IP addresses in the fanout.
1854  */
1855 /* ARGSUSED */
1856 static boolean_t
1857 iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
1858     ip_recv_attr_t *ira)
1859 {
1860 	return (B_TRUE);
1861 }
1862 
1863 /*
1864  * Notify function registered with ip_xmit_attr_t.
1865  */
1866 static void
1867 iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
1868     ixa_notify_arg_t narg)
1869 {
1870 	iptun_t		*iptun = (iptun_t *)arg;
1871 
1872 	switch (ntype) {
1873 	case IXAN_PMTU:
1874 		(void) iptun_update_mtu(iptun, ixa, narg);
1875 		break;
1876 	}
1877 }
1878 
1879 /*
1880  * Returns the max of old_ovhd and the overhead associated with pol.
1881  */
1882 static uint32_t
1883 iptun_max_policy_overhead(ipsec_policy_t *pol, uint32_t old_ovhd)
1884 {
1885 	uint32_t new_ovhd = old_ovhd;
1886 
1887 	while (pol != NULL) {
1888 		new_ovhd = max(new_ovhd,
1889 		    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1890 		pol = pol->ipsp_hash.hash_next;
1891 	}
1892 	return (new_ovhd);
1893 }
1894 
1895 static uint32_t
1896 iptun_get_ipsec_overhead(iptun_t *iptun)
1897 {
1898 	ipsec_policy_root_t	*ipr;
1899 	ipsec_policy_head_t	*iph;
1900 	ipsec_policy_t		*pol;
1901 	ipsec_selector_t	sel;
1902 	int			i;
1903 	uint32_t		ipsec_ovhd = 0;
1904 	ipsec_tun_pol_t		*itp = iptun->iptun_itp;
1905 	netstack_t		*ns = iptun->iptun_ns;
1906 
1907 	if (itp == NULL || !(itp->itp_flags & ITPF_P_ACTIVE)) {
1908 		/*
1909 		 * Consult global policy, just in case.  This will only work
1910 		 * if we have both source and destination addresses to work
1911 		 * with.
1912 		 */
1913 		if ((iptun->iptun_flags & (IPTUN_LADDR|IPTUN_RADDR)) !=
1914 		    (IPTUN_LADDR|IPTUN_RADDR))
1915 			return (0);
1916 
1917 		iph = ipsec_system_policy(ns);
1918 		bzero(&sel, sizeof (sel));
1919 		sel.ips_isv4 =
1920 		    (iptun->iptun_typeinfo->iti_ipvers == IPV4_VERSION);
1921 		switch (iptun->iptun_typeinfo->iti_ipvers) {
1922 		case IPV4_VERSION:
1923 			sel.ips_local_addr_v4 = iptun->iptun_laddr4;
1924 			sel.ips_remote_addr_v4 = iptun->iptun_raddr4;
1925 			break;
1926 		case IPV6_VERSION:
1927 			sel.ips_local_addr_v6 = iptun->iptun_laddr6;
1928 			sel.ips_remote_addr_v6 = iptun->iptun_raddr6;
1929 			break;
1930 		}
1931 		/* Check for both IPv4 and IPv6. */
1932 		sel.ips_protocol = IPPROTO_ENCAP;
1933 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1934 		    &sel);
1935 		if (pol != NULL) {
1936 			ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act);
1937 			IPPOL_REFRELE(pol);
1938 		}
1939 		sel.ips_protocol = IPPROTO_IPV6;
1940 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1941 		    &sel);
1942 		if (pol != NULL) {
1943 			ipsec_ovhd = max(ipsec_ovhd,
1944 			    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1945 			IPPOL_REFRELE(pol);
1946 		}
1947 		IPPH_REFRELE(iph, ns);
1948 	} else {
1949 		/*
1950 		 * Look through all of the possible IPsec actions for the
1951 		 * tunnel, and find the largest potential IPsec overhead.
1952 		 */
1953 		iph = itp->itp_policy;
1954 		rw_enter(&iph->iph_lock, RW_READER);
1955 		ipr = &(iph->iph_root[IPSEC_TYPE_OUTBOUND]);
1956 		ipsec_ovhd = iptun_max_policy_overhead(
1957 		    ipr->ipr_nonhash[IPSEC_AF_V4], 0);
1958 		ipsec_ovhd = iptun_max_policy_overhead(
1959 		    ipr->ipr_nonhash[IPSEC_AF_V6], ipsec_ovhd);
1960 		for (i = 0; i < ipr->ipr_nchains; i++) {
1961 			ipsec_ovhd = iptun_max_policy_overhead(
1962 			    ipr->ipr_hash[i].hash_head, ipsec_ovhd);
1963 		}
1964 		rw_exit(&iph->iph_lock);
1965 	}
1966 
1967 	return (ipsec_ovhd);
1968 }
1969 
1970 /*
1971  * Calculate and return the maximum possible upper MTU for the given tunnel.
1972  *
1973  * If new_pmtu is set then we also need to update the lower path MTU information
1974  * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that
1975  * we are notified by conn_ip_output() when the path MTU increases.
1976  */
1977 static uint32_t
1978 iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
1979 {
1980 	size_t		header_size, ipsec_overhead;
1981 	uint32_t	maxmtu, pmtu;
1982 
1983 	/*
1984 	 * Start with the path-MTU to the remote address, which is either
1985 	 * provided as the new_pmtu argument, or obtained using
1986 	 * iptun_get_dst_pmtu().
1987 	 */
1988 	if (new_pmtu != 0) {
1989 		if (iptun->iptun_flags & IPTUN_RADDR)
1990 			iptun->iptun_dpmtu = new_pmtu;
1991 		pmtu = new_pmtu;
1992 	} else if (iptun->iptun_flags & IPTUN_RADDR) {
1993 		if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) {
1994 			/*
1995 			 * We weren't able to obtain the path-MTU of the
1996 			 * destination.  Use the previous value.
1997 			 */
1998 			pmtu = iptun->iptun_dpmtu;
1999 		} else {
2000 			iptun->iptun_dpmtu = pmtu;
2001 		}
2002 	} else {
2003 		/*
2004 		 * We have no path-MTU information to go on, use the maximum
2005 		 * possible value.
2006 		 */
2007 		pmtu = iptun->iptun_typeinfo->iti_maxmtu;
2008 	}
2009 
2010 	/*
2011 	 * Now calculate tunneling overhead and subtract that from the
2012 	 * path-MTU information obtained above.
2013 	 */
2014 	if (iptun->iptun_header_size != 0) {
2015 		header_size = iptun->iptun_header_size;
2016 	} else {
2017 		switch (iptun->iptun_typeinfo->iti_ipvers) {
2018 		case IPV4_VERSION:
2019 			header_size = sizeof (ipha_t);
2020 			if (is_system_labeled())
2021 				header_size += IP_MAX_OPT_LENGTH;
2022 			break;
2023 		case IPV6_VERSION:
2024 			header_size = sizeof (iptun_ipv6hdrs_t);
2025 			break;
2026 		}
2027 	}
2028 
2029 	ipsec_overhead = iptun_get_ipsec_overhead(iptun);
2030 
2031 	maxmtu = pmtu - (header_size + ipsec_overhead);
2032 	return (max(maxmtu, iptun->iptun_typeinfo->iti_minmtu));
2033 }
2034 
2035 /*
2036  * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer
2037  * of any change in MTU.  The new_pmtu argument is the new lower path MTU to
2038  * the tunnel destination to be used in the tunnel MTU calculation.  Passing
2039  * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using
2040  * ip_get_pmtu().
2041  *
2042  * If the calculated tunnel MTU is different than its previous value, then we
2043  * notify the MAC layer above us of this change using mac_maxsdu_update().
2044  */
2045 static uint32_t
2046 iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
2047 {
2048 	uint32_t newmtu;
2049 
2050 	/* We always update the ixa since we might have set IXAF_VERIFY_PMTU */
2051 	iptun_update_dst_pmtu(iptun, ixa);
2052 
2053 	/*
2054 	 * We return the current MTU without updating it if it was pegged to a
2055 	 * static value using the MAC_PROP_MTU link property.
2056 	 */
2057 	if (iptun->iptun_flags & IPTUN_FIXED_MTU)
2058 		return (iptun->iptun_mtu);
2059 
2060 	/* If the MTU isn't fixed, then use the maximum possible value. */
2061 	newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu);
2062 	/*
2063 	 * We only dynamically adjust the tunnel MTU for tunnels with
2064 	 * destinations because dynamic MTU calculations are based on the
2065 	 * destination path-MTU.
2066 	 */
2067 	if ((iptun->iptun_flags & IPTUN_RADDR) && newmtu != iptun->iptun_mtu) {
2068 		iptun->iptun_mtu = newmtu;
2069 		if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
2070 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
2071 	}
2072 
2073 	return (newmtu);
2074 }
2075 
2076 /*
2077  * Frees a packet or packet chain and bumps stat for each freed packet.
2078  */
2079 static void
2080 iptun_drop_pkt(mblk_t *mp, uint64_t *stat)
2081 {
2082 	mblk_t *pktmp;
2083 
2084 	for (pktmp = mp; pktmp != NULL; pktmp = mp) {
2085 		mp = mp->b_next;
2086 		pktmp->b_next = NULL;
2087 		if (stat != NULL)
2088 			atomic_inc_64(stat);
2089 		freemsg(pktmp);
2090 	}
2091 }
2092 
2093 /*
2094  * Allocate and return a new mblk to hold an IP and ICMP header, and chain the
2095  * original packet to its b_cont.  Returns NULL on failure.
2096  */
2097 static mblk_t *
2098 iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
2099 {
2100 	mblk_t *icmperr_mp;
2101 
2102 	if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) {
2103 		icmperr_mp->b_wptr += hdrs_size;
2104 		/* tack on the offending packet */
2105 		icmperr_mp->b_cont = orig_pkt;
2106 	}
2107 	return (icmperr_mp);
2108 }
2109 
2110 /*
2111  * Transmit an ICMP error.  mp->b_rptr points at the packet to be included in
2112  * the ICMP error.
2113  */
2114 static void
2115 iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp,
2116     ts_label_t *tsl)
2117 {
2118 	size_t	orig_pktsize, hdrs_size;
2119 	mblk_t	*icmperr_mp;
2120 	ipha_t	*new_ipha;
2121 	icmph_t	*new_icmp;
2122 	ip_xmit_attr_t	ixas;
2123 	conn_t	*connp = iptun->iptun_connp;
2124 
2125 	orig_pktsize = msgdsize(mp);
2126 	hdrs_size = sizeof (ipha_t) + sizeof (icmph_t);
2127 	if ((icmperr_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2128 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2129 		return;
2130 	}
2131 
2132 	new_ipha = (ipha_t *)icmperr_mp->b_rptr;
2133 	new_icmp = (icmph_t *)(new_ipha + 1);
2134 
2135 	new_ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2136 	new_ipha->ipha_type_of_service = 0;
2137 	new_ipha->ipha_ident = 0;
2138 	new_ipha->ipha_fragment_offset_and_flags = 0;
2139 	new_ipha->ipha_ttl = orig_ipha->ipha_ttl;
2140 	new_ipha->ipha_protocol = IPPROTO_ICMP;
2141 	new_ipha->ipha_src = orig_ipha->ipha_dst;
2142 	new_ipha->ipha_dst = orig_ipha->ipha_src;
2143 	new_ipha->ipha_hdr_checksum = 0; /* will be computed by ip */
2144 	new_ipha->ipha_length = htons(hdrs_size + orig_pktsize);
2145 
2146 	*new_icmp = *icmp;
2147 	new_icmp->icmph_checksum = 0;
2148 	new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0);
2149 
2150 	bzero(&ixas, sizeof (ixas));
2151 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2152 	if (new_ipha->ipha_src == INADDR_ANY)
2153 		ixas.ixa_flags |= IXAF_SET_SOURCE;
2154 
2155 	ixas.ixa_zoneid = IPCL_ZONEID(connp);
2156 	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2157 	ixas.ixa_cred = connp->conn_cred;
2158 	ixas.ixa_cpid = NOPID;
2159 	if (is_system_labeled())
2160 		ixas.ixa_tsl = tsl;
2161 
2162 	ixas.ixa_ifindex = 0;
2163 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2164 
2165 	(void) ip_output_simple(icmperr_mp, &ixas);
2166 	ixa_cleanup(&ixas);
2167 }
2168 
2169 static void
2170 iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp,
2171     ts_label_t *tsl)
2172 {
2173 	size_t	orig_pktsize, hdrs_size;
2174 	mblk_t	*icmp6err_mp;
2175 	ip6_t	*new_ip6h;
2176 	icmp6_t	*new_icmp6;
2177 	ip_xmit_attr_t	ixas;
2178 	conn_t	*connp = iptun->iptun_connp;
2179 
2180 	orig_pktsize = msgdsize(mp);
2181 	hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t);
2182 	if ((icmp6err_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2183 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2184 		return;
2185 	}
2186 
2187 	new_ip6h = (ip6_t *)icmp6err_mp->b_rptr;
2188 	new_icmp6 = (icmp6_t *)(new_ip6h + 1);
2189 
2190 	new_ip6h->ip6_vcf = orig_ip6h->ip6_vcf;
2191 	new_ip6h->ip6_plen = htons(sizeof (icmp6_t) + orig_pktsize);
2192 	new_ip6h->ip6_hops = orig_ip6h->ip6_hops;
2193 	new_ip6h->ip6_nxt = IPPROTO_ICMPV6;
2194 	new_ip6h->ip6_src = orig_ip6h->ip6_dst;
2195 	new_ip6h->ip6_dst = orig_ip6h->ip6_src;
2196 
2197 	*new_icmp6 = *icmp6;
2198 	/* The checksum is calculated in ip_output_simple and friends. */
2199 	new_icmp6->icmp6_cksum = new_ip6h->ip6_plen;
2200 
2201 	bzero(&ixas, sizeof (ixas));
2202 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
2203 	if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src))
2204 		ixas.ixa_flags |= IXAF_SET_SOURCE;
2205 
2206 	ixas.ixa_zoneid = IPCL_ZONEID(connp);
2207 	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2208 	ixas.ixa_cred = connp->conn_cred;
2209 	ixas.ixa_cpid = NOPID;
2210 	if (is_system_labeled())
2211 		ixas.ixa_tsl = tsl;
2212 
2213 	ixas.ixa_ifindex = 0;
2214 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2215 
2216 	(void) ip_output_simple(icmp6err_mp, &ixas);
2217 	ixa_cleanup(&ixas);
2218 }
2219 
2220 static void
2221 iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
2222     uint8_t type, uint8_t code, ts_label_t *tsl)
2223 {
2224 	icmph_t icmp;
2225 
2226 	bzero(&icmp, sizeof (icmp));
2227 	icmp.icmph_type = type;
2228 	icmp.icmph_code = code;
2229 
2230 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2231 }
2232 
2233 static void
2234 iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
2235     mblk_t *mp, ts_label_t *tsl)
2236 {
2237 	icmph_t	icmp;
2238 
2239 	icmp.icmph_type = ICMP_DEST_UNREACHABLE;
2240 	icmp.icmph_code = ICMP_FRAGMENTATION_NEEDED;
2241 	icmp.icmph_du_zero = 0;
2242 	icmp.icmph_du_mtu = htons(newmtu);
2243 
2244 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2245 }
2246 
2247 static void
2248 iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
2249     uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl)
2250 {
2251 	icmp6_t icmp6;
2252 
2253 	bzero(&icmp6, sizeof (icmp6));
2254 	icmp6.icmp6_type = type;
2255 	icmp6.icmp6_code = code;
2256 	if (type == ICMP6_PARAM_PROB)
2257 		icmp6.icmp6_pptr = htonl(offset);
2258 
2259 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2260 }
2261 
2262 static void
2263 iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
2264     mblk_t *mp, ts_label_t *tsl)
2265 {
2266 	icmp6_t icmp6;
2267 
2268 	icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
2269 	icmp6.icmp6_code = 0;
2270 	icmp6.icmp6_mtu = htonl(newmtu);
2271 
2272 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2273 }
2274 
2275 /*
2276  * Determines if the packet pointed to by ipha or ip6h is an ICMP error.  The
2277  * mp argument is only used to do bounds checking.
2278  */
2279 static boolean_t
2280 is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2281 {
2282 	uint16_t hlen;
2283 
2284 	if (ipha != NULL) {
2285 		icmph_t	*icmph;
2286 
2287 		ASSERT(ip6h == NULL);
2288 		if (ipha->ipha_protocol != IPPROTO_ICMP)
2289 			return (B_FALSE);
2290 
2291 		hlen = IPH_HDR_LENGTH(ipha);
2292 		icmph = (icmph_t *)((uint8_t *)ipha + hlen);
2293 		return (ICMP_IS_ERROR(icmph->icmph_type) ||
2294 		    icmph->icmph_type == ICMP_REDIRECT);
2295 	} else {
2296 		icmp6_t	*icmp6;
2297 		uint8_t	*nexthdrp;
2298 
2299 		ASSERT(ip6h != NULL);
2300 		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hlen, &nexthdrp) ||
2301 		    *nexthdrp != IPPROTO_ICMPV6) {
2302 			return (B_FALSE);
2303 		}
2304 
2305 		icmp6 = (icmp6_t *)((uint8_t *)ip6h + hlen);
2306 		return (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
2307 		    icmp6->icmp6_type == ND_REDIRECT);
2308 	}
2309 }
2310 
2311 /*
2312  * Find inner and outer IP headers from a tunneled packet as setup for calls
2313  * into ipsec_tun_{in,out}bound().
2314  * Note that we need to allow the outer header to be in a separate mblk from
2315  * the inner header.
2316  * If the caller knows the outer_hlen, the caller passes it in. Otherwise zero.
2317  */
2318 static size_t
2319 iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4,
2320     ipha_t **inner4, ip6_t **outer6, ip6_t **inner6)
2321 {
2322 	ipha_t	*ipha;
2323 	size_t	first_mblkl = MBLKL(mp);
2324 	mblk_t	*inner_mp;
2325 
2326 	/*
2327 	 * Don't bother handling packets that don't have a full IP header in
2328 	 * the fist mblk.  For the input path, the ip module ensures that this
2329 	 * won't happen, and on the output path, the IP tunneling MAC-type
2330 	 * plugins ensure that this also won't happen.
2331 	 */
2332 	if (first_mblkl < sizeof (ipha_t))
2333 		return (0);
2334 	ipha = (ipha_t *)(mp->b_rptr);
2335 	switch (IPH_HDR_VERSION(ipha)) {
2336 	case IPV4_VERSION:
2337 		*outer4 = ipha;
2338 		*outer6 = NULL;
2339 		if (outer_hlen == 0)
2340 			outer_hlen = IPH_HDR_LENGTH(ipha);
2341 		break;
2342 	case IPV6_VERSION:
2343 		*outer4 = NULL;
2344 		*outer6 = (ip6_t *)ipha;
2345 		if (outer_hlen == 0)
2346 			outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
2347 		break;
2348 	default:
2349 		return (0);
2350 	}
2351 
2352 	if (first_mblkl < outer_hlen ||
2353 	    (first_mblkl == outer_hlen && mp->b_cont == NULL))
2354 		return (0);
2355 
2356 	/*
2357 	 * We don't bother doing a pullup here since the outer header will
2358 	 * just get stripped off soon on input anyway.  We just want to ensure
2359 	 * that the inner* pointer points to a full header.
2360 	 */
2361 	if (first_mblkl == outer_hlen) {
2362 		inner_mp = mp->b_cont;
2363 		ipha = (ipha_t *)inner_mp->b_rptr;
2364 	} else {
2365 		inner_mp = mp;
2366 		ipha = (ipha_t *)(mp->b_rptr + outer_hlen);
2367 	}
2368 	switch (IPH_HDR_VERSION(ipha)) {
2369 	case IPV4_VERSION:
2370 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ipha_t))
2371 			return (0);
2372 		*inner4 = ipha;
2373 		*inner6 = NULL;
2374 		break;
2375 	case IPV6_VERSION:
2376 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ip6_t))
2377 			return (0);
2378 		*inner4 = NULL;
2379 		*inner6 = (ip6_t *)ipha;
2380 		break;
2381 	default:
2382 		return (0);
2383 	}
2384 
2385 	return (outer_hlen);
2386 }
2387 
2388 /*
2389  * Received ICMP error in response to an X over IPv4 packet that we
2390  * transmitted.
2391  *
2392  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2393  * the following:
2394  *
2395  * [IPv4(0)][ICMPv4][IPv4(1)][IPv4(2)][ULP]
2396  *
2397  *	or
2398  *
2399  * [IPv4(0)][ICMPv4][IPv4(1)][IPv6][ULP]
2400  *
2401  * And "outer4" will get set to IPv4(1), and inner[46] will correspond to
2402  * whatever the very-inner packet is (IPv4(2) or IPv6).
2403  */
2404 static void
2405 iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph,
2406     ip_recv_attr_t *ira)
2407 {
2408 	uint8_t	*orig;
2409 	ipha_t	*outer4, *inner4;
2410 	ip6_t	*outer6, *inner6;
2411 	int	outer_hlen;
2412 	uint8_t	type, code;
2413 
2414 	ASSERT(data_mp->b_cont == NULL);
2415 	/*
2416 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2417 	 * find headers in the ICMP packet payload.
2418 	 */
2419 	orig = data_mp->b_rptr;
2420 	data_mp->b_rptr = (uint8_t *)(icmph + 1);
2421 	/*
2422 	 * The ip module ensures that ICMP errors contain at least the
2423 	 * original IP header (otherwise, the error would never have made it
2424 	 * here).
2425 	 */
2426 	ASSERT(MBLKL(data_mp) >= 0);
2427 	outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2428 	    &inner6);
2429 	ASSERT(outer6 == NULL);
2430 	data_mp->b_rptr = orig;
2431 	if (outer_hlen == 0) {
2432 		iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2433 		return;
2434 	}
2435 
2436 	/* Only ICMP errors due to tunneled packets should reach here. */
2437 	ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP ||
2438 	    outer4->ipha_protocol == IPPROTO_IPV6);
2439 
2440 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2441 	    inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2442 	if (data_mp == NULL) {
2443 		/* Callee did all of the freeing. */
2444 		atomic_inc_64(&iptun->iptun_ierrors);
2445 		return;
2446 	}
2447 	/* We should never see reassembled fragment here. */
2448 	ASSERT(data_mp->b_next == NULL);
2449 
2450 	data_mp->b_rptr = (uint8_t *)outer4 + outer_hlen;
2451 
2452 	/*
2453 	 * If the original packet being transmitted was itself an ICMP error,
2454 	 * then drop this packet.  We don't want to generate an ICMP error in
2455 	 * response to an ICMP error.
2456 	 */
2457 	if (is_icmp_error(data_mp, inner4, inner6)) {
2458 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2459 		return;
2460 	}
2461 
2462 	switch (icmph->icmph_type) {
2463 	case ICMP_DEST_UNREACHABLE:
2464 		type = (inner4 != NULL ? icmph->icmph_type : ICMP6_DST_UNREACH);
2465 		switch (icmph->icmph_code) {
2466 		case ICMP_FRAGMENTATION_NEEDED: {
2467 			uint32_t newmtu;
2468 
2469 			/*
2470 			 * We reconcile this with the fact that the tunnel may
2471 			 * also have IPsec policy by letting iptun_update_mtu
2472 			 * take care of it.
2473 			 */
2474 			newmtu = iptun_update_mtu(iptun, NULL,
2475 			    ntohs(icmph->icmph_du_mtu));
2476 
2477 			if (inner4 != NULL) {
2478 				iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2479 				    data_mp, ira->ira_tsl);
2480 			} else {
2481 				iptun_icmp_toobig_v6(iptun, newmtu, inner6,
2482 				    data_mp, ira->ira_tsl);
2483 			}
2484 			return;
2485 		}
2486 		case ICMP_DEST_NET_UNREACH_ADMIN:
2487 		case ICMP_DEST_HOST_UNREACH_ADMIN:
2488 			code = (inner4 != NULL ? ICMP_DEST_NET_UNREACH_ADMIN :
2489 			    ICMP6_DST_UNREACH_ADMIN);
2490 			break;
2491 		default:
2492 			code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2493 			    ICMP6_DST_UNREACH_ADDR);
2494 			break;
2495 		}
2496 		break;
2497 	case ICMP_TIME_EXCEEDED:
2498 		if (inner6 != NULL) {
2499 			type = ICMP6_TIME_EXCEEDED;
2500 			code = 0;
2501 		} /* else we're already set. */
2502 		break;
2503 	case ICMP_PARAM_PROBLEM:
2504 		/*
2505 		 * This is a problem with the outer header we transmitted.
2506 		 * Treat this as an output error.
2507 		 */
2508 		iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2509 		return;
2510 	default:
2511 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2512 		return;
2513 	}
2514 
2515 	if (inner4 != NULL) {
2516 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2517 		    ira->ira_tsl);
2518 	} else {
2519 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2520 		    ira->ira_tsl);
2521 	}
2522 }
2523 
2524 /*
2525  * Return B_TRUE if the IPv6 packet pointed to by ip6h contains a Tunnel
2526  * Encapsulation Limit destination option.  If there is one, set encaplim_ptr
2527  * to point to the option value.
2528  */
2529 static boolean_t
2530 iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
2531 {
2532 	ip_pkt_t	pkt;
2533 	uint8_t		*endptr;
2534 	ip6_dest_t	*destp;
2535 	struct ip6_opt	*optp;
2536 
2537 	pkt.ipp_fields = 0; /* must be initialized */
2538 	(void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL);
2539 	if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) {
2540 		destp = pkt.ipp_dstopts;
2541 	} else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) {
2542 		destp = pkt.ipp_rthdrdstopts;
2543 	} else {
2544 		return (B_FALSE);
2545 	}
2546 
2547 	endptr = (uint8_t *)destp + 8 * (destp->ip6d_len + 1);
2548 	optp = (struct ip6_opt *)(destp + 1);
2549 	while (endptr - (uint8_t *)optp > sizeof (*optp)) {
2550 		if (optp->ip6o_type == IP6OPT_TUNNEL_LIMIT) {
2551 			if ((uint8_t *)(optp + 1) >= endptr)
2552 				return (B_FALSE);
2553 			*encaplim_ptr = (uint8_t *)&optp[1];
2554 			return (B_TRUE);
2555 		}
2556 		optp = (struct ip6_opt *)((uint8_t *)optp + optp->ip6o_len + 2);
2557 	}
2558 	return (B_FALSE);
2559 }
2560 
2561 /*
2562  * Received ICMPv6 error in response to an X over IPv6 packet that we
2563  * transmitted.
2564  *
2565  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2566  * the following:
2567  *
2568  * [IPv6(0)][ICMPv6][IPv6(1)][IPv4][ULP]
2569  *
2570  *	or
2571  *
2572  * [IPv6(0)][ICMPv6][IPv6(1)][IPv6(2)][ULP]
2573  *
2574  * And "outer6" will get set to IPv6(1), and inner[46] will correspond to
2575  * whatever the very-inner packet is (IPv4 or IPv6(2)).
2576  */
2577 static void
2578 iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h,
2579     ip_recv_attr_t *ira)
2580 {
2581 	uint8_t	*orig;
2582 	ipha_t	*outer4, *inner4;
2583 	ip6_t	*outer6, *inner6;
2584 	int	outer_hlen;
2585 	uint8_t	type, code;
2586 
2587 	ASSERT(data_mp->b_cont == NULL);
2588 
2589 	/*
2590 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2591 	 * find IP headers in the ICMP packet payload.
2592 	 */
2593 	orig = data_mp->b_rptr;
2594 	data_mp->b_rptr = (uint8_t *)(icmp6h + 1);
2595 	/*
2596 	 * The ip module ensures that ICMP errors contain at least the
2597 	 * original IP header (otherwise, the error would never have made it
2598 	 * here).
2599 	 */
2600 	ASSERT(MBLKL(data_mp) >= 0);
2601 	outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2602 	    &inner6);
2603 	ASSERT(outer4 == NULL);
2604 	data_mp->b_rptr = orig;	/* Restore r_ptr */
2605 	if (outer_hlen == 0) {
2606 		iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2607 		return;
2608 	}
2609 
2610 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2611 	    inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2612 	if (data_mp == NULL) {
2613 		/* Callee did all of the freeing. */
2614 		atomic_inc_64(&iptun->iptun_ierrors);
2615 		return;
2616 	}
2617 	/* We should never see reassembled fragment here. */
2618 	ASSERT(data_mp->b_next == NULL);
2619 
2620 	data_mp->b_rptr = (uint8_t *)outer6 + outer_hlen;
2621 
2622 	/*
2623 	 * If the original packet being transmitted was itself an ICMP error,
2624 	 * then drop this packet.  We don't want to generate an ICMP error in
2625 	 * response to an ICMP error.
2626 	 */
2627 	if (is_icmp_error(data_mp, inner4, inner6)) {
2628 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2629 		return;
2630 	}
2631 
2632 	switch (icmp6h->icmp6_type) {
2633 	case ICMP6_PARAM_PROB: {
2634 		uint8_t *encaplim_ptr;
2635 
2636 		/*
2637 		 * If the ICMPv6 error points to a valid Tunnel Encapsulation
2638 		 * Limit option and the limit value is 0, then fall through
2639 		 * and send a host unreachable message.  Otherwise, treat the
2640 		 * error as an output error, as there must have been a problem
2641 		 * with a packet we sent.
2642 		 */
2643 		if (!iptun_find_encaplimit(data_mp, outer6, &encaplim_ptr) ||
2644 		    (icmp6h->icmp6_pptr !=
2645 		    ((ptrdiff_t)encaplim_ptr - (ptrdiff_t)outer6)) ||
2646 		    *encaplim_ptr != 0) {
2647 			iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2648 			return;
2649 		}
2650 		/* FALLTHRU */
2651 	}
2652 	case ICMP6_TIME_EXCEEDED:
2653 	case ICMP6_DST_UNREACH:
2654 		type = (inner4 != NULL ? ICMP_DEST_UNREACHABLE :
2655 		    ICMP6_DST_UNREACH);
2656 		code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2657 		    ICMP6_DST_UNREACH_ADDR);
2658 		break;
2659 	case ICMP6_PACKET_TOO_BIG: {
2660 		uint32_t newmtu;
2661 
2662 		/*
2663 		 * We reconcile this with the fact that the tunnel may also
2664 		 * have IPsec policy by letting iptun_update_mtu take care of
2665 		 * it.
2666 		 */
2667 		newmtu = iptun_update_mtu(iptun, NULL,
2668 		    ntohl(icmp6h->icmp6_mtu));
2669 
2670 		if (inner4 != NULL) {
2671 			iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2672 			    data_mp, ira->ira_tsl);
2673 		} else {
2674 			iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp,
2675 			    ira->ira_tsl);
2676 		}
2677 		return;
2678 	}
2679 	default:
2680 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2681 		return;
2682 	}
2683 
2684 	if (inner4 != NULL) {
2685 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2686 		    ira->ira_tsl);
2687 	} else {
2688 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2689 		    ira->ira_tsl);
2690 	}
2691 }
2692 
2693 /*
2694  * Called as conn_recvicmp from IP for ICMP errors.
2695  */
2696 /* ARGSUSED2 */
2697 static void
2698 iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2699 {
2700 	conn_t		*connp = arg;
2701 	iptun_t		*iptun = connp->conn_iptun;
2702 	mblk_t		*tmpmp;
2703 	size_t		hlen;
2704 
2705 	ASSERT(IPCL_IS_IPTUN(connp));
2706 
2707 	if (mp->b_cont != NULL) {
2708 		/*
2709 		 * Since ICMP error processing necessitates access to bits
2710 		 * that are within the ICMP error payload (the original packet
2711 		 * that caused the error), pull everything up into a single
2712 		 * block for convenience.
2713 		 */
2714 		if ((tmpmp = msgpullup(mp, -1)) == NULL) {
2715 			iptun_drop_pkt(mp, &iptun->iptun_norcvbuf);
2716 			return;
2717 		}
2718 		freemsg(mp);
2719 		mp = tmpmp;
2720 	}
2721 
2722 	hlen = ira->ira_ip_hdr_length;
2723 	switch (iptun->iptun_typeinfo->iti_ipvers) {
2724 	case IPV4_VERSION:
2725 		/*
2726 		 * The outer IP header coming up from IP is always ipha_t
2727 		 * alligned (otherwise, we would have crashed in ip).
2728 		 */
2729 		iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen),
2730 		    ira);
2731 		break;
2732 	case IPV6_VERSION:
2733 		iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen),
2734 		    ira);
2735 		break;
2736 	}
2737 }
2738 
2739 static boolean_t
2740 iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2741 {
2742 	ipaddr_t v4addr;
2743 
2744 	/*
2745 	 * It's possible that someone sent us an IPv4-in-IPv4 packet with the
2746 	 * IPv4 address of a 6to4 tunnel as the destination.
2747 	 */
2748 	if (inner6 == NULL)
2749 		return (B_FALSE);
2750 
2751 	/*
2752 	 * Make sure that the IPv6 destination is within the site that this
2753 	 * 6to4 tunnel is routing for.  We don't want people bouncing random
2754 	 * tunneled IPv6 packets through this 6to4 router.
2755 	 */
2756 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&v4addr);
2757 	if (outer4->ipha_dst != v4addr)
2758 		return (B_FALSE);
2759 
2760 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_src)) {
2761 		/*
2762 		 * Section 9 of RFC 3056 (security considerations) suggests
2763 		 * that when a packet is from a 6to4 site (i.e., it's not a
2764 		 * global address being forwarded froma relay router), make
2765 		 * sure that the packet was tunneled by that site's 6to4
2766 		 * router.
2767 		 */
2768 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2769 		if (outer4->ipha_src != v4addr)
2770 			return (B_FALSE);
2771 	} else {
2772 		/*
2773 		 * Only accept packets from a relay router if we've configured
2774 		 * outbound relay router functionality.
2775 		 */
2776 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2777 			return (B_FALSE);
2778 	}
2779 
2780 	return (B_TRUE);
2781 }
2782 
2783 /*
2784  * Input function for everything that comes up from the ip module below us.
2785  * This is called directly from the ip module via connp->conn_recv().
2786  *
2787  * We receive M_DATA messages with IP-in-IP tunneled packets.
2788  */
2789 /* ARGSUSED2 */
2790 static void
2791 iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira)
2792 {
2793 	conn_t	*connp = arg;
2794 	iptun_t	*iptun = connp->conn_iptun;
2795 	int	outer_hlen;
2796 	ipha_t	*outer4, *inner4;
2797 	ip6_t	*outer6, *inner6;
2798 
2799 	ASSERT(IPCL_IS_IPTUN(connp));
2800 	ASSERT(DB_TYPE(data_mp) == M_DATA);
2801 
2802 	outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length,
2803 	    &outer4, &inner4, &outer6, &inner6);
2804 	if (outer_hlen == 0)
2805 		goto drop;
2806 
2807 	/*
2808 	 * If the system is labeled, we call tsol_check_dest() on the packet
2809 	 * destination (our local tunnel address) to ensure that the packet as
2810 	 * labeled should be allowed to be sent to us.  We don't need to call
2811 	 * the more involved tsol_receive_local() since the tunnel link itself
2812 	 * cannot be assigned to shared-stack non-global zones.
2813 	 */
2814 	if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2815 		if (ira->ira_tsl == NULL)
2816 			goto drop;
2817 		if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ?
2818 		    (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst),
2819 		    (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION),
2820 		    CONN_MAC_DEFAULT, B_FALSE, NULL) != 0)
2821 			goto drop;
2822 	}
2823 
2824 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2825 	    inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns);
2826 	if (data_mp == NULL) {
2827 		/* Callee did all of the freeing. */
2828 		return;
2829 	}
2830 
2831 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
2832 	    !iptun_in_6to4_ok(iptun, outer4, inner6))
2833 		goto drop;
2834 
2835 	/*
2836 	 * We need to statistically account for each packet individually, so
2837 	 * we might as well split up any b_next chains here.
2838 	 */
2839 	do {
2840 		mblk_t	*mp;
2841 
2842 		mp = data_mp->b_next;
2843 		data_mp->b_next = NULL;
2844 
2845 		atomic_inc_64(&iptun->iptun_ipackets);
2846 		atomic_add_64(&iptun->iptun_rbytes, msgdsize(data_mp));
2847 		mac_rx(iptun->iptun_mh, NULL, data_mp);
2848 
2849 		data_mp = mp;
2850 	} while (data_mp != NULL);
2851 	return;
2852 drop:
2853 	iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2854 }
2855 
2856 /*
2857  * Do 6to4-specific header-processing on output.  Return B_TRUE if the packet
2858  * was processed without issue, or B_FALSE if the packet had issues and should
2859  * be dropped.
2860  */
2861 static boolean_t
2862 iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2863 {
2864 	ipaddr_t v4addr;
2865 
2866 	/*
2867 	 * IPv6 source must be a 6to4 address.  This is because a conscious
2868 	 * decision was made to not allow a Solaris system to be used as a
2869 	 * relay router (for security reasons) when 6to4 was initially
2870 	 * integrated.  If this decision is ever reversed, the following check
2871 	 * can be removed.
2872 	 */
2873 	if (!IN6_IS_ADDR_6TO4(&inner6->ip6_src))
2874 		return (B_FALSE);
2875 
2876 	/*
2877 	 * RFC3056 mandates that the IPv4 source MUST be set to the IPv4
2878 	 * portion of the 6to4 IPv6 source address.  In other words, make sure
2879 	 * that we're tunneling packets from our own 6to4 site.
2880 	 */
2881 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2882 	if (outer4->ipha_src != v4addr)
2883 		return (B_FALSE);
2884 
2885 	/*
2886 	 * Automatically set the destination of the outer IPv4 header as
2887 	 * described in RFC3056.  There are two possibilities:
2888 	 *
2889 	 * a. If the IPv6 destination is a 6to4 address, set the IPv4 address
2890 	 *    to the IPv4 portion of the 6to4 address.
2891 	 * b. If the IPv6 destination is a native IPv6 address, set the IPv4
2892 	 *    destination to the address of a relay router.
2893 	 *
2894 	 * Design Note: b shouldn't be necessary here, and this is a flaw in
2895 	 * the design of the 6to4relay command.  Instead of setting a 6to4
2896 	 * relay address in this module via an ioctl, the 6to4relay command
2897 	 * could simply add a IPv6 route for native IPv6 addresses (such as a
2898 	 * default route) in the forwarding table that uses a 6to4 destination
2899 	 * as its next hop, and the IPv4 portion of that address could be a
2900 	 * 6to4 relay address.  In order for this to work, IP would have to
2901 	 * resolve the next hop address, which would necessitate a link-layer
2902 	 * address resolver for 6to4 links, which doesn't exist today.
2903 	 *
2904 	 * In fact, if a resolver existed for 6to4 links, then setting the
2905 	 * IPv4 destination in the outer header could be done as part of
2906 	 * link-layer address resolution and fast-path header generation, and
2907 	 * not here.
2908 	 */
2909 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_dst)) {
2910 		/* destination is a 6to4 router */
2911 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst,
2912 		    (struct in_addr *)&outer4->ipha_dst);
2913 
2914 		/* Reject attempts to send to INADDR_ANY */
2915 		if (outer4->ipha_dst == INADDR_ANY)
2916 			return (B_FALSE);
2917 	} else {
2918 		/*
2919 		 * The destination is a native IPv6 address.  If output to a
2920 		 * relay-router is enabled, use the relay-router's IPv4
2921 		 * address as the destination.
2922 		 */
2923 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2924 			return (B_FALSE);
2925 		outer4->ipha_dst = iptun->iptun_iptuns->iptuns_relay_rtr_addr;
2926 	}
2927 
2928 	/*
2929 	 * If the outer source and destination are equal, this means that the
2930 	 * 6to4 router somehow forwarded an IPv6 packet destined for its own
2931 	 * 6to4 site to its 6to4 tunnel interface, which will result in this
2932 	 * packet infinitely bouncing between ip and iptun.
2933 	 */
2934 	return (outer4->ipha_src != outer4->ipha_dst);
2935 }
2936 
2937 /*
2938  * Process output packets with outer IPv4 headers.  Frees mp and bumps stat on
2939  * error.
2940  */
2941 static mblk_t *
2942 iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
2943     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
2944 {
2945 	uint8_t	*innerptr = (inner4 != NULL ?
2946 	    (uint8_t *)inner4 : (uint8_t *)inner6);
2947 	size_t	minmtu = iptun->iptun_typeinfo->iti_minmtu;
2948 
2949 	if (inner4 != NULL) {
2950 		ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP);
2951 		/*
2952 		 * Copy the tos from the inner IPv4 header. We mask off ECN
2953 		 * bits (bits 6 and 7) because there is currently no
2954 		 * tunnel-tunnel communication to determine if both sides
2955 		 * support ECN.  We opt for the safe choice: don't copy the
2956 		 * ECN bits when doing encapsulation.
2957 		 */
2958 		outer4->ipha_type_of_service =
2959 		    inner4->ipha_type_of_service & ~0x03;
2960 	} else {
2961 		ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 &&
2962 		    inner6 != NULL);
2963 	}
2964 	if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
2965 		outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
2966 	else
2967 		outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;
2968 
2969 	/*
2970 	 * As described in section 3.2.2 of RFC4213, if the packet payload is
2971 	 * less than or equal to the minimum MTU size, then we need to allow
2972 	 * IPv4 to fragment the packet.  The reason is that even if we end up
2973 	 * receiving an ICMP frag-needed, the interface above this tunnel
2974 	 * won't be allowed to drop its MTU as a result, since the packet was
2975 	 * already smaller than the smallest allowable MTU for that interface.
2976 	 */
2977 	if (mp->b_wptr - innerptr <= minmtu) {
2978 		outer4->ipha_fragment_offset_and_flags = 0;
2979 		ixa->ixa_flags &= ~IXAF_DONTFRAG;
2980 	} else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) &&
2981 	    (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) {
2982 		ixa->ixa_flags |= IXAF_DONTFRAG;
2983 	}
2984 
2985 	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4);
2986 	ixa->ixa_pktlen = msgdsize(mp);
2987 	ixa->ixa_protocol = outer4->ipha_protocol;
2988 
2989 	outer4->ipha_length = htons(ixa->ixa_pktlen);
2990 	return (mp);
2991 }
2992 
2993 /*
2994  * Insert an encapsulation limit destination option in the packet provided.
2995  * Always consumes the mp argument and returns a new mblk pointer.
2996  */
2997 static mblk_t *
2998 iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
2999     uint8_t limit)
3000 {
3001 	mblk_t			*newmp;
3002 	iptun_ipv6hdrs_t	*newouter6;
3003 
3004 	ASSERT(outer6->ip6_nxt == IPPROTO_IPV6);
3005 	ASSERT(mp->b_cont == NULL);
3006 
3007 	mp->b_rptr += sizeof (ip6_t);
3008 	newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED);
3009 	if (newmp == NULL) {
3010 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
3011 		return (NULL);
3012 	}
3013 	newmp->b_wptr += sizeof (iptun_ipv6hdrs_t);
3014 	/* Copy the payload (Starting with the inner IPv6 header). */
3015 	bcopy(mp->b_rptr, newmp->b_wptr, MBLKL(mp));
3016 	newmp->b_wptr += MBLKL(mp);
3017 	newouter6 = (iptun_ipv6hdrs_t *)newmp->b_rptr;
3018 	/* Now copy the outer IPv6 header. */
3019 	bcopy(outer6, &newouter6->it6h_ip6h, sizeof (ip6_t));
3020 	newouter6->it6h_ip6h.ip6_nxt = IPPROTO_DSTOPTS;
3021 	newouter6->it6h_encaplim = iptun_encaplim_init;
3022 	newouter6->it6h_encaplim.iel_destopt.ip6d_nxt = outer6->ip6_nxt;
3023 	newouter6->it6h_encaplim.iel_telopt.ip6ot_encap_limit = limit;
3024 
3025 	/*
3026 	 * The payload length will be set at the end of
3027 	 * iptun_out_process_ipv6().
3028 	 */
3029 
3030 	freemsg(mp);
3031 	return (newmp);
3032 }
3033 
3034 /*
3035  * Process output packets with outer IPv6 headers.  Frees mp and bumps stats
3036  * on error.
3037  */
3038 static mblk_t *
3039 iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
3040     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
3041 {
3042 	uint8_t		*innerptr = (inner4 != NULL ?
3043 	    (uint8_t *)inner4 : (uint8_t *)inner6);
3044 	size_t		minmtu = iptun->iptun_typeinfo->iti_minmtu;
3045 	uint8_t		*limit, *configlimit;
3046 	uint32_t	offset;
3047 	iptun_ipv6hdrs_t *v6hdrs;
3048 
3049 	if (inner6 != NULL && iptun_find_encaplimit(mp, inner6, &limit)) {
3050 		/*
3051 		 * The inner packet is an IPv6 packet which itself contains an
3052 		 * encapsulation limit option.  The limit variable points to
3053 		 * the value in the embedded option.  Process the
3054 		 * encapsulation limit option as specified in RFC 2473.
3055 		 *
3056 		 * If limit is 0, then we've exceeded the limit and we need to
3057 		 * send back an ICMPv6 parameter problem message.
3058 		 *
3059 		 * If limit is > 0, then we decrement it by 1 and make sure
3060 		 * that the encapsulation limit option in the outer header
3061 		 * reflects that (adding an option if one isn't already
3062 		 * there).
3063 		 */
3064 		ASSERT(limit > mp->b_rptr && limit < mp->b_wptr);
3065 		if (*limit == 0) {
3066 			mp->b_rptr = (uint8_t *)inner6;
3067 			offset = limit - mp->b_rptr;
3068 			iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB,
3069 			    0, offset, ixa->ixa_tsl);
3070 			atomic_inc_64(&iptun->iptun_noxmtbuf);
3071 			return (NULL);
3072 		}
3073 
3074 		/*
3075 		 * The outer header requires an encapsulation limit option.
3076 		 * If there isn't one already, add one.
3077 		 */
3078 		if (iptun->iptun_encaplimit == 0) {
3079 			if ((mp = iptun_insert_encaplimit(iptun, mp, outer6,
3080 			    (*limit - 1))) == NULL)
3081 				return (NULL);
3082 			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3083 		} else {
3084 			/*
3085 			 * There is an existing encapsulation limit option in
3086 			 * the outer header.  If the inner encapsulation limit
3087 			 * is less than the configured encapsulation limit,
3088 			 * update the outer encapsulation limit to reflect
3089 			 * this lesser value.
3090 			 */
3091 			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3092 			configlimit =
3093 			    &v6hdrs->it6h_encaplim.iel_telopt.ip6ot_encap_limit;
3094 			if ((*limit - 1) < *configlimit)
3095 				*configlimit = (*limit - 1);
3096 		}
3097 		ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t);
3098 		ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt;
3099 	} else {
3100 		ixa->ixa_ip_hdr_length = sizeof (ip6_t);
3101 		ixa->ixa_protocol = outer6->ip6_nxt;
3102 	}
3103 	/*
3104 	 * See iptun_output_process_ipv4() why we allow fragmentation for
3105 	 * small packets
3106 	 */
3107 	if (mp->b_wptr - innerptr <= minmtu)
3108 		ixa->ixa_flags &= ~IXAF_DONTFRAG;
3109 	else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL))
3110 		ixa->ixa_flags |= IXAF_DONTFRAG;
3111 
3112 	ixa->ixa_pktlen = msgdsize(mp);
3113 	outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t));
3114 	return (mp);
3115 }
3116 
3117 /*
3118  * The IP tunneling MAC-type plugins have already done most of the header
3119  * processing and validity checks.  We are simply responsible for multiplexing
3120  * down to the ip module below us.
3121  */
3122 static void
3123 iptun_output(iptun_t *iptun, mblk_t *mp)
3124 {
3125 	conn_t	*connp = iptun->iptun_connp;
3126 	mblk_t	*newmp;
3127 	int	error;
3128 	ip_xmit_attr_t	*ixa;
3129 
3130 	ASSERT(mp->b_datap->db_type == M_DATA);
3131 
3132 	if (mp->b_cont != NULL) {
3133 		if ((newmp = msgpullup(mp, -1)) == NULL) {
3134 			iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
3135 			return;
3136 		}
3137 		freemsg(mp);
3138 		mp = newmp;
3139 	}
3140 
3141 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
3142 		iptun_output_6to4(iptun, mp);
3143 		return;
3144 	}
3145 
3146 	if (is_system_labeled()) {
3147 		/*
3148 		 * Since the label can be different meaning a potentially
3149 		 * different IRE,we always use a unique ip_xmit_attr_t.
3150 		 */
3151 		ixa = conn_get_ixa_exclusive(connp);
3152 	} else {
3153 		/*
3154 		 * If no other thread is using conn_ixa this just gets a
3155 		 * reference to conn_ixa. Otherwise we get a safe copy of
3156 		 * conn_ixa.
3157 		 */
3158 		ixa = conn_get_ixa(connp, B_FALSE);
3159 	}
3160 	if (ixa == NULL) {
3161 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3162 		return;
3163 	}
3164 
3165 	/*
3166 	 * In case we got a safe copy of conn_ixa, then we need
3167 	 * to fill in any pointers in it.
3168 	 */
3169 	if (ixa->ixa_ire == NULL) {
3170 		error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3171 		    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
3172 		    NULL, NULL, 0);
3173 		if (error != 0) {
3174 			if (ixa->ixa_ire != NULL &&
3175 			    (error == EHOSTUNREACH || error == ENETUNREACH)) {
3176 				/*
3177 				 * Let conn_ip_output/ire_send_noroute return
3178 				 * the error and send any local ICMP error.
3179 				 */
3180 				error = 0;
3181 			} else {
3182 				ixa_refrele(ixa);
3183 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3184 				return;
3185 			}
3186 		}
3187 	}
3188 
3189 	iptun_output_common(iptun, ixa, mp);
3190 	ixa_refrele(ixa);
3191 }
3192 
3193 /*
3194  * We use an ixa based on the last destination.
3195  */
3196 static void
3197 iptun_output_6to4(iptun_t *iptun, mblk_t *mp)
3198 {
3199 	conn_t		*connp = iptun->iptun_connp;
3200 	ipha_t		*outer4, *inner4;
3201 	ip6_t		*outer6, *inner6;
3202 	ip_xmit_attr_t	*ixa;
3203 	ip_xmit_attr_t	*oldixa;
3204 	int		error;
3205 	boolean_t	need_connect;
3206 	in6_addr_t	v6dst;
3207 
3208 	ASSERT(mp->b_cont == NULL);	/* Verified by iptun_output */
3209 
3210 	/* Make sure we set ipha_dst before we look at ipha_dst */
3211 
3212 	(void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6);
3213 	ASSERT(outer4 != NULL);
3214 	if (!iptun_out_process_6to4(iptun, outer4, inner6)) {
3215 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3216 		return;
3217 	}
3218 
3219 	if (is_system_labeled()) {
3220 		/*
3221 		 * Since the label can be different meaning a potentially
3222 		 * different IRE,we always use a unique ip_xmit_attr_t.
3223 		 */
3224 		ixa = conn_get_ixa_exclusive(connp);
3225 	} else {
3226 		/*
3227 		 * If no other thread is using conn_ixa this just gets a
3228 		 * reference to conn_ixa. Otherwise we get a safe copy of
3229 		 * conn_ixa.
3230 		 */
3231 		ixa = conn_get_ixa(connp, B_FALSE);
3232 	}
3233 	if (ixa == NULL) {
3234 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3235 		return;
3236 	}
3237 
3238 	mutex_enter(&connp->conn_lock);
3239 	if (connp->conn_v4lastdst == outer4->ipha_dst) {
3240 		need_connect = (ixa->ixa_ire == NULL);
3241 	} else {
3242 		/* In case previous destination was multirt */
3243 		ip_attr_newdst(ixa);
3244 
3245 		/*
3246 		 * We later update conn_ixa when we update conn_v4lastdst
3247 		 * which enables subsequent packets to avoid redoing
3248 		 * ip_attr_connect
3249 		 */
3250 		need_connect = B_TRUE;
3251 	}
3252 	mutex_exit(&connp->conn_lock);
3253 
3254 	/*
3255 	 * In case we got a safe copy of conn_ixa, or otherwise we don't
3256 	 * have a current ixa_ire, then we need to fill in any pointers in
3257 	 * the ixa.
3258 	 */
3259 	if (need_connect) {
3260 		IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst);
3261 
3262 		/* We handle IPsec in iptun_output_common */
3263 		error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3264 		    &v6dst, &v6dst, 0, NULL, NULL, 0);
3265 		if (error != 0) {
3266 			if (ixa->ixa_ire != NULL &&
3267 			    (error == EHOSTUNREACH || error == ENETUNREACH)) {
3268 				/*
3269 				 * Let conn_ip_output/ire_send_noroute return
3270 				 * the error and send any local ICMP error.
3271 				 */
3272 				error = 0;
3273 			} else {
3274 				ixa_refrele(ixa);
3275 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3276 				return;
3277 			}
3278 		}
3279 	}
3280 
3281 	iptun_output_common(iptun, ixa, mp);
3282 
3283 	/* Atomically replace conn_ixa and conn_v4lastdst */
3284 	mutex_enter(&connp->conn_lock);
3285 	if (connp->conn_v4lastdst != outer4->ipha_dst) {
3286 		/* Remember the dst which corresponds to conn_ixa */
3287 		connp->conn_v6lastdst = v6dst;
3288 		oldixa = conn_replace_ixa(connp, ixa);
3289 	} else {
3290 		oldixa = NULL;
3291 	}
3292 	mutex_exit(&connp->conn_lock);
3293 	ixa_refrele(ixa);
3294 	if (oldixa != NULL)
3295 		ixa_refrele(oldixa);
3296 }
3297 
3298 /*
3299  * Check the destination/label. Modifies *mpp by adding/removing CIPSO.
3300  *
3301  * We get the label from the message in order to honor the
3302  * ULPs/IPs choice of label. This will be NULL for forwarded
3303  * packets, neighbor discovery packets and some others.
3304  */
3305 static int
3306 iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa)
3307 {
3308 	cred_t	*cr;
3309 	int	adjust;
3310 	int	iplen;
3311 	int	err;
3312 	ts_label_t *effective_tsl = NULL;
3313 
3314 
3315 	ASSERT(is_system_labeled());
3316 
3317 	cr = msg_getcred(*mpp, NULL);
3318 	if (cr == NULL)
3319 		return (0);
3320 
3321 	/*
3322 	 * We need to start with a label based on the IP/ULP above us
3323 	 */
3324 	ip_xmit_attr_restore_tsl(ixa, cr);
3325 
3326 	/*
3327 	 * Need to update packet with any CIPSO option since
3328 	 * conn_ip_output doesn't do that.
3329 	 */
3330 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3331 		ipha_t *ipha;
3332 
3333 		ipha = (ipha_t *)(*mpp)->b_rptr;
3334 		iplen = ntohs(ipha->ipha_length);
3335 		err = tsol_check_label_v4(ixa->ixa_tsl,
3336 		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3337 		    ixa->ixa_ipst, &effective_tsl);
3338 		if (err != 0)
3339 			return (err);
3340 
3341 		ipha = (ipha_t *)(*mpp)->b_rptr;
3342 		adjust = (int)ntohs(ipha->ipha_length) - iplen;
3343 	} else {
3344 		ip6_t *ip6h;
3345 
3346 		ip6h = (ip6_t *)(*mpp)->b_rptr;
3347 		iplen = ntohs(ip6h->ip6_plen);
3348 
3349 		err = tsol_check_label_v6(ixa->ixa_tsl,
3350 		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3351 		    ixa->ixa_ipst, &effective_tsl);
3352 		if (err != 0)
3353 			return (err);
3354 
3355 		ip6h = (ip6_t *)(*mpp)->b_rptr;
3356 		adjust = (int)ntohs(ip6h->ip6_plen) - iplen;
3357 	}
3358 
3359 	if (effective_tsl != NULL) {
3360 		/* Update the label */
3361 		ip_xmit_attr_replace_tsl(ixa, effective_tsl);
3362 	}
3363 	ixa->ixa_pktlen += adjust;
3364 	ixa->ixa_ip_hdr_length += adjust;
3365 	return (0);
3366 }
3367 
3368 
3369 static void
3370 iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp)
3371 {
3372 	ipsec_tun_pol_t	*itp = iptun->iptun_itp;
3373 	int		outer_hlen;
3374 	mblk_t		*newmp;
3375 	ipha_t		*outer4, *inner4;
3376 	ip6_t		*outer6, *inner6;
3377 	int		error;
3378 	boolean_t	update_pktlen;
3379 
3380 	ASSERT(ixa->ixa_ire != NULL);
3381 
3382 	outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6,
3383 	    &inner6);
3384 	if (outer_hlen == 0) {
3385 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3386 		return;
3387 	}
3388 
3389 	/* Save IXAF_DONTFRAG value */
3390 	iaflags_t dontfrag = ixa->ixa_flags & IXAF_DONTFRAG;
3391 
3392 	/* Perform header processing. */
3393 	if (outer4 != NULL) {
3394 		mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6,
3395 		    ixa);
3396 	} else {
3397 		mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6,
3398 		    ixa);
3399 	}
3400 	if (mp == NULL)
3401 		return;
3402 
3403 	/*
3404 	 * Let's hope the compiler optimizes this with "branch taken".
3405 	 */
3406 	if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
3407 		/* This updates the ip_xmit_attr_t */
3408 		mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
3409 		    outer6, outer_hlen, ixa);
3410 		if (mp == NULL) {
3411 			atomic_inc_64(&iptun->iptun_oerrors);
3412 			return;
3413 		}
3414 		if (is_system_labeled()) {
3415 			/*
3416 			 * Might change the packet by adding/removing CIPSO.
3417 			 * After this caller inner* and outer* and outer_hlen
3418 			 * might be invalid.
3419 			 */
3420 			error = iptun_output_check_label(&mp, ixa);
3421 			if (error != 0) {
3422 				ip2dbg(("label check failed (%d)\n", error));
3423 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3424 				return;
3425 			}
3426 		}
3427 
3428 		/*
3429 		 * ipsec_tun_outbound() returns a chain of tunneled IP
3430 		 * fragments linked with b_next (or a single message if the
3431 		 * tunneled packet wasn't a fragment).
3432 		 * If fragcache returned a list then we need to update
3433 		 * ixa_pktlen for all packets in the list.
3434 		 */
3435 		update_pktlen = (mp->b_next != NULL);
3436 
3437 		/*
3438 		 * Otherwise, we're good to go.  The ixa has been updated with
3439 		 * instructions for outbound IPsec processing.
3440 		 */
3441 		for (newmp = mp; newmp != NULL; newmp = mp) {
3442 			size_t minmtu = iptun->iptun_typeinfo->iti_minmtu;
3443 
3444 			atomic_inc_64(&iptun->iptun_opackets);
3445 			atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3446 			mp = mp->b_next;
3447 			newmp->b_next = NULL;
3448 
3449 			/*
3450 			 * The IXAF_DONTFRAG flag is global, but there is
3451 			 * a chain here.  Check if we're really already
3452 			 * smaller than the minimum allowed MTU and reset here
3453 			 * appropriately.  Otherwise one small packet can kill
3454 			 * the whole chain's path mtu discovery.
3455 			 * In addition, update the pktlen to the length of
3456 			 * the actual packet being processed.
3457 			 */
3458 			if (update_pktlen) {
3459 				ixa->ixa_pktlen = msgdsize(newmp);
3460 				if (ixa->ixa_pktlen <= minmtu)
3461 					ixa->ixa_flags &= ~IXAF_DONTFRAG;
3462 			}
3463 
3464 			atomic_inc_64(&iptun->iptun_opackets);
3465 			atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3466 
3467 			error = conn_ip_output(newmp, ixa);
3468 
3469 			/* Restore IXAF_DONTFRAG value */
3470 			ixa->ixa_flags |= dontfrag;
3471 
3472 			if (error == EMSGSIZE) {
3473 				/* IPsec policy might have changed */
3474 				(void) iptun_update_mtu(iptun, ixa, 0);
3475 			}
3476 		}
3477 	} else {
3478 		/*
3479 		 * The ip module will potentially apply global policy to the
3480 		 * packet in its output path if there's no active tunnel
3481 		 * policy.
3482 		 */
3483 		ASSERT(ixa->ixa_ipsec_policy == NULL);
3484 		mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa);
3485 		if (mp == NULL) {
3486 			atomic_inc_64(&iptun->iptun_oerrors);
3487 			return;
3488 		}
3489 		if (is_system_labeled()) {
3490 			/*
3491 			 * Might change the packet by adding/removing CIPSO.
3492 			 * After this caller inner* and outer* and outer_hlen
3493 			 * might be invalid.
3494 			 */
3495 			error = iptun_output_check_label(&mp, ixa);
3496 			if (error != 0) {
3497 				ip2dbg(("label check failed (%d)\n", error));
3498 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3499 				return;
3500 			}
3501 		}
3502 
3503 		atomic_inc_64(&iptun->iptun_opackets);
3504 		atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3505 
3506 		error = conn_ip_output(mp, ixa);
3507 		if (error == EMSGSIZE) {
3508 			/* IPsec policy might have changed */
3509 			(void) iptun_update_mtu(iptun, ixa, 0);
3510 		}
3511 	}
3512 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE)
3513 		ipsec_out_release_refs(ixa);
3514 }
3515 
3516 static mac_callbacks_t iptun_m_callbacks = {
3517 	.mc_callbacks	= (MC_SETPROP | MC_GETPROP),
3518 	.mc_getstat	= iptun_m_getstat,
3519 	.mc_start	= iptun_m_start,
3520 	.mc_stop	= iptun_m_stop,
3521 	.mc_setpromisc	= iptun_m_setpromisc,
3522 	.mc_multicst	= iptun_m_multicst,
3523 	.mc_unicst	= iptun_m_unicst,
3524 	.mc_tx		= iptun_m_tx,
3525 	.mc_setprop	= iptun_m_setprop,
3526 	.mc_getprop	= iptun_m_getprop
3527 };
3528