xref: /titanic_52/usr/src/uts/common/inet/iptun/iptun.c (revision ba91f08b676cdb873326906656bad68790a01301)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * iptun - IP Tunneling Driver
27  *
28  * This module is a GLDv3 driver that implements virtual datalinks over IP
29  * (a.k.a. IP tunneling).  The datalinks are managed through a dld ioctl
30  * interface (see iptun_ctl.c), and registered with GLDv3 using
31  * mac_register().  It implements the logic for various forms of IP (IPv4 or
32  * IPv6) encapsulation within IP (IPv4 or IPv6) by interacting with the ip
33  * module below it.  Each virtual IP tunnel datalink has a conn_t associated
34  * with it representing the "outer" IP connection.
35  *
36  * The module implements the following locking semantics:
37  *
38  * Lookups and deletions in iptun_hash are synchronized using iptun_hash_lock.
39  * See comments above iptun_hash_lock for details.
40  *
41  * No locks are ever held while calling up to GLDv3.  The general architecture
42  * of GLDv3 requires this, as the mac perimeter (essentially a lock) for a
43  * given link will be held while making downcalls (iptun_m_*() callbacks).
44  * Because we need to hold locks while handling downcalls, holding these locks
45  * while issuing upcalls results in deadlock scenarios.  See the block comment
46  * above iptun_task_cb() for details on how we safely issue upcalls without
47  * holding any locks.
48  *
49  * The contents of each iptun_t are protected by its iptun_lock, which is held
50  * in iptun_enter() (called by iptun_enter_by_linkid()), and exited in
51  * iptun_exit().
52  *
53  * See comments in iptun_delete() and iptun_free() for details on how the
54  * iptun_t is deleted safely.
55  */
56 
57 #include <sys/types.h>
58 #include <sys/kmem.h>
59 #include <sys/errno.h>
60 #include <sys/modhash.h>
61 #include <sys/list.h>
62 #include <sys/strsun.h>
63 #include <sys/file.h>
64 #include <sys/systm.h>
65 #include <sys/tihdr.h>
66 #include <sys/param.h>
67 #include <sys/mac_provider.h>
68 #include <sys/mac_ipv4.h>
69 #include <sys/mac_ipv6.h>
70 #include <sys/mac_6to4.h>
71 #include <sys/tsol/tnet.h>
72 #include <sys/sunldi.h>
73 #include <netinet/in.h>
74 #include <netinet/ip6.h>
75 #include <inet/ip.h>
76 #include <inet/ip_ire.h>
77 #include <inet/ipsec_impl.h>
78 #include <sys/tsol/label.h>
79 #include <sys/tsol/tnet.h>
80 #include <inet/iptun.h>
81 #include "iptun_impl.h"
82 
/*
 * Do the tunnel type and address family match?  IPv4 and 6to4 tunnels take
 * AF_INET addresses, IPv6 tunnels take AF_INET6 addresses.
 *
 * Both arguments are parenthesized in the expansion so that arbitrary
 * expressions (e.g. a conditional expression) can be passed safely.  Note
 * that each argument is still evaluated more than once, so arguments must
 * not have side effects.
 */
#define	IPTUN_ADDR_MATCH(iptun_type, family)				\
	(((iptun_type) == IPTUN_TYPE_IPV4 && (family) == AF_INET) ||	\
	((iptun_type) == IPTUN_TYPE_IPV6 && (family) == AF_INET6) ||	\
	((iptun_type) == IPTUN_TYPE_6TO4 && (family) == AF_INET))

/* Convert an integral key (a link ID) into a mod_hash key. */
#define	IPTUN_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* Bounds on the tunnel link MTU, per outer IP version. */
#define	IPTUN_MIN_IPV4_MTU	576		/* ip.h still uses 68 (!) */
#define	IPTUN_MIN_IPV6_MTU	IPV6_MIN_MTU
#define	IPTUN_MAX_IPV4_MTU	(IP_MAXPACKET - sizeof (ipha_t))
#define	IPTUN_MAX_IPV6_MTU	(IP_MAXPACKET - sizeof (ip6_t) -	\
				    sizeof (iptun_encaplim_t))

/* Valid range for the "hoplimit" link property. */
#define	IPTUN_MIN_HOPLIMIT	1
#define	IPTUN_MAX_HOPLIMIT	UINT8_MAX

/* Valid range for the "encaplimit" link property. */
#define	IPTUN_MIN_ENCAPLIMIT	0
#define	IPTUN_MAX_ENCAPLIMIT	UINT8_MAX

/* The ipsec_req_t preference bits that indicate a policy is being requested. */
#define	IPTUN_IPSEC_REQ_MASK	(IPSEC_PREF_REQUIRED | IPSEC_PREF_NEVER)
104 
105 static iptun_encaplim_t	iptun_encaplim_init = {
106 	{ IPPROTO_NONE, 0 },
107 	IP6OPT_TUNNEL_LIMIT,
108 	1,
109 	IPTUN_DEFAULT_ENCAPLIMIT,	/* filled in with actual value later */
110 	IP6OPT_PADN,
111 	1,
112 	0
113 };
114 
115 /*
116  * Table containing per-iptun-type information.
117  * Since IPv6 can run over all of these we have the IPv6 min as the min MTU.
118  */
119 static iptun_typeinfo_t	iptun_type_table[] = {
120 	{ IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION,
121 	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV4_MTU,	B_TRUE },
122 	{ IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION,
123 	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV6_MTU,	B_TRUE },
124 	{ IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION,
125 	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV4_MTU,	B_FALSE },
126 	{ IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE }
127 };
128 
129 /*
130  * iptun_hash is an iptun_t lookup table by link ID protected by
131  * iptun_hash_lock.  While the hash table's integrity is maintained via
132  * internal locking in the mod_hash_*() functions, we need additional locking
133  * so that an iptun_t cannot be deleted after a hash lookup has returned an
134  * iptun_t and before iptun_lock has been entered.  As such, we use
135  * iptun_hash_lock when doing lookups and removals from iptun_hash.
136  */
137 mod_hash_t	*iptun_hash;
138 static kmutex_t	iptun_hash_lock;
139 
140 static uint_t	iptun_tunnelcount;	/* total for all stacks */
141 kmem_cache_t	*iptun_cache;
142 ddi_taskq_t 	*iptun_taskq;
143 
/*
 * Kinds of deferred upcalls into the mac module.  Each one is carried out by
 * iptun_task_cb() after being queued with iptun_task_dispatch().
 */
typedef enum {
	/* tell mac about new tunnel link MTU */
	IPTUN_TASK_MTU_UPDATE,
	/* tell mac about new local address */
	IPTUN_TASK_LADDR_UPDATE,
	/* tell mac about new remote address */
	IPTUN_TASK_RADDR_UPDATE,
	/* tell mac about new link state */
	IPTUN_TASK_LINK_UPDATE,
	/* tell mac about updated plugin data */
	IPTUN_TASK_PDATA_UPDATE
} iptun_task_t;
151 
152 typedef struct iptun_task_data_s {
153 	iptun_task_t	itd_task;
154 	datalink_id_t	itd_linkid;
155 } iptun_task_data_t;
156 
157 static void iptun_task_dispatch(iptun_t *, iptun_task_t);
158 static int iptun_enter(iptun_t *);
159 static void iptun_exit(iptun_t *);
160 static void iptun_headergen(iptun_t *, boolean_t);
161 static void iptun_drop_pkt(mblk_t *, uint64_t *);
162 static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *);
163 static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *);
164 static void iptun_output(iptun_t *, mblk_t *);
165 static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
166 static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
167 static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
168 static void iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
169 static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *);
170 
171 static void iptun_output_6to4(iptun_t *, mblk_t *);
172 static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *);
173 static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
174     ip_recv_attr_t *);
175 
176 static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
177     ixa_notify_arg_t);
178 
179 static mac_callbacks_t iptun_m_callbacks;
180 
181 static int
182 iptun_m_getstat(void *arg, uint_t stat, uint64_t *val)
183 {
184 	iptun_t	*iptun = arg;
185 	int	err = 0;
186 
187 	switch (stat) {
188 	case MAC_STAT_IERRORS:
189 		*val = iptun->iptun_ierrors;
190 		break;
191 	case MAC_STAT_OERRORS:
192 		*val = iptun->iptun_oerrors;
193 		break;
194 	case MAC_STAT_RBYTES:
195 		*val = iptun->iptun_rbytes;
196 		break;
197 	case MAC_STAT_IPACKETS:
198 		*val = iptun->iptun_ipackets;
199 		break;
200 	case MAC_STAT_OBYTES:
201 		*val = iptun->iptun_obytes;
202 		break;
203 	case MAC_STAT_OPACKETS:
204 		*val = iptun->iptun_opackets;
205 		break;
206 	case MAC_STAT_NORCVBUF:
207 		*val = iptun->iptun_norcvbuf;
208 		break;
209 	case MAC_STAT_NOXMTBUF:
210 		*val = iptun->iptun_noxmtbuf;
211 		break;
212 	default:
213 		err = ENOTSUP;
214 	}
215 
216 	return (err);
217 }
218 
219 static int
220 iptun_m_start(void *arg)
221 {
222 	iptun_t	*iptun = arg;
223 	int	err;
224 
225 	if ((err = iptun_enter(iptun)) == 0) {
226 		iptun->iptun_flags |= IPTUN_MAC_STARTED;
227 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
228 		iptun_exit(iptun);
229 	}
230 	return (err);
231 }
232 
233 static void
234 iptun_m_stop(void *arg)
235 {
236 	iptun_t *iptun = arg;
237 
238 	if (iptun_enter(iptun) == 0) {
239 		iptun->iptun_flags &= ~IPTUN_MAC_STARTED;
240 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
241 		iptun_exit(iptun);
242 	}
243 }
244 
245 /*
246  * iptun_m_setpromisc() does nothing and always succeeds.  This is because a
247  * tunnel data-link only ever receives packets that are destined exclusively
248  * for the local address of the tunnel.
249  */
250 /* ARGSUSED */
251 static int
252 iptun_m_setpromisc(void *arg, boolean_t on)
253 {
254 	return (0);
255 }
256 
257 /* ARGSUSED */
258 static int
259 iptun_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
260 {
261 	return (ENOTSUP);
262 }
263 
264 /*
265  * iptun_m_unicst() sets the local address.
266  */
267 /* ARGSUSED */
268 static int
269 iptun_m_unicst(void *arg, const uint8_t *addrp)
270 {
271 	iptun_t			*iptun = arg;
272 	int			err;
273 	struct sockaddr_storage	ss;
274 	struct sockaddr_in	*sin;
275 	struct sockaddr_in6	*sin6;
276 
277 	if ((err = iptun_enter(iptun)) == 0) {
278 		switch (iptun->iptun_typeinfo->iti_ipvers) {
279 		case IPV4_VERSION:
280 			sin = (struct sockaddr_in *)&ss;
281 			sin->sin_family = AF_INET;
282 			bcopy(addrp, &sin->sin_addr, sizeof (in_addr_t));
283 			break;
284 		case IPV6_VERSION:
285 			sin6 = (struct sockaddr_in6 *)&ss;
286 			sin6->sin6_family = AF_INET6;
287 			bcopy(addrp, &sin6->sin6_addr, sizeof (in6_addr_t));
288 			break;
289 		default:
290 			ASSERT(0);
291 		}
292 		err = iptun_setladdr(iptun, &ss);
293 		iptun_exit(iptun);
294 	}
295 	return (err);
296 }
297 
298 static mblk_t *
299 iptun_m_tx(void *arg, mblk_t *mpchain)
300 {
301 	mblk_t	*mp, *nmp;
302 	iptun_t	*iptun = arg;
303 
304 	if (!IS_IPTUN_RUNNING(iptun)) {
305 		iptun_drop_pkt(mpchain, &iptun->iptun_noxmtbuf);
306 		return (NULL);
307 	}
308 
309 	for (mp = mpchain; mp != NULL; mp = nmp) {
310 		nmp = mp->b_next;
311 		mp->b_next = NULL;
312 		iptun_output(iptun, mp);
313 	}
314 
315 	return (NULL);
316 }
317 
318 /* ARGSUSED */
319 static int
320 iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
321     uint_t pr_valsize, const void *pr_val)
322 {
323 	iptun_t		*iptun = barg;
324 	uint32_t	value = *(uint32_t *)pr_val;
325 	int		err;
326 
327 	/*
328 	 * We need to enter this iptun_t since we'll be modifying the outer
329 	 * header.
330 	 */
331 	if ((err = iptun_enter(iptun)) != 0)
332 		return (err);
333 
334 	switch (pr_num) {
335 	case MAC_PROP_IPTUN_HOPLIMIT:
336 		if (value < IPTUN_MIN_HOPLIMIT || value > IPTUN_MAX_HOPLIMIT) {
337 			err = EINVAL;
338 			break;
339 		}
340 		if (value != iptun->iptun_hoplimit) {
341 			iptun->iptun_hoplimit = (uint8_t)value;
342 			iptun_headergen(iptun, B_TRUE);
343 		}
344 		break;
345 	case MAC_PROP_IPTUN_ENCAPLIMIT:
346 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6 ||
347 		    value > IPTUN_MAX_ENCAPLIMIT) {
348 			err = EINVAL;
349 			break;
350 		}
351 		if (value != iptun->iptun_encaplimit) {
352 			iptun->iptun_encaplimit = (uint8_t)value;
353 			iptun_headergen(iptun, B_TRUE);
354 		}
355 		break;
356 	case MAC_PROP_MTU: {
357 		uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);
358 
359 		if (value < iptun->iptun_typeinfo->iti_minmtu ||
360 		    value > maxmtu) {
361 			err = EINVAL;
362 			break;
363 		}
364 		iptun->iptun_flags |= IPTUN_FIXED_MTU;
365 		if (value != iptun->iptun_mtu) {
366 			iptun->iptun_mtu = value;
367 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
368 		}
369 		break;
370 	}
371 	default:
372 		err = EINVAL;
373 	}
374 	iptun_exit(iptun);
375 	return (err);
376 }
377 
378 /* ARGSUSED */
379 static int
380 iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
381     uint_t pr_valsize, void *pr_val)
382 {
383 	iptun_t			*iptun = barg;
384 	int			err;
385 
386 	if ((err = iptun_enter(iptun)) != 0)
387 		return (err);
388 
389 	switch (pr_num) {
390 	case MAC_PROP_IPTUN_HOPLIMIT:
391 		ASSERT(pr_valsize >= sizeof (uint32_t));
392 		*(uint32_t *)pr_val = iptun->iptun_hoplimit;
393 		break;
394 
395 	case MAC_PROP_IPTUN_ENCAPLIMIT:
396 		*(uint32_t *)pr_val = iptun->iptun_encaplimit;
397 		break;
398 	default:
399 		err = ENOTSUP;
400 	}
401 done:
402 	iptun_exit(iptun);
403 	return (err);
404 }
405 
406 /* ARGSUSED */
407 static void
408 iptun_m_propinfo(void *barg, const char *pr_name, mac_prop_id_t pr_num,
409     mac_prop_info_handle_t prh)
410 {
411 	iptun_t			*iptun = barg;
412 
413 	switch (pr_num) {
414 	case MAC_PROP_IPTUN_HOPLIMIT:
415 		mac_prop_info_set_range_uint32(prh,
416 		    IPTUN_MIN_HOPLIMIT, IPTUN_MAX_HOPLIMIT);
417 		mac_prop_info_set_default_uint32(prh, IPTUN_DEFAULT_HOPLIMIT);
418 		break;
419 
420 	case MAC_PROP_IPTUN_ENCAPLIMIT:
421 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6)
422 			break;
423 		mac_prop_info_set_range_uint32(prh,
424 		    IPTUN_MIN_ENCAPLIMIT, IPTUN_MAX_ENCAPLIMIT);
425 		mac_prop_info_set_default_uint32(prh, IPTUN_DEFAULT_ENCAPLIMIT);
426 		break;
427 	case MAC_PROP_MTU:
428 		mac_prop_info_set_range_uint32(prh,
429 		    iptun->iptun_typeinfo->iti_minmtu,
430 		    iptun_get_maxmtu(iptun, NULL, 0));
431 		break;
432 	}
433 }
434 
435 uint_t
436 iptun_count(void)
437 {
438 	return (iptun_tunnelcount);
439 }
440 
441 /*
442  * Enter an iptun_t exclusively.  This is essentially just a mutex, but we
443  * don't allow iptun_enter() to succeed on a tunnel if it's in the process of
444  * being deleted.
445  */
446 static int
447 iptun_enter(iptun_t *iptun)
448 {
449 	mutex_enter(&iptun->iptun_lock);
450 	while (iptun->iptun_flags & IPTUN_DELETE_PENDING)
451 		cv_wait(&iptun->iptun_enter_cv, &iptun->iptun_lock);
452 	if (iptun->iptun_flags & IPTUN_CONDEMNED) {
453 		mutex_exit(&iptun->iptun_lock);
454 		return (ENOENT);
455 	}
456 	return (0);
457 }
458 
459 /*
460  * Exit the tunnel entered in iptun_enter().
461  */
462 static void
463 iptun_exit(iptun_t *iptun)
464 {
465 	mutex_exit(&iptun->iptun_lock);
466 }
467 
468 /*
469  * Enter the IP tunnel instance by datalink ID.
470  */
471 static int
472 iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun)
473 {
474 	int err;
475 
476 	mutex_enter(&iptun_hash_lock);
477 	if (mod_hash_find(iptun_hash, IPTUN_HASH_KEY(linkid),
478 	    (mod_hash_val_t *)iptun) == 0)
479 		err = iptun_enter(*iptun);
480 	else
481 		err = ENOENT;
482 	if (err != 0)
483 		*iptun = NULL;
484 	mutex_exit(&iptun_hash_lock);
485 	return (err);
486 }
487 
488 /*
489  * Handle tasks that were deferred through the iptun_taskq because they require
490  * calling up to the mac module, and we can't call up to the mac module while
491  * holding locks.
492  *
493  * This is tricky to get right without introducing race conditions and
494  * deadlocks with the mac module, as we cannot issue an upcall while in the
495  * iptun_t.  The reason is that upcalls may try and enter the mac perimeter,
496  * while iptun callbacks (such as iptun_m_setprop()) called from the mac
497  * module will already have the perimeter held, and will then try and enter
498  * the iptun_t.  You can see the lock ordering problem with this; this will
499  * deadlock.
500  *
501  * The safe way to do this is to enter the iptun_t in question and copy the
502  * information we need out of it so that we can exit it and know that the
503  * information being passed up to the upcalls won't be subject to modification
504  * by other threads.  The problem now is that we need to exit it prior to
505  * issuing the upcall, but once we do this, a thread could come along and
506  * delete the iptun_t and thus the mac handle required to issue the upcall.
507  * To prevent this, we set the IPTUN_UPCALL_PENDING flag prior to exiting the
508  * iptun_t.  This flag is the condition associated with iptun_upcall_cv, which
509  * iptun_delete() will cv_wait() on.  When the upcall completes, we clear
510  * IPTUN_UPCALL_PENDING and cv_signal() any potentially waiting
511  * iptun_delete().  We can thus still safely use iptun->iptun_mh after having
512  * exited the iptun_t.
513  */
514 static void
515 iptun_task_cb(void *arg)
516 {
517 	iptun_task_data_t	*itd = arg;
518 	iptun_task_t		task = itd->itd_task;
519 	datalink_id_t		linkid = itd->itd_linkid;
520 	iptun_t			*iptun;
521 	uint32_t		mtu;
522 	iptun_addr_t		addr;
523 	link_state_t		linkstate;
524 	size_t			header_size;
525 	iptun_header_t		header;
526 
527 	kmem_free(itd, sizeof (*itd));
528 
529 	/*
530 	 * Note that if the lookup fails, it's because the tunnel was deleted
531 	 * between the time the task was dispatched and now.  That isn't an
532 	 * error.
533 	 */
534 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
535 		return;
536 
537 	iptun->iptun_flags |= IPTUN_UPCALL_PENDING;
538 
539 	switch (task) {
540 	case IPTUN_TASK_MTU_UPDATE:
541 		mtu = iptun->iptun_mtu;
542 		break;
543 	case IPTUN_TASK_LADDR_UPDATE:
544 		addr = iptun->iptun_laddr;
545 		break;
546 	case IPTUN_TASK_RADDR_UPDATE:
547 		addr = iptun->iptun_raddr;
548 		break;
549 	case IPTUN_TASK_LINK_UPDATE:
550 		linkstate = IS_IPTUN_RUNNING(iptun) ?
551 		    LINK_STATE_UP : LINK_STATE_DOWN;
552 		break;
553 	case IPTUN_TASK_PDATA_UPDATE:
554 		header_size = iptun->iptun_header_size;
555 		header = iptun->iptun_header;
556 		break;
557 	default:
558 		ASSERT(0);
559 	}
560 
561 	iptun_exit(iptun);
562 
563 	switch (task) {
564 	case IPTUN_TASK_MTU_UPDATE:
565 		(void) mac_maxsdu_update(iptun->iptun_mh, mtu);
566 		break;
567 	case IPTUN_TASK_LADDR_UPDATE:
568 		mac_unicst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
569 		break;
570 	case IPTUN_TASK_RADDR_UPDATE:
571 		mac_dst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
572 		break;
573 	case IPTUN_TASK_LINK_UPDATE:
574 		mac_link_update(iptun->iptun_mh, linkstate);
575 		break;
576 	case IPTUN_TASK_PDATA_UPDATE:
577 		if (mac_pdata_update(iptun->iptun_mh,
578 		    header_size == 0 ? NULL : &header, header_size) != 0)
579 			atomic_inc_64(&iptun->iptun_taskq_fail);
580 		break;
581 	}
582 
583 	mutex_enter(&iptun->iptun_lock);
584 	iptun->iptun_flags &= ~IPTUN_UPCALL_PENDING;
585 	cv_signal(&iptun->iptun_upcall_cv);
586 	mutex_exit(&iptun->iptun_lock);
587 }
588 
589 static void
590 iptun_task_dispatch(iptun_t *iptun, iptun_task_t iptun_task)
591 {
592 	iptun_task_data_t *itd;
593 
594 	itd = kmem_alloc(sizeof (*itd), KM_NOSLEEP);
595 	if (itd == NULL) {
596 		atomic_inc_64(&iptun->iptun_taskq_fail);
597 		return;
598 	}
599 	itd->itd_task = iptun_task;
600 	itd->itd_linkid = iptun->iptun_linkid;
601 	if (ddi_taskq_dispatch(iptun_taskq, iptun_task_cb, itd, DDI_NOSLEEP)) {
602 		atomic_inc_64(&iptun->iptun_taskq_fail);
603 		kmem_free(itd, sizeof (*itd));
604 	}
605 }
606 
607 /*
608  * Convert an iptun_addr_t to sockaddr_storage.
609  */
610 static void
611 iptun_getaddr(iptun_addr_t *iptun_addr, struct sockaddr_storage *ss)
612 {
613 	struct sockaddr_in	*sin;
614 	struct sockaddr_in6	*sin6;
615 
616 	bzero(ss, sizeof (*ss));
617 	switch (iptun_addr->ia_family) {
618 	case AF_INET:
619 		sin = (struct sockaddr_in *)ss;
620 		sin->sin_addr.s_addr = iptun_addr->ia_addr.iau_addr4;
621 		break;
622 	case AF_INET6:
623 		sin6 = (struct sockaddr_in6 *)ss;
624 		sin6->sin6_addr = iptun_addr->ia_addr.iau_addr6;
625 		break;
626 	default:
627 		ASSERT(0);
628 	}
629 	ss->ss_family = iptun_addr->ia_family;
630 }
631 
632 /*
633  * General purpose function to set an IP tunnel source or destination address.
634  */
635 static int
636 iptun_setaddr(iptun_type_t iptun_type, iptun_addr_t *iptun_addr,
637     const struct sockaddr_storage *ss)
638 {
639 	if (!IPTUN_ADDR_MATCH(iptun_type, ss->ss_family))
640 		return (EINVAL);
641 
642 	switch (ss->ss_family) {
643 	case AF_INET: {
644 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
645 
646 		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
647 		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
648 		    CLASSD(sin->sin_addr.s_addr)) {
649 			return (EADDRNOTAVAIL);
650 		}
651 		iptun_addr->ia_addr.iau_addr4 = sin->sin_addr.s_addr;
652 		break;
653 	}
654 	case AF_INET6: {
655 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
656 
657 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
658 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) ||
659 		    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
660 			return (EADDRNOTAVAIL);
661 		}
662 		iptun_addr->ia_addr.iau_addr6 = sin6->sin6_addr;
663 		break;
664 	}
665 	default:
666 		return (EAFNOSUPPORT);
667 	}
668 	iptun_addr->ia_family = ss->ss_family;
669 	return (0);
670 }
671 
672 static int
673 iptun_setladdr(iptun_t *iptun, const struct sockaddr_storage *laddr)
674 {
675 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
676 	    &iptun->iptun_laddr, laddr));
677 }
678 
679 static int
680 iptun_setraddr(iptun_t *iptun, const struct sockaddr_storage *raddr)
681 {
682 	if (!(iptun->iptun_typeinfo->iti_hasraddr))
683 		return (EINVAL);
684 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
685 	    &iptun->iptun_raddr, raddr));
686 }
687 
688 static boolean_t
689 iptun_canbind(iptun_t *iptun)
690 {
691 	/*
692 	 * A tunnel may bind when its source address has been set, and if its
693 	 * tunnel type requires one, also its destination address.
694 	 */
695 	return ((iptun->iptun_flags & IPTUN_LADDR) &&
696 	    ((iptun->iptun_flags & IPTUN_RADDR) ||
697 	    !(iptun->iptun_typeinfo->iti_hasraddr)));
698 }
699 
700 /*
701  * Verify that the local address is valid, and insert in the fanout
702  */
703 static int
704 iptun_bind(iptun_t *iptun)
705 {
706 	conn_t			*connp = iptun->iptun_connp;
707 	int			error = 0;
708 	ip_xmit_attr_t		*ixa;
709 	ip_xmit_attr_t		*oldixa;
710 	iulp_t			uinfo;
711 	ip_stack_t		*ipst = connp->conn_netstack->netstack_ip;
712 
713 	/*
714 	 * Get an exclusive ixa for this thread.
715 	 * We defer updating conn_ixa until later to handle any concurrent
716 	 * conn_ixa_cleanup thread.
717 	 */
718 	ixa = conn_get_ixa(connp, B_FALSE);
719 	if (ixa == NULL)
720 		return (ENOMEM);
721 
722 	/* We create PMTU state including for 6to4 */
723 	ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
724 
725 	ASSERT(iptun_canbind(iptun));
726 
727 	mutex_enter(&connp->conn_lock);
728 	/*
729 	 * Note that conn_proto can't be set since the upper protocol
730 	 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
731 	 * ipcl_iptun_classify doesn't use conn_proto.
732 	 */
733 	connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers;
734 
735 	switch (iptun->iptun_typeinfo->iti_type) {
736 	case IPTUN_TYPE_IPV4:
737 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
738 		    &connp->conn_laddr_v6);
739 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4,
740 		    &connp->conn_faddr_v6);
741 		ixa->ixa_flags |= IXAF_IS_IPV4;
742 		if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp),
743 		    ipst, B_FALSE) != IPVL_UNICAST_UP) {
744 			mutex_exit(&connp->conn_lock);
745 			error = EADDRNOTAVAIL;
746 			goto done;
747 		}
748 		break;
749 	case IPTUN_TYPE_IPV6:
750 		connp->conn_laddr_v6 = iptun->iptun_laddr6;
751 		connp->conn_faddr_v6 = iptun->iptun_raddr6;
752 		ixa->ixa_flags &= ~IXAF_IS_IPV4;
753 		/* We use a zero scopeid for now */
754 		if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp),
755 		    ipst, B_FALSE, 0) != IPVL_UNICAST_UP) {
756 			mutex_exit(&connp->conn_lock);
757 			error = EADDRNOTAVAIL;
758 			goto done;
759 		}
760 		break;
761 	case IPTUN_TYPE_6TO4:
762 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
763 		    &connp->conn_laddr_v6);
764 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6);
765 		ixa->ixa_flags |= IXAF_IS_IPV4;
766 		mutex_exit(&connp->conn_lock);
767 
768 		switch (ip_laddr_verify_v4(iptun->iptun_laddr4,
769 		    IPCL_ZONEID(connp), ipst, B_FALSE)) {
770 		case IPVL_UNICAST_UP:
771 		case IPVL_UNICAST_DOWN:
772 			break;
773 		default:
774 			error = EADDRNOTAVAIL;
775 			goto done;
776 		}
777 		goto insert;
778 	}
779 
780 	/* In case previous destination was multirt */
781 	ip_attr_newdst(ixa);
782 
783 	/*
784 	 * When we set a tunnel's destination address, we do not
785 	 * care if the destination is reachable.  Transient routing
786 	 * issues should not inhibit the creation of a tunnel
787 	 * interface, for example. Thus we pass B_FALSE here.
788 	 */
789 	connp->conn_saddr_v6 = connp->conn_laddr_v6;
790 	mutex_exit(&connp->conn_lock);
791 
792 	/* As long as the MTU is large we avoid fragmentation */
793 	ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF;
794 
795 	/* We handle IPsec in iptun_output_common */
796 	error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
797 	    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
798 	    &connp->conn_saddr_v6, &uinfo, 0);
799 
800 	if (error != 0)
801 		goto done;
802 
803 	/* saddr shouldn't change since it was already set */
804 	ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
805 	    &connp->conn_saddr_v6));
806 
807 	/* We set IXAF_VERIFY_PMTU to catch PMTU increases */
808 	ixa->ixa_flags |= IXAF_VERIFY_PMTU;
809 	ASSERT(uinfo.iulp_mtu != 0);
810 
811 	/*
812 	 * Allow setting new policies.
813 	 * The addresses/ports are already set, thus the IPsec policy calls
814 	 * can handle their passed-in conn's.
815 	 */
816 	connp->conn_policy_cached = B_FALSE;
817 
818 insert:
819 	error = ipcl_conn_insert(connp);
820 	if (error != 0)
821 		goto done;
822 
823 	/* Atomically update v6lastdst and conn_ixa */
824 	mutex_enter(&connp->conn_lock);
825 	/* Record this as the "last" send even though we haven't sent any */
826 	connp->conn_v6lastdst = connp->conn_faddr_v6;
827 
828 	iptun->iptun_flags |= IPTUN_BOUND;
829 
830 	oldixa = conn_replace_ixa(connp, ixa);
831 	/* Done with conn_t */
832 	mutex_exit(&connp->conn_lock);
833 	ixa_refrele(oldixa);
834 
835 	/*
836 	 * Now that we're bound with ip below us, this is a good
837 	 * time to initialize the destination path MTU and to
838 	 * re-calculate the tunnel's link MTU.
839 	 */
840 	(void) iptun_update_mtu(iptun, ixa, 0);
841 
842 	if (IS_IPTUN_RUNNING(iptun))
843 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
844 
845 done:
846 	ixa_refrele(ixa);
847 	return (error);
848 }
849 
850 static void
851 iptun_unbind(iptun_t *iptun)
852 {
853 	ASSERT(iptun->iptun_flags & IPTUN_BOUND);
854 	ASSERT(mutex_owned(&iptun->iptun_lock) ||
855 	    (iptun->iptun_flags & IPTUN_CONDEMNED));
856 	ip_unbind(iptun->iptun_connp);
857 	iptun->iptun_flags &= ~IPTUN_BOUND;
858 	if (!(iptun->iptun_flags & IPTUN_CONDEMNED))
859 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
860 }
861 
862 /*
863  * Re-generate the template data-link header for a given IP tunnel given the
864  * tunnel's current parameters.
865  */
866 static void
867 iptun_headergen(iptun_t *iptun, boolean_t update_mac)
868 {
869 	switch (iptun->iptun_typeinfo->iti_ipvers) {
870 	case IPV4_VERSION:
871 		/*
872 		 * We only need to use a custom IP header if the administrator
873 		 * has supplied a non-default hoplimit.
874 		 */
875 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT) {
876 			iptun->iptun_header_size = 0;
877 			break;
878 		}
879 		iptun->iptun_header_size = sizeof (ipha_t);
880 		iptun->iptun_header4.ipha_version_and_hdr_length =
881 		    IP_SIMPLE_HDR_VERSION;
882 		iptun->iptun_header4.ipha_fragment_offset_and_flags =
883 		    htons(IPH_DF);
884 		iptun->iptun_header4.ipha_ttl = iptun->iptun_hoplimit;
885 		break;
886 	case IPV6_VERSION: {
887 		ip6_t	*ip6hp = &iptun->iptun_header6.it6h_ip6h;
888 
889 		/*
890 		 * We only need to use a custom IPv6 header if either the
891 		 * administrator has supplied a non-default hoplimit, or we
892 		 * need to include an encapsulation limit option in the outer
893 		 * header.
894 		 */
895 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT &&
896 		    iptun->iptun_encaplimit == 0) {
897 			iptun->iptun_header_size = 0;
898 			break;
899 		}
900 
901 		(void) memset(ip6hp, 0, sizeof (*ip6hp));
902 		if (iptun->iptun_encaplimit == 0) {
903 			iptun->iptun_header_size = sizeof (ip6_t);
904 			ip6hp->ip6_nxt = IPPROTO_NONE;
905 		} else {
906 			iptun_encaplim_t	*iel;
907 
908 			iptun->iptun_header_size = sizeof (iptun_ipv6hdrs_t);
909 			/*
910 			 * The mac_ipv6 plugin requires ip6_plen to be in host
911 			 * byte order and reflect the extension headers
912 			 * present in the template.  The actual network byte
913 			 * order ip6_plen will be set on a per-packet basis on
914 			 * transmit.
915 			 */
916 			ip6hp->ip6_plen = sizeof (*iel);
917 			ip6hp->ip6_nxt = IPPROTO_DSTOPTS;
918 			iel = &iptun->iptun_header6.it6h_encaplim;
919 			*iel = iptun_encaplim_init;
920 			iel->iel_telopt.ip6ot_encap_limit =
921 			    iptun->iptun_encaplimit;
922 		}
923 
924 		ip6hp->ip6_hlim = iptun->iptun_hoplimit;
925 		break;
926 	}
927 	}
928 
929 	if (update_mac)
930 		iptun_task_dispatch(iptun, IPTUN_TASK_PDATA_UPDATE);
931 }
932 
933 /*
934  * Insert inbound and outbound IPv4 and IPv6 policy into the given policy
935  * head.
936  */
937 static boolean_t
938 iptun_insert_simple_policies(ipsec_policy_head_t *ph, ipsec_act_t *actp,
939     uint_t n, netstack_t *ns)
940 {
941 	int f = IPSEC_AF_V4;
942 
943 	if (!ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) ||
944 	    !ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns))
945 		return (B_FALSE);
946 
947 	f = IPSEC_AF_V6;
948 	return (ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) &&
949 	    ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns));
950 }
951 
/*
 * Used to set IPsec policy when policy is set through the IPTUN_CREATE or
 * IPTUN_MODIFY ioctls.
 *
 * Installs the "simple" policy described by ipsr on the tunnel's
 * ipsec_tun_pol_t (ITP), or clears the tunnel's policy when ipsr requests
 * neither AH nor ESP.  If the tunnel has no ITP yet, one is created and
 * stored in iptun->iptun_itp (unless this is a clear-all request, in which
 * case there is nothing to do).
 *
 * The caller must hold iptun->iptun_lock (asserted below).
 *
 * Returns 0 on success, otherwise:
 *	EINVAL	ipsr asks for self-encapsulation
 *	EBUSY	the ITP has ITPF_P_TUNNEL set
 *	ENOMEM	the action vector or the policy entries could not be set up
 * or whatever dls_mgmt_get_linkinfo(), create_tunnel_policy() (through rc)
 * or ipsec_copy_polhead() reported.
 */
static int
iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr)
{
	int		rc = 0;
	uint_t		nact;
	ipsec_act_t	*actp = NULL;
	boolean_t	clear_all, old_policy = B_FALSE;
	ipsec_tun_pol_t	*itp;
	char		name[MAXLINKNAMELEN];
	uint64_t	gen;
	netstack_t	*ns = iptun->iptun_ns;

	/* Can't specify self-encap on a tunnel. */
	if (ipsr->ipsr_self_encap_req != 0)
		return (EINVAL);

	/*
	 * If it's a "clear-all" entry, unset the security flags and resume
	 * normal cleartext (or inherit-from-global) policy.
	 */
	clear_all = ((ipsr->ipsr_ah_req & IPTUN_IPSEC_REQ_MASK) == 0 &&
	    (ipsr->ipsr_esp_req & IPTUN_IPSEC_REQ_MASK) == 0);

	ASSERT(mutex_owned(&iptun->iptun_lock));
	itp = iptun->iptun_itp;
	if (itp == NULL) {
		/* No ITP and nothing to install: nothing to clear either. */
		if (clear_all)
			goto bail;
		/* The ITP is created under the tunnel's link name. */
		if ((rc = dls_mgmt_get_linkinfo(iptun->iptun_linkid, name, NULL,
		    NULL, NULL)) != 0)
			goto bail;
		ASSERT(name[0] != '\0');
		/* gen is only an output slot here; its value is not used. */
		if ((itp = create_tunnel_policy(name, &rc, &gen, ns)) == NULL)
			goto bail;
		iptun->iptun_itp = itp;
	}

	/* Allocate the actvec now, before holding itp or polhead locks. */
	ipsec_actvec_from_req(ipsr, &actp, &nact, ns);
	if (actp == NULL) {
		rc = ENOMEM;
		goto bail;
	}

	/*
	 * Just write on the active polhead.  Save the primary/secondary stuff
	 * for spdsock operations.
	 *
	 * Mutex because we need to write to the polhead AND flags atomically.
	 * Other threads will acquire the polhead lock as a reader if the
	 * (unprotected) flag is set.
	 */
	mutex_enter(&itp->itp_lock);
	if (itp->itp_flags & ITPF_P_TUNNEL) {
		/* Oops, we lost a race.  Let's get out of here. */
		rc = EBUSY;
		goto mutex_bail;
	}
	/*
	 * Remember whether a policy is currently active; if so it is kept in
	 * the inactive polhead until the new one is known to be installed.
	 */
	old_policy = ((itp->itp_flags & ITPF_P_ACTIVE) != 0);

	if (old_policy) {
		ITPF_CLONE(itp->itp_flags);
		rc = ipsec_copy_polhead(itp->itp_policy, itp->itp_inactive, ns);
		if (rc != 0) {
			/* inactive has already been cleared. */
			itp->itp_flags &= ~ITPF_IFLAGS;
			goto mutex_bail;
		}
		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
		ipsec_polhead_flush(itp->itp_policy, ns);
	} else {
		/* Else assume itp->itp_policy is already flushed. */
		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
	}

	/* From here on itp_policy->iph_lock is held as writer. */
	if (clear_all) {
		ASSERT(avl_numnodes(&itp->itp_policy->iph_rulebyid) == 0);
		itp->itp_flags &= ~ITPF_PFLAGS;
		rw_exit(&itp->itp_policy->iph_lock);
		old_policy = B_FALSE;	/* Clear out the inactive one too. */
		goto recover_bail;
	}

	if (iptun_insert_simple_policies(itp->itp_policy, actp, nact, ns)) {
		rw_exit(&itp->itp_policy->iph_lock);
		/*
		 * Adjust MTU and make sure the DL side knows what's up.
		 */
		itp->itp_flags = ITPF_P_ACTIVE;
		(void) iptun_update_mtu(iptun, NULL, 0);
		old_policy = B_FALSE;	/* Blank out inactive - we succeeded */
	} else {
		rw_exit(&itp->itp_policy->iph_lock);
		rc = ENOMEM;
	}

recover_bail:
	/*
	 * old_policy is still set only if a previous policy existed and the
	 * new one could not be installed.
	 */
	if (old_policy) {
		/* Recover the previous policy into the active polhead. */
		ipsec_swap_policy(itp->itp_policy, itp->itp_inactive, ns);
		ITPF_SWAP(itp->itp_flags);
	}

	/* Clear policy in inactive polhead. */
	itp->itp_flags &= ~ITPF_IFLAGS;
	rw_enter(&itp->itp_inactive->iph_lock, RW_WRITER);
	ipsec_polhead_flush(itp->itp_inactive, ns);
	rw_exit(&itp->itp_inactive->iph_lock);

mutex_bail:
	mutex_exit(&itp->itp_lock);

bail:
	/*
	 * The action vector is freed on every path, including success.
	 * NOTE(review): presumably the inserted policies do not keep pointing
	 * at actp -- confirm against ipsec_polhead_insert().
	 */
	if (actp != NULL)
		ipsec_actvec_free(actp, nact);

	return (rc);
}
1074 
1075 static iptun_typeinfo_t *
1076 iptun_gettypeinfo(iptun_type_t type)
1077 {
1078 	int i;
1079 
1080 	for (i = 0; iptun_type_table[i].iti_type != IPTUN_TYPE_UNKNOWN; i++) {
1081 		if (iptun_type_table[i].iti_type == type)
1082 			break;
1083 	}
1084 	return (&iptun_type_table[i]);
1085 }
1086 
1087 /*
1088  * Set the parameters included in ik on the tunnel iptun.  Parameters that can
1089  * only be set at creation time are set in iptun_create().
1090  */
1091 static int
1092 iptun_setparams(iptun_t *iptun, const iptun_kparams_t *ik)
1093 {
1094 	int		err = 0;
1095 	netstack_t	*ns = iptun->iptun_ns;
1096 	iptun_addr_t	orig_laddr, orig_raddr;
1097 	uint_t		orig_flags = iptun->iptun_flags;
1098 
1099 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR) {
1100 		if (orig_flags & IPTUN_LADDR)
1101 			orig_laddr = iptun->iptun_laddr;
1102 		if ((err = iptun_setladdr(iptun, &ik->iptun_kparam_laddr)) != 0)
1103 			return (err);
1104 		iptun->iptun_flags |= IPTUN_LADDR;
1105 	}
1106 
1107 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) {
1108 		if (orig_flags & IPTUN_RADDR)
1109 			orig_raddr = iptun->iptun_raddr;
1110 		if ((err = iptun_setraddr(iptun, &ik->iptun_kparam_raddr)) != 0)
1111 			goto done;
1112 		iptun->iptun_flags |= IPTUN_RADDR;
1113 	}
1114 
1115 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_SECINFO) {
1116 		/*
1117 		 * Set IPsec policy originating from the ifconfig(1M) command
1118 		 * line.  This is traditionally called "simple" policy because
1119 		 * the ipsec_req_t (iptun_kparam_secinfo) can only describe a
1120 		 * simple policy of "do ESP on everything" and/or "do AH on
1121 		 * everything" (as opposed to the rich policy that can be
1122 		 * defined with ipsecconf(1M)).
1123 		 */
1124 		if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
1125 			/*
1126 			 * Can't set security properties for automatic
1127 			 * tunnels.
1128 			 */
1129 			err = EINVAL;
1130 			goto done;
1131 		}
1132 
1133 		if (!ipsec_loaded(ns->netstack_ipsec)) {
1134 			/* If IPsec can be loaded, try and load it now. */
1135 			if (ipsec_failed(ns->netstack_ipsec)) {
1136 				err = EPROTONOSUPPORT;
1137 				goto done;
1138 			}
1139 			ipsec_loader_loadnow(ns->netstack_ipsec);
1140 			/*
1141 			 * ipsec_loader_loadnow() returns while IPsec is
1142 			 * loaded asynchronously.  While a method exists to
1143 			 * wait for IPsec to load (ipsec_loader_wait()), it
1144 			 * requires use of a STREAMS queue to do a qwait().
1145 			 * We're not in STREAMS context here, and so we can't
1146 			 * use it.  This is not a problem in practice because
1147 			 * in the vast majority of cases, key management and
1148 			 * global policy will have loaded before any tunnels
1149 			 * are plumbed, and so IPsec will already have been
1150 			 * loaded.
1151 			 */
1152 			err = EAGAIN;
1153 			goto done;
1154 		}
1155 
1156 		err = iptun_set_sec_simple(iptun, &ik->iptun_kparam_secinfo);
1157 		if (err == 0) {
1158 			iptun->iptun_flags |= IPTUN_SIMPLE_POLICY;
1159 			iptun->iptun_simple_policy = ik->iptun_kparam_secinfo;
1160 		}
1161 	}
1162 done:
1163 	if (err != 0) {
1164 		/* Restore original source and destination. */
1165 		if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR &&
1166 		    (orig_flags & IPTUN_LADDR))
1167 			iptun->iptun_laddr = orig_laddr;
1168 		if ((ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) &&
1169 		    (orig_flags & IPTUN_RADDR))
1170 			iptun->iptun_raddr = orig_raddr;
1171 		iptun->iptun_flags = orig_flags;
1172 	}
1173 	return (err);
1174 }
1175 
1176 static int
1177 iptun_register(iptun_t *iptun)
1178 {
1179 	mac_register_t	*mac;
1180 	int		err;
1181 
1182 	ASSERT(!(iptun->iptun_flags & IPTUN_MAC_REGISTERED));
1183 
1184 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
1185 		return (EINVAL);
1186 
1187 	mac->m_type_ident = iptun->iptun_typeinfo->iti_ident;
1188 	mac->m_driver = iptun;
1189 	mac->m_dip = iptun_dip;
1190 	mac->m_instance = (uint_t)-1;
1191 	mac->m_src_addr = (uint8_t *)&iptun->iptun_laddr.ia_addr;
1192 	mac->m_dst_addr = iptun->iptun_typeinfo->iti_hasraddr ?
1193 	    (uint8_t *)&iptun->iptun_raddr.ia_addr : NULL;
1194 	mac->m_callbacks = &iptun_m_callbacks;
1195 	mac->m_min_sdu = iptun->iptun_typeinfo->iti_minmtu;
1196 	mac->m_max_sdu = iptun->iptun_mtu;
1197 	if (iptun->iptun_header_size != 0) {
1198 		mac->m_pdata = &iptun->iptun_header;
1199 		mac->m_pdata_size = iptun->iptun_header_size;
1200 	}
1201 	if ((err = mac_register(mac, &iptun->iptun_mh)) == 0)
1202 		iptun->iptun_flags |= IPTUN_MAC_REGISTERED;
1203 	mac_free(mac);
1204 	return (err);
1205 }
1206 
1207 static int
1208 iptun_unregister(iptun_t *iptun)
1209 {
1210 	int err;
1211 
1212 	ASSERT(iptun->iptun_flags & IPTUN_MAC_REGISTERED);
1213 	if ((err = mac_unregister(iptun->iptun_mh)) == 0)
1214 		iptun->iptun_flags &= ~IPTUN_MAC_REGISTERED;
1215 	return (err);
1216 }
1217 
1218 static conn_t *
1219 iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
1220 {
1221 	conn_t *connp;
1222 
1223 	if ((connp = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ns)) == NULL)
1224 		return (NULL);
1225 
1226 	connp->conn_flags |= IPCL_IPTUN;
1227 	connp->conn_iptun = iptun;
1228 	connp->conn_recv = iptun_input;
1229 	connp->conn_recvicmp = iptun_input_icmp;
1230 	connp->conn_verifyicmp = iptun_verifyicmp;
1231 
1232 	/*
1233 	 * Register iptun_notify to listen to capability changes detected by IP.
1234 	 * This upcall is made in the context of the call to conn_ip_output.
1235 	 */
1236 	connp->conn_ixa->ixa_notify = iptun_notify;
1237 	connp->conn_ixa->ixa_notify_cookie = iptun;
1238 
1239 	/*
1240 	 * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done
1241 	 * for all other conn_t's.
1242 	 *
1243 	 * Note that there's an important distinction between iptun_zoneid and
1244 	 * conn_zoneid.  The conn_zoneid is set to GLOBAL_ZONEID in non-global
1245 	 * exclusive stack zones to make the ip module believe that the
1246 	 * non-global zone is actually a global zone.  Therefore, when
1247 	 * interacting with the ip module, we must always use conn_zoneid.
1248 	 */
1249 	connp->conn_zoneid = (ns->netstack_stackid == GLOBAL_NETSTACKID) ?
1250 	    crgetzoneid(credp) : GLOBAL_ZONEID;
1251 	connp->conn_cred = credp;
1252 	/* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */
1253 	crhold(connp->conn_cred);
1254 	connp->conn_cpid = NOPID;
1255 
1256 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1257 	connp->conn_ixa->ixa_zoneid = connp->conn_zoneid;
1258 	ASSERT(connp->conn_ref == 1);
1259 
1260 	/* Cache things in ixa without an extra refhold */
1261 	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1262 	connp->conn_ixa->ixa_cred = connp->conn_cred;
1263 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1264 	if (is_system_labeled())
1265 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1266 
1267 	/*
1268 	 * Have conn_ip_output drop packets should our outer source
1269 	 * go invalid
1270 	 */
1271 	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1272 
1273 	switch (iptun->iptun_typeinfo->iti_ipvers) {
1274 	case IPV4_VERSION:
1275 		connp->conn_family = AF_INET6;
1276 		break;
1277 	case IPV6_VERSION:
1278 		connp->conn_family = AF_INET;
1279 		break;
1280 	}
1281 	mutex_enter(&connp->conn_lock);
1282 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1283 	mutex_exit(&connp->conn_lock);
1284 	return (connp);
1285 }
1286 
/*
 * Tear down a conn_t created by iptun_conn_create().  The conn is quiesced
 * first, then detached from its tunnel, and finally the last reference
 * (asserted to be the only one left) is dropped.
 */
static void
iptun_conn_destroy(conn_t *connp)
{
	ip_quiesce_conn(connp);
	/* The conn no longer points back at the tunnel from here on. */
	connp->conn_iptun = NULL;
	ASSERT(connp->conn_ref == 1);
	CONN_DEC_REF(connp);
}
1295 
1296 static iptun_t *
1297 iptun_alloc(void)
1298 {
1299 	iptun_t *iptun;
1300 
1301 	if ((iptun = kmem_cache_alloc(iptun_cache, KM_NOSLEEP)) != NULL) {
1302 		bzero(iptun, sizeof (*iptun));
1303 		atomic_inc_32(&iptun_tunnelcount);
1304 	}
1305 	return (iptun);
1306 }
1307 
1308 static void
1309 iptun_free(iptun_t *iptun)
1310 {
1311 	ASSERT(iptun->iptun_flags & IPTUN_CONDEMNED);
1312 
1313 	if (iptun->iptun_flags & IPTUN_HASH_INSERTED) {
1314 		iptun_stack_t	*iptuns = iptun->iptun_iptuns;
1315 
1316 		mutex_enter(&iptun_hash_lock);
1317 		VERIFY(mod_hash_remove(iptun_hash,
1318 		    IPTUN_HASH_KEY(iptun->iptun_linkid),
1319 		    (mod_hash_val_t *)&iptun) == 0);
1320 		mutex_exit(&iptun_hash_lock);
1321 		iptun->iptun_flags &= ~IPTUN_HASH_INSERTED;
1322 		mutex_enter(&iptuns->iptuns_lock);
1323 		list_remove(&iptuns->iptuns_iptunlist, iptun);
1324 		mutex_exit(&iptuns->iptuns_lock);
1325 	}
1326 
1327 	if (iptun->iptun_flags & IPTUN_BOUND)
1328 		iptun_unbind(iptun);
1329 
1330 	/*
1331 	 * After iptun_unregister(), there will be no threads executing a
1332 	 * downcall from the mac module, including in the tx datapath.
1333 	 */
1334 	if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
1335 		VERIFY(iptun_unregister(iptun) == 0);
1336 
1337 	if (iptun->iptun_itp != NULL) {
1338 		/*
1339 		 * Remove from the AVL tree, AND release the reference iptun_t
1340 		 * itself holds on the ITP.
1341 		 */
1342 		itp_unlink(iptun->iptun_itp, iptun->iptun_ns);
1343 		ITP_REFRELE(iptun->iptun_itp, iptun->iptun_ns);
1344 		iptun->iptun_itp = NULL;
1345 		iptun->iptun_flags &= ~IPTUN_SIMPLE_POLICY;
1346 	}
1347 
1348 	/*
1349 	 * After ipcl_conn_destroy(), there will be no threads executing an
1350 	 * upcall from ip (i.e., iptun_input()), and it is then safe to free
1351 	 * the iptun_t.
1352 	 */
1353 	if (iptun->iptun_connp != NULL) {
1354 		iptun_conn_destroy(iptun->iptun_connp);
1355 		iptun->iptun_connp = NULL;
1356 	}
1357 
1358 	kmem_cache_free(iptun_cache, iptun);
1359 	atomic_dec_32(&iptun_tunnelcount);
1360 }
1361 
/*
 * Create a new IP tunnel datalink from the parameters in ik on behalf of the
 * caller identified by credp.
 *
 * ik must have IPTUN_KPARAM_TYPE set and name a link id that is not already
 * in use by a tunnel.  On success the tunnel has a conn_t, is registered
 * with the MAC layer, has a devnet node, and is entered in iptun_hash and in
 * its stack instance's tunnel list.
 *
 * Returns 0 on success.  Errors generated here are EINVAL (no or unknown
 * tunnel type, failed zone/link-id check, unexpected mod_hash error), EEXIST
 * (tunnel or hash entry already exists) and ENOMEM; errors from
 * iptun_setparams(), dls_mgmt_get_linkinfo(), iptun_bind(), iptun_register()
 * and dls_devnet_create() are passed through.
 */
int
iptun_create(iptun_kparams_t *ik, cred_t *credp)
{
	iptun_t		*iptun = NULL;
	int		err = 0, mherr;
	char		linkname[MAXLINKNAMELEN];
	ipsec_tun_pol_t	*itp;
	netstack_t	*ns = NULL;
	iptun_stack_t	*iptuns;
	datalink_id_t	tmpid;
	zoneid_t	zoneid = crgetzoneid(credp);
	boolean_t	link_created = B_FALSE;

	/* The tunnel type is mandatory */
	if (!(ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE))
		return (EINVAL);

	/*
	 * Is the linkid that the caller wishes to associate with this new
	 * tunnel assigned to this zone?
	 *
	 * zoneid is passed by reference; the checks below look at its value
	 * after the call.  A failed check is only tolerated for the global
	 * zone, and a successful check is rejected if zoneid is the global
	 * zone.
	 */
	if (zone_check_datalink(&zoneid, ik->iptun_kparam_linkid) != 0) {
		if (zoneid != GLOBAL_ZONEID)
			return (EINVAL);
	} else if (zoneid == GLOBAL_ZONEID) {
		return (EINVAL);
	}

	/*
	 * Make sure that we're not trying to create a tunnel that has already
	 * been created.
	 */
	if (iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun) == 0) {
		iptun_exit(iptun);
		/* Not ours: make sure the cleanup code below leaves it alone. */
		iptun = NULL;
		err = EEXIST;
		goto done;
	}

	/* This returns a held netstack; see the release logic at done. */
	ns = netstack_find_by_cred(credp);
	iptuns = ns->netstack_iptun;

	if ((iptun = iptun_alloc()) == NULL) {
		err = ENOMEM;
		goto done;
	}

	iptun->iptun_linkid = ik->iptun_kparam_linkid;
	iptun->iptun_zoneid = zoneid;
	iptun->iptun_ns = ns;

	iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type);
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_UNKNOWN) {
		err = EINVAL;
		goto done;
	}

	if (ik->iptun_kparam_flags & IPTUN_KPARAM_IMPLICIT)
		iptun->iptun_flags |= IPTUN_IMPLICIT;

	if ((err = iptun_setparams(iptun, ik)) != 0)
		goto done;

	iptun->iptun_hoplimit = IPTUN_DEFAULT_HOPLIMIT;
	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_IPV6)
		iptun->iptun_encaplimit = IPTUN_DEFAULT_ENCAPLIMIT;

	iptun_headergen(iptun, B_FALSE);

	iptun->iptun_connp = iptun_conn_create(iptun, ns, credp);
	if (iptun->iptun_connp == NULL) {
		err = ENOMEM;
		goto done;
	}

	/* Start out at the type's maximum MTU. */
	iptun->iptun_mtu = iptun->iptun_typeinfo->iti_maxmtu;
	iptun->iptun_dpmtu = iptun->iptun_mtu;

	/*
	 * Find an ITP based on linkname.  If we have parms already set via
	 * the iptun_setparams() call above, it may have created an ITP for
	 * us.  We always try get_tunnel_policy() for DEBUG correctness
	 * checks, and we may wish to refactor this to only check when
	 * iptun_itp is NULL.
	 *
	 * NOTE(review): if get_tunnel_policy() returns a held ITP and
	 * iptun_itp was already set by iptun_setparams(), this looks like it
	 * could leave an extra reference -- confirm against
	 * get_tunnel_policy()/create_tunnel_policy().
	 */
	if ((err = dls_mgmt_get_linkinfo(iptun->iptun_linkid, linkname, NULL,
	    NULL, NULL)) != 0)
		goto done;
	if ((itp = get_tunnel_policy(linkname, ns)) != NULL)
		iptun->iptun_itp = itp;

	/*
	 * See if we have the necessary IP addresses assigned to this tunnel
	 * to try and bind them with ip underneath us.  If we're not ready to
	 * bind yet, then we'll defer the bind operation until the addresses
	 * are modified.
	 */
	if (iptun_canbind(iptun) && ((err = iptun_bind(iptun)) != 0))
		goto done;

	if ((err = iptun_register(iptun)) != 0)
		goto done;

	err = dls_devnet_create(iptun->iptun_mh, iptun->iptun_linkid,
	    iptun->iptun_zoneid);
	if (err != 0)
		goto done;
	/* Remember that the devnet node must be destroyed on failure. */
	link_created = B_TRUE;

	/*
	 * We hash by link-id as that is the key used by all other iptun
	 * interfaces (modify, delete, etc.).
	 */
	if ((mherr = mod_hash_insert(iptun_hash,
	    IPTUN_HASH_KEY(iptun->iptun_linkid), (mod_hash_val_t)iptun)) == 0) {
		mutex_enter(&iptuns->iptuns_lock);
		list_insert_head(&iptuns->iptuns_iptunlist, iptun);
		mutex_exit(&iptuns->iptuns_lock);
		iptun->iptun_flags |= IPTUN_HASH_INSERTED;
	} else if (mherr == MH_ERR_NOMEM) {
		err = ENOMEM;
	} else if (mherr == MH_ERR_DUPLICATE) {
		err = EEXIST;
	} else {
		err = EINVAL;
	}

done:
	/*
	 * The netstack hold is released directly only when no iptun_t was
	 * allocated to carry it in iptun_ns.
	 */
	if (iptun == NULL && ns != NULL)
		netstack_rele(ns);
	if (err != 0 && iptun != NULL) {
		if (link_created) {
			(void) dls_devnet_destroy(iptun->iptun_mh, &tmpid,
			    B_TRUE);
		}
		/* iptun_free() asserts that the tunnel is condemned. */
		iptun->iptun_flags |= IPTUN_CONDEMNED;
		iptun_free(iptun);
	}
	return (err);
}
1502 
/*
 * Delete the tunnel identified by linkid on behalf of the caller identified
 * by credp.
 *
 * Returns 0 if the tunnel was destroyed.  Returns the error from
 * iptun_enter_by_linkid() if the tunnel could not be entered, EACCES if the
 * tunnel belongs to a different zone than the caller, or the error from
 * dls_devnet_destroy() or mac_disable(); in those cases the tunnel is left
 * in place.
 */
int
iptun_delete(datalink_id_t linkid, cred_t *credp)
{
	int	err;
	iptun_t	*iptun = NULL;

	if ((err = iptun_enter_by_linkid(linkid, &iptun)) != 0)
		return (err);

	/* One cannot delete a tunnel that belongs to another zone. */
	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
		iptun_exit(iptun);
		return (EACCES);
	}

	/*
	 * We need to exit iptun in order to issue calls up the stack such as
	 * dls_devnet_destroy().  If we call up while still in iptun, deadlock
	 * with calls coming down the stack is possible.  We prevent other
	 * threads from entering this iptun after we've exited it by setting
	 * the IPTUN_DELETE_PENDING flag.  This will cause callers of
	 * iptun_enter() to block waiting on iptun_enter_cv.  The assumption
	 * here is that the functions we're calling while IPTUN_DELETE_PENDING
	 * is set don't result in an iptun_enter() call, as that would result
	 * in deadlock.
	 */
	iptun->iptun_flags |= IPTUN_DELETE_PENDING;

	/* Wait for any pending upcall to the mac module to complete. */
	while (iptun->iptun_flags & IPTUN_UPCALL_PENDING)
		cv_wait(&iptun->iptun_upcall_cv, &iptun->iptun_lock);

	iptun_exit(iptun);

	if ((err = dls_devnet_destroy(iptun->iptun_mh, &linkid, B_TRUE)) == 0) {
		/*
		 * mac_disable() will fail with EBUSY if there are references
		 * to the iptun MAC.  If there are none, then mac_disable()
		 * will assure that none can be acquired until the MAC is
		 * unregistered.
		 *
		 * XXX CR 6791335 prevents us from calling mac_disable() prior
		 * to dls_devnet_destroy(), so we unfortunately need to
		 * attempt to re-create the devnet node if mac_disable()
		 * fails.
		 */
		if ((err = mac_disable(iptun->iptun_mh)) != 0) {
			(void) dls_devnet_create(iptun->iptun_mh, linkid,
			    iptun->iptun_zoneid);
		}
	}

	/*
	 * Now that we know the fate of this iptun_t, we need to clear
	 * IPTUN_DELETE_PENDING, and set IPTUN_CONDEMNED if the iptun_t is
	 * slated to be freed.  Either way, we need to signal the threads
	 * waiting in iptun_enter() so that they can either fail if
	 * IPTUN_CONDEMNED is set, or continue if it's not.
	 */
	mutex_enter(&iptun->iptun_lock);
	iptun->iptun_flags &= ~IPTUN_DELETE_PENDING;
	if (err == 0)
		iptun->iptun_flags |= IPTUN_CONDEMNED;
	cv_broadcast(&iptun->iptun_enter_cv);
	mutex_exit(&iptun->iptun_lock);

	/*
	 * Note that there is no danger in calling iptun_free() after having
	 * dropped the iptun_lock since callers of iptun_enter() at this point
	 * are doing so from iptun_enter_by_linkid() (mac_disable() got rid of
	 * threads entering from mac callbacks which call iptun_enter()
	 * directly) which holds iptun_hash_lock, and iptun_free() grabs this
	 * lock in order to remove the iptun_t from the hash table.
	 */
	if (err == 0)
		iptun_free(iptun);

	return (err);
}
1582 
1583 int
1584 iptun_modify(const iptun_kparams_t *ik, cred_t *credp)
1585 {
1586 	iptun_t		*iptun;
1587 	boolean_t	laddr_change = B_FALSE, raddr_change = B_FALSE;
1588 	int		err;
1589 
1590 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1591 		return (err);
1592 
1593 	/* One cannot modify a tunnel that belongs to another zone. */
1594 	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1595 		err = EACCES;
1596 		goto done;
1597 	}
1598 
1599 	/* The tunnel type cannot be changed */
1600 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE) {
1601 		err = EINVAL;
1602 		goto done;
1603 	}
1604 
1605 	if ((err = iptun_setparams(iptun, ik)) != 0)
1606 		goto done;
1607 	iptun_headergen(iptun, B_FALSE);
1608 
1609 	/*
1610 	 * If any of the tunnel's addresses has been modified and the tunnel
1611 	 * has the necessary addresses assigned to it, we need to try to bind
1612 	 * with ip underneath us.  If we're not ready to bind yet, then we'll
1613 	 * try again when the addresses are modified later.
1614 	 */
1615 	laddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR);
1616 	raddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR);
1617 	if (laddr_change || raddr_change) {
1618 		if (iptun->iptun_flags & IPTUN_BOUND)
1619 			iptun_unbind(iptun);
1620 		if (iptun_canbind(iptun) && (err = iptun_bind(iptun)) != 0) {
1621 			if (laddr_change)
1622 				iptun->iptun_flags &= ~IPTUN_LADDR;
1623 			if (raddr_change)
1624 				iptun->iptun_flags &= ~IPTUN_RADDR;
1625 			goto done;
1626 		}
1627 	}
1628 
1629 	if (laddr_change)
1630 		iptun_task_dispatch(iptun, IPTUN_TASK_LADDR_UPDATE);
1631 	if (raddr_change)
1632 		iptun_task_dispatch(iptun, IPTUN_TASK_RADDR_UPDATE);
1633 
1634 done:
1635 	iptun_exit(iptun);
1636 	return (err);
1637 }
1638 
1639 /* Given an IP tunnel's datalink id, fill in its parameters. */
1640 int
1641 iptun_info(iptun_kparams_t *ik, cred_t *credp)
1642 {
1643 	iptun_t	*iptun;
1644 	int	err;
1645 
1646 	/* Is the tunnel link visible from the caller's zone? */
1647 	if (!dls_devnet_islinkvisible(ik->iptun_kparam_linkid,
1648 	    crgetzoneid(credp)))
1649 		return (ENOENT);
1650 
1651 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1652 		return (err);
1653 
1654 	bzero(ik, sizeof (iptun_kparams_t));
1655 
1656 	ik->iptun_kparam_linkid = iptun->iptun_linkid;
1657 	ik->iptun_kparam_type = iptun->iptun_typeinfo->iti_type;
1658 	ik->iptun_kparam_flags |= IPTUN_KPARAM_TYPE;
1659 
1660 	if (iptun->iptun_flags & IPTUN_LADDR) {
1661 		iptun_getaddr(&iptun->iptun_laddr, &ik->iptun_kparam_laddr);
1662 		ik->iptun_kparam_flags |= IPTUN_KPARAM_LADDR;
1663 	}
1664 	if (iptun->iptun_flags & IPTUN_RADDR) {
1665 		iptun_getaddr(&iptun->iptun_raddr, &ik->iptun_kparam_raddr);
1666 		ik->iptun_kparam_flags |= IPTUN_KPARAM_RADDR;
1667 	}
1668 
1669 	if (iptun->iptun_flags & IPTUN_IMPLICIT)
1670 		ik->iptun_kparam_flags |= IPTUN_KPARAM_IMPLICIT;
1671 
1672 	if (iptun->iptun_itp != NULL) {
1673 		mutex_enter(&iptun->iptun_itp->itp_lock);
1674 		if (iptun->iptun_itp->itp_flags & ITPF_P_ACTIVE) {
1675 			ik->iptun_kparam_flags |= IPTUN_KPARAM_IPSECPOL;
1676 			if (iptun->iptun_flags & IPTUN_SIMPLE_POLICY) {
1677 				ik->iptun_kparam_flags |= IPTUN_KPARAM_SECINFO;
1678 				ik->iptun_kparam_secinfo =
1679 				    iptun->iptun_simple_policy;
1680 			}
1681 		}
1682 		mutex_exit(&iptun->iptun_itp->itp_lock);
1683 	}
1684 
1685 done:
1686 	iptun_exit(iptun);
1687 	return (err);
1688 }
1689 
1690 int
1691 iptun_set_6to4relay(netstack_t *ns, ipaddr_t relay_addr)
1692 {
1693 	if (relay_addr == INADDR_BROADCAST || CLASSD(relay_addr))
1694 		return (EADDRNOTAVAIL);
1695 	ns->netstack_iptun->iptuns_relay_rtr_addr = relay_addr;
1696 	return (0);
1697 }
1698 
1699 void
1700 iptun_get_6to4relay(netstack_t *ns, ipaddr_t *relay_addr)
1701 {
1702 	*relay_addr = ns->netstack_iptun->iptuns_relay_rtr_addr;
1703 }
1704 
1705 void
1706 iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp)
1707 {
1708 	iptun_t	*iptun;
1709 
1710 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
1711 		return;
1712 	if (iptun->iptun_itp != itp) {
1713 		ASSERT(iptun->iptun_itp == NULL);
1714 		ITP_REFHOLD(itp);
1715 		iptun->iptun_itp = itp;
1716 	}
1717 	/*
1718 	 * IPsec policy means IPsec overhead, which means lower MTU.
1719 	 * Refresh the MTU for this tunnel.
1720 	 */
1721 	(void) iptun_update_mtu(iptun, NULL, 0);
1722 	iptun_exit(iptun);
1723 }
1724 
1725 /*
1726  * Obtain the path MTU to the tunnel destination.
1727  * Can return zero in some cases.
1728  */
1729 static uint32_t
1730 iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1731 {
1732 	uint32_t	pmtu = 0;
1733 	conn_t		*connp = iptun->iptun_connp;
1734 	boolean_t	need_rele = B_FALSE;
1735 
1736 	/*
1737 	 * We only obtain the pmtu for tunnels that have a remote tunnel
1738 	 * address.
1739 	 */
1740 	if (!(iptun->iptun_flags & IPTUN_RADDR))
1741 		return (0);
1742 
1743 	if (ixa == NULL) {
1744 		ixa = conn_get_ixa(connp, B_FALSE);
1745 		if (ixa == NULL)
1746 			return (0);
1747 		need_rele = B_TRUE;
1748 	}
1749 	/*
1750 	 * Guard against ICMP errors before we have sent, as well as against
1751 	 * and a thread which held conn_ixa.
1752 	 */
1753 	if (ixa->ixa_ire != NULL) {
1754 		pmtu = ip_get_pmtu(ixa);
1755 
1756 		/*
1757 		 * For both IPv4 and IPv6 we can have indication that the outer
1758 		 * header needs fragmentation.
1759 		 */
1760 		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1761 			/* Must allow fragmentation in ip_output */
1762 			ixa->ixa_flags &= ~IXAF_DONTFRAG;
1763 		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1764 			ixa->ixa_flags |= IXAF_DONTFRAG;
1765 		} else {
1766 			/* ip_get_pmtu might have set this - we don't want it */
1767 			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1768 		}
1769 	}
1770 
1771 	if (need_rele)
1772 		ixa_refrele(ixa);
1773 	return (pmtu);
1774 }
1775 
1776 /*
1777  * Update the ip_xmit_attr_t to capture the current lower path mtu as known
1778  * by ip.
1779  */
1780 static void
1781 iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1782 {
1783 	uint32_t	pmtu;
1784 	conn_t		*connp = iptun->iptun_connp;
1785 	boolean_t	need_rele = B_FALSE;
1786 
1787 	/* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */
1788 	if (!(iptun->iptun_flags & IPTUN_RADDR))
1789 		return;
1790 
1791 	if (ixa == NULL) {
1792 		ixa = conn_get_ixa(connp, B_FALSE);
1793 		if (ixa == NULL)
1794 			return;
1795 		need_rele = B_TRUE;
1796 	}
1797 	/*
1798 	 * Guard against ICMP errors before we have sent, as well as against
1799 	 * and a thread which held conn_ixa.
1800 	 */
1801 	if (ixa->ixa_ire != NULL) {
1802 		pmtu = ip_get_pmtu(ixa);
1803 		/*
1804 		 * Update ixa_fragsize and ixa_pmtu.
1805 		 */
1806 		ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
1807 
1808 		/*
1809 		 * For both IPv4 and IPv6 we can have indication that the outer
1810 		 * header needs fragmentation.
1811 		 */
1812 		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1813 			/* Must allow fragmentation in ip_output */
1814 			ixa->ixa_flags &= ~IXAF_DONTFRAG;
1815 		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1816 			ixa->ixa_flags |= IXAF_DONTFRAG;
1817 		} else {
1818 			/* ip_get_pmtu might have set this - we don't want it */
1819 			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1820 		}
1821 	}
1822 
1823 	if (need_rele)
1824 		ixa_refrele(ixa);
1825 }
1826 
/*
 * ICMP verification callback installed as conn_verifyicmp by
 * iptun_conn_create().
 *
 * There is nothing that iptun can verify in addition to IP having
 * verified the IP addresses in the fanout, so every ICMP message is
 * accepted: all arguments are ignored and B_TRUE is always returned.
 */
/* ARGSUSED */
static boolean_t
iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
    ip_recv_attr_t *ira)
{
	return (B_TRUE);
}
1838 
1839 /*
1840  * Notify function registered with ip_xmit_attr_t.
1841  */
1842 static void
1843 iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
1844     ixa_notify_arg_t narg)
1845 {
1846 	iptun_t		*iptun = (iptun_t *)arg;
1847 
1848 	switch (ntype) {
1849 	case IXAN_PMTU:
1850 		(void) iptun_update_mtu(iptun, ixa, narg);
1851 		break;
1852 	}
1853 }
1854 
1855 /*
1856  * Returns the max of old_ovhd and the overhead associated with pol.
1857  */
1858 static uint32_t
1859 iptun_max_policy_overhead(ipsec_policy_t *pol, uint32_t old_ovhd)
1860 {
1861 	uint32_t new_ovhd = old_ovhd;
1862 
1863 	while (pol != NULL) {
1864 		new_ovhd = max(new_ovhd,
1865 		    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1866 		pol = pol->ipsp_hash.hash_next;
1867 	}
1868 	return (new_ovhd);
1869 }
1870 
1871 static uint32_t
1872 iptun_get_ipsec_overhead(iptun_t *iptun)
1873 {
1874 	ipsec_policy_root_t	*ipr;
1875 	ipsec_policy_head_t	*iph;
1876 	ipsec_policy_t		*pol;
1877 	ipsec_selector_t	sel;
1878 	int			i;
1879 	uint32_t		ipsec_ovhd = 0;
1880 	ipsec_tun_pol_t		*itp = iptun->iptun_itp;
1881 	netstack_t		*ns = iptun->iptun_ns;
1882 
1883 	if (itp == NULL || !(itp->itp_flags & ITPF_P_ACTIVE)) {
1884 		/*
1885 		 * Consult global policy, just in case.  This will only work
1886 		 * if we have both source and destination addresses to work
1887 		 * with.
1888 		 */
1889 		if ((iptun->iptun_flags & (IPTUN_LADDR|IPTUN_RADDR)) !=
1890 		    (IPTUN_LADDR|IPTUN_RADDR))
1891 			return (0);
1892 
1893 		iph = ipsec_system_policy(ns);
1894 		bzero(&sel, sizeof (sel));
1895 		sel.ips_isv4 =
1896 		    (iptun->iptun_typeinfo->iti_ipvers == IPV4_VERSION);
1897 		switch (iptun->iptun_typeinfo->iti_ipvers) {
1898 		case IPV4_VERSION:
1899 			sel.ips_local_addr_v4 = iptun->iptun_laddr4;
1900 			sel.ips_remote_addr_v4 = iptun->iptun_raddr4;
1901 			break;
1902 		case IPV6_VERSION:
1903 			sel.ips_local_addr_v6 = iptun->iptun_laddr6;
1904 			sel.ips_remote_addr_v6 = iptun->iptun_raddr6;
1905 			break;
1906 		}
1907 		/* Check for both IPv4 and IPv6. */
1908 		sel.ips_protocol = IPPROTO_ENCAP;
1909 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1910 		    &sel);
1911 		if (pol != NULL) {
1912 			ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act);
1913 			IPPOL_REFRELE(pol);
1914 		}
1915 		sel.ips_protocol = IPPROTO_IPV6;
1916 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1917 		    &sel);
1918 		if (pol != NULL) {
1919 			ipsec_ovhd = max(ipsec_ovhd,
1920 			    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1921 			IPPOL_REFRELE(pol);
1922 		}
1923 		IPPH_REFRELE(iph, ns);
1924 	} else {
1925 		/*
1926 		 * Look through all of the possible IPsec actions for the
1927 		 * tunnel, and find the largest potential IPsec overhead.
1928 		 */
1929 		iph = itp->itp_policy;
1930 		rw_enter(&iph->iph_lock, RW_READER);
1931 		ipr = &(iph->iph_root[IPSEC_TYPE_OUTBOUND]);
1932 		ipsec_ovhd = iptun_max_policy_overhead(
1933 		    ipr->ipr_nonhash[IPSEC_AF_V4], 0);
1934 		ipsec_ovhd = iptun_max_policy_overhead(
1935 		    ipr->ipr_nonhash[IPSEC_AF_V6], ipsec_ovhd);
1936 		for (i = 0; i < ipr->ipr_nchains; i++) {
1937 			ipsec_ovhd = iptun_max_policy_overhead(
1938 			    ipr->ipr_hash[i].hash_head, ipsec_ovhd);
1939 		}
1940 		rw_exit(&iph->iph_lock);
1941 	}
1942 
1943 	return (ipsec_ovhd);
1944 }
1945 
1946 /*
1947  * Calculate and return the maximum possible upper MTU for the given tunnel.
1948  *
1949  * If new_pmtu is set then we also need to update the lower path MTU information
1950  * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that
1951  * we are notified by conn_ip_output() when the path MTU increases.
1952  */
1953 static uint32_t
1954 iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
1955 {
1956 	size_t		header_size, ipsec_overhead;
1957 	uint32_t	maxmtu, pmtu;
1958 
1959 	/*
1960 	 * Start with the path-MTU to the remote address, which is either
1961 	 * provided as the new_pmtu argument, or obtained using
1962 	 * iptun_get_dst_pmtu().
1963 	 */
1964 	if (new_pmtu != 0) {
1965 		if (iptun->iptun_flags & IPTUN_RADDR)
1966 			iptun->iptun_dpmtu = new_pmtu;
1967 		pmtu = new_pmtu;
1968 	} else if (iptun->iptun_flags & IPTUN_RADDR) {
1969 		if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) {
1970 			/*
1971 			 * We weren't able to obtain the path-MTU of the
1972 			 * destination.  Use the previous value.
1973 			 */
1974 			pmtu = iptun->iptun_dpmtu;
1975 		} else {
1976 			iptun->iptun_dpmtu = pmtu;
1977 		}
1978 	} else {
1979 		/*
1980 		 * We have no path-MTU information to go on, use the maximum
1981 		 * possible value.
1982 		 */
1983 		pmtu = iptun->iptun_typeinfo->iti_maxmtu;
1984 	}
1985 
1986 	/*
1987 	 * Now calculate tunneling overhead and subtract that from the
1988 	 * path-MTU information obtained above.
1989 	 */
1990 	if (iptun->iptun_header_size != 0) {
1991 		header_size = iptun->iptun_header_size;
1992 	} else {
1993 		switch (iptun->iptun_typeinfo->iti_ipvers) {
1994 		case IPV4_VERSION:
1995 			header_size = sizeof (ipha_t);
1996 			if (is_system_labeled())
1997 				header_size += IP_MAX_OPT_LENGTH;
1998 			break;
1999 		case IPV6_VERSION:
2000 			header_size = sizeof (iptun_ipv6hdrs_t);
2001 			break;
2002 		}
2003 	}
2004 
2005 	ipsec_overhead = iptun_get_ipsec_overhead(iptun);
2006 
2007 	maxmtu = pmtu - (header_size + ipsec_overhead);
2008 	return (max(maxmtu, iptun->iptun_typeinfo->iti_minmtu));
2009 }
2010 
2011 /*
2012  * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer
2013  * of any change in MTU.  The new_pmtu argument is the new lower path MTU to
2014  * the tunnel destination to be used in the tunnel MTU calculation.  Passing
2015  * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using
2016  * ip_get_pmtu().
2017  *
2018  * If the calculated tunnel MTU is different than its previous value, then we
2019  * notify the MAC layer above us of this change using mac_maxsdu_update().
2020  */
2021 static uint32_t
2022 iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
2023 {
2024 	uint32_t newmtu;
2025 
2026 	/* We always update the ixa since we might have set IXAF_VERIFY_PMTU */
2027 	iptun_update_dst_pmtu(iptun, ixa);
2028 
2029 	/*
2030 	 * We return the current MTU without updating it if it was pegged to a
2031 	 * static value using the MAC_PROP_MTU link property.
2032 	 */
2033 	if (iptun->iptun_flags & IPTUN_FIXED_MTU)
2034 		return (iptun->iptun_mtu);
2035 
2036 	/* If the MTU isn't fixed, then use the maximum possible value. */
2037 	newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu);
2038 	/*
2039 	 * We only dynamically adjust the tunnel MTU for tunnels with
2040 	 * destinations because dynamic MTU calculations are based on the
2041 	 * destination path-MTU.
2042 	 */
2043 	if ((iptun->iptun_flags & IPTUN_RADDR) && newmtu != iptun->iptun_mtu) {
2044 		iptun->iptun_mtu = newmtu;
2045 		if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
2046 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
2047 	}
2048 
2049 	return (newmtu);
2050 }
2051 
2052 /*
2053  * Frees a packet or packet chain and bumps stat for each freed packet.
2054  */
2055 static void
2056 iptun_drop_pkt(mblk_t *mp, uint64_t *stat)
2057 {
2058 	mblk_t *pktmp;
2059 
2060 	for (pktmp = mp; pktmp != NULL; pktmp = mp) {
2061 		mp = mp->b_next;
2062 		pktmp->b_next = NULL;
2063 		if (stat != NULL)
2064 			atomic_inc_64(stat);
2065 		freemsg(pktmp);
2066 	}
2067 }
2068 
2069 /*
2070  * Allocate and return a new mblk to hold an IP and ICMP header, and chain the
2071  * original packet to its b_cont.  Returns NULL on failure.
2072  */
2073 static mblk_t *
2074 iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
2075 {
2076 	mblk_t *icmperr_mp;
2077 
2078 	if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) {
2079 		icmperr_mp->b_wptr += hdrs_size;
2080 		/* tack on the offending packet */
2081 		icmperr_mp->b_cont = orig_pkt;
2082 	}
2083 	return (icmperr_mp);
2084 }
2085 
2086 /*
2087  * Transmit an ICMP error.  mp->b_rptr points at the packet to be included in
2088  * the ICMP error.
2089  */
2090 static void
2091 iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp,
2092     ts_label_t *tsl)
2093 {
2094 	size_t	orig_pktsize, hdrs_size;
2095 	mblk_t	*icmperr_mp;
2096 	ipha_t	*new_ipha;
2097 	icmph_t	*new_icmp;
2098 	ip_xmit_attr_t	ixas;
2099 	conn_t	*connp = iptun->iptun_connp;
2100 
2101 	orig_pktsize = msgdsize(mp);
2102 	hdrs_size = sizeof (ipha_t) + sizeof (icmph_t);
2103 	if ((icmperr_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2104 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2105 		return;
2106 	}
2107 
2108 	new_ipha = (ipha_t *)icmperr_mp->b_rptr;
2109 	new_icmp = (icmph_t *)(new_ipha + 1);
2110 
2111 	new_ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2112 	new_ipha->ipha_type_of_service = 0;
2113 	new_ipha->ipha_ident = 0;
2114 	new_ipha->ipha_fragment_offset_and_flags = 0;
2115 	new_ipha->ipha_ttl = orig_ipha->ipha_ttl;
2116 	new_ipha->ipha_protocol = IPPROTO_ICMP;
2117 	new_ipha->ipha_src = orig_ipha->ipha_dst;
2118 	new_ipha->ipha_dst = orig_ipha->ipha_src;
2119 	new_ipha->ipha_hdr_checksum = 0; /* will be computed by ip */
2120 	new_ipha->ipha_length = htons(hdrs_size + orig_pktsize);
2121 
2122 	*new_icmp = *icmp;
2123 	new_icmp->icmph_checksum = 0;
2124 	new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0);
2125 
2126 	bzero(&ixas, sizeof (ixas));
2127 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2128 	if (new_ipha->ipha_src == INADDR_ANY) {
2129 		ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
2130 		ixas.ixa_flags |= IXAF_SET_SOURCE;
2131 	}
2132 
2133 	ixas.ixa_zoneid = IPCL_ZONEID(connp);
2134 	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2135 	ixas.ixa_cred = connp->conn_cred;
2136 	ixas.ixa_cpid = NOPID;
2137 	if (is_system_labeled())
2138 		ixas.ixa_tsl = tsl;
2139 
2140 	ixas.ixa_ifindex = 0;
2141 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2142 
2143 	(void) ip_output_simple(icmperr_mp, &ixas);
2144 	ixa_cleanup(&ixas);
2145 }
2146 
2147 static void
2148 iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp,
2149     ts_label_t *tsl)
2150 {
2151 	size_t	orig_pktsize, hdrs_size;
2152 	mblk_t	*icmp6err_mp;
2153 	ip6_t	*new_ip6h;
2154 	icmp6_t	*new_icmp6;
2155 	ip_xmit_attr_t	ixas;
2156 	conn_t	*connp = iptun->iptun_connp;
2157 
2158 	orig_pktsize = msgdsize(mp);
2159 	hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t);
2160 	if ((icmp6err_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2161 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2162 		return;
2163 	}
2164 
2165 	new_ip6h = (ip6_t *)icmp6err_mp->b_rptr;
2166 	new_icmp6 = (icmp6_t *)(new_ip6h + 1);
2167 
2168 	new_ip6h->ip6_vcf = orig_ip6h->ip6_vcf;
2169 	new_ip6h->ip6_plen = htons(sizeof (icmp6_t) + orig_pktsize);
2170 	new_ip6h->ip6_hops = orig_ip6h->ip6_hops;
2171 	new_ip6h->ip6_nxt = IPPROTO_ICMPV6;
2172 	new_ip6h->ip6_src = orig_ip6h->ip6_dst;
2173 	new_ip6h->ip6_dst = orig_ip6h->ip6_src;
2174 
2175 	*new_icmp6 = *icmp6;
2176 	/* The checksum is calculated in ip_output_simple and friends. */
2177 	new_icmp6->icmp6_cksum = new_ip6h->ip6_plen;
2178 
2179 	bzero(&ixas, sizeof (ixas));
2180 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
2181 	if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src)) {
2182 		ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
2183 		ixas.ixa_flags |= IXAF_SET_SOURCE;
2184 	}
2185 
2186 	ixas.ixa_zoneid = IPCL_ZONEID(connp);
2187 	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2188 	ixas.ixa_cred = connp->conn_cred;
2189 	ixas.ixa_cpid = NOPID;
2190 	if (is_system_labeled())
2191 		ixas.ixa_tsl = tsl;
2192 
2193 	ixas.ixa_ifindex = 0;
2194 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2195 
2196 	(void) ip_output_simple(icmp6err_mp, &ixas);
2197 	ixa_cleanup(&ixas);
2198 }
2199 
2200 static void
2201 iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
2202     uint8_t type, uint8_t code, ts_label_t *tsl)
2203 {
2204 	icmph_t icmp;
2205 
2206 	bzero(&icmp, sizeof (icmp));
2207 	icmp.icmph_type = type;
2208 	icmp.icmph_code = code;
2209 
2210 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2211 }
2212 
2213 static void
2214 iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
2215     mblk_t *mp, ts_label_t *tsl)
2216 {
2217 	icmph_t	icmp;
2218 
2219 	icmp.icmph_type = ICMP_DEST_UNREACHABLE;
2220 	icmp.icmph_code = ICMP_FRAGMENTATION_NEEDED;
2221 	icmp.icmph_du_zero = 0;
2222 	icmp.icmph_du_mtu = htons(newmtu);
2223 
2224 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2225 }
2226 
2227 static void
2228 iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
2229     uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl)
2230 {
2231 	icmp6_t icmp6;
2232 
2233 	bzero(&icmp6, sizeof (icmp6));
2234 	icmp6.icmp6_type = type;
2235 	icmp6.icmp6_code = code;
2236 	if (type == ICMP6_PARAM_PROB)
2237 		icmp6.icmp6_pptr = htonl(offset);
2238 
2239 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2240 }
2241 
2242 static void
2243 iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
2244     mblk_t *mp, ts_label_t *tsl)
2245 {
2246 	icmp6_t icmp6;
2247 
2248 	icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
2249 	icmp6.icmp6_code = 0;
2250 	icmp6.icmp6_mtu = htonl(newmtu);
2251 
2252 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2253 }
2254 
2255 /*
2256  * Determines if the packet pointed to by ipha or ip6h is an ICMP error.  The
2257  * mp argument is only used to do bounds checking.
2258  */
2259 static boolean_t
2260 is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2261 {
2262 	uint16_t hlen;
2263 
2264 	if (ipha != NULL) {
2265 		icmph_t	*icmph;
2266 
2267 		ASSERT(ip6h == NULL);
2268 		if (ipha->ipha_protocol != IPPROTO_ICMP)
2269 			return (B_FALSE);
2270 
2271 		hlen = IPH_HDR_LENGTH(ipha);
2272 		icmph = (icmph_t *)((uint8_t *)ipha + hlen);
2273 		return (ICMP_IS_ERROR(icmph->icmph_type) ||
2274 		    icmph->icmph_type == ICMP_REDIRECT);
2275 	} else {
2276 		icmp6_t	*icmp6;
2277 		uint8_t	*nexthdrp;
2278 
2279 		ASSERT(ip6h != NULL);
2280 		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hlen, &nexthdrp) ||
2281 		    *nexthdrp != IPPROTO_ICMPV6) {
2282 			return (B_FALSE);
2283 		}
2284 
2285 		icmp6 = (icmp6_t *)((uint8_t *)ip6h + hlen);
2286 		return (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
2287 		    icmp6->icmp6_type == ND_REDIRECT);
2288 	}
2289 }
2290 
2291 /*
2292  * Find inner and outer IP headers from a tunneled packet as setup for calls
2293  * into ipsec_tun_{in,out}bound().
2294  * Note that we need to allow the outer header to be in a separate mblk from
2295  * the inner header.
2296  * If the caller knows the outer_hlen, the caller passes it in. Otherwise zero.
2297  */
2298 static size_t
2299 iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4,
2300     ipha_t **inner4, ip6_t **outer6, ip6_t **inner6)
2301 {
2302 	ipha_t	*ipha;
2303 	size_t	first_mblkl = MBLKL(mp);
2304 	mblk_t	*inner_mp;
2305 
2306 	/*
2307 	 * Don't bother handling packets that don't have a full IP header in
2308 	 * the fist mblk.  For the input path, the ip module ensures that this
2309 	 * won't happen, and on the output path, the IP tunneling MAC-type
2310 	 * plugins ensure that this also won't happen.
2311 	 */
2312 	if (first_mblkl < sizeof (ipha_t))
2313 		return (0);
2314 	ipha = (ipha_t *)(mp->b_rptr);
2315 	switch (IPH_HDR_VERSION(ipha)) {
2316 	case IPV4_VERSION:
2317 		*outer4 = ipha;
2318 		*outer6 = NULL;
2319 		if (outer_hlen == 0)
2320 			outer_hlen = IPH_HDR_LENGTH(ipha);
2321 		break;
2322 	case IPV6_VERSION:
2323 		*outer4 = NULL;
2324 		*outer6 = (ip6_t *)ipha;
2325 		if (outer_hlen == 0)
2326 			outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
2327 		break;
2328 	default:
2329 		return (0);
2330 	}
2331 
2332 	if (first_mblkl < outer_hlen ||
2333 	    (first_mblkl == outer_hlen && mp->b_cont == NULL))
2334 		return (0);
2335 
2336 	/*
2337 	 * We don't bother doing a pullup here since the outer header will
2338 	 * just get stripped off soon on input anyway.  We just want to ensure
2339 	 * that the inner* pointer points to a full header.
2340 	 */
2341 	if (first_mblkl == outer_hlen) {
2342 		inner_mp = mp->b_cont;
2343 		ipha = (ipha_t *)inner_mp->b_rptr;
2344 	} else {
2345 		inner_mp = mp;
2346 		ipha = (ipha_t *)(mp->b_rptr + outer_hlen);
2347 	}
2348 	switch (IPH_HDR_VERSION(ipha)) {
2349 	case IPV4_VERSION:
2350 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ipha_t))
2351 			return (0);
2352 		*inner4 = ipha;
2353 		*inner6 = NULL;
2354 		break;
2355 	case IPV6_VERSION:
2356 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ip6_t))
2357 			return (0);
2358 		*inner4 = NULL;
2359 		*inner6 = (ip6_t *)ipha;
2360 		break;
2361 	default:
2362 		return (0);
2363 	}
2364 
2365 	return (outer_hlen);
2366 }
2367 
2368 /*
2369  * Received ICMP error in response to an X over IPv4 packet that we
2370  * transmitted.
2371  *
2372  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2373  * the following:
2374  *
2375  * [IPv4(0)][ICMPv4][IPv4(1)][IPv4(2)][ULP]
2376  *
2377  *	or
2378  *
2379  * [IPv4(0)][ICMPv4][IPv4(1)][IPv6][ULP]
2380  *
2381  * And "outer4" will get set to IPv4(1), and inner[46] will correspond to
2382  * whatever the very-inner packet is (IPv4(2) or IPv6).
2383  */
2384 static void
2385 iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph,
2386     ip_recv_attr_t *ira)
2387 {
2388 	uint8_t	*orig;
2389 	ipha_t	*outer4, *inner4;
2390 	ip6_t	*outer6, *inner6;
2391 	int	outer_hlen;
2392 	uint8_t	type, code;
2393 
2394 	ASSERT(data_mp->b_cont == NULL);
2395 	/*
2396 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2397 	 * find headers in the ICMP packet payload.
2398 	 */
2399 	orig = data_mp->b_rptr;
2400 	data_mp->b_rptr = (uint8_t *)(icmph + 1);
2401 	/*
2402 	 * The ip module ensures that ICMP errors contain at least the
2403 	 * original IP header (otherwise, the error would never have made it
2404 	 * here).
2405 	 */
2406 	ASSERT(MBLKL(data_mp) >= 0);
2407 	outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2408 	    &inner6);
2409 	ASSERT(outer6 == NULL);
2410 	data_mp->b_rptr = orig;
2411 	if (outer_hlen == 0) {
2412 		iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2413 		return;
2414 	}
2415 
2416 	/* Only ICMP errors due to tunneled packets should reach here. */
2417 	ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP ||
2418 	    outer4->ipha_protocol == IPPROTO_IPV6);
2419 
2420 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2421 	    inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2422 	if (data_mp == NULL) {
2423 		/* Callee did all of the freeing. */
2424 		atomic_inc_64(&iptun->iptun_ierrors);
2425 		return;
2426 	}
2427 	/* We should never see reassembled fragment here. */
2428 	ASSERT(data_mp->b_next == NULL);
2429 
2430 	data_mp->b_rptr = (uint8_t *)outer4 + outer_hlen;
2431 
2432 	/*
2433 	 * If the original packet being transmitted was itself an ICMP error,
2434 	 * then drop this packet.  We don't want to generate an ICMP error in
2435 	 * response to an ICMP error.
2436 	 */
2437 	if (is_icmp_error(data_mp, inner4, inner6)) {
2438 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2439 		return;
2440 	}
2441 
2442 	switch (icmph->icmph_type) {
2443 	case ICMP_DEST_UNREACHABLE:
2444 		type = (inner4 != NULL ? icmph->icmph_type : ICMP6_DST_UNREACH);
2445 		switch (icmph->icmph_code) {
2446 		case ICMP_FRAGMENTATION_NEEDED: {
2447 			uint32_t newmtu;
2448 
2449 			/*
2450 			 * We reconcile this with the fact that the tunnel may
2451 			 * also have IPsec policy by letting iptun_update_mtu
2452 			 * take care of it.
2453 			 */
2454 			newmtu = iptun_update_mtu(iptun, NULL,
2455 			    ntohs(icmph->icmph_du_mtu));
2456 
2457 			if (inner4 != NULL) {
2458 				iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2459 				    data_mp, ira->ira_tsl);
2460 			} else {
2461 				iptun_icmp_toobig_v6(iptun, newmtu, inner6,
2462 				    data_mp, ira->ira_tsl);
2463 			}
2464 			return;
2465 		}
2466 		case ICMP_DEST_NET_UNREACH_ADMIN:
2467 		case ICMP_DEST_HOST_UNREACH_ADMIN:
2468 			code = (inner4 != NULL ? ICMP_DEST_NET_UNREACH_ADMIN :
2469 			    ICMP6_DST_UNREACH_ADMIN);
2470 			break;
2471 		default:
2472 			code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2473 			    ICMP6_DST_UNREACH_ADDR);
2474 			break;
2475 		}
2476 		break;
2477 	case ICMP_TIME_EXCEEDED:
2478 		if (inner6 != NULL) {
2479 			type = ICMP6_TIME_EXCEEDED;
2480 			code = 0;
2481 		} /* else we're already set. */
2482 		break;
2483 	case ICMP_PARAM_PROBLEM:
2484 		/*
2485 		 * This is a problem with the outer header we transmitted.
2486 		 * Treat this as an output error.
2487 		 */
2488 		iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2489 		return;
2490 	default:
2491 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2492 		return;
2493 	}
2494 
2495 	if (inner4 != NULL) {
2496 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2497 		    ira->ira_tsl);
2498 	} else {
2499 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2500 		    ira->ira_tsl);
2501 	}
2502 }
2503 
2504 /*
2505  * Return B_TRUE if the IPv6 packet pointed to by ip6h contains a Tunnel
2506  * Encapsulation Limit destination option.  If there is one, set encaplim_ptr
2507  * to point to the option value.
2508  */
2509 static boolean_t
2510 iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
2511 {
2512 	ip_pkt_t	pkt;
2513 	uint8_t		*endptr;
2514 	ip6_dest_t	*destp;
2515 	struct ip6_opt	*optp;
2516 
2517 	pkt.ipp_fields = 0; /* must be initialized */
2518 	(void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL);
2519 	if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) {
2520 		destp = pkt.ipp_dstopts;
2521 	} else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) {
2522 		destp = pkt.ipp_rthdrdstopts;
2523 	} else {
2524 		return (B_FALSE);
2525 	}
2526 
2527 	endptr = (uint8_t *)destp + 8 * (destp->ip6d_len + 1);
2528 	optp = (struct ip6_opt *)(destp + 1);
2529 	while (endptr - (uint8_t *)optp > sizeof (*optp)) {
2530 		if (optp->ip6o_type == IP6OPT_TUNNEL_LIMIT) {
2531 			if ((uint8_t *)(optp + 1) >= endptr)
2532 				return (B_FALSE);
2533 			*encaplim_ptr = (uint8_t *)&optp[1];
2534 			return (B_TRUE);
2535 		}
2536 		optp = (struct ip6_opt *)((uint8_t *)optp + optp->ip6o_len + 2);
2537 	}
2538 	return (B_FALSE);
2539 }
2540 
2541 /*
2542  * Received ICMPv6 error in response to an X over IPv6 packet that we
2543  * transmitted.
2544  *
2545  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2546  * the following:
2547  *
2548  * [IPv6(0)][ICMPv6][IPv6(1)][IPv4][ULP]
2549  *
2550  *	or
2551  *
2552  * [IPv6(0)][ICMPv6][IPv6(1)][IPv6(2)][ULP]
2553  *
2554  * And "outer6" will get set to IPv6(1), and inner[46] will correspond to
2555  * whatever the very-inner packet is (IPv4 or IPv6(2)).
2556  */
2557 static void
2558 iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h,
2559     ip_recv_attr_t *ira)
2560 {
2561 	uint8_t	*orig;
2562 	ipha_t	*outer4, *inner4;
2563 	ip6_t	*outer6, *inner6;
2564 	int	outer_hlen;
2565 	uint8_t	type, code;
2566 
2567 	ASSERT(data_mp->b_cont == NULL);
2568 
2569 	/*
2570 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2571 	 * find IP headers in the ICMP packet payload.
2572 	 */
2573 	orig = data_mp->b_rptr;
2574 	data_mp->b_rptr = (uint8_t *)(icmp6h + 1);
2575 	/*
2576 	 * The ip module ensures that ICMP errors contain at least the
2577 	 * original IP header (otherwise, the error would never have made it
2578 	 * here).
2579 	 */
2580 	ASSERT(MBLKL(data_mp) >= 0);
2581 	outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2582 	    &inner6);
2583 	ASSERT(outer4 == NULL);
2584 	data_mp->b_rptr = orig;	/* Restore r_ptr */
2585 	if (outer_hlen == 0) {
2586 		iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2587 		return;
2588 	}
2589 
2590 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2591 	    inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2592 	if (data_mp == NULL) {
2593 		/* Callee did all of the freeing. */
2594 		atomic_inc_64(&iptun->iptun_ierrors);
2595 		return;
2596 	}
2597 	/* We should never see reassembled fragment here. */
2598 	ASSERT(data_mp->b_next == NULL);
2599 
2600 	data_mp->b_rptr = (uint8_t *)outer6 + outer_hlen;
2601 
2602 	/*
2603 	 * If the original packet being transmitted was itself an ICMP error,
2604 	 * then drop this packet.  We don't want to generate an ICMP error in
2605 	 * response to an ICMP error.
2606 	 */
2607 	if (is_icmp_error(data_mp, inner4, inner6)) {
2608 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2609 		return;
2610 	}
2611 
2612 	switch (icmp6h->icmp6_type) {
2613 	case ICMP6_PARAM_PROB: {
2614 		uint8_t *encaplim_ptr;
2615 
2616 		/*
2617 		 * If the ICMPv6 error points to a valid Tunnel Encapsulation
2618 		 * Limit option and the limit value is 0, then fall through
2619 		 * and send a host unreachable message.  Otherwise, treat the
2620 		 * error as an output error, as there must have been a problem
2621 		 * with a packet we sent.
2622 		 */
2623 		if (!iptun_find_encaplimit(data_mp, outer6, &encaplim_ptr) ||
2624 		    (icmp6h->icmp6_pptr !=
2625 		    ((ptrdiff_t)encaplim_ptr - (ptrdiff_t)outer6)) ||
2626 		    *encaplim_ptr != 0) {
2627 			iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2628 			return;
2629 		}
2630 		/* FALLTHRU */
2631 	}
2632 	case ICMP6_TIME_EXCEEDED:
2633 	case ICMP6_DST_UNREACH:
2634 		type = (inner4 != NULL ? ICMP_DEST_UNREACHABLE :
2635 		    ICMP6_DST_UNREACH);
2636 		code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2637 		    ICMP6_DST_UNREACH_ADDR);
2638 		break;
2639 	case ICMP6_PACKET_TOO_BIG: {
2640 		uint32_t newmtu;
2641 
2642 		/*
2643 		 * We reconcile this with the fact that the tunnel may also
2644 		 * have IPsec policy by letting iptun_update_mtu take care of
2645 		 * it.
2646 		 */
2647 		newmtu = iptun_update_mtu(iptun, NULL,
2648 		    ntohl(icmp6h->icmp6_mtu));
2649 
2650 		if (inner4 != NULL) {
2651 			iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2652 			    data_mp, ira->ira_tsl);
2653 		} else {
2654 			iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp,
2655 			    ira->ira_tsl);
2656 		}
2657 		return;
2658 	}
2659 	default:
2660 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2661 		return;
2662 	}
2663 
2664 	if (inner4 != NULL) {
2665 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2666 		    ira->ira_tsl);
2667 	} else {
2668 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2669 		    ira->ira_tsl);
2670 	}
2671 }
2672 
2673 /*
2674  * Called as conn_recvicmp from IP for ICMP errors.
2675  */
2676 /* ARGSUSED2 */
2677 static void
2678 iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2679 {
2680 	conn_t		*connp = arg;
2681 	iptun_t		*iptun = connp->conn_iptun;
2682 	mblk_t		*tmpmp;
2683 	size_t		hlen;
2684 
2685 	ASSERT(IPCL_IS_IPTUN(connp));
2686 
2687 	if (mp->b_cont != NULL) {
2688 		/*
2689 		 * Since ICMP error processing necessitates access to bits
2690 		 * that are within the ICMP error payload (the original packet
2691 		 * that caused the error), pull everything up into a single
2692 		 * block for convenience.
2693 		 */
2694 		if ((tmpmp = msgpullup(mp, -1)) == NULL) {
2695 			iptun_drop_pkt(mp, &iptun->iptun_norcvbuf);
2696 			return;
2697 		}
2698 		freemsg(mp);
2699 		mp = tmpmp;
2700 	}
2701 
2702 	hlen = ira->ira_ip_hdr_length;
2703 	switch (iptun->iptun_typeinfo->iti_ipvers) {
2704 	case IPV4_VERSION:
2705 		/*
2706 		 * The outer IP header coming up from IP is always ipha_t
2707 		 * alligned (otherwise, we would have crashed in ip).
2708 		 */
2709 		iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen),
2710 		    ira);
2711 		break;
2712 	case IPV6_VERSION:
2713 		iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen),
2714 		    ira);
2715 		break;
2716 	}
2717 }
2718 
2719 static boolean_t
2720 iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2721 {
2722 	ipaddr_t v4addr;
2723 
2724 	/*
2725 	 * It's possible that someone sent us an IPv4-in-IPv4 packet with the
2726 	 * IPv4 address of a 6to4 tunnel as the destination.
2727 	 */
2728 	if (inner6 == NULL)
2729 		return (B_FALSE);
2730 
2731 	/*
2732 	 * Make sure that the IPv6 destination is within the site that this
2733 	 * 6to4 tunnel is routing for.  We don't want people bouncing random
2734 	 * tunneled IPv6 packets through this 6to4 router.
2735 	 */
2736 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&v4addr);
2737 	if (outer4->ipha_dst != v4addr)
2738 		return (B_FALSE);
2739 
2740 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_src)) {
2741 		/*
2742 		 * Section 9 of RFC 3056 (security considerations) suggests
2743 		 * that when a packet is from a 6to4 site (i.e., it's not a
2744 		 * global address being forwarded froma relay router), make
2745 		 * sure that the packet was tunneled by that site's 6to4
2746 		 * router.
2747 		 */
2748 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2749 		if (outer4->ipha_src != v4addr)
2750 			return (B_FALSE);
2751 	} else {
2752 		/*
2753 		 * Only accept packets from a relay router if we've configured
2754 		 * outbound relay router functionality.
2755 		 */
2756 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2757 			return (B_FALSE);
2758 	}
2759 
2760 	return (B_TRUE);
2761 }
2762 
2763 /*
2764  * Input function for everything that comes up from the ip module below us.
2765  * This is called directly from the ip module via connp->conn_recv().
2766  *
2767  * We receive M_DATA messages with IP-in-IP tunneled packets.
2768  */
2769 /* ARGSUSED2 */
2770 static void
2771 iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira)
2772 {
2773 	conn_t	*connp = arg;
2774 	iptun_t	*iptun = connp->conn_iptun;
2775 	int	outer_hlen;
2776 	ipha_t	*outer4, *inner4;
2777 	ip6_t	*outer6, *inner6;
2778 
2779 	ASSERT(IPCL_IS_IPTUN(connp));
2780 	ASSERT(DB_TYPE(data_mp) == M_DATA);
2781 
2782 	outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length,
2783 	    &outer4, &inner4, &outer6, &inner6);
2784 	if (outer_hlen == 0)
2785 		goto drop;
2786 
2787 	/*
2788 	 * If the system is labeled, we call tsol_check_dest() on the packet
2789 	 * destination (our local tunnel address) to ensure that the packet as
2790 	 * labeled should be allowed to be sent to us.  We don't need to call
2791 	 * the more involved tsol_receive_local() since the tunnel link itself
2792 	 * cannot be assigned to shared-stack non-global zones.
2793 	 */
2794 	if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2795 		if (ira->ira_tsl == NULL)
2796 			goto drop;
2797 		if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ?
2798 		    (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst),
2799 		    (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION),
2800 		    CONN_MAC_DEFAULT, B_FALSE, NULL) != 0)
2801 			goto drop;
2802 	}
2803 
2804 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2805 	    inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns);
2806 	if (data_mp == NULL) {
2807 		/* Callee did all of the freeing. */
2808 		return;
2809 	}
2810 
2811 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
2812 	    !iptun_in_6to4_ok(iptun, outer4, inner6))
2813 		goto drop;
2814 
2815 	/*
2816 	 * We need to statistically account for each packet individually, so
2817 	 * we might as well split up any b_next chains here.
2818 	 */
2819 	do {
2820 		mblk_t	*mp;
2821 
2822 		mp = data_mp->b_next;
2823 		data_mp->b_next = NULL;
2824 
2825 		atomic_inc_64(&iptun->iptun_ipackets);
2826 		atomic_add_64(&iptun->iptun_rbytes, msgdsize(data_mp));
2827 		mac_rx(iptun->iptun_mh, NULL, data_mp);
2828 
2829 		data_mp = mp;
2830 	} while (data_mp != NULL);
2831 	return;
2832 drop:
2833 	iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2834 }
2835 
2836 /*
2837  * Do 6to4-specific header-processing on output.  Return B_TRUE if the packet
2838  * was processed without issue, or B_FALSE if the packet had issues and should
2839  * be dropped.
2840  */
2841 static boolean_t
2842 iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2843 {
2844 	ipaddr_t v4addr;
2845 
2846 	/*
2847 	 * IPv6 source must be a 6to4 address.  This is because a conscious
2848 	 * decision was made to not allow a Solaris system to be used as a
2849 	 * relay router (for security reasons) when 6to4 was initially
2850 	 * integrated.  If this decision is ever reversed, the following check
2851 	 * can be removed.
2852 	 */
2853 	if (!IN6_IS_ADDR_6TO4(&inner6->ip6_src))
2854 		return (B_FALSE);
2855 
2856 	/*
2857 	 * RFC3056 mandates that the IPv4 source MUST be set to the IPv4
2858 	 * portion of the 6to4 IPv6 source address.  In other words, make sure
2859 	 * that we're tunneling packets from our own 6to4 site.
2860 	 */
2861 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2862 	if (outer4->ipha_src != v4addr)
2863 		return (B_FALSE);
2864 
2865 	/*
2866 	 * Automatically set the destination of the outer IPv4 header as
2867 	 * described in RFC3056.  There are two possibilities:
2868 	 *
2869 	 * a. If the IPv6 destination is a 6to4 address, set the IPv4 address
2870 	 *    to the IPv4 portion of the 6to4 address.
2871 	 * b. If the IPv6 destination is a native IPv6 address, set the IPv4
2872 	 *    destination to the address of a relay router.
2873 	 *
2874 	 * Design Note: b shouldn't be necessary here, and this is a flaw in
2875 	 * the design of the 6to4relay command.  Instead of setting a 6to4
2876 	 * relay address in this module via an ioctl, the 6to4relay command
2877 	 * could simply add a IPv6 route for native IPv6 addresses (such as a
2878 	 * default route) in the forwarding table that uses a 6to4 destination
2879 	 * as its next hop, and the IPv4 portion of that address could be a
2880 	 * 6to4 relay address.  In order for this to work, IP would have to
2881 	 * resolve the next hop address, which would necessitate a link-layer
2882 	 * address resolver for 6to4 links, which doesn't exist today.
2883 	 *
2884 	 * In fact, if a resolver existed for 6to4 links, then setting the
2885 	 * IPv4 destination in the outer header could be done as part of
2886 	 * link-layer address resolution and fast-path header generation, and
2887 	 * not here.
2888 	 */
2889 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_dst)) {
2890 		/* destination is a 6to4 router */
2891 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst,
2892 		    (struct in_addr *)&outer4->ipha_dst);
2893 
2894 		/* Reject attempts to send to INADDR_ANY */
2895 		if (outer4->ipha_dst == INADDR_ANY)
2896 			return (B_FALSE);
2897 	} else {
2898 		/*
2899 		 * The destination is a native IPv6 address.  If output to a
2900 		 * relay-router is enabled, use the relay-router's IPv4
2901 		 * address as the destination.
2902 		 */
2903 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2904 			return (B_FALSE);
2905 		outer4->ipha_dst = iptun->iptun_iptuns->iptuns_relay_rtr_addr;
2906 	}
2907 
2908 	/*
2909 	 * If the outer source and destination are equal, this means that the
2910 	 * 6to4 router somehow forwarded an IPv6 packet destined for its own
2911 	 * 6to4 site to its 6to4 tunnel interface, which will result in this
2912 	 * packet infinitely bouncing between ip and iptun.
2913 	 */
2914 	return (outer4->ipha_src != outer4->ipha_dst);
2915 }
2916 
2917 /*
2918  * Process output packets with outer IPv4 headers.  Frees mp and bumps stat on
2919  * error.
2920  */
2921 static mblk_t *
2922 iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
2923     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
2924 {
2925 	uint8_t	*innerptr = (inner4 != NULL ?
2926 	    (uint8_t *)inner4 : (uint8_t *)inner6);
2927 	size_t	minmtu = iptun->iptun_typeinfo->iti_minmtu;
2928 
2929 	if (inner4 != NULL) {
2930 		ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP);
2931 		/*
2932 		 * Copy the tos from the inner IPv4 header. We mask off ECN
2933 		 * bits (bits 6 and 7) because there is currently no
2934 		 * tunnel-tunnel communication to determine if both sides
2935 		 * support ECN.  We opt for the safe choice: don't copy the
2936 		 * ECN bits when doing encapsulation.
2937 		 */
2938 		outer4->ipha_type_of_service =
2939 		    inner4->ipha_type_of_service & ~0x03;
2940 	} else {
2941 		ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 &&
2942 		    inner6 != NULL);
2943 	}
2944 	if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
2945 		outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
2946 	else
2947 		outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;
2948 
2949 	/*
2950 	 * As described in section 3.2.2 of RFC4213, if the packet payload is
2951 	 * less than or equal to the minimum MTU size, then we need to allow
2952 	 * IPv4 to fragment the packet.  The reason is that even if we end up
2953 	 * receiving an ICMP frag-needed, the interface above this tunnel
2954 	 * won't be allowed to drop its MTU as a result, since the packet was
2955 	 * already smaller than the smallest allowable MTU for that interface.
2956 	 */
2957 	if (mp->b_wptr - innerptr <= minmtu) {
2958 		outer4->ipha_fragment_offset_and_flags = 0;
2959 		ixa->ixa_flags &= ~IXAF_DONTFRAG;
2960 	} else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) &&
2961 	    (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) {
2962 		ixa->ixa_flags |= IXAF_DONTFRAG;
2963 	}
2964 
2965 	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4);
2966 	ixa->ixa_pktlen = msgdsize(mp);
2967 	ixa->ixa_protocol = outer4->ipha_protocol;
2968 
2969 	outer4->ipha_length = htons(ixa->ixa_pktlen);
2970 	return (mp);
2971 }
2972 
2973 /*
2974  * Insert an encapsulation limit destination option in the packet provided.
2975  * Always consumes the mp argument and returns a new mblk pointer.
2976  */
2977 static mblk_t *
2978 iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
2979     uint8_t limit)
2980 {
2981 	mblk_t			*newmp;
2982 	iptun_ipv6hdrs_t	*newouter6;
2983 
2984 	ASSERT(outer6->ip6_nxt == IPPROTO_IPV6);
2985 	ASSERT(mp->b_cont == NULL);
2986 
2987 	mp->b_rptr += sizeof (ip6_t);
2988 	newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED);
2989 	if (newmp == NULL) {
2990 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2991 		return (NULL);
2992 	}
2993 	newmp->b_wptr += sizeof (iptun_ipv6hdrs_t);
2994 	/* Copy the payload (Starting with the inner IPv6 header). */
2995 	bcopy(mp->b_rptr, newmp->b_wptr, MBLKL(mp));
2996 	newmp->b_wptr += MBLKL(mp);
2997 	newouter6 = (iptun_ipv6hdrs_t *)newmp->b_rptr;
2998 	/* Now copy the outer IPv6 header. */
2999 	bcopy(outer6, &newouter6->it6h_ip6h, sizeof (ip6_t));
3000 	newouter6->it6h_ip6h.ip6_nxt = IPPROTO_DSTOPTS;
3001 	newouter6->it6h_encaplim = iptun_encaplim_init;
3002 	newouter6->it6h_encaplim.iel_destopt.ip6d_nxt = outer6->ip6_nxt;
3003 	newouter6->it6h_encaplim.iel_telopt.ip6ot_encap_limit = limit;
3004 
3005 	/*
3006 	 * The payload length will be set at the end of
3007 	 * iptun_out_process_ipv6().
3008 	 */
3009 
3010 	freemsg(mp);
3011 	return (newmp);
3012 }
3013 
3014 /*
3015  * Process output packets with outer IPv6 headers.  Frees mp and bumps stats
3016  * on error.
3017  */
3018 static mblk_t *
3019 iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
3020     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
3021 {
3022 	uint8_t		*innerptr = (inner4 != NULL ?
3023 	    (uint8_t *)inner4 : (uint8_t *)inner6);
3024 	size_t		minmtu = iptun->iptun_typeinfo->iti_minmtu;
3025 	uint8_t		*limit, *configlimit;
3026 	uint32_t	offset;
3027 	iptun_ipv6hdrs_t *v6hdrs;
3028 
3029 	if (inner6 != NULL && iptun_find_encaplimit(mp, inner6, &limit)) {
3030 		/*
3031 		 * The inner packet is an IPv6 packet which itself contains an
3032 		 * encapsulation limit option.  The limit variable points to
3033 		 * the value in the embedded option.  Process the
3034 		 * encapsulation limit option as specified in RFC 2473.
3035 		 *
3036 		 * If limit is 0, then we've exceeded the limit and we need to
3037 		 * send back an ICMPv6 parameter problem message.
3038 		 *
3039 		 * If limit is > 0, then we decrement it by 1 and make sure
3040 		 * that the encapsulation limit option in the outer header
3041 		 * reflects that (adding an option if one isn't already
3042 		 * there).
3043 		 */
3044 		ASSERT(limit > mp->b_rptr && limit < mp->b_wptr);
3045 		if (*limit == 0) {
3046 			mp->b_rptr = (uint8_t *)inner6;
3047 			offset = limit - mp->b_rptr;
3048 			iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB,
3049 			    0, offset, ixa->ixa_tsl);
3050 			atomic_inc_64(&iptun->iptun_noxmtbuf);
3051 			return (NULL);
3052 		}
3053 
3054 		/*
3055 		 * The outer header requires an encapsulation limit option.
3056 		 * If there isn't one already, add one.
3057 		 */
3058 		if (iptun->iptun_encaplimit == 0) {
3059 			if ((mp = iptun_insert_encaplimit(iptun, mp, outer6,
3060 			    (*limit - 1))) == NULL)
3061 				return (NULL);
3062 			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3063 		} else {
3064 			/*
3065 			 * There is an existing encapsulation limit option in
3066 			 * the outer header.  If the inner encapsulation limit
3067 			 * is less than the configured encapsulation limit,
3068 			 * update the outer encapsulation limit to reflect
3069 			 * this lesser value.
3070 			 */
3071 			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3072 			configlimit =
3073 			    &v6hdrs->it6h_encaplim.iel_telopt.ip6ot_encap_limit;
3074 			if ((*limit - 1) < *configlimit)
3075 				*configlimit = (*limit - 1);
3076 		}
3077 		ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t);
3078 		ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt;
3079 	} else {
3080 		ixa->ixa_ip_hdr_length = sizeof (ip6_t);
3081 		ixa->ixa_protocol = outer6->ip6_nxt;
3082 	}
3083 	/*
3084 	 * See iptun_output_process_ipv4() why we allow fragmentation for
3085 	 * small packets
3086 	 */
3087 	if (mp->b_wptr - innerptr <= minmtu)
3088 		ixa->ixa_flags &= ~IXAF_DONTFRAG;
3089 	else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL))
3090 		ixa->ixa_flags |= IXAF_DONTFRAG;
3091 
3092 	ixa->ixa_pktlen = msgdsize(mp);
3093 	outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t));
3094 	return (mp);
3095 }
3096 
3097 /*
3098  * The IP tunneling MAC-type plugins have already done most of the header
3099  * processing and validity checks.  We are simply responsible for multiplexing
3100  * down to the ip module below us.
3101  */
3102 static void
3103 iptun_output(iptun_t *iptun, mblk_t *mp)
3104 {
3105 	conn_t	*connp = iptun->iptun_connp;
3106 	mblk_t	*newmp;
3107 	int	error;
3108 	ip_xmit_attr_t	*ixa;
3109 
3110 	ASSERT(mp->b_datap->db_type == M_DATA);
3111 
3112 	if (mp->b_cont != NULL) {
3113 		if ((newmp = msgpullup(mp, -1)) == NULL) {
3114 			iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
3115 			return;
3116 		}
3117 		freemsg(mp);
3118 		mp = newmp;
3119 	}
3120 
3121 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
3122 		iptun_output_6to4(iptun, mp);
3123 		return;
3124 	}
3125 
3126 	if (is_system_labeled()) {
3127 		/*
3128 		 * Since the label can be different meaning a potentially
3129 		 * different IRE,we always use a unique ip_xmit_attr_t.
3130 		 */
3131 		ixa = conn_get_ixa_exclusive(connp);
3132 	} else {
3133 		/*
3134 		 * If no other thread is using conn_ixa this just gets a
3135 		 * reference to conn_ixa. Otherwise we get a safe copy of
3136 		 * conn_ixa.
3137 		 */
3138 		ixa = conn_get_ixa(connp, B_FALSE);
3139 	}
3140 	if (ixa == NULL) {
3141 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3142 		return;
3143 	}
3144 
3145 	/*
3146 	 * In case we got a safe copy of conn_ixa, then we need
3147 	 * to fill in any pointers in it.
3148 	 */
3149 	if (ixa->ixa_ire == NULL) {
3150 		error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3151 		    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
3152 		    NULL, NULL, 0);
3153 		if (error != 0) {
3154 			if (ixa->ixa_ire != NULL &&
3155 			    (error == EHOSTUNREACH || error == ENETUNREACH)) {
3156 				/*
3157 				 * Let conn_ip_output/ire_send_noroute return
3158 				 * the error and send any local ICMP error.
3159 				 */
3160 				error = 0;
3161 			} else {
3162 				ixa_refrele(ixa);
3163 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3164 				return;
3165 			}
3166 		}
3167 	}
3168 
3169 	iptun_output_common(iptun, ixa, mp);
3170 	ixa_refrele(ixa);
3171 }
3172 
3173 /*
3174  * We use an ixa based on the last destination.
3175  */
3176 static void
3177 iptun_output_6to4(iptun_t *iptun, mblk_t *mp)
3178 {
3179 	conn_t		*connp = iptun->iptun_connp;
3180 	ipha_t		*outer4, *inner4;
3181 	ip6_t		*outer6, *inner6;
3182 	ip_xmit_attr_t	*ixa;
3183 	ip_xmit_attr_t	*oldixa;
3184 	int		error;
3185 	boolean_t	need_connect;
3186 	in6_addr_t	v6dst;
3187 
3188 	ASSERT(mp->b_cont == NULL);	/* Verified by iptun_output */
3189 
3190 	/* Make sure we set ipha_dst before we look at ipha_dst */
3191 
3192 	(void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6);
3193 	ASSERT(outer4 != NULL);
3194 	if (!iptun_out_process_6to4(iptun, outer4, inner6)) {
3195 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3196 		return;
3197 	}
3198 
3199 	if (is_system_labeled()) {
3200 		/*
3201 		 * Since the label can be different meaning a potentially
3202 		 * different IRE,we always use a unique ip_xmit_attr_t.
3203 		 */
3204 		ixa = conn_get_ixa_exclusive(connp);
3205 	} else {
3206 		/*
3207 		 * If no other thread is using conn_ixa this just gets a
3208 		 * reference to conn_ixa. Otherwise we get a safe copy of
3209 		 * conn_ixa.
3210 		 */
3211 		ixa = conn_get_ixa(connp, B_FALSE);
3212 	}
3213 	if (ixa == NULL) {
3214 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3215 		return;
3216 	}
3217 
3218 	mutex_enter(&connp->conn_lock);
3219 	if (connp->conn_v4lastdst == outer4->ipha_dst) {
3220 		need_connect = (ixa->ixa_ire == NULL);
3221 	} else {
3222 		/* In case previous destination was multirt */
3223 		ip_attr_newdst(ixa);
3224 
3225 		/*
3226 		 * We later update conn_ixa when we update conn_v4lastdst
3227 		 * which enables subsequent packets to avoid redoing
3228 		 * ip_attr_connect
3229 		 */
3230 		need_connect = B_TRUE;
3231 	}
3232 	mutex_exit(&connp->conn_lock);
3233 
3234 	/*
3235 	 * In case we got a safe copy of conn_ixa, or otherwise we don't
3236 	 * have a current ixa_ire, then we need to fill in any pointers in
3237 	 * the ixa.
3238 	 */
3239 	if (need_connect) {
3240 		IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst);
3241 
3242 		/* We handle IPsec in iptun_output_common */
3243 		error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3244 		    &v6dst, &v6dst, 0, NULL, NULL, 0);
3245 		if (error != 0) {
3246 			if (ixa->ixa_ire != NULL &&
3247 			    (error == EHOSTUNREACH || error == ENETUNREACH)) {
3248 				/*
3249 				 * Let conn_ip_output/ire_send_noroute return
3250 				 * the error and send any local ICMP error.
3251 				 */
3252 				error = 0;
3253 			} else {
3254 				ixa_refrele(ixa);
3255 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3256 				return;
3257 			}
3258 		}
3259 	}
3260 
3261 	iptun_output_common(iptun, ixa, mp);
3262 
3263 	/* Atomically replace conn_ixa and conn_v4lastdst */
3264 	mutex_enter(&connp->conn_lock);
3265 	if (connp->conn_v4lastdst != outer4->ipha_dst) {
3266 		/* Remember the dst which corresponds to conn_ixa */
3267 		connp->conn_v6lastdst = v6dst;
3268 		oldixa = conn_replace_ixa(connp, ixa);
3269 	} else {
3270 		oldixa = NULL;
3271 	}
3272 	mutex_exit(&connp->conn_lock);
3273 	ixa_refrele(ixa);
3274 	if (oldixa != NULL)
3275 		ixa_refrele(oldixa);
3276 }
3277 
3278 /*
3279  * Check the destination/label. Modifies *mpp by adding/removing CIPSO.
3280  *
3281  * We get the label from the message in order to honor the
3282  * ULPs/IPs choice of label. This will be NULL for forwarded
3283  * packets, neighbor discovery packets and some others.
3284  */
3285 static int
3286 iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa)
3287 {
3288 	cred_t	*cr;
3289 	int	adjust;
3290 	int	iplen;
3291 	int	err;
3292 	ts_label_t *effective_tsl = NULL;
3293 
3294 
3295 	ASSERT(is_system_labeled());
3296 
3297 	cr = msg_getcred(*mpp, NULL);
3298 	if (cr == NULL)
3299 		return (0);
3300 
3301 	/*
3302 	 * We need to start with a label based on the IP/ULP above us
3303 	 */
3304 	ip_xmit_attr_restore_tsl(ixa, cr);
3305 
3306 	/*
3307 	 * Need to update packet with any CIPSO option since
3308 	 * conn_ip_output doesn't do that.
3309 	 */
3310 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3311 		ipha_t *ipha;
3312 
3313 		ipha = (ipha_t *)(*mpp)->b_rptr;
3314 		iplen = ntohs(ipha->ipha_length);
3315 		err = tsol_check_label_v4(ixa->ixa_tsl,
3316 		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3317 		    ixa->ixa_ipst, &effective_tsl);
3318 		if (err != 0)
3319 			return (err);
3320 
3321 		ipha = (ipha_t *)(*mpp)->b_rptr;
3322 		adjust = (int)ntohs(ipha->ipha_length) - iplen;
3323 	} else {
3324 		ip6_t *ip6h;
3325 
3326 		ip6h = (ip6_t *)(*mpp)->b_rptr;
3327 		iplen = ntohs(ip6h->ip6_plen);
3328 
3329 		err = tsol_check_label_v6(ixa->ixa_tsl,
3330 		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3331 		    ixa->ixa_ipst, &effective_tsl);
3332 		if (err != 0)
3333 			return (err);
3334 
3335 		ip6h = (ip6_t *)(*mpp)->b_rptr;
3336 		adjust = (int)ntohs(ip6h->ip6_plen) - iplen;
3337 	}
3338 
3339 	if (effective_tsl != NULL) {
3340 		/* Update the label */
3341 		ip_xmit_attr_replace_tsl(ixa, effective_tsl);
3342 	}
3343 	ixa->ixa_pktlen += adjust;
3344 	ixa->ixa_ip_hdr_length += adjust;
3345 	return (0);
3346 }
3347 
3348 
3349 static void
3350 iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp)
3351 {
3352 	ipsec_tun_pol_t	*itp = iptun->iptun_itp;
3353 	int		outer_hlen;
3354 	mblk_t		*newmp;
3355 	ipha_t		*outer4, *inner4;
3356 	ip6_t		*outer6, *inner6;
3357 	int		error;
3358 	boolean_t	update_pktlen;
3359 
3360 	ASSERT(ixa->ixa_ire != NULL);
3361 
3362 	outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6,
3363 	    &inner6);
3364 	if (outer_hlen == 0) {
3365 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3366 		return;
3367 	}
3368 
3369 	/* Save IXAF_DONTFRAG value */
3370 	iaflags_t dontfrag = ixa->ixa_flags & IXAF_DONTFRAG;
3371 
3372 	/* Perform header processing. */
3373 	if (outer4 != NULL) {
3374 		mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6,
3375 		    ixa);
3376 	} else {
3377 		mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6,
3378 		    ixa);
3379 	}
3380 	if (mp == NULL)
3381 		return;
3382 
3383 	/*
3384 	 * Let's hope the compiler optimizes this with "branch taken".
3385 	 */
3386 	if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
3387 		/* This updates the ip_xmit_attr_t */
3388 		mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
3389 		    outer6, outer_hlen, ixa);
3390 		if (mp == NULL) {
3391 			atomic_inc_64(&iptun->iptun_oerrors);
3392 			return;
3393 		}
3394 		if (is_system_labeled()) {
3395 			/*
3396 			 * Might change the packet by adding/removing CIPSO.
3397 			 * After this caller inner* and outer* and outer_hlen
3398 			 * might be invalid.
3399 			 */
3400 			error = iptun_output_check_label(&mp, ixa);
3401 			if (error != 0) {
3402 				ip2dbg(("label check failed (%d)\n", error));
3403 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3404 				return;
3405 			}
3406 		}
3407 
3408 		/*
3409 		 * ipsec_tun_outbound() returns a chain of tunneled IP
3410 		 * fragments linked with b_next (or a single message if the
3411 		 * tunneled packet wasn't a fragment).
3412 		 * If fragcache returned a list then we need to update
3413 		 * ixa_pktlen for all packets in the list.
3414 		 */
3415 		update_pktlen = (mp->b_next != NULL);
3416 
3417 		/*
3418 		 * Otherwise, we're good to go.  The ixa has been updated with
3419 		 * instructions for outbound IPsec processing.
3420 		 */
3421 		for (newmp = mp; newmp != NULL; newmp = mp) {
3422 			size_t minmtu = iptun->iptun_typeinfo->iti_minmtu;
3423 
3424 			atomic_inc_64(&iptun->iptun_opackets);
3425 			atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3426 			mp = mp->b_next;
3427 			newmp->b_next = NULL;
3428 
3429 			/*
3430 			 * The IXAF_DONTFRAG flag is global, but there is
3431 			 * a chain here.  Check if we're really already
3432 			 * smaller than the minimum allowed MTU and reset here
3433 			 * appropriately.  Otherwise one small packet can kill
3434 			 * the whole chain's path mtu discovery.
3435 			 * In addition, update the pktlen to the length of
3436 			 * the actual packet being processed.
3437 			 */
3438 			if (update_pktlen) {
3439 				ixa->ixa_pktlen = msgdsize(newmp);
3440 				if (ixa->ixa_pktlen <= minmtu)
3441 					ixa->ixa_flags &= ~IXAF_DONTFRAG;
3442 			}
3443 
3444 			atomic_inc_64(&iptun->iptun_opackets);
3445 			atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3446 
3447 			error = conn_ip_output(newmp, ixa);
3448 
3449 			/* Restore IXAF_DONTFRAG value */
3450 			ixa->ixa_flags |= dontfrag;
3451 
3452 			if (error == EMSGSIZE) {
3453 				/* IPsec policy might have changed */
3454 				(void) iptun_update_mtu(iptun, ixa, 0);
3455 			}
3456 		}
3457 	} else {
3458 		/*
3459 		 * The ip module will potentially apply global policy to the
3460 		 * packet in its output path if there's no active tunnel
3461 		 * policy.
3462 		 */
3463 		ASSERT(ixa->ixa_ipsec_policy == NULL);
3464 		mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa);
3465 		if (mp == NULL) {
3466 			atomic_inc_64(&iptun->iptun_oerrors);
3467 			return;
3468 		}
3469 		if (is_system_labeled()) {
3470 			/*
3471 			 * Might change the packet by adding/removing CIPSO.
3472 			 * After this caller inner* and outer* and outer_hlen
3473 			 * might be invalid.
3474 			 */
3475 			error = iptun_output_check_label(&mp, ixa);
3476 			if (error != 0) {
3477 				ip2dbg(("label check failed (%d)\n", error));
3478 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3479 				return;
3480 			}
3481 		}
3482 
3483 		atomic_inc_64(&iptun->iptun_opackets);
3484 		atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3485 
3486 		error = conn_ip_output(mp, ixa);
3487 		if (error == EMSGSIZE) {
3488 			/* IPsec policy might have changed */
3489 			(void) iptun_update_mtu(iptun, ixa, 0);
3490 		}
3491 	}
3492 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE)
3493 		ipsec_out_release_refs(ixa);
3494 }
3495 
3496 static mac_callbacks_t iptun_m_callbacks = {
3497 	.mc_callbacks	= (MC_SETPROP | MC_GETPROP | MC_PROPINFO),
3498 	.mc_getstat	= iptun_m_getstat,
3499 	.mc_start	= iptun_m_start,
3500 	.mc_stop	= iptun_m_stop,
3501 	.mc_setpromisc	= iptun_m_setpromisc,
3502 	.mc_multicst	= iptun_m_multicst,
3503 	.mc_unicst	= iptun_m_unicst,
3504 	.mc_tx		= iptun_m_tx,
3505 	.mc_reserved	= NULL,
3506 	.mc_setprop	= iptun_m_setprop,
3507 	.mc_getprop	= iptun_m_getprop,
3508 	.mc_propinfo	= iptun_m_propinfo
3509 };
3510