xref: /illumos-gate/usr/src/uts/common/inet/iptun/iptun.c (revision 33efde4275d24731ef87927237b0ffb0630b6b2d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2016, Joyent, Inc. All rights reserved.
24  */
25 
/*
 * iptun - IP Tunneling Driver
 *
 * This module is a GLDv3 driver that implements virtual datalinks over IP
 * (a.k.a. IP tunneling).  The datalinks are managed through a dld ioctl
 * interface (see iptun_ctl.c), and registered with GLDv3 using
 * mac_register().  It implements the logic for various forms of IP (IPv4 or
 * IPv6) encapsulation within IP (IPv4 or IPv6) by interacting with the ip
 * module below it.  Each virtual IP tunnel datalink has a conn_t associated
 * with it representing the "outer" IP connection.
 *
 * The module implements the following locking semantics:
 *
 * Lookups and deletions in iptun_hash are synchronized using iptun_hash_lock.
 * See comments above iptun_hash_lock for details.
 *
 * No locks are ever held while calling up to GLDv3.  The general architecture
 * of GLDv3 requires this, as the mac perimeter (essentially a lock) for a
 * given link will be held while making downcalls (iptun_m_*() callbacks).
 * Because we need to hold locks while handling downcalls, holding these locks
 * while issuing upcalls results in deadlock scenarios.  See the block comment
 * above iptun_task_cb() for details on how we safely issue upcalls without
 * holding any locks.
 *
 * The contents of each iptun_t are protected by its iptun_lock mutex, which
 * is acquired in iptun_enter() (called by iptun_enter_by_linkid()), and
 * released in iptun_exit().
 *
 * See comments in iptun_delete() and iptun_free() for details on how the
 * iptun_t is deleted safely.
 */
57 
58 #include <sys/types.h>
59 #include <sys/kmem.h>
60 #include <sys/errno.h>
61 #include <sys/modhash.h>
62 #include <sys/list.h>
63 #include <sys/strsun.h>
64 #include <sys/file.h>
65 #include <sys/systm.h>
66 #include <sys/tihdr.h>
67 #include <sys/param.h>
68 #include <sys/mac_provider.h>
69 #include <sys/mac_ipv4.h>
70 #include <sys/mac_ipv6.h>
71 #include <sys/mac_6to4.h>
72 #include <sys/tsol/tnet.h>
73 #include <sys/sunldi.h>
74 #include <netinet/in.h>
75 #include <netinet/ip6.h>
76 #include <inet/ip.h>
77 #include <inet/ip_ire.h>
78 #include <inet/ipsec_impl.h>
79 #include <sys/tsol/label.h>
80 #include <sys/tsol/tnet.h>
81 #include <inet/iptun.h>
82 #include "iptun_impl.h"
83 
/*
 * Do the tunnel type and address family match?  IPv4 and 6to4 tunnels take
 * AF_INET addresses; IPv6 tunnels take AF_INET6 addresses.  Both arguments
 * are parenthesized in the expansion so that arbitrary expressions (for
 * example a conditional expression) may be passed; note that each argument
 * may be evaluated more than once.
 */
#define	IPTUN_ADDR_MATCH(iptun_type, family)				\
	(((iptun_type) == IPTUN_TYPE_IPV4 && (family) == AF_INET) ||	\
	((iptun_type) == IPTUN_TYPE_IPV6 && (family) == AF_INET6) ||	\
	((iptun_type) == IPTUN_TYPE_6TO4 && (family) == AF_INET))

/* Convert a lookup key (e.g. a datalink ID) into a mod_hash key. */
#define	IPTUN_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* Bounds on the tunnel link MTU for each outer IP version. */
#define	IPTUN_MIN_IPV4_MTU	576		/* ip.h still uses 68 (!) */
#define	IPTUN_MIN_IPV6_MTU	IPV6_MIN_MTU
#define	IPTUN_MAX_IPV4_MTU	(IP_MAXPACKET - sizeof (ipha_t))
#define	IPTUN_MAX_IPV6_MTU	(IP_MAXPACKET - sizeof (ip6_t) -	\
				    sizeof (iptun_encaplim_t))

/* Accepted range for the hoplimit link property. */
#define	IPTUN_MIN_HOPLIMIT	1
#define	IPTUN_MAX_HOPLIMIT	UINT8_MAX

/* Accepted range for the encaplimit link property. */
#define	IPTUN_MIN_ENCAPLIMIT	0
#define	IPTUN_MAX_ENCAPLIMIT	UINT8_MAX

/* The IPsec request bits that indicate a non-"clear-all" request. */
#define	IPTUN_IPSEC_REQ_MASK	(IPSEC_PREF_REQUIRED | IPSEC_PREF_NEVER)
105 
/*
 * Template for the IPv6 tunnel encapsulation limit option.
 * iptun_headergen() copies this structure into the tunnel's IPv6 header
 * template and then overwrites the encapsulation limit field with the
 * tunnel's configured value.
 *
 * NOTE(review): the initializer relies on brace elision; the mapping of the
 * flat values below onto the members of iptun_encaplim_t should be confirmed
 * against iptun_impl.h.
 */
static iptun_encaplim_t	iptun_encaplim_init = {
	{ IPPROTO_NONE, 0 },
	IP6OPT_TUNNEL_LIMIT,
	1,
	IPTUN_DEFAULT_ENCAPLIMIT,	/* filled in with actual value later */
	IP6OPT_PADN,
	1,
	0
};
115 
/*
 * Table containing per-iptun-type information.
 * Since IPv6 can run over all of these we have the IPv6 min as the min MTU.
 *
 * Each row gives, in order: the tunnel type, the mac plugin identifier, the
 * outer IP version, the minimum and maximum link MTU, and a boolean.
 * NOTE(review): the final boolean presumably corresponds to iti_hasraddr
 * (whether the tunnel type takes a remote address), and the
 * IPTUN_TYPE_UNKNOWN row presumably terminates the table; confirm against
 * iptun_impl.h and the table's lookup code.
 */
static iptun_typeinfo_t	iptun_type_table[] = {
	{ IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION,
	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV4_MTU,	B_TRUE },
	{ IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION,
	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV6_MTU,	B_TRUE },
	{ IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION,
	    IPTUN_MIN_IPV6_MTU,	IPTUN_MAX_IPV4_MTU,	B_FALSE },
	{ IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE }
};
129 
/*
 * iptun_hash is an iptun_t lookup table by link ID protected by
 * iptun_hash_lock.  While the hash table's integrity is maintained via
 * internal locking in the mod_hash_*() functions, we need additional locking
 * so that an iptun_t cannot be deleted after a hash lookup has returned an
 * iptun_t and before iptun_lock has been entered.  As such, we use
 * iptun_hash_lock when doing lookups and removals from iptun_hash.
 */
mod_hash_t	*iptun_hash;
static kmutex_t	iptun_hash_lock;

static uint_t	iptun_tunnelcount;	/* total for all stacks */
kmem_cache_t	*iptun_cache;
/* Task queue on which iptun_task_dispatch() schedules iptun_task_cb(). */
ddi_taskq_t 	*iptun_taskq;
144 
/*
 * The kinds of deferred upcalls into the mac module that iptun_task_cb()
 * knows how to perform.
 */
typedef enum {
	IPTUN_TASK_MTU_UPDATE,	/* tell mac about new tunnel link MTU */
	IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */
	IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */
	IPTUN_TASK_LINK_UPDATE,	/* tell mac about new link state */
	IPTUN_TASK_PDATA_UPDATE	/* tell mac about updated plugin data */
} iptun_task_t;

/*
 * Argument handed to iptun_task_cb().  It is allocated and filled in by
 * iptun_task_dispatch() and freed by iptun_task_cb().  The tunnel is
 * identified by link ID rather than by pointer; iptun_task_cb() looks the
 * tunnel up again when the task runs.
 */
typedef struct iptun_task_data_s {
	iptun_task_t	itd_task;
	datalink_id_t	itd_linkid;
} iptun_task_data_t;
157 
/*
 * Forward declarations for file-local functions that are referenced before
 * they are defined.
 */
static void iptun_task_dispatch(iptun_t *, iptun_task_t);
static int iptun_enter(iptun_t *);
static void iptun_exit(iptun_t *);
static void iptun_headergen(iptun_t *, boolean_t);
static void iptun_drop_pkt(mblk_t *, uint64_t *);
static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *);
static void iptun_output(iptun_t *, mblk_t *);
static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
static void iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *);

static void iptun_output_6to4(iptun_t *, mblk_t *);
static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *);
static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    ip_recv_attr_t *);

static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    ixa_notify_arg_t);

/* NOTE(review): presumably initialized later in this file; confirm. */
static mac_callbacks_t iptun_m_callbacks;
181 
182 static int
iptun_m_getstat(void * arg,uint_t stat,uint64_t * val)183 iptun_m_getstat(void *arg, uint_t stat, uint64_t *val)
184 {
185 	iptun_t	*iptun = arg;
186 	int	err = 0;
187 
188 	switch (stat) {
189 	case MAC_STAT_IERRORS:
190 		*val = iptun->iptun_ierrors;
191 		break;
192 	case MAC_STAT_OERRORS:
193 		*val = iptun->iptun_oerrors;
194 		break;
195 	case MAC_STAT_RBYTES:
196 		*val = iptun->iptun_rbytes;
197 		break;
198 	case MAC_STAT_IPACKETS:
199 		*val = iptun->iptun_ipackets;
200 		break;
201 	case MAC_STAT_OBYTES:
202 		*val = iptun->iptun_obytes;
203 		break;
204 	case MAC_STAT_OPACKETS:
205 		*val = iptun->iptun_opackets;
206 		break;
207 	case MAC_STAT_NORCVBUF:
208 		*val = iptun->iptun_norcvbuf;
209 		break;
210 	case MAC_STAT_NOXMTBUF:
211 		*val = iptun->iptun_noxmtbuf;
212 		break;
213 	default:
214 		err = ENOTSUP;
215 	}
216 
217 	return (err);
218 }
219 
220 static int
iptun_m_start(void * arg)221 iptun_m_start(void *arg)
222 {
223 	iptun_t	*iptun = arg;
224 	int	err;
225 
226 	if ((err = iptun_enter(iptun)) == 0) {
227 		iptun->iptun_flags |= IPTUN_MAC_STARTED;
228 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
229 		iptun_exit(iptun);
230 	}
231 	return (err);
232 }
233 
234 static void
iptun_m_stop(void * arg)235 iptun_m_stop(void *arg)
236 {
237 	iptun_t *iptun = arg;
238 
239 	if (iptun_enter(iptun) == 0) {
240 		iptun->iptun_flags &= ~IPTUN_MAC_STARTED;
241 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
242 		iptun_exit(iptun);
243 	}
244 }
245 
246 /*
247  * iptun_m_setpromisc() does nothing and always succeeds.  This is because a
248  * tunnel data-link only ever receives packets that are destined exclusively
249  * for the local address of the tunnel.
250  */
251 /* ARGSUSED */
252 static int
iptun_m_setpromisc(void * arg,boolean_t on)253 iptun_m_setpromisc(void *arg, boolean_t on)
254 {
255 	return (0);
256 }
257 
258 /* ARGSUSED */
259 static int
iptun_m_multicst(void * arg,boolean_t add,const uint8_t * addrp)260 iptun_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
261 {
262 	return (ENOTSUP);
263 }
264 
265 /*
266  * iptun_m_unicst() sets the local address.
267  */
268 /* ARGSUSED */
269 static int
iptun_m_unicst(void * arg,const uint8_t * addrp)270 iptun_m_unicst(void *arg, const uint8_t *addrp)
271 {
272 	iptun_t			*iptun = arg;
273 	int			err;
274 	struct sockaddr_storage	ss;
275 	struct sockaddr_in	*sin;
276 	struct sockaddr_in6	*sin6;
277 
278 	if ((err = iptun_enter(iptun)) == 0) {
279 		switch (iptun->iptun_typeinfo->iti_ipvers) {
280 		case IPV4_VERSION:
281 			sin = (struct sockaddr_in *)&ss;
282 			sin->sin_family = AF_INET;
283 			bcopy(addrp, &sin->sin_addr, sizeof (in_addr_t));
284 			break;
285 		case IPV6_VERSION:
286 			sin6 = (struct sockaddr_in6 *)&ss;
287 			sin6->sin6_family = AF_INET6;
288 			bcopy(addrp, &sin6->sin6_addr, sizeof (in6_addr_t));
289 			break;
290 		default:
291 			ASSERT(0);
292 		}
293 		err = iptun_setladdr(iptun, &ss);
294 		iptun_exit(iptun);
295 	}
296 	return (err);
297 }
298 
299 static mblk_t *
iptun_m_tx(void * arg,mblk_t * mpchain)300 iptun_m_tx(void *arg, mblk_t *mpchain)
301 {
302 	mblk_t	*mp, *nmp;
303 	iptun_t	*iptun = arg;
304 
305 	if (!IS_IPTUN_RUNNING(iptun)) {
306 		iptun_drop_pkt(mpchain, &iptun->iptun_noxmtbuf);
307 		return (NULL);
308 	}
309 
310 	for (mp = mpchain; mp != NULL; mp = nmp) {
311 		nmp = mp->b_next;
312 		mp->b_next = NULL;
313 		iptun_output(iptun, mp);
314 	}
315 
316 	return (NULL);
317 }
318 
319 /* ARGSUSED */
320 static int
iptun_m_setprop(void * barg,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,const void * pr_val)321 iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
322     uint_t pr_valsize, const void *pr_val)
323 {
324 	iptun_t		*iptun = barg;
325 	uint32_t	value = *(uint32_t *)pr_val;
326 	int		err;
327 
328 	/*
329 	 * We need to enter this iptun_t since we'll be modifying the outer
330 	 * header.
331 	 */
332 	if ((err = iptun_enter(iptun)) != 0)
333 		return (err);
334 
335 	switch (pr_num) {
336 	case MAC_PROP_IPTUN_HOPLIMIT:
337 		if (value < IPTUN_MIN_HOPLIMIT || value > IPTUN_MAX_HOPLIMIT) {
338 			err = EINVAL;
339 			break;
340 		}
341 		if (value != iptun->iptun_hoplimit) {
342 			iptun->iptun_hoplimit = (uint8_t)value;
343 			iptun_headergen(iptun, B_TRUE);
344 		}
345 		break;
346 	case MAC_PROP_IPTUN_ENCAPLIMIT:
347 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6 ||
348 		    value > IPTUN_MAX_ENCAPLIMIT) {
349 			err = EINVAL;
350 			break;
351 		}
352 		if (value != iptun->iptun_encaplimit) {
353 			iptun->iptun_encaplimit = (uint8_t)value;
354 			iptun_headergen(iptun, B_TRUE);
355 		}
356 		break;
357 	case MAC_PROP_MTU: {
358 		uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);
359 
360 		if (value < iptun->iptun_typeinfo->iti_minmtu ||
361 		    value > maxmtu) {
362 			err = EINVAL;
363 			break;
364 		}
365 		iptun->iptun_flags |= IPTUN_FIXED_MTU;
366 		if (value != iptun->iptun_mtu) {
367 			iptun->iptun_mtu = value;
368 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
369 		}
370 		break;
371 	}
372 	default:
373 		err = EINVAL;
374 	}
375 	iptun_exit(iptun);
376 	return (err);
377 }
378 
379 /* ARGSUSED */
380 static int
iptun_m_getprop(void * barg,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,void * pr_val)381 iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
382     uint_t pr_valsize, void *pr_val)
383 {
384 	iptun_t			*iptun = barg;
385 	int			err;
386 
387 	if ((err = iptun_enter(iptun)) != 0)
388 		return (err);
389 
390 	switch (pr_num) {
391 	case MAC_PROP_IPTUN_HOPLIMIT:
392 		ASSERT(pr_valsize >= sizeof (uint32_t));
393 		*(uint32_t *)pr_val = iptun->iptun_hoplimit;
394 		break;
395 
396 	case MAC_PROP_IPTUN_ENCAPLIMIT:
397 		*(uint32_t *)pr_val = iptun->iptun_encaplimit;
398 		break;
399 	default:
400 		err = ENOTSUP;
401 	}
402 
403 	iptun_exit(iptun);
404 	return (err);
405 }
406 
407 /* ARGSUSED */
408 static void
iptun_m_propinfo(void * barg,const char * pr_name,mac_prop_id_t pr_num,mac_prop_info_handle_t prh)409 iptun_m_propinfo(void *barg, const char *pr_name, mac_prop_id_t pr_num,
410     mac_prop_info_handle_t prh)
411 {
412 	iptun_t			*iptun = barg;
413 
414 	switch (pr_num) {
415 	case MAC_PROP_IPTUN_HOPLIMIT:
416 		mac_prop_info_set_range_uint32(prh,
417 		    IPTUN_MIN_HOPLIMIT, IPTUN_MAX_HOPLIMIT);
418 		mac_prop_info_set_default_uint32(prh, IPTUN_DEFAULT_HOPLIMIT);
419 		break;
420 
421 	case MAC_PROP_IPTUN_ENCAPLIMIT:
422 		if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6)
423 			break;
424 		mac_prop_info_set_range_uint32(prh,
425 		    IPTUN_MIN_ENCAPLIMIT, IPTUN_MAX_ENCAPLIMIT);
426 		mac_prop_info_set_default_uint32(prh, IPTUN_DEFAULT_ENCAPLIMIT);
427 		break;
428 	case MAC_PROP_MTU:
429 		mac_prop_info_set_range_uint32(prh,
430 		    iptun->iptun_typeinfo->iti_minmtu,
431 		    iptun_get_maxmtu(iptun, NULL, 0));
432 		break;
433 	}
434 }
435 
436 uint_t
iptun_count(void)437 iptun_count(void)
438 {
439 	return (iptun_tunnelcount);
440 }
441 
442 /*
443  * Enter an iptun_t exclusively.  This is essentially just a mutex, but we
444  * don't allow iptun_enter() to succeed on a tunnel if it's in the process of
445  * being deleted.
446  */
447 static int
iptun_enter(iptun_t * iptun)448 iptun_enter(iptun_t *iptun)
449 {
450 	mutex_enter(&iptun->iptun_lock);
451 	while (iptun->iptun_flags & IPTUN_DELETE_PENDING)
452 		cv_wait(&iptun->iptun_enter_cv, &iptun->iptun_lock);
453 	if (iptun->iptun_flags & IPTUN_CONDEMNED) {
454 		mutex_exit(&iptun->iptun_lock);
455 		return (ENOENT);
456 	}
457 	return (0);
458 }
459 
460 /*
461  * Exit the tunnel entered in iptun_enter().
462  */
463 static void
iptun_exit(iptun_t * iptun)464 iptun_exit(iptun_t *iptun)
465 {
466 	mutex_exit(&iptun->iptun_lock);
467 }
468 
469 /*
470  * Enter the IP tunnel instance by datalink ID.
471  */
472 static int
iptun_enter_by_linkid(datalink_id_t linkid,iptun_t ** iptun)473 iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun)
474 {
475 	int err;
476 
477 	mutex_enter(&iptun_hash_lock);
478 	if (mod_hash_find(iptun_hash, IPTUN_HASH_KEY(linkid),
479 	    (mod_hash_val_t *)iptun) == 0)
480 		err = iptun_enter(*iptun);
481 	else
482 		err = ENOENT;
483 	if (err != 0)
484 		*iptun = NULL;
485 	mutex_exit(&iptun_hash_lock);
486 	return (err);
487 }
488 
489 /*
490  * Handle tasks that were deferred through the iptun_taskq because they require
491  * calling up to the mac module, and we can't call up to the mac module while
492  * holding locks.
493  *
494  * This is tricky to get right without introducing race conditions and
495  * deadlocks with the mac module, as we cannot issue an upcall while in the
496  * iptun_t.  The reason is that upcalls may try and enter the mac perimeter,
497  * while iptun callbacks (such as iptun_m_setprop()) called from the mac
498  * module will already have the perimeter held, and will then try and enter
499  * the iptun_t.  You can see the lock ordering problem with this; this will
500  * deadlock.
501  *
502  * The safe way to do this is to enter the iptun_t in question and copy the
503  * information we need out of it so that we can exit it and know that the
504  * information being passed up to the upcalls won't be subject to modification
505  * by other threads.  The problem now is that we need to exit it prior to
506  * issuing the upcall, but once we do this, a thread could come along and
507  * delete the iptun_t and thus the mac handle required to issue the upcall.
508  * To prevent this, we set the IPTUN_UPCALL_PENDING flag prior to exiting the
509  * iptun_t.  This flag is the condition associated with iptun_upcall_cv, which
510  * iptun_delete() will cv_wait() on.  When the upcall completes, we clear
511  * IPTUN_UPCALL_PENDING and cv_signal() any potentially waiting
512  * iptun_delete().  We can thus still safely use iptun->iptun_mh after having
513  * exited the iptun_t.
514  */
515 static void
iptun_task_cb(void * arg)516 iptun_task_cb(void *arg)
517 {
518 	iptun_task_data_t	*itd = arg;
519 	iptun_task_t		task = itd->itd_task;
520 	datalink_id_t		linkid = itd->itd_linkid;
521 	iptun_t			*iptun;
522 	uint32_t		mtu;
523 	iptun_addr_t		addr;
524 	link_state_t		linkstate;
525 	size_t			header_size;
526 	iptun_header_t		header;
527 
528 	kmem_free(itd, sizeof (*itd));
529 
530 	/*
531 	 * Note that if the lookup fails, it's because the tunnel was deleted
532 	 * between the time the task was dispatched and now.  That isn't an
533 	 * error.
534 	 */
535 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
536 		return;
537 
538 	iptun->iptun_flags |= IPTUN_UPCALL_PENDING;
539 
540 	switch (task) {
541 	case IPTUN_TASK_MTU_UPDATE:
542 		mtu = iptun->iptun_mtu;
543 		break;
544 	case IPTUN_TASK_LADDR_UPDATE:
545 		addr = iptun->iptun_laddr;
546 		break;
547 	case IPTUN_TASK_RADDR_UPDATE:
548 		addr = iptun->iptun_raddr;
549 		break;
550 	case IPTUN_TASK_LINK_UPDATE:
551 		linkstate = IS_IPTUN_RUNNING(iptun) ?
552 		    LINK_STATE_UP : LINK_STATE_DOWN;
553 		break;
554 	case IPTUN_TASK_PDATA_UPDATE:
555 		header_size = iptun->iptun_header_size;
556 		header = iptun->iptun_header;
557 		break;
558 	default:
559 		ASSERT(0);
560 	}
561 
562 	iptun_exit(iptun);
563 
564 	switch (task) {
565 	case IPTUN_TASK_MTU_UPDATE:
566 		(void) mac_maxsdu_update(iptun->iptun_mh, mtu);
567 		break;
568 	case IPTUN_TASK_LADDR_UPDATE:
569 		mac_unicst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
570 		break;
571 	case IPTUN_TASK_RADDR_UPDATE:
572 		mac_dst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
573 		break;
574 	case IPTUN_TASK_LINK_UPDATE:
575 		mac_link_update(iptun->iptun_mh, linkstate);
576 		break;
577 	case IPTUN_TASK_PDATA_UPDATE:
578 		if (mac_pdata_update(iptun->iptun_mh,
579 		    header_size == 0 ? NULL : &header, header_size) != 0)
580 			atomic_inc_64(&iptun->iptun_taskq_fail);
581 		break;
582 	}
583 
584 	mutex_enter(&iptun->iptun_lock);
585 	iptun->iptun_flags &= ~IPTUN_UPCALL_PENDING;
586 	cv_signal(&iptun->iptun_upcall_cv);
587 	mutex_exit(&iptun->iptun_lock);
588 }
589 
590 static void
iptun_task_dispatch(iptun_t * iptun,iptun_task_t iptun_task)591 iptun_task_dispatch(iptun_t *iptun, iptun_task_t iptun_task)
592 {
593 	iptun_task_data_t *itd;
594 
595 	itd = kmem_alloc(sizeof (*itd), KM_NOSLEEP);
596 	if (itd == NULL) {
597 		atomic_inc_64(&iptun->iptun_taskq_fail);
598 		return;
599 	}
600 	itd->itd_task = iptun_task;
601 	itd->itd_linkid = iptun->iptun_linkid;
602 	if (ddi_taskq_dispatch(iptun_taskq, iptun_task_cb, itd, DDI_NOSLEEP)) {
603 		atomic_inc_64(&iptun->iptun_taskq_fail);
604 		kmem_free(itd, sizeof (*itd));
605 	}
606 }
607 
608 /*
609  * Convert an iptun_addr_t to sockaddr_storage.
610  */
611 static void
iptun_getaddr(iptun_addr_t * iptun_addr,struct sockaddr_storage * ss)612 iptun_getaddr(iptun_addr_t *iptun_addr, struct sockaddr_storage *ss)
613 {
614 	struct sockaddr_in	*sin;
615 	struct sockaddr_in6	*sin6;
616 
617 	bzero(ss, sizeof (*ss));
618 	switch (iptun_addr->ia_family) {
619 	case AF_INET:
620 		sin = (struct sockaddr_in *)ss;
621 		sin->sin_addr.s_addr = iptun_addr->ia_addr.iau_addr4;
622 		break;
623 	case AF_INET6:
624 		sin6 = (struct sockaddr_in6 *)ss;
625 		sin6->sin6_addr = iptun_addr->ia_addr.iau_addr6;
626 		break;
627 	default:
628 		ASSERT(0);
629 	}
630 	ss->ss_family = iptun_addr->ia_family;
631 }
632 
633 /*
634  * General purpose function to set an IP tunnel source or destination address.
635  */
636 static int
iptun_setaddr(iptun_type_t iptun_type,iptun_addr_t * iptun_addr,const struct sockaddr_storage * ss)637 iptun_setaddr(iptun_type_t iptun_type, iptun_addr_t *iptun_addr,
638     const struct sockaddr_storage *ss)
639 {
640 	if (!IPTUN_ADDR_MATCH(iptun_type, ss->ss_family))
641 		return (EINVAL);
642 
643 	switch (ss->ss_family) {
644 	case AF_INET: {
645 		struct sockaddr_in *sin = (struct sockaddr_in *)ss;
646 
647 		if ((sin->sin_addr.s_addr == INADDR_ANY) ||
648 		    (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
649 		    CLASSD(sin->sin_addr.s_addr)) {
650 			return (EADDRNOTAVAIL);
651 		}
652 		iptun_addr->ia_addr.iau_addr4 = sin->sin_addr.s_addr;
653 		break;
654 	}
655 	case AF_INET6: {
656 		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
657 
658 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
659 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) ||
660 		    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
661 			return (EADDRNOTAVAIL);
662 		}
663 		iptun_addr->ia_addr.iau_addr6 = sin6->sin6_addr;
664 		break;
665 	}
666 	default:
667 		return (EAFNOSUPPORT);
668 	}
669 	iptun_addr->ia_family = ss->ss_family;
670 	return (0);
671 }
672 
673 static int
iptun_setladdr(iptun_t * iptun,const struct sockaddr_storage * laddr)674 iptun_setladdr(iptun_t *iptun, const struct sockaddr_storage *laddr)
675 {
676 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
677 	    &iptun->iptun_laddr, laddr));
678 }
679 
680 static int
iptun_setraddr(iptun_t * iptun,const struct sockaddr_storage * raddr)681 iptun_setraddr(iptun_t *iptun, const struct sockaddr_storage *raddr)
682 {
683 	if (!(iptun->iptun_typeinfo->iti_hasraddr))
684 		return (EINVAL);
685 	return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
686 	    &iptun->iptun_raddr, raddr));
687 }
688 
689 static boolean_t
iptun_canbind(iptun_t * iptun)690 iptun_canbind(iptun_t *iptun)
691 {
692 	/*
693 	 * A tunnel may bind when its source address has been set, and if its
694 	 * tunnel type requires one, also its destination address.
695 	 */
696 	return ((iptun->iptun_flags & IPTUN_LADDR) &&
697 	    ((iptun->iptun_flags & IPTUN_RADDR) ||
698 	    !(iptun->iptun_typeinfo->iti_hasraddr)));
699 }
700 
701 /*
702  * Verify that the local address is valid, and insert in the fanout
703  */
704 static int
iptun_bind(iptun_t * iptun)705 iptun_bind(iptun_t *iptun)
706 {
707 	conn_t			*connp = iptun->iptun_connp;
708 	int			error = 0;
709 	ip_xmit_attr_t		*ixa;
710 	ip_xmit_attr_t		*oldixa;
711 	iulp_t			uinfo;
712 	ip_stack_t		*ipst = connp->conn_netstack->netstack_ip;
713 
714 	/*
715 	 * Get an exclusive ixa for this thread.
716 	 * We defer updating conn_ixa until later to handle any concurrent
717 	 * conn_ixa_cleanup thread.
718 	 */
719 	ixa = conn_get_ixa(connp, B_FALSE);
720 	if (ixa == NULL)
721 		return (ENOMEM);
722 
723 	/* We create PMTU state including for 6to4 */
724 	ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
725 
726 	ASSERT(iptun_canbind(iptun));
727 
728 	mutex_enter(&connp->conn_lock);
729 	/*
730 	 * Note that conn_proto can't be set since the upper protocol
731 	 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
732 	 * ipcl_iptun_classify doesn't use conn_proto.
733 	 */
734 	connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers;
735 
736 	switch (iptun->iptun_typeinfo->iti_type) {
737 	case IPTUN_TYPE_IPV4:
738 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
739 		    &connp->conn_laddr_v6);
740 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4,
741 		    &connp->conn_faddr_v6);
742 		ixa->ixa_flags |= IXAF_IS_IPV4;
743 		if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp),
744 		    ipst, B_FALSE) != IPVL_UNICAST_UP) {
745 			mutex_exit(&connp->conn_lock);
746 			error = EADDRNOTAVAIL;
747 			goto done;
748 		}
749 		break;
750 	case IPTUN_TYPE_IPV6:
751 		connp->conn_laddr_v6 = iptun->iptun_laddr6;
752 		connp->conn_faddr_v6 = iptun->iptun_raddr6;
753 		ixa->ixa_flags &= ~IXAF_IS_IPV4;
754 		/* We use a zero scopeid for now */
755 		if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp),
756 		    ipst, B_FALSE, 0) != IPVL_UNICAST_UP) {
757 			mutex_exit(&connp->conn_lock);
758 			error = EADDRNOTAVAIL;
759 			goto done;
760 		}
761 		break;
762 	case IPTUN_TYPE_6TO4:
763 		IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
764 		    &connp->conn_laddr_v6);
765 		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6);
766 		ixa->ixa_flags |= IXAF_IS_IPV4;
767 		mutex_exit(&connp->conn_lock);
768 
769 		switch (ip_laddr_verify_v4(iptun->iptun_laddr4,
770 		    IPCL_ZONEID(connp), ipst, B_FALSE)) {
771 		case IPVL_UNICAST_UP:
772 		case IPVL_UNICAST_DOWN:
773 			break;
774 		default:
775 			error = EADDRNOTAVAIL;
776 			goto done;
777 		}
778 		goto insert;
779 	}
780 
781 	/* In case previous destination was multirt */
782 	ip_attr_newdst(ixa);
783 
784 	/*
785 	 * When we set a tunnel's destination address, we do not
786 	 * care if the destination is reachable.  Transient routing
787 	 * issues should not inhibit the creation of a tunnel
788 	 * interface, for example. Thus we pass B_FALSE here.
789 	 */
790 	connp->conn_saddr_v6 = connp->conn_laddr_v6;
791 	mutex_exit(&connp->conn_lock);
792 
793 	/* As long as the MTU is large we avoid fragmentation */
794 	ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF;
795 
796 	/* We handle IPsec in iptun_output_common */
797 	error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
798 	    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
799 	    &connp->conn_saddr_v6, &uinfo, 0);
800 
801 	if (error != 0)
802 		goto done;
803 
804 	/* saddr shouldn't change since it was already set */
805 	ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
806 	    &connp->conn_saddr_v6));
807 
808 	/* We set IXAF_VERIFY_PMTU to catch PMTU increases */
809 	ixa->ixa_flags |= IXAF_VERIFY_PMTU;
810 	ASSERT(uinfo.iulp_mtu != 0);
811 
812 	/*
813 	 * Allow setting new policies.
814 	 * The addresses/ports are already set, thus the IPsec policy calls
815 	 * can handle their passed-in conn's.
816 	 */
817 	connp->conn_policy_cached = B_FALSE;
818 
819 insert:
820 	error = ipcl_conn_insert(connp);
821 	if (error != 0)
822 		goto done;
823 
824 	/* Atomically update v6lastdst and conn_ixa */
825 	mutex_enter(&connp->conn_lock);
826 	/* Record this as the "last" send even though we haven't sent any */
827 	connp->conn_v6lastdst = connp->conn_faddr_v6;
828 
829 	iptun->iptun_flags |= IPTUN_BOUND;
830 
831 	oldixa = conn_replace_ixa(connp, ixa);
832 	/* Done with conn_t */
833 	mutex_exit(&connp->conn_lock);
834 	ixa_refrele(oldixa);
835 
836 	/*
837 	 * Now that we're bound with ip below us, this is a good
838 	 * time to initialize the destination path MTU and to
839 	 * re-calculate the tunnel's link MTU.
840 	 */
841 	(void) iptun_update_mtu(iptun, ixa, 0);
842 
843 	if (IS_IPTUN_RUNNING(iptun))
844 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
845 
846 done:
847 	ixa_refrele(ixa);
848 	return (error);
849 }
850 
851 static void
iptun_unbind(iptun_t * iptun)852 iptun_unbind(iptun_t *iptun)
853 {
854 	ASSERT(iptun->iptun_flags & IPTUN_BOUND);
855 	ASSERT(mutex_owned(&iptun->iptun_lock) ||
856 	    (iptun->iptun_flags & IPTUN_CONDEMNED));
857 	ip_unbind(iptun->iptun_connp);
858 	iptun->iptun_flags &= ~IPTUN_BOUND;
859 	if (!(iptun->iptun_flags & IPTUN_CONDEMNED))
860 		iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
861 }
862 
863 /*
864  * Re-generate the template data-link header for a given IP tunnel given the
865  * tunnel's current parameters.
866  */
867 static void
iptun_headergen(iptun_t * iptun,boolean_t update_mac)868 iptun_headergen(iptun_t *iptun, boolean_t update_mac)
869 {
870 	switch (iptun->iptun_typeinfo->iti_ipvers) {
871 	case IPV4_VERSION:
872 		/*
873 		 * We only need to use a custom IP header if the administrator
874 		 * has supplied a non-default hoplimit.
875 		 */
876 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT) {
877 			iptun->iptun_header_size = 0;
878 			break;
879 		}
880 		iptun->iptun_header_size = sizeof (ipha_t);
881 		iptun->iptun_header4.ipha_version_and_hdr_length =
882 		    IP_SIMPLE_HDR_VERSION;
883 		iptun->iptun_header4.ipha_fragment_offset_and_flags =
884 		    htons(IPH_DF);
885 		iptun->iptun_header4.ipha_ttl = iptun->iptun_hoplimit;
886 		break;
887 	case IPV6_VERSION: {
888 		ip6_t	*ip6hp = &iptun->iptun_header6.it6h_ip6h;
889 
890 		/*
891 		 * We only need to use a custom IPv6 header if either the
892 		 * administrator has supplied a non-default hoplimit, or we
893 		 * need to include an encapsulation limit option in the outer
894 		 * header.
895 		 */
896 		if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT &&
897 		    iptun->iptun_encaplimit == 0) {
898 			iptun->iptun_header_size = 0;
899 			break;
900 		}
901 
902 		(void) memset(ip6hp, 0, sizeof (*ip6hp));
903 		if (iptun->iptun_encaplimit == 0) {
904 			iptun->iptun_header_size = sizeof (ip6_t);
905 			ip6hp->ip6_nxt = IPPROTO_NONE;
906 		} else {
907 			iptun_encaplim_t	*iel;
908 
909 			iptun->iptun_header_size = sizeof (iptun_ipv6hdrs_t);
910 			/*
911 			 * The mac_ipv6 plugin requires ip6_plen to be in host
912 			 * byte order and reflect the extension headers
913 			 * present in the template.  The actual network byte
914 			 * order ip6_plen will be set on a per-packet basis on
915 			 * transmit.
916 			 */
917 			ip6hp->ip6_plen = sizeof (*iel);
918 			ip6hp->ip6_nxt = IPPROTO_DSTOPTS;
919 			iel = &iptun->iptun_header6.it6h_encaplim;
920 			*iel = iptun_encaplim_init;
921 			iel->iel_telopt.ip6ot_encap_limit =
922 			    iptun->iptun_encaplimit;
923 		}
924 
925 		ip6hp->ip6_hlim = iptun->iptun_hoplimit;
926 		break;
927 	}
928 	}
929 
930 	if (update_mac)
931 		iptun_task_dispatch(iptun, IPTUN_TASK_PDATA_UPDATE);
932 }
933 
934 /*
935  * Insert inbound and outbound IPv4 and IPv6 policy into the given policy
936  * head.
937  */
938 static boolean_t
iptun_insert_simple_policies(ipsec_policy_head_t * ph,ipsec_act_t * actp,uint_t n,netstack_t * ns)939 iptun_insert_simple_policies(ipsec_policy_head_t *ph, ipsec_act_t *actp,
940     uint_t n, netstack_t *ns)
941 {
942 	int f = IPSEC_AF_V4;
943 
944 	if (!ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) ||
945 	    !ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns))
946 		return (B_FALSE);
947 
948 	f = IPSEC_AF_V6;
949 	return (ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) &&
950 	    ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns));
951 }
952 
953 /*
954  * Used to set IPsec policy when policy is set through the IPTUN_CREATE or
955  * IPTUN_MODIFY ioctls.
956  */
957 static int
iptun_set_sec_simple(iptun_t * iptun,const ipsec_req_t * ipsr)958 iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr)
959 {
960 	int		rc = 0;
961 	uint_t		nact;
962 	ipsec_act_t	*actp = NULL;
963 	boolean_t	clear_all, old_policy = B_FALSE;
964 	ipsec_tun_pol_t	*itp;
965 	char		name[MAXLINKNAMELEN];
966 	uint64_t	gen;
967 	netstack_t	*ns = iptun->iptun_ns;
968 
969 	/* Can't specify self-encap on a tunnel. */
970 	if (ipsr->ipsr_self_encap_req != 0)
971 		return (EINVAL);
972 
973 	/*
974 	 * If it's a "clear-all" entry, unset the security flags and resume
975 	 * normal cleartext (or inherit-from-global) policy.
976 	 */
977 	clear_all = ((ipsr->ipsr_ah_req & IPTUN_IPSEC_REQ_MASK) == 0 &&
978 	    (ipsr->ipsr_esp_req & IPTUN_IPSEC_REQ_MASK) == 0);
979 
980 	ASSERT(mutex_owned(&iptun->iptun_lock));
981 	itp = iptun->iptun_itp;
982 	if (itp == NULL) {
983 		if (clear_all)
984 			goto bail;
985 		if ((rc = dls_mgmt_get_linkinfo(iptun->iptun_linkid, name, NULL,
986 		    NULL, NULL)) != 0)
987 			goto bail;
988 		ASSERT(name[0] != '\0');
989 		if ((itp = create_tunnel_policy(name, &rc, &gen, ns)) == NULL)
990 			goto bail;
991 		iptun->iptun_itp = itp;
992 	}
993 
994 	/* Allocate the actvec now, before holding itp or polhead locks. */
995 	ipsec_actvec_from_req(ipsr, &actp, &nact, ns);
996 	if (actp == NULL) {
997 		rc = ENOMEM;
998 		goto bail;
999 	}
1000 
1001 	/*
1002 	 * Just write on the active polhead.  Save the primary/secondary stuff
1003 	 * for spdsock operations.
1004 	 *
1005 	 * Mutex because we need to write to the polhead AND flags atomically.
1006 	 * Other threads will acquire the polhead lock as a reader if the
1007 	 * (unprotected) flag is set.
1008 	 */
1009 	mutex_enter(&itp->itp_lock);
1010 	if (itp->itp_flags & ITPF_P_TUNNEL) {
1011 		/* Oops, we lost a race.  Let's get out of here. */
1012 		rc = EBUSY;
1013 		goto mutex_bail;
1014 	}
1015 	old_policy = ((itp->itp_flags & ITPF_P_ACTIVE) != 0);
1016 
1017 	if (old_policy) {
1018 		ITPF_CLONE(itp->itp_flags);
1019 		rc = ipsec_copy_polhead(itp->itp_policy, itp->itp_inactive, ns);
1020 		if (rc != 0) {
1021 			/* inactive has already been cleared. */
1022 			itp->itp_flags &= ~ITPF_IFLAGS;
1023 			goto mutex_bail;
1024 		}
1025 		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1026 		ipsec_polhead_flush(itp->itp_policy, ns);
1027 	} else {
1028 		/* Else assume itp->itp_policy is already flushed. */
1029 		rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1030 	}
1031 
1032 	if (clear_all) {
1033 		ASSERT(avl_numnodes(&itp->itp_policy->iph_rulebyid) == 0);
1034 		itp->itp_flags &= ~ITPF_PFLAGS;
1035 		rw_exit(&itp->itp_policy->iph_lock);
1036 		old_policy = B_FALSE;	/* Clear out the inactive one too. */
1037 		goto recover_bail;
1038 	}
1039 
1040 	if (iptun_insert_simple_policies(itp->itp_policy, actp, nact, ns)) {
1041 		rw_exit(&itp->itp_policy->iph_lock);
1042 		/*
1043 		 * Adjust MTU and make sure the DL side knows what's up.
1044 		 */
1045 		itp->itp_flags = ITPF_P_ACTIVE;
1046 		(void) iptun_update_mtu(iptun, NULL, 0);
1047 		old_policy = B_FALSE;	/* Blank out inactive - we succeeded */
1048 	} else {
1049 		rw_exit(&itp->itp_policy->iph_lock);
1050 		rc = ENOMEM;
1051 	}
1052 
1053 recover_bail:
1054 	if (old_policy) {
1055 		/* Recover policy in in active polhead. */
1056 		ipsec_swap_policy(itp->itp_policy, itp->itp_inactive, ns);
1057 		ITPF_SWAP(itp->itp_flags);
1058 	}
1059 
1060 	/* Clear policy in inactive polhead. */
1061 	itp->itp_flags &= ~ITPF_IFLAGS;
1062 	rw_enter(&itp->itp_inactive->iph_lock, RW_WRITER);
1063 	ipsec_polhead_flush(itp->itp_inactive, ns);
1064 	rw_exit(&itp->itp_inactive->iph_lock);
1065 
1066 mutex_bail:
1067 	mutex_exit(&itp->itp_lock);
1068 
1069 bail:
1070 	if (actp != NULL)
1071 		ipsec_actvec_free(actp, nact);
1072 
1073 	return (rc);
1074 }
1075 
1076 static iptun_typeinfo_t *
iptun_gettypeinfo(iptun_type_t type)1077 iptun_gettypeinfo(iptun_type_t type)
1078 {
1079 	int i;
1080 
1081 	for (i = 0; iptun_type_table[i].iti_type != IPTUN_TYPE_UNKNOWN; i++) {
1082 		if (iptun_type_table[i].iti_type == type)
1083 			break;
1084 	}
1085 	return (&iptun_type_table[i]);
1086 }
1087 
1088 /*
1089  * Set the parameters included in ik on the tunnel iptun.  Parameters that can
1090  * only be set at creation time are set in iptun_create().
1091  */
1092 static int
iptun_setparams(iptun_t * iptun,const iptun_kparams_t * ik)1093 iptun_setparams(iptun_t *iptun, const iptun_kparams_t *ik)
1094 {
1095 	int		err = 0;
1096 	netstack_t	*ns = iptun->iptun_ns;
1097 	iptun_addr_t	orig_laddr, orig_raddr;
1098 	uint_t		orig_flags = iptun->iptun_flags;
1099 
1100 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR) {
1101 		if (orig_flags & IPTUN_LADDR)
1102 			orig_laddr = iptun->iptun_laddr;
1103 		if ((err = iptun_setladdr(iptun, &ik->iptun_kparam_laddr)) != 0)
1104 			return (err);
1105 		iptun->iptun_flags |= IPTUN_LADDR;
1106 	}
1107 
1108 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) {
1109 		if (orig_flags & IPTUN_RADDR)
1110 			orig_raddr = iptun->iptun_raddr;
1111 		if ((err = iptun_setraddr(iptun, &ik->iptun_kparam_raddr)) != 0)
1112 			goto done;
1113 		iptun->iptun_flags |= IPTUN_RADDR;
1114 	}
1115 
1116 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_SECINFO) {
1117 		/*
1118 		 * Set IPsec policy originating from the ifconfig(8) command
1119 		 * line.  This is traditionally called "simple" policy because
1120 		 * the ipsec_req_t (iptun_kparam_secinfo) can only describe a
1121 		 * simple policy of "do ESP on everything" and/or "do AH on
1122 		 * everything" (as opposed to the rich policy that can be
1123 		 * defined with ipsecconf(8)).
1124 		 */
1125 		if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
1126 			/*
1127 			 * Can't set security properties for automatic
1128 			 * tunnels.
1129 			 */
1130 			err = EINVAL;
1131 			goto done;
1132 		}
1133 
1134 		if (!ipsec_loaded(ns->netstack_ipsec)) {
1135 			/* If IPsec can be loaded, try and load it now. */
1136 			if (ipsec_failed(ns->netstack_ipsec)) {
1137 				err = EPROTONOSUPPORT;
1138 				goto done;
1139 			}
1140 			ipsec_loader_loadnow(ns->netstack_ipsec);
1141 			/*
1142 			 * ipsec_loader_loadnow() returns while IPsec is
1143 			 * loaded asynchronously.  While a method exists to
1144 			 * wait for IPsec to load (ipsec_loader_wait()), it
1145 			 * requires use of a STREAMS queue to do a qwait().
1146 			 * We're not in STREAMS context here, and so we can't
1147 			 * use it.  This is not a problem in practice because
1148 			 * in the vast majority of cases, key management and
1149 			 * global policy will have loaded before any tunnels
1150 			 * are plumbed, and so IPsec will already have been
1151 			 * loaded.
1152 			 */
1153 			err = EAGAIN;
1154 			goto done;
1155 		}
1156 
1157 		err = iptun_set_sec_simple(iptun, &ik->iptun_kparam_secinfo);
1158 		if (err == 0) {
1159 			iptun->iptun_flags |= IPTUN_SIMPLE_POLICY;
1160 			iptun->iptun_simple_policy = ik->iptun_kparam_secinfo;
1161 		}
1162 	}
1163 done:
1164 	if (err != 0) {
1165 		/* Restore original source and destination. */
1166 		if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR &&
1167 		    (orig_flags & IPTUN_LADDR))
1168 			iptun->iptun_laddr = orig_laddr;
1169 		if ((ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) &&
1170 		    (orig_flags & IPTUN_RADDR))
1171 			iptun->iptun_raddr = orig_raddr;
1172 		iptun->iptun_flags = orig_flags;
1173 	}
1174 	return (err);
1175 }
1176 
1177 static int
iptun_register(iptun_t * iptun)1178 iptun_register(iptun_t *iptun)
1179 {
1180 	mac_register_t	*mac;
1181 	int		err;
1182 
1183 	ASSERT(!(iptun->iptun_flags & IPTUN_MAC_REGISTERED));
1184 
1185 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
1186 		return (EINVAL);
1187 
1188 	mac->m_type_ident = iptun->iptun_typeinfo->iti_ident;
1189 	mac->m_driver = iptun;
1190 	mac->m_dip = iptun_dip;
1191 	mac->m_instance = (uint_t)-1;
1192 	mac->m_src_addr = (uint8_t *)&iptun->iptun_laddr.ia_addr;
1193 	mac->m_dst_addr = iptun->iptun_typeinfo->iti_hasraddr ?
1194 	    (uint8_t *)&iptun->iptun_raddr.ia_addr : NULL;
1195 	mac->m_callbacks = &iptun_m_callbacks;
1196 	mac->m_min_sdu = iptun->iptun_typeinfo->iti_minmtu;
1197 	mac->m_max_sdu = iptun->iptun_mtu;
1198 	if (iptun->iptun_header_size != 0) {
1199 		mac->m_pdata = &iptun->iptun_header;
1200 		mac->m_pdata_size = iptun->iptun_header_size;
1201 	}
1202 	if ((err = mac_register(mac, &iptun->iptun_mh)) == 0)
1203 		iptun->iptun_flags |= IPTUN_MAC_REGISTERED;
1204 	mac_free(mac);
1205 	return (err);
1206 }
1207 
1208 static int
iptun_unregister(iptun_t * iptun)1209 iptun_unregister(iptun_t *iptun)
1210 {
1211 	int err;
1212 
1213 	ASSERT(iptun->iptun_flags & IPTUN_MAC_REGISTERED);
1214 	if ((err = mac_unregister(iptun->iptun_mh)) == 0)
1215 		iptun->iptun_flags &= ~IPTUN_MAC_REGISTERED;
1216 	return (err);
1217 }
1218 
1219 static conn_t *
iptun_conn_create(iptun_t * iptun,netstack_t * ns,cred_t * credp)1220 iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
1221 {
1222 	conn_t *connp;
1223 
1224 	if ((connp = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ns)) == NULL)
1225 		return (NULL);
1226 
1227 	connp->conn_flags |= IPCL_IPTUN;
1228 	connp->conn_iptun = iptun;
1229 	connp->conn_recv = iptun_input;
1230 	connp->conn_recvicmp = iptun_input_icmp;
1231 	connp->conn_verifyicmp = iptun_verifyicmp;
1232 
1233 	/*
1234 	 * Register iptun_notify to listen to capability changes detected by IP.
1235 	 * This upcall is made in the context of the call to conn_ip_output.
1236 	 */
1237 	connp->conn_ixa->ixa_notify = iptun_notify;
1238 	connp->conn_ixa->ixa_notify_cookie = iptun;
1239 
1240 	/*
1241 	 * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done
1242 	 * for all other conn_t's.
1243 	 *
1244 	 * Note that there's an important distinction between iptun_zoneid and
1245 	 * conn_zoneid.  The conn_zoneid is set to GLOBAL_ZONEID in non-global
1246 	 * exclusive stack zones to make the ip module believe that the
1247 	 * non-global zone is actually a global zone.  Therefore, when
1248 	 * interacting with the ip module, we must always use conn_zoneid.
1249 	 */
1250 	connp->conn_zoneid = (ns->netstack_stackid == GLOBAL_NETSTACKID) ?
1251 	    crgetzoneid(credp) : GLOBAL_ZONEID;
1252 	connp->conn_cred = credp;
1253 	/* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */
1254 	crhold(connp->conn_cred);
1255 	connp->conn_cpid = NOPID;
1256 
1257 	/* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1258 	connp->conn_ixa->ixa_zoneid = connp->conn_zoneid;
1259 	ASSERT(connp->conn_ref == 1);
1260 
1261 	/* Cache things in ixa without an extra refhold */
1262 	ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1263 	connp->conn_ixa->ixa_cred = connp->conn_cred;
1264 	connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1265 	if (is_system_labeled())
1266 		connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1267 
1268 	/*
1269 	 * Have conn_ip_output drop packets should our outer source
1270 	 * go invalid
1271 	 */
1272 	connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1273 
1274 	switch (iptun->iptun_typeinfo->iti_ipvers) {
1275 	case IPV4_VERSION:
1276 		connp->conn_family = AF_INET6;
1277 		break;
1278 	case IPV6_VERSION:
1279 		connp->conn_family = AF_INET;
1280 		break;
1281 	}
1282 	mutex_enter(&connp->conn_lock);
1283 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1284 	mutex_exit(&connp->conn_lock);
1285 	return (connp);
1286 }
1287 
1288 static void
iptun_conn_destroy(conn_t * connp)1289 iptun_conn_destroy(conn_t *connp)
1290 {
1291 	ip_quiesce_conn(connp);
1292 	connp->conn_iptun = NULL;
1293 	ASSERT(connp->conn_ref == 1);
1294 	CONN_DEC_REF(connp);
1295 }
1296 
1297 static iptun_t *
iptun_alloc(void)1298 iptun_alloc(void)
1299 {
1300 	iptun_t *iptun;
1301 
1302 	if ((iptun = kmem_cache_alloc(iptun_cache, KM_NOSLEEP)) != NULL) {
1303 		bzero(iptun, sizeof (*iptun));
1304 		atomic_inc_32(&iptun_tunnelcount);
1305 	}
1306 	return (iptun);
1307 }
1308 
1309 static void
iptun_free(iptun_t * iptun)1310 iptun_free(iptun_t *iptun)
1311 {
1312 	ASSERT(iptun->iptun_flags & IPTUN_CONDEMNED);
1313 
1314 	if (iptun->iptun_flags & IPTUN_HASH_INSERTED) {
1315 		iptun_stack_t	*iptuns = iptun->iptun_iptuns;
1316 
1317 		mutex_enter(&iptun_hash_lock);
1318 		VERIFY(mod_hash_remove(iptun_hash,
1319 		    IPTUN_HASH_KEY(iptun->iptun_linkid),
1320 		    (mod_hash_val_t *)&iptun) == 0);
1321 		mutex_exit(&iptun_hash_lock);
1322 		iptun->iptun_flags &= ~IPTUN_HASH_INSERTED;
1323 		mutex_enter(&iptuns->iptuns_lock);
1324 		list_remove(&iptuns->iptuns_iptunlist, iptun);
1325 		mutex_exit(&iptuns->iptuns_lock);
1326 	}
1327 
1328 	if (iptun->iptun_flags & IPTUN_BOUND)
1329 		iptun_unbind(iptun);
1330 
1331 	/*
1332 	 * After iptun_unregister(), there will be no threads executing a
1333 	 * downcall from the mac module, including in the tx datapath.
1334 	 */
1335 	if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
1336 		VERIFY(iptun_unregister(iptun) == 0);
1337 
1338 	if (iptun->iptun_itp != NULL) {
1339 		/*
1340 		 * Remove from the AVL tree, AND release the reference iptun_t
1341 		 * itself holds on the ITP.
1342 		 */
1343 		itp_unlink(iptun->iptun_itp, iptun->iptun_ns);
1344 		ITP_REFRELE(iptun->iptun_itp, iptun->iptun_ns);
1345 		iptun->iptun_itp = NULL;
1346 		iptun->iptun_flags &= ~IPTUN_SIMPLE_POLICY;
1347 	}
1348 
1349 	/*
1350 	 * After ipcl_conn_destroy(), there will be no threads executing an
1351 	 * upcall from ip (i.e., iptun_input()), and it is then safe to free
1352 	 * the iptun_t.
1353 	 */
1354 	if (iptun->iptun_connp != NULL) {
1355 		iptun_conn_destroy(iptun->iptun_connp);
1356 		iptun->iptun_connp = NULL;
1357 	}
1358 
1359 	netstack_rele(iptun->iptun_ns);
1360 	kmem_cache_free(iptun_cache, iptun);
1361 	atomic_dec_32(&iptun_tunnelcount);
1362 }
1363 
1364 int
iptun_create(iptun_kparams_t * ik,cred_t * credp)1365 iptun_create(iptun_kparams_t *ik, cred_t *credp)
1366 {
1367 	iptun_t		*iptun = NULL;
1368 	int		err = 0, mherr;
1369 	char		linkname[MAXLINKNAMELEN];
1370 	ipsec_tun_pol_t	*itp;
1371 	netstack_t	*ns = NULL;
1372 	iptun_stack_t	*iptuns;
1373 	datalink_id_t	tmpid;
1374 	zoneid_t	zoneid = crgetzoneid(credp);
1375 	boolean_t	link_created = B_FALSE;
1376 
1377 	/* The tunnel type is mandatory */
1378 	if (!(ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE))
1379 		return (EINVAL);
1380 
1381 	/*
1382 	 * Is the linkid that the caller wishes to associate with this new
1383 	 * tunnel assigned to this zone?
1384 	 */
1385 	if (zone_check_datalink(&zoneid, ik->iptun_kparam_linkid) != 0) {
1386 		if (zoneid != GLOBAL_ZONEID)
1387 			return (EINVAL);
1388 	} else if (zoneid == GLOBAL_ZONEID) {
1389 		return (EINVAL);
1390 	}
1391 
1392 	/*
1393 	 * Make sure that we're not trying to create a tunnel that has already
1394 	 * been created.
1395 	 */
1396 	if (iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun) == 0) {
1397 		iptun_exit(iptun);
1398 		iptun = NULL;
1399 		err = EEXIST;
1400 		goto done;
1401 	}
1402 
1403 	ns = netstack_find_by_cred(credp);
1404 	iptuns = ns->netstack_iptun;
1405 
1406 	if ((iptun = iptun_alloc()) == NULL) {
1407 		err = ENOMEM;
1408 		goto done;
1409 	}
1410 
1411 	iptun->iptun_linkid = ik->iptun_kparam_linkid;
1412 	iptun->iptun_zoneid = zoneid;
1413 	iptun->iptun_ns = ns;
1414 
1415 	iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type);
1416 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_UNKNOWN) {
1417 		err = EINVAL;
1418 		goto done;
1419 	}
1420 
1421 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_IMPLICIT)
1422 		iptun->iptun_flags |= IPTUN_IMPLICIT;
1423 
1424 	if ((err = iptun_setparams(iptun, ik)) != 0)
1425 		goto done;
1426 
1427 	iptun->iptun_hoplimit = IPTUN_DEFAULT_HOPLIMIT;
1428 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_IPV6)
1429 		iptun->iptun_encaplimit = IPTUN_DEFAULT_ENCAPLIMIT;
1430 
1431 	iptun_headergen(iptun, B_FALSE);
1432 
1433 	iptun->iptun_connp = iptun_conn_create(iptun, ns, credp);
1434 	if (iptun->iptun_connp == NULL) {
1435 		err = ENOMEM;
1436 		goto done;
1437 	}
1438 
1439 	iptun->iptun_mtu = iptun->iptun_typeinfo->iti_maxmtu;
1440 	iptun->iptun_dpmtu = iptun->iptun_mtu;
1441 
1442 	/*
1443 	 * Find an ITP based on linkname.  If we have parms already set via
1444 	 * the iptun_setparams() call above, it may have created an ITP for
1445 	 * us.  We always try get_tunnel_policy() for DEBUG correctness
1446 	 * checks, and we may wish to refactor this to only check when
1447 	 * iptun_itp is NULL.
1448 	 */
1449 	if ((err = dls_mgmt_get_linkinfo(iptun->iptun_linkid, linkname, NULL,
1450 	    NULL, NULL)) != 0)
1451 		goto done;
1452 	if ((itp = get_tunnel_policy(linkname, ns)) != NULL)
1453 		iptun->iptun_itp = itp;
1454 
1455 	/*
1456 	 * See if we have the necessary IP addresses assigned to this tunnel
1457 	 * to try and bind them with ip underneath us.  If we're not ready to
1458 	 * bind yet, then we'll defer the bind operation until the addresses
1459 	 * are modified.
1460 	 */
1461 	if (iptun_canbind(iptun) && ((err = iptun_bind(iptun)) != 0))
1462 		goto done;
1463 
1464 	if ((err = iptun_register(iptun)) != 0)
1465 		goto done;
1466 
1467 	err = dls_devnet_create(iptun->iptun_mh, iptun->iptun_linkid,
1468 	    iptun->iptun_zoneid);
1469 	if (err != 0)
1470 		goto done;
1471 	link_created = B_TRUE;
1472 
1473 	/*
1474 	 * We hash by link-id as that is the key used by all other iptun
1475 	 * interfaces (modify, delete, etc.).
1476 	 */
1477 	if ((mherr = mod_hash_insert(iptun_hash,
1478 	    IPTUN_HASH_KEY(iptun->iptun_linkid), (mod_hash_val_t)iptun)) == 0) {
1479 		mutex_enter(&iptuns->iptuns_lock);
1480 		list_insert_head(&iptuns->iptuns_iptunlist, iptun);
1481 		mutex_exit(&iptuns->iptuns_lock);
1482 		iptun->iptun_flags |= IPTUN_HASH_INSERTED;
1483 	} else if (mherr == MH_ERR_NOMEM) {
1484 		err = ENOMEM;
1485 	} else if (mherr == MH_ERR_DUPLICATE) {
1486 		err = EEXIST;
1487 	} else {
1488 		err = EINVAL;
1489 	}
1490 
1491 done:
1492 	if (iptun == NULL && ns != NULL)
1493 		netstack_rele(ns);
1494 	if (err != 0 && iptun != NULL) {
1495 		if (link_created) {
1496 			(void) dls_devnet_destroy(iptun->iptun_mh, &tmpid,
1497 			    B_TRUE);
1498 		}
1499 		iptun->iptun_flags |= IPTUN_CONDEMNED;
1500 		iptun_free(iptun);
1501 	}
1502 	return (err);
1503 }
1504 
1505 int
iptun_delete(datalink_id_t linkid,cred_t * credp)1506 iptun_delete(datalink_id_t linkid, cred_t *credp)
1507 {
1508 	int	err;
1509 	iptun_t	*iptun = NULL;
1510 
1511 	if ((err = iptun_enter_by_linkid(linkid, &iptun)) != 0)
1512 		return (err);
1513 
1514 	/* One cannot delete a tunnel that belongs to another zone. */
1515 	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1516 		iptun_exit(iptun);
1517 		return (EACCES);
1518 	}
1519 
1520 	/*
1521 	 * We need to exit iptun in order to issue calls up the stack such as
1522 	 * dls_devnet_destroy().  If we call up while still in iptun, deadlock
1523 	 * with calls coming down the stack is possible.  We prevent other
1524 	 * threads from entering this iptun after we've exited it by setting
1525 	 * the IPTUN_DELETE_PENDING flag.  This will cause callers of
1526 	 * iptun_enter() to block waiting on iptun_enter_cv.  The assumption
1527 	 * here is that the functions we're calling while IPTUN_DELETE_PENDING
1528 	 * is set dont resuult in an iptun_enter() call, as that would result
1529 	 * in deadlock.
1530 	 */
1531 	iptun->iptun_flags |= IPTUN_DELETE_PENDING;
1532 
1533 	/* Wait for any pending upcall to the mac module to complete. */
1534 	while (iptun->iptun_flags & IPTUN_UPCALL_PENDING)
1535 		cv_wait(&iptun->iptun_upcall_cv, &iptun->iptun_lock);
1536 
1537 	iptun_exit(iptun);
1538 
1539 	if ((err = dls_devnet_destroy(iptun->iptun_mh, &linkid, B_TRUE)) == 0) {
1540 		/*
1541 		 * mac_disable() will fail with EBUSY if there are references
1542 		 * to the iptun MAC.  If there are none, then mac_disable()
1543 		 * will assure that none can be acquired until the MAC is
1544 		 * unregistered.
1545 		 *
1546 		 * XXX CR 6791335 prevents us from calling mac_disable() prior
1547 		 * to dls_devnet_destroy(), so we unfortunately need to
1548 		 * attempt to re-create the devnet node if mac_disable()
1549 		 * fails.
1550 		 */
1551 		if ((err = mac_disable(iptun->iptun_mh)) != 0) {
1552 			(void) dls_devnet_create(iptun->iptun_mh, linkid,
1553 			    iptun->iptun_zoneid);
1554 		}
1555 	}
1556 
1557 	/*
1558 	 * Now that we know the fate of this iptun_t, we need to clear
1559 	 * IPTUN_DELETE_PENDING, and set IPTUN_CONDEMNED if the iptun_t is
1560 	 * slated to be freed.  Either way, we need to signal the threads
1561 	 * waiting in iptun_enter() so that they can either fail if
1562 	 * IPTUN_CONDEMNED is set, or continue if it's not.
1563 	 */
1564 	mutex_enter(&iptun->iptun_lock);
1565 	iptun->iptun_flags &= ~IPTUN_DELETE_PENDING;
1566 	if (err == 0)
1567 		iptun->iptun_flags |= IPTUN_CONDEMNED;
1568 	cv_broadcast(&iptun->iptun_enter_cv);
1569 	mutex_exit(&iptun->iptun_lock);
1570 
1571 	/*
1572 	 * Note that there is no danger in calling iptun_free() after having
1573 	 * dropped the iptun_lock since callers of iptun_enter() at this point
1574 	 * are doing so from iptun_enter_by_linkid() (mac_disable() got rid of
1575 	 * threads entering from mac callbacks which call iptun_enter()
1576 	 * directly) which holds iptun_hash_lock, and iptun_free() grabs this
1577 	 * lock in order to remove the iptun_t from the hash table.
1578 	 */
1579 	if (err == 0)
1580 		iptun_free(iptun);
1581 
1582 	return (err);
1583 }
1584 
1585 int
iptun_modify(const iptun_kparams_t * ik,cred_t * credp)1586 iptun_modify(const iptun_kparams_t *ik, cred_t *credp)
1587 {
1588 	iptun_t		*iptun;
1589 	boolean_t	laddr_change = B_FALSE, raddr_change = B_FALSE;
1590 	int		err;
1591 
1592 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1593 		return (err);
1594 
1595 	/* One cannot modify a tunnel that belongs to another zone. */
1596 	if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1597 		err = EACCES;
1598 		goto done;
1599 	}
1600 
1601 	/* The tunnel type cannot be changed */
1602 	if (ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE) {
1603 		err = EINVAL;
1604 		goto done;
1605 	}
1606 
1607 	if ((err = iptun_setparams(iptun, ik)) != 0)
1608 		goto done;
1609 	iptun_headergen(iptun, B_FALSE);
1610 
1611 	/*
1612 	 * If any of the tunnel's addresses has been modified and the tunnel
1613 	 * has the necessary addresses assigned to it, we need to try to bind
1614 	 * with ip underneath us.  If we're not ready to bind yet, then we'll
1615 	 * try again when the addresses are modified later.
1616 	 */
1617 	laddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR);
1618 	raddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR);
1619 	if (laddr_change || raddr_change) {
1620 		if (iptun->iptun_flags & IPTUN_BOUND)
1621 			iptun_unbind(iptun);
1622 		if (iptun_canbind(iptun) && (err = iptun_bind(iptun)) != 0) {
1623 			if (laddr_change)
1624 				iptun->iptun_flags &= ~IPTUN_LADDR;
1625 			if (raddr_change)
1626 				iptun->iptun_flags &= ~IPTUN_RADDR;
1627 			goto done;
1628 		}
1629 	}
1630 
1631 	if (laddr_change)
1632 		iptun_task_dispatch(iptun, IPTUN_TASK_LADDR_UPDATE);
1633 	if (raddr_change)
1634 		iptun_task_dispatch(iptun, IPTUN_TASK_RADDR_UPDATE);
1635 
1636 done:
1637 	iptun_exit(iptun);
1638 	return (err);
1639 }
1640 
1641 /* Given an IP tunnel's datalink id, fill in its parameters. */
1642 int
iptun_info(iptun_kparams_t * ik,cred_t * credp)1643 iptun_info(iptun_kparams_t *ik, cred_t *credp)
1644 {
1645 	iptun_t	*iptun;
1646 	int	err;
1647 
1648 	/* Is the tunnel link visible from the caller's zone? */
1649 	if (!dls_devnet_islinkvisible(ik->iptun_kparam_linkid,
1650 	    crgetzoneid(credp)))
1651 		return (ENOENT);
1652 
1653 	if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1654 		return (err);
1655 
1656 	bzero(ik, sizeof (iptun_kparams_t));
1657 
1658 	ik->iptun_kparam_linkid = iptun->iptun_linkid;
1659 	ik->iptun_kparam_type = iptun->iptun_typeinfo->iti_type;
1660 	ik->iptun_kparam_flags |= IPTUN_KPARAM_TYPE;
1661 
1662 	if (iptun->iptun_flags & IPTUN_LADDR) {
1663 		iptun_getaddr(&iptun->iptun_laddr, &ik->iptun_kparam_laddr);
1664 		ik->iptun_kparam_flags |= IPTUN_KPARAM_LADDR;
1665 	}
1666 	if (iptun->iptun_flags & IPTUN_RADDR) {
1667 		iptun_getaddr(&iptun->iptun_raddr, &ik->iptun_kparam_raddr);
1668 		ik->iptun_kparam_flags |= IPTUN_KPARAM_RADDR;
1669 	}
1670 
1671 	if (iptun->iptun_flags & IPTUN_IMPLICIT)
1672 		ik->iptun_kparam_flags |= IPTUN_KPARAM_IMPLICIT;
1673 
1674 	if (iptun->iptun_itp != NULL) {
1675 		mutex_enter(&iptun->iptun_itp->itp_lock);
1676 		if (iptun->iptun_itp->itp_flags & ITPF_P_ACTIVE) {
1677 			ik->iptun_kparam_flags |= IPTUN_KPARAM_IPSECPOL;
1678 			if (iptun->iptun_flags & IPTUN_SIMPLE_POLICY) {
1679 				ik->iptun_kparam_flags |= IPTUN_KPARAM_SECINFO;
1680 				ik->iptun_kparam_secinfo =
1681 				    iptun->iptun_simple_policy;
1682 			}
1683 		}
1684 		mutex_exit(&iptun->iptun_itp->itp_lock);
1685 	}
1686 
1687 	iptun_exit(iptun);
1688 	return (err);
1689 }
1690 
1691 int
iptun_set_6to4relay(netstack_t * ns,ipaddr_t relay_addr)1692 iptun_set_6to4relay(netstack_t *ns, ipaddr_t relay_addr)
1693 {
1694 	if (relay_addr == INADDR_BROADCAST || CLASSD(relay_addr))
1695 		return (EADDRNOTAVAIL);
1696 	ns->netstack_iptun->iptuns_relay_rtr_addr = relay_addr;
1697 	return (0);
1698 }
1699 
1700 void
iptun_get_6to4relay(netstack_t * ns,ipaddr_t * relay_addr)1701 iptun_get_6to4relay(netstack_t *ns, ipaddr_t *relay_addr)
1702 {
1703 	*relay_addr = ns->netstack_iptun->iptuns_relay_rtr_addr;
1704 }
1705 
1706 void
iptun_set_policy(datalink_id_t linkid,ipsec_tun_pol_t * itp)1707 iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp)
1708 {
1709 	iptun_t	*iptun;
1710 
1711 	if (iptun_enter_by_linkid(linkid, &iptun) != 0)
1712 		return;
1713 	if (iptun->iptun_itp != itp) {
1714 		ASSERT(iptun->iptun_itp == NULL);
1715 		ITP_REFHOLD(itp);
1716 		iptun->iptun_itp = itp;
1717 	}
1718 	/*
1719 	 * IPsec policy means IPsec overhead, which means lower MTU.
1720 	 * Refresh the MTU for this tunnel.
1721 	 */
1722 	(void) iptun_update_mtu(iptun, NULL, 0);
1723 	iptun_exit(iptun);
1724 }
1725 
1726 /*
1727  * Obtain the path MTU to the tunnel destination.
1728  * Can return zero in some cases.
1729  */
1730 static uint32_t
iptun_get_dst_pmtu(iptun_t * iptun,ip_xmit_attr_t * ixa)1731 iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1732 {
1733 	uint32_t	pmtu = 0;
1734 	conn_t		*connp = iptun->iptun_connp;
1735 	boolean_t	need_rele = B_FALSE;
1736 
1737 	/*
1738 	 * We only obtain the pmtu for tunnels that have a remote tunnel
1739 	 * address.
1740 	 */
1741 	if (!(iptun->iptun_flags & IPTUN_RADDR))
1742 		return (0);
1743 
1744 	if (ixa == NULL) {
1745 		ixa = conn_get_ixa(connp, B_FALSE);
1746 		if (ixa == NULL)
1747 			return (0);
1748 		need_rele = B_TRUE;
1749 	}
1750 	/*
1751 	 * Guard against ICMP errors before we have sent, as well as against
1752 	 * and a thread which held conn_ixa.
1753 	 */
1754 	if (ixa->ixa_ire != NULL) {
1755 		pmtu = ip_get_pmtu(ixa);
1756 
1757 		/*
1758 		 * For both IPv4 and IPv6 we can have indication that the outer
1759 		 * header needs fragmentation.
1760 		 */
1761 		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1762 			/* Must allow fragmentation in ip_output */
1763 			ixa->ixa_flags &= ~IXAF_DONTFRAG;
1764 		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1765 			ixa->ixa_flags |= IXAF_DONTFRAG;
1766 		} else {
1767 			/* ip_get_pmtu might have set this - we don't want it */
1768 			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1769 		}
1770 	}
1771 
1772 	if (need_rele)
1773 		ixa_refrele(ixa);
1774 	return (pmtu);
1775 }
1776 
1777 /*
1778  * Update the ip_xmit_attr_t to capture the current lower path mtu as known
1779  * by ip.
1780  */
1781 static void
iptun_update_dst_pmtu(iptun_t * iptun,ip_xmit_attr_t * ixa)1782 iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1783 {
1784 	uint32_t	pmtu;
1785 	conn_t		*connp = iptun->iptun_connp;
1786 	boolean_t	need_rele = B_FALSE;
1787 
1788 	/* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */
1789 	if (!(iptun->iptun_flags & IPTUN_RADDR))
1790 		return;
1791 
1792 	if (ixa == NULL) {
1793 		ixa = conn_get_ixa(connp, B_FALSE);
1794 		if (ixa == NULL)
1795 			return;
1796 		need_rele = B_TRUE;
1797 	}
1798 	/*
1799 	 * Guard against ICMP errors before we have sent, as well as against
1800 	 * and a thread which held conn_ixa.
1801 	 */
1802 	if (ixa->ixa_ire != NULL) {
1803 		pmtu = ip_get_pmtu(ixa);
1804 		/*
1805 		 * Update ixa_fragsize and ixa_pmtu.
1806 		 */
1807 		ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
1808 
1809 		/*
1810 		 * For both IPv4 and IPv6 we can have indication that the outer
1811 		 * header needs fragmentation.
1812 		 */
1813 		if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1814 			/* Must allow fragmentation in ip_output */
1815 			ixa->ixa_flags &= ~IXAF_DONTFRAG;
1816 		} else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1817 			ixa->ixa_flags |= IXAF_DONTFRAG;
1818 		} else {
1819 			/* ip_get_pmtu might have set this - we don't want it */
1820 			ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1821 		}
1822 	}
1823 
1824 	if (need_rele)
1825 		ixa_refrele(ixa);
1826 }
1827 
1828 /*
1829  * There is nothing that iptun can verify in addition to IP having
1830  * verified the IP addresses in the fanout.
1831  */
1832 /* ARGSUSED */
1833 static boolean_t
iptun_verifyicmp(conn_t * connp,void * arg2,icmph_t * icmph,icmp6_t * icmp6,ip_recv_attr_t * ira)1834 iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
1835     ip_recv_attr_t *ira)
1836 {
1837 	return (B_TRUE);
1838 }
1839 
1840 /*
1841  * Notify function registered with ip_xmit_attr_t.
1842  */
1843 static void
iptun_notify(void * arg,ip_xmit_attr_t * ixa,ixa_notify_type_t ntype,ixa_notify_arg_t narg)1844 iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
1845     ixa_notify_arg_t narg)
1846 {
1847 	iptun_t		*iptun = (iptun_t *)arg;
1848 
1849 	switch (ntype) {
1850 	case IXAN_PMTU:
1851 		(void) iptun_update_mtu(iptun, ixa, narg);
1852 		break;
1853 	}
1854 }
1855 
1856 /*
1857  * Returns the max of old_ovhd and the overhead associated with pol.
1858  */
1859 static uint32_t
iptun_max_policy_overhead(ipsec_policy_t * pol,uint32_t old_ovhd)1860 iptun_max_policy_overhead(ipsec_policy_t *pol, uint32_t old_ovhd)
1861 {
1862 	uint32_t new_ovhd = old_ovhd;
1863 
1864 	while (pol != NULL) {
1865 		new_ovhd = max(new_ovhd,
1866 		    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1867 		pol = pol->ipsp_hash.hash_next;
1868 	}
1869 	return (new_ovhd);
1870 }
1871 
1872 static uint32_t
iptun_get_ipsec_overhead(iptun_t * iptun)1873 iptun_get_ipsec_overhead(iptun_t *iptun)
1874 {
1875 	ipsec_policy_root_t	*ipr;
1876 	ipsec_policy_head_t	*iph;
1877 	ipsec_policy_t		*pol;
1878 	ipsec_selector_t	sel;
1879 	int			i;
1880 	uint32_t		ipsec_ovhd = 0;
1881 	ipsec_tun_pol_t		*itp = iptun->iptun_itp;
1882 	netstack_t		*ns = iptun->iptun_ns;
1883 
1884 	if (itp == NULL || !(itp->itp_flags & ITPF_P_ACTIVE)) {
1885 		/*
1886 		 * Consult global policy, just in case.  This will only work
1887 		 * if we have both source and destination addresses to work
1888 		 * with.
1889 		 */
1890 		if ((iptun->iptun_flags & (IPTUN_LADDR|IPTUN_RADDR)) !=
1891 		    (IPTUN_LADDR|IPTUN_RADDR))
1892 			return (0);
1893 
1894 		iph = ipsec_system_policy(ns);
1895 		bzero(&sel, sizeof (sel));
1896 		sel.ips_isv4 =
1897 		    (iptun->iptun_typeinfo->iti_ipvers == IPV4_VERSION);
1898 		switch (iptun->iptun_typeinfo->iti_ipvers) {
1899 		case IPV4_VERSION:
1900 			sel.ips_local_addr_v4 = iptun->iptun_laddr4;
1901 			sel.ips_remote_addr_v4 = iptun->iptun_raddr4;
1902 			break;
1903 		case IPV6_VERSION:
1904 			sel.ips_local_addr_v6 = iptun->iptun_laddr6;
1905 			sel.ips_remote_addr_v6 = iptun->iptun_raddr6;
1906 			break;
1907 		}
1908 		/* Check for both IPv4 and IPv6. */
1909 		sel.ips_protocol = IPPROTO_ENCAP;
1910 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1911 		    &sel);
1912 		if (pol != NULL) {
1913 			ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act);
1914 			IPPOL_REFRELE(pol);
1915 		}
1916 		sel.ips_protocol = IPPROTO_IPV6;
1917 		pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1918 		    &sel);
1919 		if (pol != NULL) {
1920 			ipsec_ovhd = max(ipsec_ovhd,
1921 			    ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1922 			IPPOL_REFRELE(pol);
1923 		}
1924 		IPPH_REFRELE(iph, ns);
1925 	} else {
1926 		/*
1927 		 * Look through all of the possible IPsec actions for the
1928 		 * tunnel, and find the largest potential IPsec overhead.
1929 		 */
1930 		iph = itp->itp_policy;
1931 		rw_enter(&iph->iph_lock, RW_READER);
1932 		ipr = &(iph->iph_root[IPSEC_TYPE_OUTBOUND]);
1933 		ipsec_ovhd = iptun_max_policy_overhead(
1934 		    ipr->ipr_nonhash[IPSEC_AF_V4], 0);
1935 		ipsec_ovhd = iptun_max_policy_overhead(
1936 		    ipr->ipr_nonhash[IPSEC_AF_V6], ipsec_ovhd);
1937 		for (i = 0; i < ipr->ipr_nchains; i++) {
1938 			ipsec_ovhd = iptun_max_policy_overhead(
1939 			    ipr->ipr_hash[i].hash_head, ipsec_ovhd);
1940 		}
1941 		rw_exit(&iph->iph_lock);
1942 	}
1943 
1944 	return (ipsec_ovhd);
1945 }
1946 
1947 /*
1948  * Calculate and return the maximum possible upper MTU for the given tunnel.
1949  *
1950  * If new_pmtu is set then we also need to update the lower path MTU information
1951  * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that
1952  * we are notified by conn_ip_output() when the path MTU increases.
1953  */
1954 static uint32_t
iptun_get_maxmtu(iptun_t * iptun,ip_xmit_attr_t * ixa,uint32_t new_pmtu)1955 iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
1956 {
1957 	size_t		header_size, ipsec_overhead;
1958 	uint32_t	maxmtu, pmtu;
1959 
1960 	/*
1961 	 * Start with the path-MTU to the remote address, which is either
1962 	 * provided as the new_pmtu argument, or obtained using
1963 	 * iptun_get_dst_pmtu().
1964 	 */
1965 	if (new_pmtu != 0) {
1966 		if (iptun->iptun_flags & IPTUN_RADDR)
1967 			iptun->iptun_dpmtu = new_pmtu;
1968 		pmtu = new_pmtu;
1969 	} else if (iptun->iptun_flags & IPTUN_RADDR) {
1970 		if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) {
1971 			/*
1972 			 * We weren't able to obtain the path-MTU of the
1973 			 * destination.  Use the previous value.
1974 			 */
1975 			pmtu = iptun->iptun_dpmtu;
1976 		} else {
1977 			iptun->iptun_dpmtu = pmtu;
1978 		}
1979 	} else {
1980 		/*
1981 		 * We have no path-MTU information to go on, use the maximum
1982 		 * possible value.
1983 		 */
1984 		pmtu = iptun->iptun_typeinfo->iti_maxmtu;
1985 	}
1986 
1987 	/*
1988 	 * Now calculate tunneling overhead and subtract that from the
1989 	 * path-MTU information obtained above.
1990 	 */
1991 	if (iptun->iptun_header_size != 0) {
1992 		header_size = iptun->iptun_header_size;
1993 	} else {
1994 		switch (iptun->iptun_typeinfo->iti_ipvers) {
1995 		case IPV4_VERSION:
1996 			header_size = sizeof (ipha_t);
1997 			if (is_system_labeled())
1998 				header_size += IP_MAX_OPT_LENGTH;
1999 			break;
2000 		case IPV6_VERSION:
2001 			header_size = sizeof (iptun_ipv6hdrs_t);
2002 			break;
2003 		}
2004 	}
2005 
2006 	ipsec_overhead = iptun_get_ipsec_overhead(iptun);
2007 
2008 	maxmtu = pmtu - (header_size + ipsec_overhead);
2009 	return (max(maxmtu, iptun->iptun_typeinfo->iti_minmtu));
2010 }
2011 
2012 /*
2013  * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer
2014  * of any change in MTU.  The new_pmtu argument is the new lower path MTU to
2015  * the tunnel destination to be used in the tunnel MTU calculation.  Passing
2016  * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using
2017  * ip_get_pmtu().
2018  *
2019  * If the calculated tunnel MTU is different than its previous value, then we
2020  * notify the MAC layer above us of this change using mac_maxsdu_update().
2021  */
2022 static uint32_t
iptun_update_mtu(iptun_t * iptun,ip_xmit_attr_t * ixa,uint32_t new_pmtu)2023 iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
2024 {
2025 	uint32_t newmtu;
2026 
2027 	/* We always update the ixa since we might have set IXAF_VERIFY_PMTU */
2028 	iptun_update_dst_pmtu(iptun, ixa);
2029 
2030 	/*
2031 	 * We return the current MTU without updating it if it was pegged to a
2032 	 * static value using the MAC_PROP_MTU link property.
2033 	 */
2034 	if (iptun->iptun_flags & IPTUN_FIXED_MTU)
2035 		return (iptun->iptun_mtu);
2036 
2037 	/* If the MTU isn't fixed, then use the maximum possible value. */
2038 	newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu);
2039 	/*
2040 	 * We only dynamically adjust the tunnel MTU for tunnels with
2041 	 * destinations because dynamic MTU calculations are based on the
2042 	 * destination path-MTU.
2043 	 */
2044 	if ((iptun->iptun_flags & IPTUN_RADDR) && newmtu != iptun->iptun_mtu) {
2045 		iptun->iptun_mtu = newmtu;
2046 		if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
2047 			iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
2048 	}
2049 
2050 	return (newmtu);
2051 }
2052 
2053 /*
2054  * Frees a packet or packet chain and bumps stat for each freed packet.
2055  */
2056 static void
iptun_drop_pkt(mblk_t * mp,uint64_t * stat)2057 iptun_drop_pkt(mblk_t *mp, uint64_t *stat)
2058 {
2059 	mblk_t *pktmp;
2060 
2061 	for (pktmp = mp; pktmp != NULL; pktmp = mp) {
2062 		mp = mp->b_next;
2063 		pktmp->b_next = NULL;
2064 		if (stat != NULL)
2065 			atomic_inc_64(stat);
2066 		freemsg(pktmp);
2067 	}
2068 }
2069 
2070 /*
2071  * Allocate and return a new mblk to hold an IP and ICMP header, and chain the
2072  * original packet to its b_cont.  Returns NULL on failure.
2073  */
2074 static mblk_t *
iptun_build_icmperr(size_t hdrs_size,mblk_t * orig_pkt)2075 iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
2076 {
2077 	mblk_t *icmperr_mp;
2078 
2079 	if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) {
2080 		icmperr_mp->b_wptr += hdrs_size;
2081 		/* tack on the offending packet */
2082 		icmperr_mp->b_cont = orig_pkt;
2083 	}
2084 	return (icmperr_mp);
2085 }
2086 
2087 /*
2088  * Transmit an ICMP error.  mp->b_rptr points at the packet to be included in
2089  * the ICMP error.
2090  */
2091 static void
iptun_sendicmp_v4(iptun_t * iptun,icmph_t * icmp,ipha_t * orig_ipha,mblk_t * mp,ts_label_t * tsl)2092 iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp,
2093     ts_label_t *tsl)
2094 {
2095 	size_t	orig_pktsize, hdrs_size;
2096 	mblk_t	*icmperr_mp;
2097 	ipha_t	*new_ipha;
2098 	icmph_t	*new_icmp;
2099 	ip_xmit_attr_t	ixas;
2100 	conn_t	*connp = iptun->iptun_connp;
2101 
2102 	orig_pktsize = msgdsize(mp);
2103 	hdrs_size = sizeof (ipha_t) + sizeof (icmph_t);
2104 	if ((icmperr_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2105 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2106 		return;
2107 	}
2108 
2109 	new_ipha = (ipha_t *)icmperr_mp->b_rptr;
2110 	new_icmp = (icmph_t *)(new_ipha + 1);
2111 
2112 	new_ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2113 	new_ipha->ipha_type_of_service = 0;
2114 	new_ipha->ipha_ident = 0;
2115 	new_ipha->ipha_fragment_offset_and_flags = 0;
2116 	new_ipha->ipha_ttl = orig_ipha->ipha_ttl;
2117 	new_ipha->ipha_protocol = IPPROTO_ICMP;
2118 	new_ipha->ipha_src = orig_ipha->ipha_dst;
2119 	new_ipha->ipha_dst = orig_ipha->ipha_src;
2120 	new_ipha->ipha_hdr_checksum = 0; /* will be computed by ip */
2121 	new_ipha->ipha_length = htons(hdrs_size + orig_pktsize);
2122 
2123 	*new_icmp = *icmp;
2124 	new_icmp->icmph_checksum = 0;
2125 	new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0);
2126 
2127 	bzero(&ixas, sizeof (ixas));
2128 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2129 	if (new_ipha->ipha_src == INADDR_ANY) {
2130 		ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
2131 		ixas.ixa_flags |= IXAF_SET_SOURCE;
2132 	}
2133 
2134 	ixas.ixa_zoneid = IPCL_ZONEID(connp);
2135 	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2136 	ixas.ixa_cred = connp->conn_cred;
2137 	ixas.ixa_cpid = NOPID;
2138 	if (is_system_labeled())
2139 		ixas.ixa_tsl = tsl;
2140 
2141 	ixas.ixa_ifindex = 0;
2142 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2143 
2144 	(void) ip_output_simple(icmperr_mp, &ixas);
2145 	ixa_cleanup(&ixas);
2146 }
2147 
2148 static void
iptun_sendicmp_v6(iptun_t * iptun,icmp6_t * icmp6,ip6_t * orig_ip6h,mblk_t * mp,ts_label_t * tsl)2149 iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp,
2150     ts_label_t *tsl)
2151 {
2152 	size_t	orig_pktsize, hdrs_size;
2153 	mblk_t	*icmp6err_mp;
2154 	ip6_t	*new_ip6h;
2155 	icmp6_t	*new_icmp6;
2156 	ip_xmit_attr_t	ixas;
2157 	conn_t	*connp = iptun->iptun_connp;
2158 
2159 	orig_pktsize = msgdsize(mp);
2160 	hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t);
2161 	if ((icmp6err_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2162 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2163 		return;
2164 	}
2165 
2166 	new_ip6h = (ip6_t *)icmp6err_mp->b_rptr;
2167 	new_icmp6 = (icmp6_t *)(new_ip6h + 1);
2168 
2169 	new_ip6h->ip6_vcf = orig_ip6h->ip6_vcf;
2170 	new_ip6h->ip6_plen = htons(sizeof (icmp6_t) + orig_pktsize);
2171 	new_ip6h->ip6_hops = orig_ip6h->ip6_hops;
2172 	new_ip6h->ip6_nxt = IPPROTO_ICMPV6;
2173 	new_ip6h->ip6_src = orig_ip6h->ip6_dst;
2174 	new_ip6h->ip6_dst = orig_ip6h->ip6_src;
2175 
2176 	*new_icmp6 = *icmp6;
2177 	/* The checksum is calculated in ip_output_simple and friends. */
2178 	new_icmp6->icmp6_cksum = new_ip6h->ip6_plen;
2179 
2180 	bzero(&ixas, sizeof (ixas));
2181 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
2182 	if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src)) {
2183 		ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
2184 		ixas.ixa_flags |= IXAF_SET_SOURCE;
2185 	}
2186 
2187 	ixas.ixa_zoneid = IPCL_ZONEID(connp);
2188 	ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2189 	ixas.ixa_cred = connp->conn_cred;
2190 	ixas.ixa_cpid = NOPID;
2191 	if (is_system_labeled())
2192 		ixas.ixa_tsl = tsl;
2193 
2194 	ixas.ixa_ifindex = 0;
2195 	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2196 
2197 	(void) ip_output_simple(icmp6err_mp, &ixas);
2198 	ixa_cleanup(&ixas);
2199 }
2200 
2201 static void
iptun_icmp_error_v4(iptun_t * iptun,ipha_t * orig_ipha,mblk_t * mp,uint8_t type,uint8_t code,ts_label_t * tsl)2202 iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
2203     uint8_t type, uint8_t code, ts_label_t *tsl)
2204 {
2205 	icmph_t icmp;
2206 
2207 	bzero(&icmp, sizeof (icmp));
2208 	icmp.icmph_type = type;
2209 	icmp.icmph_code = code;
2210 
2211 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2212 }
2213 
2214 static void
iptun_icmp_fragneeded_v4(iptun_t * iptun,uint32_t newmtu,ipha_t * orig_ipha,mblk_t * mp,ts_label_t * tsl)2215 iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
2216     mblk_t *mp, ts_label_t *tsl)
2217 {
2218 	icmph_t	icmp;
2219 
2220 	icmp.icmph_type = ICMP_DEST_UNREACHABLE;
2221 	icmp.icmph_code = ICMP_FRAGMENTATION_NEEDED;
2222 	icmp.icmph_du_zero = 0;
2223 	icmp.icmph_du_mtu = htons(newmtu);
2224 
2225 	iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2226 }
2227 
2228 static void
iptun_icmp_error_v6(iptun_t * iptun,ip6_t * orig_ip6h,mblk_t * mp,uint8_t type,uint8_t code,uint32_t offset,ts_label_t * tsl)2229 iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
2230     uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl)
2231 {
2232 	icmp6_t icmp6;
2233 
2234 	bzero(&icmp6, sizeof (icmp6));
2235 	icmp6.icmp6_type = type;
2236 	icmp6.icmp6_code = code;
2237 	if (type == ICMP6_PARAM_PROB)
2238 		icmp6.icmp6_pptr = htonl(offset);
2239 
2240 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2241 }
2242 
2243 static void
iptun_icmp_toobig_v6(iptun_t * iptun,uint32_t newmtu,ip6_t * orig_ip6h,mblk_t * mp,ts_label_t * tsl)2244 iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
2245     mblk_t *mp, ts_label_t *tsl)
2246 {
2247 	icmp6_t icmp6;
2248 
2249 	icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
2250 	icmp6.icmp6_code = 0;
2251 	icmp6.icmp6_mtu = htonl(newmtu);
2252 
2253 	iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2254 }
2255 
2256 /*
2257  * Determines if the packet pointed to by ipha or ip6h is an ICMP error.  The
2258  * mp argument is only used to do bounds checking.
2259  */
2260 static boolean_t
is_icmp_error(mblk_t * mp,ipha_t * ipha,ip6_t * ip6h)2261 is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2262 {
2263 	uint16_t hlen;
2264 
2265 	if (ipha != NULL) {
2266 		icmph_t	*icmph;
2267 
2268 		ASSERT(ip6h == NULL);
2269 		if (ipha->ipha_protocol != IPPROTO_ICMP)
2270 			return (B_FALSE);
2271 
2272 		hlen = IPH_HDR_LENGTH(ipha);
2273 		icmph = (icmph_t *)((uint8_t *)ipha + hlen);
2274 		return (ICMP_IS_ERROR(icmph->icmph_type) ||
2275 		    icmph->icmph_type == ICMP_REDIRECT);
2276 	} else {
2277 		icmp6_t	*icmp6;
2278 		uint8_t	*nexthdrp;
2279 
2280 		ASSERT(ip6h != NULL);
2281 		if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hlen, &nexthdrp) ||
2282 		    *nexthdrp != IPPROTO_ICMPV6) {
2283 			return (B_FALSE);
2284 		}
2285 
2286 		icmp6 = (icmp6_t *)((uint8_t *)ip6h + hlen);
2287 		return (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
2288 		    icmp6->icmp6_type == ND_REDIRECT);
2289 	}
2290 }
2291 
2292 /*
2293  * Find inner and outer IP headers from a tunneled packet as setup for calls
2294  * into ipsec_tun_{in,out}bound().
2295  * Note that we need to allow the outer header to be in a separate mblk from
2296  * the inner header.
2297  * If the caller knows the outer_hlen, the caller passes it in. Otherwise zero.
2298  */
2299 static size_t
iptun_find_headers(mblk_t * mp,size_t outer_hlen,ipha_t ** outer4,ipha_t ** inner4,ip6_t ** outer6,ip6_t ** inner6)2300 iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4,
2301     ipha_t **inner4, ip6_t **outer6, ip6_t **inner6)
2302 {
2303 	ipha_t	*ipha;
2304 	size_t	first_mblkl = MBLKL(mp);
2305 	mblk_t	*inner_mp;
2306 
2307 	/*
2308 	 * Don't bother handling packets that don't have a full IP header in
2309 	 * the fist mblk.  For the input path, the ip module ensures that this
2310 	 * won't happen, and on the output path, the IP tunneling MAC-type
2311 	 * plugins ensure that this also won't happen.
2312 	 */
2313 	if (first_mblkl < sizeof (ipha_t))
2314 		return (0);
2315 	ipha = (ipha_t *)(mp->b_rptr);
2316 	switch (IPH_HDR_VERSION(ipha)) {
2317 	case IPV4_VERSION:
2318 		*outer4 = ipha;
2319 		*outer6 = NULL;
2320 		if (outer_hlen == 0)
2321 			outer_hlen = IPH_HDR_LENGTH(ipha);
2322 		break;
2323 	case IPV6_VERSION:
2324 		*outer4 = NULL;
2325 		*outer6 = (ip6_t *)ipha;
2326 		if (outer_hlen == 0)
2327 			outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
2328 		break;
2329 	default:
2330 		return (0);
2331 	}
2332 
2333 	if (first_mblkl < outer_hlen ||
2334 	    (first_mblkl == outer_hlen && mp->b_cont == NULL))
2335 		return (0);
2336 
2337 	/*
2338 	 * We don't bother doing a pullup here since the outer header will
2339 	 * just get stripped off soon on input anyway.  We just want to ensure
2340 	 * that the inner* pointer points to a full header.
2341 	 */
2342 	if (first_mblkl == outer_hlen) {
2343 		inner_mp = mp->b_cont;
2344 		ipha = (ipha_t *)inner_mp->b_rptr;
2345 	} else {
2346 		inner_mp = mp;
2347 		ipha = (ipha_t *)(mp->b_rptr + outer_hlen);
2348 	}
2349 	switch (IPH_HDR_VERSION(ipha)) {
2350 	case IPV4_VERSION:
2351 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ipha_t))
2352 			return (0);
2353 		*inner4 = ipha;
2354 		*inner6 = NULL;
2355 		break;
2356 	case IPV6_VERSION:
2357 		if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ip6_t))
2358 			return (0);
2359 		*inner4 = NULL;
2360 		*inner6 = (ip6_t *)ipha;
2361 		break;
2362 	default:
2363 		return (0);
2364 	}
2365 
2366 	return (outer_hlen);
2367 }
2368 
2369 /*
2370  * Received ICMP error in response to an X over IPv4 packet that we
2371  * transmitted.
2372  *
2373  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2374  * the following:
2375  *
2376  * [IPv4(0)][ICMPv4][IPv4(1)][IPv4(2)][ULP]
2377  *
2378  *	or
2379  *
2380  * [IPv4(0)][ICMPv4][IPv4(1)][IPv6][ULP]
2381  *
2382  * And "outer4" will get set to IPv4(1), and inner[46] will correspond to
2383  * whatever the very-inner packet is (IPv4(2) or IPv6).
2384  */
2385 static void
iptun_input_icmp_v4(iptun_t * iptun,mblk_t * data_mp,icmph_t * icmph,ip_recv_attr_t * ira)2386 iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph,
2387     ip_recv_attr_t *ira)
2388 {
2389 	uint8_t	*orig;
2390 	ipha_t	*outer4, *inner4;
2391 	ip6_t	*outer6, *inner6;
2392 	int	outer_hlen;
2393 	uint8_t	type, code;
2394 
2395 	ASSERT(data_mp->b_cont == NULL);
2396 	/*
2397 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2398 	 * find headers in the ICMP packet payload.
2399 	 */
2400 	orig = data_mp->b_rptr;
2401 	data_mp->b_rptr = (uint8_t *)(icmph + 1);
2402 	/*
2403 	 * The ip module ensures that ICMP errors contain at least the
2404 	 * original IP header (otherwise, the error would never have made it
2405 	 * here).
2406 	 */
2407 	ASSERT(MBLKL(data_mp) >= 0);
2408 	outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2409 	    &inner6);
2410 	ASSERT(outer6 == NULL);
2411 	data_mp->b_rptr = orig;
2412 	if (outer_hlen == 0) {
2413 		iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2414 		return;
2415 	}
2416 
2417 	/* Only ICMP errors due to tunneled packets should reach here. */
2418 	ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP ||
2419 	    outer4->ipha_protocol == IPPROTO_IPV6);
2420 
2421 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2422 	    inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2423 	if (data_mp == NULL) {
2424 		/* Callee did all of the freeing. */
2425 		atomic_inc_64(&iptun->iptun_ierrors);
2426 		return;
2427 	}
2428 	/* We should never see reassembled fragment here. */
2429 	ASSERT(data_mp->b_next == NULL);
2430 
2431 	data_mp->b_rptr = (uint8_t *)outer4 + outer_hlen;
2432 
2433 	/*
2434 	 * If the original packet being transmitted was itself an ICMP error,
2435 	 * then drop this packet.  We don't want to generate an ICMP error in
2436 	 * response to an ICMP error.
2437 	 */
2438 	if (is_icmp_error(data_mp, inner4, inner6)) {
2439 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2440 		return;
2441 	}
2442 
2443 	switch (icmph->icmph_type) {
2444 	case ICMP_DEST_UNREACHABLE:
2445 		type = (inner4 != NULL ? icmph->icmph_type : ICMP6_DST_UNREACH);
2446 		switch (icmph->icmph_code) {
2447 		case ICMP_FRAGMENTATION_NEEDED: {
2448 			uint32_t newmtu;
2449 
2450 			/*
2451 			 * We reconcile this with the fact that the tunnel may
2452 			 * also have IPsec policy by letting iptun_update_mtu
2453 			 * take care of it.
2454 			 */
2455 			newmtu = iptun_update_mtu(iptun, NULL,
2456 			    ntohs(icmph->icmph_du_mtu));
2457 
2458 			if (inner4 != NULL) {
2459 				iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2460 				    data_mp, ira->ira_tsl);
2461 			} else {
2462 				iptun_icmp_toobig_v6(iptun, newmtu, inner6,
2463 				    data_mp, ira->ira_tsl);
2464 			}
2465 			return;
2466 		}
2467 		case ICMP_DEST_NET_UNREACH_ADMIN:
2468 		case ICMP_DEST_HOST_UNREACH_ADMIN:
2469 			code = (inner4 != NULL ? ICMP_DEST_NET_UNREACH_ADMIN :
2470 			    ICMP6_DST_UNREACH_ADMIN);
2471 			break;
2472 		default:
2473 			code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2474 			    ICMP6_DST_UNREACH_ADDR);
2475 			break;
2476 		}
2477 		break;
2478 	case ICMP_TIME_EXCEEDED:
2479 		if (inner6 != NULL) {
2480 			type = ICMP6_TIME_EXCEEDED;
2481 			code = 0;
2482 		} /* else we're already set. */
2483 		break;
2484 	case ICMP_PARAM_PROBLEM:
2485 		/*
2486 		 * This is a problem with the outer header we transmitted.
2487 		 * Treat this as an output error.
2488 		 */
2489 		iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2490 		return;
2491 	default:
2492 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2493 		return;
2494 	}
2495 
2496 	if (inner4 != NULL) {
2497 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2498 		    ira->ira_tsl);
2499 	} else {
2500 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2501 		    ira->ira_tsl);
2502 	}
2503 }
2504 
2505 /*
2506  * Return B_TRUE if the IPv6 packet pointed to by ip6h contains a Tunnel
2507  * Encapsulation Limit destination option.  If there is one, set encaplim_ptr
2508  * to point to the option value.
2509  */
2510 static boolean_t
iptun_find_encaplimit(mblk_t * mp,ip6_t * ip6h,uint8_t ** encaplim_ptr)2511 iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
2512 {
2513 	ip_pkt_t	pkt;
2514 	uint8_t		*endptr;
2515 	ip6_dest_t	*destp;
2516 	struct ip6_opt	*optp;
2517 
2518 	pkt.ipp_fields = 0; /* must be initialized */
2519 	(void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL);
2520 	if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) {
2521 		destp = pkt.ipp_dstopts;
2522 	} else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) {
2523 		destp = pkt.ipp_rthdrdstopts;
2524 	} else {
2525 		return (B_FALSE);
2526 	}
2527 
2528 	endptr = (uint8_t *)destp + 8 * (destp->ip6d_len + 1);
2529 	optp = (struct ip6_opt *)(destp + 1);
2530 	while (endptr - (uint8_t *)optp > sizeof (*optp)) {
2531 		if (optp->ip6o_type == IP6OPT_TUNNEL_LIMIT) {
2532 			if ((uint8_t *)(optp + 1) >= endptr)
2533 				return (B_FALSE);
2534 			*encaplim_ptr = (uint8_t *)&optp[1];
2535 			return (B_TRUE);
2536 		}
2537 		optp = (struct ip6_opt *)((uint8_t *)optp + optp->ip6o_len + 2);
2538 	}
2539 	return (B_FALSE);
2540 }
2541 
2542 /*
2543  * Received ICMPv6 error in response to an X over IPv6 packet that we
2544  * transmitted.
2545  *
2546  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2547  * the following:
2548  *
2549  * [IPv6(0)][ICMPv6][IPv6(1)][IPv4][ULP]
2550  *
2551  *	or
2552  *
2553  * [IPv6(0)][ICMPv6][IPv6(1)][IPv6(2)][ULP]
2554  *
2555  * And "outer6" will get set to IPv6(1), and inner[46] will correspond to
2556  * whatever the very-inner packet is (IPv4 or IPv6(2)).
2557  */
2558 static void
iptun_input_icmp_v6(iptun_t * iptun,mblk_t * data_mp,icmp6_t * icmp6h,ip_recv_attr_t * ira)2559 iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h,
2560     ip_recv_attr_t *ira)
2561 {
2562 	uint8_t	*orig;
2563 	ipha_t	*outer4, *inner4;
2564 	ip6_t	*outer6, *inner6;
2565 	int	outer_hlen;
2566 	uint8_t	type, code;
2567 
2568 	ASSERT(data_mp->b_cont == NULL);
2569 
2570 	/*
2571 	 * Temporarily move b_rptr forward so that iptun_find_headers() can
2572 	 * find IP headers in the ICMP packet payload.
2573 	 */
2574 	orig = data_mp->b_rptr;
2575 	data_mp->b_rptr = (uint8_t *)(icmp6h + 1);
2576 	/*
2577 	 * The ip module ensures that ICMP errors contain at least the
2578 	 * original IP header (otherwise, the error would never have made it
2579 	 * here).
2580 	 */
2581 	ASSERT(MBLKL(data_mp) >= 0);
2582 	outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2583 	    &inner6);
2584 	ASSERT(outer4 == NULL);
2585 	data_mp->b_rptr = orig;	/* Restore r_ptr */
2586 	if (outer_hlen == 0) {
2587 		iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2588 		return;
2589 	}
2590 
2591 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2592 	    inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2593 	if (data_mp == NULL) {
2594 		/* Callee did all of the freeing. */
2595 		atomic_inc_64(&iptun->iptun_ierrors);
2596 		return;
2597 	}
2598 	/* We should never see reassembled fragment here. */
2599 	ASSERT(data_mp->b_next == NULL);
2600 
2601 	data_mp->b_rptr = (uint8_t *)outer6 + outer_hlen;
2602 
2603 	/*
2604 	 * If the original packet being transmitted was itself an ICMP error,
2605 	 * then drop this packet.  We don't want to generate an ICMP error in
2606 	 * response to an ICMP error.
2607 	 */
2608 	if (is_icmp_error(data_mp, inner4, inner6)) {
2609 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2610 		return;
2611 	}
2612 
2613 	switch (icmp6h->icmp6_type) {
2614 	case ICMP6_PARAM_PROB: {
2615 		uint8_t *encaplim_ptr;
2616 
2617 		/*
2618 		 * If the ICMPv6 error points to a valid Tunnel Encapsulation
2619 		 * Limit option and the limit value is 0, then fall through
2620 		 * and send a host unreachable message.  Otherwise, treat the
2621 		 * error as an output error, as there must have been a problem
2622 		 * with a packet we sent.
2623 		 */
2624 		if (!iptun_find_encaplimit(data_mp, outer6, &encaplim_ptr) ||
2625 		    (icmp6h->icmp6_pptr !=
2626 		    ((ptrdiff_t)encaplim_ptr - (ptrdiff_t)outer6)) ||
2627 		    *encaplim_ptr != 0) {
2628 			iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2629 			return;
2630 		}
2631 	}
2632 	/* FALLTHROUGH */
2633 	case ICMP6_TIME_EXCEEDED:
2634 	case ICMP6_DST_UNREACH:
2635 		type = (inner4 != NULL ? ICMP_DEST_UNREACHABLE :
2636 		    ICMP6_DST_UNREACH);
2637 		code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2638 		    ICMP6_DST_UNREACH_ADDR);
2639 		break;
2640 	case ICMP6_PACKET_TOO_BIG: {
2641 		uint32_t newmtu;
2642 
2643 		/*
2644 		 * We reconcile this with the fact that the tunnel may also
2645 		 * have IPsec policy by letting iptun_update_mtu take care of
2646 		 * it.
2647 		 */
2648 		newmtu = iptun_update_mtu(iptun, NULL,
2649 		    ntohl(icmp6h->icmp6_mtu));
2650 
2651 		if (inner4 != NULL) {
2652 			iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2653 			    data_mp, ira->ira_tsl);
2654 		} else {
2655 			iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp,
2656 			    ira->ira_tsl);
2657 		}
2658 		return;
2659 	}
2660 	default:
2661 		iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2662 		return;
2663 	}
2664 
2665 	if (inner4 != NULL) {
2666 		iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2667 		    ira->ira_tsl);
2668 	} else {
2669 		iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2670 		    ira->ira_tsl);
2671 	}
2672 }
2673 
2674 /*
2675  * Called as conn_recvicmp from IP for ICMP errors.
2676  */
2677 /* ARGSUSED2 */
2678 static void
iptun_input_icmp(void * arg,mblk_t * mp,void * arg2,ip_recv_attr_t * ira)2679 iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2680 {
2681 	conn_t		*connp = arg;
2682 	iptun_t		*iptun = connp->conn_iptun;
2683 	mblk_t		*tmpmp;
2684 	size_t		hlen;
2685 
2686 	ASSERT(IPCL_IS_IPTUN(connp));
2687 
2688 	if (mp->b_cont != NULL) {
2689 		/*
2690 		 * Since ICMP error processing necessitates access to bits
2691 		 * that are within the ICMP error payload (the original packet
2692 		 * that caused the error), pull everything up into a single
2693 		 * block for convenience.
2694 		 */
2695 		if ((tmpmp = msgpullup(mp, -1)) == NULL) {
2696 			iptun_drop_pkt(mp, &iptun->iptun_norcvbuf);
2697 			return;
2698 		}
2699 		freemsg(mp);
2700 		mp = tmpmp;
2701 	}
2702 
2703 	hlen = ira->ira_ip_hdr_length;
2704 	switch (iptun->iptun_typeinfo->iti_ipvers) {
2705 	case IPV4_VERSION:
2706 		/*
2707 		 * The outer IP header coming up from IP is always ipha_t
2708 		 * alligned (otherwise, we would have crashed in ip).
2709 		 */
2710 		iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen),
2711 		    ira);
2712 		break;
2713 	case IPV6_VERSION:
2714 		iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen),
2715 		    ira);
2716 		break;
2717 	}
2718 }
2719 
2720 static boolean_t
iptun_in_6to4_ok(iptun_t * iptun,ipha_t * outer4,ip6_t * inner6)2721 iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2722 {
2723 	ipaddr_t v4addr;
2724 
2725 	/*
2726 	 * It's possible that someone sent us an IPv4-in-IPv4 packet with the
2727 	 * IPv4 address of a 6to4 tunnel as the destination.
2728 	 */
2729 	if (inner6 == NULL)
2730 		return (B_FALSE);
2731 
2732 	/*
2733 	 * Make sure that the IPv6 destination is within the site that this
2734 	 * 6to4 tunnel is routing for.  We don't want people bouncing random
2735 	 * tunneled IPv6 packets through this 6to4 router.
2736 	 */
2737 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&v4addr);
2738 	if (outer4->ipha_dst != v4addr)
2739 		return (B_FALSE);
2740 
2741 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_src)) {
2742 		/*
2743 		 * Section 9 of RFC 3056 (security considerations) suggests
2744 		 * that when a packet is from a 6to4 site (i.e., it's not a
2745 		 * global address being forwarded froma relay router), make
2746 		 * sure that the packet was tunneled by that site's 6to4
2747 		 * router.
2748 		 */
2749 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2750 		if (outer4->ipha_src != v4addr)
2751 			return (B_FALSE);
2752 	} else {
2753 		/*
2754 		 * Only accept packets from a relay router if we've configured
2755 		 * outbound relay router functionality.
2756 		 */
2757 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2758 			return (B_FALSE);
2759 	}
2760 
2761 	return (B_TRUE);
2762 }
2763 
2764 /*
2765  * Input function for everything that comes up from the ip module below us.
2766  * This is called directly from the ip module via connp->conn_recv().
2767  *
2768  * We receive M_DATA messages with IP-in-IP tunneled packets.
2769  */
2770 /* ARGSUSED2 */
2771 static void
iptun_input(void * arg,mblk_t * data_mp,void * arg2,ip_recv_attr_t * ira)2772 iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira)
2773 {
2774 	conn_t	*connp = arg;
2775 	iptun_t	*iptun = connp->conn_iptun;
2776 	int	outer_hlen;
2777 	ipha_t	*outer4, *inner4;
2778 	ip6_t	*outer6, *inner6;
2779 
2780 	ASSERT(IPCL_IS_IPTUN(connp));
2781 	ASSERT(DB_TYPE(data_mp) == M_DATA);
2782 
2783 	outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length,
2784 	    &outer4, &inner4, &outer6, &inner6);
2785 	if (outer_hlen == 0)
2786 		goto drop;
2787 
2788 	/*
2789 	 * If the system is labeled, we call tsol_check_dest() on the packet
2790 	 * destination (our local tunnel address) to ensure that the packet as
2791 	 * labeled should be allowed to be sent to us.  We don't need to call
2792 	 * the more involved tsol_receive_local() since the tunnel link itself
2793 	 * cannot be assigned to shared-stack non-global zones.
2794 	 */
2795 	if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2796 		if (ira->ira_tsl == NULL)
2797 			goto drop;
2798 		if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ?
2799 		    (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst),
2800 		    (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION),
2801 		    CONN_MAC_DEFAULT, B_FALSE, NULL) != 0)
2802 			goto drop;
2803 	}
2804 
2805 	data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2806 	    inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns);
2807 	if (data_mp == NULL) {
2808 		/* Callee did all of the freeing. */
2809 		return;
2810 	}
2811 
2812 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
2813 	    !iptun_in_6to4_ok(iptun, outer4, inner6))
2814 		goto drop;
2815 
2816 	/*
2817 	 * We need to statistically account for each packet individually, so
2818 	 * we might as well split up any b_next chains here.
2819 	 */
2820 	do {
2821 		mblk_t	*mp;
2822 
2823 		mp = data_mp->b_next;
2824 		data_mp->b_next = NULL;
2825 
2826 		atomic_inc_64(&iptun->iptun_ipackets);
2827 		atomic_add_64(&iptun->iptun_rbytes, msgdsize(data_mp));
2828 		mac_rx(iptun->iptun_mh, NULL, data_mp);
2829 
2830 		data_mp = mp;
2831 	} while (data_mp != NULL);
2832 	return;
2833 drop:
2834 	iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2835 }
2836 
2837 /*
2838  * Do 6to4-specific header-processing on output.  Return B_TRUE if the packet
2839  * was processed without issue, or B_FALSE if the packet had issues and should
2840  * be dropped.
2841  */
2842 static boolean_t
iptun_out_process_6to4(iptun_t * iptun,ipha_t * outer4,ip6_t * inner6)2843 iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2844 {
2845 	ipaddr_t v4addr;
2846 
2847 	/*
2848 	 * IPv6 source must be a 6to4 address.  This is because a conscious
2849 	 * decision was made to not allow a Solaris system to be used as a
2850 	 * relay router (for security reasons) when 6to4 was initially
2851 	 * integrated.  If this decision is ever reversed, the following check
2852 	 * can be removed.
2853 	 */
2854 	if (!IN6_IS_ADDR_6TO4(&inner6->ip6_src))
2855 		return (B_FALSE);
2856 
2857 	/*
2858 	 * RFC3056 mandates that the IPv4 source MUST be set to the IPv4
2859 	 * portion of the 6to4 IPv6 source address.  In other words, make sure
2860 	 * that we're tunneling packets from our own 6to4 site.
2861 	 */
2862 	IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2863 	if (outer4->ipha_src != v4addr)
2864 		return (B_FALSE);
2865 
2866 	/*
2867 	 * Automatically set the destination of the outer IPv4 header as
2868 	 * described in RFC3056.  There are two possibilities:
2869 	 *
2870 	 * a. If the IPv6 destination is a 6to4 address, set the IPv4 address
2871 	 *    to the IPv4 portion of the 6to4 address.
2872 	 * b. If the IPv6 destination is a native IPv6 address, set the IPv4
2873 	 *    destination to the address of a relay router.
2874 	 *
2875 	 * Design Note: b shouldn't be necessary here, and this is a flaw in
2876 	 * the design of the 6to4relay command.  Instead of setting a 6to4
2877 	 * relay address in this module via an ioctl, the 6to4relay command
2878 	 * could simply add a IPv6 route for native IPv6 addresses (such as a
2879 	 * default route) in the forwarding table that uses a 6to4 destination
2880 	 * as its next hop, and the IPv4 portion of that address could be a
2881 	 * 6to4 relay address.  In order for this to work, IP would have to
2882 	 * resolve the next hop address, which would necessitate a link-layer
2883 	 * address resolver for 6to4 links, which doesn't exist today.
2884 	 *
2885 	 * In fact, if a resolver existed for 6to4 links, then setting the
2886 	 * IPv4 destination in the outer header could be done as part of
2887 	 * link-layer address resolution and fast-path header generation, and
2888 	 * not here.
2889 	 */
2890 	if (IN6_IS_ADDR_6TO4(&inner6->ip6_dst)) {
2891 		/* destination is a 6to4 router */
2892 		IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst,
2893 		    (struct in_addr *)&outer4->ipha_dst);
2894 
2895 		/* Reject attempts to send to INADDR_ANY */
2896 		if (outer4->ipha_dst == INADDR_ANY)
2897 			return (B_FALSE);
2898 	} else {
2899 		/*
2900 		 * The destination is a native IPv6 address.  If output to a
2901 		 * relay-router is enabled, use the relay-router's IPv4
2902 		 * address as the destination.
2903 		 */
2904 		if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2905 			return (B_FALSE);
2906 		outer4->ipha_dst = iptun->iptun_iptuns->iptuns_relay_rtr_addr;
2907 	}
2908 
2909 	/*
2910 	 * If the outer source and destination are equal, this means that the
2911 	 * 6to4 router somehow forwarded an IPv6 packet destined for its own
2912 	 * 6to4 site to its 6to4 tunnel interface, which will result in this
2913 	 * packet infinitely bouncing between ip and iptun.
2914 	 */
2915 	return (outer4->ipha_src != outer4->ipha_dst);
2916 }
2917 
2918 /*
2919  * Process output packets with outer IPv4 headers.  Frees mp and bumps stat on
2920  * error.
2921  */
2922 static mblk_t *
iptun_out_process_ipv4(iptun_t * iptun,mblk_t * mp,ipha_t * outer4,ipha_t * inner4,ip6_t * inner6,ip_xmit_attr_t * ixa)2923 iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
2924     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
2925 {
2926 	uint8_t	*innerptr = (inner4 != NULL ?
2927 	    (uint8_t *)inner4 : (uint8_t *)inner6);
2928 	size_t	minmtu = iptun->iptun_typeinfo->iti_minmtu;
2929 
2930 	if (inner4 != NULL) {
2931 		ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP);
2932 		/*
2933 		 * Copy the tos from the inner IPv4 header. We mask off ECN
2934 		 * bits (bits 6 and 7) because there is currently no
2935 		 * tunnel-tunnel communication to determine if both sides
2936 		 * support ECN.  We opt for the safe choice: don't copy the
2937 		 * ECN bits when doing encapsulation.
2938 		 */
2939 		outer4->ipha_type_of_service =
2940 		    inner4->ipha_type_of_service & ~0x03;
2941 	} else {
2942 		ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 &&
2943 		    inner6 != NULL);
2944 	}
2945 	if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
2946 		outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
2947 	else
2948 		outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;
2949 
2950 	/*
2951 	 * As described in section 3.2.2 of RFC4213, if the packet payload is
2952 	 * less than or equal to the minimum MTU size, then we need to allow
2953 	 * IPv4 to fragment the packet.  The reason is that even if we end up
2954 	 * receiving an ICMP frag-needed, the interface above this tunnel
2955 	 * won't be allowed to drop its MTU as a result, since the packet was
2956 	 * already smaller than the smallest allowable MTU for that interface.
2957 	 */
2958 	if (mp->b_wptr - innerptr <= minmtu) {
2959 		outer4->ipha_fragment_offset_and_flags = 0;
2960 		ixa->ixa_flags &= ~IXAF_DONTFRAG;
2961 	} else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) &&
2962 	    (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) {
2963 		ixa->ixa_flags |= IXAF_DONTFRAG;
2964 	}
2965 
2966 	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4);
2967 	ixa->ixa_pktlen = msgdsize(mp);
2968 	ixa->ixa_protocol = outer4->ipha_protocol;
2969 
2970 	outer4->ipha_length = htons(ixa->ixa_pktlen);
2971 	return (mp);
2972 }
2973 
2974 /*
2975  * Insert an encapsulation limit destination option in the packet provided.
2976  * Always consumes the mp argument and returns a new mblk pointer.
2977  */
2978 static mblk_t *
iptun_insert_encaplimit(iptun_t * iptun,mblk_t * mp,ip6_t * outer6,uint8_t limit)2979 iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
2980     uint8_t limit)
2981 {
2982 	mblk_t			*newmp;
2983 	iptun_ipv6hdrs_t	*newouter6;
2984 
2985 	ASSERT(outer6->ip6_nxt == IPPROTO_IPV6);
2986 	ASSERT(mp->b_cont == NULL);
2987 
2988 	mp->b_rptr += sizeof (ip6_t);
2989 	newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED);
2990 	if (newmp == NULL) {
2991 		iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2992 		return (NULL);
2993 	}
2994 	newmp->b_wptr += sizeof (iptun_ipv6hdrs_t);
2995 	/* Copy the payload (Starting with the inner IPv6 header). */
2996 	bcopy(mp->b_rptr, newmp->b_wptr, MBLKL(mp));
2997 	newmp->b_wptr += MBLKL(mp);
2998 	newouter6 = (iptun_ipv6hdrs_t *)newmp->b_rptr;
2999 	/* Now copy the outer IPv6 header. */
3000 	bcopy(outer6, &newouter6->it6h_ip6h, sizeof (ip6_t));
3001 	newouter6->it6h_ip6h.ip6_nxt = IPPROTO_DSTOPTS;
3002 	newouter6->it6h_encaplim = iptun_encaplim_init;
3003 	newouter6->it6h_encaplim.iel_destopt.ip6d_nxt = outer6->ip6_nxt;
3004 	newouter6->it6h_encaplim.iel_telopt.ip6ot_encap_limit = limit;
3005 
3006 	/*
3007 	 * The payload length will be set at the end of
3008 	 * iptun_out_process_ipv6().
3009 	 */
3010 
3011 	freemsg(mp);
3012 	return (newmp);
3013 }
3014 
3015 /*
3016  * Process output packets with outer IPv6 headers.  Frees mp and bumps stats
3017  * on error.
3018  */
3019 static mblk_t *
iptun_out_process_ipv6(iptun_t * iptun,mblk_t * mp,ip6_t * outer6,ipha_t * inner4,ip6_t * inner6,ip_xmit_attr_t * ixa)3020 iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
3021     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
3022 {
3023 	uint8_t		*innerptr = (inner4 != NULL ?
3024 	    (uint8_t *)inner4 : (uint8_t *)inner6);
3025 	size_t		minmtu = iptun->iptun_typeinfo->iti_minmtu;
3026 	uint8_t		*limit, *configlimit;
3027 	uint32_t	offset;
3028 	iptun_ipv6hdrs_t *v6hdrs;
3029 
3030 	if (inner6 != NULL && iptun_find_encaplimit(mp, inner6, &limit)) {
3031 		/*
3032 		 * The inner packet is an IPv6 packet which itself contains an
3033 		 * encapsulation limit option.  The limit variable points to
3034 		 * the value in the embedded option.  Process the
3035 		 * encapsulation limit option as specified in RFC 2473.
3036 		 *
3037 		 * If limit is 0, then we've exceeded the limit and we need to
3038 		 * send back an ICMPv6 parameter problem message.
3039 		 *
3040 		 * If limit is > 0, then we decrement it by 1 and make sure
3041 		 * that the encapsulation limit option in the outer header
3042 		 * reflects that (adding an option if one isn't already
3043 		 * there).
3044 		 */
3045 		ASSERT(limit > mp->b_rptr && limit < mp->b_wptr);
3046 		if (*limit == 0) {
3047 			mp->b_rptr = (uint8_t *)inner6;
3048 			offset = limit - mp->b_rptr;
3049 			iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB,
3050 			    0, offset, ixa->ixa_tsl);
3051 			atomic_inc_64(&iptun->iptun_noxmtbuf);
3052 			return (NULL);
3053 		}
3054 
3055 		/*
3056 		 * The outer header requires an encapsulation limit option.
3057 		 * If there isn't one already, add one.
3058 		 */
3059 		if (iptun->iptun_encaplimit == 0) {
3060 			if ((mp = iptun_insert_encaplimit(iptun, mp, outer6,
3061 			    (*limit - 1))) == NULL)
3062 				return (NULL);
3063 			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3064 		} else {
3065 			/*
3066 			 * There is an existing encapsulation limit option in
3067 			 * the outer header.  If the inner encapsulation limit
3068 			 * is less than the configured encapsulation limit,
3069 			 * update the outer encapsulation limit to reflect
3070 			 * this lesser value.
3071 			 */
3072 			v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3073 			configlimit =
3074 			    &v6hdrs->it6h_encaplim.iel_telopt.ip6ot_encap_limit;
3075 			if ((*limit - 1) < *configlimit)
3076 				*configlimit = (*limit - 1);
3077 		}
3078 		ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t);
3079 		ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt;
3080 	} else {
3081 		ixa->ixa_ip_hdr_length = sizeof (ip6_t);
3082 		ixa->ixa_protocol = outer6->ip6_nxt;
3083 	}
3084 	/*
3085 	 * See iptun_output_process_ipv4() why we allow fragmentation for
3086 	 * small packets
3087 	 */
3088 	if (mp->b_wptr - innerptr <= minmtu)
3089 		ixa->ixa_flags &= ~IXAF_DONTFRAG;
3090 	else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL))
3091 		ixa->ixa_flags |= IXAF_DONTFRAG;
3092 
3093 	ixa->ixa_pktlen = msgdsize(mp);
3094 	outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t));
3095 	return (mp);
3096 }
3097 
3098 /*
3099  * The IP tunneling MAC-type plugins have already done most of the header
3100  * processing and validity checks.  We are simply responsible for multiplexing
3101  * down to the ip module below us.
3102  */
3103 static void
iptun_output(iptun_t * iptun,mblk_t * mp)3104 iptun_output(iptun_t *iptun, mblk_t *mp)
3105 {
3106 	conn_t	*connp = iptun->iptun_connp;
3107 	mblk_t	*newmp;
3108 	int	error;
3109 	ip_xmit_attr_t	*ixa;
3110 
3111 	ASSERT(mp->b_datap->db_type == M_DATA);
3112 
3113 	if (mp->b_cont != NULL) {
3114 		if ((newmp = msgpullup(mp, -1)) == NULL) {
3115 			iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
3116 			return;
3117 		}
3118 		freemsg(mp);
3119 		mp = newmp;
3120 	}
3121 
3122 	if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
3123 		iptun_output_6to4(iptun, mp);
3124 		return;
3125 	}
3126 
3127 	if (is_system_labeled()) {
3128 		/*
3129 		 * Since the label can be different meaning a potentially
3130 		 * different IRE,we always use a unique ip_xmit_attr_t.
3131 		 */
3132 		ixa = conn_get_ixa_exclusive(connp);
3133 	} else {
3134 		/*
3135 		 * If no other thread is using conn_ixa this just gets a
3136 		 * reference to conn_ixa. Otherwise we get a safe copy of
3137 		 * conn_ixa.
3138 		 */
3139 		ixa = conn_get_ixa(connp, B_FALSE);
3140 	}
3141 	if (ixa == NULL) {
3142 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3143 		return;
3144 	}
3145 
3146 	/*
3147 	 * In case we got a safe copy of conn_ixa, then we need
3148 	 * to fill in any pointers in it.
3149 	 */
3150 	if (ixa->ixa_ire == NULL) {
3151 		error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3152 		    &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
3153 		    NULL, NULL, 0);
3154 		if (error != 0) {
3155 			if (ixa->ixa_ire != NULL &&
3156 			    (error == EHOSTUNREACH || error == ENETUNREACH)) {
3157 				/*
3158 				 * Let conn_ip_output/ire_send_noroute return
3159 				 * the error and send any local ICMP error.
3160 				 */
3161 				error = 0;
3162 			} else {
3163 				ixa_refrele(ixa);
3164 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3165 				return;
3166 			}
3167 		}
3168 	}
3169 
3170 	iptun_output_common(iptun, ixa, mp);
3171 	ixa_refrele(ixa);
3172 }
3173 
3174 /*
3175  * We use an ixa based on the last destination.
3176  */
3177 static void
iptun_output_6to4(iptun_t * iptun,mblk_t * mp)3178 iptun_output_6to4(iptun_t *iptun, mblk_t *mp)
3179 {
3180 	conn_t		*connp = iptun->iptun_connp;
3181 	ipha_t		*outer4, *inner4;
3182 	ip6_t		*outer6, *inner6;
3183 	ip_xmit_attr_t	*ixa;
3184 	ip_xmit_attr_t	*oldixa;
3185 	int		error;
3186 	boolean_t	need_connect;
3187 	in6_addr_t	v6dst;
3188 
3189 	ASSERT(mp->b_cont == NULL);	/* Verified by iptun_output */
3190 
3191 	/* Make sure we set ipha_dst before we look at ipha_dst */
3192 
3193 	(void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6);
3194 	ASSERT(outer4 != NULL);
3195 	if (!iptun_out_process_6to4(iptun, outer4, inner6)) {
3196 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3197 		return;
3198 	}
3199 
3200 	if (is_system_labeled()) {
3201 		/*
3202 		 * Since the label can be different meaning a potentially
3203 		 * different IRE,we always use a unique ip_xmit_attr_t.
3204 		 */
3205 		ixa = conn_get_ixa_exclusive(connp);
3206 	} else {
3207 		/*
3208 		 * If no other thread is using conn_ixa this just gets a
3209 		 * reference to conn_ixa. Otherwise we get a safe copy of
3210 		 * conn_ixa.
3211 		 */
3212 		ixa = conn_get_ixa(connp, B_FALSE);
3213 	}
3214 	if (ixa == NULL) {
3215 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3216 		return;
3217 	}
3218 
3219 	mutex_enter(&connp->conn_lock);
3220 	if (connp->conn_v4lastdst == outer4->ipha_dst) {
3221 		need_connect = (ixa->ixa_ire == NULL);
3222 	} else {
3223 		/* In case previous destination was multirt */
3224 		ip_attr_newdst(ixa);
3225 
3226 		/*
3227 		 * We later update conn_ixa when we update conn_v4lastdst
3228 		 * which enables subsequent packets to avoid redoing
3229 		 * ip_attr_connect
3230 		 */
3231 		need_connect = B_TRUE;
3232 	}
3233 	mutex_exit(&connp->conn_lock);
3234 
3235 	/*
3236 	 * In case we got a safe copy of conn_ixa, or otherwise we don't
3237 	 * have a current ixa_ire, then we need to fill in any pointers in
3238 	 * the ixa.
3239 	 */
3240 	if (need_connect) {
3241 		IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst);
3242 
3243 		/* We handle IPsec in iptun_output_common */
3244 		error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3245 		    &v6dst, &v6dst, 0, NULL, NULL, 0);
3246 		if (error != 0) {
3247 			if (ixa->ixa_ire != NULL &&
3248 			    (error == EHOSTUNREACH || error == ENETUNREACH)) {
3249 				/*
3250 				 * Let conn_ip_output/ire_send_noroute return
3251 				 * the error and send any local ICMP error.
3252 				 */
3253 				error = 0;
3254 			} else {
3255 				ixa_refrele(ixa);
3256 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3257 				return;
3258 			}
3259 		}
3260 	}
3261 
3262 	iptun_output_common(iptun, ixa, mp);
3263 
3264 	/* Atomically replace conn_ixa and conn_v4lastdst */
3265 	mutex_enter(&connp->conn_lock);
3266 	if (connp->conn_v4lastdst != outer4->ipha_dst) {
3267 		/* Remember the dst which corresponds to conn_ixa */
3268 		connp->conn_v6lastdst = v6dst;
3269 		oldixa = conn_replace_ixa(connp, ixa);
3270 	} else {
3271 		oldixa = NULL;
3272 	}
3273 	mutex_exit(&connp->conn_lock);
3274 	ixa_refrele(ixa);
3275 	if (oldixa != NULL)
3276 		ixa_refrele(oldixa);
3277 }
3278 
3279 /*
3280  * Check the destination/label. Modifies *mpp by adding/removing CIPSO.
3281  *
3282  * We get the label from the message in order to honor the
3283  * ULPs/IPs choice of label. This will be NULL for forwarded
3284  * packets, neighbor discovery packets and some others.
3285  */
3286 static int
iptun_output_check_label(mblk_t ** mpp,ip_xmit_attr_t * ixa)3287 iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa)
3288 {
3289 	cred_t	*cr;
3290 	int	adjust;
3291 	int	iplen;
3292 	int	err;
3293 	ts_label_t *effective_tsl = NULL;
3294 
3295 
3296 	ASSERT(is_system_labeled());
3297 
3298 	cr = msg_getcred(*mpp, NULL);
3299 	if (cr == NULL)
3300 		return (0);
3301 
3302 	/*
3303 	 * We need to start with a label based on the IP/ULP above us
3304 	 */
3305 	ip_xmit_attr_restore_tsl(ixa, cr);
3306 
3307 	/*
3308 	 * Need to update packet with any CIPSO option since
3309 	 * conn_ip_output doesn't do that.
3310 	 */
3311 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
3312 		ipha_t *ipha;
3313 
3314 		ipha = (ipha_t *)(*mpp)->b_rptr;
3315 		iplen = ntohs(ipha->ipha_length);
3316 		err = tsol_check_label_v4(ixa->ixa_tsl,
3317 		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3318 		    ixa->ixa_ipst, &effective_tsl);
3319 		if (err != 0)
3320 			return (err);
3321 
3322 		ipha = (ipha_t *)(*mpp)->b_rptr;
3323 		adjust = (int)ntohs(ipha->ipha_length) - iplen;
3324 	} else {
3325 		ip6_t *ip6h;
3326 
3327 		ip6h = (ip6_t *)(*mpp)->b_rptr;
3328 		iplen = ntohs(ip6h->ip6_plen);
3329 
3330 		err = tsol_check_label_v6(ixa->ixa_tsl,
3331 		    ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3332 		    ixa->ixa_ipst, &effective_tsl);
3333 		if (err != 0)
3334 			return (err);
3335 
3336 		ip6h = (ip6_t *)(*mpp)->b_rptr;
3337 		adjust = (int)ntohs(ip6h->ip6_plen) - iplen;
3338 	}
3339 
3340 	if (effective_tsl != NULL) {
3341 		/* Update the label */
3342 		ip_xmit_attr_replace_tsl(ixa, effective_tsl);
3343 	}
3344 	ixa->ixa_pktlen += adjust;
3345 	ixa->ixa_ip_hdr_length += adjust;
3346 	return (0);
3347 }
3348 
3349 
3350 static void
iptun_output_common(iptun_t * iptun,ip_xmit_attr_t * ixa,mblk_t * mp)3351 iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp)
3352 {
3353 	ipsec_tun_pol_t	*itp = iptun->iptun_itp;
3354 	int		outer_hlen;
3355 	mblk_t		*newmp;
3356 	ipha_t		*outer4, *inner4;
3357 	ip6_t		*outer6, *inner6;
3358 	int		error;
3359 	boolean_t	update_pktlen;
3360 
3361 	ASSERT(ixa->ixa_ire != NULL);
3362 
3363 	outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6,
3364 	    &inner6);
3365 	if (outer_hlen == 0) {
3366 		iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3367 		return;
3368 	}
3369 
3370 	/* Save IXAF_DONTFRAG value */
3371 	iaflags_t dontfrag = ixa->ixa_flags & IXAF_DONTFRAG;
3372 
3373 	/* Perform header processing. */
3374 	if (outer4 != NULL) {
3375 		mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6,
3376 		    ixa);
3377 	} else {
3378 		mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6,
3379 		    ixa);
3380 	}
3381 	if (mp == NULL)
3382 		return;
3383 
3384 	/*
3385 	 * Let's hope the compiler optimizes this with "branch taken".
3386 	 */
3387 	if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
3388 		/* This updates the ip_xmit_attr_t */
3389 		mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
3390 		    outer6, outer_hlen, ixa);
3391 		if (mp == NULL) {
3392 			atomic_inc_64(&iptun->iptun_oerrors);
3393 			return;
3394 		}
3395 		if (is_system_labeled()) {
3396 			/*
3397 			 * Might change the packet by adding/removing CIPSO.
3398 			 * After this caller inner* and outer* and outer_hlen
3399 			 * might be invalid.
3400 			 */
3401 			error = iptun_output_check_label(&mp, ixa);
3402 			if (error != 0) {
3403 				ip2dbg(("label check failed (%d)\n", error));
3404 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3405 				return;
3406 			}
3407 		}
3408 
3409 		/*
3410 		 * ipsec_tun_outbound() returns a chain of tunneled IP
3411 		 * fragments linked with b_next (or a single message if the
3412 		 * tunneled packet wasn't a fragment).
3413 		 * If fragcache returned a list then we need to update
3414 		 * ixa_pktlen for all packets in the list.
3415 		 */
3416 		update_pktlen = (mp->b_next != NULL);
3417 
3418 		/*
3419 		 * Otherwise, we're good to go.  The ixa has been updated with
3420 		 * instructions for outbound IPsec processing.
3421 		 */
3422 		for (newmp = mp; newmp != NULL; newmp = mp) {
3423 			size_t minmtu = iptun->iptun_typeinfo->iti_minmtu;
3424 
3425 			atomic_inc_64(&iptun->iptun_opackets);
3426 			atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3427 			mp = mp->b_next;
3428 			newmp->b_next = NULL;
3429 
3430 			/*
3431 			 * The IXAF_DONTFRAG flag is global, but there is
3432 			 * a chain here.  Check if we're really already
3433 			 * smaller than the minimum allowed MTU and reset here
3434 			 * appropriately.  Otherwise one small packet can kill
3435 			 * the whole chain's path mtu discovery.
3436 			 * In addition, update the pktlen to the length of
3437 			 * the actual packet being processed.
3438 			 */
3439 			if (update_pktlen) {
3440 				ixa->ixa_pktlen = msgdsize(newmp);
3441 				if (ixa->ixa_pktlen <= minmtu)
3442 					ixa->ixa_flags &= ~IXAF_DONTFRAG;
3443 			}
3444 
3445 			atomic_inc_64(&iptun->iptun_opackets);
3446 			atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3447 
3448 			error = conn_ip_output(newmp, ixa);
3449 
3450 			/* Restore IXAF_DONTFRAG value */
3451 			ixa->ixa_flags |= dontfrag;
3452 
3453 			if (error == EMSGSIZE) {
3454 				/* IPsec policy might have changed */
3455 				(void) iptun_update_mtu(iptun, ixa, 0);
3456 			}
3457 		}
3458 	} else {
3459 		/*
3460 		 * The ip module will potentially apply global policy to the
3461 		 * packet in its output path if there's no active tunnel
3462 		 * policy.
3463 		 */
3464 		ASSERT(ixa->ixa_ipsec_policy == NULL);
3465 		mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa);
3466 		if (mp == NULL) {
3467 			atomic_inc_64(&iptun->iptun_oerrors);
3468 			return;
3469 		}
3470 		if (is_system_labeled()) {
3471 			/*
3472 			 * Might change the packet by adding/removing CIPSO.
3473 			 * After this caller inner* and outer* and outer_hlen
3474 			 * might be invalid.
3475 			 */
3476 			error = iptun_output_check_label(&mp, ixa);
3477 			if (error != 0) {
3478 				ip2dbg(("label check failed (%d)\n", error));
3479 				iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3480 				return;
3481 			}
3482 		}
3483 
3484 		atomic_inc_64(&iptun->iptun_opackets);
3485 		atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3486 
3487 		error = conn_ip_output(mp, ixa);
3488 		if (error == EMSGSIZE) {
3489 			/* IPsec policy might have changed */
3490 			(void) iptun_update_mtu(iptun, ixa, 0);
3491 		}
3492 	}
3493 	if (ixa->ixa_flags & IXAF_IPSEC_SECURE)
3494 		ipsec_out_release_refs(ixa);
3495 }
3496 
3497 static mac_callbacks_t iptun_m_callbacks = {
3498 	.mc_callbacks	= (MC_SETPROP | MC_GETPROP | MC_PROPINFO),
3499 	.mc_getstat	= iptun_m_getstat,
3500 	.mc_start	= iptun_m_start,
3501 	.mc_stop	= iptun_m_stop,
3502 	.mc_setpromisc	= iptun_m_setpromisc,
3503 	.mc_multicst	= iptun_m_multicst,
3504 	.mc_unicst	= iptun_m_unicst,
3505 	.mc_tx		= iptun_m_tx,
3506 	.mc_reserved	= NULL,
3507 	.mc_setprop	= iptun_m_setprop,
3508 	.mc_getprop	= iptun_m_getprop,
3509 	.mc_propinfo	= iptun_m_propinfo
3510 };
3511