xref: /illumos-gate/usr/src/uts/common/inet/ip/icmp.c (revision 382c8bca6eeec6112959d16142d3a20406c3ca9b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <sys/types.h>
31 #include <sys/stream.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #include <sys/strsun.h>
35 #define	_SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/timod.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/strsubr.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/kmem.h>
44 #include <sys/policy.h>
45 #include <sys/priv.h>
46 #include <sys/zone.h>
47 #include <sys/time.h>
48 
49 #include <sys/socket.h>
50 #include <sys/isa_defs.h>
51 #include <sys/suntpi.h>
52 #include <sys/xti_inet.h>
53 #include <sys/netstack.h>
54 
55 #include <net/route.h>
56 #include <net/if.h>
57 
58 #include <netinet/in.h>
59 #include <netinet/ip6.h>
60 #include <netinet/icmp6.h>
61 #include <inet/common.h>
62 #include <inet/ip.h>
63 #include <inet/ip6.h>
64 #include <inet/mi.h>
65 #include <inet/nd.h>
66 #include <inet/optcom.h>
67 #include <inet/snmpcom.h>
68 #include <inet/kstatcom.h>
69 #include <inet/rawip_impl.h>
70 
71 #include <netinet/ip_mroute.h>
72 #include <inet/tcp.h>
73 #include <net/pfkeyv2.h>
74 #include <inet/ipsec_info.h>
75 #include <inet/ipclassifier.h>
76 
77 #include <sys/tsol/label.h>
78 #include <sys/tsol/tnet.h>
79 
80 #include <inet/ip_ire.h>
81 #include <inet/ip_if.h>
82 
83 #include <inet/ip_impl.h>
84 
85 /*
86  * Synchronization notes:
87  *
88  * RAWIP is MT and uses the usual kernel synchronization primitives. There is
89  * one lock, icmp_rwlock. We also use conn_lock when updating things
90  * which affect the IP classifier lookup.
91  * The lock order is icmp_rwlock -> conn_lock.
92  *
93  * The icmp_rwlock:
94  * This protects most of the other fields in the icmp_t. The exact list of
95  * fields which are protected by each of the above locks is documented in
96  * the icmp_t structure definition.
97  *
98  * Plumbing notes:
99  * ICMP is always a device driver. For compatibility with mibopen() code
100  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
101  * dummy module.
102  */
103 
104 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
105 static void	icmp_bind(queue_t *q, mblk_t *mp);
106 static void	icmp_bind_proto(queue_t *q);
107 static void	icmp_bind_result(conn_t *, mblk_t *);
108 static void	icmp_bind_ack(conn_t *, mblk_t *mp);
109 static void	icmp_bind_error(conn_t *, mblk_t *mp);
110 static int	icmp_build_hdrs(icmp_t *icmp);
111 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
112 static int	icmp_close(queue_t *q);
113 static void	icmp_connect(queue_t *q, mblk_t *mp);
114 static void	icmp_disconnect(queue_t *q, mblk_t *mp);
115 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
116 		    int sys_error);
117 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
118 		    t_scalar_t t_error, int sys_error);
119 static void	icmp_icmp_error(queue_t *q, mblk_t *mp);
120 static void	icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp);
121 static void	icmp_info_req(queue_t *q, mblk_t *mp);
122 static void	icmp_input(void *, mblk_t *, void *);
123 static mblk_t	*icmp_ip_bind_mp(icmp_t *icmp, t_scalar_t bind_prim,
124 		    t_scalar_t addr_length, in_port_t);
125 static int	icmp_open(queue_t *q, dev_t *devp, int flag, int sflag,
126 		    cred_t *credp, boolean_t isv6);
127 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
128 		    cred_t *credp);
129 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
130 		    cred_t *credp);
131 static void	icmp_output(queue_t *q, mblk_t *mp);
132 static int	icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
133 		    int *errorp, void *thisdg_attrs);
134 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
135 int		icmp_opt_set(queue_t *q, uint_t optset_context,
136 		    int level, int name, uint_t inlen,
137 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
138 		    void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
139 int		icmp_opt_get(queue_t *q, int level, int name,
140 		    uchar_t *ptr);
141 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
142 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
143 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
144 		    caddr_t cp, cred_t *cr);
145 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
146 		    uchar_t *ptr, int len);
147 static int	icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
148 		    cred_t *cr);
149 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
150 static void	icmp_unbind(queue_t *q, mblk_t *mp);
151 static void	icmp_wput(queue_t *q, mblk_t *mp);
152 static void	icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6,
153 		    t_scalar_t tudr_optlen);
154 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
155 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
156 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
157 
158 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
159 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
160 
161 static void	*rawip_kstat_init(netstackid_t stackid);
162 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
163 static int	rawip_kstat_update(kstat_t *kp, int rw);
164 
165 
166 static struct module_info icmp_mod_info =  {
167 	5707, "icmp", 1, INFPSZ, 512, 128
168 };
169 
170 /*
171  * Entry points for ICMP as a device.
172  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
173  */
174 static struct qinit icmprinitv4 = {
175 	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
176 };
177 
178 static struct qinit icmprinitv6 = {
179 	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
180 };
181 
182 static struct qinit icmpwinit = {
183 	(pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
184 };
185 
186 /* For AF_INET aka /dev/icmp */
187 struct streamtab icmpinfov4 = {
188 	&icmprinitv4, &icmpwinit
189 };
190 
191 /* For AF_INET6 aka /dev/icmp6 */
192 struct streamtab icmpinfov6 = {
193 	&icmprinitv6, &icmpwinit
194 };
195 
196 static sin_t	sin_null;	/* Zero address for quick clears */
197 static sin6_t	sin6_null;	/* Zero address for quick clears */
198 
199 /* Default structure copied into T_INFO_ACK messages */
200 static struct T_info_ack icmp_g_t_info_ack = {
201 	T_INFO_ACK,
202 	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
203 	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
204 	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
205 	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
206 	0,		/* ADDR_size - filled in later. */
207 	0,		/* OPT_size - not initialized here */
208 	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
209 	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
210 	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
211 	(XPG4_1|SENDZERO) /* PROVIDER_flag */
212 };
213 
214 /*
215  * Table of ND variables supported by icmp.  These are loaded into is_nd
216  * when the stack instance is created.
217  * All of these are alterable, within the min/max values given, at run time.
218  */
219 static icmpparam_t	icmp_param_arr[] = {
220 	/* min	max	value	name */
221 	{ 0,	128,	32,	"icmp_wroff_extra" },
222 	{ 1,	255,	255,	"icmp_ipv4_ttl" },
223 	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
224 	{ 0,	1,	1,	"icmp_bsd_compat" },
225 	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
226 	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
227 	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
228 	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
229 };
230 #define	is_wroff_extra			is_param_arr[0].icmp_param_value
231 #define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
232 #define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
233 #define	is_bsd_compat			is_param_arr[3].icmp_param_value
234 #define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
235 #define	is_xmit_lowat			is_param_arr[5].icmp_param_value
236 #define	is_recv_hiwat			is_param_arr[6].icmp_param_value
237 #define	is_max_buf			is_param_arr[7].icmp_param_value
238 
239 /*
240  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
241  * passed to icmp_wput.
242  * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
243  * protocol type placed in the message following the address. A T_BIND_ACK
244  * message is returned by ip_bind_v4/v6.
245  */
246 static void
247 icmp_bind(queue_t *q, mblk_t *mp)
248 {
249 	sin_t	*sin;
250 	sin6_t	*sin6;
251 	mblk_t	*mp1;
252 	struct T_bind_req	*tbr;
253 	icmp_t	*icmp;
254 	conn_t	*connp = Q_TO_CONN(q);
255 
256 	icmp = connp->conn_icmp;
257 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
258 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
259 		    "icmp_bind: bad req, len %u",
260 		    (uint_t)(mp->b_wptr - mp->b_rptr));
261 		icmp_err_ack(q, mp, TPROTO, 0);
262 		return;
263 	}
264 	if (icmp->icmp_state != TS_UNBND) {
265 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
266 		    "icmp_bind: bad state, %d", icmp->icmp_state);
267 		icmp_err_ack(q, mp, TOUTSTATE, 0);
268 		return;
269 	}
270 	/*
271 	 * Reallocate the message to make sure we have enough room for an
272 	 * address and the protocol type.
273 	 */
274 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
275 	if (!mp1) {
276 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
277 		return;
278 	}
279 	mp = mp1;
280 	tbr = (struct T_bind_req *)mp->b_rptr;
281 	switch (tbr->ADDR_length) {
282 	case 0:			/* Generic request */
283 		tbr->ADDR_offset = sizeof (struct T_bind_req);
284 		if (icmp->icmp_family == AF_INET) {
285 			tbr->ADDR_length = sizeof (sin_t);
286 			sin = (sin_t *)&tbr[1];
287 			*sin = sin_null;
288 			sin->sin_family = AF_INET;
289 			mp->b_wptr = (uchar_t *)&sin[1];
290 		} else {
291 			ASSERT(icmp->icmp_family == AF_INET6);
292 			tbr->ADDR_length = sizeof (sin6_t);
293 			sin6 = (sin6_t *)&tbr[1];
294 			*sin6 = sin6_null;
295 			sin6->sin6_family = AF_INET6;
296 			mp->b_wptr = (uchar_t *)&sin6[1];
297 		}
298 		break;
299 	case sizeof (sin_t):	/* Complete IP address */
300 		sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
301 		    sizeof (sin_t));
302 		if (sin == NULL || !OK_32PTR((char *)sin)) {
303 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
304 			return;
305 		}
306 		if (icmp->icmp_family != AF_INET ||
307 		    sin->sin_family != AF_INET) {
308 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
309 			return;
310 		}
311 		break;
312 	case sizeof (sin6_t):	/* Complete IP address */
313 		sin6 = (sin6_t *)mi_offset_param(mp, tbr->ADDR_offset,
314 		    sizeof (sin6_t));
315 		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
316 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
317 			return;
318 		}
319 		if (icmp->icmp_family != AF_INET6 ||
320 		    sin6->sin6_family != AF_INET6) {
321 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
322 			return;
323 		}
324 		/* No support for mapped addresses on raw sockets */
325 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
326 			icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL);
327 			return;
328 		}
329 		break;
330 	default:
331 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
332 		    "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
333 		icmp_err_ack(q, mp, TBADADDR, 0);
334 		return;
335 	}
336 
337 	/*
338 	 * The state must be TS_UNBND. TPI mandates that users must send
339 	 * TPI primitives only 1 at a time and wait for the response before
340 	 * sending the next primitive.
341 	 */
342 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
343 	if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
344 		rw_exit(&icmp->icmp_rwlock);
345 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
346 		    "icmp_bind: bad state, %d", icmp->icmp_state);
347 		icmp_err_ack(q, mp, TOUTSTATE, 0);
348 		return;
349 	}
350 
351 	icmp->icmp_pending_op = tbr->PRIM_type;
352 
353 	/*
354 	 * Copy the source address into our icmp structure.  This address
355 	 * may still be zero; if so, ip will fill in the correct address
356 	 * each time an outbound packet is passed to it.
357 	 * If we are binding to a broadcast or multicast address then
358 	 * icmp_bind_ack will clear the source address when it receives
359 	 * the T_BIND_ACK.
360 	 */
361 	icmp->icmp_state = TS_IDLE;
362 
363 	if (icmp->icmp_family == AF_INET) {
364 		ASSERT(sin != NULL);
365 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
366 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
367 		    &icmp->icmp_v6src);
368 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
369 		    icmp->icmp_ip_snd_options_len;
370 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
371 	} else {
372 		int error;
373 
374 		ASSERT(sin6 != NULL);
375 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
376 		icmp->icmp_v6src = sin6->sin6_addr;
377 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
378 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
379 
380 		/* Rebuild the header template */
381 		error = icmp_build_hdrs(icmp);
382 		if (error != 0) {
383 			icmp->icmp_pending_op = -1;
384 			rw_exit(&icmp->icmp_rwlock);
385 			icmp_err_ack(q, mp, TSYSERR, error);
386 			return;
387 		}
388 	}
389 	/*
390 	 * Place protocol type in the O_T_BIND_REQ/T_BIND_REQ following
391 	 * the address.
392 	 */
393 	*mp->b_wptr++ = icmp->icmp_proto;
394 	if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
395 		/*
396 		 * Append a request for an IRE if src not 0 (INADDR_ANY)
397 		 */
398 		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
399 		if (!mp->b_cont) {
400 			icmp->icmp_pending_op = -1;
401 			rw_exit(&icmp->icmp_rwlock);
402 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
403 			return;
404 		}
405 		mp->b_cont->b_wptr += sizeof (ire_t);
406 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
407 	}
408 	rw_exit(&icmp->icmp_rwlock);
409 
410 	/* Pass the O_T_BIND_REQ/T_BIND_REQ to ip. */
411 	if (icmp->icmp_family == AF_INET6)
412 		mp = ip_bind_v6(q, mp, connp, NULL);
413 	else
414 		mp = ip_bind_v4(q, mp, connp);
415 
416 	/* The above return NULL if the bind needs to be deferred */
417 	if (mp != NULL)
418 		icmp_bind_result(connp, mp);
419 	else
420 		CONN_INC_REF(connp);
421 }
422 
423 /*
424  * Send message to IP to just bind to the protocol.
425  */
426 static void
427 icmp_bind_proto(queue_t *q)
428 {
429 	mblk_t	*mp;
430 	struct T_bind_req	*tbr;
431 	icmp_t	*icmp;
432 	conn_t	*connp = Q_TO_CONN(q);
433 
434 	icmp = connp->conn_icmp;
435 
436 	mp = allocb(sizeof (struct T_bind_req) + sizeof (sin6_t) + 1,
437 	    BPRI_MED);
438 	if (!mp) {
439 		return;
440 	}
441 	mp->b_datap->db_type = M_PROTO;
442 	tbr = (struct T_bind_req *)mp->b_rptr;
443 	tbr->PRIM_type = O_T_BIND_REQ; /* change to T_BIND_REQ ? */
444 	tbr->ADDR_offset = sizeof (struct T_bind_req);
445 
446 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
447 	if (icmp->icmp_ipversion == IPV4_VERSION) {
448 		sin_t	*sin;
449 
450 		tbr->ADDR_length = sizeof (sin_t);
451 		sin = (sin_t *)&tbr[1];
452 		*sin = sin_null;
453 		sin->sin_family = AF_INET;
454 		mp->b_wptr = (uchar_t *)&sin[1];
455 	} else {
456 		sin6_t	*sin6;
457 
458 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
459 		tbr->ADDR_length = sizeof (sin6_t);
460 		sin6 = (sin6_t *)&tbr[1];
461 		*sin6 = sin6_null;
462 		sin6->sin6_family = AF_INET6;
463 		mp->b_wptr = (uchar_t *)&sin6[1];
464 	}
465 
466 	/* Place protocol type in the O_T_BIND_REQ following the address. */
467 	*mp->b_wptr++ = icmp->icmp_proto;
468 	rw_exit(&icmp->icmp_rwlock);
469 
470 	/* Pass the O_T_BIND_REQ to ip. */
471 	if (icmp->icmp_family == AF_INET6)
472 		mp = ip_bind_v6(q, mp, connp, NULL);
473 	else
474 		mp = ip_bind_v4(q, mp, connp);
475 
476 	/* The above return NULL if the bind needs to be deferred */
477 	if (mp != NULL)
478 		icmp_bind_result(connp, mp);
479 	else
480 		CONN_INC_REF(connp);
481 }
482 
483 /*
484  * This is called from ip_wput_nondata to handle the results of a
485  * deferred RAWIP bind.  It is called once the bind has been completed.
486  */
487 void
488 rawip_resume_bind(conn_t *connp, mblk_t *mp)
489 {
490 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
491 
492 	icmp_bind_result(connp, mp);
493 
494 	CONN_OPER_PENDING_DONE(connp);
495 }
496 
497 /*
498  * This routine handles each T_CONN_REQ message passed to icmp.  It
499  * associates a default destination address with the stream.
500  *
501  * This routine sends down a T_BIND_REQ to IP with the following mblks:
502  *	T_BIND_REQ	- specifying local and remote address.
503  *	IRE_DB_REQ_TYPE	- to get an IRE back containing ire_type and src
504  *	T_OK_ACK	- for the T_CONN_REQ
505  *	T_CONN_CON	- to keep the TPI user happy
506  *
507  * The connect completes in icmp_bind_result.
508  * When a T_BIND_ACK is received information is extracted from the IRE
509  * and the two appended messages are sent to the TPI user.
510  * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
511  * convert it to an error ack for the appropriate primitive.
512  */
513 static void
514 icmp_connect(queue_t *q, mblk_t *mp)
515 {
516 	sin_t	*sin;
517 	sin6_t	*sin6;
518 	mblk_t	*mp1, *mp2;
519 	struct T_conn_req	*tcr;
520 	icmp_t	*icmp;
521 	ipaddr_t	v4dst;
522 	in6_addr_t	v6dst;
523 	uint32_t	flowinfo;
524 	conn_t	*connp = Q_TO_CONN(q);
525 
526 	icmp = connp->conn_icmp;
527 	tcr = (struct T_conn_req *)mp->b_rptr;
528 	/* Sanity checks */
529 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
530 		icmp_err_ack(q, mp, TPROTO, 0);
531 		return;
532 	}
533 
534 	if (tcr->OPT_length != 0) {
535 		icmp_err_ack(q, mp, TBADOPT, 0);
536 		return;
537 	}
538 
539 	switch (tcr->DEST_length) {
540 	default:
541 		icmp_err_ack(q, mp, TBADADDR, 0);
542 		return;
543 
544 	case sizeof (sin_t):
545 		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
546 		    sizeof (sin_t));
547 		if (sin == NULL || !OK_32PTR((char *)sin)) {
548 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
549 			return;
550 		}
551 		if (icmp->icmp_family != AF_INET ||
552 		    sin->sin_family != AF_INET) {
553 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
554 			return;
555 		}
556 		v4dst = sin->sin_addr.s_addr;
557 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
558 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
559 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
560 		    icmp->icmp_ip_snd_options_len;
561 		break;
562 
563 	case sizeof (sin6_t):
564 		sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
565 		    sizeof (sin6_t));
566 		if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
567 			icmp_err_ack(q, mp, TSYSERR, EINVAL);
568 			return;
569 		}
570 		if (icmp->icmp_family != AF_INET6 ||
571 		    sin6->sin6_family != AF_INET6) {
572 			icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
573 			return;
574 		}
575 		/* No support for mapped addresses on raw sockets */
576 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
577 			icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL);
578 			return;
579 		}
580 		v6dst = sin6->sin6_addr;
581 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
582 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
583 		flowinfo = sin6->sin6_flowinfo;
584 		break;
585 	}
586 	if (icmp->icmp_ipversion == IPV4_VERSION) {
587 		/*
588 		 * Interpret a zero destination to mean loopback.
589 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
590 		 * generate the T_CONN_CON.
591 		 */
592 		if (v4dst == INADDR_ANY) {
593 			v4dst = htonl(INADDR_LOOPBACK);
594 			IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
595 			if (icmp->icmp_family == AF_INET) {
596 				sin->sin_addr.s_addr = v4dst;
597 			} else {
598 				sin6->sin6_addr = v6dst;
599 			}
600 		}
601 		icmp->icmp_v6dst = v6dst;
602 		icmp->icmp_flowinfo = 0;
603 
604 		/*
605 		 * If the destination address is multicast and
606 		 * an outgoing multicast interface has been set,
607 		 * use the address of that interface as our
608 		 * source address if no source address has been set.
609 		 */
610 		if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
611 		    CLASSD(v4dst) &&
612 		    icmp->icmp_multicast_if_addr != INADDR_ANY) {
613 			IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
614 			    &icmp->icmp_v6src);
615 		}
616 	} else {
617 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
618 		/*
619 		 * Interpret a zero destination to mean loopback.
620 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
621 		 * generate the T_CONN_CON.
622 		 */
623 		if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
624 			v6dst = ipv6_loopback;
625 			sin6->sin6_addr = v6dst;
626 		}
627 		icmp->icmp_v6dst = v6dst;
628 		icmp->icmp_flowinfo = flowinfo;
629 		/*
630 		 * If the destination address is multicast and
631 		 * an outgoing multicast interface has been set,
632 		 * then the ip bind logic will pick the correct source
633 		 * address (i.e. matching the outgoing multicast interface).
634 		 */
635 	}
636 
637 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
638 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
639 		rw_exit(&icmp->icmp_rwlock);
640 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
641 		    "icmp_connect: bad state, %d", icmp->icmp_state);
642 		icmp_err_ack(q, mp, TOUTSTATE, 0);
643 		return;
644 	}
645 	icmp->icmp_pending_op = T_CONN_REQ;
646 
647 	if (icmp->icmp_state == TS_DATA_XFER) {
648 		/* Already connected - clear out state */
649 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
650 		icmp->icmp_state = TS_IDLE;
651 	}
652 
653 	/*
654 	 * Send down bind to IP to verify that there is a route
655 	 * and to determine the source address.
656 	 * This will come back as T_BIND_ACK with an IRE_DB_TYPE in rput.
657 	 */
658 	if (icmp->icmp_family == AF_INET) {
659 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa_conn_t),
660 		    sin->sin_port);
661 	} else {
662 		ASSERT(icmp->icmp_family == AF_INET6);
663 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa6_conn_t),
664 		    sin6->sin6_port);
665 	}
666 	if (mp1 == NULL) {
667 		icmp->icmp_pending_op = -1;
668 		rw_exit(&icmp->icmp_rwlock);
669 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
670 		return;
671 	}
672 
673 	/*
674 	 * We also have to send a connection confirmation to
675 	 * keep TLI happy. Prepare it for icmp_bind_result.
676 	 */
677 	if (icmp->icmp_family == AF_INET) {
678 		mp2 = mi_tpi_conn_con(NULL, (char *)sin, sizeof (*sin), NULL,
679 		    0);
680 	} else {
681 		ASSERT(icmp->icmp_family == AF_INET6);
682 		mp2 = mi_tpi_conn_con(NULL, (char *)sin6, sizeof (*sin6), NULL,
683 		    0);
684 	}
685 	if (mp2 == NULL) {
686 		freemsg(mp1);
687 		icmp->icmp_pending_op = -1;
688 		rw_exit(&icmp->icmp_rwlock);
689 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
690 		return;
691 	}
692 
693 	mp = mi_tpi_ok_ack_alloc(mp);
694 	if (mp == NULL) {
695 		/* Unable to reuse the T_CONN_REQ for the ack. */
696 		freemsg(mp2);
697 		icmp->icmp_pending_op = -1;
698 		rw_exit(&icmp->icmp_rwlock);
699 		icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
700 		return;
701 	}
702 
703 	icmp->icmp_state = TS_DATA_XFER;
704 	rw_exit(&icmp->icmp_rwlock);
705 
706 	/* Hang onto the T_OK_ACK and T_CONN_CON for later. */
707 	linkb(mp1, mp);
708 	linkb(mp1, mp2);
709 
710 	mblk_setcred(mp1, connp->conn_cred);
711 	if (icmp->icmp_family == AF_INET)
712 		mp1 = ip_bind_v4(q, mp1, connp);
713 	else
714 		mp1 = ip_bind_v6(q, mp1, connp, NULL);
715 
716 	/* The above return NULL if the bind needs to be deferred */
717 	if (mp1 != NULL)
718 		icmp_bind_result(connp, mp1);
719 	else
720 		CONN_INC_REF(connp);
721 }
722 
723 static void
724 icmp_close_free(conn_t *connp)
725 {
726 	icmp_t *icmp = connp->conn_icmp;
727 
728 	/* If there are any options associated with the stream, free them. */
729 	if (icmp->icmp_ip_snd_options)
730 		mi_free((char *)icmp->icmp_ip_snd_options);
731 
732 	if (icmp->icmp_filter != NULL)
733 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
734 
735 	/* Free memory associated with sticky options */
736 	if (icmp->icmp_sticky_hdrs_len != 0) {
737 		kmem_free(icmp->icmp_sticky_hdrs,
738 		    icmp->icmp_sticky_hdrs_len);
739 		icmp->icmp_sticky_hdrs = NULL;
740 		icmp->icmp_sticky_hdrs_len = 0;
741 	}
742 	ip6_pkt_free(&icmp->icmp_sticky_ipp);
743 }
744 
745 static int
746 icmp_close(queue_t *q)
747 {
748 	conn_t	*connp = (conn_t *)q->q_ptr;
749 
750 	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
751 
752 	ip_quiesce_conn(connp);
753 
754 	qprocsoff(connp->conn_rq);
755 
756 	icmp_close_free(connp);
757 
758 	/*
759 	 * Now we are truly single threaded on this stream, and can
760 	 * delete the things hanging off the connp, and finally the connp.
761 	 * We removed this connp from the fanout list, it cannot be
762 	 * accessed thru the fanouts, and we already waited for the
763 	 * conn_ref to drop to 0. We are already in close, so
764 	 * there cannot be any other thread from the top. qprocsoff
765 	 * has completed, and service has completed or won't run in
766 	 * future.
767 	 */
768 	ASSERT(connp->conn_ref == 1);
769 
770 	inet_minor_free(ip_minor_arena, connp->conn_dev);
771 
772 	connp->conn_ref--;
773 	ipcl_conn_destroy(connp);
774 
775 	q->q_ptr = WR(q)->q_ptr = NULL;
776 	return (0);
777 }
778 
779 /*
780  * This routine handles each T_DISCON_REQ message passed to icmp
781  * as an indicating that ICMP is no longer connected. This results
782  * in sending a T_BIND_REQ to IP to restore the binding to just
783  * the local address.
784  *
785  * This routine sends down a T_BIND_REQ to IP with the following mblks:
786  *	T_BIND_REQ	- specifying just the local address.
787  *	T_OK_ACK	- for the T_DISCON_REQ
788  *
789  * The disconnect completes in icmp_bind_result.
790  * When a T_BIND_ACK is received the appended T_OK_ACK is sent to the TPI user.
791  * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
792  * convert it to an error ack for the appropriate primitive.
793  */
794 static void
795 icmp_disconnect(queue_t *q, mblk_t *mp)
796 {
797 	icmp_t	*icmp;
798 	mblk_t	*mp1;
799 	conn_t	*connp = Q_TO_CONN(q);
800 
801 	icmp = connp->conn_icmp;
802 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
803 	if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
804 		rw_exit(&icmp->icmp_rwlock);
805 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
806 		    "icmp_disconnect: bad state, %d", icmp->icmp_state);
807 		icmp_err_ack(q, mp, TOUTSTATE, 0);
808 		return;
809 	}
810 	icmp->icmp_pending_op = T_DISCON_REQ;
811 	icmp->icmp_v6src = icmp->icmp_bound_v6src;
812 	icmp->icmp_state = TS_IDLE;
813 
814 	/*
815 	 * Send down bind to IP to remove the full binding and revert
816 	 * to the local address binding.
817 	 */
818 	if (icmp->icmp_family == AF_INET) {
819 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin_t), 0);
820 	} else {
821 		ASSERT(icmp->icmp_family == AF_INET6);
822 		mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin6_t), 0);
823 	}
824 	if (mp1 == NULL) {
825 		icmp->icmp_pending_op = -1;
826 		rw_exit(&icmp->icmp_rwlock);
827 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
828 		return;
829 	}
830 	mp = mi_tpi_ok_ack_alloc(mp);
831 	if (mp == NULL) {
832 		/* Unable to reuse the T_DISCON_REQ for the ack. */
833 		icmp->icmp_pending_op = -1;
834 		rw_exit(&icmp->icmp_rwlock);
835 		icmp_err_ack_prim(q, mp1, T_DISCON_REQ, TSYSERR, ENOMEM);
836 		return;
837 	}
838 
839 	if (icmp->icmp_family == AF_INET6) {
840 		int error;
841 
842 		/* Rebuild the header template */
843 		error = icmp_build_hdrs(icmp);
844 		if (error != 0) {
845 			icmp->icmp_pending_op = -1;
846 			rw_exit(&icmp->icmp_rwlock);
847 			icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, error);
848 			freemsg(mp1);
849 			return;
850 		}
851 	}
852 
853 	rw_exit(&icmp->icmp_rwlock);
854 	/* Append the T_OK_ACK to the T_BIND_REQ for icmp_bind_result */
855 	linkb(mp1, mp);
856 
857 	if (icmp->icmp_family == AF_INET6)
858 		mp1 = ip_bind_v6(q, mp1, connp, NULL);
859 	else
860 		mp1 = ip_bind_v4(q, mp1, connp);
861 
862 	/* The above return NULL if the bind needs to be deferred */
863 	if (mp1 != NULL)
864 		icmp_bind_result(connp, mp1);
865 	else
866 		CONN_INC_REF(connp);
867 }
868 
869 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
870 static void
871 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
872 {
873 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
874 		qreply(q, mp);
875 }
876 
877 /* Shorthand to generate and send TPI error acks to our client */
878 static void
879 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
880     t_scalar_t t_error, int sys_error)
881 {
882 	struct T_error_ack	*teackp;
883 
884 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
885 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
886 		teackp = (struct T_error_ack *)mp->b_rptr;
887 		teackp->ERROR_prim = primitive;
888 		teackp->TLI_error = t_error;
889 		teackp->UNIX_error = sys_error;
890 		qreply(q, mp);
891 	}
892 }
893 
894 /*
895  * icmp_icmp_error is called by icmp_input to process ICMP
896  * messages passed up by IP.
897  * Generates the appropriate T_UDERROR_IND for permanent
898  * (non-transient) errors.
899  * Assumes that IP has pulled up everything up to and including
900  * the ICMP header.
901  */
902 static void
903 icmp_icmp_error(queue_t *q, mblk_t *mp)
904 {
905 	icmph_t *icmph;
906 	ipha_t	*ipha;
907 	int	iph_hdr_length;
908 	sin_t	sin;
909 	sin6_t	sin6;
910 	mblk_t	*mp1;
911 	int	error = 0;
912 	icmp_t	*icmp = Q_TO_ICMP(q);
913 
914 	ipha = (ipha_t *)mp->b_rptr;
915 
916 	ASSERT(OK_32PTR(mp->b_rptr));
917 
918 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
919 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
920 		icmp_icmp_error_ipv6(q, mp);
921 		return;
922 	}
923 	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
924 
925 	/* Skip past the outer IP and ICMP headers */
926 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
927 	icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
928 	ipha = (ipha_t *)&icmph[1];
929 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
930 
931 	switch (icmph->icmph_type) {
932 	case ICMP_DEST_UNREACHABLE:
933 		switch (icmph->icmph_code) {
934 		case ICMP_FRAGMENTATION_NEEDED:
935 			/*
936 			 * IP has already adjusted the path MTU.
937 			 */
938 			break;
939 		case ICMP_PORT_UNREACHABLE:
940 		case ICMP_PROTOCOL_UNREACHABLE:
941 			error = ECONNREFUSED;
942 			break;
943 		default:
944 			/* Transient errors */
945 			break;
946 		}
947 		break;
948 	default:
949 		/* Transient errors */
950 		break;
951 	}
952 	if (error == 0) {
953 		freemsg(mp);
954 		return;
955 	}
956 
957 	/*
958 	 * Deliver T_UDERROR_IND when the application has asked for it.
959 	 * The socket layer enables this automatically when connected.
960 	 */
961 	if (!icmp->icmp_dgram_errind) {
962 		freemsg(mp);
963 		return;
964 	}
965 
966 	switch (icmp->icmp_family) {
967 	case AF_INET:
968 		sin = sin_null;
969 		sin.sin_family = AF_INET;
970 		sin.sin_addr.s_addr = ipha->ipha_dst;
971 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
972 		    error);
973 		break;
974 	case AF_INET6:
975 		sin6 = sin6_null;
976 		sin6.sin6_family = AF_INET6;
977 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr);
978 
979 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
980 		    NULL, 0, error);
981 		break;
982 	}
983 	if (mp1)
984 		putnext(q, mp1);
985 	freemsg(mp);
986 }
987 
988 /*
989  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6
990  * for IPv6 packets.
991  * Send permanent (non-transient) errors upstream.
992  * Assumes that IP has pulled up all the extension headers as well
993  * as the ICMPv6 header.
994  */
995 static void
996 icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
997 {
998 	icmp6_t		*icmp6;
999 	ip6_t		*ip6h, *outer_ip6h;
1000 	uint16_t	iph_hdr_length;
1001 	uint8_t		*nexthdrp;
1002 	sin6_t		sin6;
1003 	mblk_t		*mp1;
1004 	int		error = 0;
1005 	icmp_t		*icmp = Q_TO_ICMP(q);
1006 
1007 	outer_ip6h = (ip6_t *)mp->b_rptr;
1008 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1009 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1010 	else
1011 		iph_hdr_length = IPV6_HDR_LEN;
1012 
1013 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1014 	ip6h = (ip6_t *)&icmp6[1];
1015 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1016 		freemsg(mp);
1017 		return;
1018 	}
1019 
1020 	switch (icmp6->icmp6_type) {
1021 	case ICMP6_DST_UNREACH:
1022 		switch (icmp6->icmp6_code) {
1023 		case ICMP6_DST_UNREACH_NOPORT:
1024 			error = ECONNREFUSED;
1025 			break;
1026 		case ICMP6_DST_UNREACH_ADMIN:
1027 		case ICMP6_DST_UNREACH_NOROUTE:
1028 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1029 		case ICMP6_DST_UNREACH_ADDR:
1030 			/* Transient errors */
1031 			break;
1032 		default:
1033 			break;
1034 		}
1035 		break;
1036 	case ICMP6_PACKET_TOO_BIG: {
1037 		struct T_unitdata_ind	*tudi;
1038 		struct T_opthdr		*toh;
1039 		size_t			udi_size;
1040 		mblk_t			*newmp;
1041 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1042 		    sizeof (struct ip6_mtuinfo);
1043 		sin6_t			*sin6;
1044 		struct ip6_mtuinfo	*mtuinfo;
1045 
1046 		/*
1047 		 * If the application has requested to receive path mtu
1048 		 * information, send up an empty message containing an
1049 		 * IPV6_PATHMTU ancillary data item.
1050 		 */
1051 		if (!icmp->icmp_ipv6_recvpathmtu)
1052 			break;
1053 
1054 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1055 		    opt_length;
1056 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1057 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1058 			break;
1059 		}
1060 
1061 		/*
1062 		 * newmp->b_cont is left to NULL on purpose.  This is an
1063 		 * empty message containing only ancillary data.
1064 		 */
1065 		newmp->b_datap->db_type = M_PROTO;
1066 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1067 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1068 		tudi->PRIM_type = T_UNITDATA_IND;
1069 		tudi->SRC_length = sizeof (sin6_t);
1070 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1071 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1072 		tudi->OPT_length = opt_length;
1073 
1074 		sin6 = (sin6_t *)&tudi[1];
1075 		bzero(sin6, sizeof (sin6_t));
1076 		sin6->sin6_family = AF_INET6;
1077 		sin6->sin6_addr = icmp->icmp_v6dst;
1078 
1079 		toh = (struct T_opthdr *)&sin6[1];
1080 		toh->level = IPPROTO_IPV6;
1081 		toh->name = IPV6_PATHMTU;
1082 		toh->len = opt_length;
1083 		toh->status = 0;
1084 
1085 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1086 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1087 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1088 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1089 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1090 		/*
1091 		 * We've consumed everything we need from the original
1092 		 * message.  Free it, then send our empty message.
1093 		 */
1094 		freemsg(mp);
1095 		putnext(q, newmp);
1096 		return;
1097 	}
1098 	case ICMP6_TIME_EXCEEDED:
1099 		/* Transient errors */
1100 		break;
1101 	case ICMP6_PARAM_PROB:
1102 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1103 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1104 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1105 		    (uchar_t *)nexthdrp) {
1106 			error = ECONNREFUSED;
1107 			break;
1108 		}
1109 		break;
1110 	}
1111 	if (error == 0) {
1112 		freemsg(mp);
1113 		return;
1114 	}
1115 
1116 	/*
1117 	 * Deliver T_UDERROR_IND when the application has asked for it.
1118 	 * The socket layer enables this automatically when connected.
1119 	 */
1120 	if (!icmp->icmp_dgram_errind) {
1121 		freemsg(mp);
1122 		return;
1123 	}
1124 
1125 	sin6 = sin6_null;
1126 	sin6.sin6_family = AF_INET6;
1127 	sin6.sin6_addr = ip6h->ip6_dst;
1128 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1129 
1130 	mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0,
1131 	    error);
1132 	if (mp1)
1133 		putnext(q, mp1);
1134 	freemsg(mp);
1135 }
1136 
1137 /*
1138  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1139  * The local address is filled in if endpoint is bound. The remote address
1140  * is filled in if remote address has been precified ("connected endpoint")
1141  * (The concept of connected CLTS sockets is alien to published TPI
1142  *  but we support it anyway).
1143  */
1144 static void
1145 icmp_addr_req(queue_t *q, mblk_t *mp)
1146 {
1147 	icmp_t	*icmp = Q_TO_ICMP(q);
1148 	mblk_t	*ackmp;
1149 	struct T_addr_ack *taa;
1150 
1151 	/* Make it large enough for worst case */
1152 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1153 	    2 * sizeof (sin6_t), 1);
1154 	if (ackmp == NULL) {
1155 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1156 		return;
1157 	}
1158 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1159 
1160 	bzero(taa, sizeof (struct T_addr_ack));
1161 	ackmp->b_wptr = (uchar_t *)&taa[1];
1162 
1163 	taa->PRIM_type = T_ADDR_ACK;
1164 	ackmp->b_datap->db_type = M_PCPROTO;
1165 	rw_enter(&icmp->icmp_rwlock, RW_READER);
1166 	/*
1167 	 * Note: Following code assumes 32 bit alignment of basic
1168 	 * data structures like sin_t and struct T_addr_ack.
1169 	 */
1170 	if (icmp->icmp_state != TS_UNBND) {
1171 		/*
1172 		 * Fill in local address
1173 		 */
1174 		taa->LOCADDR_offset = sizeof (*taa);
1175 		if (icmp->icmp_family == AF_INET) {
1176 			sin_t	*sin;
1177 
1178 			taa->LOCADDR_length = sizeof (sin_t);
1179 			sin = (sin_t *)&taa[1];
1180 			/* Fill zeroes and then intialize non-zero fields */
1181 			*sin = sin_null;
1182 			sin->sin_family = AF_INET;
1183 			if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
1184 			    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1185 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src,
1186 				    sin->sin_addr.s_addr);
1187 			} else {
1188 				/*
1189 				 * INADDR_ANY
1190 				 * icmp_v6src is not set, we might be bound to
1191 				 * broadcast/multicast. Use icmp_bound_v6src as
1192 				 * local address instead (that could
1193 				 * also still be INADDR_ANY)
1194 				 */
1195 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src,
1196 				    sin->sin_addr.s_addr);
1197 			}
1198 			ackmp->b_wptr = (uchar_t *)&sin[1];
1199 		} else {
1200 			sin6_t	*sin6;
1201 
1202 			ASSERT(icmp->icmp_family == AF_INET6);
1203 			taa->LOCADDR_length = sizeof (sin6_t);
1204 			sin6 = (sin6_t *)&taa[1];
1205 			/* Fill zeroes and then intialize non-zero fields */
1206 			*sin6 = sin6_null;
1207 			sin6->sin6_family = AF_INET6;
1208 			if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1209 				sin6->sin6_addr = icmp->icmp_v6src;
1210 			} else {
1211 				/*
1212 				 * UNSPECIFIED
1213 				 * icmp_v6src is not set, we might be bound to
1214 				 * broadcast/multicast. Use icmp_bound_v6src as
1215 				 * local address instead (that could
1216 				 * also still be UNSPECIFIED)
1217 				 */
1218 				sin6->sin6_addr = icmp->icmp_bound_v6src;
1219 			}
1220 			ackmp->b_wptr = (uchar_t *)&sin6[1];
1221 		}
1222 	}
1223 	rw_exit(&icmp->icmp_rwlock);
1224 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1225 	qreply(q, ackmp);
1226 }
1227 
1228 static void
1229 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1230 {
1231 	*tap = icmp_g_t_info_ack;
1232 
1233 	if (icmp->icmp_family == AF_INET6)
1234 		tap->ADDR_size = sizeof (sin6_t);
1235 	else
1236 		tap->ADDR_size = sizeof (sin_t);
1237 	tap->CURRENT_state = icmp->icmp_state;
1238 	tap->OPT_size = icmp_max_optsize;
1239 }
1240 
1241 /*
1242  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1243  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1244  * icmp_g_t_info_ack.  The current state of the stream is copied from
1245  * icmp_state.
1246  */
1247 static void
1248 icmp_capability_req(queue_t *q, mblk_t *mp)
1249 {
1250 	icmp_t			*icmp = Q_TO_ICMP(q);
1251 	t_uscalar_t		cap_bits1;
1252 	struct T_capability_ack	*tcap;
1253 
1254 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1255 
1256 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1257 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1258 	if (!mp)
1259 		return;
1260 
1261 	tcap = (struct T_capability_ack *)mp->b_rptr;
1262 	tcap->CAP_bits1 = 0;
1263 
1264 	if (cap_bits1 & TC1_INFO) {
1265 		icmp_copy_info(&tcap->INFO_ack, icmp);
1266 		tcap->CAP_bits1 |= TC1_INFO;
1267 	}
1268 
1269 	qreply(q, mp);
1270 }
1271 
1272 /*
1273  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1274  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1275  * The current state of the stream is copied from icmp_state.
1276  */
1277 static void
1278 icmp_info_req(queue_t *q, mblk_t *mp)
1279 {
1280 	icmp_t	*icmp = Q_TO_ICMP(q);
1281 
1282 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1283 	    T_INFO_ACK);
1284 	if (!mp)
1285 		return;
1286 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1287 	qreply(q, mp);
1288 }
1289 
1290 /*
1291  * IP recognizes seven kinds of bind requests:
1292  *
1293  * - A zero-length address binds only to the protocol number.
1294  *
1295  * - A 4-byte address is treated as a request to
1296  * validate that the address is a valid local IPv4
1297  * address, appropriate for an application to bind to.
1298  * IP does the verification, but does not make any note
1299  * of the address at this time.
1300  *
1301  * - A 16-byte address contains is treated as a request
1302  * to validate a local IPv6 address, as the 4-byte
1303  * address case above.
1304  *
1305  * - A 16-byte sockaddr_in to validate the local IPv4 address and also
1306  * use it for the inbound fanout of packets.
1307  *
1308  * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
1309  * use it for the inbound fanout of packets.
1310  *
1311  * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
1312  * information consisting of local and remote addresses
1313  * and ports (unused for raw sockets).  In this case, the addresses are both
1314  * validated as appropriate for this operation, and, if
1315  * so, the information is retained for use in the
1316  * inbound fanout.
1317  *
1318  * - A 36-byte address address (ipa6_conn_t) containing complete IPv6
1319  * fanout information, like the 12-byte case above.
1320  *
1321  * IP will also fill in the IRE request mblk with information
1322  * regarding our peer.  In all cases, we notify IP of our protocol
1323  * type by appending a single protocol byte to the bind request.
1324  */
1325 static mblk_t *
1326 icmp_ip_bind_mp(icmp_t *icmp, t_scalar_t bind_prim, t_scalar_t addr_length,
1327     in_port_t fport)
1328 {
1329 	char	*cp;
1330 	mblk_t	*mp;
1331 	struct T_bind_req *tbr;
1332 	ipa_conn_t	*ac;
1333 	ipa6_conn_t	*ac6;
1334 	sin_t		*sin;
1335 	sin6_t		*sin6;
1336 
1337 	ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ);
1338 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
1339 	mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI);
1340 	if (mp == NULL)
1341 		return (NULL);
1342 	mp->b_datap->db_type = M_PROTO;
1343 	tbr = (struct T_bind_req *)mp->b_rptr;
1344 	tbr->PRIM_type = bind_prim;
1345 	tbr->ADDR_offset = sizeof (*tbr);
1346 	tbr->CONIND_number = 0;
1347 	tbr->ADDR_length = addr_length;
1348 	cp = (char *)&tbr[1];
1349 	switch (addr_length) {
1350 	case sizeof (ipa_conn_t):
1351 		ASSERT(icmp->icmp_family == AF_INET);
1352 		/* Append a request for an IRE */
1353 		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
1354 		if (mp->b_cont == NULL) {
1355 			freemsg(mp);
1356 			return (NULL);
1357 		}
1358 		mp->b_cont->b_wptr += sizeof (ire_t);
1359 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
1360 
1361 		/* cp known to be 32 bit aligned */
1362 		ac = (ipa_conn_t *)cp;
1363 		ac->ac_laddr = V4_PART_OF_V6(icmp->icmp_v6src);
1364 		ac->ac_faddr = V4_PART_OF_V6(icmp->icmp_v6dst);
1365 		ac->ac_fport = fport;
1366 		ac->ac_lport = 0;
1367 		break;
1368 
1369 	case sizeof (ipa6_conn_t):
1370 		ASSERT(icmp->icmp_family == AF_INET6);
1371 		/* Append a request for an IRE */
1372 		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
1373 		if (mp->b_cont == NULL) {
1374 			freemsg(mp);
1375 			return (NULL);
1376 		}
1377 		mp->b_cont->b_wptr += sizeof (ire_t);
1378 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
1379 
1380 		/* cp known to be 32 bit aligned */
1381 		ac6 = (ipa6_conn_t *)cp;
1382 		ac6->ac6_laddr = icmp->icmp_v6src;
1383 		ac6->ac6_faddr = icmp->icmp_v6dst;
1384 		ac6->ac6_fport = fport;
1385 		ac6->ac6_lport = 0;
1386 		break;
1387 
1388 	case sizeof (sin_t):
1389 		ASSERT(icmp->icmp_family == AF_INET);
1390 		/* Append a request for an IRE */
1391 		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
1392 		if (!mp->b_cont) {
1393 			freemsg(mp);
1394 			return (NULL);
1395 		}
1396 		mp->b_cont->b_wptr += sizeof (ire_t);
1397 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
1398 
1399 		sin = (sin_t *)cp;
1400 		*sin = sin_null;
1401 		sin->sin_family = AF_INET;
1402 		sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_bound_v6src);
1403 		break;
1404 
1405 	case sizeof (sin6_t):
1406 		ASSERT(icmp->icmp_family == AF_INET6);
1407 		/* Append a request for an IRE */
1408 		mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
1409 		if (!mp->b_cont) {
1410 			freemsg(mp);
1411 			return (NULL);
1412 		}
1413 		mp->b_cont->b_wptr += sizeof (ire_t);
1414 		mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
1415 
1416 		sin6 = (sin6_t *)cp;
1417 		*sin6 = sin6_null;
1418 		sin6->sin6_family = AF_INET6;
1419 		sin6->sin6_addr = icmp->icmp_bound_v6src;
1420 		break;
1421 	}
1422 	/* Add protocol number to end */
1423 	cp[addr_length] = icmp->icmp_proto;
1424 	mp->b_wptr = (uchar_t *)&cp[addr_length + 1];
1425 	return (mp);
1426 }
1427 
1428 /* For /dev/icmp aka AF_INET open */
1429 static int
1430 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1431 {
1432 	return (icmp_open(q, devp, flag, sflag, credp, B_FALSE));
1433 }
1434 
1435 /* For /dev/icmp6 aka AF_INET6 open */
1436 static int
1437 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1438 {
1439 	return (icmp_open(q, devp, flag, sflag, credp, B_TRUE));
1440 }
1441 
1442 /*
1443  * This is the open routine for icmp.  It allocates a icmp_t structure for
1444  * the stream and, on the first open of the module, creates an ND table.
1445  */
1446 /*ARGSUSED2*/
1447 static int
1448 icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1449     boolean_t isv6)
1450 {
1451 	int	err;
1452 	icmp_t	*icmp;
1453 	conn_t *connp;
1454 	dev_t	conn_dev;
1455 	zoneid_t zoneid;
1456 	netstack_t *ns;
1457 	icmp_stack_t *is;
1458 
1459 	/* If the stream is already open, return immediately. */
1460 	if (q->q_ptr != NULL)
1461 		return (0);
1462 
1463 	if (sflag == MODOPEN)
1464 		return (EINVAL);
1465 
1466 	ns = netstack_find_by_cred(credp);
1467 	ASSERT(ns != NULL);
1468 	is = ns->netstack_icmp;
1469 	ASSERT(is != NULL);
1470 
1471 	/*
1472 	 * For exclusive stacks we set the zoneid to zero
1473 	 * to make ICMP operate as if in the global zone.
1474 	 */
1475 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1476 		zoneid = GLOBAL_ZONEID;
1477 	else
1478 		zoneid = crgetzoneid(credp);
1479 
1480 	if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) {
1481 		netstack_rele(ns);
1482 		return (EBUSY);
1483 	}
1484 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1485 
1486 	connp = ipcl_conn_create(IPCL_RAWIPCONN, KM_SLEEP, ns);
1487 	connp->conn_dev = conn_dev;
1488 	icmp = connp->conn_icmp;
1489 
1490 	/*
1491 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1492 	 * done by netstack_find_by_cred()
1493 	 */
1494 	netstack_rele(ns);
1495 
1496 	/*
1497 	 * Initialize the icmp_t structure for this stream.
1498 	 */
1499 	q->q_ptr = connp;
1500 	WR(q)->q_ptr = connp;
1501 	connp->conn_rq = q;
1502 	connp->conn_wq = WR(q);
1503 
1504 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1505 	ASSERT(connp->conn_ulp == IPPROTO_ICMP);
1506 	ASSERT(connp->conn_icmp == icmp);
1507 	ASSERT(icmp->icmp_connp == connp);
1508 
1509 	/* Set the initial state of the stream and the privilege status. */
1510 	icmp->icmp_state = TS_UNBND;
1511 	if (isv6) {
1512 		icmp->icmp_ipversion = IPV6_VERSION;
1513 		icmp->icmp_family = AF_INET6;
1514 		connp->conn_ulp = IPPROTO_ICMPV6;
1515 		/* May be changed by a SO_PROTOTYPE socket option. */
1516 		icmp->icmp_proto = IPPROTO_ICMPV6;
1517 		icmp->icmp_checksum_off = 2;	/* Offset for icmp6_cksum */
1518 		icmp->icmp_max_hdr_len = IPV6_HDR_LEN;
1519 		icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit;
1520 		connp->conn_af_isv6 = B_TRUE;
1521 		connp->conn_flags |= IPCL_ISV6;
1522 	} else {
1523 		icmp->icmp_ipversion = IPV4_VERSION;
1524 		icmp->icmp_family = AF_INET;
1525 		/* May be changed by a SO_PROTOTYPE socket option. */
1526 		icmp->icmp_proto = IPPROTO_ICMP;
1527 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH;
1528 		icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl;
1529 		connp->conn_af_isv6 = B_FALSE;
1530 		connp->conn_flags &= ~IPCL_ISV6;
1531 	}
1532 	icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1533 	icmp->icmp_pending_op = -1;
1534 	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1535 	connp->conn_zoneid = zoneid;
1536 
1537 	/*
1538 	 * If the caller has the process-wide flag set, then default to MAC
1539 	 * exempt mode.  This allows read-down to unlabeled hosts.
1540 	 */
1541 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1542 		icmp->icmp_mac_exempt = B_TRUE;
1543 
1544 	connp->conn_ulp_labeled = is_system_labeled();
1545 
1546 	icmp->icmp_is = is;
1547 
1548 	q->q_hiwat = is->is_recv_hiwat;
1549 	WR(q)->q_hiwat = is->is_xmit_hiwat;
1550 	WR(q)->q_lowat = is->is_xmit_lowat;
1551 
1552 	connp->conn_recv = icmp_input;
1553 	crhold(credp);
1554 	connp->conn_cred = credp;
1555 
1556 	mutex_enter(&connp->conn_lock);
1557 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1558 	mutex_exit(&connp->conn_lock);
1559 
1560 	qprocson(q);
1561 
1562 	if (icmp->icmp_family == AF_INET6) {
1563 		/* Build initial header template for transmit */
1564 		if ((err = icmp_build_hdrs(icmp)) != 0) {
1565 			rw_exit(&icmp->icmp_rwlock);
1566 			qprocsoff(q);
1567 			ipcl_conn_destroy(connp);
1568 			return (err);
1569 		}
1570 	}
1571 	rw_exit(&icmp->icmp_rwlock);
1572 
1573 	/* Set the Stream head write offset. */
1574 	(void) mi_set_sth_wroff(q,
1575 	    icmp->icmp_max_hdr_len + is->is_wroff_extra);
1576 	(void) mi_set_sth_hiwat(q, q->q_hiwat);
1577 
1578 	return (0);
1579 }
1580 
1581 /*
1582  * Which ICMP options OK to set through T_UNITDATA_REQ...
1583  */
1584 /* ARGSUSED */
1585 static boolean_t
1586 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1587 {
1588 	return (B_TRUE);
1589 }
1590 
1591 /*
1592  * This routine gets default values of certain options whose default
1593  * values are maintained by protcol specific code
1594  */
1595 /* ARGSUSED */
1596 int
1597 icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
1598 {
1599 	icmp_t *icmp = Q_TO_ICMP(q);
1600 	icmp_stack_t *is = icmp->icmp_is;
1601 	int *i1 = (int *)ptr;
1602 
1603 	switch (level) {
1604 	case IPPROTO_IP:
1605 		switch (name) {
1606 		case IP_MULTICAST_TTL:
1607 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1608 			return (sizeof (uchar_t));
1609 		case IP_MULTICAST_LOOP:
1610 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1611 			return (sizeof (uchar_t));
1612 		}
1613 		break;
1614 	case IPPROTO_IPV6:
1615 		switch (name) {
1616 		case IPV6_MULTICAST_HOPS:
1617 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1618 			return (sizeof (int));
1619 		case IPV6_MULTICAST_LOOP:
1620 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1621 			return (sizeof (int));
1622 		case IPV6_UNICAST_HOPS:
1623 			*i1 = is->is_ipv6_hoplimit;
1624 			return (sizeof (int));
1625 		}
1626 		break;
1627 	case IPPROTO_ICMPV6:
1628 		switch (name) {
1629 		case ICMP6_FILTER:
1630 			/* Make it look like "pass all" */
1631 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1632 			return (sizeof (icmp6_filter_t));
1633 		}
1634 		break;
1635 	}
1636 	return (-1);
1637 }
1638 
1639 /*
1640  * This routine retrieves the current status of socket options.
1641  * It returns the size of the option retrieved.
1642  */
1643 int
1644 icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
1645 {
1646 	conn_t	*connp = Q_TO_CONN(q);
1647 	icmp_t	*icmp = connp->conn_icmp;
1648 	icmp_stack_t *is = icmp->icmp_is;
1649 	int	*i1 = (int *)ptr;
1650 	ip6_pkt_t	*ipp = &icmp->icmp_sticky_ipp;
1651 
1652 	switch (level) {
1653 	case SOL_SOCKET:
1654 		switch (name) {
1655 		case SO_DEBUG:
1656 			*i1 = icmp->icmp_debug;
1657 			break;
1658 		case SO_TYPE:
1659 			*i1 = SOCK_RAW;
1660 			break;
1661 		case SO_PROTOTYPE:
1662 			*i1 = icmp->icmp_proto;
1663 			break;
1664 		case SO_REUSEADDR:
1665 			*i1 = icmp->icmp_reuseaddr;
1666 			break;
1667 
1668 		/*
1669 		 * The following three items are available here,
1670 		 * but are only meaningful to IP.
1671 		 */
1672 		case SO_DONTROUTE:
1673 			*i1 = icmp->icmp_dontroute;
1674 			break;
1675 		case SO_USELOOPBACK:
1676 			*i1 = icmp->icmp_useloopback;
1677 			break;
1678 		case SO_BROADCAST:
1679 			*i1 = icmp->icmp_broadcast;
1680 			break;
1681 
1682 		case SO_SNDBUF:
1683 			ASSERT(q->q_hiwat <= INT_MAX);
1684 			*i1 = (int)q->q_hiwat;
1685 			break;
1686 		case SO_RCVBUF:
1687 			ASSERT(RD(q)->q_hiwat <= INT_MAX);
1688 			*i1 = (int)RD(q)->q_hiwat;
1689 			break;
1690 		case SO_DGRAM_ERRIND:
1691 			*i1 = icmp->icmp_dgram_errind;
1692 			break;
1693 		case SO_TIMESTAMP:
1694 			*i1 = icmp->icmp_timestamp;
1695 			break;
1696 		case SO_MAC_EXEMPT:
1697 			*i1 = icmp->icmp_mac_exempt;
1698 			break;
1699 		case SO_DOMAIN:
1700 			*i1 = icmp->icmp_family;
1701 			break;
1702 
1703 		/*
1704 		 * Following four not meaningful for icmp
1705 		 * Action is same as "default" to which we fallthrough
1706 		 * so we keep them in comments.
1707 		 * case SO_LINGER:
1708 		 * case SO_KEEPALIVE:
1709 		 * case SO_OOBINLINE:
1710 		 * case SO_ALLZONES:
1711 		 */
1712 		default:
1713 			return (-1);
1714 		}
1715 		break;
1716 	case IPPROTO_IP:
1717 		/*
1718 		 * Only allow IPv4 option processing on IPv4 sockets.
1719 		 */
1720 		if (icmp->icmp_family != AF_INET)
1721 			return (-1);
1722 
1723 		switch (name) {
1724 		case IP_OPTIONS:
1725 		case T_IP_OPTIONS:
1726 			/* Options are passed up with each packet */
1727 			return (0);
1728 		case IP_HDRINCL:
1729 			*i1 = (int)icmp->icmp_hdrincl;
1730 			break;
1731 		case IP_TOS:
1732 		case T_IP_TOS:
1733 			*i1 = (int)icmp->icmp_type_of_service;
1734 			break;
1735 		case IP_TTL:
1736 			*i1 = (int)icmp->icmp_ttl;
1737 			break;
1738 		case IP_MULTICAST_IF:
1739 			/* 0 address if not set */
1740 			*(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
1741 			return (sizeof (ipaddr_t));
1742 		case IP_MULTICAST_TTL:
1743 			*(uchar_t *)ptr = icmp->icmp_multicast_ttl;
1744 			return (sizeof (uchar_t));
1745 		case IP_MULTICAST_LOOP:
1746 			*ptr = connp->conn_multicast_loop;
1747 			return (sizeof (uint8_t));
1748 		case IP_BOUND_IF:
1749 			/* Zero if not set */
1750 			*i1 = icmp->icmp_bound_if;
1751 			break;	/* goto sizeof (int) option return */
1752 		case IP_UNSPEC_SRC:
1753 			*ptr = icmp->icmp_unspec_source;
1754 			break;	/* goto sizeof (int) option return */
1755 		case IP_XMIT_IF:
1756 			*i1 = icmp->icmp_xmit_if;
1757 			break;	/* goto sizeof (int) option return */
1758 		case IP_RECVIF:
1759 			*ptr = icmp->icmp_recvif;
1760 			break;	/* goto sizeof (int) option return */
1761 		case IP_RECVPKTINFO:
1762 			/*
1763 			 * This also handles IP_PKTINFO.
1764 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
1765 			 * Differentiation is based on the size of the argument
1766 			 * passed in.
1767 			 * This option is handled in IP which will return an
1768 			 * error for IP_PKTINFO as it's not supported as a
1769 			 * sticky option.
1770 			 */
1771 			return (-EINVAL);
1772 		/*
1773 		 * Cannot "get" the value of following options
1774 		 * at this level. Action is same as "default" to
1775 		 * which we fallthrough so we keep them in comments.
1776 		 *
1777 		 * case IP_ADD_MEMBERSHIP:
1778 		 * case IP_DROP_MEMBERSHIP:
1779 		 * case IP_BLOCK_SOURCE:
1780 		 * case IP_UNBLOCK_SOURCE:
1781 		 * case IP_ADD_SOURCE_MEMBERSHIP:
1782 		 * case IP_DROP_SOURCE_MEMBERSHIP:
1783 		 * case MCAST_JOIN_GROUP:
1784 		 * case MCAST_LEAVE_GROUP:
1785 		 * case MCAST_BLOCK_SOURCE:
1786 		 * case MCAST_UNBLOCK_SOURCE:
1787 		 * case MCAST_JOIN_SOURCE_GROUP:
1788 		 * case MCAST_LEAVE_SOURCE_GROUP:
1789 		 * case MRT_INIT:
1790 		 * case MRT_DONE:
1791 		 * case MRT_ADD_VIF:
1792 		 * case MRT_DEL_VIF:
1793 		 * case MRT_ADD_MFC:
1794 		 * case MRT_DEL_MFC:
1795 		 * case MRT_VERSION:
1796 		 * case MRT_ASSERT:
1797 		 * case IP_SEC_OPT:
1798 		 * case IP_DONTFAILOVER_IF:
1799 		 * case IP_NEXTHOP:
1800 		 */
1801 		default:
1802 			return (-1);
1803 		}
1804 		break;
1805 	case IPPROTO_IPV6:
1806 		/*
1807 		 * Only allow IPv6 option processing on native IPv6 sockets.
1808 		 */
1809 		if (icmp->icmp_family != AF_INET6)
1810 			return (-1);
1811 		switch (name) {
1812 		case IPV6_UNICAST_HOPS:
1813 			*i1 = (unsigned int)icmp->icmp_ttl;
1814 			break;
1815 		case IPV6_MULTICAST_IF:
1816 			/* 0 index if not set */
1817 			*i1 = icmp->icmp_multicast_if_index;
1818 			break;
1819 		case IPV6_MULTICAST_HOPS:
1820 			*i1 = icmp->icmp_multicast_ttl;
1821 			break;
1822 		case IPV6_MULTICAST_LOOP:
1823 			*i1 = connp->conn_multicast_loop;
1824 			break;
1825 		case IPV6_BOUND_IF:
1826 			/* Zero if not set */
1827 			*i1 = icmp->icmp_bound_if;
1828 			break;
1829 		case IPV6_UNSPEC_SRC:
1830 			*i1 = icmp->icmp_unspec_source;
1831 			break;
1832 		case IPV6_CHECKSUM:
1833 			/*
1834 			 * Return offset or -1 if no checksum offset.
1835 			 * Does not apply to IPPROTO_ICMPV6
1836 			 */
1837 			if (icmp->icmp_proto == IPPROTO_ICMPV6)
1838 				return (-1);
1839 
1840 			if (icmp->icmp_raw_checksum) {
1841 				*i1 = icmp->icmp_checksum_off;
1842 			} else {
1843 				*i1 = -1;
1844 			}
1845 			break;
1846 		case IPV6_JOIN_GROUP:
1847 		case IPV6_LEAVE_GROUP:
1848 		case MCAST_JOIN_GROUP:
1849 		case MCAST_LEAVE_GROUP:
1850 		case MCAST_BLOCK_SOURCE:
1851 		case MCAST_UNBLOCK_SOURCE:
1852 		case MCAST_JOIN_SOURCE_GROUP:
1853 		case MCAST_LEAVE_SOURCE_GROUP:
1854 			/* cannot "get" the value for these */
1855 			return (-1);
1856 		case IPV6_RECVPKTINFO:
1857 			*i1 = icmp->icmp_ip_recvpktinfo;
1858 			break;
1859 		case IPV6_RECVTCLASS:
1860 			*i1 = icmp->icmp_ipv6_recvtclass;
1861 			break;
1862 		case IPV6_RECVPATHMTU:
1863 			*i1 = icmp->icmp_ipv6_recvpathmtu;
1864 			break;
1865 		case IPV6_V6ONLY:
1866 			*i1 = 1;
1867 			break;
1868 		case IPV6_RECVHOPLIMIT:
1869 			*i1 = icmp->icmp_ipv6_recvhoplimit;
1870 			break;
1871 		case IPV6_RECVHOPOPTS:
1872 			*i1 = icmp->icmp_ipv6_recvhopopts;
1873 			break;
1874 		case IPV6_RECVDSTOPTS:
1875 			*i1 = icmp->icmp_ipv6_recvdstopts;
1876 			break;
1877 		case _OLD_IPV6_RECVDSTOPTS:
1878 			*i1 = icmp->icmp_old_ipv6_recvdstopts;
1879 			break;
1880 		case IPV6_RECVRTHDRDSTOPTS:
1881 			*i1 = icmp->icmp_ipv6_recvrtdstopts;
1882 			break;
1883 		case IPV6_RECVRTHDR:
1884 			*i1 = icmp->icmp_ipv6_recvrthdr;
1885 			break;
1886 		case IPV6_PKTINFO: {
1887 			/* XXX assumes that caller has room for max size! */
1888 			struct in6_pktinfo *pkti;
1889 
1890 			pkti = (struct in6_pktinfo *)ptr;
1891 			if (ipp->ipp_fields & IPPF_IFINDEX)
1892 				pkti->ipi6_ifindex = ipp->ipp_ifindex;
1893 			else
1894 				pkti->ipi6_ifindex = 0;
1895 			if (ipp->ipp_fields & IPPF_ADDR)
1896 				pkti->ipi6_addr = ipp->ipp_addr;
1897 			else
1898 				pkti->ipi6_addr = ipv6_all_zeros;
1899 			return (sizeof (struct in6_pktinfo));
1900 		}
1901 		case IPV6_NEXTHOP: {
1902 			sin6_t *sin6 = (sin6_t *)ptr;
1903 
1904 			if (!(ipp->ipp_fields & IPPF_NEXTHOP))
1905 				return (0);
1906 			*sin6 = sin6_null;
1907 			sin6->sin6_family = AF_INET6;
1908 			sin6->sin6_addr = ipp->ipp_nexthop;
1909 			return (sizeof (sin6_t));
1910 		}
1911 		case IPV6_HOPOPTS:
1912 			if (!(ipp->ipp_fields & IPPF_HOPOPTS))
1913 				return (0);
1914 			if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6)
1915 				return (0);
1916 			bcopy((char *)ipp->ipp_hopopts +
1917 			    icmp->icmp_label_len_v6, ptr,
1918 			    ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
1919 			if (icmp->icmp_label_len_v6 > 0) {
1920 				ptr[0] = ((char *)ipp->ipp_hopopts)[0];
1921 				ptr[1] = (ipp->ipp_hopoptslen -
1922 				    icmp->icmp_label_len_v6 + 7) / 8 - 1;
1923 			}
1924 			return (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
1925 		case IPV6_RTHDRDSTOPTS:
1926 			if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
1927 				return (0);
1928 			bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
1929 			return (ipp->ipp_rtdstoptslen);
1930 		case IPV6_RTHDR:
1931 			if (!(ipp->ipp_fields & IPPF_RTHDR))
1932 				return (0);
1933 			bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
1934 			return (ipp->ipp_rthdrlen);
1935 		case IPV6_DSTOPTS:
1936 			if (!(ipp->ipp_fields & IPPF_DSTOPTS))
1937 				return (0);
1938 			bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
1939 			return (ipp->ipp_dstoptslen);
1940 		case IPV6_PATHMTU:
1941 			if (!(ipp->ipp_fields & IPPF_PATHMTU))
1942 				return (0);
1943 
1944 			return (ip_fill_mtuinfo(&icmp->icmp_v6dst, 0,
1945 			    (struct ip6_mtuinfo *)ptr,
1946 			    is->is_netstack));
1947 		case IPV6_TCLASS:
1948 			if (ipp->ipp_fields & IPPF_TCLASS)
1949 				*i1 = ipp->ipp_tclass;
1950 			else
1951 				*i1 = IPV6_FLOW_TCLASS(
1952 				    IPV6_DEFAULT_VERS_AND_FLOW);
1953 			break;
1954 		default:
1955 			return (-1);
1956 		}
1957 		break;
1958 	case IPPROTO_ICMPV6:
1959 		/*
1960 		 * Only allow IPv6 option processing on native IPv6 sockets.
1961 		 */
1962 		if (icmp->icmp_family != AF_INET6)
1963 			return (-1);
1964 
1965 		if (icmp->icmp_proto != IPPROTO_ICMPV6)
1966 			return (-1);
1967 
1968 		switch (name) {
1969 		case ICMP6_FILTER:
1970 			if (icmp->icmp_filter == NULL) {
1971 				/* Make it look like "pass all" */
1972 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1973 			} else {
1974 				(void) bcopy(icmp->icmp_filter, ptr,
1975 				    sizeof (icmp6_filter_t));
1976 			}
1977 			return (sizeof (icmp6_filter_t));
1978 		default:
1979 			return (-1);
1980 		}
1981 	default:
1982 		return (-1);
1983 	}
1984 	return (sizeof (int));
1985 }
1986 
1987 /*
1988  * This routine retrieves the current status of socket options.
1989  * It returns the size of the option retrieved.
1990  */
1991 int
1992 icmp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1993 {
1994 	icmp_t  *icmp = Q_TO_ICMP(q);
1995 	int 	err;
1996 
1997 	rw_enter(&icmp->icmp_rwlock, RW_READER);
1998 	err = icmp_opt_get_locked(q, level, name, ptr);
1999 	rw_exit(&icmp->icmp_rwlock);
2000 	return (err);
2001 }
2002 
2003 
2004 /* This routine sets socket options. */
2005 /* ARGSUSED */
2006 int
2007 icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
2008     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2009     void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
2010 {
2011 	conn_t	*connp = Q_TO_CONN(q);
2012 	icmp_t	*icmp = connp->conn_icmp;
2013 	icmp_stack_t *is = icmp->icmp_is;
2014 	int	*i1 = (int *)invalp;
2015 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
2016 	boolean_t checkonly;
2017 	int	error;
2018 
2019 	switch (optset_context) {
2020 	case SETFN_OPTCOM_CHECKONLY:
2021 		checkonly = B_TRUE;
2022 		/*
2023 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2024 		 * inlen != 0 implies value supplied and
2025 		 * 	we have to "pretend" to set it.
2026 		 * inlen == 0 implies that there is no
2027 		 * 	value part in T_CHECK request and just validation
2028 		 * done elsewhere should be enough, we just return here.
2029 		 */
2030 		if (inlen == 0) {
2031 			*outlenp = 0;
2032 			return (0);
2033 		}
2034 		break;
2035 	case SETFN_OPTCOM_NEGOTIATE:
2036 		checkonly = B_FALSE;
2037 		break;
2038 	case SETFN_UD_NEGOTIATE:
2039 	case SETFN_CONN_NEGOTIATE:
2040 		checkonly = B_FALSE;
2041 		/*
2042 		 * Negotiating local and "association-related" options
2043 		 * through T_UNITDATA_REQ.
2044 		 *
2045 		 * Following routine can filter out ones we do not
2046 		 * want to be "set" this way.
2047 		 */
2048 		if (!icmp_opt_allow_udr_set(level, name)) {
2049 			*outlenp = 0;
2050 			return (EINVAL);
2051 		}
2052 		break;
2053 	default:
2054 		/*
2055 		 * We should never get here
2056 		 */
2057 		*outlenp = 0;
2058 		return (EINVAL);
2059 	}
2060 
2061 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2062 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2063 
2064 	/*
2065 	 * For fixed length options, no sanity check
2066 	 * of passed in length is done. It is assumed *_optcom_req()
2067 	 * routines do the right thing.
2068 	 */
2069 
2070 	switch (level) {
2071 	case SOL_SOCKET:
2072 		switch (name) {
2073 		case SO_DEBUG:
2074 			if (!checkonly)
2075 				icmp->icmp_debug = onoff;
2076 			break;
2077 		case SO_PROTOTYPE:
2078 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2079 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2080 			    secpolicy_net_rawaccess(cr) != 0) {
2081 				*outlenp = 0;
2082 				return (EACCES);
2083 			}
2084 			/* Can't use IPPROTO_RAW with IPv6 */
2085 			if ((*i1 & 0xFF) == IPPROTO_RAW &&
2086 			    icmp->icmp_family == AF_INET6) {
2087 				*outlenp = 0;
2088 				return (EPROTONOSUPPORT);
2089 			}
2090 			if (checkonly) {
2091 				/* T_CHECK case */
2092 				*(int *)outvalp = (*i1 & 0xFF);
2093 				break;
2094 			}
2095 			icmp->icmp_proto = *i1 & 0xFF;
2096 			if ((icmp->icmp_proto == IPPROTO_RAW ||
2097 			    icmp->icmp_proto == IPPROTO_IGMP) &&
2098 			    icmp->icmp_family == AF_INET)
2099 				icmp->icmp_hdrincl = 1;
2100 			else
2101 				icmp->icmp_hdrincl = 0;
2102 
2103 			if (icmp->icmp_family == AF_INET6 &&
2104 			    icmp->icmp_proto == IPPROTO_ICMPV6) {
2105 				/* Set offset for icmp6_cksum */
2106 				icmp->icmp_raw_checksum = 0;
2107 				icmp->icmp_checksum_off = 2;
2108 			}
2109 			if (icmp->icmp_proto == IPPROTO_UDP ||
2110 			    icmp->icmp_proto == IPPROTO_TCP ||
2111 			    icmp->icmp_proto == IPPROTO_SCTP) {
2112 				icmp->icmp_no_tp_cksum = 1;
2113 				icmp->icmp_sticky_ipp.ipp_fields |=
2114 				    IPPF_NO_CKSUM;
2115 			} else {
2116 				icmp->icmp_no_tp_cksum = 0;
2117 				icmp->icmp_sticky_ipp.ipp_fields &=
2118 				    ~IPPF_NO_CKSUM;
2119 			}
2120 
2121 			if (icmp->icmp_filter != NULL &&
2122 			    icmp->icmp_proto != IPPROTO_ICMPV6) {
2123 				kmem_free(icmp->icmp_filter,
2124 				    sizeof (icmp6_filter_t));
2125 				icmp->icmp_filter = NULL;
2126 			}
2127 
2128 			/* Rebuild the header template */
2129 			error = icmp_build_hdrs(icmp);
2130 			if (error != 0) {
2131 				*outlenp = 0;
2132 				return (error);
2133 			}
2134 
2135 			/*
2136 			 * For SCTP, we don't use icmp_bind_proto() for
2137 			 * raw socket binding.  Note that we do not need
2138 			 * to set *outlenp.
2139 			 * FIXME: how does SCTP work?
2140 			 */
2141 			if (icmp->icmp_proto == IPPROTO_SCTP)
2142 				return (0);
2143 
2144 			*outlenp = sizeof (int);
2145 			*(int *)outvalp = *i1 & 0xFF;
2146 
2147 			/* Drop lock across the bind operation */
2148 			rw_exit(&icmp->icmp_rwlock);
2149 			icmp_bind_proto(q);
2150 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2151 			return (0);
2152 		case SO_REUSEADDR:
2153 			if (!checkonly)
2154 				icmp->icmp_reuseaddr = onoff;
2155 			break;
2156 
2157 		/*
2158 		 * The following three items are available here,
2159 		 * but are only meaningful to IP.
2160 		 */
2161 		case SO_DONTROUTE:
2162 			if (!checkonly)
2163 				icmp->icmp_dontroute = onoff;
2164 			break;
2165 		case SO_USELOOPBACK:
2166 			if (!checkonly)
2167 				icmp->icmp_useloopback = onoff;
2168 			break;
2169 		case SO_BROADCAST:
2170 			if (!checkonly)
2171 				icmp->icmp_broadcast = onoff;
2172 			break;
2173 
2174 		case SO_SNDBUF:
2175 			if (*i1 > is->is_max_buf) {
2176 				*outlenp = 0;
2177 				return (ENOBUFS);
2178 			}
2179 			if (!checkonly) {
2180 				q->q_hiwat = *i1;
2181 			}
2182 			break;
2183 		case SO_RCVBUF:
2184 			if (*i1 > is->is_max_buf) {
2185 				*outlenp = 0;
2186 				return (ENOBUFS);
2187 			}
2188 			if (!checkonly) {
2189 				RD(q)->q_hiwat = *i1;
2190 				rw_exit(&icmp->icmp_rwlock);
2191 				(void) mi_set_sth_hiwat(RD(q), *i1);
2192 				rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2193 			}
2194 			break;
2195 		case SO_DGRAM_ERRIND:
2196 			if (!checkonly)
2197 				icmp->icmp_dgram_errind = onoff;
2198 			break;
2199 		case SO_ALLZONES:
2200 			/*
2201 			 * "soft" error (negative)
2202 			 * option not handled at this level
2203 			 * Note: Do not modify *outlenp
2204 			 */
2205 			return (-EINVAL);
2206 		case SO_TIMESTAMP:
2207 			if (!checkonly) {
2208 				icmp->icmp_timestamp = onoff;
2209 			}
2210 			break;
2211 		case SO_MAC_EXEMPT:
2212 			if (secpolicy_net_mac_aware(cr) != 0 ||
2213 			    icmp->icmp_state != TS_UNBND)
2214 				return (EACCES);
2215 			if (!checkonly)
2216 				icmp->icmp_mac_exempt = onoff;
2217 			break;
2218 		/*
2219 		 * Following three not meaningful for icmp
2220 		 * Action is same as "default" so we keep them
2221 		 * in comments.
2222 		 * case SO_LINGER:
2223 		 * case SO_KEEPALIVE:
2224 		 * case SO_OOBINLINE:
2225 		 */
2226 		default:
2227 			*outlenp = 0;
2228 			return (EINVAL);
2229 		}
2230 		break;
2231 	case IPPROTO_IP:
2232 		/*
2233 		 * Only allow IPv4 option processing on IPv4 sockets.
2234 		 */
2235 		if (icmp->icmp_family != AF_INET) {
2236 			*outlenp = 0;
2237 			return (ENOPROTOOPT);
2238 		}
2239 		switch (name) {
2240 		case IP_OPTIONS:
2241 		case T_IP_OPTIONS:
2242 			/* Save options for use by IP. */
2243 			if ((inlen & 0x3) ||
2244 			    inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) {
2245 				*outlenp = 0;
2246 				return (EINVAL);
2247 			}
2248 			if (checkonly)
2249 				break;
2250 
2251 			if (!tsol_option_set(&icmp->icmp_ip_snd_options,
2252 			    &icmp->icmp_ip_snd_options_len,
2253 			    icmp->icmp_label_len, invalp, inlen)) {
2254 				*outlenp = 0;
2255 				return (ENOMEM);
2256 			}
2257 
2258 			icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
2259 			    icmp->icmp_ip_snd_options_len;
2260 			rw_exit(&icmp->icmp_rwlock);
2261 			(void) mi_set_sth_wroff(RD(q), icmp->icmp_max_hdr_len +
2262 			    is->is_wroff_extra);
2263 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2264 			break;
2265 		case IP_HDRINCL:
2266 			if (!checkonly)
2267 				icmp->icmp_hdrincl = onoff;
2268 			break;
2269 		case IP_TOS:
2270 		case T_IP_TOS:
2271 			if (!checkonly) {
2272 				icmp->icmp_type_of_service = (uint8_t)*i1;
2273 			}
2274 			break;
2275 		case IP_TTL:
2276 			if (!checkonly) {
2277 				icmp->icmp_ttl = (uint8_t)*i1;
2278 			}
2279 			break;
2280 		case IP_MULTICAST_IF:
2281 			/*
2282 			 * TODO should check OPTMGMT reply and undo this if
2283 			 * there is an error.
2284 			 */
2285 			if (!checkonly)
2286 				icmp->icmp_multicast_if_addr = *i1;
2287 			break;
2288 		case IP_MULTICAST_TTL:
2289 			if (!checkonly)
2290 				icmp->icmp_multicast_ttl = *invalp;
2291 			break;
2292 		case IP_MULTICAST_LOOP:
2293 			if (!checkonly) {
2294 				connp->conn_multicast_loop =
2295 				    (*invalp == 0) ? 0 : 1;
2296 			}
2297 			break;
2298 		case IP_BOUND_IF:
2299 			if (!checkonly)
2300 				icmp->icmp_bound_if = *i1;
2301 			break;
2302 		case IP_UNSPEC_SRC:
2303 			if (!checkonly)
2304 				icmp->icmp_unspec_source = onoff;
2305 			break;
2306 		case IP_XMIT_IF:
2307 			if (!checkonly)
2308 				icmp->icmp_xmit_if = *i1;
2309 			break;
2310 		case IP_RECVIF:
2311 			if (!checkonly)
2312 				icmp->icmp_recvif = onoff;
2313 			/*
2314 			 * pass to ip
2315 			 */
2316 			return (-EINVAL);
2317 		case IP_PKTINFO: {
2318 			/*
2319 			 * This also handles IP_RECVPKTINFO.
2320 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
2321 			 * Differentiation is based on the size of the argument
2322 			 * passed in.
2323 			 */
2324 			struct in_pktinfo *pktinfop;
2325 			ip4_pkt_t *attr_pktinfop;
2326 
2327 			if (checkonly)
2328 				break;
2329 
2330 			if (inlen == sizeof (int)) {
2331 				/*
2332 				 * This is IP_RECVPKTINFO option.
2333 				 * Keep a local copy of wether this option is
2334 				 * set or not and pass it down to IP for
2335 				 * processing.
2336 				 */
2337 				icmp->icmp_ip_recvpktinfo = onoff;
2338 				return (-EINVAL);
2339 			}
2340 
2341 
2342 			if (inlen != sizeof (struct in_pktinfo))
2343 				return (EINVAL);
2344 
2345 			if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
2346 			    == NULL) {
2347 				/*
2348 				 * sticky option is not supported
2349 				 */
2350 				return (EINVAL);
2351 			}
2352 
2353 			pktinfop = (struct in_pktinfo *)invalp;
2354 
2355 			/*
2356 			 * Atleast one of the values should be specified
2357 			 */
2358 			if (pktinfop->ipi_ifindex == 0 &&
2359 			    pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
2360 				return (EINVAL);
2361 			}
2362 
2363 			attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
2364 			attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
2365 		}
2366 			break;
2367 		case IP_ADD_MEMBERSHIP:
2368 		case IP_DROP_MEMBERSHIP:
2369 		case IP_BLOCK_SOURCE:
2370 		case IP_UNBLOCK_SOURCE:
2371 		case IP_ADD_SOURCE_MEMBERSHIP:
2372 		case IP_DROP_SOURCE_MEMBERSHIP:
2373 		case MCAST_JOIN_GROUP:
2374 		case MCAST_LEAVE_GROUP:
2375 		case MCAST_BLOCK_SOURCE:
2376 		case MCAST_UNBLOCK_SOURCE:
2377 		case MCAST_JOIN_SOURCE_GROUP:
2378 		case MCAST_LEAVE_SOURCE_GROUP:
2379 		case MRT_INIT:
2380 		case MRT_DONE:
2381 		case MRT_ADD_VIF:
2382 		case MRT_DEL_VIF:
2383 		case MRT_ADD_MFC:
2384 		case MRT_DEL_MFC:
2385 		case MRT_VERSION:
2386 		case MRT_ASSERT:
2387 		case IP_SEC_OPT:
2388 		case IP_DONTFAILOVER_IF:
2389 		case IP_NEXTHOP:
2390 			/*
2391 			 * "soft" error (negative)
2392 			 * option not handled at this level
2393 			 * Note: Do not modify *outlenp
2394 			 */
2395 			return (-EINVAL);
2396 		default:
2397 			*outlenp = 0;
2398 			return (EINVAL);
2399 		}
2400 		break;
2401 	case IPPROTO_IPV6: {
2402 		ip6_pkt_t		*ipp;
2403 		boolean_t		sticky;
2404 
2405 		if (icmp->icmp_family != AF_INET6) {
2406 			*outlenp = 0;
2407 			return (ENOPROTOOPT);
2408 		}
2409 		/*
2410 		 * Deal with both sticky options and ancillary data
2411 		 */
2412 		if (thisdg_attrs == NULL) {
2413 			/* sticky options, or none */
2414 			ipp = &icmp->icmp_sticky_ipp;
2415 			sticky = B_TRUE;
2416 		} else {
2417 			/* ancillary data */
2418 			ipp = (ip6_pkt_t *)thisdg_attrs;
2419 			sticky = B_FALSE;
2420 		}
2421 
2422 		switch (name) {
2423 		case IPV6_MULTICAST_IF:
2424 			if (!checkonly)
2425 				icmp->icmp_multicast_if_index = *i1;
2426 			break;
2427 		case IPV6_UNICAST_HOPS:
2428 			/* -1 means use default */
2429 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2430 				*outlenp = 0;
2431 				return (EINVAL);
2432 			}
2433 			if (!checkonly) {
2434 				if (*i1 == -1) {
2435 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2436 					    is->is_ipv6_hoplimit;
2437 					ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
2438 					/* Pass modified value to IP. */
2439 					*i1 = ipp->ipp_hoplimit;
2440 				} else {
2441 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2442 					    (uint8_t)*i1;
2443 					ipp->ipp_fields |= IPPF_UNICAST_HOPS;
2444 				}
2445 				/* Rebuild the header template */
2446 				error = icmp_build_hdrs(icmp);
2447 				if (error != 0) {
2448 					*outlenp = 0;
2449 					return (error);
2450 				}
2451 			}
2452 			break;
2453 		case IPV6_MULTICAST_HOPS:
2454 			/* -1 means use default */
2455 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2456 				*outlenp = 0;
2457 				return (EINVAL);
2458 			}
2459 			if (!checkonly) {
2460 				if (*i1 == -1) {
2461 					icmp->icmp_multicast_ttl =
2462 					    ipp->ipp_multicast_hops =
2463 					    IP_DEFAULT_MULTICAST_TTL;
2464 					ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
2465 					/* Pass modified value to IP. */
2466 					*i1 = icmp->icmp_multicast_ttl;
2467 				} else {
2468 					icmp->icmp_multicast_ttl =
2469 					    ipp->ipp_multicast_hops =
2470 					    (uint8_t)*i1;
2471 					ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
2472 				}
2473 			}
2474 			break;
2475 		case IPV6_MULTICAST_LOOP:
2476 			if (*i1 != 0 && *i1 != 1) {
2477 				*outlenp = 0;
2478 				return (EINVAL);
2479 			}
2480 			if (!checkonly)
2481 				connp->conn_multicast_loop = *i1;
2482 			break;
2483 		case IPV6_CHECKSUM:
2484 			/*
2485 			 * Integer offset into the user data of where the
2486 			 * checksum is located.
2487 			 * Offset of -1 disables option.
2488 			 * Does not apply to IPPROTO_ICMPV6.
2489 			 */
2490 			if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) {
2491 				*outlenp = 0;
2492 				return (EINVAL);
2493 			}
2494 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2495 				/* Negative or not 16 bit aligned offset */
2496 				*outlenp = 0;
2497 				return (EINVAL);
2498 			}
2499 			if (checkonly)
2500 				break;
2501 
2502 			if (*i1 == -1) {
2503 				icmp->icmp_raw_checksum = 0;
2504 				ipp->ipp_fields &= ~IPPF_RAW_CKSUM;
2505 			} else {
2506 				icmp->icmp_raw_checksum = 1;
2507 				icmp->icmp_checksum_off = *i1;
2508 				ipp->ipp_fields |= IPPF_RAW_CKSUM;
2509 			}
2510 			/* Rebuild the header template */
2511 			error = icmp_build_hdrs(icmp);
2512 			if (error != 0) {
2513 				*outlenp = 0;
2514 				return (error);
2515 			}
2516 			break;
2517 		case IPV6_JOIN_GROUP:
2518 		case IPV6_LEAVE_GROUP:
2519 		case MCAST_JOIN_GROUP:
2520 		case MCAST_LEAVE_GROUP:
2521 		case MCAST_BLOCK_SOURCE:
2522 		case MCAST_UNBLOCK_SOURCE:
2523 		case MCAST_JOIN_SOURCE_GROUP:
2524 		case MCAST_LEAVE_SOURCE_GROUP:
2525 			/*
2526 			 * "soft" error (negative)
2527 			 * option not handled at this level
2528 			 * Note: Do not modify *outlenp
2529 			 */
2530 			return (-EINVAL);
2531 		case IPV6_BOUND_IF:
2532 			if (!checkonly)
2533 				icmp->icmp_bound_if = *i1;
2534 			break;
2535 		case IPV6_UNSPEC_SRC:
2536 			if (!checkonly)
2537 				icmp->icmp_unspec_source = onoff;
2538 			break;
2539 		case IPV6_RECVTCLASS:
2540 			if (!checkonly)
2541 				icmp->icmp_ipv6_recvtclass = onoff;
2542 			break;
2543 		/*
2544 		 * Set boolean switches for ancillary data delivery
2545 		 */
2546 		case IPV6_RECVPKTINFO:
2547 			if (!checkonly)
2548 				icmp->icmp_ip_recvpktinfo = onoff;
2549 			break;
2550 		case IPV6_RECVPATHMTU:
2551 			if (!checkonly)
2552 				icmp->icmp_ipv6_recvpathmtu = onoff;
2553 			break;
2554 		case IPV6_RECVHOPLIMIT:
2555 			if (!checkonly)
2556 				icmp->icmp_ipv6_recvhoplimit = onoff;
2557 			break;
2558 		case IPV6_RECVHOPOPTS:
2559 			if (!checkonly)
2560 				icmp->icmp_ipv6_recvhopopts = onoff;
2561 			break;
2562 		case IPV6_RECVDSTOPTS:
2563 			if (!checkonly)
2564 				icmp->icmp_ipv6_recvdstopts = onoff;
2565 			break;
2566 		case _OLD_IPV6_RECVDSTOPTS:
2567 			if (!checkonly)
2568 				icmp->icmp_old_ipv6_recvdstopts = onoff;
2569 			break;
2570 		case IPV6_RECVRTHDRDSTOPTS:
2571 			if (!checkonly)
2572 				icmp->icmp_ipv6_recvrtdstopts = onoff;
2573 			break;
2574 		case IPV6_RECVRTHDR:
2575 			if (!checkonly)
2576 				icmp->icmp_ipv6_recvrthdr = onoff;
2577 			break;
2578 		/*
2579 		 * Set sticky options or ancillary data.
2580 		 * If sticky options, (re)build any extension headers
2581 		 * that might be needed as a result.
2582 		 */
2583 		case IPV6_PKTINFO:
2584 			/*
2585 			 * The source address and ifindex are verified
2586 			 * in ip_opt_set(). For ancillary data the
2587 			 * source address is checked in ip_wput_v6.
2588 			 */
2589 			if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
2590 				return (EINVAL);
2591 			if (checkonly)
2592 				break;
2593 
2594 			if (inlen == 0) {
2595 				ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
2596 				ipp->ipp_sticky_ignored |=
2597 				    (IPPF_IFINDEX|IPPF_ADDR);
2598 			} else {
2599 				struct in6_pktinfo *pkti;
2600 
2601 				pkti = (struct in6_pktinfo *)invalp;
2602 				ipp->ipp_ifindex = pkti->ipi6_ifindex;
2603 				ipp->ipp_addr = pkti->ipi6_addr;
2604 				if (ipp->ipp_ifindex != 0)
2605 					ipp->ipp_fields |= IPPF_IFINDEX;
2606 				else
2607 					ipp->ipp_fields &= ~IPPF_IFINDEX;
2608 				if (!IN6_IS_ADDR_UNSPECIFIED(
2609 				    &ipp->ipp_addr))
2610 					ipp->ipp_fields |= IPPF_ADDR;
2611 				else
2612 					ipp->ipp_fields &= ~IPPF_ADDR;
2613 			}
2614 			if (sticky) {
2615 				error = icmp_build_hdrs(icmp);
2616 				if (error != 0)
2617 					return (error);
2618 			}
2619 			break;
2620 		case IPV6_HOPLIMIT:
2621 			/* This option can only be used as ancillary data. */
2622 			if (sticky)
2623 				return (EINVAL);
2624 			if (inlen != 0 && inlen != sizeof (int))
2625 				return (EINVAL);
2626 			if (checkonly)
2627 				break;
2628 
2629 			if (inlen == 0) {
2630 				ipp->ipp_fields &= ~IPPF_HOPLIMIT;
2631 				ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
2632 			} else {
2633 				if (*i1 > 255 || *i1 < -1)
2634 					return (EINVAL);
2635 				if (*i1 == -1)
2636 					ipp->ipp_hoplimit =
2637 					    is->is_ipv6_hoplimit;
2638 				else
2639 					ipp->ipp_hoplimit = *i1;
2640 				ipp->ipp_fields |= IPPF_HOPLIMIT;
2641 			}
2642 			break;
2643 		case IPV6_TCLASS:
2644 			/*
2645 			 * IPV6_RECVTCLASS accepts -1 as use kernel default
2646 			 * and [0, 255] as the actualy traffic class.
2647 			 */
2648 			if (inlen != 0 && inlen != sizeof (int))
2649 				return (EINVAL);
2650 			if (checkonly)
2651 				break;
2652 
2653 			if (inlen == 0) {
2654 				ipp->ipp_fields &= ~IPPF_TCLASS;
2655 				ipp->ipp_sticky_ignored |= IPPF_TCLASS;
2656 			} else {
2657 				if (*i1 >= 256 || *i1 < -1)
2658 					return (EINVAL);
2659 				if (*i1 == -1) {
2660 					ipp->ipp_tclass =
2661 					    IPV6_FLOW_TCLASS(
2662 					    IPV6_DEFAULT_VERS_AND_FLOW);
2663 				} else {
2664 					ipp->ipp_tclass = *i1;
2665 				}
2666 				ipp->ipp_fields |= IPPF_TCLASS;
2667 			}
2668 			if (sticky) {
2669 				error = icmp_build_hdrs(icmp);
2670 				if (error != 0)
2671 					return (error);
2672 			}
2673 			break;
2674 		case IPV6_NEXTHOP:
2675 			/*
2676 			 * IP will verify that the nexthop is reachable
2677 			 * and fail for sticky options.
2678 			 */
2679 			if (inlen != 0 && inlen != sizeof (sin6_t))
2680 				return (EINVAL);
2681 			if (checkonly)
2682 				break;
2683 
2684 			if (inlen == 0) {
2685 				ipp->ipp_fields &= ~IPPF_NEXTHOP;
2686 				ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
2687 			} else {
2688 				sin6_t *sin6 = (sin6_t *)invalp;
2689 
2690 				if (sin6->sin6_family != AF_INET6)
2691 					return (EAFNOSUPPORT);
2692 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
2693 					return (EADDRNOTAVAIL);
2694 				ipp->ipp_nexthop = sin6->sin6_addr;
2695 				if (!IN6_IS_ADDR_UNSPECIFIED(
2696 				    &ipp->ipp_nexthop))
2697 					ipp->ipp_fields |= IPPF_NEXTHOP;
2698 				else
2699 					ipp->ipp_fields &= ~IPPF_NEXTHOP;
2700 			}
2701 			if (sticky) {
2702 				error = icmp_build_hdrs(icmp);
2703 				if (error != 0)
2704 					return (error);
2705 			}
2706 			break;
2707 		case IPV6_HOPOPTS: {
2708 			ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
2709 			/*
2710 			 * Sanity checks - minimum size, size a multiple of
2711 			 * eight bytes, and matching size passed in.
2712 			 */
2713 			if (inlen != 0 &&
2714 			    inlen != (8 * (hopts->ip6h_len + 1)))
2715 				return (EINVAL);
2716 
2717 			if (checkonly)
2718 				break;
2719 			error = optcom_pkt_set(invalp, inlen, sticky,
2720 			    (uchar_t **)&ipp->ipp_hopopts,
2721 			    &ipp->ipp_hopoptslen,
2722 			    sticky ? icmp->icmp_label_len_v6 : 0);
2723 			if (error != 0)
2724 				return (error);
2725 			if (ipp->ipp_hopoptslen == 0) {
2726 				ipp->ipp_fields &= ~IPPF_HOPOPTS;
2727 				ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
2728 			} else {
2729 				ipp->ipp_fields |= IPPF_HOPOPTS;
2730 			}
2731 			if (sticky) {
2732 				error = icmp_build_hdrs(icmp);
2733 				if (error != 0)
2734 					return (error);
2735 			}
2736 			break;
2737 		}
2738 		case IPV6_RTHDRDSTOPTS: {
2739 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2740 
2741 			/*
2742 			 * Sanity checks - minimum size, size a multiple of
2743 			 * eight bytes, and matching size passed in.
2744 			 */
2745 			if (inlen != 0 &&
2746 			    inlen != (8 * (dopts->ip6d_len + 1)))
2747 				return (EINVAL);
2748 
2749 			if (checkonly)
2750 				break;
2751 
2752 			if (inlen == 0) {
2753 				if (sticky &&
2754 				    (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
2755 					kmem_free(ipp->ipp_rtdstopts,
2756 					    ipp->ipp_rtdstoptslen);
2757 					ipp->ipp_rtdstopts = NULL;
2758 					ipp->ipp_rtdstoptslen = 0;
2759 				}
2760 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
2761 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
2762 			} else {
2763 				error = optcom_pkt_set(invalp, inlen, sticky,
2764 				    (uchar_t **)&ipp->ipp_rtdstopts,
2765 				    &ipp->ipp_rtdstoptslen, 0);
2766 				if (error != 0)
2767 					return (error);
2768 				ipp->ipp_fields |= IPPF_RTDSTOPTS;
2769 			}
2770 			if (sticky) {
2771 				error = icmp_build_hdrs(icmp);
2772 				if (error != 0)
2773 					return (error);
2774 			}
2775 			break;
2776 		}
2777 		case IPV6_DSTOPTS: {
2778 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2779 
2780 			/*
2781 			 * Sanity checks - minimum size, size a multiple of
2782 			 * eight bytes, and matching size passed in.
2783 			 */
2784 			if (inlen != 0 &&
2785 			    inlen != (8 * (dopts->ip6d_len + 1)))
2786 				return (EINVAL);
2787 
2788 			if (checkonly)
2789 				break;
2790 
2791 			if (inlen == 0) {
2792 				if (sticky &&
2793 				    (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
2794 					kmem_free(ipp->ipp_dstopts,
2795 					    ipp->ipp_dstoptslen);
2796 					ipp->ipp_dstopts = NULL;
2797 					ipp->ipp_dstoptslen = 0;
2798 				}
2799 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2800 				ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
2801 			} else {
2802 				error = optcom_pkt_set(invalp, inlen, sticky,
2803 				    (uchar_t **)&ipp->ipp_dstopts,
2804 				    &ipp->ipp_dstoptslen, 0);
2805 				if (error != 0)
2806 					return (error);
2807 				ipp->ipp_fields |= IPPF_DSTOPTS;
2808 			}
2809 			if (sticky) {
2810 				error = icmp_build_hdrs(icmp);
2811 				if (error != 0)
2812 					return (error);
2813 			}
2814 			break;
2815 		}
2816 		case IPV6_RTHDR: {
2817 			ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
2818 
2819 			/*
2820 			 * Sanity checks - minimum size, size a multiple of
2821 			 * eight bytes, and matching size passed in.
2822 			 */
2823 			if (inlen != 0 &&
2824 			    inlen != (8 * (rt->ip6r_len + 1)))
2825 				return (EINVAL);
2826 
2827 			if (checkonly)
2828 				break;
2829 
2830 			if (inlen == 0) {
2831 				if (sticky &&
2832 				    (ipp->ipp_fields & IPPF_RTHDR) != 0) {
2833 					kmem_free(ipp->ipp_rthdr,
2834 					    ipp->ipp_rthdrlen);
2835 					ipp->ipp_rthdr = NULL;
2836 					ipp->ipp_rthdrlen = 0;
2837 				}
2838 				ipp->ipp_fields &= ~IPPF_RTHDR;
2839 				ipp->ipp_sticky_ignored |= IPPF_RTHDR;
2840 			} else {
2841 				error = optcom_pkt_set(invalp, inlen, sticky,
2842 				    (uchar_t **)&ipp->ipp_rthdr,
2843 				    &ipp->ipp_rthdrlen, 0);
2844 				if (error != 0)
2845 					return (error);
2846 				ipp->ipp_fields |= IPPF_RTHDR;
2847 			}
2848 			if (sticky) {
2849 				error = icmp_build_hdrs(icmp);
2850 				if (error != 0)
2851 					return (error);
2852 			}
2853 			break;
2854 		}
2855 
2856 		case IPV6_DONTFRAG:
2857 			if (checkonly)
2858 				break;
2859 
2860 			if (onoff) {
2861 				ipp->ipp_fields |= IPPF_DONTFRAG;
2862 			} else {
2863 				ipp->ipp_fields &= ~IPPF_DONTFRAG;
2864 			}
2865 			break;
2866 
2867 		case IPV6_USE_MIN_MTU:
2868 			if (inlen != sizeof (int))
2869 				return (EINVAL);
2870 
2871 			if (*i1 < -1 || *i1 > 1)
2872 				return (EINVAL);
2873 
2874 			if (checkonly)
2875 				break;
2876 
2877 			ipp->ipp_fields |= IPPF_USE_MIN_MTU;
2878 			ipp->ipp_use_min_mtu = *i1;
2879 			break;
2880 
2881 		/*
2882 		 * This option can't be set.  Its only returned via
2883 		 * getsockopt() or ancillary data.
2884 		 */
2885 		case IPV6_PATHMTU:
2886 			return (EINVAL);
2887 
2888 		case IPV6_BOUND_PIF:
2889 		case IPV6_SEC_OPT:
2890 		case IPV6_DONTFAILOVER_IF:
2891 		case IPV6_SRC_PREFERENCES:
2892 		case IPV6_V6ONLY:
2893 			/* Handled at IP level */
2894 			return (-EINVAL);
2895 		default:
2896 			*outlenp = 0;
2897 			return (EINVAL);
2898 		}
2899 		break;
2900 	}		/* end IPPROTO_IPV6 */
2901 
2902 	case IPPROTO_ICMPV6:
2903 		/*
2904 		 * Only allow IPv6 option processing on IPv6 sockets.
2905 		 */
2906 		if (icmp->icmp_family != AF_INET6) {
2907 			*outlenp = 0;
2908 			return (ENOPROTOOPT);
2909 		}
2910 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
2911 			*outlenp = 0;
2912 			return (ENOPROTOOPT);
2913 		}
2914 		switch (name) {
2915 		case ICMP6_FILTER:
2916 			if (!checkonly) {
2917 				if ((inlen != 0) &&
2918 				    (inlen != sizeof (icmp6_filter_t)))
2919 					return (EINVAL);
2920 
2921 				if (inlen == 0) {
2922 					if (icmp->icmp_filter != NULL) {
2923 						kmem_free(icmp->icmp_filter,
2924 						    sizeof (icmp6_filter_t));
2925 						icmp->icmp_filter = NULL;
2926 					}
2927 				} else {
2928 					if (icmp->icmp_filter == NULL) {
2929 						icmp->icmp_filter = kmem_alloc(
2930 						    sizeof (icmp6_filter_t),
2931 						    KM_NOSLEEP);
2932 						if (icmp->icmp_filter == NULL) {
2933 							*outlenp = 0;
2934 							return (ENOBUFS);
2935 						}
2936 					}
2937 					(void) bcopy(invalp, icmp->icmp_filter,
2938 					    inlen);
2939 				}
2940 			}
2941 			break;
2942 
2943 		default:
2944 			*outlenp = 0;
2945 			return (EINVAL);
2946 		}
2947 		break;
2948 	default:
2949 		*outlenp = 0;
2950 		return (EINVAL);
2951 	}
2952 	/*
2953 	 * Common case of OK return with outval same as inval.
2954 	 */
2955 	if (invalp != outvalp) {
2956 		/* don't trust bcopy for identical src/dst */
2957 		(void) bcopy(invalp, outvalp, inlen);
2958 	}
2959 	*outlenp = inlen;
2960 	return (0);
2961 }
2962 /* This routine sets socket options. */
2963 /* ARGSUSED */
2964 int
2965 icmp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2966     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2967     void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
2968 {
2969 	icmp_t	*icmp;
2970 	int	err;
2971 
2972 	icmp = Q_TO_ICMP(q);
2973 
2974 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2975 	err = icmp_opt_set_locked(q, optset_context, level, name, inlen, invalp,
2976 	    outlenp, outvalp, thisdg_attrs, cr, mblk);
2977 	rw_exit(&icmp->icmp_rwlock);
2978 	return (err);
2979 }
2980 
2981 /*
2982  * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl,
2983  * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum.
2984  * The headers include ip6i_t (if needed), ip6_t, and any sticky extension
2985  * headers.
2986  * Returns failure if can't allocate memory.
2987  */
2988 static int
2989 icmp_build_hdrs(icmp_t *icmp)
2990 {
2991 	icmp_stack_t *is = icmp->icmp_is;
2992 	uchar_t	*hdrs;
2993 	uint_t	hdrs_len;
2994 	ip6_t	*ip6h;
2995 	ip6i_t	*ip6i;
2996 	ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
2997 
2998 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
2999 	hdrs_len = ip_total_hdrs_len_v6(ipp);
3000 	ASSERT(hdrs_len != 0);
3001 	if (hdrs_len != icmp->icmp_sticky_hdrs_len) {
3002 		/* Need to reallocate */
3003 		if (hdrs_len != 0) {
3004 			hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
3005 			if (hdrs == NULL)
3006 				return (ENOMEM);
3007 		} else {
3008 			hdrs = NULL;
3009 		}
3010 		if (icmp->icmp_sticky_hdrs_len != 0) {
3011 			kmem_free(icmp->icmp_sticky_hdrs,
3012 			    icmp->icmp_sticky_hdrs_len);
3013 		}
3014 		icmp->icmp_sticky_hdrs = hdrs;
3015 		icmp->icmp_sticky_hdrs_len = hdrs_len;
3016 	}
3017 	ip_build_hdrs_v6(icmp->icmp_sticky_hdrs,
3018 	    icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto);
3019 
3020 	/* Set header fields not in ipp */
3021 	if (ipp->ipp_fields & IPPF_HAS_IP6I) {
3022 		ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs;
3023 		ip6h = (ip6_t *)&ip6i[1];
3024 
3025 		if (ipp->ipp_fields & IPPF_RAW_CKSUM) {
3026 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
3027 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
3028 		}
3029 		if (ipp->ipp_fields & IPPF_NO_CKSUM) {
3030 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
3031 		}
3032 	} else {
3033 		ip6h = (ip6_t *)icmp->icmp_sticky_hdrs;
3034 	}
3035 
3036 	if (!(ipp->ipp_fields & IPPF_ADDR))
3037 		ip6h->ip6_src = icmp->icmp_v6src;
3038 
3039 	/* Try to get everything in a single mblk */
3040 	if (hdrs_len > icmp->icmp_max_hdr_len) {
3041 		icmp->icmp_max_hdr_len = hdrs_len;
3042 		rw_exit(&icmp->icmp_rwlock);
3043 		(void) mi_set_sth_wroff(icmp->icmp_connp->conn_rq,
3044 		    icmp->icmp_max_hdr_len + is->is_wroff_extra);
3045 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3046 	}
3047 	return (0);
3048 }
3049 
3050 /*
3051  * This routine retrieves the value of an ND variable in a icmpparam_t
3052  * structure.  It is called through nd_getset when a user reads the
3053  * variable.
3054  */
3055 /* ARGSUSED */
3056 static int
3057 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
3058 {
3059 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3060 
3061 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
3062 	return (0);
3063 }
3064 
3065 /*
3066  * Walk through the param array specified registering each element with the
3067  * named dispatch (ND) handler.
3068  */
3069 static boolean_t
3070 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
3071 {
3072 	for (; cnt-- > 0; icmppa++) {
3073 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
3074 			if (!nd_load(ndp, icmppa->icmp_param_name,
3075 			    icmp_param_get, icmp_param_set,
3076 			    (caddr_t)icmppa)) {
3077 				nd_free(ndp);
3078 				return (B_FALSE);
3079 			}
3080 		}
3081 	}
3082 	if (!nd_load(ndp, "icmp_status", icmp_status_report, NULL,
3083 	    NULL)) {
3084 		nd_free(ndp);
3085 		return (B_FALSE);
3086 	}
3087 	return (B_TRUE);
3088 }
3089 
3090 /* This routine sets an ND variable in a icmpparam_t structure. */
3091 /* ARGSUSED */
3092 static int
3093 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
3094 {
3095 	long		new_value;
3096 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3097 
3098 	/*
3099 	 * Fail the request if the new value does not lie within the
3100 	 * required bounds.
3101 	 */
3102 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
3103 	    new_value < icmppa->icmp_param_min ||
3104 	    new_value > icmppa->icmp_param_max) {
3105 		return (EINVAL);
3106 	}
3107 	/* Set the new value */
3108 	icmppa->icmp_param_value = new_value;
3109 	return (0);
3110 }
3111 /*ARGSUSED2*/
3112 static void
3113 icmp_input(void *arg1, mblk_t *mp, void *arg2)
3114 {
3115 	conn_t *connp = (conn_t *)arg1;
3116 	struct T_unitdata_ind	*tudi;
3117 	uchar_t			*rptr;
3118 	icmp_t			*icmp;
3119 	icmp_stack_t		*is;
3120 	sin_t			*sin;
3121 	sin6_t			*sin6;
3122 	ip6_t			*ip6h;
3123 	ip6i_t			*ip6i;
3124 	mblk_t			*mp1;
3125 	int			hdr_len;
3126 	ipha_t			*ipha;
3127 	int			udi_size;	/* Size of T_unitdata_ind */
3128 	uint_t			ipvers;
3129 	ip6_pkt_t		ipp;
3130 	uint8_t			nexthdr;
3131 	boolean_t		recvif = B_FALSE;
3132 	ip_pktinfo_t		*pinfo = NULL;
3133 	mblk_t			*options_mp = NULL;
3134 	uint_t			icmp_opt = 0;
3135 	boolean_t		icmp_ipv6_recvhoplimit = B_FALSE;
3136 	uint_t			hopstrip;
3137 
3138 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
3139 
3140 	icmp = connp->conn_icmp;
3141 	is = icmp->icmp_is;
3142 	rptr = mp->b_rptr;
3143 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
3144 	ASSERT(OK_32PTR(rptr));
3145 
3146 	/*
3147 	 * IP should have prepended the options data in an M_CTL
3148 	 * Check M_CTL "type" to make sure are not here bcos of
3149 	 * a valid ICMP message
3150 	 */
3151 	if (DB_TYPE(mp) == M_CTL) {
3152 		/*
3153 		 * FIXME: does IP still do this?
3154 		 * IP sends up the IPSEC_IN message for handling IPSEC
3155 		 * policy at the TCP level. We don't need it here.
3156 		 */
3157 		if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) {
3158 			mp1 = mp->b_cont;
3159 			freeb(mp);
3160 			mp = mp1;
3161 			rptr = mp->b_rptr;
3162 		} else if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
3163 		    ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
3164 		    IN_PKTINFO) {
3165 			/*
3166 			 * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
3167 			 * has been prepended to the packet by IP. We need to
3168 			 * extract the mblk and adjust the rptr
3169 			 */
3170 			pinfo = (ip_pktinfo_t *)mp->b_rptr;
3171 			options_mp = mp;
3172 			mp = mp->b_cont;
3173 			rptr = mp->b_rptr;
3174 		} else {
3175 			/*
3176 			 * ICMP messages.
3177 			 */
3178 			icmp_icmp_error(connp->conn_rq, mp);
3179 			return;
3180 		}
3181 	}
3182 
3183 	/*
3184 	 * Discard message if it is misaligned or smaller than the IP header.
3185 	 */
3186 	if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) {
3187 		freemsg(mp);
3188 		if (options_mp != NULL)
3189 			freeb(options_mp);
3190 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3191 		return;
3192 	}
3193 	ipvers = IPH_HDR_VERSION((ipha_t *)rptr);
3194 
3195 	/* Handle M_DATA messages containing IP packets messages */
3196 	if (ipvers == IPV4_VERSION) {
3197 		/*
3198 		 * Special case where IP attaches
3199 		 * the IRE needs to be handled so that we don't send up
3200 		 * IRE to the user land.
3201 		 */
3202 		ipha = (ipha_t *)rptr;
3203 		hdr_len = IPH_HDR_LENGTH(ipha);
3204 
3205 		if (ipha->ipha_protocol == IPPROTO_TCP) {
3206 			tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3207 
3208 			if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) ==
3209 			    TH_SYN) && mp->b_cont != NULL) {
3210 				mp1 = mp->b_cont;
3211 				if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3212 					freeb(mp1);
3213 					mp->b_cont = NULL;
3214 				}
3215 			}
3216 		}
3217 		if (is->is_bsd_compat) {
3218 			ushort_t len;
3219 			len = ntohs(ipha->ipha_length);
3220 
3221 			if (mp->b_datap->db_ref > 1) {
3222 				/*
3223 				 * Allocate a new IP header so that we can
3224 				 * modify ipha_length.
3225 				 */
3226 				mblk_t	*mp1;
3227 
3228 				mp1 = allocb(hdr_len, BPRI_MED);
3229 				if (!mp1) {
3230 					freemsg(mp);
3231 					if (options_mp != NULL)
3232 						freeb(options_mp);
3233 					BUMP_MIB(&is->is_rawip_mib,
3234 					    rawipInErrors);
3235 					return;
3236 				}
3237 				bcopy(rptr, mp1->b_rptr, hdr_len);
3238 				mp->b_rptr = rptr + hdr_len;
3239 				rptr = mp1->b_rptr;
3240 				ipha = (ipha_t *)rptr;
3241 				mp1->b_cont = mp;
3242 				mp1->b_wptr = rptr + hdr_len;
3243 				mp = mp1;
3244 			}
3245 			len -= hdr_len;
3246 			ipha->ipha_length = htons(len);
3247 		}
3248 	}
3249 
3250 	/*
3251 	 * This is the inbound data path.  Packets are passed upstream as
3252 	 * T_UNITDATA_IND messages with full IP headers still attached.
3253 	 */
3254 	if (icmp->icmp_family == AF_INET) {
3255 		ASSERT(ipvers == IPV4_VERSION);
3256 		udi_size =  sizeof (struct T_unitdata_ind) + sizeof (sin_t);
3257 		if (icmp->icmp_recvif && recvif &&
3258 		    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3259 			udi_size += sizeof (struct T_opthdr) +
3260 			    sizeof (uint_t);
3261 		}
3262 
3263 		if (icmp->icmp_ip_recvpktinfo && recvif &&
3264 		    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3265 			udi_size += sizeof (struct T_opthdr) +
3266 			    sizeof (struct in_pktinfo);
3267 		}
3268 
3269 		/*
3270 		 * If SO_TIMESTAMP is set allocate the appropriate sized
3271 		 * buffer. Since gethrestime() expects a pointer aligned
3272 		 * argument, we allocate space necessary for extra
3273 		 * alignment (even though it might not be used).
3274 		 */
3275 		if (icmp->icmp_timestamp) {
3276 			udi_size += sizeof (struct T_opthdr) +
3277 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3278 		}
3279 		mp1 = allocb(udi_size, BPRI_MED);
3280 		if (mp1 == NULL) {
3281 			freemsg(mp);
3282 			if (options_mp != NULL)
3283 				freeb(options_mp);
3284 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3285 			return;
3286 		}
3287 		mp1->b_cont = mp;
3288 		mp = mp1;
3289 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
3290 		mp->b_datap->db_type = M_PROTO;
3291 		mp->b_wptr = (uchar_t *)tudi + udi_size;
3292 		tudi->PRIM_type = T_UNITDATA_IND;
3293 		tudi->SRC_length = sizeof (sin_t);
3294 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3295 		sin = (sin_t *)&tudi[1];
3296 		*sin = sin_null;
3297 		sin->sin_family = AF_INET;
3298 		sin->sin_addr.s_addr = ipha->ipha_src;
3299 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
3300 		    sizeof (sin_t);
3301 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
3302 		tudi->OPT_length = udi_size;
3303 
3304 		/*
3305 		 * Add options if IP_RECVIF is set
3306 		 */
3307 		if (udi_size != 0) {
3308 			char *dstopt;
3309 
3310 			dstopt = (char *)&sin[1];
3311 			if (icmp->icmp_recvif && recvif &&
3312 			    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3313 
3314 				struct T_opthdr *toh;
3315 				uint_t		*dstptr;
3316 
3317 				toh = (struct T_opthdr *)dstopt;
3318 				toh->level = IPPROTO_IP;
3319 				toh->name = IP_RECVIF;
3320 				toh->len = sizeof (struct T_opthdr) +
3321 				    sizeof (uint_t);
3322 				toh->status = 0;
3323 				dstopt += sizeof (struct T_opthdr);
3324 				dstptr = (uint_t *)dstopt;
3325 				*dstptr = pinfo->ip_pkt_ifindex;
3326 				dstopt += sizeof (uint_t);
3327 				freeb(options_mp);
3328 				udi_size -= toh->len;
3329 			}
3330 			if (icmp->icmp_timestamp) {
3331 				struct	T_opthdr *toh;
3332 
3333 				toh = (struct T_opthdr *)dstopt;
3334 				toh->level = SOL_SOCKET;
3335 				toh->name = SCM_TIMESTAMP;
3336 				toh->len = sizeof (struct T_opthdr) +
3337 				    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3338 				toh->status = 0;
3339 				dstopt += sizeof (struct T_opthdr);
3340 				/* Align for gethrestime() */
3341 				dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
3342 				    sizeof (intptr_t));
3343 				gethrestime((timestruc_t *)dstopt);
3344 				dstopt = (char *)toh + toh->len;
3345 				udi_size -= toh->len;
3346 			}
3347 			if (icmp->icmp_ip_recvpktinfo && recvif &&
3348 			    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3349 				struct	T_opthdr *toh;
3350 				struct	in_pktinfo *pktinfop;
3351 
3352 				toh = (struct T_opthdr *)dstopt;
3353 				toh->level = IPPROTO_IP;
3354 				toh->name = IP_PKTINFO;
3355 				toh->len = sizeof (struct T_opthdr) +
3356 				    sizeof (in_pktinfo_t);
3357 				toh->status = 0;
3358 				dstopt += sizeof (struct T_opthdr);
3359 				pktinfop = (struct in_pktinfo *)dstopt;
3360 				pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
3361 				pktinfop->ipi_spec_dst =
3362 				    pinfo->ip_pkt_match_addr;
3363 
3364 				pktinfop->ipi_addr.s_addr = ipha->ipha_dst;
3365 
3366 				dstopt += sizeof (struct in_pktinfo);
3367 				udi_size -= toh->len;
3368 			}
3369 
3370 			/* Consumed all of allocated space */
3371 			ASSERT(udi_size == 0);
3372 		}
3373 
3374 		BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
3375 		putnext(connp->conn_rq, mp);
3376 		return;
3377 	}
3378 
3379 	/*
3380 	 * We don't need options_mp in the IPv6 path.
3381 	 */
3382 	if (options_mp != NULL) {
3383 		freeb(options_mp);
3384 		options_mp = NULL;
3385 	}
3386 
3387 	/*
3388 	 * Discard message if it is smaller than the IPv6 header
3389 	 * or if the header is malformed.
3390 	 */
3391 	if ((mp->b_wptr - rptr) < sizeof (ip6_t) ||
3392 	    IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION ||
3393 	    icmp->icmp_family != AF_INET6) {
3394 		freemsg(mp);
3395 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3396 		return;
3397 	}
3398 
3399 	/* Initialize */
3400 	ipp.ipp_fields = 0;
3401 	hopstrip = 0;
3402 
3403 	ip6h = (ip6_t *)rptr;
3404 	/*
3405 	 * Call on ip_find_hdr_v6 which gets the total hdr len
3406 	 * as well as individual lenghts of ext hdrs (and ptrs to
3407 	 * them).
3408 	 */
3409 	if (ip6h->ip6_nxt != icmp->icmp_proto) {
3410 		/* Look for ifindex information */
3411 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3412 			ip6i = (ip6i_t *)ip6h;
3413 			if (ip6i->ip6i_flags & IP6I_IFINDEX) {
3414 				ASSERT(ip6i->ip6i_ifindex != 0);
3415 				ipp.ipp_fields |= IPPF_IFINDEX;
3416 				ipp.ipp_ifindex = ip6i->ip6i_ifindex;
3417 			}
3418 			rptr = (uchar_t *)&ip6i[1];
3419 			mp->b_rptr = rptr;
3420 			if (rptr == mp->b_wptr) {
3421 				mp1 = mp->b_cont;
3422 				freeb(mp);
3423 				mp = mp1;
3424 				rptr = mp->b_rptr;
3425 			}
3426 			ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN);
3427 			ip6h = (ip6_t *)rptr;
3428 		}
3429 		hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr);
3430 
3431 		/*
3432 		 * We need to lie a bit to the user because users inside
3433 		 * labeled compartments should not see their own labels.  We
3434 		 * assume that in all other respects IP has checked the label,
3435 		 * and that the label is always first among the options.  (If
3436 		 * it's not first, then this code won't see it, and the option
3437 		 * will be passed along to the user.)
3438 		 *
3439 		 * If we had multilevel ICMP sockets, then the following code
3440 		 * should be skipped for them to allow the user to see the
3441 		 * label.
3442 		 *
3443 		 * Alignment restrictions in the definition of IP options
3444 		 * (namely, the requirement that the 4-octet DOI goes on a
3445 		 * 4-octet boundary) mean that we know exactly where the option
3446 		 * should start, but we're lenient for other hosts.
3447 		 *
3448 		 * Note that there are no multilevel ICMP or raw IP sockets
3449 		 * yet, thus nobody ever sees the IP6OPT_LS option.
3450 		 */
3451 		if ((ipp.ipp_fields & IPPF_HOPOPTS) &&
3452 		    ipp.ipp_hopoptslen > 5 && is_system_labeled()) {
3453 			const uchar_t *ucp =
3454 			    (const uchar_t *)ipp.ipp_hopopts + 2;
3455 			int remlen = ipp.ipp_hopoptslen - 2;
3456 
3457 			while (remlen > 0) {
3458 				if (*ucp == IP6OPT_PAD1) {
3459 					remlen--;
3460 					ucp++;
3461 				} else if (*ucp == IP6OPT_PADN) {
3462 					remlen -= ucp[1] + 2;
3463 					ucp += ucp[1] + 2;
3464 				} else if (*ucp == ip6opt_ls) {
3465 					hopstrip = (ucp -
3466 					    (const uchar_t *)ipp.ipp_hopopts) +
3467 					    ucp[1] + 2;
3468 					hopstrip = (hopstrip + 7) & ~7;
3469 					break;
3470 				} else {
3471 					/* label option must be first */
3472 					break;
3473 				}
3474 			}
3475 		}
3476 	} else {
3477 		hdr_len = IPV6_HDR_LEN;
3478 		ip6i = NULL;
3479 		nexthdr = ip6h->ip6_nxt;
3480 	}
3481 	/*
3482 	 * One special case where IP attaches the IRE needs to
3483 	 * be handled so that we don't send up IRE to the user land.
3484 	 */
3485 	if (nexthdr == IPPROTO_TCP) {
3486 		tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3487 
3488 		if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) &&
3489 		    mp->b_cont != NULL) {
3490 			mp1 = mp->b_cont;
3491 			if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3492 				freeb(mp1);
3493 				mp->b_cont = NULL;
3494 			}
3495 		}
3496 	}
3497 	/*
3498 	 * Check a filter for ICMPv6 types if needed.
3499 	 * Verify raw checksums if needed.
3500 	 */
3501 	if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) {
3502 		if (icmp->icmp_filter != NULL) {
3503 			int type;
3504 
3505 			/* Assumes that IP has done the pullupmsg */
3506 			type = mp->b_rptr[hdr_len];
3507 
3508 			ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr);
3509 			if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
3510 				freemsg(mp);
3511 				return;
3512 			}
3513 		} else {
3514 			/* Checksum */
3515 			uint16_t	*up;
3516 			uint32_t	sum;
3517 			int		remlen;
3518 
3519 			up = (uint16_t *)&ip6h->ip6_src;
3520 
3521 			remlen = msgdsize(mp) - hdr_len;
3522 			sum = htons(icmp->icmp_proto + remlen)
3523 			    + up[0] + up[1] + up[2] + up[3]
3524 			    + up[4] + up[5] + up[6] + up[7]
3525 			    + up[8] + up[9] + up[10] + up[11]
3526 			    + up[12] + up[13] + up[14] + up[15];
3527 			sum = (sum & 0xffff) + (sum >> 16);
3528 			sum = IP_CSUM(mp, hdr_len, sum);
3529 			if (sum != 0) {
3530 				/* IPv6 RAW checksum failed */
3531 				ip0dbg(("icmp_rput: RAW checksum "
3532 				    "failed %x\n", sum));
3533 				freemsg(mp);
3534 				BUMP_MIB(&is->is_rawip_mib,
3535 				    rawipInCksumErrs);
3536 				return;
3537 			}
3538 		}
3539 	}
3540 	/* Skip all the IPv6 headers per API */
3541 	mp->b_rptr += hdr_len;
3542 
3543 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3544 
3545 	/*
3546 	 * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to
3547 	 * maintain state information, instead of relying on icmp_t
3548 	 * structure, since there arent any locks protecting these members
3549 	 * and there is a window where there might be a race between a
3550 	 * thread setting options on the write side and a thread reading
3551 	 * these options on the read size.
3552 	 */
3553 	if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
3554 	    IPPF_RTHDR|IPPF_IFINDEX)) {
3555 		if (icmp->icmp_ipv6_recvhopopts &&
3556 		    (ipp.ipp_fields & IPPF_HOPOPTS) &&
3557 		    ipp.ipp_hopoptslen > hopstrip) {
3558 			udi_size += sizeof (struct T_opthdr) +
3559 			    ipp.ipp_hopoptslen - hopstrip;
3560 			icmp_opt |= IPPF_HOPOPTS;
3561 		}
3562 		if ((icmp->icmp_ipv6_recvdstopts ||
3563 		    icmp->icmp_old_ipv6_recvdstopts) &&
3564 		    (ipp.ipp_fields & IPPF_DSTOPTS)) {
3565 			udi_size += sizeof (struct T_opthdr) +
3566 			    ipp.ipp_dstoptslen;
3567 			icmp_opt |= IPPF_DSTOPTS;
3568 		}
3569 		if (((icmp->icmp_ipv6_recvdstopts &&
3570 		    icmp->icmp_ipv6_recvrthdr &&
3571 		    (ipp.ipp_fields & IPPF_RTHDR)) ||
3572 		    icmp->icmp_ipv6_recvrtdstopts) &&
3573 		    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
3574 			udi_size += sizeof (struct T_opthdr) +
3575 			    ipp.ipp_rtdstoptslen;
3576 			icmp_opt |= IPPF_RTDSTOPTS;
3577 		}
3578 		if (icmp->icmp_ipv6_recvrthdr &&
3579 		    (ipp.ipp_fields & IPPF_RTHDR)) {
3580 			udi_size += sizeof (struct T_opthdr) +
3581 			    ipp.ipp_rthdrlen;
3582 			icmp_opt |= IPPF_RTHDR;
3583 		}
3584 		if (icmp->icmp_ip_recvpktinfo &&
3585 		    (ipp.ipp_fields & IPPF_IFINDEX)) {
3586 			udi_size += sizeof (struct T_opthdr) +
3587 			    sizeof (struct in6_pktinfo);
3588 			icmp_opt |= IPPF_IFINDEX;
3589 		}
3590 	}
3591 	if (icmp->icmp_ipv6_recvhoplimit) {
3592 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3593 		icmp_ipv6_recvhoplimit = B_TRUE;
3594 	}
3595 
3596 	if (icmp->icmp_ipv6_recvtclass)
3597 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3598 
3599 	mp1 = allocb(udi_size, BPRI_MED);
3600 	if (mp1 == NULL) {
3601 		freemsg(mp);
3602 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3603 		return;
3604 	}
3605 	mp1->b_cont = mp;
3606 	mp = mp1;
3607 	mp->b_datap->db_type = M_PROTO;
3608 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
3609 	mp->b_wptr = (uchar_t *)tudi + udi_size;
3610 	tudi->PRIM_type = T_UNITDATA_IND;
3611 	tudi->SRC_length = sizeof (sin6_t);
3612 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3613 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3614 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
3615 	tudi->OPT_length = udi_size;
3616 	sin6 = (sin6_t *)&tudi[1];
3617 	sin6->sin6_port = 0;
3618 	sin6->sin6_family = AF_INET6;
3619 
3620 	sin6->sin6_addr = ip6h->ip6_src;
3621 	/* No sin6_flowinfo per API */
3622 	sin6->sin6_flowinfo = 0;
3623 	/* For link-scope source pass up scope id */
3624 	if ((ipp.ipp_fields & IPPF_IFINDEX) &&
3625 	    IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
3626 		sin6->sin6_scope_id = ipp.ipp_ifindex;
3627 	else
3628 		sin6->sin6_scope_id = 0;
3629 
3630 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
3631 	    icmp->icmp_zoneid, is->is_netstack);
3632 
3633 	if (udi_size != 0) {
3634 		uchar_t *dstopt;
3635 
3636 		dstopt = (uchar_t *)&sin6[1];
3637 		if (icmp_opt & IPPF_IFINDEX) {
3638 			struct T_opthdr *toh;
3639 			struct in6_pktinfo *pkti;
3640 
3641 			toh = (struct T_opthdr *)dstopt;
3642 			toh->level = IPPROTO_IPV6;
3643 			toh->name = IPV6_PKTINFO;
3644 			toh->len = sizeof (struct T_opthdr) +
3645 			    sizeof (*pkti);
3646 			toh->status = 0;
3647 			dstopt += sizeof (struct T_opthdr);
3648 			pkti = (struct in6_pktinfo *)dstopt;
3649 			pkti->ipi6_addr = ip6h->ip6_dst;
3650 			pkti->ipi6_ifindex = ipp.ipp_ifindex;
3651 			dstopt += sizeof (*pkti);
3652 			udi_size -= toh->len;
3653 		}
3654 		if (icmp_ipv6_recvhoplimit) {
3655 			struct T_opthdr *toh;
3656 
3657 			toh = (struct T_opthdr *)dstopt;
3658 			toh->level = IPPROTO_IPV6;
3659 			toh->name = IPV6_HOPLIMIT;
3660 			toh->len = sizeof (struct T_opthdr) +
3661 			    sizeof (uint_t);
3662 			toh->status = 0;
3663 			dstopt += sizeof (struct T_opthdr);
3664 			*(uint_t *)dstopt = ip6h->ip6_hops;
3665 			dstopt += sizeof (uint_t);
3666 			udi_size -= toh->len;
3667 		}
3668 		if (icmp->icmp_ipv6_recvtclass) {
3669 			struct T_opthdr *toh;
3670 
3671 			toh = (struct T_opthdr *)dstopt;
3672 			toh->level = IPPROTO_IPV6;
3673 			toh->name = IPV6_TCLASS;
3674 			toh->len = sizeof (struct T_opthdr) +
3675 			    sizeof (uint_t);
3676 			toh->status = 0;
3677 			dstopt += sizeof (struct T_opthdr);
3678 			*(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
3679 			dstopt += sizeof (uint_t);
3680 			udi_size -= toh->len;
3681 		}
3682 		if (icmp_opt & IPPF_HOPOPTS) {
3683 			struct T_opthdr *toh;
3684 
3685 			toh = (struct T_opthdr *)dstopt;
3686 			toh->level = IPPROTO_IPV6;
3687 			toh->name = IPV6_HOPOPTS;
3688 			toh->len = sizeof (struct T_opthdr) +
3689 			    ipp.ipp_hopoptslen - hopstrip;
3690 			toh->status = 0;
3691 			dstopt += sizeof (struct T_opthdr);
3692 			bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt,
3693 			    ipp.ipp_hopoptslen - hopstrip);
3694 			if (hopstrip > 0) {
3695 				/* copy next header value and fake length */
3696 				dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0];
3697 				dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] -
3698 				    hopstrip / 8;
3699 			}
3700 			dstopt += ipp.ipp_hopoptslen - hopstrip;
3701 			udi_size -= toh->len;
3702 		}
3703 		if (icmp_opt & IPPF_RTDSTOPTS) {
3704 			struct T_opthdr *toh;
3705 
3706 			toh = (struct T_opthdr *)dstopt;
3707 			toh->level = IPPROTO_IPV6;
3708 			toh->name = IPV6_DSTOPTS;
3709 			toh->len = sizeof (struct T_opthdr) +
3710 			    ipp.ipp_rtdstoptslen;
3711 			toh->status = 0;
3712 			dstopt += sizeof (struct T_opthdr);
3713 			bcopy(ipp.ipp_rtdstopts, dstopt,
3714 			    ipp.ipp_rtdstoptslen);
3715 			dstopt += ipp.ipp_rtdstoptslen;
3716 			udi_size -= toh->len;
3717 		}
3718 		if (icmp_opt & IPPF_RTHDR) {
3719 			struct T_opthdr *toh;
3720 
3721 			toh = (struct T_opthdr *)dstopt;
3722 			toh->level = IPPROTO_IPV6;
3723 			toh->name = IPV6_RTHDR;
3724 			toh->len = sizeof (struct T_opthdr) +
3725 			    ipp.ipp_rthdrlen;
3726 			toh->status = 0;
3727 			dstopt += sizeof (struct T_opthdr);
3728 			bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
3729 			dstopt += ipp.ipp_rthdrlen;
3730 			udi_size -= toh->len;
3731 		}
3732 		if (icmp_opt & IPPF_DSTOPTS) {
3733 			struct T_opthdr *toh;
3734 
3735 			toh = (struct T_opthdr *)dstopt;
3736 			toh->level = IPPROTO_IPV6;
3737 			toh->name = IPV6_DSTOPTS;
3738 			toh->len = sizeof (struct T_opthdr) +
3739 			    ipp.ipp_dstoptslen;
3740 			toh->status = 0;
3741 			dstopt += sizeof (struct T_opthdr);
3742 			bcopy(ipp.ipp_dstopts, dstopt,
3743 			    ipp.ipp_dstoptslen);
3744 			dstopt += ipp.ipp_dstoptslen;
3745 			udi_size -= toh->len;
3746 		}
3747 		/* Consumed all of allocated space */
3748 		ASSERT(udi_size == 0);
3749 	}
3750 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
3751 	putnext(connp->conn_rq, mp);
3752 }
3753 
3754 /*
3755  * Handle the results of a T_BIND_REQ whether deferred by IP or handled
3756  * immediately.
3757  */
3758 static void
3759 icmp_bind_result(conn_t *connp, mblk_t *mp)
3760 {
3761 	struct T_error_ack	*tea;
3762 
3763 	switch (mp->b_datap->db_type) {
3764 	case M_PROTO:
3765 	case M_PCPROTO:
3766 		/* M_PROTO messages contain some type of TPI message. */
3767 		if ((mp->b_wptr - mp->b_rptr) < sizeof (t_scalar_t)) {
3768 			freemsg(mp);
3769 			return;
3770 		}
3771 		tea = (struct T_error_ack *)mp->b_rptr;
3772 
3773 		switch (tea->PRIM_type) {
3774 		case T_ERROR_ACK:
3775 			switch (tea->ERROR_prim) {
3776 			case O_T_BIND_REQ:
3777 			case T_BIND_REQ:
3778 				icmp_bind_error(connp, mp);
3779 				return;
3780 			default:
3781 				break;
3782 			}
3783 			ASSERT(0);
3784 			freemsg(mp);
3785 			return;
3786 
3787 		case T_BIND_ACK:
3788 			icmp_bind_ack(connp, mp);
3789 			return;
3790 
3791 		default:
3792 			break;
3793 		}
3794 		freemsg(mp);
3795 		return;
3796 	default:
3797 		/* FIXME: other cases? */
3798 		ASSERT(0);
3799 		freemsg(mp);
3800 		return;
3801 	}
3802 }
3803 
3804 /*
3805  * Process a T_BIND_ACK
3806  */
3807 static void
3808 icmp_bind_ack(conn_t *connp, mblk_t *mp)
3809 {
3810 	icmp_t	*icmp = connp->conn_icmp;
3811 	mblk_t	*mp1;
3812 	ire_t	*ire;
3813 	struct T_bind_ack *tba;
3814 	uchar_t *addrp;
3815 	ipa_conn_t	*ac;
3816 	ipa6_conn_t	*ac6;
3817 
3818 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3819 	/*
3820 	 * We know if headers are included or not so we can
3821 	 * safely do this.
3822 	 */
3823 	if (icmp->icmp_state == TS_UNBND) {
3824 		/*
3825 		 * TPI has not yet bound - bind sent by
3826 		 * icmp_bind_proto.
3827 		 */
3828 		freemsg(mp);
3829 		rw_exit(&icmp->icmp_rwlock);
3830 		return;
3831 	}
3832 	ASSERT(icmp->icmp_pending_op != -1);
3833 
3834 	/*
3835 	 * If a broadcast/multicast address was bound set
3836 	 * the source address to 0.
3837 	 * This ensures no datagrams with broadcast address
3838 	 * as source address are emitted (which would violate
3839 	 * RFC1122 - Hosts requirements)
3840 	 *
3841 	 * Note that when connecting the returned IRE is
3842 	 * for the destination address and we only perform
3843 	 * the broadcast check for the source address (it
3844 	 * is OK to connect to a broadcast/multicast address.)
3845 	 */
3846 	mp1 = mp->b_cont;
3847 	if (mp1 != NULL && mp1->b_datap->db_type == IRE_DB_TYPE) {
3848 		ire = (ire_t *)mp1->b_rptr;
3849 
3850 		/*
3851 		 * Note: we get IRE_BROADCAST for IPv6 to "mark" a multicast
3852 		 * local address.
3853 		 */
3854 		if (ire->ire_type == IRE_BROADCAST &&
3855 		    icmp->icmp_state != TS_DATA_XFER) {
3856 			ASSERT(icmp->icmp_pending_op == T_BIND_REQ ||
3857 			    icmp->icmp_pending_op == O_T_BIND_REQ);
3858 			/* This was just a local bind to a MC/broadcast addr */
3859 			V6_SET_ZERO(icmp->icmp_v6src);
3860 			if (icmp->icmp_family == AF_INET6)
3861 				(void) icmp_build_hdrs(icmp);
3862 		} else if (V6_OR_V4_INADDR_ANY(icmp->icmp_v6src)) {
3863 			/*
3864 			 * Local address not yet set - pick it from the
3865 			 * T_bind_ack
3866 			 */
3867 			tba = (struct T_bind_ack *)mp->b_rptr;
3868 			addrp = &mp->b_rptr[tba->ADDR_offset];
3869 			switch (icmp->icmp_family) {
3870 			case AF_INET:
3871 				if (tba->ADDR_length == sizeof (ipa_conn_t)) {
3872 					ac = (ipa_conn_t *)addrp;
3873 				} else {
3874 					ASSERT(tba->ADDR_length ==
3875 					    sizeof (ipa_conn_x_t));
3876 					ac = &((ipa_conn_x_t *)addrp)->acx_conn;
3877 				}
3878 				IN6_IPADDR_TO_V4MAPPED(ac->ac_laddr,
3879 				    &icmp->icmp_v6src);
3880 				break;
3881 			case AF_INET6:
3882 				if (tba->ADDR_length == sizeof (ipa6_conn_t)) {
3883 					ac6 = (ipa6_conn_t *)addrp;
3884 				} else {
3885 					ASSERT(tba->ADDR_length ==
3886 					    sizeof (ipa6_conn_x_t));
3887 					ac6 = &((ipa6_conn_x_t *)
3888 					    addrp)->ac6x_conn;
3889 				}
3890 				icmp->icmp_v6src = ac6->ac6_laddr;
3891 				(void) icmp_build_hdrs(icmp);
3892 			}
3893 		}
3894 		mp1 = mp1->b_cont;
3895 	}
3896 	icmp->icmp_pending_op = -1;
3897 	rw_exit(&icmp->icmp_rwlock);
3898 	/*
3899 	 * Look for one or more appended ACK message added by
3900 	 * icmp_connect or icmp_disconnect.
3901 	 * If none found just send up the T_BIND_ACK.
3902 	 * icmp_connect has appended a T_OK_ACK and a
3903 	 * T_CONN_CON.
3904 	 * icmp_disconnect has appended a T_OK_ACK.
3905 	 */
3906 	if (mp1 != NULL) {
3907 		if (mp->b_cont == mp1)
3908 			mp->b_cont = NULL;
3909 		else {
3910 			ASSERT(mp->b_cont->b_cont == mp1);
3911 			mp->b_cont->b_cont = NULL;
3912 		}
3913 		freemsg(mp);
3914 		mp = mp1;
3915 		while (mp != NULL) {
3916 			mp1 = mp->b_cont;
3917 			mp->b_cont = NULL;
3918 			putnext(connp->conn_rq, mp);
3919 			mp = mp1;
3920 		}
3921 		return;
3922 	}
3923 	freemsg(mp->b_cont);
3924 	mp->b_cont = NULL;
3925 	putnext(connp->conn_rq, mp);
3926 }
3927 
3928 static void
3929 icmp_bind_error(conn_t *connp, mblk_t *mp)
3930 {
3931 	icmp_t	*icmp = connp->conn_icmp;
3932 	struct T_error_ack *tea;
3933 
3934 	tea = (struct T_error_ack *)mp->b_rptr;
3935 	/*
3936 	 * If our O_T_BIND_REQ/T_BIND_REQ fails,
3937 	 * clear out the source address before
3938 	 * passing the message upstream.
3939 	 * If this was caused by a T_CONN_REQ
3940 	 * revert back to bound state.
3941 	 */
3942 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3943 	if (icmp->icmp_state == TS_UNBND) {
3944 		/*
3945 		 * TPI has not yet bound - bind sent by icmp_bind_proto.
3946 		 */
3947 		freemsg(mp);
3948 		rw_exit(&icmp->icmp_rwlock);
3949 		return;
3950 	}
3951 	ASSERT(icmp->icmp_pending_op != -1);
3952 	tea->ERROR_prim = icmp->icmp_pending_op;
3953 	icmp->icmp_pending_op = -1;
3954 
3955 	switch (tea->ERROR_prim) {
3956 	case T_CONN_REQ:
3957 		ASSERT(icmp->icmp_state == TS_DATA_XFER);
3958 		/* Connect failed */
3959 		/* Revert back to the bound source */
3960 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
3961 		icmp->icmp_state = TS_IDLE;
3962 		if (icmp->icmp_family == AF_INET6)
3963 			(void) icmp_build_hdrs(icmp);
3964 		break;
3965 
3966 	case T_DISCON_REQ:
3967 	case T_BIND_REQ:
3968 	case O_T_BIND_REQ:
3969 		V6_SET_ZERO(icmp->icmp_v6src);
3970 		V6_SET_ZERO(icmp->icmp_bound_v6src);
3971 		icmp->icmp_state = TS_UNBND;
3972 		if (icmp->icmp_family == AF_INET6)
3973 			(void) icmp_build_hdrs(icmp);
3974 		break;
3975 	default:
3976 		break;
3977 	}
3978 	rw_exit(&icmp->icmp_rwlock);
3979 	putnext(connp->conn_rq, mp);
3980 }
3981 
3982 /*
3983  * return SNMP stuff in buffer in mpdata
3984  */
3985 mblk_t *
3986 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
3987 {
3988 	mblk_t			*mpdata;
3989 	struct opthdr		*optp;
3990 	conn_t			*connp = Q_TO_CONN(q);
3991 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
3992 	mblk_t			*mp2ctl;
3993 
3994 	/*
3995 	 * make a copy of the original message
3996 	 */
3997 	mp2ctl = copymsg(mpctl);
3998 
3999 	if (mpctl == NULL ||
4000 	    (mpdata = mpctl->b_cont) == NULL) {
4001 		freemsg(mpctl);
4002 		freemsg(mp2ctl);
4003 		return (0);
4004 	}
4005 
4006 	/* fixed length structure for IPv4 and IPv6 counters */
4007 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
4008 	optp->level = EXPER_RAWIP;
4009 	optp->name = 0;
4010 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
4011 	    sizeof (is->is_rawip_mib));
4012 	optp->len = msgdsize(mpdata);
4013 	qreply(q, mpctl);
4014 
4015 	return (mp2ctl);
4016 }
4017 
4018 /*
4019  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
4020  * TODO:  If this ever actually tries to set anything, it needs to be
4021  * to do the appropriate locking.
4022  */
4023 /* ARGSUSED */
4024 int
4025 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
4026     uchar_t *ptr, int len)
4027 {
4028 	switch (level) {
4029 	case EXPER_RAWIP:
4030 		return (0);
4031 	default:
4032 		return (1);
4033 	}
4034 }
4035 
4036 /* Report for ndd "icmp_status" */
4037 /* ARGSUSED */
4038 static int
4039 icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
4040 {
4041 	conn_t  *connp;
4042 	ip_stack_t *ipst;
4043 	char	laddrbuf[INET6_ADDRSTRLEN];
4044 	char	faddrbuf[INET6_ADDRSTRLEN];
4045 	int	i;
4046 
4047 	(void) mi_mpprintf(mp,
4048 	    "RAWIP    " MI_COL_HDRPAD_STR
4049 	/*   01234567[89ABCDEF] */
4050 	    "  src addr        dest addr       state");
4051 	/*   xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx UNBOUND */
4052 
4053 	connp = Q_TO_CONN(q);
4054 	ipst = connp->conn_netstack->netstack_ip;
4055 	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
4056 		connf_t *connfp;
4057 		char	*state;
4058 
4059 		connfp = &ipst->ips_ipcl_globalhash_fanout[i];
4060 		connp = NULL;
4061 
4062 		while ((connp = ipcl_get_next_conn(connfp, connp,
4063 		    IPCL_RAWIPCONN)) != NULL) {
4064 			icmp_t  *icmp;
4065 
4066 			mutex_enter(&(connp)->conn_lock);
4067 			icmp = connp->conn_icmp;
4068 
4069 			if (icmp->icmp_state == TS_UNBND)
4070 				state = "UNBOUND";
4071 			else if (icmp->icmp_state == TS_IDLE)
4072 				state = "IDLE";
4073 			else if (icmp->icmp_state == TS_DATA_XFER)
4074 				state = "CONNECTED";
4075 			else
4076 				state = "UnkState";
4077 
4078 			(void) mi_mpprintf(mp, MI_COL_PTRFMT_STR "%s %s %s",
4079 			    (void *)icmp,
4080 			    inet_ntop(AF_INET6, &icmp->icmp_v6dst, faddrbuf,
4081 			    sizeof (faddrbuf)),
4082 			    inet_ntop(AF_INET6, &icmp->icmp_v6src, laddrbuf,
4083 			    sizeof (laddrbuf)),
4084 			    state);
4085 			mutex_exit(&(connp)->conn_lock);
4086 		}
4087 	}
4088 	return (0);
4089 }
4090 
4091 /*
4092  * This routine creates a T_UDERROR_IND message and passes it upstream.
4093  * The address and options are copied from the T_UNITDATA_REQ message
4094  * passed in mp.  This message is freed.
4095  */
4096 static void
4097 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
4098 {
4099 	mblk_t	*mp1;
4100 	uchar_t	*rptr = mp->b_rptr;
4101 	struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr;
4102 
4103 	mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset],
4104 	    tudr->DEST_length, (char *)&rptr[tudr->OPT_offset],
4105 	    tudr->OPT_length, err);
4106 	if (mp1)
4107 		qreply(q, mp1);
4108 	freemsg(mp);
4109 }
4110 
4111 /*
4112  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
4113  * After some error checking, the message is passed downstream to ip.
4114  */
4115 static void
4116 icmp_unbind(queue_t *q, mblk_t *mp)
4117 {
4118 	icmp_t	*icmp = Q_TO_ICMP(q);
4119 
4120 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4121 	/* If a bind has not been done, we can't unbind. */
4122 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
4123 		rw_exit(&icmp->icmp_rwlock);
4124 		icmp_err_ack(q, mp, TOUTSTATE, 0);
4125 		return;
4126 	}
4127 	icmp->icmp_pending_op = T_UNBIND_REQ;
4128 	rw_exit(&icmp->icmp_rwlock);
4129 
4130 	/*
4131 	 * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK
4132 	 * and therefore ip_unbind must never return NULL.
4133 	 */
4134 	mp = ip_unbind(q, mp);
4135 	ASSERT(mp != NULL);
4136 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
4137 
4138 	/*
4139 	 * Once we're unbound from IP, the pending operation may be cleared
4140 	 * here.
4141 	 */
4142 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4143 	V6_SET_ZERO(icmp->icmp_v6src);
4144 	V6_SET_ZERO(icmp->icmp_bound_v6src);
4145 	icmp->icmp_pending_op = -1;
4146 	icmp->icmp_state = TS_UNBND;
4147 	if (icmp->icmp_family == AF_INET6)
4148 		(void) icmp_build_hdrs(icmp);
4149 	rw_exit(&icmp->icmp_rwlock);
4150 
4151 	qreply(q, mp);
4152 }
4153 
4154 /*
4155  * Process IPv4 packets that already include an IP header.
4156  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
4157  * IPPROTO_IGMP).
4158  */
4159 static void
4160 icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
4161 {
4162 	icmp_stack_t *is = icmp->icmp_is;
4163 	ipha_t	*ipha;
4164 	int	ip_hdr_length;
4165 	int	tp_hdr_len;
4166 	mblk_t	*mp1;
4167 	uint_t	pkt_len;
4168 	ip_opt_info_t optinfo;
4169 	conn_t	*connp = icmp->icmp_connp;
4170 
4171 	optinfo.ip_opt_flags = 0;
4172 	optinfo.ip_opt_ill_index = 0;
4173 	ipha = (ipha_t *)mp->b_rptr;
4174 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4175 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
4176 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4177 			ASSERT(icmp != NULL);
4178 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4179 			freemsg(mp);
4180 			return;
4181 		}
4182 		ipha = (ipha_t *)mp->b_rptr;
4183 	}
4184 	ipha->ipha_version_and_hdr_length =
4185 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
4186 
4187 	/*
4188 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4189 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4190 	 * tell IP that the application has sent a complete IP header and not
4191 	 * to compute the transport checksum nor change the DF flag.
4192 	 */
4193 	ipha->ipha_ident = IP_HDR_INCLUDED;
4194 	ipha->ipha_hdr_checksum = 0;
4195 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
4196 	/* Insert options if any */
4197 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4198 		/*
4199 		 * Put the IP header plus any transport header that is
4200 		 * checksumed by ip_wput into the first mblk. (ip_wput assumes
4201 		 * that at least the checksum field is in the first mblk.)
4202 		 */
4203 		switch (ipha->ipha_protocol) {
4204 		case IPPROTO_UDP:
4205 			tp_hdr_len = 8;
4206 			break;
4207 		case IPPROTO_TCP:
4208 			tp_hdr_len = 20;
4209 			break;
4210 		default:
4211 			tp_hdr_len = 0;
4212 			break;
4213 		}
4214 		/*
4215 		 * The code below assumes that IP_SIMPLE_HDR_LENGTH plus
4216 		 * tp_hdr_len bytes will be in a single mblk.
4217 		 */
4218 		if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH +
4219 		    tp_hdr_len)) {
4220 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH +
4221 			    tp_hdr_len)) {
4222 				BUMP_MIB(&is->is_rawip_mib,
4223 				    rawipOutErrors);
4224 				freemsg(mp);
4225 				return;
4226 			}
4227 			ipha = (ipha_t *)mp->b_rptr;
4228 		}
4229 
4230 		/*
4231 		 * if the length is larger then the max allowed IP packet,
4232 		 * then send an error and abort the processing.
4233 		 */
4234 		pkt_len = ntohs(ipha->ipha_length)
4235 		    + icmp->icmp_ip_snd_options_len;
4236 		if (pkt_len > IP_MAXPACKET) {
4237 			icmp_ud_err(q, mp, EMSGSIZE);
4238 			return;
4239 		}
4240 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
4241 		    tp_hdr_len, BPRI_LO))) {
4242 			icmp_ud_err(q, mp, ENOMEM);
4243 			return;
4244 		}
4245 		mp1->b_rptr += is->is_wroff_extra;
4246 		mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
4247 
4248 		ipha->ipha_length = htons((uint16_t)pkt_len);
4249 		bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH);
4250 
4251 		/* Copy transport header if any */
4252 		bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len);
4253 		mp1->b_wptr += tp_hdr_len;
4254 
4255 		/* Add options */
4256 		ipha = (ipha_t *)mp1->b_rptr;
4257 		bcopy(icmp->icmp_ip_snd_options, &ipha[1],
4258 		    icmp->icmp_ip_snd_options_len);
4259 
4260 		/* Drop IP header and transport header from original */
4261 		(void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len);
4262 
4263 		mp1->b_cont = mp;
4264 		mp = mp1;
4265 		/*
4266 		 * Massage source route putting first source
4267 		 * route in ipha_dst.
4268 		 */
4269 		(void) ip_massage_options(ipha, is->is_netstack);
4270 	}
4271 
4272 	if (pktinfop != NULL) {
4273 		/*
4274 		 * Over write the source address provided in the header
4275 		 */
4276 		if (pktinfop->ip4_addr != INADDR_ANY) {
4277 			ipha->ipha_src = pktinfop->ip4_addr;
4278 			optinfo.ip_opt_flags = IP_VERIFY_SRC;
4279 		}
4280 
4281 		if (pktinfop->ip4_ill_index != 0) {
4282 			optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4283 		}
4284 	}
4285 
4286 	mblk_setcred(mp, connp->conn_cred);
4287 	ip_output_options(connp, mp, q, IP_WPUT,
4288 	    &optinfo);
4289 }
4290 
4291 static boolean_t
4292 icmp_update_label(queue_t *q, icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
4293 {
4294 	int err;
4295 	uchar_t opt_storage[IP_MAX_OPT_LENGTH];
4296 	icmp_stack_t		*is = icmp->icmp_is;
4297 	conn_t	*connp = icmp->icmp_connp;
4298 
4299 	err = tsol_compute_label(DB_CREDDEF(mp, connp->conn_cred), dst,
4300 	    opt_storage, icmp->icmp_mac_exempt,
4301 	    is->is_netstack->netstack_ip);
4302 	if (err == 0) {
4303 		err = tsol_update_options(&icmp->icmp_ip_snd_options,
4304 		    &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len,
4305 		    opt_storage);
4306 	}
4307 	if (err != 0) {
4308 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4309 		DTRACE_PROBE4(
4310 		    tx__ip__log__drop__updatelabel__icmp,
4311 		    char *, "queue(1) failed to update options(2) on mp(3)",
4312 		    queue_t *, q, char *, opt_storage, mblk_t *, mp);
4313 		icmp_ud_err(q, mp, err);
4314 		return (B_FALSE);
4315 	}
4316 	IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
4317 	return (B_TRUE);
4318 }
4319 
4320 /*
4321  * This routine handles all messages passed downstream.  It either
4322  * consumes the message or passes it downstream; it never queues a
4323  * a message.
4324  */
4325 static void
4326 icmp_wput(queue_t *q, mblk_t *mp)
4327 {
4328 	uchar_t	*rptr = mp->b_rptr;
4329 	ipha_t	*ipha;
4330 	mblk_t	*mp1;
4331 	int	ip_hdr_length;
4332 #define	tudr ((struct T_unitdata_req *)rptr)
4333 	size_t	ip_len;
4334 	conn_t	*connp = Q_TO_CONN(q);
4335 	icmp_t	*icmp = connp->conn_icmp;
4336 	icmp_stack_t *is = icmp->icmp_is;
4337 	sin6_t	*sin6;
4338 	sin_t	*sin;
4339 	ipaddr_t	v4dst;
4340 	ip4_pkt_t	pktinfo;
4341 	ip4_pkt_t	*pktinfop = &pktinfo;
4342 	ip_opt_info_t	optinfo;
4343 
4344 	switch (mp->b_datap->db_type) {
4345 	case M_DATA:
4346 		if (icmp->icmp_hdrincl) {
4347 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
4348 			ipha = (ipha_t *)mp->b_rptr;
4349 			if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
4350 				if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4351 					BUMP_MIB(&is->is_rawip_mib,
4352 					    rawipOutErrors);
4353 					freemsg(mp);
4354 					return;
4355 				}
4356 				ipha = (ipha_t *)mp->b_rptr;
4357 			}
4358 			/*
4359 			 * If this connection was used for v6 (inconceivable!)
4360 			 * or if we have a new destination, then it's time to
4361 			 * figure a new label.
4362 			 */
4363 			if (is_system_labeled() &&
4364 			    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4365 			    V4_PART_OF_V6(icmp->icmp_v6lastdst) !=
4366 			    ipha->ipha_dst) &&
4367 			    !icmp_update_label(q, icmp, mp, ipha->ipha_dst)) {
4368 				return;
4369 			}
4370 			icmp_wput_hdrincl(q, mp, icmp, NULL);
4371 			return;
4372 		}
4373 		freemsg(mp);
4374 		return;
4375 	case M_PROTO:
4376 	case M_PCPROTO:
4377 		ip_len = mp->b_wptr - rptr;
4378 		if (ip_len >= sizeof (struct T_unitdata_req)) {
4379 			/* Expedite valid T_UNITDATA_REQ to below the switch */
4380 			if (((union T_primitives *)rptr)->type
4381 			    == T_UNITDATA_REQ)
4382 				break;
4383 		}
4384 		/* FALLTHRU */
4385 	default:
4386 		icmp_wput_other(q, mp);
4387 		return;
4388 	}
4389 
4390 	/* Handle T_UNITDATA_REQ messages here. */
4391 
4392 
4393 
4394 	if (icmp->icmp_state == TS_UNBND) {
4395 		/* If a port has not been bound to the stream, fail. */
4396 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4397 		icmp_ud_err(q, mp, EPROTO);
4398 		return;
4399 	}
4400 	mp1 = mp->b_cont;
4401 	if (mp1 == NULL) {
4402 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4403 		icmp_ud_err(q, mp, EPROTO);
4404 		return;
4405 	}
4406 
4407 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
4408 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4409 		icmp_ud_err(q, mp, EADDRNOTAVAIL);
4410 		return;
4411 	}
4412 
4413 	switch (icmp->icmp_family) {
4414 	case AF_INET6:
4415 		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
4416 		if (!OK_32PTR((char *)sin6) ||
4417 		    tudr->DEST_length != sizeof (sin6_t) ||
4418 		    sin6->sin6_family != AF_INET6) {
4419 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4420 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4421 			return;
4422 		}
4423 
4424 		/* No support for mapped addresses on raw sockets */
4425 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4426 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4427 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4428 			return;
4429 		}
4430 
4431 		/*
4432 		 * Destination is a native IPv6 address.
4433 		 * Send out an IPv6 format packet.
4434 		 */
4435 		icmp_wput_ipv6(q, mp, sin6, tudr->OPT_length);
4436 		return;
4437 
4438 	case AF_INET:
4439 		sin = (sin_t *)&rptr[tudr->DEST_offset];
4440 		if (!OK_32PTR((char *)sin) ||
4441 		    tudr->DEST_length != sizeof (sin_t) ||
4442 		    sin->sin_family != AF_INET) {
4443 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4444 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4445 			return;
4446 		}
4447 		/* Extract and ipaddr */
4448 		v4dst = sin->sin_addr.s_addr;
4449 		break;
4450 
4451 	default:
4452 		ASSERT(0);
4453 	}
4454 
4455 	pktinfop->ip4_ill_index = 0;
4456 	pktinfop->ip4_addr = INADDR_ANY;
4457 	optinfo.ip_opt_flags = 0;
4458 	optinfo.ip_opt_ill_index = 0;
4459 
4460 
4461 	/*
4462 	 * If options passed in, feed it for verification and handling
4463 	 */
4464 	if (tudr->OPT_length != 0) {
4465 		int error;
4466 
4467 		error = 0;
4468 		if (icmp_unitdata_opt_process(q, mp, &error,
4469 		    (void *)pktinfop) < 0) {
4470 			/* failure */
4471 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4472 			icmp_ud_err(q, mp, error);
4473 			return;
4474 		}
4475 		ASSERT(error == 0);
4476 		/*
4477 		 * Note: Success in processing options.
4478 		 * mp option buffer represented by
4479 		 * OPT_length/offset now potentially modified
4480 		 * and contain option setting results
4481 		 */
4482 
4483 	}
4484 
4485 	if (v4dst == INADDR_ANY)
4486 		v4dst = htonl(INADDR_LOOPBACK);
4487 
4488 	/* Check if our saved options are valid; update if not */
4489 	if (is_system_labeled() &&
4490 	    (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4491 	    V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst) &&
4492 	    !icmp_update_label(q, icmp, mp, v4dst)) {
4493 		return;
4494 	}
4495 
4496 	/* Protocol 255 contains full IP headers */
4497 	if (icmp->icmp_hdrincl) {
4498 		freeb(mp);
4499 		icmp_wput_hdrincl(q, mp1, icmp, pktinfop);
4500 		return;
4501 	}
4502 
4503 
4504 	/* Add an IP header */
4505 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4506 	ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length];
4507 	if ((uchar_t *)ipha < mp1->b_datap->db_base ||
4508 	    mp1->b_datap->db_ref != 1 ||
4509 	    !OK_32PTR(ipha)) {
4510 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
4511 		    BPRI_LO))) {
4512 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4513 			icmp_ud_err(q, mp, ENOMEM);
4514 			return;
4515 		}
4516 		mp1->b_cont = mp->b_cont;
4517 		ipha = (ipha_t *)mp1->b_datap->db_lim;
4518 		mp1->b_wptr = (uchar_t *)ipha;
4519 		ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
4520 	}
4521 #ifdef	_BIG_ENDIAN
4522 	/* Set version, header length, and tos */
4523 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4524 	    ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
4525 	    icmp->icmp_type_of_service);
4526 	/* Set ttl and protocol */
4527 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto;
4528 #else
4529 	/* Set version, header length, and tos */
4530 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4531 	    ((icmp->icmp_type_of_service << 8) |
4532 	    ((IP_VERSION << 4) | (ip_hdr_length>>2)));
4533 	/* Set ttl and protocol */
4534 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl;
4535 #endif
4536 	if (pktinfop->ip4_addr != INADDR_ANY) {
4537 		ipha->ipha_src = pktinfop->ip4_addr;
4538 		optinfo.ip_opt_flags = IP_VERIFY_SRC;
4539 	} else {
4540 
4541 		/*
4542 		 * Copy our address into the packet.  If this is zero,
4543 		 * ip will fill in the real source address.
4544 		 */
4545 		IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src);
4546 	}
4547 
4548 	ipha->ipha_fragment_offset_and_flags = 0;
4549 
4550 	if (pktinfop->ip4_ill_index != 0) {
4551 		optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4552 	}
4553 
4554 
4555 	/*
4556 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4557 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4558 	 * tell IP that the application has sent a complete IP header and not
4559 	 * to compute the transport checksum nor change the DF flag.
4560 	 */
4561 	ipha->ipha_ident = IP_HDR_INCLUDED;
4562 
4563 	/* Finish common formatting of the packet. */
4564 	mp1->b_rptr = (uchar_t *)ipha;
4565 
4566 	ip_len = mp1->b_wptr - (uchar_t *)ipha;
4567 	if (mp1->b_cont != NULL)
4568 		ip_len += msgdsize(mp1->b_cont);
4569 
4570 	/*
4571 	 * Set the length into the IP header.
4572 	 * If the length is greater than the maximum allowed by IP,
4573 	 * then free the message and return. Do not try and send it
4574 	 * as this can cause problems in layers below.
4575 	 */
4576 	if (ip_len > IP_MAXPACKET) {
4577 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4578 		icmp_ud_err(q, mp, EMSGSIZE);
4579 		return;
4580 	}
4581 	ipha->ipha_length = htons((uint16_t)ip_len);
4582 	/*
4583 	 * Copy in the destination address from the T_UNITDATA
4584 	 * request
4585 	 */
4586 	ipha->ipha_dst = v4dst;
4587 
4588 	/*
4589 	 * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
4590 	 */
4591 	if (CLASSD(v4dst))
4592 		ipha->ipha_ttl = icmp->icmp_multicast_ttl;
4593 
4594 	/* Copy in options if any */
4595 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4596 		bcopy(icmp->icmp_ip_snd_options,
4597 		    &ipha[1], icmp->icmp_ip_snd_options_len);
4598 		/*
4599 		 * Massage source route putting first source route in ipha_dst.
4600 		 * Ignore the destination in the T_unitdata_req.
4601 		 */
4602 		(void) ip_massage_options(ipha, is->is_netstack);
4603 	}
4604 
4605 	freeb(mp);
4606 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4607 	mblk_setcred(mp1, connp->conn_cred);
4608 	ip_output_options(Q_TO_CONN(q), mp1, q, IP_WPUT, &optinfo);
4609 #undef	ipha
4610 #undef tudr
4611 }
4612 
4613 static boolean_t
4614 icmp_update_label_v6(queue_t *wq, icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
4615 {
4616 	int err;
4617 	uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
4618 	icmp_stack_t		*is = icmp->icmp_is;
4619 	conn_t	*connp = icmp->icmp_connp;
4620 
4621 	err = tsol_compute_label_v6(DB_CREDDEF(mp, connp->conn_cred), dst,
4622 	    opt_storage, icmp->icmp_mac_exempt,
4623 	    is->is_netstack->netstack_ip);
4624 	if (err == 0) {
4625 		err = tsol_update_sticky(&icmp->icmp_sticky_ipp,
4626 		    &icmp->icmp_label_len_v6, opt_storage);
4627 	}
4628 	if (err != 0) {
4629 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4630 		DTRACE_PROBE4(
4631 		    tx__ip__log__drop__updatelabel__icmp6,
4632 		    char *, "queue(1) failed to update options(2) on mp(3)",
4633 		    queue_t *, wq, char *, opt_storage, mblk_t *, mp);
4634 		icmp_ud_err(wq, mp, err);
4635 		return (B_FALSE);
4636 	}
4637 
4638 	icmp->icmp_v6lastdst = *dst;
4639 	return (B_TRUE);
4640 }
4641 
4642 /*
4643  * icmp_wput_ipv6():
4644  * Assumes that icmp_wput did some sanity checking on the destination
4645  * address, but that the label may not yet be correct.
4646  */
4647 void
4648 icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
4649 {
4650 	ip6_t			*ip6h;
4651 	ip6i_t			*ip6i;	/* mp1->b_rptr even if no ip6i_t */
4652 	mblk_t			*mp1;
4653 	int			ip_hdr_len = IPV6_HDR_LEN;
4654 	size_t			ip_len;
4655 	icmp_t			*icmp = Q_TO_ICMP(q);
4656 	icmp_stack_t		*is = icmp->icmp_is;
4657 	ip6_pkt_t		ipp_s;	/* For ancillary data options */
4658 	ip6_pkt_t		*ipp = &ipp_s;
4659 	ip6_pkt_t		*tipp;
4660 	uint32_t		csum = 0;
4661 	uint_t			ignore = 0;
4662 	uint_t			option_exists = 0, is_sticky = 0;
4663 	uint8_t			*cp;
4664 	uint8_t			*nxthdr_ptr;
4665 	in6_addr_t		ip6_dst;
4666 
4667 	/*
4668 	 * If the local address is a mapped address return
4669 	 * an error.
4670 	 * It would be possible to send an IPv6 packet but the
4671 	 * response would never make it back to the application
4672 	 * since it is bound to a mapped address.
4673 	 */
4674 	if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
4675 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4676 		icmp_ud_err(q, mp, EADDRNOTAVAIL);
4677 		return;
4678 	}
4679 
4680 	ipp->ipp_fields = 0;
4681 	ipp->ipp_sticky_ignored = 0;
4682 
4683 	/*
4684 	 * If TPI options passed in, feed it for verification and handling
4685 	 */
4686 	if (tudr_optlen != 0) {
4687 		int error;
4688 
4689 		if (icmp_unitdata_opt_process(q, mp, &error,
4690 		    (void *)ipp) < 0) {
4691 			/* failure */
4692 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4693 			icmp_ud_err(q, mp, error);
4694 			return;
4695 		}
4696 		ignore = ipp->ipp_sticky_ignored;
4697 		ASSERT(error == 0);
4698 	}
4699 
4700 	if (sin6->sin6_scope_id != 0 &&
4701 	    IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
4702 		/*
4703 		 * IPPF_SCOPE_ID is special.  It's neither a sticky
4704 		 * option nor ancillary data.  It needs to be
4705 		 * explicitly set in options_exists.
4706 		 */
4707 		option_exists |= IPPF_SCOPE_ID;
4708 	}
4709 
4710 	/*
4711 	 * Compute the destination address
4712 	 */
4713 	ip6_dst = sin6->sin6_addr;
4714 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4715 		ip6_dst = ipv6_loopback;
4716 
4717 	/*
4718 	 * If we're not going to the same destination as last time, then
4719 	 * recompute the label required.  This is done in a separate routine to
4720 	 * avoid blowing up our stack here.
4721 	 */
4722 	if (is_system_labeled() &&
4723 	    !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) &&
4724 	    !icmp_update_label_v6(q, icmp, mp, &ip6_dst)) {
4725 		return;
4726 	}
4727 
4728 	/*
4729 	 * If there's a security label here, then we ignore any options the
4730 	 * user may try to set.  We keep the peer's label as a hidden sticky
4731 	 * option.
4732 	 */
4733 	if (icmp->icmp_label_len_v6 > 0) {
4734 		ignore &= ~IPPF_HOPOPTS;
4735 		ipp->ipp_fields &= ~IPPF_HOPOPTS;
4736 	}
4737 
4738 	if ((icmp->icmp_sticky_ipp.ipp_fields == 0) &&
4739 	    (ipp->ipp_fields == 0)) {
4740 		/* No sticky options nor ancillary data. */
4741 		goto no_options;
4742 	}
4743 
4744 	/*
4745 	 * Go through the options figuring out where each is going to
4746 	 * come from and build two masks.  The first mask indicates if
4747 	 * the option exists at all.  The second mask indicates if the
4748 	 * option is sticky or ancillary.
4749 	 */
4750 	if (!(ignore & IPPF_HOPOPTS)) {
4751 		if (ipp->ipp_fields & IPPF_HOPOPTS) {
4752 			option_exists |= IPPF_HOPOPTS;
4753 			ip_hdr_len += ipp->ipp_hopoptslen;
4754 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
4755 			option_exists |= IPPF_HOPOPTS;
4756 			is_sticky |= IPPF_HOPOPTS;
4757 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_hopoptslen;
4758 		}
4759 	}
4760 
4761 	if (!(ignore & IPPF_RTHDR)) {
4762 		if (ipp->ipp_fields & IPPF_RTHDR) {
4763 			option_exists |= IPPF_RTHDR;
4764 			ip_hdr_len += ipp->ipp_rthdrlen;
4765 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
4766 			option_exists |= IPPF_RTHDR;
4767 			is_sticky |= IPPF_RTHDR;
4768 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen;
4769 		}
4770 	}
4771 
4772 	if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
4773 		/*
4774 		 * Need to have a router header to use these.
4775 		 */
4776 		if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
4777 			option_exists |= IPPF_RTDSTOPTS;
4778 			ip_hdr_len += ipp->ipp_rtdstoptslen;
4779 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
4780 			option_exists |= IPPF_RTDSTOPTS;
4781 			is_sticky |= IPPF_RTDSTOPTS;
4782 			ip_hdr_len +=
4783 			    icmp->icmp_sticky_ipp.ipp_rtdstoptslen;
4784 		}
4785 	}
4786 
4787 	if (!(ignore & IPPF_DSTOPTS)) {
4788 		if (ipp->ipp_fields & IPPF_DSTOPTS) {
4789 			option_exists |= IPPF_DSTOPTS;
4790 			ip_hdr_len += ipp->ipp_dstoptslen;
4791 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
4792 			option_exists |= IPPF_DSTOPTS;
4793 			is_sticky |= IPPF_DSTOPTS;
4794 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen;
4795 		}
4796 	}
4797 
4798 	if (!(ignore & IPPF_IFINDEX)) {
4799 		if (ipp->ipp_fields & IPPF_IFINDEX) {
4800 			option_exists |= IPPF_IFINDEX;
4801 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
4802 			option_exists |= IPPF_IFINDEX;
4803 			is_sticky |= IPPF_IFINDEX;
4804 		}
4805 	}
4806 
4807 	if (!(ignore & IPPF_ADDR)) {
4808 		if (ipp->ipp_fields & IPPF_ADDR) {
4809 			option_exists |= IPPF_ADDR;
4810 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) {
4811 			option_exists |= IPPF_ADDR;
4812 			is_sticky |= IPPF_ADDR;
4813 		}
4814 	}
4815 
4816 	if (!(ignore & IPPF_DONTFRAG)) {
4817 		if (ipp->ipp_fields & IPPF_DONTFRAG) {
4818 			option_exists |= IPPF_DONTFRAG;
4819 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
4820 			option_exists |= IPPF_DONTFRAG;
4821 			is_sticky |= IPPF_DONTFRAG;
4822 		}
4823 	}
4824 
4825 	if (!(ignore & IPPF_USE_MIN_MTU)) {
4826 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
4827 			option_exists |= IPPF_USE_MIN_MTU;
4828 		} else if (icmp->icmp_sticky_ipp.ipp_fields &
4829 		    IPPF_USE_MIN_MTU) {
4830 			option_exists |= IPPF_USE_MIN_MTU;
4831 			is_sticky |= IPPF_USE_MIN_MTU;
4832 		}
4833 	}
4834 
4835 	if (!(ignore & IPPF_NEXTHOP)) {
4836 		if (ipp->ipp_fields & IPPF_NEXTHOP) {
4837 			option_exists |= IPPF_NEXTHOP;
4838 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) {
4839 			option_exists |= IPPF_NEXTHOP;
4840 			is_sticky |= IPPF_NEXTHOP;
4841 		}
4842 	}
4843 
4844 	if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
4845 		option_exists |= IPPF_HOPLIMIT;
4846 	/* IPV6_HOPLIMIT can never be sticky */
4847 	ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
4848 
4849 	if (!(ignore & IPPF_UNICAST_HOPS) &&
4850 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
4851 		option_exists |= IPPF_UNICAST_HOPS;
4852 		is_sticky |= IPPF_UNICAST_HOPS;
4853 	}
4854 
4855 	if (!(ignore & IPPF_MULTICAST_HOPS) &&
4856 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
4857 		option_exists |= IPPF_MULTICAST_HOPS;
4858 		is_sticky |= IPPF_MULTICAST_HOPS;
4859 	}
4860 
4861 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) {
4862 		/* This is a sticky socket option only */
4863 		option_exists |= IPPF_NO_CKSUM;
4864 		is_sticky |= IPPF_NO_CKSUM;
4865 	}
4866 
4867 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) {
4868 		/* This is a sticky socket option only */
4869 		option_exists |= IPPF_RAW_CKSUM;
4870 		is_sticky |= IPPF_RAW_CKSUM;
4871 	}
4872 
4873 	if (!(ignore & IPPF_TCLASS)) {
4874 		if (ipp->ipp_fields & IPPF_TCLASS) {
4875 			option_exists |= IPPF_TCLASS;
4876 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
4877 			option_exists |= IPPF_TCLASS;
4878 			is_sticky |= IPPF_TCLASS;
4879 		}
4880 	}
4881 
4882 no_options:
4883 
4884 	/*
4885 	 * If any options carried in the ip6i_t were specified, we
4886 	 * need to account for the ip6i_t in the data we'll be sending
4887 	 * down.
4888 	 */
4889 	if (option_exists & IPPF_HAS_IP6I)
4890 		ip_hdr_len += sizeof (ip6i_t);
4891 
4892 	/* check/fix buffer config, setup pointers into it */
4893 	mp1 = mp->b_cont;
4894 	ip6h = (ip6_t *)&mp1->b_rptr[-ip_hdr_len];
4895 	if ((mp1->b_datap->db_ref != 1) ||
4896 	    ((unsigned char *)ip6h < mp1->b_datap->db_base) ||
4897 	    !OK_32PTR(ip6h)) {
4898 		/* Try to get everything in a single mblk next time */
4899 		if (ip_hdr_len > icmp->icmp_max_hdr_len) {
4900 			icmp->icmp_max_hdr_len = ip_hdr_len;
4901 			(void) mi_set_sth_wroff(RD(q),
4902 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
4903 		}
4904 		mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
4905 		if (!mp1) {
4906 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4907 			icmp_ud_err(q, mp, ENOMEM);
4908 			return;
4909 		}
4910 		mp1->b_cont = mp->b_cont;
4911 		mp1->b_wptr = mp1->b_datap->db_lim;
4912 		ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
4913 	}
4914 	mp1->b_rptr = (unsigned char *)ip6h;
4915 	ip6i = (ip6i_t *)ip6h;
4916 
4917 #define	ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
4918 	if (option_exists & IPPF_HAS_IP6I) {
4919 		ip6h = (ip6_t *)&ip6i[1];
4920 		ip6i->ip6i_flags = 0;
4921 		ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
4922 
4923 		/* sin6_scope_id takes precendence over IPPF_IFINDEX */
4924 		if (option_exists & IPPF_SCOPE_ID) {
4925 			ip6i->ip6i_flags |= IP6I_IFINDEX;
4926 			ip6i->ip6i_ifindex = sin6->sin6_scope_id;
4927 		} else if (option_exists & IPPF_IFINDEX) {
4928 			tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
4929 			ASSERT(tipp->ipp_ifindex != 0);
4930 			ip6i->ip6i_flags |= IP6I_IFINDEX;
4931 			ip6i->ip6i_ifindex = tipp->ipp_ifindex;
4932 		}
4933 
4934 		if (option_exists & IPPF_RAW_CKSUM) {
4935 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
4936 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
4937 		}
4938 
4939 		if (option_exists & IPPF_NO_CKSUM) {
4940 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
4941 		}
4942 
4943 		if (option_exists & IPPF_ADDR) {
4944 			/*
4945 			 * Enable per-packet source address verification if
4946 			 * IPV6_PKTINFO specified the source address.
4947 			 * ip6_src is set in the transport's _wput function.
4948 			 */
4949 			ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
4950 		}
4951 
4952 		if (option_exists & IPPF_DONTFRAG) {
4953 			ip6i->ip6i_flags |= IP6I_DONTFRAG;
4954 		}
4955 
4956 		if (option_exists & IPPF_USE_MIN_MTU) {
4957 			ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
4958 			    ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
4959 		}
4960 
4961 		if (option_exists & IPPF_NEXTHOP) {
4962 			tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
4963 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
4964 			ip6i->ip6i_flags |= IP6I_NEXTHOP;
4965 			ip6i->ip6i_nexthop = tipp->ipp_nexthop;
4966 		}
4967 
4968 		/*
4969 		 * tell IP this is an ip6i_t private header
4970 		 */
4971 		ip6i->ip6i_nxt = IPPROTO_RAW;
4972 	}
4973 
4974 	/* Initialize IPv6 header */
4975 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
4976 	bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
4977 
4978 	/* Set the hoplimit of the outgoing packet. */
4979 	if (option_exists & IPPF_HOPLIMIT) {
4980 		/* IPV6_HOPLIMIT ancillary data overrides all other settings. */
4981 		ip6h->ip6_hops = ipp->ipp_hoplimit;
4982 		ip6i->ip6i_flags |= IP6I_HOPLIMIT;
4983 	} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
4984 		ip6h->ip6_hops = icmp->icmp_multicast_ttl;
4985 		if (option_exists & IPPF_MULTICAST_HOPS)
4986 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
4987 	} else {
4988 		ip6h->ip6_hops = icmp->icmp_ttl;
4989 		if (option_exists & IPPF_UNICAST_HOPS)
4990 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
4991 	}
4992 
4993 	if (option_exists & IPPF_ADDR) {
4994 		tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
4995 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
4996 		ip6h->ip6_src = tipp->ipp_addr;
4997 	} else {
4998 		/*
4999 		 * The source address was not set using IPV6_PKTINFO.
5000 		 * First look at the bound source.
5001 		 * If unspecified fallback to __sin6_src_id.
5002 		 */
5003 		ip6h->ip6_src = icmp->icmp_v6src;
5004 		if (sin6->__sin6_src_id != 0 &&
5005 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
5006 			ip_srcid_find_id(sin6->__sin6_src_id,
5007 			    &ip6h->ip6_src, icmp->icmp_zoneid,
5008 			    is->is_netstack);
5009 		}
5010 	}
5011 
5012 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
5013 	cp = (uint8_t *)&ip6h[1];
5014 
5015 	/*
5016 	 * Here's where we have to start stringing together
5017 	 * any extension headers in the right order:
5018 	 * Hop-by-hop, destination, routing, and final destination opts.
5019 	 */
5020 	if (option_exists & IPPF_HOPOPTS) {
5021 		/* Hop-by-hop options */
5022 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
5023 		tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
5024 
5025 		*nxthdr_ptr = IPPROTO_HOPOPTS;
5026 		nxthdr_ptr = &hbh->ip6h_nxt;
5027 
5028 		bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen);
5029 		cp += tipp->ipp_hopoptslen;
5030 	}
5031 	/*
5032 	 * En-route destination options
5033 	 * Only do them if there's a routing header as well
5034 	 */
5035 	if (option_exists & IPPF_RTDSTOPTS) {
5036 		ip6_dest_t *dst = (ip6_dest_t *)cp;
5037 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
5038 
5039 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5040 		nxthdr_ptr = &dst->ip6d_nxt;
5041 
5042 		bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
5043 		cp += tipp->ipp_rtdstoptslen;
5044 	}
5045 	/*
5046 	 * Routing header next
5047 	 */
5048 	if (option_exists & IPPF_RTHDR) {
5049 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
5050 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
5051 
5052 		*nxthdr_ptr = IPPROTO_ROUTING;
5053 		nxthdr_ptr = &rt->ip6r_nxt;
5054 
5055 		bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
5056 		cp += tipp->ipp_rthdrlen;
5057 	}
5058 	/*
5059 	 * Do ultimate destination options
5060 	 */
5061 	if (option_exists & IPPF_DSTOPTS) {
5062 		ip6_dest_t *dest = (ip6_dest_t *)cp;
5063 		tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
5064 
5065 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5066 		nxthdr_ptr = &dest->ip6d_nxt;
5067 
5068 		bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
5069 		cp += tipp->ipp_dstoptslen;
5070 	}
5071 
5072 	/*
5073 	 * Now set the last header pointer to the proto passed in
5074 	 */
5075 	ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len);
5076 	*nxthdr_ptr = icmp->icmp_proto;
5077 
5078 	/*
5079 	 * Copy in the destination address
5080 	 */
5081 	ip6h->ip6_dst = ip6_dst;
5082 
5083 	ip6h->ip6_vcf =
5084 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
5085 	    (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
5086 
5087 	if (option_exists & IPPF_TCLASS) {
5088 		tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
5089 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
5090 		    tipp->ipp_tclass);
5091 	}
5092 	if (option_exists & IPPF_RTHDR) {
5093 		ip6_rthdr_t	*rth;
5094 
5095 		/*
5096 		 * Perform any processing needed for source routing.
5097 		 * We know that all extension headers will be in the same mblk
5098 		 * as the IPv6 header.
5099 		 */
5100 		rth = ip_find_rthdr_v6(ip6h, mp1->b_wptr);
5101 		if (rth != NULL && rth->ip6r_segleft != 0) {
5102 			if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
5103 				/*
5104 				 * Drop packet - only support Type 0 routing.
5105 				 * Notify the application as well.
5106 				 */
5107 				icmp_ud_err(q, mp, EPROTO);
5108 				BUMP_MIB(&is->is_rawip_mib,
5109 				    rawipOutErrors);
5110 				return;
5111 			}
5112 			/*
5113 			 * rth->ip6r_len is twice the number of
5114 			 * addresses in the header
5115 			 */
5116 			if (rth->ip6r_len & 0x1) {
5117 				icmp_ud_err(q, mp, EPROTO);
5118 				BUMP_MIB(&is->is_rawip_mib,
5119 				    rawipOutErrors);
5120 				return;
5121 			}
5122 			/*
5123 			 * Shuffle the routing header and ip6_dst
5124 			 * addresses, and get the checksum difference
5125 			 * between the first hop (in ip6_dst) and
5126 			 * the destination (in the last routing hdr entry).
5127 			 */
5128 			csum = ip_massage_options_v6(ip6h, rth,
5129 			    is->is_netstack);
5130 			/*
5131 			 * Verify that the first hop isn't a mapped address.
5132 			 * Routers along the path need to do this verification
5133 			 * for subsequent hops.
5134 			 */
5135 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
5136 				icmp_ud_err(q, mp, EADDRNOTAVAIL);
5137 				BUMP_MIB(&is->is_rawip_mib,
5138 				    rawipOutErrors);
5139 				return;
5140 			}
5141 		}
5142 	}
5143 
5144 	ip_len = mp1->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
5145 	if (mp1->b_cont != NULL)
5146 		ip_len += msgdsize(mp1->b_cont);
5147 
5148 	/*
5149 	 * Set the length into the IP header.
5150 	 * If the length is greater than the maximum allowed by IP,
5151 	 * then free the message and return. Do not try and send it
5152 	 * as this can cause problems in layers below.
5153 	 */
5154 	if (ip_len > IP_MAXPACKET) {
5155 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5156 		icmp_ud_err(q, mp, EMSGSIZE);
5157 		return;
5158 	}
5159 	if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
5160 		uint_t	cksum_off;	/* From ip6i == mp1->b_rptr */
5161 		uint16_t *cksum_ptr;
5162 		uint_t	ext_hdrs_len;
5163 
5164 		/* ICMPv6 must have an offset matching icmp6_cksum offset */
5165 		ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 ||
5166 		    icmp->icmp_checksum_off == 2);
5167 
5168 		/*
5169 		 * We make it easy for IP to include our pseudo header
5170 		 * by putting our length in uh_checksum, modified (if
5171 		 * we have a routing header) by the checksum difference
5172 		 * between the ultimate destination and first hop addresses.
5173 		 * Note: ICMPv6 must always checksum the packet.
5174 		 */
5175 		cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
5176 		if (cksum_off + sizeof (uint16_t) > mp1->b_wptr - mp1->b_rptr) {
5177 			if (!pullupmsg(mp1, cksum_off + sizeof (uint16_t))) {
5178 				BUMP_MIB(&is->is_rawip_mib,
5179 				    rawipOutErrors);
5180 				freemsg(mp);
5181 				return;
5182 			}
5183 			ip6i = (ip6i_t *)mp1->b_rptr;
5184 			if (ip6i->ip6i_nxt == IPPROTO_RAW)
5185 				ip6h = (ip6_t *)&ip6i[1];
5186 			else
5187 				ip6h = (ip6_t *)ip6i;
5188 		}
5189 		/* Add payload length to checksum */
5190 		ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN -
5191 		    (int)((uchar_t *)ip6h - (uchar_t *)ip6i);
5192 		csum += htons(ip_len - ext_hdrs_len);
5193 
5194 		cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off);
5195 		csum = (csum & 0xFFFF) + (csum >> 16);
5196 		*cksum_ptr = (uint16_t)csum;
5197 	}
5198 
5199 #ifdef _LITTLE_ENDIAN
5200 	ip_len = htons(ip_len);
5201 #endif
5202 	ip6h->ip6_plen = (uint16_t)ip_len;
5203 
5204 	freeb(mp);
5205 
5206 	/* We're done. Pass the packet to IP */
5207 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
5208 	ip_output_v6(icmp->icmp_connp, mp1, q, IP_WPUT);
5209 }
5210 
5211 static void
5212 icmp_wput_other(queue_t *q, mblk_t *mp)
5213 {
5214 	uchar_t	*rptr = mp->b_rptr;
5215 	struct iocblk *iocp;
5216 #define	tudr ((struct T_unitdata_req *)rptr)
5217 	conn_t	*connp = Q_TO_CONN(q);
5218 	icmp_t	*icmp = connp->conn_icmp;
5219 	icmp_stack_t *is = icmp->icmp_is;
5220 	cred_t *cr;
5221 
5222 	cr = DB_CREDDEF(mp, connp->conn_cred);
5223 
5224 	switch (mp->b_datap->db_type) {
5225 	case M_PROTO:
5226 	case M_PCPROTO:
5227 		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
5228 			/*
5229 			 * If the message does not contain a PRIM_type,
5230 			 * throw it away.
5231 			 */
5232 			freemsg(mp);
5233 			return;
5234 		}
5235 		switch (((union T_primitives *)rptr)->type) {
5236 		case T_ADDR_REQ:
5237 			icmp_addr_req(q, mp);
5238 			return;
5239 		case O_T_BIND_REQ:
5240 		case T_BIND_REQ:
5241 			icmp_bind(q, mp);
5242 			return;
5243 		case T_CONN_REQ:
5244 			icmp_connect(q, mp);
5245 			return;
5246 		case T_CAPABILITY_REQ:
5247 			icmp_capability_req(q, mp);
5248 			return;
5249 		case T_INFO_REQ:
5250 			icmp_info_req(q, mp);
5251 			return;
5252 		case T_UNITDATA_REQ:
5253 			/*
5254 			 * If a T_UNITDATA_REQ gets here, the address must
5255 			 * be bad.  Valid T_UNITDATA_REQs are found above
5256 			 * and break to below this switch.
5257 			 */
5258 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
5259 			return;
5260 		case T_UNBIND_REQ:
5261 			icmp_unbind(q, mp);
5262 			return;
5263 
5264 		case T_SVR4_OPTMGMT_REQ:
5265 			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
5266 			    cr)) {
5267 				/* Only IP can return anything meaningful */
5268 				(void) svr4_optcom_req(q, mp, cr,
5269 				    &icmp_opt_obj, B_TRUE);
5270 			}
5271 			return;
5272 
5273 		case T_OPTMGMT_REQ:
5274 			/* Only IP can return anything meaningful */
5275 			(void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE);
5276 			return;
5277 
5278 		case T_DISCON_REQ:
5279 			icmp_disconnect(q, mp);
5280 			return;
5281 
5282 		/* The following TPI message is not supported by icmp. */
5283 		case O_T_CONN_RES:
5284 		case T_CONN_RES:
5285 			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
5286 			return;
5287 
5288 		/* The following 3 TPI requests are illegal for icmp. */
5289 		case T_DATA_REQ:
5290 		case T_EXDATA_REQ:
5291 		case T_ORDREL_REQ:
5292 			freemsg(mp);
5293 			(void) putctl1(RD(q), M_ERROR, EPROTO);
5294 			return;
5295 		default:
5296 			break;
5297 		}
5298 		break;
5299 	case M_IOCTL:
5300 		iocp = (struct iocblk *)mp->b_rptr;
5301 		switch (iocp->ioc_cmd) {
5302 		case TI_GETPEERNAME:
5303 			if (icmp->icmp_state != TS_DATA_XFER) {
5304 				/*
5305 				 * If a default destination address has not
5306 				 * been associated with the stream, then we
5307 				 * don't know the peer's name.
5308 				 */
5309 				iocp->ioc_error = ENOTCONN;
5310 		err_ret:;
5311 				iocp->ioc_count = 0;
5312 				mp->b_datap->db_type = M_IOCACK;
5313 				qreply(q, mp);
5314 				return;
5315 			}
5316 			/* FALLTHRU */
5317 		case TI_GETMYNAME:
5318 			/*
5319 			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
5320 			 * need to copyin the user's strbuf structure.
5321 			 * Processing will continue in the M_IOCDATA case
5322 			 * below.
5323 			 */
5324 			mi_copyin(q, mp, NULL,
5325 			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
5326 			return;
5327 		case ND_SET:
5328 			/* nd_getset performs the necessary error checking */
5329 		case ND_GET:
5330 			if (nd_getset(q, is->is_nd, mp)) {
5331 				qreply(q, mp);
5332 				return;
5333 			}
5334 			break;
5335 		default:
5336 			break;
5337 		}
5338 		break;
5339 	case M_IOCDATA:
5340 		icmp_wput_iocdata(q, mp);
5341 		return;
5342 	default:
5343 		break;
5344 	}
5345 	ip_wput(q, mp);
5346 }
5347 
5348 /*
5349  * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA
5350  * messages.
5351  */
5352 static void
5353 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
5354 {
5355 	mblk_t	*mp1;
5356 	STRUCT_HANDLE(strbuf, sb);
5357 	icmp_t	*icmp;
5358 	in6_addr_t	v6addr;
5359 	ipaddr_t	v4addr;
5360 	uint32_t	flowinfo = 0;
5361 	int		addrlen;
5362 
5363 	/* Make sure it is one of ours. */
5364 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5365 	case TI_GETMYNAME:
5366 	case TI_GETPEERNAME:
5367 		break;
5368 	default:
5369 		icmp = Q_TO_ICMP(q);
5370 		ip_output(icmp->icmp_connp, mp, q, IP_WPUT);
5371 		return;
5372 	}
5373 	switch (mi_copy_state(q, mp, &mp1)) {
5374 	case -1:
5375 		return;
5376 	case MI_COPY_CASE(MI_COPY_IN, 1):
5377 		break;
5378 	case MI_COPY_CASE(MI_COPY_OUT, 1):
5379 		/*
5380 		 * The address has been copied out, so now
5381 		 * copyout the strbuf.
5382 		 */
5383 		mi_copyout(q, mp);
5384 		return;
5385 	case MI_COPY_CASE(MI_COPY_OUT, 2):
5386 		/*
5387 		 * The address and strbuf have been copied out.
5388 		 * We're done, so just acknowledge the original
5389 		 * M_IOCTL.
5390 		 */
5391 		mi_copy_done(q, mp, 0);
5392 		return;
5393 	default:
5394 		/*
5395 		 * Something strange has happened, so acknowledge
5396 		 * the original M_IOCTL with an EPROTO error.
5397 		 */
5398 		mi_copy_done(q, mp, EPROTO);
5399 		return;
5400 	}
5401 	/*
5402 	 * Now we have the strbuf structure for TI_GETMYNAME
5403 	 * and TI_GETPEERNAME.  Next we copyout the requested
5404 	 * address and then we'll copyout the strbuf.
5405 	 */
5406 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5407 	    (void *)mp1->b_rptr);
5408 	icmp = Q_TO_ICMP(q);
5409 	if (icmp->icmp_family == AF_INET)
5410 		addrlen = sizeof (sin_t);
5411 	else
5412 		addrlen = sizeof (sin6_t);
5413 
5414 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
5415 		mi_copy_done(q, mp, EINVAL);
5416 		return;
5417 	}
5418 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5419 	case TI_GETMYNAME:
5420 		if (icmp->icmp_family == AF_INET) {
5421 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
5422 			if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
5423 			    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
5424 				v4addr = V4_PART_OF_V6(icmp->icmp_v6src);
5425 			} else {
5426 				/*
5427 				 * INADDR_ANY
5428 				 * icmp_v6src is not set, we might be bound to
5429 				 * broadcast/multicast. Use icmp_bound_v6src as
5430 				 * local address instead (that could
5431 				 * also still be INADDR_ANY)
5432 				 */
5433 				v4addr = V4_PART_OF_V6(icmp->icmp_bound_v6src);
5434 			}
5435 		} else {
5436 			/* icmp->icmp_family == AF_INET6 */
5437 			if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
5438 				v6addr = icmp->icmp_v6src;
5439 			} else {
5440 				/*
5441 				 * UNSPECIFIED
5442 				 * icmp_v6src is not set, we might be bound to
5443 				 * broadcast/multicast. Use icmp_bound_v6src as
5444 				 * local address instead (that could
5445 				 * also still be UNSPECIFIED)
5446 				 */
5447 				v6addr = icmp->icmp_bound_v6src;
5448 			}
5449 		}
5450 		break;
5451 	case TI_GETPEERNAME:
5452 		if (icmp->icmp_family == AF_INET) {
5453 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
5454 			v4addr = V4_PART_OF_V6(icmp->icmp_v6dst);
5455 		} else {
5456 			/* icmp->icmp_family == AF_INET6) */
5457 			v6addr = icmp->icmp_v6dst;
5458 			flowinfo = icmp->icmp_flowinfo;
5459 		}
5460 		break;
5461 	default:
5462 		mi_copy_done(q, mp, EPROTO);
5463 		return;
5464 	}
5465 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
5466 	if (!mp1)
5467 		return;
5468 
5469 	if (icmp->icmp_family == AF_INET) {
5470 		sin_t *sin;
5471 
5472 		STRUCT_FSET(sb, len, (int)sizeof (sin_t));
5473 		sin = (sin_t *)mp1->b_rptr;
5474 		mp1->b_wptr = (uchar_t *)&sin[1];
5475 		*sin = sin_null;
5476 		sin->sin_family = AF_INET;
5477 		sin->sin_addr.s_addr = v4addr;
5478 	} else {
5479 		/* icmp->icmp_family == AF_INET6 */
5480 		sin6_t *sin6;
5481 
5482 		ASSERT(icmp->icmp_family == AF_INET6);
5483 		STRUCT_FSET(sb, len, (int)sizeof (sin6_t));
5484 		sin6 = (sin6_t *)mp1->b_rptr;
5485 		mp1->b_wptr = (uchar_t *)&sin6[1];
5486 		*sin6 = sin6_null;
5487 		sin6->sin6_family = AF_INET6;
5488 		sin6->sin6_flowinfo = flowinfo;
5489 		sin6->sin6_addr = v6addr;
5490 	}
5491 	/* Copy out the address */
5492 	mi_copyout(q, mp);
5493 }
5494 
5495 static int
5496 icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
5497     void *thisdg_attrs)
5498 {
5499 	conn_t	*connp = Q_TO_CONN(q);
5500 	struct T_unitdata_req *udreqp;
5501 	int is_absreq_failure;
5502 	cred_t *cr;
5503 
5504 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
5505 	*errorp = 0;
5506 
5507 	cr = DB_CREDDEF(mp, connp->conn_cred);
5508 
5509 	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
5510 	    udreqp->OPT_offset, cr, &icmp_opt_obj,
5511 	    thisdg_attrs, &is_absreq_failure);
5512 
5513 	if (*errorp != 0) {
5514 		/*
5515 		 * Note: No special action needed in this
5516 		 * module for "is_absreq_failure"
5517 		 */
5518 		return (-1);		/* failure */
5519 	}
5520 	ASSERT(is_absreq_failure == 0);
5521 	return (0);	/* success */
5522 }
5523 
5524 void
5525 icmp_ddi_init(void)
5526 {
5527 	icmp_max_optsize =
5528 	    optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5529 	    icmp_opt_obj.odb_opt_arr_cnt);
5530 
5531 	/*
5532 	 * We want to be informed each time a stack is created or
5533 	 * destroyed in the kernel, so we can maintain the
5534 	 * set of icmp_stack_t's.
5535 	 */
5536 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5537 }
5538 
5539 void
5540 icmp_ddi_destroy(void)
5541 {
5542 	netstack_unregister(NS_ICMP);
5543 }
5544 
5545 /*
5546  * Initialize the ICMP stack instance.
5547  */
5548 static void *
5549 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5550 {
5551 	icmp_stack_t	*is;
5552 	icmpparam_t	*pa;
5553 
5554 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5555 	is->is_netstack = ns;
5556 
5557 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
5558 	is->is_param_arr = pa;
5559 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
5560 
5561 	(void) icmp_param_register(&is->is_nd,
5562 	    is->is_param_arr, A_CNT(icmp_param_arr));
5563 	is->is_ksp = rawip_kstat_init(stackid);
5564 	return (is);
5565 }
5566 
5567 /*
5568  * Free the ICMP stack instance.
5569  */
5570 static void
5571 rawip_stack_fini(netstackid_t stackid, void *arg)
5572 {
5573 	icmp_stack_t *is = (icmp_stack_t *)arg;
5574 
5575 	nd_free(&is->is_nd);
5576 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
5577 	is->is_param_arr = NULL;
5578 
5579 	rawip_kstat_fini(stackid, is->is_ksp);
5580 	is->is_ksp = NULL;
5581 	kmem_free(is, sizeof (*is));
5582 }
5583 
5584 static void *
5585 rawip_kstat_init(netstackid_t stackid) {
5586 	kstat_t	*ksp;
5587 
5588 	rawip_named_kstat_t template = {
5589 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5590 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5591 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5592 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5593 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5594 	};
5595 
5596 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5597 					KSTAT_TYPE_NAMED,
5598 					NUM_OF_FIELDS(rawip_named_kstat_t),
5599 					0, stackid);
5600 	if (ksp == NULL || ksp->ks_data == NULL)
5601 		return (NULL);
5602 
5603 	bcopy(&template, ksp->ks_data, sizeof (template));
5604 	ksp->ks_update = rawip_kstat_update;
5605 	ksp->ks_private = (void *)(uintptr_t)stackid;
5606 
5607 	kstat_install(ksp);
5608 	return (ksp);
5609 }
5610 
5611 static void
5612 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5613 {
5614 	if (ksp != NULL) {
5615 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5616 		kstat_delete_netstack(ksp, stackid);
5617 	}
5618 }
5619 
5620 static int
5621 rawip_kstat_update(kstat_t *ksp, int rw)
5622 {
5623 	rawip_named_kstat_t *rawipkp;
5624 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5625 	netstack_t	*ns;
5626 	icmp_stack_t	*is;
5627 
5628 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5629 		return (EIO);
5630 
5631 	if (rw == KSTAT_WRITE)
5632 		return (EACCES);
5633 
5634 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5635 
5636 	ns = netstack_find_by_stackid(stackid);
5637 	if (ns == NULL)
5638 		return (-1);
5639 	is = ns->netstack_icmp;
5640 	if (is == NULL) {
5641 		netstack_rele(ns);
5642 		return (-1);
5643 	}
5644 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5645 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5646 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5647 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5648 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5649 	netstack_rele(ns);
5650 	return (0);
5651 }
5652