xref: /illumos-gate/usr/src/uts/common/inet/ip/icmp.c (revision c85864d8472aaccb47ceb468ebd9b3a85b66d161)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/stropts.h>
30 #include <sys/strlog.h>
31 #include <sys/strsun.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/timod.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/kmem.h>
41 #include <sys/policy.h>
42 #include <sys/priv.h>
43 #include <sys/zone.h>
44 #include <sys/time.h>
45 
46 #include <sys/sockio.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/isa_defs.h>
50 #include <sys/suntpi.h>
51 #include <sys/xti_inet.h>
52 #include <sys/netstack.h>
53 
54 #include <net/route.h>
55 #include <net/if.h>
56 
57 #include <netinet/in.h>
58 #include <netinet/ip6.h>
59 #include <netinet/icmp6.h>
60 #include <inet/common.h>
61 #include <inet/ip.h>
62 #include <inet/ip6.h>
63 #include <inet/proto_set.h>
64 #include <inet/nd.h>
65 #include <inet/optcom.h>
66 #include <inet/snmpcom.h>
67 #include <inet/kstatcom.h>
68 #include <inet/rawip_impl.h>
69 
70 #include <netinet/ip_mroute.h>
71 #include <inet/tcp.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 #include <inet/ipclassifier.h>
75 
76 #include <sys/tsol/label.h>
77 #include <sys/tsol/tnet.h>
78 
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 
82 #include <inet/ip_impl.h>
83 #include <sys/disp.h>
84 
85 /*
86  * Synchronization notes:
87  *
 * RAWIP is MT and uses the usual kernel synchronization primitives. There is
 * one lock, icmp_rwlock. We also use conn_lock when updating things
90  * which affect the IP classifier lookup.
91  * The lock order is icmp_rwlock -> conn_lock.
92  *
93  * The icmp_rwlock:
94  * This protects most of the other fields in the icmp_t. The exact list of
95  * fields which are protected by each of the above locks is documented in
96  * the icmp_t structure definition.
97  *
98  * Plumbing notes:
99  * ICMP is always a device driver. For compatibility with mibopen() code
100  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
101  * dummy module.
102  */
103 
104 static void	icmp_addr_req(queue_t *q, mblk_t *mp);
105 static void	icmp_tpi_bind(queue_t *q, mblk_t *mp);
106 static int	icmp_bind_proto(conn_t *connp);
107 static int	icmp_build_hdrs(icmp_t *icmp);
108 static void	icmp_capability_req(queue_t *q, mblk_t *mp);
109 static int	icmp_close(queue_t *q, int flags);
110 static void	icmp_tpi_connect(queue_t *q, mblk_t *mp);
111 static void	icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
112 static void	icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
113 		    int sys_error);
114 static void	icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
115 		    t_scalar_t t_error, int sys_error);
116 static void	icmp_icmp_error(conn_t *connp, mblk_t *mp);
117 static void	icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp);
118 static void	icmp_info_req(queue_t *q, mblk_t *mp);
119 static void	icmp_input(void *, mblk_t *, void *);
120 static conn_t 	*icmp_open(int family, cred_t *credp, int *err, int flags);
121 static int	icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
122 		    cred_t *credp);
123 static int	icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
124 		    cred_t *credp);
125 static int	icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
126 		    int *errorp, void *thisdg_attrs);
127 static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
128 int		icmp_opt_set(conn_t *connp, uint_t optset_context,
129 		    int level, int name, uint_t inlen,
130 		    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
131 		    void *thisdg_attrs, cred_t *cr);
132 int		icmp_opt_get(conn_t *connp, int level, int name,
133 		    uchar_t *ptr);
134 static int	icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
135 static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
136 static int	icmp_param_set(queue_t *q, mblk_t *mp, char *value,
137 		    caddr_t cp, cred_t *cr);
138 static int	icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
139 		    uchar_t *ptr, int len);
140 static void	icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
141 static void	icmp_tpi_unbind(queue_t *q, mblk_t *mp);
142 static int	icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst);
143 static void	icmp_wput(queue_t *q, mblk_t *mp);
144 static void	icmp_wput_fallback(queue_t *q, mblk_t *mp);
145 static int	raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp,
146 		    sin6_t *sin6, ip6_pkt_t *ipp);
147 static int	raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp,
148 		    ipaddr_t v4dst, ip4_pkt_t *pktinfop);
149 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
150 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
151 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
152 static void	icmp_ulp_recv(conn_t *, mblk_t *);
153 
154 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
155 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
156 
157 static void	*rawip_kstat_init(netstackid_t stackid);
158 static void	rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
159 static int	rawip_kstat_update(kstat_t *kp, int rw);
160 static void	rawip_stack_shutdown(netstackid_t stackid, void *arg);
161 static int	rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa,
162 		    uint_t *salenp);
163 static int	rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa,
164 		    uint_t *salenp);
165 
166 int		rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
167 		    socklen_t *, cred_t *);
168 int		rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
169 		    socklen_t *, cred_t *);
170 
/*
 * STREAMS module_info shared by every qinit table in this file (the read
 * and write sides of both /dev/icmp and /dev/icmp6, and the fallback
 * write side).
 * NOTE(review): going by the conventional module_info member order these
 * values are module id 5707, name "icmp", minimum packet size 1, maximum
 * packet size INFPSZ, high-water mark 512 and low-water mark 128 -
 * confirm against <sys/stream.h>.
 */
static struct module_info icmp_mod_info =  {
	5707, "icmp", 1, INFPSZ, 512, 128
};
174 
/*
 * Entry points for ICMP as a device.
 * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 * The two read-side tables differ only in that open routine; both use
 * icmp_close and neither supplies a put or service routine.
 */
static struct qinit icmprinitv4 = {
	NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

static struct qinit icmprinitv6 = {
	NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

/* Write side, shared by both devices: messages are handled by icmp_wput. */
static struct qinit icmpwinit = {
	(pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/*
 * ICMP entry point during fallback.
 * NOTE(review): nothing in this part of the file installs this table;
 * presumably the socket fallback code swaps it in - confirm.
 */
static struct qinit icmp_fallback_sock_winit = {
	(pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp */
struct streamtab icmpinfov4 = {
	&icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6 */
struct streamtab icmpinfov6 = {
	&icmprinitv6, &icmpwinit
};
205 
/*
 * All-zero socket addresses (static storage, never written in the code
 * visible here).  They are assigned wholesale to reset a sin_t/sin6_t and
 * supply the wildcard address passed to IP by icmp_bind_proto().
 */
static sin_t	sin_null;	/* Zero address for quick clears */
static sin6_t	sin6_null;	/* Zero address for quick clears */
208 
/*
 * Default structure copied into T_INFO_ACK messages.
 * The fields marked "filled in later" / "not initialized here" / "set from
 * icmp_state" are placeholders.
 * NOTE(review): presumably whoever copies this template patches them per
 * stream - that code is not in this part of the file; confirm.
 */
static struct T_info_ack icmp_g_t_info_ack = {
	T_INFO_ACK,
	IP_MAXPACKET,	 /* TSDU_size.  icmp allows maximum size messages. */
	T_INVALID,	/* ETSDU_size.  icmp does not support expedited data. */
	T_INVALID,	/* CDATA_size. icmp does not support connect data. */
	T_INVALID,	/* DDATA_size. icmp does not support disconnect data. */
	0,		/* ADDR_size - filled in later. */
	0,		/* OPT_size - not initialized here */
	IP_MAXPACKET,	/* TIDU_size.  icmp allows maximum size messages. */
	T_CLTS,		/* SERV_type.  icmp supports connection-less. */
	TS_UNBND,	/* CURRENT_state.  This is set from icmp_state. */
	(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
223 
/*
 * Table of ND variables supported by icmp.  These are loaded into is_nd
 * when the stack instance is created.
 * All of these are alterable, within the min/max values given, at run time.
 *
 * The is_* accessor macros below select entries by position, so the order
 * of the rows here and the indices in the macros must be kept in step.
 * Note that the macros index is_param_arr rather than this array.
 * NOTE(review): is_param_arr is presumably the per-stack copy of this
 * table - confirm against the stack structure definition.
 */
static icmpparam_t	icmp_param_arr[] = {
	/* min	max	value	name */
	{ 0,	128,	32,	"icmp_wroff_extra" },
	{ 1,	255,	255,	"icmp_ipv4_ttl" },
	{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS,	"icmp_ipv6_hoplimit"},
	{ 0,	1,	1,	"icmp_bsd_compat" },
	{ 4096,	65536,	8192,	"icmp_xmit_hiwat"},
	{ 0,	65536,	1024,	"icmp_xmit_lowat"},
	{ 4096,	65536,	8192,	"icmp_recv_hiwat"},
	{ 65536, 1024*1024*1024, 256*1024,	"icmp_max_buf"},
};
/* Positional accessors; index N corresponds to row N of the table above. */
#define	is_wroff_extra			is_param_arr[0].icmp_param_value
#define	is_ipv4_ttl			is_param_arr[1].icmp_param_value
#define	is_ipv6_hoplimit		is_param_arr[2].icmp_param_value
#define	is_bsd_compat			is_param_arr[3].icmp_param_value
#define	is_xmit_hiwat			is_param_arr[4].icmp_param_value
#define	is_xmit_lowat			is_param_arr[5].icmp_param_value
#define	is_recv_hiwat			is_param_arr[6].icmp_param_value
#define	is_max_buf			is_param_arr[7].icmp_param_value
248 
249 static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len);
250 static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa,
251     socklen_t len, cred_t *cr);
252 static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error);
253 
254 /*
255  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
256  * passed to icmp_wput.
257  * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
258  * protocol type placed in the message following the address. A T_BIND_ACK
259  * message is returned by ip_bind_v4/v6.
260  */
261 static void
262 icmp_tpi_bind(queue_t *q, mblk_t *mp)
263 {
264 	int	error;
265 	struct sockaddr *sa;
266 	struct T_bind_req *tbr;
267 	socklen_t	len;
268 	sin_t	*sin;
269 	sin6_t	*sin6;
270 	icmp_t		*icmp;
271 	conn_t	*connp = Q_TO_CONN(q);
272 	mblk_t *mp1;
273 	cred_t *cr;
274 
275 	/*
276 	 * All Solaris components should pass a db_credp
277 	 * for this TPI message, hence we ASSERT.
278 	 * But in case there is some other M_PROTO that looks
279 	 * like a TPI message sent by some other kernel
280 	 * component, we check and return an error.
281 	 */
282 	cr = msg_getcred(mp, NULL);
283 	ASSERT(cr != NULL);
284 	if (cr == NULL) {
285 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
286 		return;
287 	}
288 
289 	icmp = connp->conn_icmp;
290 	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
291 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
292 		    "icmp_bind: bad req, len %u",
293 		    (uint_t)(mp->b_wptr - mp->b_rptr));
294 		icmp_err_ack(q, mp, TPROTO, 0);
295 		return;
296 	}
297 
298 	if (icmp->icmp_state != TS_UNBND) {
299 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
300 		    "icmp_bind: bad state, %d", icmp->icmp_state);
301 		icmp_err_ack(q, mp, TOUTSTATE, 0);
302 		return;
303 	}
304 
305 	/*
306 	 * Reallocate the message to make sure we have enough room for an
307 	 * address and the protocol type.
308 	 */
309 	mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
310 	if (!mp1) {
311 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
312 		return;
313 	}
314 	mp = mp1;
315 
316 	/* Reset the message type in preparation for shipping it back. */
317 	DB_TYPE(mp) = M_PCPROTO;
318 	tbr = (struct T_bind_req *)mp->b_rptr;
319 	len = tbr->ADDR_length;
320 	switch (len) {
321 	case 0:	/* request for a generic port */
322 		tbr->ADDR_offset = sizeof (struct T_bind_req);
323 		if (icmp->icmp_family == AF_INET) {
324 			tbr->ADDR_length = sizeof (sin_t);
325 			sin = (sin_t *)&tbr[1];
326 			*sin = sin_null;
327 			sin->sin_family = AF_INET;
328 			mp->b_wptr = (uchar_t *)&sin[1];
329 			sa = (struct sockaddr *)sin;
330 			len = sizeof (sin_t);
331 		} else {
332 			ASSERT(icmp->icmp_family == AF_INET6);
333 			tbr->ADDR_length = sizeof (sin6_t);
334 			sin6 = (sin6_t *)&tbr[1];
335 			*sin6 = sin6_null;
336 			sin6->sin6_family = AF_INET6;
337 			mp->b_wptr = (uchar_t *)&sin6[1];
338 			sa = (struct sockaddr *)sin6;
339 			len = sizeof (sin6_t);
340 		}
341 		break;
342 
343 	case sizeof (sin_t):	/* Complete IPv4 address */
344 		sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
345 		    sizeof (sin_t));
346 		break;
347 
348 	case sizeof (sin6_t):	/* Complete IPv6 address */
349 		sa = (struct sockaddr *)mi_offset_param(mp,
350 		    tbr->ADDR_offset, sizeof (sin6_t));
351 		break;
352 
353 	default:
354 		(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
355 		    "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
356 		icmp_err_ack(q, mp, TBADADDR, 0);
357 		return;
358 	}
359 
360 	error = rawip_do_bind(connp, sa, len);
361 done:
362 	ASSERT(mp->b_cont == NULL);
363 	if (error != 0) {
364 		if (error > 0) {
365 			icmp_err_ack(q, mp, TSYSERR, error);
366 		} else {
367 			icmp_err_ack(q, mp, -error, 0);
368 		}
369 	} else {
370 		tbr->PRIM_type = T_BIND_ACK;
371 		qreply(q, mp);
372 	}
373 }
374 
375 static int
376 rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
377 {
378 	sin_t		*sin;
379 	sin6_t		*sin6;
380 	icmp_t		*icmp;
381 	int		error = 0;
382 	mblk_t		*ire_mp;
383 
384 
385 	icmp = connp->conn_icmp;
386 
387 	if (sa == NULL || !OK_32PTR((char *)sa)) {
388 		return (EINVAL);
389 	}
390 
391 	/*
392 	 * The state must be TS_UNBND. TPI mandates that users must send
393 	 * TPI primitives only 1 at a time and wait for the response before
394 	 * sending the next primitive.
395 	 */
396 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
397 	if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
398 		error = -TOUTSTATE;
399 		goto done;
400 	}
401 
402 	ASSERT(len != 0);
403 	switch (len) {
404 	case sizeof (sin_t):    /* Complete IPv4 address */
405 		sin = (sin_t *)sa;
406 		if (sin->sin_family != AF_INET ||
407 		    icmp->icmp_family != AF_INET) {
408 			/* TSYSERR, EAFNOSUPPORT */
409 			error = EAFNOSUPPORT;
410 			goto done;
411 		}
412 		break;
413 	case sizeof (sin6_t): /* Complete IPv6 address */
414 		sin6 = (sin6_t *)sa;
415 		if (sin6->sin6_family != AF_INET6 ||
416 		    icmp->icmp_family != AF_INET6) {
417 			/* TSYSERR, EAFNOSUPPORT */
418 			error = EAFNOSUPPORT;
419 			goto done;
420 		}
421 		/* No support for mapped addresses on raw sockets */
422 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
423 			/* TSYSERR, EADDRNOTAVAIL */
424 			error = EADDRNOTAVAIL;
425 			goto done;
426 		}
427 		break;
428 
429 	default:
430 		/* TBADADDR */
431 		error = EADDRNOTAVAIL;
432 		goto done;
433 	}
434 
435 	icmp->icmp_pending_op = T_BIND_REQ;
436 	icmp->icmp_state = TS_IDLE;
437 
438 	/*
439 	 * Copy the source address into our icmp structure.  This address
440 	 * may still be zero; if so, ip will fill in the correct address
441 	 * each time an outbound packet is passed to it.
442 	 * If we are binding to a broadcast or multicast address then
443 	 * rawip_post_ip_bind_connect will clear the source address.
444 	 */
445 
446 	if (icmp->icmp_family == AF_INET) {
447 		ASSERT(sin != NULL);
448 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
449 		IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
450 		    &icmp->icmp_v6src);
451 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
452 		    icmp->icmp_ip_snd_options_len;
453 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
454 	} else {
455 		int error;
456 
457 		ASSERT(sin6 != NULL);
458 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
459 		icmp->icmp_v6src = sin6->sin6_addr;
460 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
461 		icmp->icmp_bound_v6src = icmp->icmp_v6src;
462 
463 		/* Rebuild the header template */
464 		error = icmp_build_hdrs(icmp);
465 		if (error != 0) {
466 			icmp->icmp_pending_op = -1;
467 			/*
468 			 * TSYSERR
469 			 */
470 			goto done;
471 		}
472 	}
473 
474 	ire_mp = NULL;
475 	if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
476 		/*
477 		 * request an IRE if src not 0 (INADDR_ANY)
478 		 */
479 		ire_mp = allocb(sizeof (ire_t), BPRI_HI);
480 		if (ire_mp == NULL) {
481 			icmp->icmp_pending_op = -1;
482 			error = ENOMEM;
483 			goto done;
484 		}
485 		DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
486 	}
487 done:
488 	rw_exit(&icmp->icmp_rwlock);
489 	if (error != 0)
490 		return (error);
491 
492 	if (icmp->icmp_family == AF_INET6) {
493 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
494 		    &sin6->sin6_addr, sin6->sin6_port, B_TRUE);
495 	} else {
496 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
497 		    sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
498 	}
499 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
500 	return (error);
501 }
502 
503 static void
504 rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error)
505 {
506 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
507 	if (icmp->icmp_state == TS_UNBND) {
508 		/*
509 		 * not yet bound - bind sent by icmp_bind_proto.
510 		 */
511 		rw_exit(&icmp->icmp_rwlock);
512 		return;
513 	}
514 	ASSERT(icmp->icmp_pending_op != -1);
515 	icmp->icmp_pending_op = -1;
516 
517 	if (error != 0) {
518 		if (icmp->icmp_state == TS_DATA_XFER) {
519 			/* Connect failed */
520 			/* Revert back to the bound source */
521 			icmp->icmp_v6src = icmp->icmp_bound_v6src;
522 			icmp->icmp_state = TS_IDLE;
523 			if (icmp->icmp_family == AF_INET6)
524 				(void) icmp_build_hdrs(icmp);
525 		} else {
526 			V6_SET_ZERO(icmp->icmp_v6src);
527 			V6_SET_ZERO(icmp->icmp_bound_v6src);
528 			icmp->icmp_state = TS_UNBND;
529 			if (icmp->icmp_family == AF_INET6)
530 				(void) icmp_build_hdrs(icmp);
531 		}
532 	} else {
533 		if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
534 			ire_t *ire;
535 
536 			ire = (ire_t *)ire_mp->b_rptr;
537 			/*
538 			 * If a broadcast/multicast address was bound set
539 			 * the source address to 0.
540 			 * This ensures no datagrams with broadcast address
541 			 * as source address are emitted (which would violate
542 			 * RFC1122 - Hosts requirements)
543 			 * Note: we get IRE_BROADCAST for IPv6
544 			 * to "mark" a multicast local address.
545 			 */
546 
547 
548 			if (ire->ire_type == IRE_BROADCAST &&
549 			    icmp->icmp_state != TS_DATA_XFER) {
550 				/*
551 				 * This was just a local bind to a
552 				 * MC/broadcast addr
553 				 */
554 				V6_SET_ZERO(icmp->icmp_v6src);
555 				if (icmp->icmp_family == AF_INET6)
556 					(void) icmp_build_hdrs(icmp);
557 			}
558 		}
559 
560 	}
561 	rw_exit(&icmp->icmp_rwlock);
562 	if (ire_mp != NULL)
563 		freeb(ire_mp);
564 }
565 
566 /*
567  * Send message to IP to just bind to the protocol.
568  */
569 static int
570 icmp_bind_proto(conn_t *connp)
571 {
572 	icmp_t	*icmp;
573 	int	error;
574 
575 	icmp = connp->conn_icmp;
576 
577 	if (icmp->icmp_family == AF_INET6)
578 		error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto,
579 		    &sin6_null.sin6_addr, 0, B_TRUE);
580 	else
581 		error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto,
582 		    sin_null.sin_addr.s_addr, 0, B_TRUE);
583 
584 	rawip_post_ip_bind_connect(icmp, NULL, error);
585 	return (error);
586 }
587 
588 static void
589 icmp_tpi_connect(queue_t *q, mblk_t *mp)
590 {
591 	conn_t	*connp = Q_TO_CONN(q);
592 	struct T_conn_req	*tcr;
593 	icmp_t	*icmp;
594 	struct sockaddr *sa;
595 	socklen_t len;
596 	int error;
597 	cred_t *cr;
598 
599 	/*
600 	 * All Solaris components should pass a db_credp
601 	 * for this TPI message, hence we ASSERT.
602 	 * But in case there is some other M_PROTO that looks
603 	 * like a TPI message sent by some other kernel
604 	 * component, we check and return an error.
605 	 */
606 	cr = msg_getcred(mp, NULL);
607 	ASSERT(cr != NULL);
608 	if (cr == NULL) {
609 		icmp_err_ack(q, mp, TSYSERR, EINVAL);
610 		return;
611 	}
612 
613 	icmp = connp->conn_icmp;
614 	tcr = (struct T_conn_req *)mp->b_rptr;
615 	/* Sanity checks */
616 	if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
617 		icmp_err_ack(q, mp, TPROTO, 0);
618 		return;
619 	}
620 
621 	if (tcr->OPT_length != 0) {
622 		icmp_err_ack(q, mp, TBADOPT, 0);
623 		return;
624 	}
625 
626 	len = tcr->DEST_length;
627 
628 	switch (len) {
629 	default:
630 		icmp_err_ack(q, mp, TBADADDR, 0);
631 		return;
632 	case sizeof (sin_t):
633 		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
634 		    sizeof (sin_t));
635 		break;
636 	case sizeof (sin6_t):
637 		sa = (struct sockaddr *)mi_offset_param(mp,
638 		    tcr->DEST_offset, sizeof (sin6_t));
639 		break;
640 	}
641 
642 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
643 	if (error != 0) {
644 		icmp_err_ack(q, mp, TSYSERR, error);
645 		return;
646 	}
647 
648 	error = rawip_do_connect(connp, sa, len, cr);
649 	if (error != 0) {
650 		if (error < 0) {
651 			icmp_err_ack(q, mp, -error, 0);
652 		} else {
653 			icmp_err_ack(q, mp, 0, error);
654 		}
655 	} else {
656 		mblk_t *mp1;
657 
658 		/*
659 		 * We have to send a connection confirmation to
660 		 * keep TLI happy.
661 		 */
662 		if (icmp->icmp_family == AF_INET) {
663 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
664 			    sizeof (sin_t), NULL, 0);
665 		} else {
666 			ASSERT(icmp->icmp_family == AF_INET6);
667 			mp1 = mi_tpi_conn_con(NULL, (char *)sa,
668 			    sizeof (sin6_t), NULL, 0);
669 		}
670 		if (mp1 == NULL) {
671 			icmp_err_ack(q, mp, TSYSERR, ENOMEM);
672 			return;
673 		}
674 
675 		/*
676 		 * Send ok_ack for T_CONN_REQ
677 		 */
678 		mp = mi_tpi_ok_ack_alloc(mp);
679 		if (mp == NULL) {
680 			/* Unable to reuse the T_CONN_REQ for the ack. */
681 			icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
682 			return;
683 		}
684 		putnext(connp->conn_rq, mp);
685 		putnext(connp->conn_rq, mp1);
686 	}
687 }
688 
689 static int
690 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
691     cred_t *cr)
692 {
693 	icmp_t	*icmp;
694 	sin_t	*sin;
695 	sin6_t	*sin6;
696 	mblk_t  *ire_mp;
697 	int	error;
698 	ipaddr_t	v4dst;
699 	in6_addr_t	v6dst;
700 
701 	icmp = connp->conn_icmp;
702 
703 	if (sa == NULL || !OK_32PTR((char *)sa)) {
704 		return (EINVAL);
705 	}
706 
707 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
708 	if (ire_mp == NULL)
709 		return (ENOMEM);
710 	DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
711 
712 
713 	ASSERT(sa != NULL && len != 0);
714 
715 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
716 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
717 		rw_exit(&icmp->icmp_rwlock);
718 		freeb(ire_mp);
719 		return (-TOUTSTATE);
720 	}
721 
722 	switch (len) {
723 	case sizeof (sin_t):
724 		sin = (sin_t *)sa;
725 
726 		ASSERT(icmp->icmp_family == AF_INET);
727 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
728 
729 		v4dst = sin->sin_addr.s_addr;
730 		/*
731 		 * Interpret a zero destination to mean loopback.
732 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
733 		 * generate the T_CONN_CON.
734 		 */
735 		if (v4dst == INADDR_ANY) {
736 			v4dst = htonl(INADDR_LOOPBACK);
737 		}
738 
739 		IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
740 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
741 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
742 		    icmp->icmp_ip_snd_options_len;
743 		icmp->icmp_v6dst.sin6_addr = v6dst;
744 		icmp->icmp_v6dst.sin6_family = AF_INET6;
745 		icmp->icmp_v6dst.sin6_flowinfo = 0;
746 		icmp->icmp_v6dst.sin6_port = 0;
747 
748 		/*
749 		 * If the destination address is multicast and
750 		 * an outgoing multicast interface has been set,
751 		 * use the address of that interface as our
752 		 * source address if no source address has been set.
753 		 */
754 		if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
755 		    CLASSD(v4dst) &&
756 		    icmp->icmp_multicast_if_addr != INADDR_ANY) {
757 			IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
758 			    &icmp->icmp_v6src);
759 		}
760 		break;
761 	case sizeof (sin6_t):
762 		sin6 = (sin6_t *)sa;
763 
764 		/* No support for mapped addresses on raw sockets */
765 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
766 			rw_exit(&icmp->icmp_rwlock);
767 			freeb(ire_mp);
768 			return (EADDRNOTAVAIL);
769 		}
770 
771 		ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
772 		ASSERT(icmp->icmp_family == AF_INET6);
773 
774 		icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
775 
776 		icmp->icmp_v6dst = *sin6;
777 		icmp->icmp_v6dst.sin6_port = 0;
778 
779 		/*
780 		 * Interpret a zero destination to mean loopback.
781 		 * Update the T_CONN_REQ (sin/sin6) since it is used to
782 		 * generate the T_CONN_CON.
783 		 */
784 		if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) {
785 			icmp->icmp_v6dst.sin6_addr = ipv6_loopback;
786 		}
787 		/*
788 		 * If the destination address is multicast and
789 		 * an outgoing multicast interface has been set,
790 		 * then the ip bind logic will pick the correct source
791 		 * address (i.e. matching the outgoing multicast interface).
792 		 */
793 		break;
794 	}
795 
796 	icmp->icmp_pending_op = T_CONN_REQ;
797 
798 	if (icmp->icmp_state == TS_DATA_XFER) {
799 		/* Already connected - clear out state */
800 		icmp->icmp_v6src = icmp->icmp_bound_v6src;
801 		icmp->icmp_state = TS_IDLE;
802 	}
803 
804 	icmp->icmp_state = TS_DATA_XFER;
805 	rw_exit(&icmp->icmp_rwlock);
806 
807 	if (icmp->icmp_family == AF_INET6) {
808 		error = ip_proto_bind_connected_v6(connp, &ire_mp,
809 		    icmp->icmp_proto, &icmp->icmp_v6src, 0,
810 		    &icmp->icmp_v6dst.sin6_addr,
811 		    NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr);
812 	} else {
813 		error = ip_proto_bind_connected_v4(connp, &ire_mp,
814 		    icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0,
815 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port,
816 		    B_TRUE, B_TRUE, cr);
817 	}
818 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
819 	return (error);
820 }
821 
822 static void
823 icmp_close_free(conn_t *connp)
824 {
825 	icmp_t *icmp = connp->conn_icmp;
826 
827 	/* If there are any options associated with the stream, free them. */
828 	if (icmp->icmp_ip_snd_options != NULL) {
829 		mi_free((char *)icmp->icmp_ip_snd_options);
830 		icmp->icmp_ip_snd_options = NULL;
831 		icmp->icmp_ip_snd_options_len = 0;
832 	}
833 
834 	if (icmp->icmp_filter != NULL) {
835 		kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
836 		icmp->icmp_filter = NULL;
837 	}
838 
839 	/* Free memory associated with sticky options */
840 	if (icmp->icmp_sticky_hdrs_len != 0) {
841 		kmem_free(icmp->icmp_sticky_hdrs,
842 		    icmp->icmp_sticky_hdrs_len);
843 		icmp->icmp_sticky_hdrs = NULL;
844 		icmp->icmp_sticky_hdrs_len = 0;
845 	}
846 
847 	if (icmp->icmp_last_cred != NULL) {
848 		crfree(icmp->icmp_last_cred);
849 		icmp->icmp_last_cred = NULL;
850 	}
851 
852 	if (icmp->icmp_effective_cred != NULL) {
853 		crfree(icmp->icmp_effective_cred);
854 		icmp->icmp_effective_cred = NULL;
855 	}
856 
857 	ip6_pkt_free(&icmp->icmp_sticky_ipp);
858 
859 	/*
860 	 * Clear any fields which the kmem_cache constructor clears.
861 	 * Only icmp_connp needs to be preserved.
862 	 * TBD: We should make this more efficient to avoid clearing
863 	 * everything.
864 	 */
865 	ASSERT(icmp->icmp_connp == connp);
866 	bzero(icmp, sizeof (icmp_t));
867 	icmp->icmp_connp = connp;
868 }
869 
/*
 * Tear down a raw IP conn on close.  Shared by the STREAMS close path
 * (icmp_close) and, judging by the IPCL_IS_NONSTR() branches, by
 * non-STREAMS sockets as well.  Always returns 0.
 *
 * The order of the steps matters: the conn is quiesced first, then the
 * queue's procedures are turned off (STREAMS case only), and only then
 * is the per-connection state freed and the conn destroyed.
 */
static int
rawip_do_close(conn_t *connp)
{
	ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));

	ip_quiesce_conn(connp);

	/* Only a STREAMS conn has queue procedures to turn off. */
	if (!IPCL_IS_NONSTR(connp)) {
		qprocsoff(connp->conn_rq);
	}

	/* Nothing may still be queued from a socket fallback. */
	ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL &&
	    connp->conn_icmp->icmp_fallback_queue_tail == NULL);
	icmp_close_free(connp);

	/*
	 * Now we are truly single threaded on this stream, and can
	 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list, it cannot be
	 * accessed thru the fanouts, and we already waited for the
	 * conn_ref to drop to 0. We are already in close, so
	 * there cannot be any other thread from the top. qprocsoff
	 * has completed, and service has completed or won't run in
	 * future.
	 */
	ASSERT(connp->conn_ref == 1);

	/*
	 * A STREAMS conn gives back its minor number; a non-STREAMS
	 * conn releases its helper stream instead.
	 */
	if (!IPCL_IS_NONSTR(connp)) {
		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
	} else {
		ip_free_helper_stream(connp);
	}

	/* Drop the last reference (asserted above) and destroy the conn. */
	connp->conn_ref--;
	ipcl_conn_destroy(connp);

	return (0);
}
908 
909 static int
910 icmp_close(queue_t *q, int flags)
911 {
912 	conn_t  *connp;
913 
914 	if (flags & SO_FALLBACK) {
915 		/*
916 		 * stream is being closed while in fallback
917 		 * simply free the resources that were allocated
918 		 */
919 		inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
920 		qprocsoff(q);
921 		goto done;
922 	}
923 
924 	connp = Q_TO_CONN(q);
925 	(void) rawip_do_close(connp);
926 done:
927 	q->q_ptr = WR(q)->q_ptr = NULL;
928 	return (0);
929 }
930 
931 /*
932  * This routine handles each T_DISCON_REQ message passed to icmp
933  * as an indicating that ICMP is no longer connected. This results
934  * in sending a T_BIND_REQ to IP to restore the binding to just
935  * the local address.
936  *
937  * The disconnect completes in rawip_post_ip_bind_connect.
938  */
939 static int
940 icmp_do_disconnect(conn_t *connp)
941 {
942 	icmp_t	*icmp;
943 	mblk_t	*ire_mp;
944 	int error;
945 
946 	icmp = connp->conn_icmp;
947 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
948 	if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
949 		rw_exit(&icmp->icmp_rwlock);
950 		return (-TOUTSTATE);
951 	}
952 	icmp->icmp_pending_op = T_DISCON_REQ;
953 	icmp->icmp_v6src = icmp->icmp_bound_v6src;
954 	icmp->icmp_state = TS_IDLE;
955 
956 
957 	if (icmp->icmp_family == AF_INET6) {
958 		/* Rebuild the header template */
959 		error = icmp_build_hdrs(icmp);
960 		if (error != 0) {
961 			icmp->icmp_pending_op = -1;
962 			rw_exit(&icmp->icmp_rwlock);
963 			return (error);
964 		}
965 	}
966 
967 	rw_exit(&icmp->icmp_rwlock);
968 	ire_mp = allocb(sizeof (ire_t), BPRI_HI);
969 	if (ire_mp == NULL) {
970 		return (ENOMEM);
971 	}
972 
973 	if (icmp->icmp_family == AF_INET6) {
974 		error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
975 		    &icmp->icmp_bound_v6src, 0, B_TRUE);
976 	} else {
977 
978 		error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
979 		    V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE);
980 	}
981 
982 	rawip_post_ip_bind_connect(icmp, ire_mp, error);
983 
984 	return (error);
985 }
986 
987 static void
988 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
989 {
990 	conn_t	*connp = Q_TO_CONN(q);
991 	int	error;
992 
993 	/*
994 	 * Allocate the largest primitive we need to send back
995 	 * T_error_ack is > than T_ok_ack
996 	 */
997 	mp = reallocb(mp, sizeof (struct T_error_ack), 1);
998 	if (mp == NULL) {
999 		/* Unable to reuse the T_DISCON_REQ for the ack. */
1000 		icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1001 		return;
1002 	}
1003 
1004 	error = icmp_do_disconnect(connp);
1005 
1006 	if (error != 0) {
1007 		if (error > 0) {
1008 			icmp_err_ack(q, mp, 0, error);
1009 		} else {
1010 			icmp_err_ack(q, mp, -error, 0);
1011 		}
1012 	} else {
1013 		mp = mi_tpi_ok_ack_alloc(mp);
1014 		ASSERT(mp != NULL);
1015 		qreply(q, mp);
1016 	}
1017 
1018 }
1019 
1020 static int
1021 icmp_disconnect(conn_t *connp)
1022 {
1023 	int	error;
1024 	icmp_t	*icmp = connp->conn_icmp;
1025 
1026 	icmp->icmp_dgram_errind = B_FALSE;
1027 
1028 	error = icmp_do_disconnect(connp);
1029 
1030 	if (error < 0)
1031 		error = proto_tlitosyserr(-error);
1032 	return (error);
1033 }
1034 
1035 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1036 static void
1037 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1038 {
1039 	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1040 		qreply(q, mp);
1041 }
1042 
1043 /* Shorthand to generate and send TPI error acks to our client */
1044 static void
1045 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1046     t_scalar_t t_error, int sys_error)
1047 {
1048 	struct T_error_ack	*teackp;
1049 
1050 	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1051 	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
1052 		teackp = (struct T_error_ack *)mp->b_rptr;
1053 		teackp->ERROR_prim = primitive;
1054 		teackp->TLI_error = t_error;
1055 		teackp->UNIX_error = sys_error;
1056 		qreply(q, mp);
1057 	}
1058 }
1059 
1060 /*
1061  * icmp_icmp_error is called by icmp_input to process ICMP
1062  * messages passed up by IP.
1063  * Generates the appropriate permanent (non-transient) errors.
1064  * Assumes that IP has pulled up everything up to and including
1065  * the ICMP header.
1066  */
1067 static void
1068 icmp_icmp_error(conn_t *connp, mblk_t *mp)
1069 {
1070 	icmph_t *icmph;
1071 	ipha_t	*ipha;
1072 	int	iph_hdr_length;
1073 	sin_t	sin;
1074 	mblk_t	*mp1;
1075 	int	error = 0;
1076 	icmp_t	*icmp = connp->conn_icmp;
1077 
1078 	ipha = (ipha_t *)mp->b_rptr;
1079 
1080 	ASSERT(OK_32PTR(mp->b_rptr));
1081 
1082 	if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1083 		ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1084 		icmp_icmp_error_ipv6(connp, mp);
1085 		return;
1086 	}
1087 
1088 	/*
1089 	 * icmp does not support v4 mapped addresses
1090 	 * so we can never be here for a V6 socket
1091 	 * i.e. icmp_family == AF_INET6
1092 	 */
1093 	ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) &&
1094 	    (icmp->icmp_family == AF_INET));
1095 
1096 	ASSERT(icmp->icmp_family == AF_INET);
1097 
1098 	/* Skip past the outer IP and ICMP headers */
1099 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1100 	icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
1101 	ipha = (ipha_t *)&icmph[1];
1102 	iph_hdr_length = IPH_HDR_LENGTH(ipha);
1103 
1104 	switch (icmph->icmph_type) {
1105 	case ICMP_DEST_UNREACHABLE:
1106 		switch (icmph->icmph_code) {
1107 		case ICMP_FRAGMENTATION_NEEDED:
1108 			/*
1109 			 * IP has already adjusted the path MTU.
1110 			 */
1111 			break;
1112 		case ICMP_PORT_UNREACHABLE:
1113 		case ICMP_PROTOCOL_UNREACHABLE:
1114 			error = ECONNREFUSED;
1115 			break;
1116 		default:
1117 			/* Transient errors */
1118 			break;
1119 		}
1120 		break;
1121 	default:
1122 		/* Transient errors */
1123 		break;
1124 	}
1125 	if (error == 0) {
1126 		freemsg(mp);
1127 		return;
1128 	}
1129 
1130 	/*
1131 	 * Deliver T_UDERROR_IND when the application has asked for it.
1132 	 * The socket layer enables this automatically when connected.
1133 	 */
1134 	if (!icmp->icmp_dgram_errind) {
1135 		freemsg(mp);
1136 		return;
1137 	}
1138 
1139 	sin = sin_null;
1140 	sin.sin_family = AF_INET;
1141 	sin.sin_addr.s_addr = ipha->ipha_dst;
1142 
1143 	if (IPCL_IS_NONSTR(connp)) {
1144 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1145 		if (icmp->icmp_state == TS_DATA_XFER) {
1146 			if (sin.sin_addr.s_addr ==
1147 			    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) {
1148 				rw_exit(&icmp->icmp_rwlock);
1149 				(*connp->conn_upcalls->su_set_error)
1150 				    (connp->conn_upper_handle, error);
1151 				goto done;
1152 			}
1153 		} else {
1154 			icmp->icmp_delayed_error = error;
1155 			*((sin_t *)&icmp->icmp_delayed_addr) = sin;
1156 		}
1157 		rw_exit(&icmp->icmp_rwlock);
1158 	} else {
1159 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
1160 		    0, error);
1161 		if (mp1 != NULL)
1162 			putnext(connp->conn_rq, mp1);
1163 	}
1164 done:
1165 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1166 	freemsg(mp);
1167 }
1168 
1169 /*
1170  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6
1171  * for IPv6 packets.
1172  * Send permanent (non-transient) errors upstream.
1173  * Assumes that IP has pulled up all the extension headers as well
1174  * as the ICMPv6 header.
1175  */
1176 static void
1177 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
1178 {
1179 	icmp6_t		*icmp6;
1180 	ip6_t		*ip6h, *outer_ip6h;
1181 	uint16_t	iph_hdr_length;
1182 	uint8_t		*nexthdrp;
1183 	sin6_t		sin6;
1184 	mblk_t		*mp1;
1185 	int		error = 0;
1186 	icmp_t		*icmp = connp->conn_icmp;
1187 
1188 	outer_ip6h = (ip6_t *)mp->b_rptr;
1189 	if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1190 		iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1191 	else
1192 		iph_hdr_length = IPV6_HDR_LEN;
1193 
1194 	icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1195 	ip6h = (ip6_t *)&icmp6[1];
1196 	if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1197 		freemsg(mp);
1198 		return;
1199 	}
1200 
1201 	switch (icmp6->icmp6_type) {
1202 	case ICMP6_DST_UNREACH:
1203 		switch (icmp6->icmp6_code) {
1204 		case ICMP6_DST_UNREACH_NOPORT:
1205 			error = ECONNREFUSED;
1206 			break;
1207 		case ICMP6_DST_UNREACH_ADMIN:
1208 		case ICMP6_DST_UNREACH_NOROUTE:
1209 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
1210 		case ICMP6_DST_UNREACH_ADDR:
1211 			/* Transient errors */
1212 			break;
1213 		default:
1214 			break;
1215 		}
1216 		break;
1217 	case ICMP6_PACKET_TOO_BIG: {
1218 		struct T_unitdata_ind	*tudi;
1219 		struct T_opthdr		*toh;
1220 		size_t			udi_size;
1221 		mblk_t			*newmp;
1222 		t_scalar_t		opt_length = sizeof (struct T_opthdr) +
1223 		    sizeof (struct ip6_mtuinfo);
1224 		sin6_t			*sin6;
1225 		struct ip6_mtuinfo	*mtuinfo;
1226 
1227 		/*
1228 		 * If the application has requested to receive path mtu
1229 		 * information, send up an empty message containing an
1230 		 * IPV6_PATHMTU ancillary data item.
1231 		 */
1232 		if (!icmp->icmp_ipv6_recvpathmtu)
1233 			break;
1234 
1235 		udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1236 		    opt_length;
1237 		if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1238 			BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1239 			break;
1240 		}
1241 
1242 		/*
1243 		 * newmp->b_cont is left to NULL on purpose.  This is an
1244 		 * empty message containing only ancillary data.
1245 		 */
1246 		newmp->b_datap->db_type = M_PROTO;
1247 		tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1248 		newmp->b_wptr = (uchar_t *)tudi + udi_size;
1249 		tudi->PRIM_type = T_UNITDATA_IND;
1250 		tudi->SRC_length = sizeof (sin6_t);
1251 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1252 		tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1253 		tudi->OPT_length = opt_length;
1254 
1255 		sin6 = (sin6_t *)&tudi[1];
1256 		bzero(sin6, sizeof (sin6_t));
1257 		sin6->sin6_family = AF_INET6;
1258 		sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr;
1259 
1260 		toh = (struct T_opthdr *)&sin6[1];
1261 		toh->level = IPPROTO_IPV6;
1262 		toh->name = IPV6_PATHMTU;
1263 		toh->len = opt_length;
1264 		toh->status = 0;
1265 
1266 		mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1267 		bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1268 		mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1269 		mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1270 		mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1271 		/*
1272 		 * We've consumed everything we need from the original
1273 		 * message.  Free it, then send our empty message.
1274 		 */
1275 		freemsg(mp);
1276 		icmp_ulp_recv(connp, newmp);
1277 
1278 		return;
1279 	}
1280 	case ICMP6_TIME_EXCEEDED:
1281 		/* Transient errors */
1282 		break;
1283 	case ICMP6_PARAM_PROB:
1284 		/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1285 		if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1286 		    (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1287 		    (uchar_t *)nexthdrp) {
1288 			error = ECONNREFUSED;
1289 			break;
1290 		}
1291 		break;
1292 	}
1293 	if (error == 0) {
1294 		freemsg(mp);
1295 		return;
1296 	}
1297 
1298 	/*
1299 	 * Deliver T_UDERROR_IND when the application has asked for it.
1300 	 * The socket layer enables this automatically when connected.
1301 	 */
1302 	if (!icmp->icmp_dgram_errind) {
1303 		freemsg(mp);
1304 		return;
1305 	}
1306 
1307 	sin6 = sin6_null;
1308 	sin6.sin6_family = AF_INET6;
1309 	sin6.sin6_addr = ip6h->ip6_dst;
1310 	sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1311 
1312 	if (IPCL_IS_NONSTR(connp)) {
1313 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1314 		if (icmp->icmp_state == TS_DATA_XFER) {
1315 			if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1316 			    &icmp->icmp_v6dst.sin6_addr)) {
1317 				rw_exit(&icmp->icmp_rwlock);
1318 				(*connp->conn_upcalls->su_set_error)
1319 				    (connp->conn_upper_handle, error);
1320 				goto done;
1321 			}
1322 		} else {
1323 			icmp->icmp_delayed_error = error;
1324 			*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1325 		}
1326 		rw_exit(&icmp->icmp_rwlock);
1327 	} else {
1328 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1329 		    NULL, 0, error);
1330 		if (mp1 != NULL)
1331 			putnext(connp->conn_rq, mp1);
1332 	}
1333 done:
1334 	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
1335 	freemsg(mp);
1336 }
1337 
1338 /*
1339  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1340  * The local address is filled in if endpoint is bound. The remote address
1341  * is filled in if remote address has been precified ("connected endpoint")
1342  * (The concept of connected CLTS sockets is alien to published TPI
1343  *  but we support it anyway).
1344  */
1345 static void
1346 icmp_addr_req(queue_t *q, mblk_t *mp)
1347 {
1348 	icmp_t	*icmp = Q_TO_ICMP(q);
1349 	mblk_t	*ackmp;
1350 	struct T_addr_ack *taa;
1351 
1352 	/* Make it large enough for worst case */
1353 	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1354 	    2 * sizeof (sin6_t), 1);
1355 	if (ackmp == NULL) {
1356 		icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1357 		return;
1358 	}
1359 	taa = (struct T_addr_ack *)ackmp->b_rptr;
1360 
1361 	bzero(taa, sizeof (struct T_addr_ack));
1362 	ackmp->b_wptr = (uchar_t *)&taa[1];
1363 
1364 	taa->PRIM_type = T_ADDR_ACK;
1365 	ackmp->b_datap->db_type = M_PCPROTO;
1366 	rw_enter(&icmp->icmp_rwlock, RW_READER);
1367 	/*
1368 	 * Note: Following code assumes 32 bit alignment of basic
1369 	 * data structures like sin_t and struct T_addr_ack.
1370 	 */
1371 	if (icmp->icmp_state != TS_UNBND) {
1372 		/*
1373 		 * Fill in local address
1374 		 */
1375 		taa->LOCADDR_offset = sizeof (*taa);
1376 		if (icmp->icmp_family == AF_INET) {
1377 			sin_t	*sin;
1378 
1379 			taa->LOCADDR_length = sizeof (sin_t);
1380 			sin = (sin_t *)&taa[1];
1381 			/* Fill zeroes and then intialize non-zero fields */
1382 			*sin = sin_null;
1383 			sin->sin_family = AF_INET;
1384 			if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
1385 			    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1386 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src,
1387 				    sin->sin_addr.s_addr);
1388 			} else {
1389 				/*
1390 				 * INADDR_ANY
1391 				 * icmp_v6src is not set, we might be bound to
1392 				 * broadcast/multicast. Use icmp_bound_v6src as
1393 				 * local address instead (that could
1394 				 * also still be INADDR_ANY)
1395 				 */
1396 				IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src,
1397 				    sin->sin_addr.s_addr);
1398 			}
1399 			ackmp->b_wptr = (uchar_t *)&sin[1];
1400 		} else {
1401 			sin6_t	*sin6;
1402 
1403 			ASSERT(icmp->icmp_family == AF_INET6);
1404 			taa->LOCADDR_length = sizeof (sin6_t);
1405 			sin6 = (sin6_t *)&taa[1];
1406 			/* Fill zeroes and then intialize non-zero fields */
1407 			*sin6 = sin6_null;
1408 			sin6->sin6_family = AF_INET6;
1409 			if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
1410 				sin6->sin6_addr = icmp->icmp_v6src;
1411 			} else {
1412 				/*
1413 				 * UNSPECIFIED
1414 				 * icmp_v6src is not set, we might be bound to
1415 				 * broadcast/multicast. Use icmp_bound_v6src as
1416 				 * local address instead (that could
1417 				 * also still be UNSPECIFIED)
1418 				 */
1419 				sin6->sin6_addr = icmp->icmp_bound_v6src;
1420 			}
1421 			ackmp->b_wptr = (uchar_t *)&sin6[1];
1422 		}
1423 	}
1424 	rw_exit(&icmp->icmp_rwlock);
1425 	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1426 	qreply(q, ackmp);
1427 }
1428 
1429 static void
1430 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1431 {
1432 	*tap = icmp_g_t_info_ack;
1433 
1434 	if (icmp->icmp_family == AF_INET6)
1435 		tap->ADDR_size = sizeof (sin6_t);
1436 	else
1437 		tap->ADDR_size = sizeof (sin_t);
1438 	tap->CURRENT_state = icmp->icmp_state;
1439 	tap->OPT_size = icmp_max_optsize;
1440 }
1441 
1442 static void
1443 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1444     t_uscalar_t cap_bits1)
1445 {
1446 	tcap->CAP_bits1 = 0;
1447 
1448 	if (cap_bits1 & TC1_INFO) {
1449 		icmp_copy_info(&tcap->INFO_ack, icmp);
1450 		tcap->CAP_bits1 |= TC1_INFO;
1451 	}
1452 }
1453 
1454 /*
1455  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1456  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1457  * icmp_g_t_info_ack.  The current state of the stream is copied from
1458  * icmp_state.
1459  */
1460 static void
1461 icmp_capability_req(queue_t *q, mblk_t *mp)
1462 {
1463 	icmp_t			*icmp = Q_TO_ICMP(q);
1464 	t_uscalar_t		cap_bits1;
1465 	struct T_capability_ack	*tcap;
1466 
1467 	cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1468 
1469 	mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1470 	    mp->b_datap->db_type, T_CAPABILITY_ACK);
1471 	if (!mp)
1472 		return;
1473 
1474 	tcap = (struct T_capability_ack *)mp->b_rptr;
1475 
1476 	icmp_do_capability_ack(icmp, tcap, cap_bits1);
1477 
1478 	qreply(q, mp);
1479 }
1480 
1481 /*
1482  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1483  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1484  * The current state of the stream is copied from icmp_state.
1485  */
1486 static void
1487 icmp_info_req(queue_t *q, mblk_t *mp)
1488 {
1489 	icmp_t	*icmp = Q_TO_ICMP(q);
1490 
1491 	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1492 	    T_INFO_ACK);
1493 	if (!mp)
1494 		return;
1495 	icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1496 	qreply(q, mp);
1497 }
1498 
1499 /* For /dev/icmp aka AF_INET open */
1500 static int
1501 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1502     int family)
1503 {
1504 	conn_t *connp;
1505 	dev_t	conn_dev;
1506 	icmp_stack_t *is;
1507 	int	error;
1508 
1509 	conn_dev = NULL;
1510 
1511 	/* If the stream is already open, return immediately. */
1512 	if (q->q_ptr != NULL)
1513 		return (0);
1514 
1515 	if (sflag == MODOPEN)
1516 		return (EINVAL);
1517 
1518 	/*
1519 	 * Since ICMP is not used so heavily, allocating from the small
1520 	 * arena should be sufficient.
1521 	 */
1522 	if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1523 		return (EBUSY);
1524 	}
1525 
1526 	if (flag & SO_FALLBACK) {
1527 		/*
1528 		 * Non streams socket needs a stream to fallback to
1529 		 */
1530 		RD(q)->q_ptr = (void *)conn_dev;
1531 		WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1532 		WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1533 		qprocson(q);
1534 		return (0);
1535 	}
1536 
1537 	connp = icmp_open(family, credp, &error, KM_SLEEP);
1538 	if (connp == NULL) {
1539 		ASSERT(error != NULL);
1540 		inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1541 		return (error);
1542 	}
1543 
1544 	*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1545 	connp->conn_dev = conn_dev;
1546 	connp->conn_minor_arena = ip_minor_arena_sa;
1547 
1548 	is = connp->conn_icmp->icmp_is;
1549 
1550 	/*
1551 	 * Initialize the icmp_t structure for this stream.
1552 	 */
1553 	q->q_ptr = connp;
1554 	WR(q)->q_ptr = connp;
1555 	connp->conn_rq = q;
1556 	connp->conn_wq = WR(q);
1557 
1558 	if (connp->conn_icmp->icmp_family == AF_INET6) {
1559 		/* Build initial header template for transmit */
1560 		rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
1561 		if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) {
1562 			rw_exit(&connp->conn_icmp->icmp_rwlock);
1563 			inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
1564 			ipcl_conn_destroy(connp);
1565 			return (error);
1566 		}
1567 		rw_exit(&connp->conn_icmp->icmp_rwlock);
1568 	}
1569 
1570 
1571 	q->q_hiwat = is->is_recv_hiwat;
1572 	WR(q)->q_hiwat = is->is_xmit_hiwat;
1573 	WR(q)->q_lowat = is->is_xmit_lowat;
1574 
1575 	qprocson(q);
1576 
1577 	/* Set the Stream head write offset. */
1578 	(void) proto_set_tx_wroff(q, connp,
1579 	    connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra);
1580 	(void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat);
1581 
1582 	mutex_enter(&connp->conn_lock);
1583 	connp->conn_state_flags &= ~CONN_INCIPIENT;
1584 	mutex_exit(&connp->conn_lock);
1585 
1586 	return (0);
1587 }
1588 
1589 /* For /dev/icmp4 aka AF_INET open */
1590 static int
1591 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1592 {
1593 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1594 }
1595 
1596 /* For /dev/icmp6 aka AF_INET6 open */
1597 static int
1598 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1599 {
1600 	return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1601 }
1602 
1603 /*
1604  * This is the open routine for icmp.  It allocates a icmp_t structure for
1605  * the stream and, on the first open of the module, creates an ND table.
1606  */
1607 /* ARGSUSED */
1608 static conn_t *
1609 icmp_open(int family, cred_t *credp, int *err, int flags)
1610 {
1611 	icmp_t	*icmp;
1612 	conn_t *connp;
1613 	zoneid_t zoneid;
1614 	netstack_t *ns;
1615 	icmp_stack_t *is;
1616 	boolean_t isv6 = B_FALSE;
1617 
1618 	*err = secpolicy_net_icmpaccess(credp);
1619 	if (*err != 0)
1620 		return (NULL);
1621 
1622 	if (family == AF_INET6)
1623 		isv6 = B_TRUE;
1624 	ns = netstack_find_by_cred(credp);
1625 	ASSERT(ns != NULL);
1626 	is = ns->netstack_icmp;
1627 	ASSERT(is != NULL);
1628 
1629 	/*
1630 	 * For exclusive stacks we set the zoneid to zero
1631 	 * to make ICMP operate as if in the global zone.
1632 	 */
1633 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1634 		zoneid = GLOBAL_ZONEID;
1635 	else
1636 		zoneid = crgetzoneid(credp);
1637 
1638 	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1639 
1640 	connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1641 	icmp = connp->conn_icmp;
1642 	icmp->icmp_v6dst = sin6_null;
1643 
1644 	/*
1645 	 * ipcl_conn_create did a netstack_hold. Undo the hold that was
1646 	 * done by netstack_find_by_cred()
1647 	 */
1648 	netstack_rele(ns);
1649 
1650 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
1651 	ASSERT(connp->conn_ulp == IPPROTO_ICMP);
1652 	ASSERT(connp->conn_icmp == icmp);
1653 	ASSERT(icmp->icmp_connp == connp);
1654 
1655 	/* Set the initial state of the stream and the privilege status. */
1656 	icmp->icmp_state = TS_UNBND;
1657 	if (isv6) {
1658 		icmp->icmp_ipversion = IPV6_VERSION;
1659 		icmp->icmp_family = AF_INET6;
1660 		connp->conn_ulp = IPPROTO_ICMPV6;
1661 		/* May be changed by a SO_PROTOTYPE socket option. */
1662 		icmp->icmp_proto = IPPROTO_ICMPV6;
1663 		icmp->icmp_checksum_off = 2;	/* Offset for icmp6_cksum */
1664 		icmp->icmp_max_hdr_len = IPV6_HDR_LEN;
1665 		icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit;
1666 		connp->conn_af_isv6 = B_TRUE;
1667 		connp->conn_flags |= IPCL_ISV6;
1668 	} else {
1669 		icmp->icmp_ipversion = IPV4_VERSION;
1670 		icmp->icmp_family = AF_INET;
1671 		/* May be changed by a SO_PROTOTYPE socket option. */
1672 		icmp->icmp_proto = IPPROTO_ICMP;
1673 		icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH;
1674 		icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl;
1675 		connp->conn_af_isv6 = B_FALSE;
1676 		connp->conn_flags &= ~IPCL_ISV6;
1677 	}
1678 	icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1679 	icmp->icmp_pending_op = -1;
1680 	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
1681 	connp->conn_zoneid = zoneid;
1682 
1683 	/*
1684 	 * If the caller has the process-wide flag set, then default to MAC
1685 	 * exempt mode.  This allows read-down to unlabeled hosts.
1686 	 */
1687 	if (getpflags(NET_MAC_AWARE, credp) != 0)
1688 		connp->conn_mac_exempt = B_TRUE;
1689 
1690 	connp->conn_ulp_labeled = is_system_labeled();
1691 
1692 	icmp->icmp_is = is;
1693 
1694 	connp->conn_recv = icmp_input;
1695 	crhold(credp);
1696 	connp->conn_cred = credp;
1697 
1698 	rw_exit(&icmp->icmp_rwlock);
1699 
1700 	connp->conn_flow_cntrld = B_FALSE;
1701 	return (connp);
1702 }
1703 
1704 /*
1705  * Which ICMP options OK to set through T_UNITDATA_REQ...
1706  */
1707 /* ARGSUSED */
1708 static boolean_t
1709 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1710 {
1711 	return (B_TRUE);
1712 }
1713 
1714 /*
1715  * This routine gets default values of certain options whose default
1716  * values are maintained by protcol specific code
1717  */
1718 /* ARGSUSED */
1719 int
1720 icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
1721 {
1722 	icmp_t *icmp = Q_TO_ICMP(q);
1723 	icmp_stack_t *is = icmp->icmp_is;
1724 	int *i1 = (int *)ptr;
1725 
1726 	switch (level) {
1727 	case IPPROTO_IP:
1728 		switch (name) {
1729 		case IP_MULTICAST_TTL:
1730 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1731 			return (sizeof (uchar_t));
1732 		case IP_MULTICAST_LOOP:
1733 			*ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1734 			return (sizeof (uchar_t));
1735 		}
1736 		break;
1737 	case IPPROTO_IPV6:
1738 		switch (name) {
1739 		case IPV6_MULTICAST_HOPS:
1740 			*i1 = IP_DEFAULT_MULTICAST_TTL;
1741 			return (sizeof (int));
1742 		case IPV6_MULTICAST_LOOP:
1743 			*i1 = IP_DEFAULT_MULTICAST_LOOP;
1744 			return (sizeof (int));
1745 		case IPV6_UNICAST_HOPS:
1746 			*i1 = is->is_ipv6_hoplimit;
1747 			return (sizeof (int));
1748 		}
1749 		break;
1750 	case IPPROTO_ICMPV6:
1751 		switch (name) {
1752 		case ICMP6_FILTER:
1753 			/* Make it look like "pass all" */
1754 			ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1755 			return (sizeof (icmp6_filter_t));
1756 		}
1757 		break;
1758 	}
1759 	return (-1);
1760 }
1761 
1762 /*
1763  * This routine retrieves the current status of socket options.
1764  * It returns the size of the option retrieved.
1765  */
1766 int
1767 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1768 {
1769 	icmp_t		*icmp = connp->conn_icmp;
1770 	icmp_stack_t	*is = icmp->icmp_is;
1771 	int		*i1 = (int *)ptr;
1772 	ip6_pkt_t	*ipp = &icmp->icmp_sticky_ipp;
1773 	int		ret = 0;
1774 
1775 	ASSERT(RW_READ_HELD(&icmp->icmp_rwlock));
1776 	switch (level) {
1777 	case SOL_SOCKET:
1778 		switch (name) {
1779 		case SO_DEBUG:
1780 			*i1 = icmp->icmp_debug;
1781 			break;
1782 		case SO_TYPE:
1783 			*i1 = SOCK_RAW;
1784 			break;
1785 		case SO_PROTOTYPE:
1786 			*i1 = icmp->icmp_proto;
1787 			break;
1788 		case SO_REUSEADDR:
1789 			*i1 = icmp->icmp_reuseaddr;
1790 			break;
1791 
1792 		/*
1793 		 * The following three items are available here,
1794 		 * but are only meaningful to IP.
1795 		 */
1796 		case SO_DONTROUTE:
1797 			*i1 = icmp->icmp_dontroute;
1798 			break;
1799 		case SO_USELOOPBACK:
1800 			*i1 = icmp->icmp_useloopback;
1801 			break;
1802 		case SO_BROADCAST:
1803 			*i1 = icmp->icmp_broadcast;
1804 			break;
1805 
1806 		case SO_SNDBUF:
1807 			ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX);
1808 			*i1 = icmp->icmp_xmit_hiwat;
1809 			break;
1810 		case SO_RCVBUF:
1811 			ASSERT(icmp->icmp_recv_hiwat <= INT_MAX);
1812 			*i1 = icmp->icmp_recv_hiwat;
1813 			break;
1814 		case SO_DGRAM_ERRIND:
1815 			*i1 = icmp->icmp_dgram_errind;
1816 			break;
1817 		case SO_TIMESTAMP:
1818 			*i1 = icmp->icmp_timestamp;
1819 			break;
1820 		case SO_MAC_EXEMPT:
1821 			*i1 = connp->conn_mac_exempt;
1822 			break;
1823 		case SO_DOMAIN:
1824 			*i1 = icmp->icmp_family;
1825 			break;
1826 
1827 		/*
1828 		 * Following four not meaningful for icmp
1829 		 * Action is same as "default" to which we fallthrough
1830 		 * so we keep them in comments.
1831 		 * case SO_LINGER:
1832 		 * case SO_KEEPALIVE:
1833 		 * case SO_OOBINLINE:
1834 		 * case SO_ALLZONES:
1835 		 */
1836 		default:
1837 			ret = -1;
1838 			goto done;
1839 		}
1840 		break;
1841 	case IPPROTO_IP:
1842 		/*
1843 		 * Only allow IPv4 option processing on IPv4 sockets.
1844 		 */
1845 		if (icmp->icmp_family != AF_INET) {
1846 			ret = -1;
1847 			goto done;
1848 		}
1849 
1850 		switch (name) {
1851 		case IP_OPTIONS:
1852 		case T_IP_OPTIONS:
1853 			/* Options are passed up with each packet */
1854 			ret = 0;
1855 			goto done;
1856 		case IP_HDRINCL:
1857 			*i1 = (int)icmp->icmp_hdrincl;
1858 			break;
1859 		case IP_TOS:
1860 		case T_IP_TOS:
1861 			*i1 = (int)icmp->icmp_type_of_service;
1862 			break;
1863 		case IP_TTL:
1864 			*i1 = (int)icmp->icmp_ttl;
1865 			break;
1866 		case IP_MULTICAST_IF:
1867 			/* 0 address if not set */
1868 			*(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
1869 			ret = sizeof (ipaddr_t);
1870 			goto done;
1871 		case IP_MULTICAST_TTL:
1872 			*(uchar_t *)ptr = icmp->icmp_multicast_ttl;
1873 			ret = sizeof (uchar_t);
1874 			goto done;
1875 		case IP_MULTICAST_LOOP:
1876 			*ptr = connp->conn_multicast_loop;
1877 			ret = sizeof (uint8_t);
1878 			goto done;
1879 		case IP_BOUND_IF:
1880 			/* Zero if not set */
1881 			*i1 = icmp->icmp_bound_if;
1882 			break;	/* goto sizeof (int) option return */
1883 		case IP_UNSPEC_SRC:
1884 			*ptr = icmp->icmp_unspec_source;
1885 			break;	/* goto sizeof (int) option return */
1886 		case IP_RECVIF:
1887 			*ptr = icmp->icmp_recvif;
1888 			break;	/* goto sizeof (int) option return */
1889 		case IP_BROADCAST_TTL:
1890 			*(uchar_t *)ptr = connp->conn_broadcast_ttl;
1891 			return (sizeof (uchar_t));
1892 		case IP_RECVPKTINFO:
1893 			/*
1894 			 * This also handles IP_PKTINFO.
1895 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
1896 			 * Differentiation is based on the size of the argument
1897 			 * passed in.
1898 			 * This option is handled in IP which will return an
1899 			 * error for IP_PKTINFO as it's not supported as a
1900 			 * sticky option.
1901 			 */
1902 			ret = -EINVAL;
1903 			goto done;
1904 		/*
1905 		 * Cannot "get" the value of following options
1906 		 * at this level. Action is same as "default" to
1907 		 * which we fallthrough so we keep them in comments.
1908 		 *
1909 		 * case IP_ADD_MEMBERSHIP:
1910 		 * case IP_DROP_MEMBERSHIP:
1911 		 * case IP_BLOCK_SOURCE:
1912 		 * case IP_UNBLOCK_SOURCE:
1913 		 * case IP_ADD_SOURCE_MEMBERSHIP:
1914 		 * case IP_DROP_SOURCE_MEMBERSHIP:
1915 		 * case MCAST_JOIN_GROUP:
1916 		 * case MCAST_LEAVE_GROUP:
1917 		 * case MCAST_BLOCK_SOURCE:
1918 		 * case MCAST_UNBLOCK_SOURCE:
1919 		 * case MCAST_JOIN_SOURCE_GROUP:
1920 		 * case MCAST_LEAVE_SOURCE_GROUP:
1921 		 * case MRT_INIT:
1922 		 * case MRT_DONE:
1923 		 * case MRT_ADD_VIF:
1924 		 * case MRT_DEL_VIF:
1925 		 * case MRT_ADD_MFC:
1926 		 * case MRT_DEL_MFC:
1927 		 * case MRT_VERSION:
1928 		 * case MRT_ASSERT:
1929 		 * case IP_SEC_OPT:
1930 		 * case IP_NEXTHOP:
1931 		 */
1932 		default:
1933 			ret = -1;
1934 			goto done;
1935 		}
1936 		break;
1937 	case IPPROTO_IPV6:
1938 		/*
1939 		 * Only allow IPv6 option processing on native IPv6 sockets.
1940 		 */
1941 		if (icmp->icmp_family != AF_INET6) {
1942 			ret = -1;
1943 			goto done;
1944 		}
1945 		switch (name) {
1946 		case IPV6_UNICAST_HOPS:
1947 			*i1 = (unsigned int)icmp->icmp_ttl;
1948 			break;
1949 		case IPV6_MULTICAST_IF:
1950 			/* 0 index if not set */
1951 			*i1 = icmp->icmp_multicast_if_index;
1952 			break;
1953 		case IPV6_MULTICAST_HOPS:
1954 			*i1 = icmp->icmp_multicast_ttl;
1955 			break;
1956 		case IPV6_MULTICAST_LOOP:
1957 			*i1 = connp->conn_multicast_loop;
1958 			break;
1959 		case IPV6_BOUND_IF:
1960 			/* Zero if not set */
1961 			*i1 = icmp->icmp_bound_if;
1962 			break;
1963 		case IPV6_UNSPEC_SRC:
1964 			*i1 = icmp->icmp_unspec_source;
1965 			break;
1966 		case IPV6_CHECKSUM:
1967 			/*
1968 			 * Return offset or -1 if no checksum offset.
1969 			 * Does not apply to IPPROTO_ICMPV6
1970 			 */
1971 			if (icmp->icmp_proto == IPPROTO_ICMPV6) {
1972 				ret = -1;
1973 				goto done;
1974 			}
1975 
1976 			if (icmp->icmp_raw_checksum) {
1977 				*i1 = icmp->icmp_checksum_off;
1978 			} else {
1979 				*i1 = -1;
1980 			}
1981 			break;
1982 		case IPV6_JOIN_GROUP:
1983 		case IPV6_LEAVE_GROUP:
1984 		case MCAST_JOIN_GROUP:
1985 		case MCAST_LEAVE_GROUP:
1986 		case MCAST_BLOCK_SOURCE:
1987 		case MCAST_UNBLOCK_SOURCE:
1988 		case MCAST_JOIN_SOURCE_GROUP:
1989 		case MCAST_LEAVE_SOURCE_GROUP:
1990 			/* cannot "get" the value for these */
1991 			ret = -1;
1992 			goto done;
1993 		case IPV6_RECVPKTINFO:
1994 			*i1 = icmp->icmp_ip_recvpktinfo;
1995 			break;
1996 		case IPV6_RECVTCLASS:
1997 			*i1 = icmp->icmp_ipv6_recvtclass;
1998 			break;
1999 		case IPV6_RECVPATHMTU:
2000 			*i1 = icmp->icmp_ipv6_recvpathmtu;
2001 			break;
2002 		case IPV6_V6ONLY:
2003 			*i1 = 1;
2004 			break;
2005 		case IPV6_RECVHOPLIMIT:
2006 			*i1 = icmp->icmp_ipv6_recvhoplimit;
2007 			break;
2008 		case IPV6_RECVHOPOPTS:
2009 			*i1 = icmp->icmp_ipv6_recvhopopts;
2010 			break;
2011 		case IPV6_RECVDSTOPTS:
2012 			*i1 = icmp->icmp_ipv6_recvdstopts;
2013 			break;
2014 		case _OLD_IPV6_RECVDSTOPTS:
2015 			*i1 = icmp->icmp_old_ipv6_recvdstopts;
2016 			break;
2017 		case IPV6_RECVRTHDRDSTOPTS:
2018 			*i1 = icmp->icmp_ipv6_recvrtdstopts;
2019 			break;
2020 		case IPV6_RECVRTHDR:
2021 			*i1 = icmp->icmp_ipv6_recvrthdr;
2022 			break;
2023 		case IPV6_PKTINFO: {
2024 			/* XXX assumes that caller has room for max size! */
2025 			struct in6_pktinfo *pkti;
2026 
2027 			pkti = (struct in6_pktinfo *)ptr;
2028 			if (ipp->ipp_fields & IPPF_IFINDEX)
2029 				pkti->ipi6_ifindex = ipp->ipp_ifindex;
2030 			else
2031 				pkti->ipi6_ifindex = 0;
2032 			if (ipp->ipp_fields & IPPF_ADDR)
2033 				pkti->ipi6_addr = ipp->ipp_addr;
2034 			else
2035 				pkti->ipi6_addr = ipv6_all_zeros;
2036 			ret = sizeof (struct in6_pktinfo);
2037 			goto done;
2038 		}
2039 		case IPV6_NEXTHOP: {
2040 			sin6_t *sin6 = (sin6_t *)ptr;
2041 
2042 			if (!(ipp->ipp_fields & IPPF_NEXTHOP))
2043 				return (0);
2044 			*sin6 = sin6_null;
2045 			sin6->sin6_family = AF_INET6;
2046 			sin6->sin6_addr = ipp->ipp_nexthop;
2047 			ret = (sizeof (sin6_t));
2048 			goto done;
2049 		}
2050 		case IPV6_HOPOPTS:
2051 			if (!(ipp->ipp_fields & IPPF_HOPOPTS))
2052 				return (0);
2053 			if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6)
2054 				return (0);
2055 			bcopy((char *)ipp->ipp_hopopts +
2056 			    icmp->icmp_label_len_v6, ptr,
2057 			    ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2058 			if (icmp->icmp_label_len_v6 > 0) {
2059 				ptr[0] = ((char *)ipp->ipp_hopopts)[0];
2060 				ptr[1] = (ipp->ipp_hopoptslen -
2061 				    icmp->icmp_label_len_v6 + 7) / 8 - 1;
2062 			}
2063 			ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
2064 			goto done;
2065 		case IPV6_RTHDRDSTOPTS:
2066 			if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
2067 				return (0);
2068 			bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
2069 			ret = ipp->ipp_rtdstoptslen;
2070 			goto done;
2071 		case IPV6_RTHDR:
2072 			if (!(ipp->ipp_fields & IPPF_RTHDR))
2073 				return (0);
2074 			bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
2075 			ret = ipp->ipp_rthdrlen;
2076 			goto done;
2077 		case IPV6_DSTOPTS:
2078 			if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2079 				ret = 0;
2080 				goto done;
2081 			}
2082 			bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
2083 			ret = ipp->ipp_dstoptslen;
2084 			goto done;
2085 		case IPV6_PATHMTU:
2086 			if (!(ipp->ipp_fields & IPPF_PATHMTU)) {
2087 				ret = 0;
2088 			} else {
2089 				ret = ip_fill_mtuinfo(
2090 				    &icmp->icmp_v6dst.sin6_addr, 0,
2091 				    (struct ip6_mtuinfo *)ptr,
2092 				    is->is_netstack);
2093 			}
2094 			goto done;
2095 		case IPV6_TCLASS:
2096 			if (ipp->ipp_fields & IPPF_TCLASS)
2097 				*i1 = ipp->ipp_tclass;
2098 			else
2099 				*i1 = IPV6_FLOW_TCLASS(
2100 				    IPV6_DEFAULT_VERS_AND_FLOW);
2101 			break;
2102 		default:
2103 			ret = -1;
2104 			goto done;
2105 		}
2106 		break;
2107 	case IPPROTO_ICMPV6:
2108 		/*
2109 		 * Only allow IPv6 option processing on native IPv6 sockets.
2110 		 */
2111 		if (icmp->icmp_family != AF_INET6) {
2112 			ret = -1;
2113 		}
2114 
2115 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
2116 			ret = -1;
2117 		}
2118 
2119 		switch (name) {
2120 		case ICMP6_FILTER:
2121 			if (icmp->icmp_filter == NULL) {
2122 				/* Make it look like "pass all" */
2123 				ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
2124 			} else {
2125 				(void) bcopy(icmp->icmp_filter, ptr,
2126 				    sizeof (icmp6_filter_t));
2127 			}
2128 			ret = sizeof (icmp6_filter_t);
2129 			goto done;
2130 		default:
2131 			ret = -1;
2132 			goto done;
2133 		}
2134 	default:
2135 		ret = -1;
2136 		goto done;
2137 	}
2138 	ret = sizeof (int);
2139 done:
2140 	return (ret);
2141 }
2142 
2143 /*
2144  * This routine retrieves the current status of socket options.
2145  * It returns the size of the option retrieved.
2146  */
2147 int
2148 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
2149 {
2150 	conn_t  *connp = Q_TO_CONN(q);
2151 	icmp_t	*icmp = connp->conn_icmp;
2152 	int 	err;
2153 
2154 	rw_enter(&icmp->icmp_rwlock, RW_READER);
2155 	err = icmp_opt_get(connp, level, name, ptr);
2156 	rw_exit(&icmp->icmp_rwlock);
2157 	return (err);
2158 }
2159 
2160 int
2161 icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
2162     uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
2163     void *thisdg_attrs, boolean_t checkonly)
2164 {
2165 
2166 	int	*i1 = (int *)invalp;
2167 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
2168 	icmp_t *icmp = connp->conn_icmp;
2169 	icmp_stack_t *is = icmp->icmp_is;
2170 	int	error;
2171 
2172 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
2173 	/*
2174 	 * For fixed length options, no sanity check
2175 	 * of passed in length is done. It is assumed *_optcom_req()
2176 	 * routines do the right thing.
2177 	 */
2178 	switch (level) {
2179 	case SOL_SOCKET:
2180 		switch (name) {
2181 		case SO_DEBUG:
2182 			if (!checkonly)
2183 				icmp->icmp_debug = onoff;
2184 			break;
2185 		case SO_PROTOTYPE:
2186 			if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2187 			    (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2188 			    secpolicy_net_rawaccess(cr) != 0) {
2189 				*outlenp = 0;
2190 				return (EACCES);
2191 			}
2192 			/* Can't use IPPROTO_RAW with IPv6 */
2193 			if ((*i1 & 0xFF) == IPPROTO_RAW &&
2194 			    icmp->icmp_family == AF_INET6) {
2195 				*outlenp = 0;
2196 				return (EPROTONOSUPPORT);
2197 			}
2198 			if (checkonly) {
2199 				/* T_CHECK case */
2200 				*(int *)outvalp = (*i1 & 0xFF);
2201 				break;
2202 			}
2203 			icmp->icmp_proto = *i1 & 0xFF;
2204 			if ((icmp->icmp_proto == IPPROTO_RAW ||
2205 			    icmp->icmp_proto == IPPROTO_IGMP) &&
2206 			    icmp->icmp_family == AF_INET)
2207 				icmp->icmp_hdrincl = 1;
2208 			else
2209 				icmp->icmp_hdrincl = 0;
2210 
2211 			if (icmp->icmp_family == AF_INET6 &&
2212 			    icmp->icmp_proto == IPPROTO_ICMPV6) {
2213 				/* Set offset for icmp6_cksum */
2214 				icmp->icmp_raw_checksum = 0;
2215 				icmp->icmp_checksum_off = 2;
2216 			}
2217 			if (icmp->icmp_proto == IPPROTO_UDP ||
2218 			    icmp->icmp_proto == IPPROTO_TCP ||
2219 			    icmp->icmp_proto == IPPROTO_SCTP) {
2220 				icmp->icmp_no_tp_cksum = 1;
2221 				icmp->icmp_sticky_ipp.ipp_fields |=
2222 				    IPPF_NO_CKSUM;
2223 			} else {
2224 				icmp->icmp_no_tp_cksum = 0;
2225 				icmp->icmp_sticky_ipp.ipp_fields &=
2226 				    ~IPPF_NO_CKSUM;
2227 			}
2228 
2229 			if (icmp->icmp_filter != NULL &&
2230 			    icmp->icmp_proto != IPPROTO_ICMPV6) {
2231 				kmem_free(icmp->icmp_filter,
2232 				    sizeof (icmp6_filter_t));
2233 				icmp->icmp_filter = NULL;
2234 			}
2235 
2236 			/* Rebuild the header template */
2237 			error = icmp_build_hdrs(icmp);
2238 			if (error != 0) {
2239 				*outlenp = 0;
2240 				return (error);
2241 			}
2242 
2243 			/*
2244 			 * For SCTP, we don't use icmp_bind_proto() for
2245 			 * raw socket binding.  Note that we do not need
2246 			 * to set *outlenp.
2247 			 * FIXME: how does SCTP work?
2248 			 */
2249 			if (icmp->icmp_proto == IPPROTO_SCTP)
2250 				return (0);
2251 
2252 			*outlenp = sizeof (int);
2253 			*(int *)outvalp = *i1 & 0xFF;
2254 
2255 			/* Drop lock across the bind operation */
2256 			rw_exit(&icmp->icmp_rwlock);
2257 			(void) icmp_bind_proto(connp);
2258 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2259 			return (0);
2260 		case SO_REUSEADDR:
2261 			if (!checkonly) {
2262 				icmp->icmp_reuseaddr = onoff;
2263 				PASS_OPT_TO_IP(connp);
2264 			}
2265 			break;
2266 
2267 		/*
2268 		 * The following three items are available here,
2269 		 * but are only meaningful to IP.
2270 		 */
2271 		case SO_DONTROUTE:
2272 			if (!checkonly) {
2273 				icmp->icmp_dontroute = onoff;
2274 				PASS_OPT_TO_IP(connp);
2275 			}
2276 			break;
2277 		case SO_USELOOPBACK:
2278 			if (!checkonly) {
2279 				icmp->icmp_useloopback = onoff;
2280 				PASS_OPT_TO_IP(connp);
2281 			}
2282 			break;
2283 		case SO_BROADCAST:
2284 			if (!checkonly) {
2285 				icmp->icmp_broadcast = onoff;
2286 				PASS_OPT_TO_IP(connp);
2287 			}
2288 			break;
2289 
2290 		case SO_SNDBUF:
2291 			if (*i1 > is->is_max_buf) {
2292 				*outlenp = 0;
2293 				return (ENOBUFS);
2294 			}
2295 			if (!checkonly) {
2296 				if (!IPCL_IS_NONSTR(connp)) {
2297 					connp->conn_wq->q_hiwat = *i1;
2298 				}
2299 				icmp->icmp_xmit_hiwat = *i1;
2300 			}
2301 			break;
2302 		case SO_RCVBUF:
2303 			if (*i1 > is->is_max_buf) {
2304 				*outlenp = 0;
2305 				return (ENOBUFS);
2306 			}
2307 			if (!checkonly) {
2308 				icmp->icmp_recv_hiwat = *i1;
2309 				rw_exit(&icmp->icmp_rwlock);
2310 				(void) proto_set_rx_hiwat(connp->conn_rq, connp,
2311 				    *i1);
2312 				rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2313 			}
2314 			break;
2315 		case SO_DGRAM_ERRIND:
2316 			if (!checkonly)
2317 				icmp->icmp_dgram_errind = onoff;
2318 			break;
2319 		case SO_ALLZONES:
2320 			/*
2321 			 * "soft" error (negative)
2322 			 * option not handled at this level
2323 			 * Note: Do not modify *outlenp
2324 			 */
2325 			return (-EINVAL);
2326 		case SO_TIMESTAMP:
2327 			if (!checkonly) {
2328 				icmp->icmp_timestamp = onoff;
2329 			}
2330 			break;
2331 		case SO_MAC_EXEMPT:
2332 			/*
2333 			 * "soft" error (negative)
2334 			 * option not handled at this level
2335 			 * Note: Do not modify *outlenp
2336 			 */
2337 			return (-EINVAL);
2338 		case SO_RCVTIMEO:
2339 		case SO_SNDTIMEO:
2340 			/*
2341 			 * Pass these two options in order for third part
2342 			 * protocol usage. Here just return directly.
2343 			 */
2344 			return (0);
2345 		/*
2346 		 * Following three not meaningful for icmp
2347 		 * Action is same as "default" so we keep them
2348 		 * in comments.
2349 		 * case SO_LINGER:
2350 		 * case SO_KEEPALIVE:
2351 		 * case SO_OOBINLINE:
2352 		 */
2353 		default:
2354 			*outlenp = 0;
2355 			return (EINVAL);
2356 		}
2357 		break;
2358 	case IPPROTO_IP:
2359 		/*
2360 		 * Only allow IPv4 option processing on IPv4 sockets.
2361 		 */
2362 		if (icmp->icmp_family != AF_INET) {
2363 			*outlenp = 0;
2364 			return (ENOPROTOOPT);
2365 		}
2366 		switch (name) {
2367 		case IP_OPTIONS:
2368 		case T_IP_OPTIONS:
2369 			/* Save options for use by IP. */
2370 			if ((inlen & 0x3) ||
2371 			    inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) {
2372 				*outlenp = 0;
2373 				return (EINVAL);
2374 			}
2375 			if (checkonly)
2376 				break;
2377 
2378 			if (!tsol_option_set(&icmp->icmp_ip_snd_options,
2379 			    &icmp->icmp_ip_snd_options_len,
2380 			    icmp->icmp_label_len, invalp, inlen)) {
2381 				*outlenp = 0;
2382 				return (ENOMEM);
2383 			}
2384 
2385 			icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
2386 			    icmp->icmp_ip_snd_options_len;
2387 			rw_exit(&icmp->icmp_rwlock);
2388 			(void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL:
2389 			    RD(connp->conn_rq), connp,
2390 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
2391 			rw_enter(&icmp->icmp_rwlock, RW_WRITER);
2392 			break;
2393 		case IP_HDRINCL:
2394 			if (!checkonly)
2395 				icmp->icmp_hdrincl = onoff;
2396 			break;
2397 		case IP_TOS:
2398 		case T_IP_TOS:
2399 			if (!checkonly) {
2400 				icmp->icmp_type_of_service = (uint8_t)*i1;
2401 			}
2402 			break;
2403 		case IP_TTL:
2404 			if (!checkonly) {
2405 				icmp->icmp_ttl = (uint8_t)*i1;
2406 			}
2407 			break;
2408 		case IP_MULTICAST_IF:
2409 			/*
2410 			 * TODO should check OPTMGMT reply and undo this if
2411 			 * there is an error.
2412 			 */
2413 			if (!checkonly) {
2414 				icmp->icmp_multicast_if_addr = *i1;
2415 				PASS_OPT_TO_IP(connp);
2416 			}
2417 			break;
2418 		case IP_MULTICAST_TTL:
2419 			if (!checkonly)
2420 				icmp->icmp_multicast_ttl = *invalp;
2421 			break;
2422 		case IP_MULTICAST_LOOP:
2423 			if (!checkonly) {
2424 				connp->conn_multicast_loop =
2425 				    (*invalp == 0) ? 0 : 1;
2426 				PASS_OPT_TO_IP(connp);
2427 			}
2428 			break;
2429 		case IP_BOUND_IF:
2430 			if (!checkonly) {
2431 				icmp->icmp_bound_if = *i1;
2432 				PASS_OPT_TO_IP(connp);
2433 			}
2434 			break;
2435 		case IP_UNSPEC_SRC:
2436 			if (!checkonly) {
2437 				icmp->icmp_unspec_source = onoff;
2438 				PASS_OPT_TO_IP(connp);
2439 			}
2440 			break;
2441 		case IP_BROADCAST_TTL:
2442 			if (!checkonly)
2443 				connp->conn_broadcast_ttl = *invalp;
2444 			break;
2445 		case IP_RECVIF:
2446 			if (!checkonly) {
2447 				icmp->icmp_recvif = onoff;
2448 			}
2449 			/*
2450 			 * pass to ip
2451 			 */
2452 			return (-EINVAL);
2453 		case IP_PKTINFO: {
2454 			/*
2455 			 * This also handles IP_RECVPKTINFO.
2456 			 * IP_PKTINFO and IP_RECVPKTINFO have the same value.
2457 			 * Differentiation is based on the size of the argument
2458 			 * passed in.
2459 			 */
2460 			struct in_pktinfo *pktinfop;
2461 			ip4_pkt_t *attr_pktinfop;
2462 
2463 			if (checkonly)
2464 				break;
2465 
2466 			if (inlen == sizeof (int)) {
2467 				/*
2468 				 * This is IP_RECVPKTINFO option.
2469 				 * Keep a local copy of wether this option is
2470 				 * set or not and pass it down to IP for
2471 				 * processing.
2472 				 */
2473 				icmp->icmp_ip_recvpktinfo = onoff;
2474 				return (-EINVAL);
2475 			}
2476 
2477 
2478 			if (inlen != sizeof (struct in_pktinfo)) {
2479 				return (EINVAL);
2480 			}
2481 
2482 			if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
2483 			    == NULL) {
2484 				/*
2485 				 * sticky option is not supported
2486 				 */
2487 				return (EINVAL);
2488 			}
2489 
2490 			pktinfop = (struct in_pktinfo *)invalp;
2491 
2492 			/*
2493 			 * Atleast one of the values should be specified
2494 			 */
2495 			if (pktinfop->ipi_ifindex == 0 &&
2496 			    pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
2497 				return (EINVAL);
2498 			}
2499 
2500 			attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
2501 			attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
2502 		}
2503 			break;
2504 		case IP_ADD_MEMBERSHIP:
2505 		case IP_DROP_MEMBERSHIP:
2506 		case IP_BLOCK_SOURCE:
2507 		case IP_UNBLOCK_SOURCE:
2508 		case IP_ADD_SOURCE_MEMBERSHIP:
2509 		case IP_DROP_SOURCE_MEMBERSHIP:
2510 		case MCAST_JOIN_GROUP:
2511 		case MCAST_LEAVE_GROUP:
2512 		case MCAST_BLOCK_SOURCE:
2513 		case MCAST_UNBLOCK_SOURCE:
2514 		case MCAST_JOIN_SOURCE_GROUP:
2515 		case MCAST_LEAVE_SOURCE_GROUP:
2516 		case MRT_INIT:
2517 		case MRT_DONE:
2518 		case MRT_ADD_VIF:
2519 		case MRT_DEL_VIF:
2520 		case MRT_ADD_MFC:
2521 		case MRT_DEL_MFC:
2522 		case MRT_VERSION:
2523 		case MRT_ASSERT:
2524 		case IP_SEC_OPT:
2525 		case IP_NEXTHOP:
2526 			/*
2527 			 * "soft" error (negative)
2528 			 * option not handled at this level
2529 			 * Note: Do not modify *outlenp
2530 			 */
2531 			return (-EINVAL);
2532 		default:
2533 			*outlenp = 0;
2534 			return (EINVAL);
2535 		}
2536 		break;
2537 	case IPPROTO_IPV6: {
2538 		ip6_pkt_t		*ipp;
2539 		boolean_t		sticky;
2540 
2541 		if (icmp->icmp_family != AF_INET6) {
2542 			*outlenp = 0;
2543 			return (ENOPROTOOPT);
2544 		}
2545 		/*
2546 		 * Deal with both sticky options and ancillary data
2547 		 */
2548 		if (thisdg_attrs == NULL) {
2549 			/* sticky options, or none */
2550 			ipp = &icmp->icmp_sticky_ipp;
2551 			sticky = B_TRUE;
2552 		} else {
2553 			/* ancillary data */
2554 			ipp = (ip6_pkt_t *)thisdg_attrs;
2555 			sticky = B_FALSE;
2556 		}
2557 
2558 		switch (name) {
2559 		case IPV6_MULTICAST_IF:
2560 			if (!checkonly) {
2561 				icmp->icmp_multicast_if_index = *i1;
2562 				PASS_OPT_TO_IP(connp);
2563 			}
2564 			break;
2565 		case IPV6_UNICAST_HOPS:
2566 			/* -1 means use default */
2567 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2568 				*outlenp = 0;
2569 				return (EINVAL);
2570 			}
2571 			if (!checkonly) {
2572 				if (*i1 == -1) {
2573 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2574 					    is->is_ipv6_hoplimit;
2575 					ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
2576 					/* Pass modified value to IP. */
2577 					*i1 = ipp->ipp_hoplimit;
2578 				} else {
2579 					icmp->icmp_ttl = ipp->ipp_unicast_hops =
2580 					    (uint8_t)*i1;
2581 					ipp->ipp_fields |= IPPF_UNICAST_HOPS;
2582 				}
2583 				/* Rebuild the header template */
2584 				error = icmp_build_hdrs(icmp);
2585 				if (error != 0) {
2586 					*outlenp = 0;
2587 					return (error);
2588 				}
2589 			}
2590 			break;
2591 		case IPV6_MULTICAST_HOPS:
2592 			/* -1 means use default */
2593 			if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
2594 				*outlenp = 0;
2595 				return (EINVAL);
2596 			}
2597 			if (!checkonly) {
2598 				if (*i1 == -1) {
2599 					icmp->icmp_multicast_ttl =
2600 					    ipp->ipp_multicast_hops =
2601 					    IP_DEFAULT_MULTICAST_TTL;
2602 					ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
2603 					/* Pass modified value to IP. */
2604 					*i1 = icmp->icmp_multicast_ttl;
2605 				} else {
2606 					icmp->icmp_multicast_ttl =
2607 					    ipp->ipp_multicast_hops =
2608 					    (uint8_t)*i1;
2609 					ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
2610 				}
2611 			}
2612 			break;
2613 		case IPV6_MULTICAST_LOOP:
2614 			if (*i1 != 0 && *i1 != 1) {
2615 				*outlenp = 0;
2616 				return (EINVAL);
2617 			}
2618 			if (!checkonly) {
2619 				connp->conn_multicast_loop = *i1;
2620 				PASS_OPT_TO_IP(connp);
2621 			}
2622 			break;
2623 		case IPV6_CHECKSUM:
2624 			/*
2625 			 * Integer offset into the user data of where the
2626 			 * checksum is located.
2627 			 * Offset of -1 disables option.
2628 			 * Does not apply to IPPROTO_ICMPV6.
2629 			 */
2630 			if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) {
2631 				*outlenp = 0;
2632 				return (EINVAL);
2633 			}
2634 			if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2635 				/* Negative or not 16 bit aligned offset */
2636 				*outlenp = 0;
2637 				return (EINVAL);
2638 			}
2639 			if (checkonly)
2640 				break;
2641 
2642 			if (*i1 == -1) {
2643 				icmp->icmp_raw_checksum = 0;
2644 				ipp->ipp_fields &= ~IPPF_RAW_CKSUM;
2645 			} else {
2646 				icmp->icmp_raw_checksum = 1;
2647 				icmp->icmp_checksum_off = *i1;
2648 				ipp->ipp_fields |= IPPF_RAW_CKSUM;
2649 			}
2650 			/* Rebuild the header template */
2651 			error = icmp_build_hdrs(icmp);
2652 			if (error != 0) {
2653 				*outlenp = 0;
2654 				return (error);
2655 			}
2656 			break;
2657 		case IPV6_JOIN_GROUP:
2658 		case IPV6_LEAVE_GROUP:
2659 		case MCAST_JOIN_GROUP:
2660 		case MCAST_LEAVE_GROUP:
2661 		case MCAST_BLOCK_SOURCE:
2662 		case MCAST_UNBLOCK_SOURCE:
2663 		case MCAST_JOIN_SOURCE_GROUP:
2664 		case MCAST_LEAVE_SOURCE_GROUP:
2665 			/*
2666 			 * "soft" error (negative)
2667 			 * option not handled at this level
2668 			 * Note: Do not modify *outlenp
2669 			 */
2670 			return (-EINVAL);
2671 		case IPV6_BOUND_IF:
2672 			if (!checkonly) {
2673 				icmp->icmp_bound_if = *i1;
2674 				PASS_OPT_TO_IP(connp);
2675 			}
2676 			break;
2677 		case IPV6_UNSPEC_SRC:
2678 			if (!checkonly) {
2679 				icmp->icmp_unspec_source = onoff;
2680 				PASS_OPT_TO_IP(connp);
2681 			}
2682 			break;
2683 		case IPV6_RECVTCLASS:
2684 			if (!checkonly) {
2685 				icmp->icmp_ipv6_recvtclass = onoff;
2686 				PASS_OPT_TO_IP(connp);
2687 			}
2688 			break;
2689 		/*
2690 		 * Set boolean switches for ancillary data delivery
2691 		 */
2692 		case IPV6_RECVPKTINFO:
2693 			if (!checkonly) {
2694 				icmp->icmp_ip_recvpktinfo = onoff;
2695 				PASS_OPT_TO_IP(connp);
2696 			}
2697 			break;
2698 		case IPV6_RECVPATHMTU:
2699 			if (!checkonly) {
2700 				icmp->icmp_ipv6_recvpathmtu = onoff;
2701 				PASS_OPT_TO_IP(connp);
2702 			}
2703 			break;
2704 		case IPV6_RECVHOPLIMIT:
2705 			if (!checkonly) {
2706 				icmp->icmp_ipv6_recvhoplimit = onoff;
2707 				PASS_OPT_TO_IP(connp);
2708 			}
2709 			break;
2710 		case IPV6_RECVHOPOPTS:
2711 			if (!checkonly) {
2712 				icmp->icmp_ipv6_recvhopopts = onoff;
2713 				PASS_OPT_TO_IP(connp);
2714 			}
2715 			break;
2716 		case IPV6_RECVDSTOPTS:
2717 			if (!checkonly) {
2718 				icmp->icmp_ipv6_recvdstopts = onoff;
2719 				PASS_OPT_TO_IP(connp);
2720 			}
2721 			break;
2722 		case _OLD_IPV6_RECVDSTOPTS:
2723 			if (!checkonly)
2724 				icmp->icmp_old_ipv6_recvdstopts = onoff;
2725 			break;
2726 		case IPV6_RECVRTHDRDSTOPTS:
2727 			if (!checkonly) {
2728 				icmp->icmp_ipv6_recvrtdstopts = onoff;
2729 				PASS_OPT_TO_IP(connp);
2730 			}
2731 			break;
2732 		case IPV6_RECVRTHDR:
2733 			if (!checkonly) {
2734 				icmp->icmp_ipv6_recvrthdr = onoff;
2735 				PASS_OPT_TO_IP(connp);
2736 			}
2737 			break;
2738 		/*
2739 		 * Set sticky options or ancillary data.
2740 		 * If sticky options, (re)build any extension headers
2741 		 * that might be needed as a result.
2742 		 */
2743 		case IPV6_PKTINFO:
2744 			/*
2745 			 * The source address and ifindex are verified
2746 			 * in ip_opt_set(). For ancillary data the
2747 			 * source address is checked in ip_wput_v6.
2748 			 */
2749 			if (inlen != 0 && inlen !=
2750 			    sizeof (struct in6_pktinfo)) {
2751 				return (EINVAL);
2752 			}
2753 			if (checkonly)
2754 				break;
2755 
2756 			if (inlen == 0) {
2757 				ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
2758 				ipp->ipp_sticky_ignored |=
2759 				    (IPPF_IFINDEX|IPPF_ADDR);
2760 			} else {
2761 				struct in6_pktinfo *pkti;
2762 
2763 				pkti = (struct in6_pktinfo *)invalp;
2764 				ipp->ipp_ifindex = pkti->ipi6_ifindex;
2765 				ipp->ipp_addr = pkti->ipi6_addr;
2766 				if (ipp->ipp_ifindex != 0)
2767 					ipp->ipp_fields |= IPPF_IFINDEX;
2768 				else
2769 					ipp->ipp_fields &= ~IPPF_IFINDEX;
2770 				if (!IN6_IS_ADDR_UNSPECIFIED(
2771 				    &ipp->ipp_addr))
2772 					ipp->ipp_fields |= IPPF_ADDR;
2773 				else
2774 					ipp->ipp_fields &= ~IPPF_ADDR;
2775 			}
2776 			if (sticky) {
2777 				error = icmp_build_hdrs(icmp);
2778 				if (error != 0)
2779 					return (error);
2780 				PASS_OPT_TO_IP(connp);
2781 			}
2782 			break;
2783 		case IPV6_HOPLIMIT:
2784 			/* This option can only be used as ancillary data. */
2785 			if (sticky)
2786 				return (EINVAL);
2787 			if (inlen != 0 && inlen != sizeof (int))
2788 				return (EINVAL);
2789 			if (checkonly)
2790 				break;
2791 
2792 			if (inlen == 0) {
2793 				ipp->ipp_fields &= ~IPPF_HOPLIMIT;
2794 				ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
2795 			} else {
2796 				if (*i1 > 255 || *i1 < -1)
2797 					return (EINVAL);
2798 				if (*i1 == -1)
2799 					ipp->ipp_hoplimit =
2800 					    is->is_ipv6_hoplimit;
2801 				else
2802 					ipp->ipp_hoplimit = *i1;
2803 				ipp->ipp_fields |= IPPF_HOPLIMIT;
2804 			}
2805 			break;
2806 		case IPV6_TCLASS:
2807 			/*
2808 			 * IPV6_RECVTCLASS accepts -1 as use kernel default
2809 			 * and [0, 255] as the actualy traffic class.
2810 			 */
2811 			if (inlen != 0 && inlen != sizeof (int)) {
2812 				return (EINVAL);
2813 			}
2814 			if (checkonly)
2815 				break;
2816 
2817 			if (inlen == 0) {
2818 				ipp->ipp_fields &= ~IPPF_TCLASS;
2819 				ipp->ipp_sticky_ignored |= IPPF_TCLASS;
2820 			} else {
2821 				if (*i1 >= 256 || *i1 < -1)
2822 					return (EINVAL);
2823 				if (*i1 == -1) {
2824 					ipp->ipp_tclass =
2825 					    IPV6_FLOW_TCLASS(
2826 					    IPV6_DEFAULT_VERS_AND_FLOW);
2827 				} else {
2828 					ipp->ipp_tclass = *i1;
2829 				}
2830 				ipp->ipp_fields |= IPPF_TCLASS;
2831 			}
2832 			if (sticky) {
2833 				error = icmp_build_hdrs(icmp);
2834 				if (error != 0)
2835 					return (error);
2836 			}
2837 			break;
2838 		case IPV6_NEXTHOP:
2839 			/*
2840 			 * IP will verify that the nexthop is reachable
2841 			 * and fail for sticky options.
2842 			 */
2843 			if (inlen != 0 && inlen != sizeof (sin6_t)) {
2844 				return (EINVAL);
2845 			}
2846 			if (checkonly)
2847 				break;
2848 
2849 			if (inlen == 0) {
2850 				ipp->ipp_fields &= ~IPPF_NEXTHOP;
2851 				ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
2852 			} else {
2853 				sin6_t *sin6 = (sin6_t *)invalp;
2854 
2855 				if (sin6->sin6_family != AF_INET6) {
2856 					return (EAFNOSUPPORT);
2857 				}
2858 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
2859 					return (EADDRNOTAVAIL);
2860 				}
2861 				ipp->ipp_nexthop = sin6->sin6_addr;
2862 				if (!IN6_IS_ADDR_UNSPECIFIED(
2863 				    &ipp->ipp_nexthop))
2864 					ipp->ipp_fields |= IPPF_NEXTHOP;
2865 				else
2866 					ipp->ipp_fields &= ~IPPF_NEXTHOP;
2867 			}
2868 			if (sticky) {
2869 				error = icmp_build_hdrs(icmp);
2870 				if (error != 0)
2871 					return (error);
2872 				PASS_OPT_TO_IP(connp);
2873 			}
2874 			break;
2875 		case IPV6_HOPOPTS: {
2876 			ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
2877 			/*
2878 			 * Sanity checks - minimum size, size a multiple of
2879 			 * eight bytes, and matching size passed in.
2880 			 */
2881 			if (inlen != 0 &&
2882 			    inlen != (8 * (hopts->ip6h_len + 1))) {
2883 				return (EINVAL);
2884 			}
2885 
2886 			if (checkonly)
2887 				break;
2888 			error = optcom_pkt_set(invalp, inlen, sticky,
2889 			    (uchar_t **)&ipp->ipp_hopopts,
2890 			    &ipp->ipp_hopoptslen,
2891 			    sticky ? icmp->icmp_label_len_v6 : 0);
2892 			if (error != 0)
2893 				return (error);
2894 			if (ipp->ipp_hopoptslen == 0) {
2895 				ipp->ipp_fields &= ~IPPF_HOPOPTS;
2896 				ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
2897 			} else {
2898 				ipp->ipp_fields |= IPPF_HOPOPTS;
2899 			}
2900 			if (sticky) {
2901 				error = icmp_build_hdrs(icmp);
2902 				if (error != 0)
2903 					return (error);
2904 			}
2905 			break;
2906 		}
2907 		case IPV6_RTHDRDSTOPTS: {
2908 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2909 
2910 			/*
2911 			 * Sanity checks - minimum size, size a multiple of
2912 			 * eight bytes, and matching size passed in.
2913 			 */
2914 			if (inlen != 0 &&
2915 			    inlen != (8 * (dopts->ip6d_len + 1)))
2916 				return (EINVAL);
2917 
2918 			if (checkonly)
2919 				break;
2920 
2921 			if (inlen == 0) {
2922 				if (sticky &&
2923 				    (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
2924 					kmem_free(ipp->ipp_rtdstopts,
2925 					    ipp->ipp_rtdstoptslen);
2926 					ipp->ipp_rtdstopts = NULL;
2927 					ipp->ipp_rtdstoptslen = 0;
2928 				}
2929 				ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
2930 				ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
2931 			} else {
2932 				error = optcom_pkt_set(invalp, inlen, sticky,
2933 				    (uchar_t **)&ipp->ipp_rtdstopts,
2934 				    &ipp->ipp_rtdstoptslen, 0);
2935 				if (error != 0)
2936 					return (error);
2937 				ipp->ipp_fields |= IPPF_RTDSTOPTS;
2938 			}
2939 			if (sticky) {
2940 				error = icmp_build_hdrs(icmp);
2941 				if (error != 0)
2942 					return (error);
2943 			}
2944 			break;
2945 		}
2946 		case IPV6_DSTOPTS: {
2947 			ip6_dest_t *dopts = (ip6_dest_t *)invalp;
2948 
2949 			/*
2950 			 * Sanity checks - minimum size, size a multiple of
2951 			 * eight bytes, and matching size passed in.
2952 			 */
2953 			if (inlen != 0 &&
2954 			    inlen != (8 * (dopts->ip6d_len + 1)))
2955 				return (EINVAL);
2956 
2957 			if (checkonly)
2958 				break;
2959 
2960 			if (inlen == 0) {
2961 				if (sticky &&
2962 				    (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
2963 					kmem_free(ipp->ipp_dstopts,
2964 					    ipp->ipp_dstoptslen);
2965 					ipp->ipp_dstopts = NULL;
2966 					ipp->ipp_dstoptslen = 0;
2967 				}
2968 				ipp->ipp_fields &= ~IPPF_DSTOPTS;
2969 				ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
2970 			} else {
2971 				error = optcom_pkt_set(invalp, inlen, sticky,
2972 				    (uchar_t **)&ipp->ipp_dstopts,
2973 				    &ipp->ipp_dstoptslen, 0);
2974 				if (error != 0)
2975 					return (error);
2976 				ipp->ipp_fields |= IPPF_DSTOPTS;
2977 			}
2978 			if (sticky) {
2979 				error = icmp_build_hdrs(icmp);
2980 				if (error != 0)
2981 					return (error);
2982 			}
2983 			break;
2984 		}
2985 		case IPV6_RTHDR: {
2986 			ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
2987 
2988 			/*
2989 			 * Sanity checks - minimum size, size a multiple of
2990 			 * eight bytes, and matching size passed in.
2991 			 */
2992 			if (inlen != 0 &&
2993 			    inlen != (8 * (rt->ip6r_len + 1)))
2994 				return (EINVAL);
2995 
2996 			if (checkonly)
2997 				break;
2998 
2999 			if (inlen == 0) {
3000 				if (sticky &&
3001 				    (ipp->ipp_fields & IPPF_RTHDR) != 0) {
3002 					kmem_free(ipp->ipp_rthdr,
3003 					    ipp->ipp_rthdrlen);
3004 					ipp->ipp_rthdr = NULL;
3005 					ipp->ipp_rthdrlen = 0;
3006 				}
3007 				ipp->ipp_fields &= ~IPPF_RTHDR;
3008 				ipp->ipp_sticky_ignored |= IPPF_RTHDR;
3009 			} else {
3010 				error = optcom_pkt_set(invalp, inlen, sticky,
3011 				    (uchar_t **)&ipp->ipp_rthdr,
3012 				    &ipp->ipp_rthdrlen, 0);
3013 				if (error != 0)
3014 					return (error);
3015 				ipp->ipp_fields |= IPPF_RTHDR;
3016 			}
3017 			if (sticky) {
3018 				error = icmp_build_hdrs(icmp);
3019 				if (error != 0)
3020 					return (error);
3021 			}
3022 			break;
3023 		}
3024 
3025 		case IPV6_DONTFRAG:
3026 			if (checkonly)
3027 				break;
3028 
3029 			if (onoff) {
3030 				ipp->ipp_fields |= IPPF_DONTFRAG;
3031 			} else {
3032 				ipp->ipp_fields &= ~IPPF_DONTFRAG;
3033 			}
3034 			break;
3035 
3036 		case IPV6_USE_MIN_MTU:
3037 			if (inlen != sizeof (int))
3038 				return (EINVAL);
3039 
3040 			if (*i1 < -1 || *i1 > 1)
3041 				return (EINVAL);
3042 
3043 			if (checkonly)
3044 				break;
3045 
3046 			ipp->ipp_fields |= IPPF_USE_MIN_MTU;
3047 			ipp->ipp_use_min_mtu = *i1;
3048 			break;
3049 
3050 		/*
3051 		 * This option can't be set.  Its only returned via
3052 		 * getsockopt() or ancillary data.
3053 		 */
3054 		case IPV6_PATHMTU:
3055 			return (EINVAL);
3056 
3057 		case IPV6_SEC_OPT:
3058 		case IPV6_SRC_PREFERENCES:
3059 		case IPV6_V6ONLY:
3060 			/* Handled at IP level */
3061 			return (-EINVAL);
3062 		default:
3063 			*outlenp = 0;
3064 			return (EINVAL);
3065 		}
3066 		break;
3067 	}		/* end IPPROTO_IPV6 */
3068 
3069 	case IPPROTO_ICMPV6:
3070 		/*
3071 		 * Only allow IPv6 option processing on IPv6 sockets.
3072 		 */
3073 		if (icmp->icmp_family != AF_INET6) {
3074 			*outlenp = 0;
3075 			return (ENOPROTOOPT);
3076 		}
3077 		if (icmp->icmp_proto != IPPROTO_ICMPV6) {
3078 			*outlenp = 0;
3079 			return (ENOPROTOOPT);
3080 		}
3081 		switch (name) {
3082 		case ICMP6_FILTER:
3083 			if (!checkonly) {
3084 				if ((inlen != 0) &&
3085 				    (inlen != sizeof (icmp6_filter_t)))
3086 					return (EINVAL);
3087 
3088 				if (inlen == 0) {
3089 					if (icmp->icmp_filter != NULL) {
3090 						kmem_free(icmp->icmp_filter,
3091 						    sizeof (icmp6_filter_t));
3092 						icmp->icmp_filter = NULL;
3093 					}
3094 				} else {
3095 					if (icmp->icmp_filter == NULL) {
3096 						icmp->icmp_filter = kmem_alloc(
3097 						    sizeof (icmp6_filter_t),
3098 						    KM_NOSLEEP);
3099 						if (icmp->icmp_filter == NULL) {
3100 							*outlenp = 0;
3101 							return (ENOBUFS);
3102 						}
3103 					}
3104 					(void) bcopy(invalp, icmp->icmp_filter,
3105 					    inlen);
3106 				}
3107 			}
3108 			break;
3109 
3110 		default:
3111 			*outlenp = 0;
3112 			return (EINVAL);
3113 		}
3114 		break;
3115 	default:
3116 		*outlenp = 0;
3117 		return (EINVAL);
3118 	}
3119 	/*
3120 	 * Common case of OK return with outval same as inval.
3121 	 */
3122 	if (invalp != outvalp) {
3123 		/* don't trust bcopy for identical src/dst */
3124 		(void) bcopy(invalp, outvalp, inlen);
3125 	}
3126 	*outlenp = inlen;
3127 	return (0);
3128 }
3129 
3130 /* This routine sets socket options. */
3131 /* ARGSUSED */
3132 int
3133 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
3134     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3135     void *thisdg_attrs, cred_t *cr)
3136 {
3137 	boolean_t checkonly;
3138 	int	error;
3139 
3140 	error = 0;
3141 	switch (optset_context) {
3142 	case SETFN_OPTCOM_CHECKONLY:
3143 		checkonly = B_TRUE;
3144 		/*
3145 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
3146 		 * inlen != 0 implies value supplied and
3147 		 * 	we have to "pretend" to set it.
3148 		 * inlen == 0 implies that there is no
3149 		 * 	value part in T_CHECK request and just validation
3150 		 * done elsewhere should be enough, we just return here.
3151 		 */
3152 		if (inlen == 0) {
3153 			*outlenp = 0;
3154 			error = 0;
3155 			goto done;
3156 		}
3157 		break;
3158 	case SETFN_OPTCOM_NEGOTIATE:
3159 		checkonly = B_FALSE;
3160 		break;
3161 	case SETFN_UD_NEGOTIATE:
3162 	case SETFN_CONN_NEGOTIATE:
3163 		checkonly = B_FALSE;
3164 		/*
3165 		 * Negotiating local and "association-related" options
3166 		 * through T_UNITDATA_REQ.
3167 		 *
3168 		 * Following routine can filter out ones we do not
3169 		 * want to be "set" this way.
3170 		 */
3171 		if (!icmp_opt_allow_udr_set(level, name)) {
3172 			*outlenp = 0;
3173 			error = EINVAL;
3174 			goto done;
3175 		}
3176 		break;
3177 	default:
3178 		/*
3179 		 * We should never get here
3180 		 */
3181 		*outlenp = 0;
3182 		error = EINVAL;
3183 		goto done;
3184 	}
3185 
3186 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
3187 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
3188 	error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
3189 	    outvalp, cr, thisdg_attrs, checkonly);
3190 
3191 done:
3192 	return (error);
3193 }
3194 
3195 /* This routine sets socket options. */
3196 /* ARGSUSED */
3197 int
3198 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
3199     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
3200     void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
3201 {
3202 	conn_t	*connp =  Q_TO_CONN(q);
3203 	icmp_t	*icmp;
3204 	int error;
3205 
3206 	icmp = connp->conn_icmp;
3207 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3208 	error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
3209 	    outlenp, outvalp, thisdg_attrs, cr);
3210 	rw_exit(&icmp->icmp_rwlock);
3211 	return (error);
3212 }
3213 
3214 /*
3215  * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl,
3216  * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum.
3217  * The headers include ip6i_t (if needed), ip6_t, and any sticky extension
3218  * headers.
3219  * Returns failure if can't allocate memory.
3220  */
3221 static int
3222 icmp_build_hdrs(icmp_t *icmp)
3223 {
3224 	icmp_stack_t *is = icmp->icmp_is;
3225 	uchar_t	*hdrs;
3226 	uint_t	hdrs_len;
3227 	ip6_t	*ip6h;
3228 	ip6i_t	*ip6i;
3229 	ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
3230 
3231 	ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
3232 	hdrs_len = ip_total_hdrs_len_v6(ipp);
3233 	ASSERT(hdrs_len != 0);
3234 	if (hdrs_len != icmp->icmp_sticky_hdrs_len) {
3235 		/* Need to reallocate */
3236 		if (hdrs_len != 0) {
3237 			hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
3238 			if (hdrs == NULL)
3239 				return (ENOMEM);
3240 		} else {
3241 			hdrs = NULL;
3242 		}
3243 		if (icmp->icmp_sticky_hdrs_len != 0) {
3244 			kmem_free(icmp->icmp_sticky_hdrs,
3245 			    icmp->icmp_sticky_hdrs_len);
3246 		}
3247 		icmp->icmp_sticky_hdrs = hdrs;
3248 		icmp->icmp_sticky_hdrs_len = hdrs_len;
3249 	}
3250 	ip_build_hdrs_v6(icmp->icmp_sticky_hdrs,
3251 	    icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto);
3252 
3253 	/* Set header fields not in ipp */
3254 	if (ipp->ipp_fields & IPPF_HAS_IP6I) {
3255 		ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs;
3256 		ip6h = (ip6_t *)&ip6i[1];
3257 
3258 		if (ipp->ipp_fields & IPPF_RAW_CKSUM) {
3259 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
3260 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
3261 		}
3262 		if (ipp->ipp_fields & IPPF_NO_CKSUM) {
3263 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
3264 		}
3265 	} else {
3266 		ip6h = (ip6_t *)icmp->icmp_sticky_hdrs;
3267 	}
3268 
3269 	if (!(ipp->ipp_fields & IPPF_ADDR))
3270 		ip6h->ip6_src = icmp->icmp_v6src;
3271 
3272 	/* Try to get everything in a single mblk */
3273 	if (hdrs_len > icmp->icmp_max_hdr_len) {
3274 		icmp->icmp_max_hdr_len = hdrs_len;
3275 		rw_exit(&icmp->icmp_rwlock);
3276 		(void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq,
3277 		    icmp->icmp_connp,
3278 		    icmp->icmp_max_hdr_len + is->is_wroff_extra);
3279 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
3280 	}
3281 	return (0);
3282 }
3283 
3284 /*
3285  * This routine retrieves the value of an ND variable in a icmpparam_t
3286  * structure.  It is called through nd_getset when a user reads the
3287  * variable.
3288  */
3289 /* ARGSUSED */
3290 static int
3291 icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
3292 {
3293 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3294 
3295 	(void) mi_mpprintf(mp, "%d", icmppa->icmp_param_value);
3296 	return (0);
3297 }
3298 
3299 /*
3300  * Walk through the param array specified registering each element with the
3301  * named dispatch (ND) handler.
3302  */
3303 static boolean_t
3304 icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt)
3305 {
3306 	for (; cnt-- > 0; icmppa++) {
3307 		if (icmppa->icmp_param_name && icmppa->icmp_param_name[0]) {
3308 			if (!nd_load(ndp, icmppa->icmp_param_name,
3309 			    icmp_param_get, icmp_param_set,
3310 			    (caddr_t)icmppa)) {
3311 				nd_free(ndp);
3312 				return (B_FALSE);
3313 			}
3314 		}
3315 	}
3316 	return (B_TRUE);
3317 }
3318 
3319 /* This routine sets an ND variable in a icmpparam_t structure. */
3320 /* ARGSUSED */
3321 static int
3322 icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
3323 {
3324 	long		new_value;
3325 	icmpparam_t	*icmppa = (icmpparam_t *)cp;
3326 
3327 	/*
3328 	 * Fail the request if the new value does not lie within the
3329 	 * required bounds.
3330 	 */
3331 	if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
3332 	    new_value < icmppa->icmp_param_min ||
3333 	    new_value > icmppa->icmp_param_max) {
3334 		return (EINVAL);
3335 	}
3336 	/* Set the new value */
3337 	icmppa->icmp_param_value = new_value;
3338 	return (0);
3339 }
3340 
3341 static mblk_t *
3342 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
3343 {
3344 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
3345 	if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
3346 		/*
3347 		 * fallback has started but messages have not been moved yet
3348 		 */
3349 		if (icmp->icmp_fallback_queue_head == NULL) {
3350 			ASSERT(icmp->icmp_fallback_queue_tail == NULL);
3351 			icmp->icmp_fallback_queue_head = mp;
3352 			icmp->icmp_fallback_queue_tail = mp;
3353 		} else {
3354 			ASSERT(icmp->icmp_fallback_queue_tail != NULL);
3355 			icmp->icmp_fallback_queue_tail->b_next = mp;
3356 			icmp->icmp_fallback_queue_tail = mp;
3357 		}
3358 		return (NULL);
3359 	} else {
3360 		/*
3361 		 * Fallback completed, let the caller putnext() the mblk.
3362 		 */
3363 		return (mp);
3364 	}
3365 }
3366 
3367 /*
3368  * Deliver data to ULP. In case we have a socket, and it's falling back to
3369  * TPI, then we'll queue the mp for later processing.
3370  */
3371 static void
3372 icmp_ulp_recv(conn_t *connp, mblk_t *mp)
3373 {
3374 
3375 	if (IPCL_IS_NONSTR(connp)) {
3376 		icmp_t *icmp = connp->conn_icmp;
3377 		int error;
3378 
3379 		if ((*connp->conn_upcalls->su_recv)
3380 		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
3381 		    NULL) < 0) {
3382 			mutex_enter(&icmp->icmp_recv_lock);
3383 			if (error == ENOSPC) {
3384 				/*
3385 				 * let's confirm while holding the lock
3386 				 */
3387 				if ((*connp->conn_upcalls->su_recv)
3388 				    (connp->conn_upper_handle, NULL, 0, 0,
3389 				    &error, NULL) < 0) {
3390 					ASSERT(error == ENOSPC);
3391 					if (error == ENOSPC) {
3392 						connp->conn_flow_cntrld =
3393 						    B_TRUE;
3394 					}
3395 				}
3396 				mutex_exit(&icmp->icmp_recv_lock);
3397 			} else {
3398 				ASSERT(error == EOPNOTSUPP);
3399 				mp = icmp_queue_fallback(icmp, mp);
3400 				mutex_exit(&icmp->icmp_recv_lock);
3401 				if (mp != NULL)
3402 					putnext(connp->conn_rq, mp);
3403 			}
3404 		}
3405 		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
3406 	} else {
3407 		putnext(connp->conn_rq, mp);
3408 	}
3409 }
3410 
3411 /*ARGSUSED2*/
3412 static void
3413 icmp_input(void *arg1, mblk_t *mp, void *arg2)
3414 {
3415 	conn_t *connp = (conn_t *)arg1;
3416 	struct T_unitdata_ind	*tudi;
3417 	uchar_t			*rptr;
3418 	icmp_t			*icmp;
3419 	icmp_stack_t		*is;
3420 	sin_t			*sin;
3421 	sin6_t			*sin6;
3422 	ip6_t			*ip6h;
3423 	ip6i_t			*ip6i;
3424 	mblk_t			*mp1;
3425 	int			hdr_len;
3426 	ipha_t			*ipha;
3427 	int			udi_size;	/* Size of T_unitdata_ind */
3428 	uint_t			ipvers;
3429 	ip6_pkt_t		ipp;
3430 	uint8_t			nexthdr;
3431 	ip_pktinfo_t		*pinfo = NULL;
3432 	mblk_t			*options_mp = NULL;
3433 	uint_t			icmp_opt = 0;
3434 	boolean_t		icmp_ipv6_recvhoplimit = B_FALSE;
3435 	uint_t			hopstrip;
3436 
3437 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
3438 
3439 	icmp = connp->conn_icmp;
3440 	is = icmp->icmp_is;
3441 	rptr = mp->b_rptr;
3442 	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
3443 	ASSERT(OK_32PTR(rptr));
3444 
3445 	/*
3446 	 * IP should have prepended the options data in an M_CTL
3447 	 * Check M_CTL "type" to make sure are not here bcos of
3448 	 * a valid ICMP message
3449 	 */
3450 	if (DB_TYPE(mp) == M_CTL) {
3451 		/*
3452 		 * FIXME: does IP still do this?
3453 		 * IP sends up the IPSEC_IN message for handling IPSEC
3454 		 * policy at the TCP level. We don't need it here.
3455 		 */
3456 		if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) {
3457 			mp1 = mp->b_cont;
3458 			freeb(mp);
3459 			mp = mp1;
3460 			rptr = mp->b_rptr;
3461 		} else if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
3462 		    ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
3463 		    IN_PKTINFO) {
3464 			/*
3465 			 * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
3466 			 * has been prepended to the packet by IP. We need to
3467 			 * extract the mblk and adjust the rptr
3468 			 */
3469 			pinfo = (ip_pktinfo_t *)mp->b_rptr;
3470 			options_mp = mp;
3471 			mp = mp->b_cont;
3472 			rptr = mp->b_rptr;
3473 		} else {
3474 			/*
3475 			 * ICMP messages.
3476 			 */
3477 			icmp_icmp_error(connp, mp);
3478 			return;
3479 		}
3480 	}
3481 
3482 	/*
3483 	 * Discard message if it is misaligned or smaller than the IP header.
3484 	 */
3485 	if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) {
3486 		freemsg(mp);
3487 		if (options_mp != NULL)
3488 			freeb(options_mp);
3489 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3490 		return;
3491 	}
3492 	ipvers = IPH_HDR_VERSION((ipha_t *)rptr);
3493 
3494 	/* Handle M_DATA messages containing IP packets messages */
3495 	if (ipvers == IPV4_VERSION) {
3496 		/*
3497 		 * Special case where IP attaches
3498 		 * the IRE needs to be handled so that we don't send up
3499 		 * IRE to the user land.
3500 		 */
3501 		ipha = (ipha_t *)rptr;
3502 		hdr_len = IPH_HDR_LENGTH(ipha);
3503 
3504 		if (ipha->ipha_protocol == IPPROTO_TCP) {
3505 			tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3506 
3507 			if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) ==
3508 			    TH_SYN) && mp->b_cont != NULL) {
3509 				mp1 = mp->b_cont;
3510 				if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3511 					freeb(mp1);
3512 					mp->b_cont = NULL;
3513 				}
3514 			}
3515 		}
3516 		if (is->is_bsd_compat) {
3517 			ushort_t len;
3518 			len = ntohs(ipha->ipha_length);
3519 
3520 			if (mp->b_datap->db_ref > 1) {
3521 				/*
3522 				 * Allocate a new IP header so that we can
3523 				 * modify ipha_length.
3524 				 */
3525 				mblk_t	*mp1;
3526 
3527 				mp1 = allocb(hdr_len, BPRI_MED);
3528 				if (!mp1) {
3529 					freemsg(mp);
3530 					if (options_mp != NULL)
3531 						freeb(options_mp);
3532 					BUMP_MIB(&is->is_rawip_mib,
3533 					    rawipInErrors);
3534 					return;
3535 				}
3536 				bcopy(rptr, mp1->b_rptr, hdr_len);
3537 				mp->b_rptr = rptr + hdr_len;
3538 				rptr = mp1->b_rptr;
3539 				ipha = (ipha_t *)rptr;
3540 				mp1->b_cont = mp;
3541 				mp1->b_wptr = rptr + hdr_len;
3542 				mp = mp1;
3543 			}
3544 			len -= hdr_len;
3545 			ipha->ipha_length = htons(len);
3546 		}
3547 	}
3548 
3549 	/*
3550 	 * This is the inbound data path.  Packets are passed upstream as
3551 	 * T_UNITDATA_IND messages with full IP headers still attached.
3552 	 */
3553 	if (icmp->icmp_family == AF_INET) {
3554 		ASSERT(ipvers == IPV4_VERSION);
3555 		udi_size =  sizeof (struct T_unitdata_ind) + sizeof (sin_t);
3556 		if (icmp->icmp_recvif && (pinfo != NULL) &&
3557 		    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3558 			udi_size += sizeof (struct T_opthdr) +
3559 			    sizeof (uint_t);
3560 		}
3561 
3562 		if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3563 		    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3564 			udi_size += sizeof (struct T_opthdr) +
3565 			    sizeof (struct in_pktinfo);
3566 		}
3567 
3568 		/*
3569 		 * If SO_TIMESTAMP is set allocate the appropriate sized
3570 		 * buffer. Since gethrestime() expects a pointer aligned
3571 		 * argument, we allocate space necessary for extra
3572 		 * alignment (even though it might not be used).
3573 		 */
3574 		if (icmp->icmp_timestamp) {
3575 			udi_size += sizeof (struct T_opthdr) +
3576 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3577 		}
3578 		mp1 = allocb(udi_size, BPRI_MED);
3579 		if (mp1 == NULL) {
3580 			freemsg(mp);
3581 			if (options_mp != NULL)
3582 				freeb(options_mp);
3583 			BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3584 			return;
3585 		}
3586 		mp1->b_cont = mp;
3587 		mp = mp1;
3588 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
3589 		mp->b_datap->db_type = M_PROTO;
3590 		mp->b_wptr = (uchar_t *)tudi + udi_size;
3591 		tudi->PRIM_type = T_UNITDATA_IND;
3592 		tudi->SRC_length = sizeof (sin_t);
3593 		tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3594 		sin = (sin_t *)&tudi[1];
3595 		*sin = sin_null;
3596 		sin->sin_family = AF_INET;
3597 		sin->sin_addr.s_addr = ipha->ipha_src;
3598 		tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
3599 		    sizeof (sin_t);
3600 		udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
3601 		tudi->OPT_length = udi_size;
3602 
3603 		/*
3604 		 * Add options if IP_RECVIF is set
3605 		 */
3606 		if (udi_size != 0) {
3607 			char *dstopt;
3608 
3609 			dstopt = (char *)&sin[1];
3610 			if (icmp->icmp_recvif && (pinfo != NULL) &&
3611 			    (pinfo->ip_pkt_flags & IPF_RECVIF)) {
3612 
3613 				struct T_opthdr *toh;
3614 				uint_t		*dstptr;
3615 
3616 				toh = (struct T_opthdr *)dstopt;
3617 				toh->level = IPPROTO_IP;
3618 				toh->name = IP_RECVIF;
3619 				toh->len = sizeof (struct T_opthdr) +
3620 				    sizeof (uint_t);
3621 				toh->status = 0;
3622 				dstopt += sizeof (struct T_opthdr);
3623 				dstptr = (uint_t *)dstopt;
3624 				*dstptr = pinfo->ip_pkt_ifindex;
3625 				dstopt += sizeof (uint_t);
3626 				udi_size -= toh->len;
3627 			}
3628 			if (icmp->icmp_timestamp) {
3629 				struct	T_opthdr *toh;
3630 
3631 				toh = (struct T_opthdr *)dstopt;
3632 				toh->level = SOL_SOCKET;
3633 				toh->name = SCM_TIMESTAMP;
3634 				toh->len = sizeof (struct T_opthdr) +
3635 				    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3636 				toh->status = 0;
3637 				dstopt += sizeof (struct T_opthdr);
3638 				/* Align for gethrestime() */
3639 				dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
3640 				    sizeof (intptr_t));
3641 				gethrestime((timestruc_t *)dstopt);
3642 				dstopt = (char *)toh + toh->len;
3643 				udi_size -= toh->len;
3644 			}
3645 			if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
3646 			    (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
3647 				struct	T_opthdr *toh;
3648 				struct	in_pktinfo *pktinfop;
3649 
3650 				toh = (struct T_opthdr *)dstopt;
3651 				toh->level = IPPROTO_IP;
3652 				toh->name = IP_PKTINFO;
3653 				toh->len = sizeof (struct T_opthdr) +
3654 				    sizeof (in_pktinfo_t);
3655 				toh->status = 0;
3656 				dstopt += sizeof (struct T_opthdr);
3657 				pktinfop = (struct in_pktinfo *)dstopt;
3658 				pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
3659 				pktinfop->ipi_spec_dst =
3660 				    pinfo->ip_pkt_match_addr;
3661 
3662 				pktinfop->ipi_addr.s_addr = ipha->ipha_dst;
3663 
3664 				dstopt += sizeof (struct in_pktinfo);
3665 				udi_size -= toh->len;
3666 			}
3667 
3668 			/* Consumed all of allocated space */
3669 			ASSERT(udi_size == 0);
3670 		}
3671 
3672 		if (options_mp != NULL)
3673 			freeb(options_mp);
3674 
3675 		BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
3676 		goto deliver;
3677 	}
3678 
3679 	/*
3680 	 * We don't need options_mp in the IPv6 path.
3681 	 */
3682 	if (options_mp != NULL) {
3683 		freeb(options_mp);
3684 		options_mp = NULL;
3685 	}
3686 
3687 	/*
3688 	 * Discard message if it is smaller than the IPv6 header
3689 	 * or if the header is malformed.
3690 	 */
3691 	if ((mp->b_wptr - rptr) < sizeof (ip6_t) ||
3692 	    IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION ||
3693 	    icmp->icmp_family != AF_INET6) {
3694 		freemsg(mp);
3695 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3696 		return;
3697 	}
3698 
3699 	/* Initialize */
3700 	ipp.ipp_fields = 0;
3701 	hopstrip = 0;
3702 
3703 	ip6h = (ip6_t *)rptr;
3704 	/*
3705 	 * Call on ip_find_hdr_v6 which gets the total hdr len
3706 	 * as well as individual lenghts of ext hdrs (and ptrs to
3707 	 * them).
3708 	 */
3709 	if (ip6h->ip6_nxt != icmp->icmp_proto) {
3710 		/* Look for ifindex information */
3711 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3712 			ip6i = (ip6i_t *)ip6h;
3713 			if (ip6i->ip6i_flags & IP6I_IFINDEX) {
3714 				ASSERT(ip6i->ip6i_ifindex != 0);
3715 				ipp.ipp_fields |= IPPF_IFINDEX;
3716 				ipp.ipp_ifindex = ip6i->ip6i_ifindex;
3717 			}
3718 			rptr = (uchar_t *)&ip6i[1];
3719 			mp->b_rptr = rptr;
3720 			if (rptr == mp->b_wptr) {
3721 				mp1 = mp->b_cont;
3722 				freeb(mp);
3723 				mp = mp1;
3724 				rptr = mp->b_rptr;
3725 			}
3726 			ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN);
3727 			ip6h = (ip6_t *)rptr;
3728 		}
3729 		hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr);
3730 
3731 		/*
3732 		 * We need to lie a bit to the user because users inside
3733 		 * labeled compartments should not see their own labels.  We
3734 		 * assume that in all other respects IP has checked the label,
3735 		 * and that the label is always first among the options.  (If
3736 		 * it's not first, then this code won't see it, and the option
3737 		 * will be passed along to the user.)
3738 		 *
3739 		 * If we had multilevel ICMP sockets, then the following code
3740 		 * should be skipped for them to allow the user to see the
3741 		 * label.
3742 		 *
3743 		 * Alignment restrictions in the definition of IP options
3744 		 * (namely, the requirement that the 4-octet DOI goes on a
3745 		 * 4-octet boundary) mean that we know exactly where the option
3746 		 * should start, but we're lenient for other hosts.
3747 		 *
3748 		 * Note that there are no multilevel ICMP or raw IP sockets
3749 		 * yet, thus nobody ever sees the IP6OPT_LS option.
3750 		 */
3751 		if ((ipp.ipp_fields & IPPF_HOPOPTS) &&
3752 		    ipp.ipp_hopoptslen > 5 && is_system_labeled()) {
3753 			const uchar_t *ucp =
3754 			    (const uchar_t *)ipp.ipp_hopopts + 2;
3755 			int remlen = ipp.ipp_hopoptslen - 2;
3756 
3757 			while (remlen > 0) {
3758 				if (*ucp == IP6OPT_PAD1) {
3759 					remlen--;
3760 					ucp++;
3761 				} else if (*ucp == IP6OPT_PADN) {
3762 					remlen -= ucp[1] + 2;
3763 					ucp += ucp[1] + 2;
3764 				} else if (*ucp == ip6opt_ls) {
3765 					hopstrip = (ucp -
3766 					    (const uchar_t *)ipp.ipp_hopopts) +
3767 					    ucp[1] + 2;
3768 					hopstrip = (hopstrip + 7) & ~7;
3769 					break;
3770 				} else {
3771 					/* label option must be first */
3772 					break;
3773 				}
3774 			}
3775 		}
3776 	} else {
3777 		hdr_len = IPV6_HDR_LEN;
3778 		ip6i = NULL;
3779 		nexthdr = ip6h->ip6_nxt;
3780 	}
3781 	/*
3782 	 * One special case where IP attaches the IRE needs to
3783 	 * be handled so that we don't send up IRE to the user land.
3784 	 */
3785 	if (nexthdr == IPPROTO_TCP) {
3786 		tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
3787 
3788 		if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) &&
3789 		    mp->b_cont != NULL) {
3790 			mp1 = mp->b_cont;
3791 			if (mp1->b_datap->db_type == IRE_DB_TYPE) {
3792 				freeb(mp1);
3793 				mp->b_cont = NULL;
3794 			}
3795 		}
3796 	}
3797 	/*
3798 	 * Check a filter for ICMPv6 types if needed.
3799 	 * Verify raw checksums if needed.
3800 	 */
3801 	if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) {
3802 		if (icmp->icmp_filter != NULL) {
3803 			int type;
3804 
3805 			/* Assumes that IP has done the pullupmsg */
3806 			type = mp->b_rptr[hdr_len];
3807 
3808 			ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr);
3809 			if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
3810 				freemsg(mp);
3811 				return;
3812 			}
3813 		} else {
3814 			/* Checksum */
3815 			uint16_t	*up;
3816 			uint32_t	sum;
3817 			int		remlen;
3818 
3819 			up = (uint16_t *)&ip6h->ip6_src;
3820 
3821 			remlen = msgdsize(mp) - hdr_len;
3822 			sum = htons(icmp->icmp_proto + remlen)
3823 			    + up[0] + up[1] + up[2] + up[3]
3824 			    + up[4] + up[5] + up[6] + up[7]
3825 			    + up[8] + up[9] + up[10] + up[11]
3826 			    + up[12] + up[13] + up[14] + up[15];
3827 			sum = (sum & 0xffff) + (sum >> 16);
3828 			sum = IP_CSUM(mp, hdr_len, sum);
3829 			if (sum != 0) {
3830 				/* IPv6 RAW checksum failed */
3831 				ip0dbg(("icmp_rput: RAW checksum "
3832 				    "failed %x\n", sum));
3833 				freemsg(mp);
3834 				BUMP_MIB(&is->is_rawip_mib,
3835 				    rawipInCksumErrs);
3836 				return;
3837 			}
3838 		}
3839 	}
3840 	/* Skip all the IPv6 headers per API */
3841 	mp->b_rptr += hdr_len;
3842 
3843 	udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3844 
3845 	/*
3846 	 * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to
3847 	 * maintain state information, instead of relying on icmp_t
3848 	 * structure, since there arent any locks protecting these members
3849 	 * and there is a window where there might be a race between a
3850 	 * thread setting options on the write side and a thread reading
3851 	 * these options on the read size.
3852 	 */
3853 	if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
3854 	    IPPF_RTHDR|IPPF_IFINDEX)) {
3855 		if (icmp->icmp_ipv6_recvhopopts &&
3856 		    (ipp.ipp_fields & IPPF_HOPOPTS) &&
3857 		    ipp.ipp_hopoptslen > hopstrip) {
3858 			udi_size += sizeof (struct T_opthdr) +
3859 			    ipp.ipp_hopoptslen - hopstrip;
3860 			icmp_opt |= IPPF_HOPOPTS;
3861 		}
3862 		if ((icmp->icmp_ipv6_recvdstopts ||
3863 		    icmp->icmp_old_ipv6_recvdstopts) &&
3864 		    (ipp.ipp_fields & IPPF_DSTOPTS)) {
3865 			udi_size += sizeof (struct T_opthdr) +
3866 			    ipp.ipp_dstoptslen;
3867 			icmp_opt |= IPPF_DSTOPTS;
3868 		}
3869 		if (((icmp->icmp_ipv6_recvdstopts &&
3870 		    icmp->icmp_ipv6_recvrthdr &&
3871 		    (ipp.ipp_fields & IPPF_RTHDR)) ||
3872 		    icmp->icmp_ipv6_recvrtdstopts) &&
3873 		    (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
3874 			udi_size += sizeof (struct T_opthdr) +
3875 			    ipp.ipp_rtdstoptslen;
3876 			icmp_opt |= IPPF_RTDSTOPTS;
3877 		}
3878 		if (icmp->icmp_ipv6_recvrthdr &&
3879 		    (ipp.ipp_fields & IPPF_RTHDR)) {
3880 			udi_size += sizeof (struct T_opthdr) +
3881 			    ipp.ipp_rthdrlen;
3882 			icmp_opt |= IPPF_RTHDR;
3883 		}
3884 		if (icmp->icmp_ip_recvpktinfo &&
3885 		    (ipp.ipp_fields & IPPF_IFINDEX)) {
3886 			udi_size += sizeof (struct T_opthdr) +
3887 			    sizeof (struct in6_pktinfo);
3888 			icmp_opt |= IPPF_IFINDEX;
3889 		}
3890 	}
3891 	if (icmp->icmp_ipv6_recvhoplimit) {
3892 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3893 		icmp_ipv6_recvhoplimit = B_TRUE;
3894 	}
3895 
3896 	if (icmp->icmp_ipv6_recvtclass)
3897 		udi_size += sizeof (struct T_opthdr) + sizeof (int);
3898 
3899 	/*
3900 	 * If SO_TIMESTAMP is set allocate the appropriate sized
3901 	 * buffer. Since gethrestime() expects a pointer aligned
3902 	 * argument, we allocate space necessary for extra
3903 	 * alignment (even though it might not be used).
3904 	 */
3905 	if (icmp->icmp_timestamp) {
3906 		udi_size += sizeof (struct T_opthdr) +
3907 		    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
3908 	}
3909 
3910 	mp1 = allocb(udi_size, BPRI_MED);
3911 	if (mp1 == NULL) {
3912 		freemsg(mp);
3913 		BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
3914 		return;
3915 	}
3916 	mp1->b_cont = mp;
3917 	mp = mp1;
3918 	mp->b_datap->db_type = M_PROTO;
3919 	tudi = (struct T_unitdata_ind *)mp->b_rptr;
3920 	mp->b_wptr = (uchar_t *)tudi + udi_size;
3921 	tudi->PRIM_type = T_UNITDATA_IND;
3922 	tudi->SRC_length = sizeof (sin6_t);
3923 	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
3924 	tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
3925 	udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
3926 	tudi->OPT_length = udi_size;
3927 	sin6 = (sin6_t *)&tudi[1];
3928 	sin6->sin6_port = 0;
3929 	sin6->sin6_family = AF_INET6;
3930 
3931 	sin6->sin6_addr = ip6h->ip6_src;
3932 	/* No sin6_flowinfo per API */
3933 	sin6->sin6_flowinfo = 0;
3934 	/* For link-scope source pass up scope id */
3935 	if ((ipp.ipp_fields & IPPF_IFINDEX) &&
3936 	    IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
3937 		sin6->sin6_scope_id = ipp.ipp_ifindex;
3938 	else
3939 		sin6->sin6_scope_id = 0;
3940 
3941 	sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
3942 	    icmp->icmp_zoneid, is->is_netstack);
3943 
3944 	if (udi_size != 0) {
3945 		uchar_t *dstopt;
3946 
3947 		dstopt = (uchar_t *)&sin6[1];
3948 		if (icmp_opt & IPPF_IFINDEX) {
3949 			struct T_opthdr *toh;
3950 			struct in6_pktinfo *pkti;
3951 
3952 			toh = (struct T_opthdr *)dstopt;
3953 			toh->level = IPPROTO_IPV6;
3954 			toh->name = IPV6_PKTINFO;
3955 			toh->len = sizeof (struct T_opthdr) +
3956 			    sizeof (*pkti);
3957 			toh->status = 0;
3958 			dstopt += sizeof (struct T_opthdr);
3959 			pkti = (struct in6_pktinfo *)dstopt;
3960 			pkti->ipi6_addr = ip6h->ip6_dst;
3961 			pkti->ipi6_ifindex = ipp.ipp_ifindex;
3962 			dstopt += sizeof (*pkti);
3963 			udi_size -= toh->len;
3964 		}
3965 		if (icmp_ipv6_recvhoplimit) {
3966 			struct T_opthdr *toh;
3967 
3968 			toh = (struct T_opthdr *)dstopt;
3969 			toh->level = IPPROTO_IPV6;
3970 			toh->name = IPV6_HOPLIMIT;
3971 			toh->len = sizeof (struct T_opthdr) +
3972 			    sizeof (uint_t);
3973 			toh->status = 0;
3974 			dstopt += sizeof (struct T_opthdr);
3975 			*(uint_t *)dstopt = ip6h->ip6_hops;
3976 			dstopt += sizeof (uint_t);
3977 			udi_size -= toh->len;
3978 		}
3979 		if (icmp->icmp_ipv6_recvtclass) {
3980 			struct T_opthdr *toh;
3981 
3982 			toh = (struct T_opthdr *)dstopt;
3983 			toh->level = IPPROTO_IPV6;
3984 			toh->name = IPV6_TCLASS;
3985 			toh->len = sizeof (struct T_opthdr) +
3986 			    sizeof (uint_t);
3987 			toh->status = 0;
3988 			dstopt += sizeof (struct T_opthdr);
3989 			*(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
3990 			dstopt += sizeof (uint_t);
3991 			udi_size -= toh->len;
3992 		}
3993 		if (icmp->icmp_timestamp) {
3994 			struct  T_opthdr *toh;
3995 
3996 			toh = (struct T_opthdr *)dstopt;
3997 			toh->level = SOL_SOCKET;
3998 			toh->name = SCM_TIMESTAMP;
3999 			toh->len = sizeof (struct T_opthdr) +
4000 			    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
4001 			toh->status = 0;
4002 			dstopt += sizeof (struct T_opthdr);
4003 			/* Align for gethrestime() */
4004 			dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt,
4005 			    sizeof (intptr_t));
4006 			gethrestime((timestruc_t *)dstopt);
4007 			dstopt = (uchar_t *)toh + toh->len;
4008 			udi_size -= toh->len;
4009 		}
4010 
4011 		if (icmp_opt & IPPF_HOPOPTS) {
4012 			struct T_opthdr *toh;
4013 
4014 			toh = (struct T_opthdr *)dstopt;
4015 			toh->level = IPPROTO_IPV6;
4016 			toh->name = IPV6_HOPOPTS;
4017 			toh->len = sizeof (struct T_opthdr) +
4018 			    ipp.ipp_hopoptslen - hopstrip;
4019 			toh->status = 0;
4020 			dstopt += sizeof (struct T_opthdr);
4021 			bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt,
4022 			    ipp.ipp_hopoptslen - hopstrip);
4023 			if (hopstrip > 0) {
4024 				/* copy next header value and fake length */
4025 				dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0];
4026 				dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] -
4027 				    hopstrip / 8;
4028 			}
4029 			dstopt += ipp.ipp_hopoptslen - hopstrip;
4030 			udi_size -= toh->len;
4031 		}
4032 		if (icmp_opt & IPPF_RTDSTOPTS) {
4033 			struct T_opthdr *toh;
4034 
4035 			toh = (struct T_opthdr *)dstopt;
4036 			toh->level = IPPROTO_IPV6;
4037 			toh->name = IPV6_DSTOPTS;
4038 			toh->len = sizeof (struct T_opthdr) +
4039 			    ipp.ipp_rtdstoptslen;
4040 			toh->status = 0;
4041 			dstopt += sizeof (struct T_opthdr);
4042 			bcopy(ipp.ipp_rtdstopts, dstopt,
4043 			    ipp.ipp_rtdstoptslen);
4044 			dstopt += ipp.ipp_rtdstoptslen;
4045 			udi_size -= toh->len;
4046 		}
4047 		if (icmp_opt & IPPF_RTHDR) {
4048 			struct T_opthdr *toh;
4049 
4050 			toh = (struct T_opthdr *)dstopt;
4051 			toh->level = IPPROTO_IPV6;
4052 			toh->name = IPV6_RTHDR;
4053 			toh->len = sizeof (struct T_opthdr) +
4054 			    ipp.ipp_rthdrlen;
4055 			toh->status = 0;
4056 			dstopt += sizeof (struct T_opthdr);
4057 			bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
4058 			dstopt += ipp.ipp_rthdrlen;
4059 			udi_size -= toh->len;
4060 		}
4061 		if (icmp_opt & IPPF_DSTOPTS) {
4062 			struct T_opthdr *toh;
4063 
4064 			toh = (struct T_opthdr *)dstopt;
4065 			toh->level = IPPROTO_IPV6;
4066 			toh->name = IPV6_DSTOPTS;
4067 			toh->len = sizeof (struct T_opthdr) +
4068 			    ipp.ipp_dstoptslen;
4069 			toh->status = 0;
4070 			dstopt += sizeof (struct T_opthdr);
4071 			bcopy(ipp.ipp_dstopts, dstopt,
4072 			    ipp.ipp_dstoptslen);
4073 			dstopt += ipp.ipp_dstoptslen;
4074 			udi_size -= toh->len;
4075 		}
4076 		/* Consumed all of allocated space */
4077 		ASSERT(udi_size == 0);
4078 	}
4079 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
4080 
4081 deliver:
4082 	icmp_ulp_recv(connp, mp);
4083 
4084 }
4085 
4086 /*
4087  * return SNMP stuff in buffer in mpdata
4088  */
4089 mblk_t *
4090 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
4091 {
4092 	mblk_t			*mpdata;
4093 	struct opthdr		*optp;
4094 	conn_t			*connp = Q_TO_CONN(q);
4095 	icmp_stack_t		*is = connp->conn_netstack->netstack_icmp;
4096 	mblk_t			*mp2ctl;
4097 
4098 	/*
4099 	 * make a copy of the original message
4100 	 */
4101 	mp2ctl = copymsg(mpctl);
4102 
4103 	if (mpctl == NULL ||
4104 	    (mpdata = mpctl->b_cont) == NULL) {
4105 		freemsg(mpctl);
4106 		freemsg(mp2ctl);
4107 		return (0);
4108 	}
4109 
4110 	/* fixed length structure for IPv4 and IPv6 counters */
4111 	optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
4112 	optp->level = EXPER_RAWIP;
4113 	optp->name = 0;
4114 	(void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
4115 	    sizeof (is->is_rawip_mib));
4116 	optp->len = msgdsize(mpdata);
4117 	qreply(q, mpctl);
4118 
4119 	return (mp2ctl);
4120 }
4121 
4122 /*
4123  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
4124  * TODO:  If this ever actually tries to set anything, it needs to be
4125  * to do the appropriate locking.
4126  */
4127 /* ARGSUSED */
4128 int
4129 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
4130     uchar_t *ptr, int len)
4131 {
4132 	switch (level) {
4133 	case EXPER_RAWIP:
4134 		return (0);
4135 	default:
4136 		return (1);
4137 	}
4138 }
4139 
4140 /*
4141  * This routine creates a T_UDERROR_IND message and passes it upstream.
4142  * The address and options are copied from the T_UNITDATA_REQ message
4143  * passed in mp.  This message is freed.
4144  */
4145 static void
4146 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
4147 {
4148 	mblk_t	*mp1;
4149 	uchar_t	*rptr = mp->b_rptr;
4150 	struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr;
4151 
4152 	mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset],
4153 	    tudr->DEST_length, (char *)&rptr[tudr->OPT_offset],
4154 	    tudr->OPT_length, err);
4155 	if (mp1)
4156 		qreply(q, mp1);
4157 	freemsg(mp);
4158 }
4159 
4160 
4161 static int
4162 rawip_do_unbind(conn_t *connp)
4163 {
4164 	icmp_t *icmp = connp->conn_icmp;
4165 
4166 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4167 	/* If a bind has not been done, we can't unbind. */
4168 	if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
4169 		rw_exit(&icmp->icmp_rwlock);
4170 		return (-TOUTSTATE);
4171 	}
4172 	icmp->icmp_pending_op = T_UNBIND_REQ;
4173 	rw_exit(&icmp->icmp_rwlock);
4174 
4175 	/*
4176 	 * Call ip to unbind
4177 	 */
4178 
4179 	ip_unbind(connp);
4180 
4181 	/*
4182 	 * Once we're unbound from IP, the pending operation may be cleared
4183 	 * here.
4184 	 */
4185 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
4186 	V6_SET_ZERO(icmp->icmp_v6src);
4187 	V6_SET_ZERO(icmp->icmp_bound_v6src);
4188 	icmp->icmp_pending_op = -1;
4189 	icmp->icmp_state = TS_UNBND;
4190 	if (icmp->icmp_family == AF_INET6)
4191 		(void) icmp_build_hdrs(icmp);
4192 	rw_exit(&icmp->icmp_rwlock);
4193 	return (0);
4194 }
4195 
4196 /*
4197  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
4198  * After some error checking, the message is passed downstream to ip.
4199  */
4200 static void
4201 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
4202 {
4203 	conn_t	*connp = Q_TO_CONN(q);
4204 	int	error;
4205 
4206 	ASSERT(mp->b_cont == NULL);
4207 	error = rawip_do_unbind(connp);
4208 	if (error) {
4209 		if (error < 0) {
4210 			icmp_err_ack(q, mp, -error, 0);
4211 		} else {
4212 			icmp_err_ack(q, mp, 0, error);
4213 		}
4214 		return;
4215 	}
4216 
4217 	/*
4218 	 * Convert mp into a T_OK_ACK
4219 	 */
4220 
4221 	mp = mi_tpi_ok_ack_alloc(mp);
4222 
4223 	/*
4224 	 * should not happen in practice... T_OK_ACK is smaller than the
4225 	 * original message.
4226 	 */
4227 	ASSERT(mp != NULL);
4228 	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
4229 	qreply(q, mp);
4230 }
4231 
4232 
4233 /*
4234  * Process IPv4 packets that already include an IP header.
4235  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
4236  * IPPROTO_IGMP).
4237  */
4238 static int
4239 icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
4240     ip4_pkt_t *pktinfop)
4241 {
4242 	icmp_stack_t *is = icmp->icmp_is;
4243 	ipha_t	*ipha;
4244 	int	ip_hdr_length;
4245 	int	tp_hdr_len;
4246 	int	error;
4247 	uchar_t	ip_snd_opt[IP_MAX_OPT_LENGTH];
4248 	uint32_t ip_snd_opt_len = 0;
4249 	mblk_t	*mp1;
4250 	uint_t	pkt_len;
4251 	ip_opt_info_t optinfo;
4252 	pid_t	cpid;
4253 	cred_t	*cr;
4254 
4255 	rw_enter(&icmp->icmp_rwlock, RW_READER);
4256 
4257 	optinfo.ip_opt_flags = 0;
4258 	optinfo.ip_opt_ill_index = 0;
4259 	ipha = (ipha_t *)mp->b_rptr;
4260 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
4261 	if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
4262 		if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
4263 			ASSERT(icmp != NULL);
4264 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4265 			freemsg(mp);
4266 			rw_exit(&icmp->icmp_rwlock);
4267 			return (0);
4268 		}
4269 		ipha = (ipha_t *)mp->b_rptr;
4270 	}
4271 	ipha->ipha_version_and_hdr_length =
4272 	    (IP_VERSION<<4) | (ip_hdr_length>>2);
4273 
4274 	/*
4275 	 * Check if our saved options are valid; update if not.
4276 	 * TSOL Note: Since we are not in WRITER mode, ICMP packets
4277 	 * to different destination may require different labels,
4278 	 * or worse, ICMP packets to same IP address may require
4279 	 * different labels due to use of shared all-zones address.
4280 	 * We use conn_lock to ensure that lastdst, ip_snd_options,
4281 	 * and ip_snd_options_len are consistent for the current
4282 	 * destination and are updated atomically.
4283 	 */
4284 	mutex_enter(&connp->conn_lock);
4285 	if (is_system_labeled()) {
4286 		/*
4287 		 * Recompute the Trusted Extensions security label if
4288 		 * we're not going to the same destination as last
4289 		 * time or the cred attached to the received mblk
4290 		 * changed.
4291 		 */
4292 		cr = msg_getcred(mp, &cpid);
4293 		if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4294 		    V4_PART_OF_V6(icmp->icmp_v6lastdst) != ipha->ipha_dst ||
4295 		    cr != icmp->icmp_last_cred) {
4296 			error = icmp_update_label(icmp, mp, ipha->ipha_dst);
4297 			if (error != 0) {
4298 				mutex_exit(&connp->conn_lock);
4299 				rw_exit(&icmp->icmp_rwlock);
4300 				return (error);
4301 			}
4302 		}
4303 		/*
4304 		 * Apply credentials with modified security label if they
4305 		 * exist. icmp_update_label() may have generated these
4306 		 * credentials for packets to unlabeled remote nodes.
4307 		 */
4308 		if (icmp->icmp_effective_cred != NULL)
4309 			mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
4310 	}
4311 
4312 	if (icmp->icmp_ip_snd_options_len > 0) {
4313 		ip_snd_opt_len = icmp->icmp_ip_snd_options_len;
4314 		bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len);
4315 	}
4316 	mutex_exit(&connp->conn_lock);
4317 
4318 	/*
4319 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4320 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4321 	 * tell IP that the application has sent a complete IP header and not
4322 	 * to compute the transport checksum nor change the DF flag.
4323 	 */
4324 	ipha->ipha_ident = IP_HDR_INCLUDED;
4325 	ipha->ipha_hdr_checksum = 0;
4326 	ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
4327 	/* Insert options if any */
4328 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4329 		/*
4330 		 * Put the IP header plus any transport header that is
4331 		 * checksumed by ip_wput into the first mblk. (ip_wput assumes
4332 		 * that at least the checksum field is in the first mblk.)
4333 		 */
4334 		switch (ipha->ipha_protocol) {
4335 		case IPPROTO_UDP:
4336 			tp_hdr_len = 8;
4337 			break;
4338 		case IPPROTO_TCP:
4339 			tp_hdr_len = 20;
4340 			break;
4341 		default:
4342 			tp_hdr_len = 0;
4343 			break;
4344 		}
4345 		/*
4346 		 * The code below assumes that IP_SIMPLE_HDR_LENGTH plus
4347 		 * tp_hdr_len bytes will be in a single mblk.
4348 		 */
4349 		if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH +
4350 		    tp_hdr_len)) {
4351 			if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH +
4352 			    tp_hdr_len)) {
4353 				BUMP_MIB(&is->is_rawip_mib,
4354 				    rawipOutErrors);
4355 				freemsg(mp);
4356 				rw_exit(&icmp->icmp_rwlock);
4357 				return (0);
4358 			}
4359 			ipha = (ipha_t *)mp->b_rptr;
4360 		}
4361 
4362 		/*
4363 		 * if the length is larger then the max allowed IP packet,
4364 		 * then send an error and abort the processing.
4365 		 */
4366 		pkt_len = ntohs(ipha->ipha_length)
4367 		    + ip_snd_opt_len;
4368 		if (pkt_len > IP_MAXPACKET) {
4369 			rw_exit(&icmp->icmp_rwlock);
4370 			return (EMSGSIZE);
4371 		}
4372 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
4373 		    tp_hdr_len, BPRI_LO))) {
4374 			rw_exit(&icmp->icmp_rwlock);
4375 			return (ENOMEM);
4376 		}
4377 		mp1->b_rptr += is->is_wroff_extra;
4378 		mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
4379 
4380 		ipha->ipha_length = htons((uint16_t)pkt_len);
4381 		bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH);
4382 
4383 		/* Copy transport header if any */
4384 		bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len);
4385 		mp1->b_wptr += tp_hdr_len;
4386 
4387 		/* Add options */
4388 		ipha = (ipha_t *)mp1->b_rptr;
4389 		bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len);
4390 
4391 		/* Drop IP header and transport header from original */
4392 		(void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len);
4393 
4394 		mp1->b_cont = mp;
4395 		mp = mp1;
4396 		/*
4397 		 * Massage source route putting first source
4398 		 * route in ipha_dst.
4399 		 */
4400 		(void) ip_massage_options(ipha, is->is_netstack);
4401 	}
4402 
4403 	if (pktinfop != NULL) {
4404 		/*
4405 		 * Over write the source address provided in the header
4406 		 */
4407 		if (pktinfop->ip4_addr != INADDR_ANY) {
4408 			ipha->ipha_src = pktinfop->ip4_addr;
4409 			optinfo.ip_opt_flags = IP_VERIFY_SRC;
4410 		}
4411 
4412 		if (pktinfop->ip4_ill_index != 0) {
4413 			optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4414 		}
4415 	}
4416 
4417 	rw_exit(&icmp->icmp_rwlock);
4418 
4419 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4420 	return (0);
4421 }
4422 
4423 static int
4424 icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
4425 {
4426 	int err;
4427 	uchar_t opt_storage[IP_MAX_OPT_LENGTH];
4428 	icmp_stack_t		*is = icmp->icmp_is;
4429 	conn_t			*connp = icmp->icmp_connp;
4430 	cred_t	*cred;
4431 	cred_t	*msg_cred;
4432 	cred_t	*effective_cred;
4433 
4434 	/*
4435 	 * All Solaris components should pass a db_credp
4436 	 * for this message, hence we ASSERT.
4437 	 * On production kernels we return an error to be robust against
4438 	 * random streams modules sitting on top of us.
4439 	 */
4440 	cred = msg_cred = msg_getcred(mp, NULL);
4441 	ASSERT(cred != NULL);
4442 	if (cred == NULL)
4443 		return (EINVAL);
4444 
4445 	/*
4446 	 * Verify the destination is allowed to receive packets at
4447 	 * the security label of the message data. check_dest()
4448 	 * may create a new effective cred for this message
4449 	 * with a modified label or label flags.
4450 	 */
4451 	if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION,
4452 	    connp->conn_mac_exempt, &effective_cred)) != 0)
4453 		goto done;
4454 	if (effective_cred != NULL)
4455 		cred = effective_cred;
4456 
4457 	/*
4458 	 * Calculate the security label to be placed in the text
4459 	 * of the message (if any).
4460 	 */
4461 	if ((err = tsol_compute_label(cred, dst, opt_storage,
4462 	    is->is_netstack->netstack_ip)) != 0)
4463 		goto done;
4464 
4465 	/*
4466 	 * Insert the security label in the cached ip options,
4467 	 * removing any old label that may exist.
4468 	 */
4469 	if ((err = tsol_update_options(&icmp->icmp_ip_snd_options,
4470 	    &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len,
4471 	    opt_storage)) != 0)
4472 		goto done;
4473 
4474 	/*
4475 	 * Save the destination address and cred we used to generate
4476 	 * the security label text.
4477 	 */
4478 	IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
4479 	if (cred != icmp->icmp_effective_cred) {
4480 		if (icmp->icmp_effective_cred != NULL)
4481 			crfree(icmp->icmp_effective_cred);
4482 		crhold(cred);
4483 		icmp->icmp_effective_cred = cred;
4484 	}
4485 
4486 	if (msg_cred != icmp->icmp_last_cred) {
4487 		if (icmp->icmp_last_cred != NULL)
4488 			crfree(icmp->icmp_last_cred);
4489 		crhold(msg_cred);
4490 		icmp->icmp_last_cred = msg_cred;
4491 	}
4492 
4493 done:
4494 	if (effective_cred != NULL)
4495 		crfree(effective_cred);
4496 
4497 	if (err != 0) {
4498 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4499 		DTRACE_PROBE4(
4500 		    tx__ip__log__drop__updatelabel__icmp,
4501 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4502 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4503 		return (err);
4504 	}
4505 	return (0);
4506 }
4507 
4508 /*
4509  * This routine handles all messages passed downstream.  It either
4510  * consumes the message or passes it downstream; it never queues a
4511  * a message.
4512  */
4513 static void
4514 icmp_wput(queue_t *q, mblk_t *mp)
4515 {
4516 	uchar_t	*rptr = mp->b_rptr;
4517 	mblk_t	*mp1;
4518 #define	tudr ((struct T_unitdata_req *)rptr)
4519 	size_t	ip_len;
4520 	conn_t	*connp = Q_TO_CONN(q);
4521 	icmp_t	*icmp = connp->conn_icmp;
4522 	icmp_stack_t *is = icmp->icmp_is;
4523 	sin6_t	*sin6;
4524 	sin_t	*sin;
4525 	ipaddr_t	v4dst;
4526 	ip4_pkt_t	pktinfo;
4527 	ip4_pkt_t	*pktinfop = &pktinfo;
4528 	ip6_pkt_t	ipp_s;  /* For ancillary data options */
4529 	ip6_pkt_t	*ipp = &ipp_s;
4530 	int error;
4531 
4532 	ipp->ipp_fields = 0;
4533 	ipp->ipp_sticky_ignored = 0;
4534 
4535 	switch (mp->b_datap->db_type) {
4536 	case M_DATA:
4537 		if (icmp->icmp_hdrincl) {
4538 			ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
4539 			error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL);
4540 			if (error != 0)
4541 				icmp_ud_err(q, mp, error);
4542 			return;
4543 		}
4544 		freemsg(mp);
4545 		return;
4546 	case M_PROTO:
4547 	case M_PCPROTO:
4548 		ip_len = mp->b_wptr - rptr;
4549 		if (ip_len >= sizeof (struct T_unitdata_req)) {
4550 			/* Expedite valid T_UNITDATA_REQ to below the switch */
4551 			if (((union T_primitives *)rptr)->type
4552 			    == T_UNITDATA_REQ)
4553 				break;
4554 		}
4555 		/* FALLTHRU */
4556 	default:
4557 		icmp_wput_other(q, mp);
4558 		return;
4559 	}
4560 
4561 	/* Handle T_UNITDATA_REQ messages here. */
4562 
4563 	mp1 = mp->b_cont;
4564 	if (mp1 == NULL) {
4565 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4566 		icmp_ud_err(q, mp, EPROTO);
4567 		return;
4568 	}
4569 
4570 	if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
4571 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4572 		icmp_ud_err(q, mp, EADDRNOTAVAIL);
4573 		return;
4574 	}
4575 
4576 	switch (icmp->icmp_family) {
4577 	case AF_INET6:
4578 		sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
4579 		if (!OK_32PTR((char *)sin6) ||
4580 		    tudr->DEST_length != sizeof (sin6_t) ||
4581 		    sin6->sin6_family != AF_INET6) {
4582 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4583 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4584 			return;
4585 		}
4586 
4587 		/* No support for mapped addresses on raw sockets */
4588 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4589 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4590 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4591 			return;
4592 		}
4593 
4594 		/*
4595 		 * Destination is a native IPv6 address.
4596 		 * Send out an IPv6 format packet.
4597 		 */
4598 		if (tudr->OPT_length != 0) {
4599 			int error;
4600 
4601 			error = 0;
4602 			if (icmp_unitdata_opt_process(q, mp, &error,
4603 			    (void *)ipp) < 0) {
4604 				/* failure */
4605 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4606 				icmp_ud_err(q, mp, error);
4607 				return;
4608 			}
4609 			ASSERT(error == 0);
4610 		}
4611 
4612 		error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp);
4613 		goto done;
4614 
4615 	case AF_INET:
4616 		sin = (sin_t *)&rptr[tudr->DEST_offset];
4617 		if (!OK_32PTR((char *)sin) ||
4618 		    tudr->DEST_length != sizeof (sin_t) ||
4619 		    sin->sin_family != AF_INET) {
4620 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4621 			icmp_ud_err(q, mp, EADDRNOTAVAIL);
4622 			return;
4623 		}
4624 		/* Extract and ipaddr */
4625 		v4dst = sin->sin_addr.s_addr;
4626 		break;
4627 
4628 	default:
4629 		ASSERT(0);
4630 	}
4631 
4632 	pktinfop->ip4_ill_index = 0;
4633 	pktinfop->ip4_addr = INADDR_ANY;
4634 
4635 	/*
4636 	 * If options passed in, feed it for verification and handling
4637 	 */
4638 	if (tudr->OPT_length != 0) {
4639 		int error;
4640 
4641 		error = 0;
4642 		if (icmp_unitdata_opt_process(q, mp, &error,
4643 		    (void *)pktinfop) < 0) {
4644 			/* failure */
4645 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4646 			icmp_ud_err(q, mp, error);
4647 			return;
4648 		}
4649 		ASSERT(error == 0);
4650 		/*
4651 		 * Note: Success in processing options.
4652 		 * mp option buffer represented by
4653 		 * OPT_length/offset now potentially modified
4654 		 * and contain option setting results
4655 		 */
4656 	}
4657 
4658 	error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop);
4659 done:
4660 	if (error != 0) {
4661 		icmp_ud_err(q, mp, error);
4662 		return;
4663 	} else {
4664 		mp->b_cont = NULL;
4665 		freeb(mp);
4666 	}
4667 }
4668 
4669 
4670 /* ARGSUSED */
4671 static void
4672 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4673 {
4674 #ifdef DEBUG
4675 	cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4676 #endif
4677 	freemsg(mp);
4678 }
4679 
4680 static int
4681 raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst,
4682     ip4_pkt_t *pktinfop)
4683 {
4684 	ipha_t	*ipha;
4685 	size_t	ip_len;
4686 	icmp_t	*icmp = connp->conn_icmp;
4687 	icmp_stack_t *is = icmp->icmp_is;
4688 	int	ip_hdr_length;
4689 	ip_opt_info_t	optinfo;
4690 	uchar_t	ip_snd_opt[IP_MAX_OPT_LENGTH];
4691 	uint32_t ip_snd_opt_len = 0;
4692 	pid_t	cpid;
4693 	cred_t	*cr;
4694 
4695 	optinfo.ip_opt_flags = 0;
4696 	optinfo.ip_opt_ill_index = 0;
4697 
4698 	if (icmp->icmp_state == TS_UNBND) {
4699 		/* If a port has not been bound to the stream, fail. */
4700 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4701 		return (EPROTO);
4702 	}
4703 
4704 	if (v4dst == INADDR_ANY)
4705 		v4dst = htonl(INADDR_LOOPBACK);
4706 
4707 	/* Protocol 255 contains full IP headers */
4708 	if (icmp->icmp_hdrincl)
4709 		return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop));
4710 
4711 	rw_enter(&icmp->icmp_rwlock, RW_READER);
4712 
4713 	/*
4714 	 * Check if our saved options are valid; update if not.
4715 	 * TSOL Note: Since we are not in WRITER mode, ICMP packets
4716 	 * to different destination may require different labels,
4717 	 * or worse, ICMP packets to same IP address may require
4718 	 * different labels due to use of shared all-zones address.
4719 	 * We use conn_lock to ensure that lastdst, ip_snd_options,
4720 	 * and ip_snd_options_len are consistent for the current
4721 	 * destination and are updated atomically.
4722 	 */
4723 	mutex_enter(&connp->conn_lock);
4724 	if (is_system_labeled()) {
4725 
4726 		/*
4727 		 * Recompute the Trusted Extensions security label if we're not
4728 		 * going to the same destination as last time or the cred
4729 		 * attached to the received mblk changed.
4730 		 */
4731 		cr = msg_getcred(mp, &cpid);
4732 		if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
4733 		    V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst ||
4734 		    cr != icmp->icmp_last_cred) {
4735 			int error = icmp_update_label(icmp, mp, v4dst);
4736 			if (error != 0) {
4737 				mutex_exit(&connp->conn_lock);
4738 				rw_exit(&icmp->icmp_rwlock);
4739 				return (error);
4740 			}
4741 		}
4742 		/*
4743 		 * Apply credentials with modified security label if they
4744 		 * exist. icmp_update_label() may have generated these
4745 		 * credentials for packets to unlabeled remote nodes.
4746 		 */
4747 		if (icmp->icmp_effective_cred != NULL)
4748 			mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
4749 	}
4750 
4751 	if (icmp->icmp_ip_snd_options_len > 0) {
4752 		ip_snd_opt_len = icmp->icmp_ip_snd_options_len;
4753 		bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len);
4754 	}
4755 	mutex_exit(&connp->conn_lock);
4756 
4757 	/* Add an IP header */
4758 	ip_hdr_length = IP_SIMPLE_HDR_LENGTH + ip_snd_opt_len;
4759 	ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length];
4760 	if ((uchar_t *)ipha < mp->b_datap->db_base ||
4761 	    mp->b_datap->db_ref != 1 ||
4762 	    !OK_32PTR(ipha)) {
4763 		mblk_t	*mp1;
4764 		if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
4765 		    BPRI_LO))) {
4766 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4767 			rw_exit(&icmp->icmp_rwlock);
4768 			return (ENOMEM);
4769 		}
4770 		mp1->b_cont = mp;
4771 		ipha = (ipha_t *)mp1->b_datap->db_lim;
4772 		mp1->b_wptr = (uchar_t *)ipha;
4773 		ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
4774 		mp = mp1;
4775 	}
4776 #ifdef	_BIG_ENDIAN
4777 	/* Set version, header length, and tos */
4778 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4779 	    ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
4780 	    icmp->icmp_type_of_service);
4781 	/* Set ttl and protocol */
4782 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto;
4783 #else
4784 	/* Set version, header length, and tos */
4785 	*(uint16_t *)&ipha->ipha_version_and_hdr_length =
4786 	    ((icmp->icmp_type_of_service << 8) |
4787 	    ((IP_VERSION << 4) | (ip_hdr_length>>2)));
4788 	/* Set ttl and protocol */
4789 	*(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl;
4790 #endif
4791 	if (pktinfop->ip4_addr != INADDR_ANY) {
4792 		ipha->ipha_src = pktinfop->ip4_addr;
4793 		optinfo.ip_opt_flags = IP_VERIFY_SRC;
4794 	} else {
4795 
4796 		/*
4797 		 * Copy our address into the packet.  If this is zero,
4798 		 * ip will fill in the real source address.
4799 		 */
4800 		IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src);
4801 	}
4802 
4803 	ipha->ipha_fragment_offset_and_flags = 0;
4804 
4805 	if (pktinfop->ip4_ill_index != 0) {
4806 		optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
4807 	}
4808 
4809 
4810 	/*
4811 	 * For the socket of SOCK_RAW type, the checksum is provided in the
4812 	 * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
4813 	 * tell IP that the application has sent a complete IP header and not
4814 	 * to compute the transport checksum nor change the DF flag.
4815 	 */
4816 	ipha->ipha_ident = IP_HDR_INCLUDED;
4817 
4818 	/* Finish common formatting of the packet. */
4819 	mp->b_rptr = (uchar_t *)ipha;
4820 
4821 	ip_len = mp->b_wptr - (uchar_t *)ipha;
4822 	if (mp->b_cont != NULL)
4823 		ip_len += msgdsize(mp->b_cont);
4824 
4825 	/*
4826 	 * Set the length into the IP header.
4827 	 * If the length is greater than the maximum allowed by IP,
4828 	 * then free the message and return. Do not try and send it
4829 	 * as this can cause problems in layers below.
4830 	 */
4831 	if (ip_len > IP_MAXPACKET) {
4832 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4833 		rw_exit(&icmp->icmp_rwlock);
4834 		return (EMSGSIZE);
4835 	}
4836 	ipha->ipha_length = htons((uint16_t)ip_len);
4837 	/*
4838 	 * Copy in the destination address request
4839 	 */
4840 	ipha->ipha_dst = v4dst;
4841 
4842 	/*
4843 	 * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
4844 	 */
4845 	if (CLASSD(v4dst))
4846 		ipha->ipha_ttl = icmp->icmp_multicast_ttl;
4847 
4848 	/* Copy in options if any */
4849 	if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
4850 		bcopy(ip_snd_opt,
4851 		    &ipha[1], ip_snd_opt_len);
4852 		/*
4853 		 * Massage source route putting first source route in ipha_dst.
4854 		 * Ignore the destination in the T_unitdata_req.
4855 		 */
4856 		(void) ip_massage_options(ipha, is->is_netstack);
4857 	}
4858 
4859 	rw_exit(&icmp->icmp_rwlock);
4860 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4861 
4862 	ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
4863 	return (0);
4864 }
4865 
4866 static int
4867 icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
4868 {
4869 	int err;
4870 	uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
4871 	icmp_stack_t		*is = icmp->icmp_is;
4872 	conn_t			*connp = icmp->icmp_connp;
4873 	cred_t	*cred;
4874 	cred_t	*msg_cred;
4875 	cred_t	*effective_cred;
4876 
4877 	/*
4878 	 * All Solaris components should pass a db_credp
4879 	 * for this message, hence we ASSERT.
4880 	 * On production kernels we return an error to be robust against
4881 	 * random streams modules sitting on top of us.
4882 	 */
4883 	cred = msg_cred = msg_getcred(mp, NULL);
4884 	ASSERT(cred != NULL);
4885 	if (cred == NULL)
4886 		return (EINVAL);
4887 
4888 	/*
4889 	 * Verify the destination is allowed to receive packets at
4890 	 * the security label of the message data. check_dest()
4891 	 * may create a new effective cred for this message
4892 	 * with a modified label or label flags.
4893 	 */
4894 	if ((err = tsol_check_dest(cred, dst, IPV6_VERSION,
4895 	    connp->conn_mac_exempt, &effective_cred)) != 0)
4896 		goto done;
4897 	if (effective_cred != NULL)
4898 		cred = effective_cred;
4899 
4900 	/*
4901 	 * Calculate the security label to be placed in the text
4902 	 * of the message (if any).
4903 	 */
4904 	if ((err = tsol_compute_label_v6(cred, dst, opt_storage,
4905 	    is->is_netstack->netstack_ip)) != 0)
4906 		goto done;
4907 
4908 	/*
4909 	 * Insert the security label in the cached ip options,
4910 	 * removing any old label that may exist.
4911 	 */
4912 	if ((err = tsol_update_sticky(&icmp->icmp_sticky_ipp,
4913 	    &icmp->icmp_label_len_v6, opt_storage)) != 0)
4914 		goto done;
4915 
4916 	/*
4917 	 * Save the destination address and cred we used to generate
4918 	 * the security label text.
4919 	 */
4920 	icmp->icmp_v6lastdst = *dst;
4921 	if (cred != icmp->icmp_effective_cred) {
4922 		if (icmp->icmp_effective_cred != NULL)
4923 			crfree(icmp->icmp_effective_cred);
4924 		crhold(cred);
4925 		icmp->icmp_effective_cred = cred;
4926 	}
4927 
4928 	if (msg_cred != icmp->icmp_last_cred) {
4929 		if (icmp->icmp_last_cred != NULL)
4930 			crfree(icmp->icmp_last_cred);
4931 		crhold(msg_cred);
4932 		icmp->icmp_last_cred = msg_cred;
4933 	}
4934 
4935 done:
4936 	if (effective_cred != NULL)
4937 		crfree(effective_cred);
4938 
4939 	if (err != 0) {
4940 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4941 		DTRACE_PROBE4(
4942 		    tx__ip__log__drop__updatelabel__icmp6,
4943 		    char *, "icmp(1) failed to update options(2) on mp(3)",
4944 		    icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
4945 		return (err);
4946 	}
4947 	return (0);
4948 }
4949 
4950 /*
4951  * raw_ip_send_data_v6():
4952  * Assumes that icmp_wput did some sanity checking on the destination
4953  * address, but that the label may not yet be correct.
4954  */
4955 static int
4956 raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6,
4957     ip6_pkt_t *ipp)
4958 {
4959 	ip6_t			*ip6h;
4960 	ip6i_t			*ip6i;	/* mp->b_rptr even if no ip6i_t */
4961 	int			ip_hdr_len = IPV6_HDR_LEN;
4962 	size_t			ip_len;
4963 	icmp_t			*icmp = connp->conn_icmp;
4964 	icmp_stack_t		*is = icmp->icmp_is;
4965 	ip6_pkt_t		*tipp;
4966 	ip6_hbh_t		*hopoptsptr = NULL;
4967 	uint_t			hopoptslen = 0;
4968 	uint32_t		csum = 0;
4969 	uint_t			ignore = 0;
4970 	uint_t			option_exists = 0, is_sticky = 0;
4971 	uint8_t			*cp;
4972 	uint8_t			*nxthdr_ptr;
4973 	in6_addr_t		ip6_dst;
4974 	pid_t			cpid;
4975 	cred_t			*cr;
4976 
4977 	rw_enter(&icmp->icmp_rwlock, RW_READER);
4978 
4979 	/*
4980 	 * If the local address is a mapped address return
4981 	 * an error.
4982 	 * It would be possible to send an IPv6 packet but the
4983 	 * response would never make it back to the application
4984 	 * since it is bound to a mapped address.
4985 	 */
4986 	if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
4987 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4988 		rw_exit(&icmp->icmp_rwlock);
4989 		return (EADDRNOTAVAIL);
4990 	}
4991 
4992 	ignore = ipp->ipp_sticky_ignored;
4993 	if (sin6->sin6_scope_id != 0 &&
4994 	    IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
4995 		/*
4996 		 * IPPF_SCOPE_ID is special.  It's neither a sticky
4997 		 * option nor ancillary data.  It needs to be
4998 		 * explicitly set in options_exists.
4999 		 */
5000 		option_exists |= IPPF_SCOPE_ID;
5001 	}
5002 
5003 	/*
5004 	 * Compute the destination address
5005 	 */
5006 	ip6_dst = sin6->sin6_addr;
5007 	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5008 		ip6_dst = ipv6_loopback;
5009 
5010 	/*
5011 	 * Check if our saved options are valid; update if not.
5012 	 * TSOL Note: Since we are not in WRITER mode, ICMP packets
5013 	 * to different destination may require different labels,
5014 	 * or worse, ICMP packets to same IP address may require
5015 	 * different labels due to use of shared all-zones address.
5016 	 * We use conn_lock to ensure that lastdst, sticky ipp_hopopts,
5017 	 * and sticky ipp_hopoptslen are consistent for the current
5018 	 * destination and are updated atomically.
5019 	 */
5020 	mutex_enter(&connp->conn_lock);
5021 	if (is_system_labeled()) {
5022 		/*
5023 		 * Recompute the Trusted Extensions security label if we're
5024 		 * not going to the same destination as last time or the cred
5025 		 * attached to the received mblk changed. This is done in a
5026 		 * separate routine to avoid blowing up our stack here.
5027 		 */
5028 		cr = msg_getcred(mp, &cpid);
5029 		if (!IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) ||
5030 		    cr != icmp->icmp_last_cred) {
5031 			int error = 0;
5032 			error = icmp_update_label_v6(icmp, mp, &ip6_dst);
5033 			if (error != 0) {
5034 				mutex_exit(&connp->conn_lock);
5035 				rw_exit(&icmp->icmp_rwlock);
5036 				return (error);
5037 			}
5038 		}
5039 
5040 		/*
5041 		 * Apply credentials with modified security label if they exist.
5042 		 * icmp_update_label_v6() may have generated these credentials
5043 		 * for MAC-Exempt connections.
5044 		 */
5045 		if (icmp->icmp_effective_cred != NULL)
5046 			mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
5047 	}
5048 
5049 	/*
5050 	 * If there's a security label here, then we ignore any options the
5051 	 * user may try to set.  We keep the peer's label as a hidden sticky
5052 	 * option.
5053 	 */
5054 	if (icmp->icmp_label_len_v6 > 0) {
5055 		ignore &= ~IPPF_HOPOPTS;
5056 		ipp->ipp_fields &= ~IPPF_HOPOPTS;
5057 	}
5058 
5059 	if ((icmp->icmp_sticky_ipp.ipp_fields == 0) &&
5060 	    (ipp->ipp_fields == 0)) {
5061 		/* No sticky options nor ancillary data. */
5062 		mutex_exit(&connp->conn_lock);
5063 		goto no_options;
5064 	}
5065 
5066 	/*
5067 	 * Go through the options figuring out where each is going to
5068 	 * come from and build two masks.  The first mask indicates if
5069 	 * the option exists at all.  The second mask indicates if the
5070 	 * option is sticky or ancillary.
5071 	 */
5072 	if (!(ignore & IPPF_HOPOPTS)) {
5073 		if (ipp->ipp_fields & IPPF_HOPOPTS) {
5074 			option_exists |= IPPF_HOPOPTS;
5075 			ip_hdr_len += ipp->ipp_hopoptslen;
5076 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
5077 			option_exists |= IPPF_HOPOPTS;
5078 			is_sticky |= IPPF_HOPOPTS;
5079 			ASSERT(icmp->icmp_sticky_ipp.ipp_hopoptslen != 0);
5080 			hopoptsptr = kmem_alloc(
5081 			    icmp->icmp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP);
5082 			if (hopoptsptr == NULL) {
5083 				mutex_exit(&connp->conn_lock);
5084 				rw_exit(&icmp->icmp_rwlock);
5085 				return (ENOMEM);
5086 			}
5087 			hopoptslen = icmp->icmp_sticky_ipp.ipp_hopoptslen;
5088 			bcopy(icmp->icmp_sticky_ipp.ipp_hopopts, hopoptsptr,
5089 			    hopoptslen);
5090 			ip_hdr_len += hopoptslen;
5091 		}
5092 	}
5093 	mutex_exit(&connp->conn_lock);
5094 
5095 	if (!(ignore & IPPF_RTHDR)) {
5096 		if (ipp->ipp_fields & IPPF_RTHDR) {
5097 			option_exists |= IPPF_RTHDR;
5098 			ip_hdr_len += ipp->ipp_rthdrlen;
5099 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
5100 			option_exists |= IPPF_RTHDR;
5101 			is_sticky |= IPPF_RTHDR;
5102 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen;
5103 		}
5104 	}
5105 
5106 	if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
5107 		/*
5108 		 * Need to have a router header to use these.
5109 		 */
5110 		if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
5111 			option_exists |= IPPF_RTDSTOPTS;
5112 			ip_hdr_len += ipp->ipp_rtdstoptslen;
5113 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
5114 			option_exists |= IPPF_RTDSTOPTS;
5115 			is_sticky |= IPPF_RTDSTOPTS;
5116 			ip_hdr_len +=
5117 			    icmp->icmp_sticky_ipp.ipp_rtdstoptslen;
5118 		}
5119 	}
5120 
5121 	if (!(ignore & IPPF_DSTOPTS)) {
5122 		if (ipp->ipp_fields & IPPF_DSTOPTS) {
5123 			option_exists |= IPPF_DSTOPTS;
5124 			ip_hdr_len += ipp->ipp_dstoptslen;
5125 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
5126 			option_exists |= IPPF_DSTOPTS;
5127 			is_sticky |= IPPF_DSTOPTS;
5128 			ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen;
5129 		}
5130 	}
5131 
5132 	if (!(ignore & IPPF_IFINDEX)) {
5133 		if (ipp->ipp_fields & IPPF_IFINDEX) {
5134 			option_exists |= IPPF_IFINDEX;
5135 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
5136 			option_exists |= IPPF_IFINDEX;
5137 			is_sticky |= IPPF_IFINDEX;
5138 		}
5139 	}
5140 
5141 	if (!(ignore & IPPF_ADDR)) {
5142 		if (ipp->ipp_fields & IPPF_ADDR) {
5143 			option_exists |= IPPF_ADDR;
5144 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) {
5145 			option_exists |= IPPF_ADDR;
5146 			is_sticky |= IPPF_ADDR;
5147 		}
5148 	}
5149 
5150 	if (!(ignore & IPPF_DONTFRAG)) {
5151 		if (ipp->ipp_fields & IPPF_DONTFRAG) {
5152 			option_exists |= IPPF_DONTFRAG;
5153 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
5154 			option_exists |= IPPF_DONTFRAG;
5155 			is_sticky |= IPPF_DONTFRAG;
5156 		}
5157 	}
5158 
5159 	if (!(ignore & IPPF_USE_MIN_MTU)) {
5160 		if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
5161 			option_exists |= IPPF_USE_MIN_MTU;
5162 		} else if (icmp->icmp_sticky_ipp.ipp_fields &
5163 		    IPPF_USE_MIN_MTU) {
5164 			option_exists |= IPPF_USE_MIN_MTU;
5165 			is_sticky |= IPPF_USE_MIN_MTU;
5166 		}
5167 	}
5168 
5169 	if (!(ignore & IPPF_NEXTHOP)) {
5170 		if (ipp->ipp_fields & IPPF_NEXTHOP) {
5171 			option_exists |= IPPF_NEXTHOP;
5172 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) {
5173 			option_exists |= IPPF_NEXTHOP;
5174 			is_sticky |= IPPF_NEXTHOP;
5175 		}
5176 	}
5177 
5178 	if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
5179 		option_exists |= IPPF_HOPLIMIT;
5180 	/* IPV6_HOPLIMIT can never be sticky */
5181 	ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
5182 
5183 	if (!(ignore & IPPF_UNICAST_HOPS) &&
5184 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
5185 		option_exists |= IPPF_UNICAST_HOPS;
5186 		is_sticky |= IPPF_UNICAST_HOPS;
5187 	}
5188 
5189 	if (!(ignore & IPPF_MULTICAST_HOPS) &&
5190 	    (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
5191 		option_exists |= IPPF_MULTICAST_HOPS;
5192 		is_sticky |= IPPF_MULTICAST_HOPS;
5193 	}
5194 
5195 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) {
5196 		/* This is a sticky socket option only */
5197 		option_exists |= IPPF_NO_CKSUM;
5198 		is_sticky |= IPPF_NO_CKSUM;
5199 	}
5200 
5201 	if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) {
5202 		/* This is a sticky socket option only */
5203 		option_exists |= IPPF_RAW_CKSUM;
5204 		is_sticky |= IPPF_RAW_CKSUM;
5205 	}
5206 
5207 	if (!(ignore & IPPF_TCLASS)) {
5208 		if (ipp->ipp_fields & IPPF_TCLASS) {
5209 			option_exists |= IPPF_TCLASS;
5210 		} else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
5211 			option_exists |= IPPF_TCLASS;
5212 			is_sticky |= IPPF_TCLASS;
5213 		}
5214 	}
5215 
5216 no_options:
5217 
5218 	/*
5219 	 * If any options carried in the ip6i_t were specified, we
5220 	 * need to account for the ip6i_t in the data we'll be sending
5221 	 * down.
5222 	 */
5223 	if (option_exists & IPPF_HAS_IP6I)
5224 		ip_hdr_len += sizeof (ip6i_t);
5225 
5226 	/* check/fix buffer config, setup pointers into it */
5227 	ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len];
5228 	if ((mp->b_datap->db_ref != 1) ||
5229 	    ((unsigned char *)ip6h < mp->b_datap->db_base) ||
5230 	    !OK_32PTR(ip6h)) {
5231 		mblk_t	*mp1;
5232 
5233 		/* Try to get everything in a single mblk next time */
5234 		if (ip_hdr_len > icmp->icmp_max_hdr_len) {
5235 			icmp->icmp_max_hdr_len = ip_hdr_len;
5236 
5237 			(void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp,
5238 			    icmp->icmp_max_hdr_len + is->is_wroff_extra);
5239 		}
5240 		mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
5241 		if (!mp1) {
5242 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5243 			kmem_free(hopoptsptr, hopoptslen);
5244 			rw_exit(&icmp->icmp_rwlock);
5245 			return (ENOMEM);
5246 		}
5247 		mp1->b_cont = mp;
5248 		mp1->b_wptr = mp1->b_datap->db_lim;
5249 		ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
5250 		mp = mp1;
5251 	}
5252 	mp->b_rptr = (unsigned char *)ip6h;
5253 	ip6i = (ip6i_t *)ip6h;
5254 
5255 #define	ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
5256 	if (option_exists & IPPF_HAS_IP6I) {
5257 		ip6h = (ip6_t *)&ip6i[1];
5258 		ip6i->ip6i_flags = 0;
5259 		ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5260 
		/* sin6_scope_id takes precedence over IPPF_IFINDEX */
5262 		if (option_exists & IPPF_SCOPE_ID) {
5263 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5264 			ip6i->ip6i_ifindex = sin6->sin6_scope_id;
5265 		} else if (option_exists & IPPF_IFINDEX) {
5266 			tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
5267 			ASSERT(tipp->ipp_ifindex != 0);
5268 			ip6i->ip6i_flags |= IP6I_IFINDEX;
5269 			ip6i->ip6i_ifindex = tipp->ipp_ifindex;
5270 		}
5271 
5272 		if (option_exists & IPPF_RAW_CKSUM) {
5273 			ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
5274 			ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
5275 		}
5276 
5277 		if (option_exists & IPPF_NO_CKSUM) {
5278 			ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
5279 		}
5280 
5281 		if (option_exists & IPPF_ADDR) {
5282 			/*
5283 			 * Enable per-packet source address verification if
5284 			 * IPV6_PKTINFO specified the source address.
5285 			 * ip6_src is set in the transport's _wput function.
5286 			 */
5287 			ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
5288 		}
5289 
5290 		if (option_exists & IPPF_DONTFRAG) {
5291 			ip6i->ip6i_flags |= IP6I_DONTFRAG;
5292 		}
5293 
5294 		if (option_exists & IPPF_USE_MIN_MTU) {
5295 			ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
5296 			    ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
5297 		}
5298 
5299 		if (option_exists & IPPF_NEXTHOP) {
5300 			tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
5301 			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
5302 			ip6i->ip6i_flags |= IP6I_NEXTHOP;
5303 			ip6i->ip6i_nexthop = tipp->ipp_nexthop;
5304 		}
5305 
5306 		/*
5307 		 * tell IP this is an ip6i_t private header
5308 		 */
5309 		ip6i->ip6i_nxt = IPPROTO_RAW;
5310 	}
5311 
5312 	/* Initialize IPv6 header */
5313 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
5314 	bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
5315 
5316 	/* Set the hoplimit of the outgoing packet. */
5317 	if (option_exists & IPPF_HOPLIMIT) {
5318 		/* IPV6_HOPLIMIT ancillary data overrides all other settings. */
5319 		ip6h->ip6_hops = ipp->ipp_hoplimit;
5320 		ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5321 	} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
5322 		ip6h->ip6_hops = icmp->icmp_multicast_ttl;
5323 		if (option_exists & IPPF_MULTICAST_HOPS)
5324 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5325 	} else {
5326 		ip6h->ip6_hops = icmp->icmp_ttl;
5327 		if (option_exists & IPPF_UNICAST_HOPS)
5328 			ip6i->ip6i_flags |= IP6I_HOPLIMIT;
5329 	}
5330 
5331 	if (option_exists & IPPF_ADDR) {
5332 		tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
5333 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
5334 		ip6h->ip6_src = tipp->ipp_addr;
5335 	} else {
5336 		/*
5337 		 * The source address was not set using IPV6_PKTINFO.
5338 		 * First look at the bound source.
5339 		 * If unspecified fallback to __sin6_src_id.
5340 		 */
5341 		ip6h->ip6_src = icmp->icmp_v6src;
5342 		if (sin6->__sin6_src_id != 0 &&
5343 		    IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
5344 			ip_srcid_find_id(sin6->__sin6_src_id,
5345 			    &ip6h->ip6_src, icmp->icmp_zoneid,
5346 			    is->is_netstack);
5347 		}
5348 	}
5349 
5350 	nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
5351 	cp = (uint8_t *)&ip6h[1];
5352 
5353 	/*
5354 	 * Here's where we have to start stringing together
5355 	 * any extension headers in the right order:
5356 	 * Hop-by-hop, destination, routing, and final destination opts.
5357 	 */
5358 	if (option_exists & IPPF_HOPOPTS) {
5359 		/* Hop-by-hop options */
5360 		ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
5361 
5362 		*nxthdr_ptr = IPPROTO_HOPOPTS;
5363 		nxthdr_ptr = &hbh->ip6h_nxt;
5364 
5365 		if (hopoptslen == 0) {
5366 			tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
5367 			bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen);
5368 			cp += tipp->ipp_hopoptslen;
5369 		} else {
5370 			bcopy(hopoptsptr, cp, hopoptslen);
5371 			cp += hopoptslen;
5372 			kmem_free(hopoptsptr, hopoptslen);
5373 		}
5374 	}
5375 	/*
5376 	 * En-route destination options
5377 	 * Only do them if there's a routing header as well
5378 	 */
5379 	if (option_exists & IPPF_RTDSTOPTS) {
5380 		ip6_dest_t *dst = (ip6_dest_t *)cp;
5381 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
5382 
5383 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5384 		nxthdr_ptr = &dst->ip6d_nxt;
5385 
5386 		bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
5387 		cp += tipp->ipp_rtdstoptslen;
5388 	}
5389 	/*
5390 	 * Routing header next
5391 	 */
5392 	if (option_exists & IPPF_RTHDR) {
5393 		ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
5394 		tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
5395 
5396 		*nxthdr_ptr = IPPROTO_ROUTING;
5397 		nxthdr_ptr = &rt->ip6r_nxt;
5398 
5399 		bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
5400 		cp += tipp->ipp_rthdrlen;
5401 	}
5402 	/*
5403 	 * Do ultimate destination options
5404 	 */
5405 	if (option_exists & IPPF_DSTOPTS) {
5406 		ip6_dest_t *dest = (ip6_dest_t *)cp;
5407 		tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
5408 
5409 		*nxthdr_ptr = IPPROTO_DSTOPTS;
5410 		nxthdr_ptr = &dest->ip6d_nxt;
5411 
5412 		bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
5413 		cp += tipp->ipp_dstoptslen;
5414 	}
5415 
5416 	/*
5417 	 * Now set the last header pointer to the proto passed in
5418 	 */
5419 	ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len);
5420 	*nxthdr_ptr = icmp->icmp_proto;
5421 
5422 	/*
5423 	 * Copy in the destination address
5424 	 */
5425 	ip6h->ip6_dst = ip6_dst;
5426 
5427 	ip6h->ip6_vcf =
5428 	    (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
5429 	    (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
5430 
5431 	if (option_exists & IPPF_TCLASS) {
5432 		tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
5433 		ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
5434 		    tipp->ipp_tclass);
5435 	}
5436 	if (option_exists & IPPF_RTHDR) {
5437 		ip6_rthdr_t	*rth;
5438 
5439 		/*
5440 		 * Perform any processing needed for source routing.
5441 		 * We know that all extension headers will be in the same mblk
5442 		 * as the IPv6 header.
5443 		 */
5444 		rth = ip_find_rthdr_v6(ip6h, mp->b_wptr);
5445 		if (rth != NULL && rth->ip6r_segleft != 0) {
5446 			if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
5447 				/*
5448 				 * Drop packet - only support Type 0 routing.
5449 				 * Notify the application as well.
5450 				 */
5451 				BUMP_MIB(&is->is_rawip_mib,
5452 				    rawipOutErrors);
5453 				rw_exit(&icmp->icmp_rwlock);
5454 				return (EPROTO);
5455 			}
5456 			/*
5457 			 * rth->ip6r_len is twice the number of
5458 			 * addresses in the header
5459 			 */
5460 			if (rth->ip6r_len & 0x1) {
5461 				BUMP_MIB(&is->is_rawip_mib,
5462 				    rawipOutErrors);
5463 				rw_exit(&icmp->icmp_rwlock);
5464 				return (EPROTO);
5465 			}
5466 			/*
5467 			 * Shuffle the routing header and ip6_dst
5468 			 * addresses, and get the checksum difference
5469 			 * between the first hop (in ip6_dst) and
5470 			 * the destination (in the last routing hdr entry).
5471 			 */
5472 			csum = ip_massage_options_v6(ip6h, rth,
5473 			    is->is_netstack);
5474 			/*
5475 			 * Verify that the first hop isn't a mapped address.
5476 			 * Routers along the path need to do this verification
5477 			 * for subsequent hops.
5478 			 */
5479 			if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
5480 				BUMP_MIB(&is->is_rawip_mib,
5481 				    rawipOutErrors);
5482 				rw_exit(&icmp->icmp_rwlock);
5483 				return (EADDRNOTAVAIL);
5484 			}
5485 		}
5486 	}
5487 
5488 	ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
5489 	if (mp->b_cont != NULL)
5490 		ip_len += msgdsize(mp->b_cont);
5491 
5492 	/*
5493 	 * Set the length into the IP header.
5494 	 * If the length is greater than the maximum allowed by IP,
5495 	 * then free the message and return. Do not try and send it
5496 	 * as this can cause problems in layers below.
5497 	 */
5498 	if (ip_len > IP_MAXPACKET) {
5499 		BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5500 		rw_exit(&icmp->icmp_rwlock);
5501 		return (EMSGSIZE);
5502 	}
5503 	if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
5504 		uint_t	cksum_off;	/* From ip6i == mp->b_rptr */
5505 		uint16_t *cksum_ptr;
5506 		uint_t	ext_hdrs_len;
5507 
5508 		/* ICMPv6 must have an offset matching icmp6_cksum offset */
5509 		ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 ||
5510 		    icmp->icmp_checksum_off == 2);
5511 
5512 		/*
5513 		 * We make it easy for IP to include our pseudo header
5514 		 * by putting our length in uh_checksum, modified (if
5515 		 * we have a routing header) by the checksum difference
5516 		 * between the ultimate destination and first hop addresses.
5517 		 * Note: ICMPv6 must always checksum the packet.
5518 		 */
5519 		cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
5520 		if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) {
5521 			if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) {
5522 				BUMP_MIB(&is->is_rawip_mib,
5523 				    rawipOutErrors);
5524 				freemsg(mp);
5525 				rw_exit(&icmp->icmp_rwlock);
5526 				return (0);
5527 			}
5528 			ip6i = (ip6i_t *)mp->b_rptr;
5529 			if (ip6i->ip6i_nxt == IPPROTO_RAW)
5530 				ip6h = (ip6_t *)&ip6i[1];
5531 			else
5532 				ip6h = (ip6_t *)ip6i;
5533 		}
5534 		/* Add payload length to checksum */
5535 		ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN -
5536 		    (int)((uchar_t *)ip6h - (uchar_t *)ip6i);
5537 		csum += htons(ip_len - ext_hdrs_len);
5538 
5539 		cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off);
5540 		csum = (csum & 0xFFFF) + (csum >> 16);
5541 		*cksum_ptr = (uint16_t)csum;
5542 	}
5543 
5544 #ifdef _LITTLE_ENDIAN
5545 	ip_len = htons(ip_len);
5546 #endif
5547 	ip6h->ip6_plen = (uint16_t)ip_len;
5548 
5549 	/* We're done. Pass the packet to IP */
5550 	rw_exit(&icmp->icmp_rwlock);
5551 	BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
5552 	ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT);
5553 	return (0);
5554 }
5555 
/*
 * Write-side handler that dispatches on the STREAMS message type of mp.
 *
 * M_PROTO/M_PCPROTO messages are treated as TPI primitives and routed to the
 * matching icmp_* handler; M_IOCTL messages are examined for the handful of
 * commands handled here; M_IOCDATA is handed to icmp_wput_iocdata().
 * Every case that consumes the message returns directly.  Anything that
 * merely breaks out of the switches (unknown primitive, unknown ioctl,
 * nd_getset() returning zero, other message types) is passed to ip_wput().
 *
 *	q	write queue the message arrived on
 *	mp	the message; consumed or forwarded on every path
 */
static void
icmp_wput_other(queue_t *q, mblk_t *mp)
{
	uchar_t	*rptr = mp->b_rptr;
	struct iocblk *iocp;
	/*
	 * NOTE(review): tudr is not referenced anywhere in this function and
	 * is not #undef'd here, so the macro stays defined for the code that
	 * follows in the file.
	 */
#define	tudr ((struct T_unitdata_req *)rptr)
	conn_t	*connp = Q_TO_CONN(q);
	icmp_t	*icmp = connp->conn_icmp;
	icmp_stack_t *is = icmp->icmp_is;
	cred_t *cr;

	switch (mp->b_datap->db_type) {
	case M_PROTO:
	case M_PCPROTO:
		if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
			/*
			 * If the message does not contain a PRIM_type,
			 * throw it away.
			 */
			freemsg(mp);
			return;
		}
		/* Dispatch on the TPI primitive type at the start of mp. */
		switch (((union T_primitives *)rptr)->type) {
		case T_ADDR_REQ:
			icmp_addr_req(q, mp);
			return;
		case O_T_BIND_REQ:
		case T_BIND_REQ:
			icmp_tpi_bind(q, mp);
			return;
		case T_CONN_REQ:
			icmp_tpi_connect(q, mp);
			return;
		case T_CAPABILITY_REQ:
			icmp_capability_req(q, mp);
			return;
		case T_INFO_REQ:
			icmp_info_req(q, mp);
			return;
		case T_UNITDATA_REQ:
			/*
			 * If a T_UNITDATA_REQ gets here, the address must
			 * be bad.  Valid T_UNITDATA_REQs are found above
			 * and break to below this switch.
			 */
			icmp_ud_err(q, mp, EADDRNOTAVAIL);
			return;
		case T_UNBIND_REQ:
			icmp_tpi_unbind(q, mp);
			return;

		case T_SVR4_OPTMGMT_REQ:
			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			if (cr == NULL) {
				icmp_err_ack(q, mp, TSYSERR, EINVAL);
				return;
			}

			/*
			 * snmpcom_req() gets the first look at the request;
			 * only when it returns zero is the message handed to
			 * svr4_optcom_req().
			 */
			if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
			    cr)) {
				/* Only IP can return anything meaningful */
				(void) svr4_optcom_req(q, mp, cr,
				    &icmp_opt_obj, B_TRUE);
			}
			return;

		case T_OPTMGMT_REQ:
			/*
			 * All Solaris components should pass a db_credp
			 * for this TPI message, hence we ASSERT.
			 * But in case there is some other M_PROTO that looks
			 * like a TPI message sent by some other kernel
			 * component, we check and return an error.
			 */
			cr = msg_getcred(mp, NULL);
			ASSERT(cr != NULL);
			if (cr == NULL) {
				icmp_err_ack(q, mp, TSYSERR, EINVAL);
				return;
			}
			/* Only IP can return anything meaningful */
			(void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE);
			return;

		case T_DISCON_REQ:
			icmp_tpi_disconnect(q, mp);
			return;

		/* The following TPI message is not supported by icmp. */
		case O_T_CONN_RES:
		case T_CONN_RES:
			icmp_err_ack(q, mp, TNOTSUPPORT, 0);
			return;

		/* The following 3 TPI requests are illegal for icmp. */
		case T_DATA_REQ:
		case T_EXDATA_REQ:
		case T_ORDREL_REQ:
			/* Drop the request and send M_ERROR(EPROTO) upstream. */
			freemsg(mp);
			(void) putctl1(RD(q), M_ERROR, EPROTO);
			return;
		default:
			/* Unrecognized primitive: falls through to ip_wput(). */
			break;
		}
		break;
	case M_IOCTL:
		iocp = (struct iocblk *)mp->b_rptr;
		switch (iocp->ioc_cmd) {
		case TI_GETPEERNAME:
			if (icmp->icmp_state != TS_DATA_XFER) {
				/*
				 * If a default destination address has not
				 * been associated with the stream, then we
				 * don't know the peer's name.
				 */
				iocp->ioc_error = ENOTCONN;
				/*
				 * The reply is sent as an M_IOCACK with
				 * ioc_error set and no data.  No goto to
				 * err_ret appears in this function.
				 */
		err_ret:;
				iocp->ioc_count = 0;
				mp->b_datap->db_type = M_IOCACK;
				qreply(q, mp);
				return;
			}
			/* FALLTHRU */
		case TI_GETMYNAME:
			/*
			 * For TI_GETPEERNAME and TI_GETMYNAME, we first
			 * need to copyin the user's strbuf structure.
			 * Processing will continue in the M_IOCDATA case
			 * below.
			 */
			mi_copyin(q, mp, NULL,
			    SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
			return;
		case ND_SET:
			/* nd_getset performs the necessary error checking */
		case ND_GET:
			/*
			 * A nonzero return from nd_getset() means the request
			 * was handled against this stack's is_nd table and mp
			 * is sent back as the reply; otherwise the ioctl is
			 * passed on to ip_wput() below.
			 */
			if (nd_getset(q, is->is_nd, mp)) {
				qreply(q, mp);
				return;
			}
			break;
		case _SIOCSOCKFALLBACK:
			/*
			 * socket is falling back to be a
			 * streams socket. Nothing  to do
			 */
			/*
			 * NOTE(review): unlike the err_ret path above, db_type
			 * is not changed before qreply() here - confirm that
			 * the receiver expects the message type as-is.
			 */
			iocp->ioc_count = 0;
			iocp->ioc_rval = 0;
			qreply(q, mp);
			return;
		default:
			break;
		}
		break;
	case M_IOCDATA:
		icmp_wput_iocdata(q, mp);
		return;
	default:
		break;
	}
	/* Not consumed above: let IP deal with the message. */
	ip_wput(q, mp);
}
5726 
5727 /*
5728  * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA
5729  * messages.
5730  */
5731 static void
5732 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
5733 {
5734 	mblk_t	*mp1;
5735 	STRUCT_HANDLE(strbuf, sb);
5736 	icmp_t	*icmp;
5737 	uint_t	addrlen;
5738 	uint_t	error;
5739 
5740 	/* Make sure it is one of ours. */
5741 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5742 	case TI_GETMYNAME:
5743 	case TI_GETPEERNAME:
5744 		break;
5745 	default:
5746 		icmp = Q_TO_ICMP(q);
5747 		ip_output(icmp->icmp_connp, mp, q, IP_WPUT);
5748 		return;
5749 	}
5750 	switch (mi_copy_state(q, mp, &mp1)) {
5751 	case -1:
5752 		return;
5753 	case MI_COPY_CASE(MI_COPY_IN, 1):
5754 		break;
5755 	case MI_COPY_CASE(MI_COPY_OUT, 1):
5756 		/*
5757 		 * The address has been copied out, so now
5758 		 * copyout the strbuf.
5759 		 */
5760 		mi_copyout(q, mp);
5761 		return;
5762 	case MI_COPY_CASE(MI_COPY_OUT, 2):
5763 		/*
5764 		 * The address and strbuf have been copied out.
5765 		 * We're done, so just acknowledge the original
5766 		 * M_IOCTL.
5767 		 */
5768 		mi_copy_done(q, mp, 0);
5769 		return;
5770 	default:
5771 		/*
5772 		 * Something strange has happened, so acknowledge
5773 		 * the original M_IOCTL with an EPROTO error.
5774 		 */
5775 		mi_copy_done(q, mp, EPROTO);
5776 		return;
5777 	}
5778 	/*
5779 	 * Now we have the strbuf structure for TI_GETMYNAME
5780 	 * and TI_GETPEERNAME.  Next we copyout the requested
5781 	 * address and then we'll copyout the strbuf.
5782 	 */
5783 	STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5784 	    (void *)mp1->b_rptr);
5785 	icmp = Q_TO_ICMP(q);
5786 	if (icmp->icmp_family == AF_INET)
5787 		addrlen = sizeof (sin_t);
5788 	else
5789 		addrlen = sizeof (sin6_t);
5790 
5791 	if (STRUCT_FGET(sb, maxlen) < addrlen) {
5792 		mi_copy_done(q, mp, EINVAL);
5793 		return;
5794 	}
5795 
5796 	mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
5797 
5798 	if (mp1 == NULL)
5799 		return;
5800 
5801 	rw_enter(&icmp->icmp_rwlock, RW_READER);
5802 	switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5803 	case TI_GETMYNAME:
5804 		error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr,
5805 		    &addrlen);
5806 		break;
5807 	case TI_GETPEERNAME:
5808 		error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr,
5809 		    &addrlen);
5810 		break;
5811 	}
5812 	rw_exit(&icmp->icmp_rwlock);
5813 
5814 	if (error != 0) {
5815 		mi_copy_done(q, mp, error);
5816 	} else {
5817 		mp1->b_wptr += addrlen;
5818 		STRUCT_FSET(sb, len, addrlen);
5819 
5820 		/* Copy out the address */
5821 		mi_copyout(q, mp);
5822 	}
5823 }
5824 
5825 static int
5826 icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
5827     void *thisdg_attrs)
5828 {
5829 	struct T_unitdata_req *udreqp;
5830 	int is_absreq_failure;
5831 	cred_t *cr;
5832 
5833 	udreqp = (struct T_unitdata_req *)mp->b_rptr;
5834 	*errorp = 0;
5835 
5836 	/*
5837 	 * All Solaris components should pass a db_credp
5838 	 * for this TPI message, hence we ASSERT.
5839 	 * But in case there is some other M_PROTO that looks
5840 	 * like a TPI message sent by some other kernel
5841 	 * component, we check and return an error.
5842 	 */
5843 	cr = msg_getcred(mp, NULL);
5844 	ASSERT(cr != NULL);
5845 	if (cr == NULL)
5846 		return (-1);
5847 
5848 	*errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
5849 	    udreqp->OPT_offset, cr, &icmp_opt_obj,
5850 	    thisdg_attrs, &is_absreq_failure);
5851 
5852 	if (*errorp != 0) {
5853 		/*
5854 		 * Note: No special action needed in this
5855 		 * module for "is_absreq_failure"
5856 		 */
5857 		return (-1);		/* failure */
5858 	}
5859 	ASSERT(is_absreq_failure == 0);
5860 	return (0);	/* success */
5861 }
5862 
5863 void
5864 icmp_ddi_g_init(void)
5865 {
5866 	icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5867 	    icmp_opt_obj.odb_opt_arr_cnt);
5868 
5869 	/*
5870 	 * We want to be informed each time a stack is created or
5871 	 * destroyed in the kernel, so we can maintain the
5872 	 * set of icmp_stack_t's.
5873 	 */
5874 	netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5875 }
5876 
5877 void
5878 icmp_ddi_g_destroy(void)
5879 {
5880 	netstack_unregister(NS_ICMP);
5881 }
5882 
5883 #define	INET_NAME	"ip"
5884 
5885 /*
5886  * Initialize the ICMP stack instance.
5887  */
5888 static void *
5889 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5890 {
5891 	icmp_stack_t	*is;
5892 	icmpparam_t	*pa;
5893 	int		error = 0;
5894 	major_t		major;
5895 
5896 	is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
5897 	is->is_netstack = ns;
5898 
5899 	pa = (icmpparam_t *)kmem_alloc(sizeof (icmp_param_arr), KM_SLEEP);
5900 	is->is_param_arr = pa;
5901 	bcopy(icmp_param_arr, is->is_param_arr, sizeof (icmp_param_arr));
5902 
5903 	(void) icmp_param_register(&is->is_nd,
5904 	    is->is_param_arr, A_CNT(icmp_param_arr));
5905 	is->is_ksp = rawip_kstat_init(stackid);
5906 
5907 	major = mod_name_to_major(INET_NAME);
5908 	error = ldi_ident_from_major(major, &is->is_ldi_ident);
5909 	ASSERT(error == 0);
5910 	return (is);
5911 }
5912 
5913 /*
5914  * Free the ICMP stack instance.
5915  */
5916 static void
5917 rawip_stack_fini(netstackid_t stackid, void *arg)
5918 {
5919 	icmp_stack_t *is = (icmp_stack_t *)arg;
5920 
5921 	nd_free(&is->is_nd);
5922 	kmem_free(is->is_param_arr, sizeof (icmp_param_arr));
5923 	is->is_param_arr = NULL;
5924 
5925 	rawip_kstat_fini(stackid, is->is_ksp);
5926 	is->is_ksp = NULL;
5927 	ldi_ident_release(is->is_ldi_ident);
5928 	kmem_free(is, sizeof (*is));
5929 }
5930 
5931 static void *
5932 rawip_kstat_init(netstackid_t stackid) {
5933 	kstat_t	*ksp;
5934 
5935 	rawip_named_kstat_t template = {
5936 		{ "inDatagrams",	KSTAT_DATA_UINT32, 0 },
5937 		{ "inCksumErrs",	KSTAT_DATA_UINT32, 0 },
5938 		{ "inErrors",		KSTAT_DATA_UINT32, 0 },
5939 		{ "outDatagrams",	KSTAT_DATA_UINT32, 0 },
5940 		{ "outErrors",		KSTAT_DATA_UINT32, 0 },
5941 	};
5942 
5943 	ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5944 					KSTAT_TYPE_NAMED,
5945 					NUM_OF_FIELDS(rawip_named_kstat_t),
5946 					0, stackid);
5947 	if (ksp == NULL || ksp->ks_data == NULL)
5948 		return (NULL);
5949 
5950 	bcopy(&template, ksp->ks_data, sizeof (template));
5951 	ksp->ks_update = rawip_kstat_update;
5952 	ksp->ks_private = (void *)(uintptr_t)stackid;
5953 
5954 	kstat_install(ksp);
5955 	return (ksp);
5956 }
5957 
5958 static void
5959 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5960 {
5961 	if (ksp != NULL) {
5962 		ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
5963 		kstat_delete_netstack(ksp, stackid);
5964 	}
5965 }
5966 
5967 static int
5968 rawip_kstat_update(kstat_t *ksp, int rw)
5969 {
5970 	rawip_named_kstat_t *rawipkp;
5971 	netstackid_t	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5972 	netstack_t	*ns;
5973 	icmp_stack_t	*is;
5974 
5975 	if ((ksp == NULL) || (ksp->ks_data == NULL))
5976 		return (EIO);
5977 
5978 	if (rw == KSTAT_WRITE)
5979 		return (EACCES);
5980 
5981 	rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
5982 
5983 	ns = netstack_find_by_stackid(stackid);
5984 	if (ns == NULL)
5985 		return (-1);
5986 	is = ns->netstack_icmp;
5987 	if (is == NULL) {
5988 		netstack_rele(ns);
5989 		return (-1);
5990 	}
5991 	rawipkp->inDatagrams.value.ui32 =  is->is_rawip_mib.rawipInDatagrams;
5992 	rawipkp->inCksumErrs.value.ui32 =  is->is_rawip_mib.rawipInCksumErrs;
5993 	rawipkp->inErrors.value.ui32 =	   is->is_rawip_mib.rawipInErrors;
5994 	rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5995 	rawipkp->outErrors.value.ui32 =	   is->is_rawip_mib.rawipOutErrors;
5996 	netstack_rele(ns);
5997 	return (0);
5998 }
5999 
6000 /* ARGSUSED */
6001 int
6002 rawip_accept(sock_lower_handle_t lproto_handle,
6003     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
6004     cred_t *cr)
6005 {
6006 	return (EOPNOTSUPP);
6007 }
6008 
6009 /* ARGSUSED */
6010 int
6011 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6012     socklen_t len, cred_t *cr)
6013 {
6014 	conn_t  *connp = (conn_t *)proto_handle;
6015 	int error;
6016 
6017 	/* All Solaris components should pass a cred for this operation. */
6018 	ASSERT(cr != NULL);
6019 
6020 	/* Binding to a NULL address really means unbind */
6021 	if (sa == NULL)
6022 		error = rawip_do_unbind(connp);
6023 	else
6024 		error = rawip_do_bind(connp, sa, len);
6025 
6026 	if (error < 0) {
6027 		if (error == -TOUTSTATE)
6028 			error = EINVAL;
6029 		else
6030 			error = proto_tlitosyserr(-error);
6031 	}
6032 	return (error);
6033 }
6034 
6035 static int
6036 rawip_implicit_bind(conn_t *connp)
6037 {
6038 	sin6_t sin6addr;
6039 	sin_t *sin;
6040 	sin6_t *sin6;
6041 	socklen_t len;
6042 	int error;
6043 
6044 	if (connp->conn_icmp->icmp_family == AF_INET) {
6045 		len = sizeof (struct sockaddr_in);
6046 		sin = (sin_t *)&sin6addr;
6047 		*sin = sin_null;
6048 		sin->sin_family = AF_INET;
6049 		sin->sin_addr.s_addr = INADDR_ANY;
6050 	} else {
6051 		ASSERT(connp->conn_icmp->icmp_family == AF_INET6);
6052 		len = sizeof (sin6_t);
6053 		sin6 = (sin6_t *)&sin6addr;
6054 		*sin6 = sin6_null;
6055 		sin6->sin6_family = AF_INET6;
6056 		V6_SET_ZERO(sin6->sin6_addr);
6057 	}
6058 
6059 	error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len);
6060 
6061 	return ((error < 0) ? proto_tlitosyserr(-error) : error);
6062 }
6063 
6064 static int
6065 rawip_unbind(conn_t *connp)
6066 {
6067 	int error;
6068 
6069 	error = rawip_do_unbind(connp);
6070 	if (error < 0) {
6071 		error = proto_tlitosyserr(-error);
6072 	}
6073 	return (error);
6074 }
6075 
6076 /* ARGSUSED */
6077 int
6078 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
6079 {
6080 	return (EOPNOTSUPP);
6081 }
6082 
6083 /* ARGSUSED */
6084 int
6085 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
6086     socklen_t len, sock_connid_t *id, cred_t *cr)
6087 {
6088 	conn_t	*connp = (conn_t *)proto_handle;
6089 	icmp_t *icmp = connp->conn_icmp;
6090 	int	error;
6091 	boolean_t did_bind = B_FALSE;
6092 
6093 	/* All Solaris components should pass a cred for this operation. */
6094 	ASSERT(cr != NULL);
6095 
6096 	if (sa == NULL) {
6097 		/*
6098 		 * Disconnect
6099 		 * Make sure we are connected
6100 		 */
6101 		if (icmp->icmp_state != TS_DATA_XFER)
6102 			return (EINVAL);
6103 
6104 		error = icmp_disconnect(connp);
6105 		return (error);
6106 	}
6107 
6108 	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
6109 	if (error != 0)
6110 		return (error);
6111 
6112 	/* do an implicit bind if necessary */
6113 	if (icmp->icmp_state == TS_UNBND) {
6114 		error = rawip_implicit_bind(connp);
6115 		/*
6116 		 * We could be racing with an actual bind, in which case
6117 		 * we would see EPROTO. We cross our fingers and try
6118 		 * to connect.
6119 		 */
6120 		if (!(error == 0 || error == EPROTO))
6121 			return (error);
6122 		did_bind = B_TRUE;
6123 	}
6124 
6125 	/*
6126 	 * set SO_DGRAM_ERRIND
6127 	 */
6128 	icmp->icmp_dgram_errind = B_TRUE;
6129 
6130 	error = rawip_do_connect(connp, sa, len, cr);
6131 
6132 	if (error != 0 && did_bind) {
6133 		int unbind_err;
6134 
6135 		unbind_err = rawip_unbind(connp);
6136 		ASSERT(unbind_err == 0);
6137 	}
6138 
6139 	if (error == 0) {
6140 		*id = 0;
6141 		(*connp->conn_upcalls->su_connected)
6142 		    (connp->conn_upper_handle, 0, NULL, -1);
6143 	} else if (error < 0) {
6144 		error = proto_tlitosyserr(-error);
6145 	}
6146 	return (error);
6147 }
6148 
/*
 * Fallback downcall: switch this endpoint from a non-STREAMS socket to a
 * STREAMS-based one that uses the queue pair q.
 *
 * In order, the function:
 *	- attaches connp to q and installs the icmp write-side qinit;
 *	- sends an M_SETOPTS (write offset, high-water mark) upstream;
 *	- frees the IP helper stream;
 *	- gathers capability info, local/peer addresses and socket options
 *	  and reports them through quiesced_cb;
 *	- pushes any packets that were queued on the icmp_t during fallback
 *	  up the new read queue;
 *	- clears IPCL_NONSTR.
 *
 *	proto_handle	the conn_t of the endpoint
 *	q		queue of the fallback stream
 *	direct_sockfs	not used
 *	quiesced_cb	callback given the state needed to sync the sonode
 *
 * Always returns 0.
 */
/* ARGSUSED */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
{
	conn_t  *connp = (conn_t *)proto_handle;
	icmp_t	*icmp;
	struct T_capability_ack tca;
	struct sockaddr_in6 laddr, faddr;
	socklen_t laddrlen, faddrlen;
	short opts;
	struct stroptions *stropt;
	mblk_t *stropt_mp;
	int error;

	icmp = connp->conn_icmp;

	/* allocb_wait() is used, so no allocation-failure path follows. */
	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);

	/*
	 * setup the fallback stream that was allocated
	 */
	/*
	 * On entry the queues' q_ptr fields hold the device and the minor
	 * arena (presumably stored there by the caller); save them before
	 * q_ptr is overwritten with connp.
	 */
	connp->conn_dev = (dev_t)RD(q)->q_ptr;
	connp->conn_minor_arena = WR(q)->q_ptr;

	RD(q)->q_ptr = WR(q)->q_ptr = connp;

	WR(q)->q_qinfo = &icmpwinit;

	connp->conn_rq = RD(q);
	connp->conn_wq = WR(q);

	/* Notify stream head about options before sending up data */
	stropt_mp->b_datap->db_type = M_SETOPTS;
	stropt_mp->b_wptr += sizeof (*stropt);
	stropt = (struct stroptions *)stropt_mp->b_rptr;
	stropt->so_flags = SO_WROFF | SO_HIWAT;
	stropt->so_wroff =
	    (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra);
	stropt->so_hiwat = icmp->icmp_recv_hiwat;
	putnext(RD(q), stropt_mp);

	/*
	 * free helper stream
	 */
	ip_free_helper_stream(connp);

	/*
	 * Collect the information needed to sync with the sonode
	 */
	icmp_do_capability_ack(icmp, &tca, TC1_INFO);

	laddrlen = faddrlen = sizeof (sin6_t);
	(void) rawip_getsockname((sock_lower_handle_t)connp,
	    (struct sockaddr *)&laddr, &laddrlen, CRED());
	error = rawip_getpeername((sock_lower_handle_t)connp,
	    (struct sockaddr *)&faddr, &faddrlen, CRED());
	/* No peer address available: report a zero-length peer. */
	if (error != 0)
		faddrlen = 0;
	opts = 0;
	if (icmp->icmp_dgram_errind)
		opts |= SO_DGRAM_ERRIND;
	if (icmp->icmp_dontroute)
		opts |= SO_DONTROUTE;

	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
	    (struct sockaddr *)&laddr, laddrlen,
	    (struct sockaddr *)&faddr, faddrlen, opts);

	/*
	 * Attempts to send data up during fallback will result in it being
	 * queued on the icmp_t's fallback queue. Now we push up any queued
	 * packets.
	 */
	mutex_enter(&icmp->icmp_recv_lock);
	while (icmp->icmp_fallback_queue_head != NULL) {
		mblk_t	*mp;

		mp = icmp->icmp_fallback_queue_head;
		icmp->icmp_fallback_queue_head = mp->b_next;
		mp->b_next = NULL;
		/* icmp_recv_lock is dropped around the putnext() call. */
		mutex_exit(&icmp->icmp_recv_lock);
		putnext(RD(q), mp);
		mutex_enter(&icmp->icmp_recv_lock);
	}
	/* The head is NULL once the loop exits, so this clears the tail. */
	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;

	/*
	 * No longer a streams less socket
	 */
	/* IPCL_NONSTR is cleared while icmp_recv_lock is still held. */
	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
	connp->conn_flags &= ~IPCL_NONSTR;
	rw_exit(&icmp->icmp_rwlock);

	mutex_exit(&icmp->icmp_recv_lock);

	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
	    icmp->icmp_fallback_queue_tail == NULL);

	ASSERT(connp->conn_ref >= 1);

	return (0);
}
6251 
6252 /* ARGSUSED */
6253 sock_lower_handle_t
6254 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
6255     uint_t *smodep, int *errorp, int flags, cred_t *credp)
6256 {
6257 	conn_t *connp;
6258 
6259 	if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
6260 		*errorp = EPROTONOSUPPORT;
6261 		return (NULL);
6262 	}
6263 
6264 	connp = icmp_open(family, credp, errorp, flags);
6265 	if (connp != NULL) {
6266 		icmp_stack_t *is;
6267 
6268 		is = connp->conn_icmp->icmp_is;
6269 		connp->conn_flags |= IPCL_NONSTR;
6270 
6271 		if (connp->conn_icmp->icmp_family == AF_INET6) {
6272 			/* Build initial header template for transmit */
6273 			rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
6274 			if ((*errorp =
6275 			    icmp_build_hdrs(connp->conn_icmp)) != 0) {
6276 				rw_exit(&connp->conn_icmp->icmp_rwlock);
6277 				ipcl_conn_destroy(connp);
6278 				return (NULL);
6279 			}
6280 			rw_exit(&connp->conn_icmp->icmp_rwlock);
6281 		}
6282 
6283 		connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat;
6284 		connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat;
6285 
6286 		if ((*errorp = ip_create_helper_stream(connp,
6287 		    is->is_ldi_ident)) != 0) {
6288 			cmn_err(CE_CONT, "create of IP helper stream failed\n");
6289 			(void) rawip_do_close(connp);
6290 			return (NULL);
6291 		}
6292 
6293 		mutex_enter(&connp->conn_lock);
6294 		connp->conn_state_flags &= ~CONN_INCIPIENT;
6295 		mutex_exit(&connp->conn_lock);
6296 		*sock_downcalls = &sock_rawip_downcalls;
6297 		*smodep = SM_ATOMIC;
6298 	} else {
6299 		ASSERT(*errorp != 0);
6300 	}
6301 
6302 	return ((sock_lower_handle_t)connp);
6303 }
6304 
6305 /* ARGSUSED */
6306 void
6307 rawip_activate(sock_lower_handle_t proto_handle,
6308     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
6309     cred_t *cr)
6310 {
6311 	conn_t 			*connp = (conn_t *)proto_handle;
6312 	icmp_stack_t 		*is = connp->conn_icmp->icmp_is;
6313 	struct sock_proto_props sopp;
6314 
6315 	/* All Solaris components should pass a cred for this operation. */
6316 	ASSERT(cr != NULL);
6317 
6318 	connp->conn_upcalls = sock_upcalls;
6319 	connp->conn_upper_handle = sock_handle;
6320 
6321 	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
6322 	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
6323 	sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len +
6324 	    is->is_wroff_extra;
6325 	sopp.sopp_rxhiwat = is->is_recv_hiwat;
6326 	sopp.sopp_rxlowat = icmp_mod_info.mi_lowat;
6327 	sopp.sopp_maxblk = INFPSZ;
6328 	sopp.sopp_maxpsz = IP_MAXPACKET;
6329 	sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
6330 	    icmp_mod_info.mi_minpsz;
6331 
6332 	(*connp->conn_upcalls->su_set_proto_props)
6333 	    (connp->conn_upper_handle, &sopp);
6334 }
6335 
6336 static int
6337 rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6338 {
6339 	sin_t	*sin = (sin_t *)sa;
6340 	sin6_t	*sin6 = (sin6_t *)sa;
6341 
6342 	ASSERT(icmp != NULL);
6343 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6344 
6345 	switch (icmp->icmp_family) {
6346 	case AF_INET:
6347 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6348 		if (*salenp < sizeof (sin_t))
6349 			return (EINVAL);
6350 
6351 		*salenp = sizeof (sin_t);
6352 		*sin = sin_null;
6353 		sin->sin_family = AF_INET;
6354 		if (icmp->icmp_state == TS_UNBND) {
6355 			break;
6356 		}
6357 
6358 		if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
6359 		    !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6360 			sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src);
6361 		} else {
6362 			/*
6363 			 * INADDR_ANY
6364 			 * icmp_v6src is not set, we might be bound to
6365 			 * broadcast/multicast. Use icmp_bound_v6src as
6366 			 * local address instead (that could
6367 			 * also still be INADDR_ANY)
6368 			 */
6369 			sin->sin_addr.s_addr =
6370 			    V4_PART_OF_V6(icmp->icmp_bound_v6src);
6371 		}
6372 		break;
6373 	case AF_INET6:
6374 
6375 		if (*salenp < sizeof (sin6_t))
6376 			return (EINVAL);
6377 
6378 		*salenp = sizeof (sin6_t);
6379 		*sin6 = sin6_null;
6380 		sin6->sin6_family = AF_INET6;
6381 		if (icmp->icmp_state == TS_UNBND) {
6382 			break;
6383 		}
6384 		if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
6385 			sin6->sin6_addr = icmp->icmp_v6src;
6386 		} else {
6387 			/*
6388 			 * UNSPECIFIED
6389 			 * icmp_v6src is not set, we might be bound to
6390 			 * broadcast/multicast. Use icmp_bound_v6src as
6391 			 * local address instead (that could
6392 			 * also still be UNSPECIFIED)
6393 			 */
6394 
6395 			sin6->sin6_addr = icmp->icmp_bound_v6src;
6396 		}
6397 		break;
6398 	}
6399 	return (0);
6400 }
6401 
6402 static int
6403 rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
6404 {
6405 	sin_t   *sin = (sin_t *)sa;
6406 	sin6_t  *sin6 = (sin6_t *)sa;
6407 
6408 	ASSERT(icmp != NULL);
6409 	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
6410 
6411 	if (icmp->icmp_state != TS_DATA_XFER)
6412 		return (ENOTCONN);
6413 
6414 	sa->sa_family = icmp->icmp_family;
6415 	switch (icmp->icmp_family) {
6416 	case AF_INET:
6417 		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
6418 
6419 		if (*salenp < sizeof (sin_t))
6420 			return (EINVAL);
6421 
6422 		*salenp = sizeof (sin_t);
6423 		*sin = sin_null;
6424 		sin->sin_family = AF_INET;
6425 		sin->sin_addr.s_addr =
6426 		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6427 		break;
6428 	case AF_INET6:
6429 		if (*salenp < sizeof (sin6_t))
6430 			return (EINVAL);
6431 
6432 		*salenp = sizeof (sin6_t);
6433 		*sin6 = sin6_null;
6434 		*sin6 = icmp->icmp_v6dst;
6435 		break;
6436 	}
6437 	return (0);
6438 }
6439 
6440 /* ARGSUSED */
6441 int
6442 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6443     socklen_t *salenp, cred_t *cr)
6444 {
6445 	conn_t  *connp = (conn_t *)proto_handle;
6446 	icmp_t  *icmp = connp->conn_icmp;
6447 	int	error;
6448 
6449 	/* All Solaris components should pass a cred for this operation. */
6450 	ASSERT(cr != NULL);
6451 
6452 	ASSERT(icmp != NULL);
6453 
6454 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6455 
6456 	error = rawip_do_getpeername(icmp, sa, salenp);
6457 
6458 	rw_exit(&icmp->icmp_rwlock);
6459 
6460 	return (error);
6461 }
6462 
6463 /* ARGSUSED */
6464 int
6465 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
6466     socklen_t *salenp, cred_t *cr)
6467 {
6468 	conn_t  *connp = (conn_t *)proto_handle;
6469 	icmp_t	*icmp = connp->conn_icmp;
6470 	int	error;
6471 
6472 	/* All Solaris components should pass a cred for this operation. */
6473 	ASSERT(cr != NULL);
6474 
6475 	ASSERT(icmp != NULL);
6476 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6477 
6478 	error = rawip_do_getsockname(icmp, sa, salenp);
6479 
6480 	rw_exit(&icmp->icmp_rwlock);
6481 
6482 	return (error);
6483 }
6484 
6485 int
6486 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6487     const void *optvalp, socklen_t optlen, cred_t *cr)
6488 {
6489 	conn_t	*connp = (conn_t *)proto_handle;
6490 	icmp_t *icmp = connp->conn_icmp;
6491 	int error;
6492 
6493 	/* All Solaris components should pass a cred for this operation. */
6494 	ASSERT(cr != NULL);
6495 
6496 	error = proto_opt_check(level, option_name, optlen, NULL,
6497 	    icmp_opt_obj.odb_opt_des_arr,
6498 	    icmp_opt_obj.odb_opt_arr_cnt,
6499 	    icmp_opt_obj.odb_topmost_tpiprovider,
6500 	    B_TRUE, B_FALSE, cr);
6501 
6502 	if (error != 0) {
6503 		/*
6504 		 * option not recognized
6505 		 */
6506 		if (error < 0) {
6507 			error = proto_tlitosyserr(-error);
6508 		}
6509 		return (error);
6510 	}
6511 
6512 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6513 	error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
6514 	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
6515 	    (uchar_t *)optvalp, NULL, cr);
6516 	rw_exit(&icmp->icmp_rwlock);
6517 
6518 	if (error < 0) {
6519 		/*
6520 		 * Pass on to ip
6521 		 */
6522 		error = ip_set_options(connp, level, option_name, optvalp,
6523 		    optlen, cr);
6524 	}
6525 
6526 	ASSERT(error >= 0);
6527 
6528 	return (error);
6529 }
6530 
6531 int
6532 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
6533     void *optvalp, socklen_t *optlen, cred_t *cr)
6534 {
6535 	int		error;
6536 	conn_t		*connp = (conn_t *)proto_handle;
6537 	icmp_t		*icmp = connp->conn_icmp;
6538 	t_uscalar_t	max_optbuf_len;
6539 	void		*optvalp_buf;
6540 	int		len;
6541 
6542 	/* All Solaris components should pass a cred for this operation. */
6543 	ASSERT(cr != NULL);
6544 
6545 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
6546 	    icmp_opt_obj.odb_opt_des_arr,
6547 	    icmp_opt_obj.odb_opt_arr_cnt,
6548 	    icmp_opt_obj.odb_topmost_tpiprovider,
6549 	    B_FALSE, B_TRUE, cr);
6550 
6551 	if (error != 0) {
6552 		if (error < 0) {
6553 			error = proto_tlitosyserr(-error);
6554 		}
6555 		return (error);
6556 	}
6557 
6558 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
6559 	rw_enter(&icmp->icmp_rwlock, RW_READER);
6560 	len = icmp_opt_get(connp, level, option_name, optvalp_buf);
6561 	rw_exit(&icmp->icmp_rwlock);
6562 
6563 	if (len < 0) {
6564 		/*
6565 		 * Pass on to IP
6566 		 */
6567 		kmem_free(optvalp_buf, max_optbuf_len);
6568 		return (ip_get_options(connp, level, option_name, optvalp,
6569 		    optlen, cr));
6570 	} else {
6571 		/*
6572 		 * update optlen and copy option value
6573 		 */
6574 		t_uscalar_t size = MIN(len, *optlen);
6575 		bcopy(optvalp_buf, optvalp, size);
6576 		bcopy(&size, optlen, sizeof (size));
6577 
6578 		kmem_free(optvalp_buf, max_optbuf_len);
6579 		return (0);
6580 	}
6581 }
6582 
6583 /* ARGSUSED */
6584 int
6585 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
6586 {
6587 	conn_t	*connp = (conn_t *)proto_handle;
6588 
6589 	/* All Solaris components should pass a cred for this operation. */
6590 	ASSERT(cr != NULL);
6591 
6592 	(void) rawip_do_close(connp);
6593 	return (0);
6594 }
6595 
6596 /* ARGSUSED */
6597 int
6598 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
6599 {
6600 	conn_t  *connp = (conn_t *)proto_handle;
6601 
6602 	/* All Solaris components should pass a cred for this operation. */
6603 	ASSERT(cr != NULL);
6604 
6605 	/* shut down the send side */
6606 	if (how != SHUT_RD)
6607 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6608 		    SOCK_OPCTL_SHUT_SEND, 0);
6609 	/* shut down the recv side */
6610 	if (how != SHUT_WR)
6611 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
6612 		    SOCK_OPCTL_SHUT_RECV, 0);
6613 	return (0);
6614 }
6615 
6616 void
6617 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
6618 {
6619 	conn_t  *connp = (conn_t *)proto_handle;
6620 	icmp_t	*icmp = connp->conn_icmp;
6621 
6622 	mutex_enter(&icmp->icmp_recv_lock);
6623 	connp->conn_flow_cntrld = B_FALSE;
6624 	mutex_exit(&icmp->icmp_recv_lock);
6625 }
6626 
6627 int
6628 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
6629     int mode, int32_t *rvalp, cred_t *cr)
6630 {
6631 	conn_t  	*connp = (conn_t *)proto_handle;
6632 	int		error;
6633 
6634 	/* All Solaris components should pass a cred for this operation. */
6635 	ASSERT(cr != NULL);
6636 
6637 	switch (cmd) {
6638 	case ND_SET:
6639 	case ND_GET:
6640 	case _SIOCSOCKFALLBACK:
6641 	case TI_GETPEERNAME:
6642 	case TI_GETMYNAME:
6643 #ifdef DEBUG
6644 		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
6645 		    " socket", cmd);
6646 #endif
6647 		error = EINVAL;
6648 		break;
6649 	default:
6650 		/*
6651 		 * Pass on to IP using helper stream
6652 		 */
6653 		error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
6654 		    cmd, arg, mode, cr, rvalp);
6655 		break;
6656 	}
6657 	return (error);
6658 }
6659 
6660 /* ARGSUSED */
6661 int
6662 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
6663     cred_t *cr)
6664 {
6665 	conn_t *connp = (conn_t *)proto_handle;
6666 	icmp_t	*icmp = connp->conn_icmp;
6667 	icmp_stack_t *is = icmp->icmp_is;
6668 	int error = 0;
6669 	boolean_t bypass_dgram_errind = B_FALSE;
6670 
6671 	ASSERT(DB_TYPE(mp) == M_DATA);
6672 
6673 	/* All Solaris components should pass a cred for this operation. */
6674 	ASSERT(cr != NULL);
6675 
6676 	/* If labeled then sockfs should have already set db_credp */
6677 	ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
6678 
6679 	/* do an implicit bind if necessary */
6680 	if (icmp->icmp_state == TS_UNBND) {
6681 		error = rawip_implicit_bind(connp);
6682 		/*
6683 		 * We could be racing with an actual bind, in which case
6684 		 * we would see EPROTO. We cross our fingers and try
6685 		 * to connect.
6686 		 */
6687 		if (!(error == 0 || error == EPROTO)) {
6688 			freemsg(mp);
6689 			return (error);
6690 		}
6691 	}
6692 
6693 	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
6694 
6695 	if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) {
6696 		error = EISCONN;
6697 		goto done_lock;
6698 	}
6699 
6700 	switch (icmp->icmp_family) {
6701 	case AF_INET6: {
6702 		sin6_t	*sin6;
6703 		ip6_pkt_t	ipp_s;	/* For ancillary data options */
6704 		ip6_pkt_t	*ipp = &ipp_s;
6705 
6706 		sin6 = (sin6_t *)msg->msg_name;
6707 		if (sin6 != NULL) {
6708 			error = proto_verify_ip_addr(icmp->icmp_family,
6709 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6710 			if (error != 0) {
6711 				bypass_dgram_errind = B_TRUE;
6712 				goto done_lock;
6713 			}
6714 			if (icmp->icmp_delayed_error != 0) {
6715 				sin6_t  *sin1 = (sin6_t *)msg->msg_name;
6716 				sin6_t  *sin2 = (sin6_t *)
6717 				    &icmp->icmp_delayed_addr;
6718 
6719 				error = icmp->icmp_delayed_error;
6720 				icmp->icmp_delayed_error = 0;
6721 
6722 				/* Compare IP address and port */
6723 
6724 				if (sin1->sin6_port == sin2->sin6_port &&
6725 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
6726 				    &sin2->sin6_addr)) {
6727 					goto done_lock;
6728 				}
6729 			}
6730 		} else {
6731 			/*
6732 			 * Use connected address
6733 			 */
6734 			if (icmp->icmp_state != TS_DATA_XFER) {
6735 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6736 				error = EDESTADDRREQ;
6737 				bypass_dgram_errind = B_TRUE;
6738 				goto done_lock;
6739 			}
6740 			sin6 = &icmp->icmp_v6dst;
6741 		}
6742 
6743 		/* No support for mapped addresses on raw sockets */
6744 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6745 			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6746 			error = EADDRNOTAVAIL;
6747 			goto done_lock;
6748 		}
6749 
6750 		ipp->ipp_fields = 0;
6751 		ipp->ipp_sticky_ignored = 0;
6752 
6753 		/*
6754 		 * If options passed in, feed it for verification and handling
6755 		 */
6756 		if (msg->msg_controllen != 0) {
6757 			error = process_auxiliary_options(connp,
6758 			    msg->msg_control, msg->msg_controllen,
6759 			    ipp, &icmp_opt_obj, icmp_opt_set, cr);
6760 			if (error != 0) {
6761 				goto done_lock;
6762 			}
6763 		}
6764 
6765 		rw_exit(&icmp->icmp_rwlock);
6766 
6767 		/*
6768 		 * Destination is a native IPv6 address.
6769 		 * Send out an IPv6 format packet.
6770 		 */
6771 
6772 		error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6,
6773 		    ipp);
6774 	}
6775 		break;
6776 	case AF_INET: {
6777 		sin_t	*sin;
6778 		ip4_pkt_t pktinfo;
6779 		ip4_pkt_t *pktinfop = &pktinfo;
6780 		ipaddr_t	v4dst;
6781 
6782 		sin = (sin_t *)msg->msg_name;
6783 		if (sin != NULL) {
6784 			error = proto_verify_ip_addr(icmp->icmp_family,
6785 			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
6786 			if (error != 0) {
6787 				bypass_dgram_errind = B_TRUE;
6788 				goto done_lock;
6789 			}
6790 			v4dst = sin->sin_addr.s_addr;
6791 			if (icmp->icmp_delayed_error != 0) {
6792 				sin_t *sin1 = (sin_t *)msg->msg_name;
6793 				sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
6794 
6795 				error = icmp->icmp_delayed_error;
6796 				icmp->icmp_delayed_error = 0;
6797 
6798 				/* Compare IP address and port */
6799 				if (sin1->sin_port == sin2->sin_port &&
6800 				    sin1->sin_addr.s_addr ==
6801 				    sin2->sin_addr.s_addr) {
6802 					goto done_lock;
6803 				}
6804 
6805 			}
6806 		} else {
6807 			/*
6808 			 * Use connected address
6809 			 */
6810 			if (icmp->icmp_state != TS_DATA_XFER) {
6811 				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
6812 				error = EDESTADDRREQ;
6813 				bypass_dgram_errind = B_TRUE;
6814 				goto done_lock;
6815 			}
6816 			v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
6817 		}
6818 
6819 
6820 		pktinfop->ip4_ill_index = 0;
6821 		pktinfop->ip4_addr = INADDR_ANY;
6822 
6823 		/*
6824 		 * If options passed in, feed it for verification and handling
6825 		 */
6826 		if (msg->msg_controllen != 0) {
6827 			error = process_auxiliary_options(connp,
6828 			    msg->msg_control, msg->msg_controllen,
6829 			    pktinfop, &icmp_opt_obj, icmp_opt_set, cr);
6830 			if (error != 0) {
6831 				goto done_lock;
6832 			}
6833 		}
6834 		rw_exit(&icmp->icmp_rwlock);
6835 
6836 		error = raw_ip_send_data_v4(connp->conn_wq, connp, mp,
6837 		    v4dst, pktinfop);
6838 		break;
6839 	}
6840 
6841 	default:
6842 		ASSERT(0);
6843 	}
6844 
6845 	goto done;
6846 
6847 done_lock:
6848 	rw_exit(&icmp->icmp_rwlock);
6849 	if (error != 0) {
6850 		ASSERT(mp != NULL);
6851 		freemsg(mp);
6852 	}
6853 done:
6854 	if (bypass_dgram_errind)
6855 		return (error);
6856 	return (icmp->icmp_dgram_errind ? error : 0);
6857 }
6858 
/*
 * Downcall vector handed to the socket layer by rawip_create().
 *
 * The initializer is positional, so entries must stay in the order of the
 * members of sock_downcalls_t.  The three NULL slots are entry points
 * this module does not supply; see the sock_downcalls_t definition for
 * their names.
 */
sock_downcalls_t sock_rawip_downcalls = {
	rawip_activate,
	rawip_accept,
	rawip_bind,
	rawip_listen,
	rawip_connect,
	rawip_getpeername,
	rawip_getsockname,
	rawip_getsockopt,
	rawip_setsockopt,
	rawip_send,
	NULL,	/* not provided by rawip */
	NULL,	/* not provided by rawip */
	NULL,	/* not provided by rawip */
	rawip_shutdown,
	rawip_clr_flowctrl,
	rawip_ioctl,
	rawip_close
};
6878